From 814abfabef3ceed390c10d06a0cc69a86454b6cf Mon Sep 17 00:00:00 2001
From: John Fastabend <john.fastabend@gmail.com>
Date: Mon, 17 Jul 2017 09:27:07 -0700
Subject: xdp: add bpf_redirect helper function

This adds support for a bpf_redirect helper function to the XDP
infrastructure. For now this only supports redirecting to the egress
path of a port.

In order to support drivers handling a xdp_buff natively this patches
uses a new ndo operation ndo_xdp_xmit() that takes pushes a xdp_buff
to the specified device.

If the program specifies either (a) an unknown device or (b) a device
that does not support the operation a BPF warning is thrown and the
XDP_ABORTED error code is returned.

Signed-off-by: John Fastabend <john.fastabend@gmail.com>
Acked-by: Daniel Borkmann <daniel@iogearbox.net>
Acked-by: Jesper Dangaard Brouer <brouer@redhat.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/uapi/linux/bpf.h | 1 +
 1 file changed, 1 insertion(+)

(limited to 'include/uapi/linux')

diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h
index e99e3e6f8b37..4dbb7a3f4677 100644
--- a/include/uapi/linux/bpf.h
+++ b/include/uapi/linux/bpf.h
@@ -717,6 +717,7 @@ enum xdp_action {
 	XDP_DROP,
 	XDP_PASS,
 	XDP_TX,
+	XDP_REDIRECT,
 };
 
 /* user accessible metadata for XDP packet hook
-- 
cgit v1.2.3-71-gd317


From 546ac1ffb70d25b56c1126940e5ec639c4dd7413 Mon Sep 17 00:00:00 2001
From: John Fastabend <john.fastabend@gmail.com>
Date: Mon, 17 Jul 2017 09:28:56 -0700
Subject: bpf: add devmap, a map for storing net device references

Device map (devmap) is a BPF map, primarily useful for networking
applications, that uses a key to lookup a reference to a netdevice.

The map provides a clean way for BPF programs to build virtual port
to physical port maps. Additionally, it provides a scoping function
for the redirect action itself allowing multiple optimizations. Future
patches will leverage the map to provide batching at the XDP layer.

Another optimization/feature, that is not yet implemented, would be
to support multiple netdevices per key to support efficient multicast
and broadcast support.

Signed-off-by: John Fastabend <john.fastabend@gmail.com>
Acked-by: Daniel Borkmann <daniel@iogearbox.net>
Acked-by: Jesper Dangaard Brouer <brouer@redhat.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/bpf_types.h               |   3 +
 include/uapi/linux/bpf.h                |   1 +
 kernel/bpf/Makefile                     |   3 +
 kernel/bpf/devmap.c                     | 264 ++++++++++++++++++++++++++++++++
 kernel/bpf/verifier.c                   |   8 +
 tools/testing/selftests/bpf/test_maps.c |  15 ++
 6 files changed, 294 insertions(+)
 create mode 100644 kernel/bpf/devmap.c

(limited to 'include/uapi/linux')

diff --git a/include/linux/bpf_types.h b/include/linux/bpf_types.h
index 3d137c33d664..b1e1035ca24b 100644
--- a/include/linux/bpf_types.h
+++ b/include/linux/bpf_types.h
@@ -35,3 +35,6 @@ BPF_MAP_TYPE(BPF_MAP_TYPE_STACK_TRACE, stack_map_ops)
 #endif
 BPF_MAP_TYPE(BPF_MAP_TYPE_ARRAY_OF_MAPS, array_of_maps_map_ops)
 BPF_MAP_TYPE(BPF_MAP_TYPE_HASH_OF_MAPS, htab_of_maps_map_ops)
+#ifdef CONFIG_NET
+BPF_MAP_TYPE(BPF_MAP_TYPE_DEVMAP, dev_map_ops)
+#endif
diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h
index 4dbb7a3f4677..ecbb0e7e15bc 100644
--- a/include/uapi/linux/bpf.h
+++ b/include/uapi/linux/bpf.h
@@ -104,6 +104,7 @@ enum bpf_map_type {
 	BPF_MAP_TYPE_LPM_TRIE,
 	BPF_MAP_TYPE_ARRAY_OF_MAPS,
 	BPF_MAP_TYPE_HASH_OF_MAPS,
+	BPF_MAP_TYPE_DEVMAP,
 };
 
 enum bpf_prog_type {
diff --git a/kernel/bpf/Makefile b/kernel/bpf/Makefile
index e1e5e658f2db..48e92705be59 100644
--- a/kernel/bpf/Makefile
+++ b/kernel/bpf/Makefile
@@ -2,6 +2,9 @@ obj-y := core.o
 
 obj-$(CONFIG_BPF_SYSCALL) += syscall.o verifier.o inode.o helpers.o
 obj-$(CONFIG_BPF_SYSCALL) += hashtab.o arraymap.o percpu_freelist.o bpf_lru_list.o lpm_trie.o map_in_map.o
+ifeq ($(CONFIG_NET),y)
+obj-$(CONFIG_BPF_SYSCALL) += devmap.o
+endif
 ifeq ($(CONFIG_PERF_EVENTS),y)
 obj-$(CONFIG_BPF_SYSCALL) += stackmap.o
 endif
diff --git a/kernel/bpf/devmap.c b/kernel/bpf/devmap.c
new file mode 100644
index 000000000000..1a878356bd37
--- /dev/null
+++ b/kernel/bpf/devmap.c
@@ -0,0 +1,264 @@
+/* Copyright (c) 2017 Covalent IO, Inc. http://covalent.io
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of version 2 of the GNU General Public
+ * License as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License for more details.
+ */
+
+/* Devmaps primary use is as a backend map for XDP BPF helper call
+ * bpf_redirect_map(). Because XDP is mostly concerned with performance we
+ * spent some effort to ensure the datapath with redirect maps does not use
+ * any locking. This is a quick note on the details.
+ *
+ * We have three possible paths to get into the devmap control plane bpf
+ * syscalls, bpf programs, and driver side xmit/flush operations. A bpf syscall
+ * will invoke an update, delete, or lookup operation. To ensure updates and
+ * deletes appear atomic from the datapath side xchg() is used to modify the
+ * netdev_map array. Then because the datapath does a lookup into the netdev_map
+ * array (read-only) from an RCU critical section we use call_rcu() to wait for
+ * an rcu grace period before free'ing the old data structures. This ensures the
+ * datapath always has a valid copy. However, the datapath does a "flush"
+ * operation that pushes any pending packets in the driver outside the RCU
+ * critical section. Each bpf_dtab_netdev tracks these pending operations using
+ * an atomic per-cpu bitmap. The bpf_dtab_netdev object will not be destroyed
+ * until all bits are cleared indicating outstanding flush operations have
+ * completed.
+ *
+ * BPF syscalls may race with BPF program calls on any of the update, delete
+ * or lookup operations. As noted above the xchg() operation also keep the
+ * netdev_map consistent in this case. From the devmap side BPF programs
+ * calling into these operations are the same as multiple user space threads
+ * making system calls.
+ */
+#include <linux/bpf.h>
+#include <linux/jhash.h>
+#include <linux/filter.h>
+#include <linux/rculist_nulls.h>
+#include "percpu_freelist.h"
+#include "bpf_lru_list.h"
+#include "map_in_map.h"
+
+struct bpf_dtab_netdev {
+	struct net_device *dev;
+	int key;
+	struct rcu_head rcu;
+	struct bpf_dtab *dtab;
+};
+
+struct bpf_dtab {
+	struct bpf_map map;
+	struct bpf_dtab_netdev **netdev_map;
+};
+
+static struct bpf_map *dev_map_alloc(union bpf_attr *attr)
+{
+	struct bpf_dtab *dtab;
+	u64 cost;
+	int err;
+
+	/* check sanity of attributes */
+	if (attr->max_entries == 0 || attr->key_size != 4 ||
+	    attr->value_size != 4 || attr->map_flags)
+		return ERR_PTR(-EINVAL);
+
+	/* if value_size is bigger, the user space won't be able to
+	 * access the elements.
+	 */
+	if (attr->value_size > KMALLOC_MAX_SIZE)
+		return ERR_PTR(-E2BIG);
+
+	dtab = kzalloc(sizeof(*dtab), GFP_USER);
+	if (!dtab)
+		return ERR_PTR(-ENOMEM);
+
+	/* mandatory map attributes */
+	dtab->map.map_type = attr->map_type;
+	dtab->map.key_size = attr->key_size;
+	dtab->map.value_size = attr->value_size;
+	dtab->map.max_entries = attr->max_entries;
+	dtab->map.map_flags = attr->map_flags;
+
+	err = -ENOMEM;
+
+	/* make sure page count doesn't overflow */
+	cost = (u64) dtab->map.max_entries * sizeof(struct bpf_dtab_netdev *);
+	if (cost >= U32_MAX - PAGE_SIZE)
+		goto free_dtab;
+
+	dtab->map.pages = round_up(cost, PAGE_SIZE) >> PAGE_SHIFT;
+
+	/* if map size is larger than memlock limit, reject it early */
+	err = bpf_map_precharge_memlock(dtab->map.pages);
+	if (err)
+		goto free_dtab;
+
+	dtab->netdev_map = bpf_map_area_alloc(dtab->map.max_entries *
+					      sizeof(struct bpf_dtab_netdev *));
+	if (!dtab->netdev_map)
+		goto free_dtab;
+
+	return &dtab->map;
+
+free_dtab:
+	kfree(dtab);
+	return ERR_PTR(err);
+}
+
+static void dev_map_free(struct bpf_map *map)
+{
+	struct bpf_dtab *dtab = container_of(map, struct bpf_dtab, map);
+	int i;
+
+	/* At this point bpf_prog->aux->refcnt == 0 and this map->refcnt == 0,
+	 * so the programs (can be more than one that used this map) were
+	 * disconnected from events. Wait for outstanding critical sections in
+	 * these programs to complete. The rcu critical section only guarantees
+	 * no further reads against netdev_map. It does __not__ ensure pending
+	 * flush operations (if any) are complete.
+	 */
+	synchronize_rcu();
+
+	for (i = 0; i < dtab->map.max_entries; i++) {
+		struct bpf_dtab_netdev *dev;
+
+		dev = dtab->netdev_map[i];
+		if (!dev)
+			continue;
+
+		dev_put(dev->dev);
+		kfree(dev);
+	}
+
+	/* At this point bpf program is detached and all pending operations
+	 * _must_ be complete
+	 */
+	bpf_map_area_free(dtab->netdev_map);
+	kfree(dtab);
+}
+
+static int dev_map_get_next_key(struct bpf_map *map, void *key, void *next_key)
+{
+	struct bpf_dtab *dtab = container_of(map, struct bpf_dtab, map);
+	u32 index = key ? *(u32 *)key : U32_MAX;
+	u32 *next = (u32 *)next_key;
+
+	if (index >= dtab->map.max_entries) {
+		*next = 0;
+		return 0;
+	}
+
+	if (index == dtab->map.max_entries - 1)
+		return -ENOENT;
+
+	*next = index + 1;
+	return 0;
+}
+
+/* rcu_read_lock (from syscall and BPF contexts) ensures that if a delete and/or
+ * update happens in parallel here a dev_put wont happen until after reading the
+ * ifindex.
+ */
+static void *dev_map_lookup_elem(struct bpf_map *map, void *key)
+{
+	struct bpf_dtab *dtab = container_of(map, struct bpf_dtab, map);
+	struct bpf_dtab_netdev *dev;
+	u32 i = *(u32 *)key;
+
+	if (i >= map->max_entries)
+		return NULL;
+
+	dev = READ_ONCE(dtab->netdev_map[i]);
+	return dev ? &dev->dev->ifindex : NULL;
+}
+
+static void __dev_map_entry_free(struct rcu_head *rcu)
+{
+	struct bpf_dtab_netdev *old_dev;
+
+	old_dev = container_of(rcu, struct bpf_dtab_netdev, rcu);
+	dev_put(old_dev->dev);
+	kfree(old_dev);
+}
+
+static int dev_map_delete_elem(struct bpf_map *map, void *key)
+{
+	struct bpf_dtab *dtab = container_of(map, struct bpf_dtab, map);
+	struct bpf_dtab_netdev *old_dev;
+	int k = *(u32 *)key;
+
+	if (k >= map->max_entries)
+		return -EINVAL;
+
+	/* Use synchronize_rcu() here to ensure any rcu critical sections
+	 * have completed, but this does not guarantee a flush has happened
+	 * yet. Because driver side rcu_read_lock/unlock only protects the
+	 * running XDP program. However, for pending flush operations the
+	 * dev and ctx are stored in another per cpu map. And additionally,
+	 * the driver tear down ensures all soft irqs are complete before
+	 * removing the net device in the case of dev_put equals zero.
+	 */
+	old_dev = xchg(&dtab->netdev_map[k], NULL);
+	if (old_dev)
+		call_rcu(&old_dev->rcu, __dev_map_entry_free);
+	return 0;
+}
+
+static int dev_map_update_elem(struct bpf_map *map, void *key, void *value,
+				u64 map_flags)
+{
+	struct bpf_dtab *dtab = container_of(map, struct bpf_dtab, map);
+	struct net *net = current->nsproxy->net_ns;
+	struct bpf_dtab_netdev *dev, *old_dev;
+	u32 i = *(u32 *)key;
+	u32 ifindex = *(u32 *)value;
+
+	if (unlikely(map_flags > BPF_EXIST))
+		return -EINVAL;
+
+	if (unlikely(i >= dtab->map.max_entries))
+		return -E2BIG;
+
+	if (unlikely(map_flags == BPF_NOEXIST))
+		return -EEXIST;
+
+	if (!ifindex) {
+		dev = NULL;
+	} else {
+		dev = kmalloc(sizeof(*dev), GFP_ATOMIC | __GFP_NOWARN);
+		if (!dev)
+			return -ENOMEM;
+
+		dev->dev = dev_get_by_index(net, ifindex);
+		if (!dev->dev) {
+			kfree(dev);
+			return -EINVAL;
+		}
+
+		dev->key = i;
+		dev->dtab = dtab;
+	}
+
+	/* Use call_rcu() here to ensure rcu critical sections have completed
+	 * Remembering the driver side flush operation will happen before the
+	 * net device is removed.
+	 */
+	old_dev = xchg(&dtab->netdev_map[i], dev);
+	if (old_dev)
+		call_rcu(&old_dev->rcu, __dev_map_entry_free);
+
+	return 0;
+}
+
+const struct bpf_map_ops dev_map_ops = {
+	.map_alloc = dev_map_alloc,
+	.map_free = dev_map_free,
+	.map_get_next_key = dev_map_get_next_key,
+	.map_lookup_elem = dev_map_lookup_elem,
+	.map_update_elem = dev_map_update_elem,
+	.map_delete_elem = dev_map_delete_elem,
+};
diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
index 6a86723c5b64..4016774d5cca 100644
--- a/kernel/bpf/verifier.c
+++ b/kernel/bpf/verifier.c
@@ -1276,6 +1276,14 @@ static int check_map_func_compatibility(struct bpf_map *map, int func_id)
 		    func_id != BPF_FUNC_current_task_under_cgroup)
 			goto error;
 		break;
+	/* devmap returns a pointer to a live net_device ifindex that we cannot
+	 * allow to be modified from bpf side. So do not allow lookup elements
+	 * for now.
+	 */
+	case BPF_MAP_TYPE_DEVMAP:
+		if (func_id == BPF_FUNC_map_lookup_elem)
+			goto error;
+		break;
 	case BPF_MAP_TYPE_ARRAY_OF_MAPS:
 	case BPF_MAP_TYPE_HASH_OF_MAPS:
 		if (func_id != BPF_FUNC_map_lookup_elem)
diff --git a/tools/testing/selftests/bpf/test_maps.c b/tools/testing/selftests/bpf/test_maps.c
index 79601c81e169..36d6ac3f0c1c 100644
--- a/tools/testing/selftests/bpf/test_maps.c
+++ b/tools/testing/selftests/bpf/test_maps.c
@@ -438,6 +438,21 @@ static void test_arraymap_percpu_many_keys(void)
 	close(fd);
 }
 
+static void test_devmap(int task, void *data)
+{
+	int next_key, fd;
+	__u32 key, value;
+
+	fd = bpf_create_map(BPF_MAP_TYPE_DEVMAP, sizeof(key), sizeof(value),
+			    2, 0);
+	if (fd < 0) {
+		printf("Failed to create arraymap '%s'!\n", strerror(errno));
+		exit(1);
+	}
+
+	close(fd);
+}
+
 #define MAP_SIZE (32 * 1024)
 
 static void test_map_large(void)
-- 
cgit v1.2.3-71-gd317


From 97f91a7cf04ff605845c20948b8a80e54cbd3376 Mon Sep 17 00:00:00 2001
From: John Fastabend <john.fastabend@gmail.com>
Date: Mon, 17 Jul 2017 09:29:18 -0700
Subject: bpf: add bpf_redirect_map helper routine

BPF programs can use the devmap with a bpf_redirect_map() helper
routine to forward packets to netdevice in map.

Signed-off-by: John Fastabend <john.fastabend@gmail.com>
Signed-off-by: Jesper Dangaard Brouer <brouer@redhat.com>
Acked-by: Daniel Borkmann <daniel@iogearbox.net>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/bpf.h      |  3 +++
 include/uapi/linux/bpf.h |  8 +++++++-
 kernel/bpf/devmap.c      | 12 +++++++++++
 kernel/bpf/verifier.c    |  4 ++++
 net/core/filter.c        | 52 ++++++++++++++++++++++++++++++++++++++++++++++++
 5 files changed, 78 insertions(+), 1 deletion(-)

(limited to 'include/uapi/linux')

diff --git a/include/linux/bpf.h b/include/linux/bpf.h
index b69e7a5869ff..d0d3281ac678 100644
--- a/include/linux/bpf.h
+++ b/include/linux/bpf.h
@@ -379,4 +379,7 @@ extern const struct bpf_func_proto bpf_get_stackid_proto;
 void bpf_user_rnd_init_once(void);
 u64 bpf_user_rnd_u32(u64 r1, u64 r2, u64 r3, u64 r4, u64 r5);
 
+/* Map specifics */
+struct net_device  *__dev_map_lookup_elem(struct bpf_map *map, u32 key);
+
 #endif /* _LINUX_BPF_H */
diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h
index ecbb0e7e15bc..1106a8c4cd36 100644
--- a/include/uapi/linux/bpf.h
+++ b/include/uapi/linux/bpf.h
@@ -348,6 +348,11 @@ union bpf_attr {
  *     @flags: bit 0 - if set, redirect to ingress instead of egress
  *             other bits - reserved
  *     Return: TC_ACT_REDIRECT
+ * int bpf_redirect_map(key, map, flags)
+ *     redirect to endpoint in map
+ *     @key: index in map to lookup
+ *     @map: fd of map to do lookup in
+ *     @flags: --
  *
  * u32 bpf_get_route_realm(skb)
  *     retrieve a dst's tclassid
@@ -592,7 +597,8 @@ union bpf_attr {
 	FN(get_socket_uid),		\
 	FN(set_hash),			\
 	FN(setsockopt),			\
-	FN(skb_adjust_room),
+	FN(skb_adjust_room),		\
+	FN(redirect_map),
 
 /* integer value in 'imm' field of BPF_CALL instruction selects which helper
  * function eBPF program intends to call
diff --git a/kernel/bpf/devmap.c b/kernel/bpf/devmap.c
index 1a878356bd37..36dc13deb2e1 100644
--- a/kernel/bpf/devmap.c
+++ b/kernel/bpf/devmap.c
@@ -159,6 +159,18 @@ static int dev_map_get_next_key(struct bpf_map *map, void *key, void *next_key)
 	return 0;
 }
 
+struct net_device  *__dev_map_lookup_elem(struct bpf_map *map, u32 key)
+{
+	struct bpf_dtab *dtab = container_of(map, struct bpf_dtab, map);
+	struct bpf_dtab_netdev *dev;
+
+	if (key >= map->max_entries)
+		return NULL;
+
+	dev = READ_ONCE(dtab->netdev_map[key]);
+	return dev ? dev->dev : NULL;
+}
+
 /* rcu_read_lock (from syscall and BPF contexts) ensures that if a delete and/or
  * update happens in parallel here a dev_put wont happen until after reading the
  * ifindex.
diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
index 4016774d5cca..df05d65f0c87 100644
--- a/kernel/bpf/verifier.c
+++ b/kernel/bpf/verifier.c
@@ -1312,6 +1312,10 @@ static int check_map_func_compatibility(struct bpf_map *map, int func_id)
 		if (map->map_type != BPF_MAP_TYPE_CGROUP_ARRAY)
 			goto error;
 		break;
+	case BPF_FUNC_redirect_map:
+		if (map->map_type != BPF_MAP_TYPE_DEVMAP)
+			goto error;
+		break;
 	default:
 		break;
 	}
diff --git a/net/core/filter.c b/net/core/filter.c
index e30d38b27f21..e93a558324b5 100644
--- a/net/core/filter.c
+++ b/net/core/filter.c
@@ -1779,6 +1779,7 @@ static const struct bpf_func_proto bpf_clone_redirect_proto = {
 struct redirect_info {
 	u32 ifindex;
 	u32 flags;
+	struct bpf_map *map;
 };
 
 static DEFINE_PER_CPU(struct redirect_info, redirect_info);
@@ -1792,6 +1793,7 @@ BPF_CALL_2(bpf_redirect, u32, ifindex, u64, flags)
 
 	ri->ifindex = ifindex;
 	ri->flags = flags;
+	ri->map = NULL;
 
 	return TC_ACT_REDIRECT;
 }
@@ -1819,6 +1821,29 @@ static const struct bpf_func_proto bpf_redirect_proto = {
 	.arg2_type      = ARG_ANYTHING,
 };
 
+BPF_CALL_3(bpf_redirect_map, struct bpf_map *, map, u32, ifindex, u64, flags)
+{
+	struct redirect_info *ri = this_cpu_ptr(&redirect_info);
+
+	if (unlikely(flags))
+		return XDP_ABORTED;
+
+	ri->ifindex = ifindex;
+	ri->flags = flags;
+	ri->map = map;
+
+	return XDP_REDIRECT;
+}
+
+static const struct bpf_func_proto bpf_redirect_map_proto = {
+	.func           = bpf_redirect_map,
+	.gpl_only       = false,
+	.ret_type       = RET_INTEGER,
+	.arg1_type      = ARG_CONST_MAP_PTR,
+	.arg2_type      = ARG_ANYTHING,
+	.arg3_type      = ARG_ANYTHING,
+};
+
 BPF_CALL_1(bpf_get_cgroup_classid, const struct sk_buff *, skb)
 {
 	return task_get_classid(skb);
@@ -2423,14 +2448,39 @@ static int __bpf_tx_xdp(struct net_device *dev, struct xdp_buff *xdp)
 	return -EOPNOTSUPP;
 }
 
+int xdp_do_redirect_map(struct net_device *dev, struct xdp_buff *xdp,
+			struct bpf_prog *xdp_prog)
+{
+	struct redirect_info *ri = this_cpu_ptr(&redirect_info);
+	struct bpf_map *map = ri->map;
+	struct net_device *fwd;
+	int err = -EINVAL;
+
+	ri->ifindex = 0;
+	ri->map = NULL;
+
+	fwd = __dev_map_lookup_elem(map, ri->ifindex);
+	if (!fwd)
+		goto out;
+
+	trace_xdp_redirect(dev, fwd, xdp_prog, XDP_REDIRECT);
+	err = __bpf_tx_xdp(fwd, xdp);
+out:
+	return err;
+}
+
 int xdp_do_redirect(struct net_device *dev, struct xdp_buff *xdp,
 		    struct bpf_prog *xdp_prog)
 {
 	struct redirect_info *ri = this_cpu_ptr(&redirect_info);
 	struct net_device *fwd;
 
+	if (ri->map)
+		return xdp_do_redirect_map(dev, xdp, xdp_prog);
+
 	fwd = dev_get_by_index_rcu(dev_net(dev), ri->ifindex);
 	ri->ifindex = 0;
+	ri->map = NULL;
 	if (unlikely(!fwd)) {
 		bpf_warn_invalid_xdp_redirect(ri->ifindex);
 		return -EINVAL;
@@ -3089,6 +3139,8 @@ xdp_func_proto(enum bpf_func_id func_id)
 		return &bpf_xdp_adjust_head_proto;
 	case BPF_FUNC_redirect:
 		return &bpf_xdp_redirect_proto;
+	case BPF_FUNC_redirect_map:
+		return &bpf_redirect_map_proto;
 	default:
 		return bpf_base_func_proto(func_id);
 	}
-- 
cgit v1.2.3-71-gd317


From fc60a8b675bd9499c71716d21c238eed5092ddfc Mon Sep 17 00:00:00 2001
From: Andreas Färber <afaerber@suse.de>
Date: Sun, 9 Jul 2017 22:29:42 +0200
Subject: tty: serial: owl: Implement console driver
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Implement serial console driver to complement earlycon.

Based on LeMaker linux-actions tree.

Signed-off-by: Andreas Färber <afaerber@suse.de>
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 drivers/tty/serial/Kconfig       |   4 +-
 drivers/tty/serial/owl-uart.c    | 635 ++++++++++++++++++++++++++++++++++++++-
 include/uapi/linux/serial_core.h |   1 +
 3 files changed, 636 insertions(+), 4 deletions(-)

(limited to 'include/uapi/linux')

diff --git a/drivers/tty/serial/Kconfig b/drivers/tty/serial/Kconfig
index 1f096e2bb398..b788fee54249 100644
--- a/drivers/tty/serial/Kconfig
+++ b/drivers/tty/serial/Kconfig
@@ -1689,7 +1689,7 @@ config SERIAL_MVEBU_CONSOLE
 	  Otherwise, say 'N'.
 
 config SERIAL_OWL
-	bool "Actions Semi Owl serial port support"
+	tristate "Actions Semi Owl serial port support"
 	depends on ARCH_ACTIONS || COMPILE_TEST
 	select SERIAL_CORE
 	help
@@ -1705,7 +1705,7 @@ config SERIAL_OWL_CONSOLE
 	default y
 	help
 	  Say 'Y' here if you wish to use Actions Semiconductor S500/S900 UART
-	  as the system console. Only earlycon is implemented currently.
+	  as the system console.
 
 endmenu
 
diff --git a/drivers/tty/serial/owl-uart.c b/drivers/tty/serial/owl-uart.c
index 1b8008797a1b..683b05478ade 100644
--- a/drivers/tty/serial/owl-uart.c
+++ b/drivers/tty/serial/owl-uart.c
@@ -20,6 +20,7 @@
  * along with this program. If not, see <http://www.gnu.org/licenses/>.
  */
 
+#include <linux/clk.h>
 #include <linux/console.h>
 #include <linux/delay.h>
 #include <linux/io.h>
@@ -28,22 +29,66 @@
 #include <linux/platform_device.h>
 #include <linux/serial.h>
 #include <linux/serial_core.h>
+#include <linux/tty.h>
+#include <linux/tty_flip.h>
+
+#define OWL_UART_PORT_NUM 7
+#define OWL_UART_DEV_NAME "ttyOWL"
 
 #define OWL_UART_CTL	0x000
+#define OWL_UART_RXDAT	0x004
 #define OWL_UART_TXDAT	0x008
 #define OWL_UART_STAT	0x00c
 
+#define OWL_UART_CTL_DWLS_MASK		GENMASK(1, 0)
+#define OWL_UART_CTL_DWLS_5BITS		(0x0 << 0)
+#define OWL_UART_CTL_DWLS_6BITS		(0x1 << 0)
+#define OWL_UART_CTL_DWLS_7BITS		(0x2 << 0)
+#define OWL_UART_CTL_DWLS_8BITS		(0x3 << 0)
+#define OWL_UART_CTL_STPS_2BITS		BIT(2)
+#define OWL_UART_CTL_PRS_MASK		GENMASK(6, 4)
+#define OWL_UART_CTL_PRS_NONE		(0x0 << 4)
+#define OWL_UART_CTL_PRS_ODD		(0x4 << 4)
+#define OWL_UART_CTL_PRS_MARK		(0x5 << 4)
+#define OWL_UART_CTL_PRS_EVEN		(0x6 << 4)
+#define OWL_UART_CTL_PRS_SPACE		(0x7 << 4)
+#define OWL_UART_CTL_AFE		BIT(12)
 #define OWL_UART_CTL_TRFS_TX		BIT(14)
 #define OWL_UART_CTL_EN			BIT(15)
+#define OWL_UART_CTL_RXDE		BIT(16)
+#define OWL_UART_CTL_TXDE		BIT(17)
 #define OWL_UART_CTL_RXIE		BIT(18)
 #define OWL_UART_CTL_TXIE		BIT(19)
+#define OWL_UART_CTL_LBEN		BIT(20)
 
 #define OWL_UART_STAT_RIP		BIT(0)
 #define OWL_UART_STAT_TIP		BIT(1)
+#define OWL_UART_STAT_RXER		BIT(2)
+#define OWL_UART_STAT_TFER		BIT(3)
+#define OWL_UART_STAT_RXST		BIT(4)
+#define OWL_UART_STAT_RFEM		BIT(5)
 #define OWL_UART_STAT_TFFU		BIT(6)
-#define OWL_UART_STAT_TRFL_MASK		(0x1f << 11)
+#define OWL_UART_STAT_CTSS		BIT(7)
+#define OWL_UART_STAT_RTSS		BIT(8)
+#define OWL_UART_STAT_TFES		BIT(10)
+#define OWL_UART_STAT_TRFL_MASK		GENMASK(16, 11)
 #define OWL_UART_STAT_UTBB		BIT(17)
 
+static struct uart_driver owl_uart_driver;
+
+struct owl_uart_info {
+	unsigned int tx_fifosize;
+};
+
+struct owl_uart_port {
+	struct uart_port port;
+	struct clk *clk;
+};
+
+#define to_owl_uart_port(prt) container_of(prt, struct owl_uart_port, prt)
+
+static struct owl_uart_port *owl_uart_ports[OWL_UART_PORT_NUM];
+
 static inline void owl_uart_write(struct uart_port *port, u32 val, unsigned int off)
 {
 	writel(val, port->membase + off);
@@ -54,6 +99,397 @@ static inline u32 owl_uart_read(struct uart_port *port, unsigned int off)
 	return readl(port->membase + off);
 }
 
+static void owl_uart_set_mctrl(struct uart_port *port, unsigned int mctrl)
+{
+	u32 ctl;
+
+	ctl = owl_uart_read(port, OWL_UART_CTL);
+
+	if (mctrl & TIOCM_LOOP)
+		ctl |= OWL_UART_CTL_LBEN;
+	else
+		ctl &= ~OWL_UART_CTL_LBEN;
+
+	owl_uart_write(port, ctl, OWL_UART_CTL);
+}
+
+static unsigned int owl_uart_get_mctrl(struct uart_port *port)
+{
+	unsigned int mctrl = TIOCM_CAR | TIOCM_DSR;
+	u32 stat, ctl;
+
+	ctl = owl_uart_read(port, OWL_UART_CTL);
+	stat = owl_uart_read(port, OWL_UART_STAT);
+	if (stat & OWL_UART_STAT_RTSS)
+		mctrl |= TIOCM_RTS;
+	if ((stat & OWL_UART_STAT_CTSS) || !(ctl & OWL_UART_CTL_AFE))
+		mctrl |= TIOCM_CTS;
+	return mctrl;
+}
+
+static unsigned int owl_uart_tx_empty(struct uart_port *port)
+{
+	unsigned long flags;
+	u32 val;
+	unsigned int ret;
+
+	spin_lock_irqsave(&port->lock, flags);
+
+	val = owl_uart_read(port, OWL_UART_STAT);
+	ret = (val & OWL_UART_STAT_TFES) ? TIOCSER_TEMT : 0;
+
+	spin_unlock_irqrestore(&port->lock, flags);
+
+	return ret;
+}
+
+static void owl_uart_stop_rx(struct uart_port *port)
+{
+	u32 val;
+
+	val = owl_uart_read(port, OWL_UART_CTL);
+	val &= ~(OWL_UART_CTL_RXIE | OWL_UART_CTL_RXDE);
+	owl_uart_write(port, val, OWL_UART_CTL);
+
+	val = owl_uart_read(port, OWL_UART_STAT);
+	val |= OWL_UART_STAT_RIP;
+	owl_uart_write(port, val, OWL_UART_STAT);
+}
+
+static void owl_uart_stop_tx(struct uart_port *port)
+{
+	u32 val;
+
+	val = owl_uart_read(port, OWL_UART_CTL);
+	val &= ~(OWL_UART_CTL_TXIE | OWL_UART_CTL_TXDE);
+	owl_uart_write(port, val, OWL_UART_CTL);
+
+	val = owl_uart_read(port, OWL_UART_STAT);
+	val |= OWL_UART_STAT_TIP;
+	owl_uart_write(port, val, OWL_UART_STAT);
+}
+
+static void owl_uart_start_tx(struct uart_port *port)
+{
+	u32 val;
+
+	if (uart_tx_stopped(port)) {
+		owl_uart_stop_tx(port);
+		return;
+	}
+
+	val = owl_uart_read(port, OWL_UART_STAT);
+	val |= OWL_UART_STAT_TIP;
+	owl_uart_write(port, val, OWL_UART_STAT);
+
+	val = owl_uart_read(port, OWL_UART_CTL);
+	val |= OWL_UART_CTL_TXIE;
+	owl_uart_write(port, val, OWL_UART_CTL);
+}
+
+static void owl_uart_send_chars(struct uart_port *port)
+{
+	struct circ_buf *xmit = &port->state->xmit;
+	unsigned int ch;
+
+	if (uart_tx_stopped(port))
+		return;
+
+	if (port->x_char) {
+		while (!(owl_uart_read(port, OWL_UART_STAT) & OWL_UART_STAT_TFFU))
+			cpu_relax();
+		owl_uart_write(port, port->x_char, OWL_UART_TXDAT);
+		port->icount.tx++;
+		port->x_char = 0;
+	}
+
+	while (!(owl_uart_read(port, OWL_UART_STAT) & OWL_UART_STAT_TFFU)) {
+		if (uart_circ_empty(xmit))
+			break;
+
+		ch = xmit->buf[xmit->tail];
+		owl_uart_write(port, ch, OWL_UART_TXDAT);
+		xmit->tail = (xmit->tail + 1) & (SERIAL_XMIT_SIZE - 1);
+		port->icount.tx++;
+	}
+
+	if (uart_circ_chars_pending(xmit) < WAKEUP_CHARS)
+		uart_write_wakeup(port);
+
+	if (uart_circ_empty(xmit))
+		owl_uart_stop_tx(port);
+}
+
+static void owl_uart_receive_chars(struct uart_port *port)
+{
+	u32 stat, val;
+
+	val = owl_uart_read(port, OWL_UART_CTL);
+	val &= ~OWL_UART_CTL_TRFS_TX;
+	owl_uart_write(port, val, OWL_UART_CTL);
+
+	stat = owl_uart_read(port, OWL_UART_STAT);
+	while (!(stat & OWL_UART_STAT_RFEM)) {
+		char flag = TTY_NORMAL;
+
+		if (stat & OWL_UART_STAT_RXER)
+			port->icount.overrun++;
+
+		if (stat & OWL_UART_STAT_RXST) {
+			/* We are not able to distinguish the error type. */
+			port->icount.brk++;
+			port->icount.frame++;
+
+			stat &= port->read_status_mask;
+			if (stat & OWL_UART_STAT_RXST)
+				flag = TTY_PARITY;
+		} else
+			port->icount.rx++;
+
+		val = owl_uart_read(port, OWL_UART_RXDAT);
+		val &= 0xff;
+
+		if ((stat & port->ignore_status_mask) == 0)
+			tty_insert_flip_char(&port->state->port, val, flag);
+
+		stat = owl_uart_read(port, OWL_UART_STAT);
+	}
+
+	spin_unlock(&port->lock);
+	tty_flip_buffer_push(&port->state->port);
+	spin_lock(&port->lock);
+}
+
+static irqreturn_t owl_uart_irq(int irq, void *dev_id)
+{
+	struct uart_port *port = dev_id;
+	unsigned long flags;
+	u32 stat;
+
+	spin_lock_irqsave(&port->lock, flags);
+
+	stat = owl_uart_read(port, OWL_UART_STAT);
+
+	if (stat & OWL_UART_STAT_RIP)
+		owl_uart_receive_chars(port);
+
+	if (stat & OWL_UART_STAT_TIP)
+		owl_uart_send_chars(port);
+
+	stat = owl_uart_read(port, OWL_UART_STAT);
+	stat |= OWL_UART_STAT_RIP | OWL_UART_STAT_TIP;
+	owl_uart_write(port, stat, OWL_UART_STAT);
+
+	spin_unlock_irqrestore(&port->lock, flags);
+
+	return IRQ_HANDLED;
+}
+
+static void owl_uart_shutdown(struct uart_port *port)
+{
+	u32 val;
+	unsigned long flags;
+
+	spin_lock_irqsave(&port->lock, flags);
+
+	val = owl_uart_read(port, OWL_UART_CTL);
+	val &= ~(OWL_UART_CTL_TXIE | OWL_UART_CTL_RXIE
+		| OWL_UART_CTL_TXDE | OWL_UART_CTL_RXDE | OWL_UART_CTL_EN);
+	owl_uart_write(port, val, OWL_UART_CTL);
+
+	spin_unlock_irqrestore(&port->lock, flags);
+
+	free_irq(port->irq, port);
+}
+
+static int owl_uart_startup(struct uart_port *port)
+{
+	u32 val;
+	unsigned long flags;
+	int ret;
+
+	ret = request_irq(port->irq, owl_uart_irq, IRQF_TRIGGER_HIGH,
+			"owl-uart", port);
+	if (ret)
+		return ret;
+
+	spin_lock_irqsave(&port->lock, flags);
+
+	val = owl_uart_read(port, OWL_UART_STAT);
+	val |= OWL_UART_STAT_RIP | OWL_UART_STAT_TIP
+		| OWL_UART_STAT_RXER | OWL_UART_STAT_TFER | OWL_UART_STAT_RXST;
+	owl_uart_write(port, val, OWL_UART_STAT);
+
+	val = owl_uart_read(port, OWL_UART_CTL);
+	val |= OWL_UART_CTL_RXIE | OWL_UART_CTL_TXIE;
+	val |= OWL_UART_CTL_EN;
+	owl_uart_write(port, val, OWL_UART_CTL);
+
+	spin_unlock_irqrestore(&port->lock, flags);
+
+	return 0;
+}
+
+static void owl_uart_change_baudrate(struct owl_uart_port *owl_port,
+				     unsigned long baud)
+{
+	clk_set_rate(owl_port->clk, baud * 8);
+}
+
+static void owl_uart_set_termios(struct uart_port *port,
+				 struct ktermios *termios,
+				 struct ktermios *old)
+{
+	struct owl_uart_port *owl_port = to_owl_uart_port(port);
+	unsigned int baud;
+	u32 ctl;
+	unsigned long flags;
+
+	spin_lock_irqsave(&port->lock, flags);
+
+	ctl = owl_uart_read(port, OWL_UART_CTL);
+
+	ctl &= ~OWL_UART_CTL_DWLS_MASK;
+	switch (termios->c_cflag & CSIZE) {
+	case CS5:
+		ctl |= OWL_UART_CTL_DWLS_5BITS;
+		break;
+	case CS6:
+		ctl |= OWL_UART_CTL_DWLS_6BITS;
+		break;
+	case CS7:
+		ctl |= OWL_UART_CTL_DWLS_7BITS;
+		break;
+	case CS8:
+	default:
+		ctl |= OWL_UART_CTL_DWLS_8BITS;
+		break;
+	}
+
+	if (termios->c_cflag & CSTOPB)
+		ctl |= OWL_UART_CTL_STPS_2BITS;
+	else
+		ctl &= ~OWL_UART_CTL_STPS_2BITS;
+
+	ctl &= ~OWL_UART_CTL_PRS_MASK;
+	if (termios->c_cflag & PARENB) {
+		if (termios->c_cflag & CMSPAR) {
+			if (termios->c_cflag & PARODD)
+				ctl |= OWL_UART_CTL_PRS_MARK;
+			else
+				ctl |= OWL_UART_CTL_PRS_SPACE;
+		} else if (termios->c_cflag & PARODD)
+			ctl |= OWL_UART_CTL_PRS_ODD;
+		else
+			ctl |= OWL_UART_CTL_PRS_EVEN;
+	} else
+		ctl |= OWL_UART_CTL_PRS_NONE;
+
+	if (termios->c_cflag & CRTSCTS)
+		ctl |= OWL_UART_CTL_AFE;
+	else
+		ctl &= ~OWL_UART_CTL_AFE;
+
+	owl_uart_write(port, ctl, OWL_UART_CTL);
+
+	baud = uart_get_baud_rate(port, termios, old, 9600, 3200000);
+	owl_uart_change_baudrate(owl_port, baud);
+
+	/* Don't rewrite B0 */
+	if (tty_termios_baud_rate(termios))
+		tty_termios_encode_baud_rate(termios, baud, baud);
+
+	port->read_status_mask |= OWL_UART_STAT_RXER;
+	if (termios->c_iflag & INPCK)
+		port->read_status_mask |= OWL_UART_STAT_RXST;
+
+	uart_update_timeout(port, termios->c_cflag, baud);
+
+	spin_unlock_irqrestore(&port->lock, flags);
+}
+
+static void owl_uart_release_port(struct uart_port *port)
+{
+	struct platform_device *pdev = to_platform_device(port->dev);
+	struct resource *res;
+
+	res = platform_get_resource(pdev, IORESOURCE_MEM, 0);
+	if (!res)
+		return;
+
+	if (port->flags & UPF_IOREMAP) {
+		devm_release_mem_region(port->dev, port->mapbase,
+			resource_size(res));
+		devm_iounmap(port->dev, port->membase);
+		port->membase = NULL;
+	}
+}
+
+static int owl_uart_request_port(struct uart_port *port)
+{
+	struct platform_device *pdev = to_platform_device(port->dev);
+	struct resource *res;
+
+	res = platform_get_resource(pdev, IORESOURCE_MEM, 0);
+	if (!res)
+		return -ENXIO;
+
+	if (!devm_request_mem_region(port->dev, port->mapbase,
+			resource_size(res), dev_name(port->dev)))
+		return -EBUSY;
+
+	if (port->flags & UPF_IOREMAP) {
+		port->membase = devm_ioremap_nocache(port->dev, port->mapbase,
+				resource_size(res));
+		if (!port->membase)
+			return -EBUSY;
+	}
+
+	return 0;
+}
+
+static const char *owl_uart_type(struct uart_port *port)
+{
+	return (port->type == PORT_OWL) ? "owl-uart" : NULL;
+}
+
+static int owl_uart_verify_port(struct uart_port *port,
+				struct serial_struct *ser)
+{
+	if (port->type != PORT_OWL)
+		return -EINVAL;
+
+	if (port->irq != ser->irq)
+		return -EINVAL;
+
+	return 0;
+}
+
+static void owl_uart_config_port(struct uart_port *port, int flags)
+{
+	if (flags & UART_CONFIG_TYPE) {
+		port->type = PORT_OWL;
+		owl_uart_request_port(port);
+	}
+}
+
+static struct uart_ops owl_uart_ops = {
+	.set_mctrl = owl_uart_set_mctrl,
+	.get_mctrl = owl_uart_get_mctrl,
+	.tx_empty = owl_uart_tx_empty,
+	.start_tx = owl_uart_start_tx,
+	.stop_rx = owl_uart_stop_rx,
+	.stop_tx = owl_uart_stop_tx,
+	.startup = owl_uart_startup,
+	.shutdown = owl_uart_shutdown,
+	.set_termios = owl_uart_set_termios,
+	.type = owl_uart_type,
+	.config_port = owl_uart_config_port,
+	.request_port = owl_uart_request_port,
+	.release_port = owl_uart_release_port,
+	.verify_port = owl_uart_verify_port,
+};
+
 #ifdef CONFIG_SERIAL_OWL_CONSOLE
 
 static void owl_console_putchar(struct uart_port *port, int ch)
@@ -110,6 +546,57 @@ static void owl_uart_port_write(struct uart_port *port, const char *s,
 	local_irq_restore(flags);
 }
 
+static void owl_uart_console_write(struct console *co, const char *s,
+				   u_int count)
+{
+	struct owl_uart_port *owl_port;
+
+	owl_port = owl_uart_ports[co->index];
+	if (!owl_port)
+		return;
+
+	owl_uart_port_write(&owl_port->port, s, count);
+}
+
+static int owl_uart_console_setup(struct console *co, char *options)
+{
+	struct owl_uart_port *owl_port;
+	int baud = 115200;
+	int bits = 8;
+	int parity = 'n';
+	int flow = 'n';
+
+	if (co->index < 0 || co->index >= OWL_UART_PORT_NUM)
+		return -EINVAL;
+
+	owl_port = owl_uart_ports[co->index];
+	if (!owl_port || !owl_port->port.membase)
+		return -ENODEV;
+
+	if (options)
+		uart_parse_options(options, &baud, &parity, &bits, &flow);
+
+	return uart_set_options(&owl_port->port, co, baud, parity, bits, flow);
+}
+
+static struct console owl_uart_console = {
+	.name = OWL_UART_DEV_NAME,
+	.write = owl_uart_console_write,
+	.device = uart_console_device,
+	.setup = owl_uart_console_setup,
+	.flags = CON_PRINTBUFFER,
+	.index = -1,
+	.data = &owl_uart_driver,
+};
+
+static int __init owl_uart_console_init(void)
+{
+	register_console(&owl_uart_console);
+
+	return 0;
+}
+console_initcall(owl_uart_console_init);
+
 static void owl_uart_early_console_write(struct console *co,
 					 const char *s,
 					 u_int count)
@@ -132,4 +619,148 @@ owl_uart_early_console_setup(struct earlycon_device *device, const char *opt)
 OF_EARLYCON_DECLARE(owl, "actions,owl-uart",
 		    owl_uart_early_console_setup);
 
-#endif /* CONFIG_SERIAL_OWL_CONSOLE */
+#define OWL_UART_CONSOLE (&owl_uart_console)
+#else
+#define OWL_UART_CONSOLE NULL
+#endif
+
+static struct uart_driver owl_uart_driver = {
+	.owner = THIS_MODULE,
+	.driver_name = "owl-uart",
+	.dev_name = OWL_UART_DEV_NAME,
+	.nr = OWL_UART_PORT_NUM,
+	.cons = OWL_UART_CONSOLE,
+};
+
+static const struct owl_uart_info owl_s500_info = {
+	.tx_fifosize = 16,
+};
+
+static const struct owl_uart_info owl_s900_info = {
+	.tx_fifosize = 32,
+};
+
+static const struct of_device_id owl_uart_dt_matches[] = {
+	{ .compatible = "actions,s500-uart", .data = &owl_s500_info },
+	{ .compatible = "actions,s900-uart", .data = &owl_s900_info },
+	{ }
+};
+MODULE_DEVICE_TABLE(of, owl_uart_dt_matches);
+
+static int owl_uart_probe(struct platform_device *pdev)
+{
+	const struct of_device_id *match;
+	const struct owl_uart_info *info = NULL;
+	struct resource *res_mem;
+	struct owl_uart_port *owl_port;
+	int ret, irq;
+
+	if (pdev->dev.of_node) {
+		pdev->id = of_alias_get_id(pdev->dev.of_node, "serial");
+		match = of_match_node(owl_uart_dt_matches, pdev->dev.of_node);
+		if (match)
+			info = match->data;
+	}
+
+	if (pdev->id < 0 || pdev->id >= OWL_UART_PORT_NUM) {
+		dev_err(&pdev->dev, "id %d out of range\n", pdev->id);
+		return -EINVAL;
+	}
+
+	res_mem = platform_get_resource(pdev, IORESOURCE_MEM, 0);
+	if (!res_mem) {
+		dev_err(&pdev->dev, "could not get mem\n");
+		return -ENODEV;
+	}
+
+	irq = platform_get_irq(pdev, 0);
+	if (irq < 0) {
+		dev_err(&pdev->dev, "could not get irq\n");
+		return irq;
+	}
+
+	if (owl_uart_ports[pdev->id]) {
+		dev_err(&pdev->dev, "port %d already allocated\n", pdev->id);
+		return -EBUSY;
+	}
+
+	owl_port = devm_kzalloc(&pdev->dev, sizeof(*owl_port), GFP_KERNEL);
+	if (!owl_port)
+		return -ENOMEM;
+
+	owl_port->clk = devm_clk_get(&pdev->dev, NULL);
+	if (IS_ERR(owl_port->clk)) {
+		dev_err(&pdev->dev, "could not get clk\n");
+		return PTR_ERR(owl_port->clk);
+	}
+
+	owl_port->port.dev = &pdev->dev;
+	owl_port->port.line = pdev->id;
+	owl_port->port.type = PORT_OWL;
+	owl_port->port.iotype = UPIO_MEM;
+	owl_port->port.mapbase = res_mem->start;
+	owl_port->port.irq = irq;
+	owl_port->port.uartclk = clk_get_rate(owl_port->clk);
+	if (owl_port->port.uartclk == 0) {
+		dev_err(&pdev->dev, "clock rate is zero\n");
+		return -EINVAL;
+	}
+	owl_port->port.flags = UPF_BOOT_AUTOCONF | UPF_IOREMAP | UPF_LOW_LATENCY;
+	owl_port->port.x_char = 0;
+	owl_port->port.fifosize = (info) ? info->tx_fifosize : 16;
+	owl_port->port.ops = &owl_uart_ops;
+
+	owl_uart_ports[pdev->id] = owl_port;
+	platform_set_drvdata(pdev, owl_port);
+
+	ret = uart_add_one_port(&owl_uart_driver, &owl_port->port);
+	if (ret)
+		owl_uart_ports[pdev->id] = NULL;
+
+	return ret;
+}
+
+static int owl_uart_remove(struct platform_device *pdev)
+{
+	struct owl_uart_port *owl_port = platform_get_drvdata(pdev);
+
+	uart_remove_one_port(&owl_uart_driver, &owl_port->port);
+	owl_uart_ports[pdev->id] = NULL;
+
+	return 0;
+}
+
+static struct platform_driver owl_uart_platform_driver = {
+	.probe = owl_uart_probe,
+	.remove = owl_uart_remove,
+	.driver = {
+		.name = "owl-uart",
+		.of_match_table = owl_uart_dt_matches,
+	},
+};
+
+static int __init owl_uart_init(void)
+{
+	int ret;
+
+	ret = uart_register_driver(&owl_uart_driver);
+	if (ret)
+		return ret;
+
+	ret = platform_driver_register(&owl_uart_platform_driver);
+	if (ret)
+		uart_unregister_driver(&owl_uart_driver);
+
+	return ret;
+}
+
+static void __init owl_uart_exit(void)
+{
+	platform_driver_unregister(&owl_uart_platform_driver);
+	uart_unregister_driver(&owl_uart_driver);
+}
+
+module_init(owl_uart_init);
+module_exit(owl_uart_exit);
+
+MODULE_LICENSE("GPL");
diff --git a/include/uapi/linux/serial_core.h b/include/uapi/linux/serial_core.h
index c34a2a3eeff5..38bea3217ead 100644
--- a/include/uapi/linux/serial_core.h
+++ b/include/uapi/linux/serial_core.h
@@ -70,6 +70,7 @@
 #define PORT_CLPS711X	33
 #define PORT_SA1100	34
 #define PORT_UART00	35
+#define PORT_OWL	36
 #define PORT_21285	37
 
 /* Sparc type numbers.  */
-- 
cgit v1.2.3-71-gd317


From 6303d97873d340e89acdef12effb66f88d79836f Mon Sep 17 00:00:00 2001
From: Hans Verkuil <hans.verkuil@cisco.com>
Date: Tue, 11 Jul 2017 03:30:38 -0300
Subject: media: linux/cec.h: add pin monitoring API support

Add support for low-level CEC pin monitoring. This adds a new monitor
mode, a new capability and two new events.

Signed-off-by: Hans Verkuil <hans.verkuil@cisco.com>
Reviewed-by: Maxime Ripard <maxime.ripard@free-electrons.com>
Signed-off-by: Mauro Carvalho Chehab <mchehab@s-opensource.com>
---
 include/uapi/linux/cec.h | 5 +++++
 1 file changed, 5 insertions(+)

(limited to 'include/uapi/linux')

diff --git a/include/uapi/linux/cec.h b/include/uapi/linux/cec.h
index 44579a24f95d..bba73f33c8aa 100644
--- a/include/uapi/linux/cec.h
+++ b/include/uapi/linux/cec.h
@@ -318,6 +318,7 @@ static inline int cec_is_unconfigured(__u16 log_addr_mask)
 #define CEC_MODE_FOLLOWER		(0x1 << 4)
 #define CEC_MODE_EXCL_FOLLOWER		(0x2 << 4)
 #define CEC_MODE_EXCL_FOLLOWER_PASSTHRU	(0x3 << 4)
+#define CEC_MODE_MONITOR_PIN		(0xd << 4)
 #define CEC_MODE_MONITOR		(0xe << 4)
 #define CEC_MODE_MONITOR_ALL		(0xf << 4)
 #define CEC_MODE_FOLLOWER_MSK		0xf0
@@ -338,6 +339,8 @@ static inline int cec_is_unconfigured(__u16 log_addr_mask)
 #define CEC_CAP_MONITOR_ALL	(1 << 5)
 /* Hardware can use CEC only if the HDMI HPD pin is high. */
 #define CEC_CAP_NEEDS_HPD	(1 << 6)
+/* Hardware can monitor CEC pin transitions */
+#define CEC_CAP_MONITOR_PIN	(1 << 7)
 
 /**
  * struct cec_caps - CEC capabilities structure.
@@ -405,6 +408,8 @@ struct cec_log_addrs {
  * didn't empty the message queue in time
  */
 #define CEC_EVENT_LOST_MSGS		2
+#define CEC_EVENT_PIN_LOW		3
+#define CEC_EVENT_PIN_HIGH		4
 
 #define CEC_EVENT_FL_INITIAL_STATE	(1 << 0)
 
-- 
cgit v1.2.3-71-gd317


From 6b2bbb08747a56dcf4ee33606a06025eca571260 Mon Sep 17 00:00:00 2001
From: Hans Verkuil <hans.verkuil@cisco.com>
Date: Tue, 11 Jul 2017 03:30:39 -0300
Subject: media: cec: rework the cec event handling

Event handling was always fairly simplistic since there were only
two events. With the addition of pin events this needed to be redesigned.

The state_change and lost_msgs events are now core events with the
guarantee that the last state is always available. The new pin events
are a queue of events (up to 64 for each event) and the oldest event
will be dropped if the application cannot keep up. Lost events are
marked with a new event flag.

Signed-off-by: Hans Verkuil <hans.verkuil@cisco.com>
Reviewed-by: Maxime Ripard <maxime.ripard@free-electrons.com>
Signed-off-by: Mauro Carvalho Chehab <mchehab@s-opensource.com>
---
 drivers/media/cec/cec-adap.c | 128 +++++++++++++++++++++++++------------------
 drivers/media/cec/cec-api.c  |  48 +++++++++++-----
 include/media/cec.h          |  14 ++++-
 include/uapi/linux/cec.h     |   3 +-
 4 files changed, 124 insertions(+), 69 deletions(-)

(limited to 'include/uapi/linux')

diff --git a/drivers/media/cec/cec-adap.c b/drivers/media/cec/cec-adap.c
index 82c1633f5b92..5080d7aa6105 100644
--- a/drivers/media/cec/cec-adap.c
+++ b/drivers/media/cec/cec-adap.c
@@ -78,42 +78,62 @@ static unsigned int cec_log_addr2dev(const struct cec_adapter *adap, u8 log_addr
  * Queue a new event for this filehandle. If ts == 0, then set it
  * to the current time.
  *
- * The two events that are currently defined do not need to keep track
- * of intermediate events, so no actual queue of events is needed,
- * instead just store the latest state and the total number of lost
- * messages.
- *
- * Should new events be added in the future that require intermediate
- * results to be queued as well, then a proper queue data structure is
- * required. But until then, just keep it simple.
+ * We keep a queue of at most max_event events where max_event differs
+ * per event. If the queue becomes full, then drop the oldest event and
+ * keep track of how many events we've dropped.
  */
 void cec_queue_event_fh(struct cec_fh *fh,
 			const struct cec_event *new_ev, u64 ts)
 {
-	struct cec_event *ev = &fh->events[new_ev->event - 1];
+	static const u8 max_events[CEC_NUM_EVENTS] = {
+		1, 1, 64, 64,
+	};
+	struct cec_event_entry *entry;
+	unsigned int ev_idx = new_ev->event - 1;
+
+	if (WARN_ON(ev_idx >= ARRAY_SIZE(fh->events)))
+		return;
 
 	if (ts == 0)
 		ts = ktime_get_ns();
 
 	mutex_lock(&fh->lock);
-	if (new_ev->event == CEC_EVENT_LOST_MSGS &&
-	    fh->pending_events & (1 << new_ev->event)) {
-		/*
-		 * If there is already a lost_msgs event, then just
-		 * update the lost_msgs count. This effectively
-		 * merges the old and new events into one.
-		 */
-		ev->lost_msgs.lost_msgs += new_ev->lost_msgs.lost_msgs;
-		goto unlock;
-	}
+	if (ev_idx < CEC_NUM_CORE_EVENTS)
+		entry = &fh->core_events[ev_idx];
+	else
+		entry = kmalloc(sizeof(*entry), GFP_KERNEL);
+	if (entry) {
+		if (new_ev->event == CEC_EVENT_LOST_MSGS &&
+		    fh->queued_events[ev_idx]) {
+			entry->ev.lost_msgs.lost_msgs +=
+				new_ev->lost_msgs.lost_msgs;
+			goto unlock;
+		}
+		entry->ev = *new_ev;
+		entry->ev.ts = ts;
+
+		if (fh->queued_events[ev_idx] < max_events[ev_idx]) {
+			/* Add new msg at the end of the queue */
+			list_add_tail(&entry->list, &fh->events[ev_idx]);
+			fh->queued_events[ev_idx]++;
+			fh->total_queued_events++;
+			goto unlock;
+		}
 
-	/*
-	 * Intermediate states are not interesting, so just
-	 * overwrite any older event.
-	 */
-	*ev = *new_ev;
-	ev->ts = ts;
-	fh->pending_events |= 1 << new_ev->event;
+		if (ev_idx >= CEC_NUM_CORE_EVENTS) {
+			list_add_tail(&entry->list, &fh->events[ev_idx]);
+			/* drop the oldest event */
+			entry = list_first_entry(&fh->events[ev_idx],
+						 struct cec_event_entry, list);
+			list_del(&entry->list);
+			kfree(entry);
+		}
+	}
+	/* Mark that events were lost */
+	entry = list_first_entry_or_null(&fh->events[ev_idx],
+					 struct cec_event_entry, list);
+	if (entry)
+		entry->ev.flags |= CEC_EVENT_FL_DROPPED_EVENTS;
 
 unlock:
 	mutex_unlock(&fh->lock);
@@ -134,46 +154,50 @@ static void cec_queue_event(struct cec_adapter *adap,
 }
 
 /*
- * Queue a new message for this filehandle. If there is no more room
- * in the queue, then send the LOST_MSGS event instead.
+ * Queue a new message for this filehandle.
+ *
+ * We keep a queue of at most CEC_MAX_MSG_RX_QUEUE_SZ messages. If the
+ * queue becomes full, then drop the oldest message and keep track
+ * of how many messages we've dropped.
  */
 static void cec_queue_msg_fh(struct cec_fh *fh, const struct cec_msg *msg)
 {
-	static const struct cec_event ev_lost_msg = {
-		.ts = 0,
+	static const struct cec_event ev_lost_msgs = {
 		.event = CEC_EVENT_LOST_MSGS,
-		.flags = 0,
-		{
-			.lost_msgs.lost_msgs = 1,
-		},
+		.lost_msgs.lost_msgs = 1,
 	};
 	struct cec_msg_entry *entry;
 
 	mutex_lock(&fh->lock);
 	entry = kmalloc(sizeof(*entry), GFP_KERNEL);
-	if (!entry)
-		goto lost_msgs;
-
-	entry->msg = *msg;
-	/* Add new msg at the end of the queue */
-	list_add_tail(&entry->list, &fh->msgs);
+	if (entry) {
+		entry->msg = *msg;
+		/* Add new msg at the end of the queue */
+		list_add_tail(&entry->list, &fh->msgs);
+
+		if (fh->queued_msgs < CEC_MAX_MSG_RX_QUEUE_SZ) {
+			/* All is fine if there is enough room */
+			fh->queued_msgs++;
+			mutex_unlock(&fh->lock);
+			wake_up_interruptible(&fh->wait);
+			return;
+		}
 
-	/*
-	 * if the queue now has more than CEC_MAX_MSG_RX_QUEUE_SZ
-	 * messages, drop the oldest one and send a lost message event.
-	 */
-	if (fh->queued_msgs == CEC_MAX_MSG_RX_QUEUE_SZ) {
+		/*
+		 * if the message queue is full, then drop the oldest one and
+		 * send a lost message event.
+		 */
+		entry = list_first_entry(&fh->msgs, struct cec_msg_entry, list);
 		list_del(&entry->list);
-		goto lost_msgs;
+		kfree(entry);
 	}
-	fh->queued_msgs++;
 	mutex_unlock(&fh->lock);
-	wake_up_interruptible(&fh->wait);
-	return;
 
-lost_msgs:
-	mutex_unlock(&fh->lock);
-	cec_queue_event_fh(fh, &ev_lost_msg, 0);
+	/*
+	 * We lost a message, either because kmalloc failed or the queue
+	 * was full.
+	 */
+	cec_queue_event_fh(fh, &ev_lost_msgs, ktime_get_ns());
 }
 
 /*
diff --git a/drivers/media/cec/cec-api.c b/drivers/media/cec/cec-api.c
index f7eb4c54a354..48bef1c718ad 100644
--- a/drivers/media/cec/cec-api.c
+++ b/drivers/media/cec/cec-api.c
@@ -57,7 +57,7 @@ static unsigned int cec_poll(struct file *filp,
 		res |= POLLOUT | POLLWRNORM;
 	if (fh->queued_msgs)
 		res |= POLLIN | POLLRDNORM;
-	if (fh->pending_events)
+	if (fh->total_queued_events)
 		res |= POLLPRI;
 	poll_wait(filp, &fh->wait, poll);
 	mutex_unlock(&adap->lock);
@@ -289,15 +289,17 @@ static long cec_receive(struct cec_adapter *adap, struct cec_fh *fh,
 static long cec_dqevent(struct cec_adapter *adap, struct cec_fh *fh,
 			bool block, struct cec_event __user *parg)
 {
-	struct cec_event *ev = NULL;
+	struct cec_event_entry *ev = NULL;
 	u64 ts = ~0ULL;
 	unsigned int i;
+	unsigned int ev_idx;
 	long err = 0;
 
 	mutex_lock(&fh->lock);
-	while (!fh->pending_events && block) {
+	while (!fh->total_queued_events && block) {
 		mutex_unlock(&fh->lock);
-		err = wait_event_interruptible(fh->wait, fh->pending_events);
+		err = wait_event_interruptible(fh->wait,
+					       fh->total_queued_events);
 		if (err)
 			return err;
 		mutex_lock(&fh->lock);
@@ -305,23 +307,29 @@ static long cec_dqevent(struct cec_adapter *adap, struct cec_fh *fh,
 
 	/* Find the oldest event */
 	for (i = 0; i < CEC_NUM_EVENTS; i++) {
-		if (fh->pending_events & (1 << (i + 1)) &&
-		    fh->events[i].ts <= ts) {
-			ev = &fh->events[i];
-			ts = ev->ts;
+		struct cec_event_entry *entry =
+			list_first_entry_or_null(&fh->events[i],
+						 struct cec_event_entry, list);
+
+		if (entry && entry->ev.ts <= ts) {
+			ev = entry;
+			ev_idx = i;
+			ts = ev->ev.ts;
 		}
 	}
+
 	if (!ev) {
 		err = -EAGAIN;
 		goto unlock;
 	}
+	list_del(&ev->list);
 
-	if (copy_to_user(parg, ev, sizeof(*ev))) {
+	if (copy_to_user(parg, &ev->ev, sizeof(ev->ev)))
 		err = -EFAULT;
-		goto unlock;
-	}
-
-	fh->pending_events &= ~(1 << ev->event);
+	if (ev_idx >= CEC_NUM_CORE_EVENTS)
+		kfree(ev);
+	fh->queued_events[ev_idx]--;
+	fh->total_queued_events--;
 
 unlock:
 	mutex_unlock(&fh->lock);
@@ -495,6 +503,7 @@ static int cec_open(struct inode *inode, struct file *filp)
 		.event = CEC_EVENT_STATE_CHANGE,
 		.flags = CEC_EVENT_FL_INITIAL_STATE,
 	};
+	unsigned int i;
 	int err;
 
 	if (!fh)
@@ -502,6 +511,8 @@ static int cec_open(struct inode *inode, struct file *filp)
 
 	INIT_LIST_HEAD(&fh->msgs);
 	INIT_LIST_HEAD(&fh->xfer_list);
+	for (i = 0; i < CEC_NUM_EVENTS; i++)
+		INIT_LIST_HEAD(&fh->events[i]);
 	mutex_init(&fh->lock);
 	init_waitqueue_head(&fh->wait);
 
@@ -544,6 +555,7 @@ static int cec_release(struct inode *inode, struct file *filp)
 	struct cec_devnode *devnode = cec_devnode_data(filp);
 	struct cec_adapter *adap = to_cec_adapter(devnode);
 	struct cec_fh *fh = filp->private_data;
+	unsigned int i;
 
 	mutex_lock(&adap->lock);
 	if (adap->cec_initiator == fh)
@@ -585,6 +597,16 @@ static int cec_release(struct inode *inode, struct file *filp)
 		list_del(&entry->list);
 		kfree(entry);
 	}
+	for (i = CEC_NUM_CORE_EVENTS; i < CEC_NUM_EVENTS; i++) {
+		while (!list_empty(&fh->events[i])) {
+			struct cec_event_entry *entry =
+				list_first_entry(&fh->events[i],
+						 struct cec_event_entry, list);
+
+			list_del(&entry->list);
+			kfree(entry);
+		}
+	}
 	kfree(fh);
 
 	cec_put_device(devnode);
diff --git a/include/media/cec.h b/include/media/cec.h
index 37768203572d..6cc862af74e5 100644
--- a/include/media/cec.h
+++ b/include/media/cec.h
@@ -81,7 +81,13 @@ struct cec_msg_entry {
 	struct cec_msg		msg;
 };
 
-#define CEC_NUM_EVENTS		CEC_EVENT_LOST_MSGS
+struct cec_event_entry {
+	struct list_head	list;
+	struct cec_event	ev;
+};
+
+#define CEC_NUM_CORE_EVENTS 2
+#define CEC_NUM_EVENTS CEC_EVENT_PIN_HIGH
 
 struct cec_fh {
 	struct list_head	list;
@@ -92,9 +98,11 @@ struct cec_fh {
 
 	/* Events */
 	wait_queue_head_t	wait;
-	unsigned int		pending_events;
-	struct cec_event	events[CEC_NUM_EVENTS];
 	struct mutex		lock;
+	struct list_head	events[CEC_NUM_EVENTS]; /* queued events */
+	u8			queued_events[CEC_NUM_EVENTS];
+	unsigned int		total_queued_events;
+	struct cec_event_entry	core_events[CEC_NUM_CORE_EVENTS];
 	struct list_head	msgs; /* queued messages */
 	unsigned int		queued_msgs;
 };
diff --git a/include/uapi/linux/cec.h b/include/uapi/linux/cec.h
index bba73f33c8aa..d87a67b0bb06 100644
--- a/include/uapi/linux/cec.h
+++ b/include/uapi/linux/cec.h
@@ -412,6 +412,7 @@ struct cec_log_addrs {
 #define CEC_EVENT_PIN_HIGH		4
 
 #define CEC_EVENT_FL_INITIAL_STATE	(1 << 0)
+#define CEC_EVENT_FL_DROPPED_EVENTS	(1 << 1)
 
 /**
  * struct cec_event_state_change - used when the CEC adapter changes state.
@@ -424,7 +425,7 @@ struct cec_event_state_change {
 };
 
 /**
- * struct cec_event_lost_msgs - tells you how many messages were lost due.
+ * struct cec_event_lost_msgs - tells you how many messages were lost.
  * @lost_msgs: how many messages were lost.
  */
 struct cec_event_lost_msgs {
-- 
cgit v1.2.3-71-gd317


From eb0baf8a0d9259d168523b8e7c436b55ade7c546 Mon Sep 17 00:00:00 2001
From: Jin Yao <yao.jin@linux.intel.com>
Date: Tue, 18 Jul 2017 20:13:09 +0800
Subject: perf/core: Define the common branch type classification
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

It is often useful to know the branch types while analyzing branch data.
For example, a call is very different from a conditional branch.

Currently we have to look it up in binary while the binary may later not
be available and even the binary is available but user has to take some
time. It is very useful for user to check it directly in perf report.

Perf already has support for disassembling the branch instruction to get
the x86 branch type.

To keep consistent on kernel and userspace and make the classification
more common, the patch adds the common branch type classification
in perf_event.h.

The patch only defines a minimum but most common set of branch types.

PERF_BR_UNKNOWN         : unknown
PERF_BR_COND            ：conditional
PERF_BR_UNCOND          : unconditional
PERF_BR_IND             : indirect
PERF_BR_CALL            : function call
PERF_BR_IND_CALL        : indirect function call
PERF_BR_RET             : function return
PERF_BR_SYSCALL         : syscall
PERF_BR_SYSRET          : syscall return
PERF_BR_COND_CALL       : conditional function call
PERF_BR_COND_RET        : conditional function return

The patch also adds a new field type (4 bits) in perf_branch_entry
to record the branch type.

Since the disassembling of branch instruction needs some overhead,
a new PERF_SAMPLE_BRANCH_TYPE_SAVE is introduced to indicate if it
needs to disassemble the branch instruction and record the branch
type.

Change log:

v10: Not changed.

v9: Not changed.

v8: Change PERF_BR_NONE to PERF_BR_UNKNOWN.
    No other change.

v7: Just keep the most common branch types.
    Others are removed.

v6: Not changed.

v5: Not changed. The v5 patch series just change the userspace.

v4: Comparing to previous version, the major changes are:

1. Remove the PERF_BR_JCC_FWD/PERF_BR_JCC_BWD, they will be
   computed later in userspace.

2. Remove the "cross" field in perf_branch_entry. The cross page
   computing will be done later in userspace.

Signed-off-by: Yao Jin <yao.jin@linux.intel.com>
Acked-by: Jiri Olsa <jolsa@kernel.org>
Acked-by: Michael Ellerman <mpe@ellerman.id.au>
Acked-by: Peter Zijlstra <peterz@infradead.org>
Cc: Alexander Shishkin <alexander.shishkin@linux.intel.com>
Cc: Andi Kleen <ak@linux.intel.com>
Cc: Kan Liang <kan.liang@intel.com>
Link: http://lkml.kernel.org/r/1500379995-6449-2-git-send-email-yao.jin@linux.intel.com
Signed-off-by: Arnaldo Carvalho de Melo <acme@redhat.com>
---
 include/uapi/linux/perf_event.h       | 27 ++++++++++++++++++++++++++-
 tools/include/uapi/linux/perf_event.h | 27 ++++++++++++++++++++++++++-
 2 files changed, 52 insertions(+), 2 deletions(-)

(limited to 'include/uapi/linux')

diff --git a/include/uapi/linux/perf_event.h b/include/uapi/linux/perf_event.h
index b1c0b187acfe..642db5fa3286 100644
--- a/include/uapi/linux/perf_event.h
+++ b/include/uapi/linux/perf_event.h
@@ -174,6 +174,8 @@ enum perf_branch_sample_type_shift {
 	PERF_SAMPLE_BRANCH_NO_FLAGS_SHIFT	= 14, /* no flags */
 	PERF_SAMPLE_BRANCH_NO_CYCLES_SHIFT	= 15, /* no cycles */
 
+	PERF_SAMPLE_BRANCH_TYPE_SAVE_SHIFT	= 16, /* save branch type */
+
 	PERF_SAMPLE_BRANCH_MAX_SHIFT		/* non-ABI */
 };
 
@@ -198,9 +200,30 @@ enum perf_branch_sample_type {
 	PERF_SAMPLE_BRANCH_NO_FLAGS	= 1U << PERF_SAMPLE_BRANCH_NO_FLAGS_SHIFT,
 	PERF_SAMPLE_BRANCH_NO_CYCLES	= 1U << PERF_SAMPLE_BRANCH_NO_CYCLES_SHIFT,
 
+	PERF_SAMPLE_BRANCH_TYPE_SAVE	=
+		1U << PERF_SAMPLE_BRANCH_TYPE_SAVE_SHIFT,
+
 	PERF_SAMPLE_BRANCH_MAX		= 1U << PERF_SAMPLE_BRANCH_MAX_SHIFT,
 };
 
+/*
+ * Common flow change classification
+ */
+enum {
+	PERF_BR_UNKNOWN		= 0,	/* unknown */
+	PERF_BR_COND		= 1,	/* conditional */
+	PERF_BR_UNCOND		= 2,	/* unconditional  */
+	PERF_BR_IND		= 3,	/* indirect */
+	PERF_BR_CALL		= 4,	/* function call */
+	PERF_BR_IND_CALL	= 5,	/* indirect function call */
+	PERF_BR_RET		= 6,	/* function return */
+	PERF_BR_SYSCALL		= 7,	/* syscall */
+	PERF_BR_SYSRET		= 8,	/* syscall return */
+	PERF_BR_COND_CALL	= 9,	/* conditional function call */
+	PERF_BR_COND_RET	= 10,	/* conditional function return */
+	PERF_BR_MAX,
+};
+
 #define PERF_SAMPLE_BRANCH_PLM_ALL \
 	(PERF_SAMPLE_BRANCH_USER|\
 	 PERF_SAMPLE_BRANCH_KERNEL|\
@@ -1015,6 +1038,7 @@ union perf_mem_data_src {
  *     in_tx: running in a hardware transaction
  *     abort: aborting a hardware transaction
  *    cycles: cycles from last branch (or 0 if not supported)
+ *      type: branch type
  */
 struct perf_branch_entry {
 	__u64	from;
@@ -1024,7 +1048,8 @@ struct perf_branch_entry {
 		in_tx:1,    /* in transaction */
 		abort:1,    /* transaction abort */
 		cycles:16,  /* cycle count to last branch */
-		reserved:44;
+		type:4,     /* branch type */
+		reserved:40;
 };
 
 #endif /* _UAPI_LINUX_PERF_EVENT_H */
diff --git a/tools/include/uapi/linux/perf_event.h b/tools/include/uapi/linux/perf_event.h
index b1c0b187acfe..642db5fa3286 100644
--- a/tools/include/uapi/linux/perf_event.h
+++ b/tools/include/uapi/linux/perf_event.h
@@ -174,6 +174,8 @@ enum perf_branch_sample_type_shift {
 	PERF_SAMPLE_BRANCH_NO_FLAGS_SHIFT	= 14, /* no flags */
 	PERF_SAMPLE_BRANCH_NO_CYCLES_SHIFT	= 15, /* no cycles */
 
+	PERF_SAMPLE_BRANCH_TYPE_SAVE_SHIFT	= 16, /* save branch type */
+
 	PERF_SAMPLE_BRANCH_MAX_SHIFT		/* non-ABI */
 };
 
@@ -198,9 +200,30 @@ enum perf_branch_sample_type {
 	PERF_SAMPLE_BRANCH_NO_FLAGS	= 1U << PERF_SAMPLE_BRANCH_NO_FLAGS_SHIFT,
 	PERF_SAMPLE_BRANCH_NO_CYCLES	= 1U << PERF_SAMPLE_BRANCH_NO_CYCLES_SHIFT,
 
+	PERF_SAMPLE_BRANCH_TYPE_SAVE	=
+		1U << PERF_SAMPLE_BRANCH_TYPE_SAVE_SHIFT,
+
 	PERF_SAMPLE_BRANCH_MAX		= 1U << PERF_SAMPLE_BRANCH_MAX_SHIFT,
 };
 
+/*
+ * Common flow change classification
+ */
+enum {
+	PERF_BR_UNKNOWN		= 0,	/* unknown */
+	PERF_BR_COND		= 1,	/* conditional */
+	PERF_BR_UNCOND		= 2,	/* unconditional  */
+	PERF_BR_IND		= 3,	/* indirect */
+	PERF_BR_CALL		= 4,	/* function call */
+	PERF_BR_IND_CALL	= 5,	/* indirect function call */
+	PERF_BR_RET		= 6,	/* function return */
+	PERF_BR_SYSCALL		= 7,	/* syscall */
+	PERF_BR_SYSRET		= 8,	/* syscall return */
+	PERF_BR_COND_CALL	= 9,	/* conditional function call */
+	PERF_BR_COND_RET	= 10,	/* conditional function return */
+	PERF_BR_MAX,
+};
+
 #define PERF_SAMPLE_BRANCH_PLM_ALL \
 	(PERF_SAMPLE_BRANCH_USER|\
 	 PERF_SAMPLE_BRANCH_KERNEL|\
@@ -1015,6 +1038,7 @@ union perf_mem_data_src {
  *     in_tx: running in a hardware transaction
  *     abort: aborting a hardware transaction
  *    cycles: cycles from last branch (or 0 if not supported)
+ *      type: branch type
  */
 struct perf_branch_entry {
 	__u64	from;
@@ -1024,7 +1048,8 @@ struct perf_branch_entry {
 		in_tx:1,    /* in transaction */
 		abort:1,    /* transaction abort */
 		cycles:16,  /* cycle count to last branch */
-		reserved:44;
+		type:4,     /* branch type */
+		reserved:40;
 };
 
 #endif /* _UAPI_LINUX_PERF_EVENT_H */
-- 
cgit v1.2.3-71-gd317


From 727f8914477e4642c7d1ff381667cdc4178b40c6 Mon Sep 17 00:00:00 2001
From: David Howells <dhowells@redhat.com>
Date: Fri, 21 Jul 2017 10:39:26 +0100
Subject: rxrpc: Expose UAPI definitions to userspace

Move UAPI definitions from the internal header and place them in a UAPI
header file so that userspace can make use of them.

Signed-off-by: David Howells <dhowells@redhat.com>
---
 include/linux/rxrpc.h      | 79 ---------------------------------------------
 include/uapi/linux/rxrpc.h | 80 ++++++++++++++++++++++++++++++++++++++++++++++
 2 files changed, 80 insertions(+), 79 deletions(-)
 delete mode 100644 include/linux/rxrpc.h
 create mode 100644 include/uapi/linux/rxrpc.h

(limited to 'include/uapi/linux')

diff --git a/include/linux/rxrpc.h b/include/linux/rxrpc.h
deleted file mode 100644
index 7343f71783dc..000000000000
--- a/include/linux/rxrpc.h
+++ /dev/null
@@ -1,79 +0,0 @@
-/* AF_RXRPC parameters
- *
- * Copyright (C) 2007 Red Hat, Inc. All Rights Reserved.
- * Written by David Howells (dhowells@redhat.com)
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public License
- * as published by the Free Software Foundation; either version
- * 2 of the License, or (at your option) any later version.
- */
-
-#ifndef _LINUX_RXRPC_H
-#define _LINUX_RXRPC_H
-
-#include <linux/in.h>
-#include <linux/in6.h>
-
-/*
- * RxRPC socket address
- */
-struct sockaddr_rxrpc {
-	sa_family_t	srx_family;	/* address family */
-	u16		srx_service;	/* service desired */
-	u16		transport_type;	/* type of transport socket (SOCK_DGRAM) */
-	u16		transport_len;	/* length of transport address */
-	union {
-		sa_family_t family;		/* transport address family */
-		struct sockaddr_in sin;		/* IPv4 transport address */
-		struct sockaddr_in6 sin6;	/* IPv6 transport address */
-	} transport;
-};
-
-/*
- * RxRPC socket options
- */
-#define RXRPC_SECURITY_KEY		1	/* [clnt] set client security key */
-#define RXRPC_SECURITY_KEYRING		2	/* [srvr] set ring of server security keys */
-#define RXRPC_EXCLUSIVE_CONNECTION	3	/* Deprecated; use RXRPC_EXCLUSIVE_CALL instead */
-#define RXRPC_MIN_SECURITY_LEVEL	4	/* minimum security level */
-#define RXRPC_UPGRADEABLE_SERVICE	5	/* Upgrade service[0] -> service[1] */
-#define RXRPC_SUPPORTED_CMSG		6	/* Get highest supported control message type */
-
-/*
- * RxRPC control messages
- * - If neither abort or accept are specified, the message is a data message.
- * - terminal messages mean that a user call ID tag can be recycled
- * - s/r/- indicate whether these are applicable to sendmsg() and/or recvmsg()
- */
-enum rxrpc_cmsg_type {
-	RXRPC_USER_CALL_ID	= 1,	/* sr: user call ID specifier */
-	RXRPC_ABORT		= 2,	/* sr: abort request / notification [terminal] */
-	RXRPC_ACK		= 3,	/* -r: [Service] RPC op final ACK received [terminal] */
-	RXRPC_NET_ERROR		= 5,	/* -r: network error received [terminal] */
-	RXRPC_BUSY		= 6,	/* -r: server busy received [terminal] */
-	RXRPC_LOCAL_ERROR	= 7,	/* -r: local error generated [terminal] */
-	RXRPC_NEW_CALL		= 8,	/* -r: [Service] new incoming call notification */
-	RXRPC_ACCEPT		= 9,	/* s-: [Service] accept request */
-	RXRPC_EXCLUSIVE_CALL	= 10,	/* s-: Call should be on exclusive connection */
-	RXRPC_UPGRADE_SERVICE	= 11,	/* s-: Request service upgrade for client call */
-	RXRPC_TX_LENGTH		= 12,	/* s-: Total length of Tx data */
-	RXRPC__SUPPORTED
-};
-
-/*
- * RxRPC security levels
- */
-#define RXRPC_SECURITY_PLAIN	0	/* plain secure-checksummed packets only */
-#define RXRPC_SECURITY_AUTH	1	/* authenticated packets */
-#define RXRPC_SECURITY_ENCRYPT	2	/* encrypted packets */
-
-/*
- * RxRPC security indices
- */
-#define RXRPC_SECURITY_NONE	0	/* no security protocol */
-#define RXRPC_SECURITY_RXKAD	2	/* kaserver or kerberos 4 */
-#define RXRPC_SECURITY_RXGK	4	/* gssapi-based */
-#define RXRPC_SECURITY_RXK5	5	/* kerberos 5 */
-
-#endif /* _LINUX_RXRPC_H */
diff --git a/include/uapi/linux/rxrpc.h b/include/uapi/linux/rxrpc.h
new file mode 100644
index 000000000000..08e2fb9c70ae
--- /dev/null
+++ b/include/uapi/linux/rxrpc.h
@@ -0,0 +1,80 @@
+/* Types and definitions for AF_RXRPC.
+ *
+ * Copyright (C) 2007 Red Hat, Inc. All Rights Reserved.
+ * Written by David Howells (dhowells@redhat.com)
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public Licence
+ * as published by the Free Software Foundation; either version
+ * 2 of the Licence, or (at your option) any later version.
+ */
+
+#ifndef _UAPI_LINUX_RXRPC_H
+#define _UAPI_LINUX_RXRPC_H
+
+#include <linux/types.h>
+#include <linux/in.h>
+#include <linux/in6.h>
+
+/*
+ * RxRPC socket address
+ */
+struct sockaddr_rxrpc {
+	sa_family_t	srx_family;	/* address family */
+	u16		srx_service;	/* service desired */
+	u16		transport_type;	/* type of transport socket (SOCK_DGRAM) */
+	u16		transport_len;	/* length of transport address */
+	union {
+		sa_family_t family;		/* transport address family */
+		struct sockaddr_in sin;		/* IPv4 transport address */
+		struct sockaddr_in6 sin6;	/* IPv6 transport address */
+	} transport;
+};
+
+/*
+ * RxRPC socket options
+ */
+#define RXRPC_SECURITY_KEY		1	/* [clnt] set client security key */
+#define RXRPC_SECURITY_KEYRING		2	/* [srvr] set ring of server security keys */
+#define RXRPC_EXCLUSIVE_CONNECTION	3	/* Deprecated; use RXRPC_EXCLUSIVE_CALL instead */
+#define RXRPC_MIN_SECURITY_LEVEL	4	/* minimum security level */
+#define RXRPC_UPGRADEABLE_SERVICE	5	/* Upgrade service[0] -> service[1] */
+#define RXRPC_SUPPORTED_CMSG		6	/* Get highest supported control message type */
+
+/*
+ * RxRPC control messages
+ * - If neither abort or accept are specified, the message is a data message.
+ * - terminal messages mean that a user call ID tag can be recycled
+ * - s/r/- indicate whether these are applicable to sendmsg() and/or recvmsg()
+ */
+enum rxrpc_cmsg_type {
+	RXRPC_USER_CALL_ID	= 1,	/* sr: user call ID specifier */
+	RXRPC_ABORT		= 2,	/* sr: abort request / notification [terminal] */
+	RXRPC_ACK		= 3,	/* -r: [Service] RPC op final ACK received [terminal] */
+	RXRPC_NET_ERROR		= 5,	/* -r: network error received [terminal] */
+	RXRPC_BUSY		= 6,	/* -r: server busy received [terminal] */
+	RXRPC_LOCAL_ERROR	= 7,	/* -r: local error generated [terminal] */
+	RXRPC_NEW_CALL		= 8,	/* -r: [Service] new incoming call notification */
+	RXRPC_ACCEPT		= 9,	/* s-: [Service] accept request */
+	RXRPC_EXCLUSIVE_CALL	= 10,	/* s-: Call should be on exclusive connection */
+	RXRPC_UPGRADE_SERVICE	= 11,	/* s-: Request service upgrade for client call */
+	RXRPC_TX_LENGTH		= 12,	/* s-: Total length of Tx data */
+	RXRPC__SUPPORTED
+};
+
+/*
+ * RxRPC security levels
+ */
+#define RXRPC_SECURITY_PLAIN	0	/* plain secure-checksummed packets only */
+#define RXRPC_SECURITY_AUTH	1	/* authenticated packets */
+#define RXRPC_SECURITY_ENCRYPT	2	/* encrypted packets */
+
+/*
+ * RxRPC security indices
+ */
+#define RXRPC_SECURITY_NONE	0	/* no security protocol */
+#define RXRPC_SECURITY_RXKAD	2	/* kaserver or kerberos 4 */
+#define RXRPC_SECURITY_RXGK	4	/* gssapi-based */
+#define RXRPC_SECURITY_RXK5	5	/* kerberos 5 */
+
+#endif /* _UAPI_LINUX_RXRPC_H */
-- 
cgit v1.2.3-71-gd317


From ddc6c70f07bb1f6dd39a2c6c430f7b4fa95199c8 Mon Sep 17 00:00:00 2001
From: David Howells <dhowells@redhat.com>
Date: Fri, 21 Jul 2017 10:07:10 +0100
Subject: rxrpc: Move the packet.h include file into net/rxrpc/

Move the protocol description header file into net/rxrpc/ and rename it to
protocol.h.  It's no longer necessary to expose it as packets are no longer
exposed to kernel services (such as AFS) that use the facility.

The abort codes are transferred to the UAPI header instead as we pass these
back to userspace and also to kernel services.

Signed-off-by: David Howells <dhowells@redhat.com>
---
 fs/afs/misc.c              |   1 -
 fs/afs/rxrpc.c             |   1 -
 include/rxrpc/packet.h     | 235 ---------------------------------------------
 include/uapi/linux/rxrpc.h |  44 +++++++++
 net/rxrpc/ar-internal.h    |   2 +-
 net/rxrpc/protocol.h       | 190 ++++++++++++++++++++++++++++++++++++
 6 files changed, 235 insertions(+), 238 deletions(-)
 delete mode 100644 include/rxrpc/packet.h
 create mode 100644 net/rxrpc/protocol.h

(limited to 'include/uapi/linux')

diff --git a/fs/afs/misc.c b/fs/afs/misc.c
index 100b207efc9e..c05f1f1c0d41 100644
--- a/fs/afs/misc.c
+++ b/fs/afs/misc.c
@@ -12,7 +12,6 @@
 #include <linux/kernel.h>
 #include <linux/module.h>
 #include <linux/errno.h>
-#include <rxrpc/packet.h>
 #include "internal.h"
 #include "afs_fs.h"
 
diff --git a/fs/afs/rxrpc.c b/fs/afs/rxrpc.c
index 02781e78ffb6..10743043d431 100644
--- a/fs/afs/rxrpc.c
+++ b/fs/afs/rxrpc.c
@@ -14,7 +14,6 @@
 
 #include <net/sock.h>
 #include <net/af_rxrpc.h>
-#include <rxrpc/packet.h>
 #include "internal.h"
 #include "afs_cm.h"
 
diff --git a/include/rxrpc/packet.h b/include/rxrpc/packet.h
deleted file mode 100644
index a2dcfb850b9f..000000000000
--- a/include/rxrpc/packet.h
+++ /dev/null
@@ -1,235 +0,0 @@
-/* packet.h: Rx packet layout and definitions
- *
- * Copyright (C) 2002, 2007 Red Hat, Inc. All Rights Reserved.
- * Written by David Howells (dhowells@redhat.com)
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public License
- * as published by the Free Software Foundation; either version
- * 2 of the License, or (at your option) any later version.
- */
-
-#ifndef _LINUX_RXRPC_PACKET_H
-#define _LINUX_RXRPC_PACKET_H
-
-typedef u32	rxrpc_seq_t;	/* Rx message sequence number */
-typedef u32	rxrpc_serial_t;	/* Rx message serial number */
-typedef __be32	rxrpc_seq_net_t; /* on-the-wire Rx message sequence number */
-typedef __be32	rxrpc_serial_net_t; /* on-the-wire Rx message serial number */
-
-/*****************************************************************************/
-/*
- * on-the-wire Rx packet header
- * - all multibyte fields should be in network byte order
- */
-struct rxrpc_wire_header {
-	__be32		epoch;		/* client boot timestamp */
-#define RXRPC_RANDOM_EPOCH	0x80000000	/* Random if set, date-based if not */
-
-	__be32		cid;		/* connection and channel ID */
-#define RXRPC_MAXCALLS		4			/* max active calls per conn */
-#define RXRPC_CHANNELMASK	(RXRPC_MAXCALLS-1)	/* mask for channel ID */
-#define RXRPC_CIDMASK		(~RXRPC_CHANNELMASK)	/* mask for connection ID */
-#define RXRPC_CIDSHIFT		ilog2(RXRPC_MAXCALLS)	/* shift for connection ID */
-#define RXRPC_CID_INC		(1 << RXRPC_CIDSHIFT)	/* connection ID increment */
-
-	__be32		callNumber;	/* call ID (0 for connection-level packets) */
-	__be32		seq;		/* sequence number of pkt in call stream */
-	__be32		serial;		/* serial number of pkt sent to network */
-
-	uint8_t		type;		/* packet type */
-#define RXRPC_PACKET_TYPE_DATA		1	/* data */
-#define RXRPC_PACKET_TYPE_ACK		2	/* ACK */
-#define RXRPC_PACKET_TYPE_BUSY		3	/* call reject */
-#define RXRPC_PACKET_TYPE_ABORT		4	/* call/connection abort */
-#define RXRPC_PACKET_TYPE_ACKALL	5	/* ACK all outstanding packets on call */
-#define RXRPC_PACKET_TYPE_CHALLENGE	6	/* connection security challenge (SRVR->CLNT) */
-#define RXRPC_PACKET_TYPE_RESPONSE	7	/* connection secutity response (CLNT->SRVR) */
-#define RXRPC_PACKET_TYPE_DEBUG		8	/* debug info request */
-#define RXRPC_PACKET_TYPE_VERSION	13	/* version string request */
-#define RXRPC_N_PACKET_TYPES		14	/* number of packet types (incl type 0) */
-
-	uint8_t		flags;		/* packet flags */
-#define RXRPC_CLIENT_INITIATED	0x01		/* signifies a packet generated by a client */
-#define RXRPC_REQUEST_ACK	0x02		/* request an unconditional ACK of this packet */
-#define RXRPC_LAST_PACKET	0x04		/* the last packet from this side for this call */
-#define RXRPC_MORE_PACKETS	0x08		/* more packets to come */
-#define RXRPC_JUMBO_PACKET	0x20		/* [DATA] this is a jumbo packet */
-#define RXRPC_SLOW_START_OK	0x20		/* [ACK] slow start supported */
-
-	uint8_t		userStatus;	/* app-layer defined status */
-#define RXRPC_USERSTATUS_SERVICE_UPGRADE 0x01	/* AuriStor service upgrade request */
-	
-	uint8_t		securityIndex;	/* security protocol ID */
-	union {
-		__be16	_rsvd;		/* reserved */
-		__be16	cksum;		/* kerberos security checksum */
-	};
-	__be16		serviceId;	/* service ID */
-
-} __packed;
-
-#define RXRPC_SUPPORTED_PACKET_TYPES (			\
-		(1 << RXRPC_PACKET_TYPE_DATA) |		\
-		(1 << RXRPC_PACKET_TYPE_ACK) |		\
-		(1 << RXRPC_PACKET_TYPE_BUSY) |		\
-		(1 << RXRPC_PACKET_TYPE_ABORT) |	\
-		(1 << RXRPC_PACKET_TYPE_ACKALL) |	\
-		(1 << RXRPC_PACKET_TYPE_CHALLENGE) |	\
-		(1 << RXRPC_PACKET_TYPE_RESPONSE) |	\
-		/*(1 << RXRPC_PACKET_TYPE_DEBUG) | */	\
-		(1 << RXRPC_PACKET_TYPE_VERSION))
-
-/*****************************************************************************/
-/*
- * jumbo packet secondary header
- * - can be mapped to read header by:
- *   - new_serial = serial + 1
- *   - new_seq = seq + 1
- *   - new_flags = j_flags
- *   - new__rsvd = j__rsvd
- *   - duplicating all other fields
- */
-struct rxrpc_jumbo_header {
-	uint8_t		flags;		/* packet flags (as per rxrpc_header) */
-	uint8_t		pad;
-	union {
-		__be16	_rsvd;		/* reserved */
-		__be16	cksum;		/* kerberos security checksum */
-	};
-};
-
-#define RXRPC_JUMBO_DATALEN	1412	/* non-terminal jumbo packet data length */
-#define RXRPC_JUMBO_SUBPKTLEN	(RXRPC_JUMBO_DATALEN + sizeof(struct rxrpc_jumbo_header))
-
-/*****************************************************************************/
-/*
- * on-the-wire Rx ACK packet data payload
- * - all multibyte fields should be in network byte order
- */
-struct rxrpc_ackpacket {
-	__be16		bufferSpace;	/* number of packet buffers available */
-	__be16		maxSkew;	/* diff between serno being ACK'd and highest serial no
-					 * received */
-	__be32		firstPacket;	/* sequence no of first ACK'd packet in attached list */
-	__be32		previousPacket;	/* sequence no of previous packet received */
-	__be32		serial;		/* serial no of packet that prompted this ACK */
-
-	uint8_t		reason;		/* reason for ACK */
-#define RXRPC_ACK_REQUESTED		1	/* ACK was requested on packet */
-#define RXRPC_ACK_DUPLICATE		2	/* duplicate packet received */
-#define RXRPC_ACK_OUT_OF_SEQUENCE	3	/* out of sequence packet received */
-#define RXRPC_ACK_EXCEEDS_WINDOW	4	/* packet received beyond end of ACK window */
-#define RXRPC_ACK_NOSPACE		5	/* packet discarded due to lack of buffer space */
-#define RXRPC_ACK_PING			6	/* keep alive ACK */
-#define RXRPC_ACK_PING_RESPONSE		7	/* response to RXRPC_ACK_PING */
-#define RXRPC_ACK_DELAY			8	/* nothing happened since received packet */
-#define RXRPC_ACK_IDLE			9	/* ACK due to fully received ACK window */
-#define RXRPC_ACK__INVALID		10	/* Representation of invalid ACK reason */
-
-	uint8_t		nAcks;		/* number of ACKs */
-#define RXRPC_MAXACKS	255
-
-	uint8_t		acks[0];	/* list of ACK/NAKs */
-#define RXRPC_ACK_TYPE_NACK		0
-#define RXRPC_ACK_TYPE_ACK		1
-
-} __packed;
-
-/* Some ACKs refer to specific packets and some are general and can be updated. */
-#define RXRPC_ACK_UPDATEABLE ((1 << RXRPC_ACK_REQUESTED)	|	\
-			      (1 << RXRPC_ACK_PING_RESPONSE)	|	\
-			      (1 << RXRPC_ACK_DELAY)		|	\
-			      (1 << RXRPC_ACK_IDLE))
-
-
-/*
- * ACK packets can have a further piece of information tagged on the end
- */
-struct rxrpc_ackinfo {
-	__be32		rxMTU;		/* maximum Rx MTU size (bytes) [AFS 3.3] */
-	__be32		maxMTU;		/* maximum interface MTU size (bytes) [AFS 3.3] */
-	__be32		rwind;		/* Rx window size (packets) [AFS 3.4] */
-	__be32		jumbo_max;	/* max packets to stick into a jumbo packet [AFS 3.5] */
-};
-
-/*****************************************************************************/
-/*
- * Kerberos security type-2 challenge packet
- */
-struct rxkad_challenge {
-	__be32		version;	/* version of this challenge type */
-	__be32		nonce;		/* encrypted random number */
-	__be32		min_level;	/* minimum security level */
-	__be32		__padding;	/* padding to 8-byte boundary */
-} __packed;
-
-/*****************************************************************************/
-/*
- * Kerberos security type-2 response packet
- */
-struct rxkad_response {
-	__be32		version;	/* version of this response type */
-	__be32		__pad;
-
-	/* encrypted bit of the response */
-	struct {
-		__be32		epoch;		/* current epoch */
-		__be32		cid;		/* parent connection ID */
-		__be32		checksum;	/* checksum */
-		__be32		securityIndex;	/* security type */
-		__be32		call_id[4];	/* encrypted call IDs */
-		__be32		inc_nonce;	/* challenge nonce + 1 */
-		__be32		level;		/* desired level */
-	} encrypted;
-
-	__be32		kvno;		/* Kerberos key version number */
-	__be32		ticket_len;	/* Kerberos ticket length  */
-} __packed;
-
-/*****************************************************************************/
-/*
- * RxRPC-level abort codes
- */
-#define RX_CALL_DEAD		-1	/* call/conn has been inactive and is shut down */
-#define RX_INVALID_OPERATION	-2	/* invalid operation requested / attempted */
-#define RX_CALL_TIMEOUT		-3	/* call timeout exceeded */
-#define RX_EOF			-4	/* unexpected end of data on read op */
-#define RX_PROTOCOL_ERROR	-5	/* low-level protocol error */
-#define RX_USER_ABORT		-6	/* generic user abort */
-#define RX_ADDRINUSE		-7	/* UDP port in use */
-#define RX_DEBUGI_BADTYPE	-8	/* bad debugging packet type */
-
-/*
- * (un)marshalling abort codes (rxgen)
- */
-#define	RXGEN_CC_MARSHAL    -450
-#define	RXGEN_CC_UNMARSHAL  -451
-#define	RXGEN_SS_MARSHAL    -452
-#define	RXGEN_SS_UNMARSHAL  -453
-#define	RXGEN_DECODE	    -454
-#define	RXGEN_OPCODE	    -455
-#define	RXGEN_SS_XDRFREE    -456
-#define	RXGEN_CC_XDRFREE    -457
-
-/*
- * Rx kerberos security abort codes
- * - unfortunately we have no generalised security abort codes to say things
- *   like "unsupported security", so we have to use these instead and hope the
- *   other side understands
- */
-#define RXKADINCONSISTENCY	19270400	/* security module structure inconsistent */
-#define RXKADPACKETSHORT	19270401	/* packet too short for security challenge */
-#define RXKADLEVELFAIL		19270402	/* security level negotiation failed */
-#define RXKADTICKETLEN		19270403	/* ticket length too short or too long */
-#define RXKADOUTOFSEQUENCE	19270404	/* packet had bad sequence number */
-#define RXKADNOAUTH		19270405	/* caller not authorised */
-#define RXKADBADKEY		19270406	/* illegal key: bad parity or weak */
-#define RXKADBADTICKET		19270407	/* security object was passed a bad ticket */
-#define RXKADUNKNOWNKEY		19270408	/* ticket contained unknown key version number */
-#define RXKADEXPIRED		19270409	/* authentication expired */
-#define RXKADSEALEDINCON	19270410	/* sealed data inconsistent */
-#define RXKADDATALEN		19270411	/* user data too long */
-#define RXKADILLEGALLEVEL	19270412	/* caller not authorised to use encrypted conns */
-
-#endif /* _LINUX_RXRPC_PACKET_H */
diff --git a/include/uapi/linux/rxrpc.h b/include/uapi/linux/rxrpc.h
index 08e2fb9c70ae..9656aad8f8f7 100644
--- a/include/uapi/linux/rxrpc.h
+++ b/include/uapi/linux/rxrpc.h
@@ -77,4 +77,48 @@ enum rxrpc_cmsg_type {
 #define RXRPC_SECURITY_RXGK	4	/* gssapi-based */
 #define RXRPC_SECURITY_RXK5	5	/* kerberos 5 */
 
+/*
+ * RxRPC-level abort codes
+ */
+#define RX_CALL_DEAD		-1	/* call/conn has been inactive and is shut down */
+#define RX_INVALID_OPERATION	-2	/* invalid operation requested / attempted */
+#define RX_CALL_TIMEOUT		-3	/* call timeout exceeded */
+#define RX_EOF			-4	/* unexpected end of data on read op */
+#define RX_PROTOCOL_ERROR	-5	/* low-level protocol error */
+#define RX_USER_ABORT		-6	/* generic user abort */
+#define RX_ADDRINUSE		-7	/* UDP port in use */
+#define RX_DEBUGI_BADTYPE	-8	/* bad debugging packet type */
+
+/*
+ * (un)marshalling abort codes (rxgen)
+ */
+#define RXGEN_CC_MARSHAL	-450
+#define RXGEN_CC_UNMARSHAL	-451
+#define RXGEN_SS_MARSHAL	-452
+#define RXGEN_SS_UNMARSHAL	-453
+#define RXGEN_DECODE		-454
+#define RXGEN_OPCODE		-455
+#define RXGEN_SS_XDRFREE	-456
+#define RXGEN_CC_XDRFREE	-457
+
+/*
+ * Rx kerberos security abort codes
+ * - unfortunately we have no generalised security abort codes to say things
+ *   like "unsupported security", so we have to use these instead and hope the
+ *   other side understands
+ */
+#define RXKADINCONSISTENCY	19270400	/* security module structure inconsistent */
+#define RXKADPACKETSHORT	19270401	/* packet too short for security challenge */
+#define RXKADLEVELFAIL		19270402	/* security level negotiation failed */
+#define RXKADTICKETLEN		19270403	/* ticket length too short or too long */
+#define RXKADOUTOFSEQUENCE	19270404	/* packet had bad sequence number */
+#define RXKADNOAUTH		19270405	/* caller not authorised */
+#define RXKADBADKEY		19270406	/* illegal key: bad parity or weak */
+#define RXKADBADTICKET		19270407	/* security object was passed a bad ticket */
+#define RXKADUNKNOWNKEY		19270408	/* ticket contained unknown key version number */
+#define RXKADEXPIRED		19270409	/* authentication expired */
+#define RXKADSEALEDINCON	19270410	/* sealed data inconsistent */
+#define RXKADDATALEN		19270411	/* user data too long */
+#define RXKADILLEGALLEVEL	19270412	/* caller not authorised to use encrypted conns */
+
 #endif /* _UAPI_LINUX_RXRPC_H */
diff --git a/net/rxrpc/ar-internal.h b/net/rxrpc/ar-internal.h
index 69b97339ff9d..8c0db9b3e4ab 100644
--- a/net/rxrpc/ar-internal.h
+++ b/net/rxrpc/ar-internal.h
@@ -15,7 +15,7 @@
 #include <net/netns/generic.h>
 #include <net/sock.h>
 #include <net/af_rxrpc.h>
-#include <rxrpc/packet.h>
+#include "protocol.h"
 
 #if 0
 #define CHECK_SLAB_OKAY(X)				     \
diff --git a/net/rxrpc/protocol.h b/net/rxrpc/protocol.h
new file mode 100644
index 000000000000..4bddcf3face3
--- /dev/null
+++ b/net/rxrpc/protocol.h
@@ -0,0 +1,190 @@
+/* packet.h: Rx packet layout and definitions
+ *
+ * Copyright (C) 2002, 2007 Red Hat, Inc. All Rights Reserved.
+ * Written by David Howells (dhowells@redhat.com)
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
+ */
+
+#ifndef _LINUX_RXRPC_PACKET_H
+#define _LINUX_RXRPC_PACKET_H
+
+typedef u32	rxrpc_seq_t;	/* Rx message sequence number */
+typedef u32	rxrpc_serial_t;	/* Rx message serial number */
+typedef __be32	rxrpc_seq_net_t; /* on-the-wire Rx message sequence number */
+typedef __be32	rxrpc_serial_net_t; /* on-the-wire Rx message serial number */
+
+/*****************************************************************************/
+/*
+ * on-the-wire Rx packet header
+ * - all multibyte fields should be in network byte order
+ */
+struct rxrpc_wire_header {
+	__be32		epoch;		/* client boot timestamp */
+#define RXRPC_RANDOM_EPOCH	0x80000000	/* Random if set, date-based if not */
+
+	__be32		cid;		/* connection and channel ID */
+#define RXRPC_MAXCALLS		4			/* max active calls per conn */
+#define RXRPC_CHANNELMASK	(RXRPC_MAXCALLS-1)	/* mask for channel ID */
+#define RXRPC_CIDMASK		(~RXRPC_CHANNELMASK)	/* mask for connection ID */
+#define RXRPC_CIDSHIFT		ilog2(RXRPC_MAXCALLS)	/* shift for connection ID */
+#define RXRPC_CID_INC		(1 << RXRPC_CIDSHIFT)	/* connection ID increment */
+
+	__be32		callNumber;	/* call ID (0 for connection-level packets) */
+	__be32		seq;		/* sequence number of pkt in call stream */
+	__be32		serial;		/* serial number of pkt sent to network */
+
+	uint8_t		type;		/* packet type */
+#define RXRPC_PACKET_TYPE_DATA		1	/* data */
+#define RXRPC_PACKET_TYPE_ACK		2	/* ACK */
+#define RXRPC_PACKET_TYPE_BUSY		3	/* call reject */
+#define RXRPC_PACKET_TYPE_ABORT		4	/* call/connection abort */
+#define RXRPC_PACKET_TYPE_ACKALL	5	/* ACK all outstanding packets on call */
+#define RXRPC_PACKET_TYPE_CHALLENGE	6	/* connection security challenge (SRVR->CLNT) */
+#define RXRPC_PACKET_TYPE_RESPONSE	7	/* connection secutity response (CLNT->SRVR) */
+#define RXRPC_PACKET_TYPE_DEBUG		8	/* debug info request */
+#define RXRPC_PACKET_TYPE_VERSION	13	/* version string request */
+#define RXRPC_N_PACKET_TYPES		14	/* number of packet types (incl type 0) */
+
+	uint8_t		flags;		/* packet flags */
+#define RXRPC_CLIENT_INITIATED	0x01		/* signifies a packet generated by a client */
+#define RXRPC_REQUEST_ACK	0x02		/* request an unconditional ACK of this packet */
+#define RXRPC_LAST_PACKET	0x04		/* the last packet from this side for this call */
+#define RXRPC_MORE_PACKETS	0x08		/* more packets to come */
+#define RXRPC_JUMBO_PACKET	0x20		/* [DATA] this is a jumbo packet */
+#define RXRPC_SLOW_START_OK	0x20		/* [ACK] slow start supported */
+
+	uint8_t		userStatus;	/* app-layer defined status */
+#define RXRPC_USERSTATUS_SERVICE_UPGRADE 0x01	/* AuriStor service upgrade request */
+	
+	uint8_t		securityIndex;	/* security protocol ID */
+	union {
+		__be16	_rsvd;		/* reserved */
+		__be16	cksum;		/* kerberos security checksum */
+	};
+	__be16		serviceId;	/* service ID */
+
+} __packed;
+
+#define RXRPC_SUPPORTED_PACKET_TYPES (			\
+		(1 << RXRPC_PACKET_TYPE_DATA) |		\
+		(1 << RXRPC_PACKET_TYPE_ACK) |		\
+		(1 << RXRPC_PACKET_TYPE_BUSY) |		\
+		(1 << RXRPC_PACKET_TYPE_ABORT) |	\
+		(1 << RXRPC_PACKET_TYPE_ACKALL) |	\
+		(1 << RXRPC_PACKET_TYPE_CHALLENGE) |	\
+		(1 << RXRPC_PACKET_TYPE_RESPONSE) |	\
+		/*(1 << RXRPC_PACKET_TYPE_DEBUG) | */	\
+		(1 << RXRPC_PACKET_TYPE_VERSION))
+
+/*****************************************************************************/
+/*
+ * jumbo packet secondary header
+ * - can be mapped to read header by:
+ *   - new_serial = serial + 1
+ *   - new_seq = seq + 1
+ *   - new_flags = j_flags
+ *   - new__rsvd = j__rsvd
+ *   - duplicating all other fields
+ */
+struct rxrpc_jumbo_header {
+	uint8_t		flags;		/* packet flags (as per rxrpc_header) */
+	uint8_t		pad;
+	union {
+		__be16	_rsvd;		/* reserved */
+		__be16	cksum;		/* kerberos security checksum */
+	};
+};
+
+#define RXRPC_JUMBO_DATALEN	1412	/* non-terminal jumbo packet data length */
+#define RXRPC_JUMBO_SUBPKTLEN	(RXRPC_JUMBO_DATALEN + sizeof(struct rxrpc_jumbo_header))
+
+/*****************************************************************************/
+/*
+ * on-the-wire Rx ACK packet data payload
+ * - all multibyte fields should be in network byte order
+ */
+struct rxrpc_ackpacket {
+	__be16		bufferSpace;	/* number of packet buffers available */
+	__be16		maxSkew;	/* diff between serno being ACK'd and highest serial no
+					 * received */
+	__be32		firstPacket;	/* sequence no of first ACK'd packet in attached list */
+	__be32		previousPacket;	/* sequence no of previous packet received */
+	__be32		serial;		/* serial no of packet that prompted this ACK */
+
+	uint8_t		reason;		/* reason for ACK */
+#define RXRPC_ACK_REQUESTED		1	/* ACK was requested on packet */
+#define RXRPC_ACK_DUPLICATE		2	/* duplicate packet received */
+#define RXRPC_ACK_OUT_OF_SEQUENCE	3	/* out of sequence packet received */
+#define RXRPC_ACK_EXCEEDS_WINDOW	4	/* packet received beyond end of ACK window */
+#define RXRPC_ACK_NOSPACE		5	/* packet discarded due to lack of buffer space */
+#define RXRPC_ACK_PING			6	/* keep alive ACK */
+#define RXRPC_ACK_PING_RESPONSE		7	/* response to RXRPC_ACK_PING */
+#define RXRPC_ACK_DELAY			8	/* nothing happened since received packet */
+#define RXRPC_ACK_IDLE			9	/* ACK due to fully received ACK window */
+#define RXRPC_ACK__INVALID		10	/* Representation of invalid ACK reason */
+
+	uint8_t		nAcks;		/* number of ACKs */
+#define RXRPC_MAXACKS	255
+
+	uint8_t		acks[0];	/* list of ACK/NAKs */
+#define RXRPC_ACK_TYPE_NACK		0
+#define RXRPC_ACK_TYPE_ACK		1
+
+} __packed;
+
+/* Some ACKs refer to specific packets and some are general and can be updated. */
+#define RXRPC_ACK_UPDATEABLE ((1 << RXRPC_ACK_REQUESTED)	|	\
+			      (1 << RXRPC_ACK_PING_RESPONSE)	|	\
+			      (1 << RXRPC_ACK_DELAY)		|	\
+			      (1 << RXRPC_ACK_IDLE))
+
+
+/*
+ * ACK packets can have a further piece of information tagged on the end
+ */
+struct rxrpc_ackinfo {
+	__be32		rxMTU;		/* maximum Rx MTU size (bytes) [AFS 3.3] */
+	__be32		maxMTU;		/* maximum interface MTU size (bytes) [AFS 3.3] */
+	__be32		rwind;		/* Rx window size (packets) [AFS 3.4] */
+	__be32		jumbo_max;	/* max packets to stick into a jumbo packet [AFS 3.5] */
+};
+
+/*****************************************************************************/
+/*
+ * Kerberos security type-2 challenge packet
+ */
+struct rxkad_challenge {
+	__be32		version;	/* version of this challenge type */
+	__be32		nonce;		/* encrypted random number */
+	__be32		min_level;	/* minimum security level */
+	__be32		__padding;	/* padding to 8-byte boundary */
+} __packed;
+
+/*****************************************************************************/
+/*
+ * Kerberos security type-2 response packet
+ */
+struct rxkad_response {
+	__be32		version;	/* version of this response type */
+	__be32		__pad;
+
+	/* encrypted bit of the response */
+	struct {
+		__be32		epoch;		/* current epoch */
+		__be32		cid;		/* parent connection ID */
+		__be32		checksum;	/* checksum */
+		__be32		securityIndex;	/* security type */
+		__be32		call_id[4];	/* encrypted call IDs */
+		__be32		inc_nonce;	/* challenge nonce + 1 */
+		__be32		level;		/* desired level */
+	} encrypted;
+
+	__be32		kvno;		/* Kerberos key version number */
+	__be32		ticket_len;	/* Kerberos ticket length  */
+} __packed;
+
+#endif /* _LINUX_RXRPC_PACKET_H */
-- 
cgit v1.2.3-71-gd317


From 784b4e612d42a2b7578d7fab2ed78940e10536bc Mon Sep 17 00:00:00 2001
From: Phil Sutter <phil@nwl.cc>
Date: Wed, 19 Jul 2017 16:32:23 +0200
Subject: netfilter: nf_tables: Attach process info to NFT_MSG_NEWGEN
 notifications

This is helpful for 'nft monitor' to track which process caused a given
change to the ruleset.

Signed-off-by: Phil Sutter <phil@nwl.cc>
Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
---
 include/uapi/linux/netfilter/nf_tables.h | 2 ++
 net/netfilter/nf_tables_api.c            | 5 ++++-
 2 files changed, 6 insertions(+), 1 deletion(-)

(limited to 'include/uapi/linux')

diff --git a/include/uapi/linux/netfilter/nf_tables.h b/include/uapi/linux/netfilter/nf_tables.h
index 683f6f88fcac..6f0a950e21c3 100644
--- a/include/uapi/linux/netfilter/nf_tables.h
+++ b/include/uapi/linux/netfilter/nf_tables.h
@@ -1221,6 +1221,8 @@ enum nft_objref_attributes {
 enum nft_gen_attributes {
 	NFTA_GEN_UNSPEC,
 	NFTA_GEN_ID,
+	NFTA_GEN_PROC_PID,
+	NFTA_GEN_PROC_NAME,
 	__NFTA_GEN_MAX
 };
 #define NFTA_GEN_MAX		(__NFTA_GEN_MAX - 1)
diff --git a/net/netfilter/nf_tables_api.c b/net/netfilter/nf_tables_api.c
index 7fbf0070aba1..b77ad0813564 100644
--- a/net/netfilter/nf_tables_api.c
+++ b/net/netfilter/nf_tables_api.c
@@ -4657,6 +4657,7 @@ static int nf_tables_fill_gen_info(struct sk_buff *skb, struct net *net,
 {
 	struct nlmsghdr *nlh;
 	struct nfgenmsg *nfmsg;
+	char buf[TASK_COMM_LEN];
 	int event = nfnl_msg_type(NFNL_SUBSYS_NFTABLES, NFT_MSG_NEWGEN);
 
 	nlh = nlmsg_put(skb, portid, seq, event, sizeof(struct nfgenmsg), 0);
@@ -4668,7 +4669,9 @@ static int nf_tables_fill_gen_info(struct sk_buff *skb, struct net *net,
 	nfmsg->version		= NFNETLINK_V0;
 	nfmsg->res_id		= htons(net->nft.base_seq & 0xffff);
 
-	if (nla_put_be32(skb, NFTA_GEN_ID, htonl(net->nft.base_seq)))
+	if (nla_put_be32(skb, NFTA_GEN_ID, htonl(net->nft.base_seq)) ||
+	    nla_put_be32(skb, NFTA_GEN_PROC_PID, htonl(task_pid_nr(current))) ||
+	    nla_put_string(skb, NFTA_GEN_PROC_NAME, get_task_comm(buf, current)))
 		goto nla_put_failure;
 
 	nlmsg_end(skb, nlh);
-- 
cgit v1.2.3-71-gd317


From ca1136c99b66b1566781ff12ecddc635d570f932 Mon Sep 17 00:00:00 2001
From: Shaohua Li <shli@fb.com>
Date: Wed, 12 Jul 2017 11:49:53 -0700
Subject: blktrace: export cgroup info in trace

Currently blktrace isn't cgroup aware. blktrace prints out task name of
current context, but the task of current context isn't always in the
cgroup where the BIO comes from. We can't use task name to find out IO
cgroup. For example, Writeback BIOs always comes from flusher thread but
the BIOs are for different blk cgroups. Request could be requeued and
dispatched from completely different tasks. MD/DM are another examples.

This patch tries to fix the gap. We print out cgroup fhandle info in
blktrace. Userspace can use open_by_handle_at() syscall to find the
cgroup by fhandle. Or userspace can use name_to_handle_at() syscall to
find fhandle for a cgroup and use a BPF program to filter out blktrace
for a specific cgroup.

We add a new 'blk_cgroup' trace option for blk tracer. It's default off.
Application which doesn't know the new option isn't affected.  When it's
on, we output fhandle info right after blk_io_trace with an extra bit
set in event action. So from application point of view, blktrace with
the option will output new actions.

I didn't change blk trace event yet, since I'm not sure if changing the
trace event output is an ABI issue. If not, I'll do it later.

Acked-by: Steven Rostedt (VMware) <rostedt@goodmis.org>
Signed-off-by: Shaohua Li <shli@fb.com>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 include/uapi/linux/blktrace_api.h |   3 +
 kernel/trace/blktrace.c           | 231 ++++++++++++++++++++++++++------------
 2 files changed, 161 insertions(+), 73 deletions(-)

(limited to 'include/uapi/linux')

diff --git a/include/uapi/linux/blktrace_api.h b/include/uapi/linux/blktrace_api.h
index c590ca6bfbd9..9cdaedeadb84 100644
--- a/include/uapi/linux/blktrace_api.h
+++ b/include/uapi/linux/blktrace_api.h
@@ -52,6 +52,7 @@ enum blktrace_act {
 	__BLK_TA_REMAP,			/* bio was remapped */
 	__BLK_TA_ABORT,			/* request aborted */
 	__BLK_TA_DRV_DATA,		/* driver-specific binary data */
+	__BLK_TA_CGROUP = 1 << 8,	/* from a cgroup*/
 };
 
 /*
@@ -61,6 +62,7 @@ enum blktrace_notify {
 	__BLK_TN_PROCESS = 0,		/* establish pid/name mapping */
 	__BLK_TN_TIMESTAMP,		/* include system clock */
 	__BLK_TN_MESSAGE,		/* Character string message */
+	__BLK_TN_CGROUP = __BLK_TA_CGROUP, /* from a cgroup */
 };
 
 
@@ -107,6 +109,7 @@ struct blk_io_trace {
 	__u32 cpu;		/* on what cpu did it happen */
 	__u16 error;		/* completion error */
 	__u16 pdu_len;		/* length of data after this trace */
+	/* cgroup id will be stored here if exists */
 };
 
 /*
diff --git a/kernel/trace/blktrace.c b/kernel/trace/blktrace.c
index bc364f86100a..f393d7a43695 100644
--- a/kernel/trace/blktrace.c
+++ b/kernel/trace/blktrace.c
@@ -27,6 +27,7 @@
 #include <linux/time.h>
 #include <linux/uaccess.h>
 #include <linux/list.h>
+#include <linux/blk-cgroup.h>
 
 #include "../../block/blk.h"
 
@@ -46,10 +47,14 @@ static __cacheline_aligned_in_smp DEFINE_SPINLOCK(running_trace_lock);
 
 /* Select an alternative, minimalistic output than the original one */
 #define TRACE_BLK_OPT_CLASSIC	0x1
+#define TRACE_BLK_OPT_CGROUP	0x2
 
 static struct tracer_opt blk_tracer_opts[] = {
 	/* Default disable the minimalistic output */
 	{ TRACER_OPT(blk_classic, TRACE_BLK_OPT_CLASSIC) },
+#ifdef CONFIG_BLK_CGROUP
+	{ TRACER_OPT(blk_cgroup, TRACE_BLK_OPT_CGROUP) },
+#endif
 	{ }
 };
 
@@ -68,7 +73,8 @@ static void blk_unregister_tracepoints(void);
  * Send out a notify message.
  */
 static void trace_note(struct blk_trace *bt, pid_t pid, int action,
-		       const void *data, size_t len)
+		       const void *data, size_t len,
+		       union kernfs_node_id *cgid)
 {
 	struct blk_io_trace *t;
 	struct ring_buffer_event *event = NULL;
@@ -76,12 +82,13 @@ static void trace_note(struct blk_trace *bt, pid_t pid, int action,
 	int pc = 0;
 	int cpu = smp_processor_id();
 	bool blk_tracer = blk_tracer_enabled;
+	ssize_t cgid_len = cgid ? sizeof(*cgid) : 0;
 
 	if (blk_tracer) {
 		buffer = blk_tr->trace_buffer.buffer;
 		pc = preempt_count();
 		event = trace_buffer_lock_reserve(buffer, TRACE_BLK,
-						  sizeof(*t) + len,
+						  sizeof(*t) + len + cgid_len,
 						  0, pc);
 		if (!event)
 			return;
@@ -92,17 +99,19 @@ static void trace_note(struct blk_trace *bt, pid_t pid, int action,
 	if (!bt->rchan)
 		return;
 
-	t = relay_reserve(bt->rchan, sizeof(*t) + len);
+	t = relay_reserve(bt->rchan, sizeof(*t) + len + cgid_len);
 	if (t) {
 		t->magic = BLK_IO_TRACE_MAGIC | BLK_IO_TRACE_VERSION;
 		t->time = ktime_to_ns(ktime_get());
 record_it:
 		t->device = bt->dev;
-		t->action = action;
+		t->action = action | (cgid ? __BLK_TN_CGROUP : 0);
 		t->pid = pid;
 		t->cpu = cpu;
-		t->pdu_len = len;
-		memcpy((void *) t + sizeof(*t), data, len);
+		t->pdu_len = len + cgid_len;
+		if (cgid)
+			memcpy((void *)t + sizeof(*t), cgid, cgid_len);
+		memcpy((void *) t + sizeof(*t) + cgid_len, data, len);
 
 		if (blk_tracer)
 			trace_buffer_unlock_commit(blk_tr, buffer, event, 0, pc);
@@ -122,7 +131,7 @@ static void trace_note_tsk(struct task_struct *tsk)
 	spin_lock_irqsave(&running_trace_lock, flags);
 	list_for_each_entry(bt, &running_trace_list, running_list) {
 		trace_note(bt, tsk->pid, BLK_TN_PROCESS, tsk->comm,
-			   sizeof(tsk->comm));
+			   sizeof(tsk->comm), NULL);
 	}
 	spin_unlock_irqrestore(&running_trace_lock, flags);
 }
@@ -139,7 +148,7 @@ static void trace_note_time(struct blk_trace *bt)
 	words[1] = now.tv_nsec;
 
 	local_irq_save(flags);
-	trace_note(bt, 0, BLK_TN_TIMESTAMP, words, sizeof(words));
+	trace_note(bt, 0, BLK_TN_TIMESTAMP, words, sizeof(words), NULL);
 	local_irq_restore(flags);
 }
 
@@ -167,7 +176,7 @@ void __trace_note_message(struct blk_trace *bt, const char *fmt, ...)
 	n = vscnprintf(buf, BLK_TN_MAX_MSG, fmt, args);
 	va_end(args);
 
-	trace_note(bt, 0, BLK_TN_MESSAGE, buf, n);
+	trace_note(bt, 0, BLK_TN_MESSAGE, buf, n, NULL);
 	local_irq_restore(flags);
 }
 EXPORT_SYMBOL_GPL(__trace_note_message);
@@ -204,7 +213,7 @@ static const u32 ddir_act[2] = { BLK_TC_ACT(BLK_TC_READ),
  */
 static void __blk_add_trace(struct blk_trace *bt, sector_t sector, int bytes,
 		     int op, int op_flags, u32 what, int error, int pdu_len,
-		     void *pdu_data)
+		     void *pdu_data, union kernfs_node_id *cgid)
 {
 	struct task_struct *tsk = current;
 	struct ring_buffer_event *event = NULL;
@@ -215,6 +224,7 @@ static void __blk_add_trace(struct blk_trace *bt, sector_t sector, int bytes,
 	pid_t pid;
 	int cpu, pc = 0;
 	bool blk_tracer = blk_tracer_enabled;
+	ssize_t cgid_len = cgid ? sizeof(*cgid) : 0;
 
 	if (unlikely(bt->trace_state != Blktrace_running && !blk_tracer))
 		return;
@@ -229,6 +239,8 @@ static void __blk_add_trace(struct blk_trace *bt, sector_t sector, int bytes,
 		what |= BLK_TC_ACT(BLK_TC_DISCARD);
 	if (op == REQ_OP_FLUSH)
 		what |= BLK_TC_ACT(BLK_TC_FLUSH);
+	if (cgid)
+		what |= __BLK_TA_CGROUP;
 
 	pid = tsk->pid;
 	if (act_log_check(bt, what, sector, pid))
@@ -241,7 +253,7 @@ static void __blk_add_trace(struct blk_trace *bt, sector_t sector, int bytes,
 		buffer = blk_tr->trace_buffer.buffer;
 		pc = preempt_count();
 		event = trace_buffer_lock_reserve(buffer, TRACE_BLK,
-						  sizeof(*t) + pdu_len,
+						  sizeof(*t) + pdu_len + cgid_len,
 						  0, pc);
 		if (!event)
 			return;
@@ -258,7 +270,7 @@ static void __blk_add_trace(struct blk_trace *bt, sector_t sector, int bytes,
 	 * from coming in and stepping on our toes.
 	 */
 	local_irq_save(flags);
-	t = relay_reserve(bt->rchan, sizeof(*t) + pdu_len);
+	t = relay_reserve(bt->rchan, sizeof(*t) + pdu_len + cgid_len);
 	if (t) {
 		sequence = per_cpu_ptr(bt->sequence, cpu);
 
@@ -280,10 +292,12 @@ record_it:
 		t->action = what;
 		t->device = bt->dev;
 		t->error = error;
-		t->pdu_len = pdu_len;
+		t->pdu_len = pdu_len + cgid_len;
 
+		if (cgid_len)
+			memcpy((void *)t + sizeof(*t), cgid, cgid_len);
 		if (pdu_len)
-			memcpy((void *) t + sizeof(*t), pdu_data, pdu_len);
+			memcpy((void *)t + sizeof(*t) + cgid_len, pdu_data, pdu_len);
 
 		if (blk_tracer) {
 			trace_buffer_unlock_commit(blk_tr, buffer, event, 0, pc);
@@ -684,6 +698,36 @@ void blk_trace_shutdown(struct request_queue *q)
 	}
 }
 
+#ifdef CONFIG_BLK_CGROUP
+static union kernfs_node_id *
+blk_trace_bio_get_cgid(struct request_queue *q, struct bio *bio)
+{
+	struct blk_trace *bt = q->blk_trace;
+
+	if (!bt || !(blk_tracer_flags.val & TRACE_BLK_OPT_CGROUP))
+		return NULL;
+
+	if (!bio->bi_css)
+		return NULL;
+	return cgroup_get_kernfs_id(bio->bi_css->cgroup);
+}
+#else
+static union kernfs_node_id *
+blk_trace_bio_get_cgid(struct request_queue *q, struct bio *bio)
+{
+	return NULL;
+}
+#endif
+
+static union kernfs_node_id *
+blk_trace_request_get_cgid(struct request_queue *q, struct request *rq)
+{
+	if (!rq->bio)
+		return NULL;
+	/* Use the first bio */
+	return blk_trace_bio_get_cgid(q, rq->bio);
+}
+
 /*
  * blktrace probes
  */
@@ -694,13 +738,15 @@ void blk_trace_shutdown(struct request_queue *q)
  * @error:	return status to log
  * @nr_bytes:	number of completed bytes
  * @what:	the action
+ * @cgid:	the cgroup info
  *
  * Description:
  *     Records an action against a request. Will log the bio offset + size.
  *
  **/
 static void blk_add_trace_rq(struct request *rq, int error,
-			     unsigned int nr_bytes, u32 what)
+			     unsigned int nr_bytes, u32 what,
+			     union kernfs_node_id *cgid)
 {
 	struct blk_trace *bt = rq->q->blk_trace;
 
@@ -713,32 +759,36 @@ static void blk_add_trace_rq(struct request *rq, int error,
 		what |= BLK_TC_ACT(BLK_TC_FS);
 
 	__blk_add_trace(bt, blk_rq_trace_sector(rq), nr_bytes, req_op(rq),
-			rq->cmd_flags, what, error, 0, NULL);
+			rq->cmd_flags, what, error, 0, NULL, cgid);
 }
 
 static void blk_add_trace_rq_insert(void *ignore,
 				    struct request_queue *q, struct request *rq)
 {
-	blk_add_trace_rq(rq, 0, blk_rq_bytes(rq), BLK_TA_INSERT);
+	blk_add_trace_rq(rq, 0, blk_rq_bytes(rq), BLK_TA_INSERT,
+			 blk_trace_request_get_cgid(q, rq));
 }
 
 static void blk_add_trace_rq_issue(void *ignore,
 				   struct request_queue *q, struct request *rq)
 {
-	blk_add_trace_rq(rq, 0, blk_rq_bytes(rq), BLK_TA_ISSUE);
+	blk_add_trace_rq(rq, 0, blk_rq_bytes(rq), BLK_TA_ISSUE,
+			 blk_trace_request_get_cgid(q, rq));
 }
 
 static void blk_add_trace_rq_requeue(void *ignore,
 				     struct request_queue *q,
 				     struct request *rq)
 {
-	blk_add_trace_rq(rq, 0, blk_rq_bytes(rq), BLK_TA_REQUEUE);
+	blk_add_trace_rq(rq, 0, blk_rq_bytes(rq), BLK_TA_REQUEUE,
+			 blk_trace_request_get_cgid(q, rq));
 }
 
 static void blk_add_trace_rq_complete(void *ignore, struct request *rq,
 			int error, unsigned int nr_bytes)
 {
-	blk_add_trace_rq(rq, error, nr_bytes, BLK_TA_COMPLETE);
+	blk_add_trace_rq(rq, error, nr_bytes, BLK_TA_COMPLETE,
+			 blk_trace_request_get_cgid(rq->q, rq));
 }
 
 /**
@@ -753,7 +803,7 @@ static void blk_add_trace_rq_complete(void *ignore, struct request *rq,
  *
  **/
 static void blk_add_trace_bio(struct request_queue *q, struct bio *bio,
-			      u32 what, int error)
+			      u32 what, int error, union kernfs_node_id *cgid)
 {
 	struct blk_trace *bt = q->blk_trace;
 
@@ -761,20 +811,22 @@ static void blk_add_trace_bio(struct request_queue *q, struct bio *bio,
 		return;
 
 	__blk_add_trace(bt, bio->bi_iter.bi_sector, bio->bi_iter.bi_size,
-			bio_op(bio), bio->bi_opf, what, error, 0, NULL);
+			bio_op(bio), bio->bi_opf, what, error, 0, NULL, cgid);
 }
 
 static void blk_add_trace_bio_bounce(void *ignore,
 				     struct request_queue *q, struct bio *bio)
 {
-	blk_add_trace_bio(q, bio, BLK_TA_BOUNCE, 0);
+	blk_add_trace_bio(q, bio, BLK_TA_BOUNCE, 0,
+			  blk_trace_bio_get_cgid(q, bio));
 }
 
 static void blk_add_trace_bio_complete(void *ignore,
 				       struct request_queue *q, struct bio *bio,
 				       int error)
 {
-	blk_add_trace_bio(q, bio, BLK_TA_COMPLETE, error);
+	blk_add_trace_bio(q, bio, BLK_TA_COMPLETE, error,
+			  blk_trace_bio_get_cgid(q, bio));
 }
 
 static void blk_add_trace_bio_backmerge(void *ignore,
@@ -782,7 +834,8 @@ static void blk_add_trace_bio_backmerge(void *ignore,
 					struct request *rq,
 					struct bio *bio)
 {
-	blk_add_trace_bio(q, bio, BLK_TA_BACKMERGE, 0);
+	blk_add_trace_bio(q, bio, BLK_TA_BACKMERGE, 0,
+			 blk_trace_bio_get_cgid(q, bio));
 }
 
 static void blk_add_trace_bio_frontmerge(void *ignore,
@@ -790,13 +843,15 @@ static void blk_add_trace_bio_frontmerge(void *ignore,
 					 struct request *rq,
 					 struct bio *bio)
 {
-	blk_add_trace_bio(q, bio, BLK_TA_FRONTMERGE, 0);
+	blk_add_trace_bio(q, bio, BLK_TA_FRONTMERGE, 0,
+			  blk_trace_bio_get_cgid(q, bio));
 }
 
 static void blk_add_trace_bio_queue(void *ignore,
 				    struct request_queue *q, struct bio *bio)
 {
-	blk_add_trace_bio(q, bio, BLK_TA_QUEUE, 0);
+	blk_add_trace_bio(q, bio, BLK_TA_QUEUE, 0,
+			  blk_trace_bio_get_cgid(q, bio));
 }
 
 static void blk_add_trace_getrq(void *ignore,
@@ -804,13 +859,14 @@ static void blk_add_trace_getrq(void *ignore,
 				struct bio *bio, int rw)
 {
 	if (bio)
-		blk_add_trace_bio(q, bio, BLK_TA_GETRQ, 0);
+		blk_add_trace_bio(q, bio, BLK_TA_GETRQ, 0,
+				  blk_trace_bio_get_cgid(q, bio));
 	else {
 		struct blk_trace *bt = q->blk_trace;
 
 		if (bt)
 			__blk_add_trace(bt, 0, 0, rw, 0, BLK_TA_GETRQ, 0, 0,
-					NULL);
+					NULL, NULL);
 	}
 }
 
@@ -820,13 +876,14 @@ static void blk_add_trace_sleeprq(void *ignore,
 				  struct bio *bio, int rw)
 {
 	if (bio)
-		blk_add_trace_bio(q, bio, BLK_TA_SLEEPRQ, 0);
+		blk_add_trace_bio(q, bio, BLK_TA_SLEEPRQ, 0,
+				  blk_trace_bio_get_cgid(q, bio));
 	else {
 		struct blk_trace *bt = q->blk_trace;
 
 		if (bt)
 			__blk_add_trace(bt, 0, 0, rw, 0, BLK_TA_SLEEPRQ,
-					0, 0, NULL);
+					0, 0, NULL, NULL);
 	}
 }
 
@@ -835,7 +892,7 @@ static void blk_add_trace_plug(void *ignore, struct request_queue *q)
 	struct blk_trace *bt = q->blk_trace;
 
 	if (bt)
-		__blk_add_trace(bt, 0, 0, 0, 0, BLK_TA_PLUG, 0, 0, NULL);
+		__blk_add_trace(bt, 0, 0, 0, 0, BLK_TA_PLUG, 0, 0, NULL, NULL);
 }
 
 static void blk_add_trace_unplug(void *ignore, struct request_queue *q,
@@ -852,7 +909,7 @@ static void blk_add_trace_unplug(void *ignore, struct request_queue *q,
 		else
 			what = BLK_TA_UNPLUG_TIMER;
 
-		__blk_add_trace(bt, 0, 0, 0, 0, what, 0, sizeof(rpdu), &rpdu);
+		__blk_add_trace(bt, 0, 0, 0, 0, what, 0, sizeof(rpdu), &rpdu, NULL);
 	}
 }
 
@@ -868,7 +925,7 @@ static void blk_add_trace_split(void *ignore,
 		__blk_add_trace(bt, bio->bi_iter.bi_sector,
 				bio->bi_iter.bi_size, bio_op(bio), bio->bi_opf,
 				BLK_TA_SPLIT, bio->bi_status, sizeof(rpdu),
-				&rpdu);
+				&rpdu, blk_trace_bio_get_cgid(q, bio));
 	}
 }
 
@@ -901,7 +958,7 @@ static void blk_add_trace_bio_remap(void *ignore,
 
 	__blk_add_trace(bt, bio->bi_iter.bi_sector, bio->bi_iter.bi_size,
 			bio_op(bio), bio->bi_opf, BLK_TA_REMAP, bio->bi_status,
-			sizeof(r), &r);
+			sizeof(r), &r, blk_trace_bio_get_cgid(q, bio));
 }
 
 /**
@@ -934,7 +991,7 @@ static void blk_add_trace_rq_remap(void *ignore,
 
 	__blk_add_trace(bt, blk_rq_pos(rq), blk_rq_bytes(rq),
 			rq_data_dir(rq), 0, BLK_TA_REMAP, 0,
-			sizeof(r), &r);
+			sizeof(r), &r, blk_trace_request_get_cgid(q, rq));
 }
 
 /**
@@ -958,7 +1015,8 @@ void blk_add_driver_data(struct request_queue *q,
 		return;
 
 	__blk_add_trace(bt, blk_rq_trace_sector(rq), blk_rq_bytes(rq), 0, 0,
-				BLK_TA_DRV_DATA, 0, len, data);
+				BLK_TA_DRV_DATA, 0, len, data,
+				blk_trace_request_get_cgid(q, rq));
 }
 EXPORT_SYMBOL_GPL(blk_add_driver_data);
 
@@ -1031,7 +1089,7 @@ static void fill_rwbs(char *rwbs, const struct blk_io_trace *t)
 	int i = 0;
 	int tc = t->action >> BLK_TC_SHIFT;
 
-	if (t->action == BLK_TN_MESSAGE) {
+	if ((t->action & ~__BLK_TN_CGROUP) == BLK_TN_MESSAGE) {
 		rwbs[i++] = 'N';
 		goto out;
 	}
@@ -1066,9 +1124,21 @@ const struct blk_io_trace *te_blk_io_trace(const struct trace_entry *ent)
 	return (const struct blk_io_trace *)ent;
 }
 
-static inline const void *pdu_start(const struct trace_entry *ent)
+static inline const void *pdu_start(const struct trace_entry *ent, bool has_cg)
 {
-	return te_blk_io_trace(ent) + 1;
+	return (void *)(te_blk_io_trace(ent) + 1) +
+		(has_cg ? sizeof(union kernfs_node_id) : 0);
+}
+
+static inline const void *cgid_start(const struct trace_entry *ent)
+{
+	return (void *)(te_blk_io_trace(ent) + 1);
+}
+
+static inline int pdu_real_len(const struct trace_entry *ent, bool has_cg)
+{
+	return te_blk_io_trace(ent)->pdu_len -
+			(has_cg ? sizeof(union kernfs_node_id) : 0);
 }
 
 static inline u32 t_action(const struct trace_entry *ent)
@@ -1096,16 +1166,16 @@ static inline __u16 t_error(const struct trace_entry *ent)
 	return te_blk_io_trace(ent)->error;
 }
 
-static __u64 get_pdu_int(const struct trace_entry *ent)
+static __u64 get_pdu_int(const struct trace_entry *ent, bool has_cg)
 {
-	const __u64 *val = pdu_start(ent);
+	const __u64 *val = pdu_start(ent, has_cg);
 	return be64_to_cpu(*val);
 }
 
 static void get_pdu_remap(const struct trace_entry *ent,
-			  struct blk_io_trace_remap *r)
+			  struct blk_io_trace_remap *r, bool has_cg)
 {
-	const struct blk_io_trace_remap *__r = pdu_start(ent);
+	const struct blk_io_trace_remap *__r = pdu_start(ent, has_cg);
 	__u64 sector_from = __r->sector_from;
 
 	r->device_from = be32_to_cpu(__r->device_from);
@@ -1113,9 +1183,11 @@ static void get_pdu_remap(const struct trace_entry *ent,
 	r->sector_from = be64_to_cpu(sector_from);
 }
 
-typedef void (blk_log_action_t) (struct trace_iterator *iter, const char *act);
+typedef void (blk_log_action_t) (struct trace_iterator *iter, const char *act,
+	bool has_cg);
 
-static void blk_log_action_classic(struct trace_iterator *iter, const char *act)
+static void blk_log_action_classic(struct trace_iterator *iter, const char *act,
+	bool has_cg)
 {
 	char rwbs[RWBS_LEN];
 	unsigned long long ts  = iter->ts;
@@ -1131,24 +1203,33 @@ static void blk_log_action_classic(struct trace_iterator *iter, const char *act)
 			 secs, nsec_rem, iter->ent->pid, act, rwbs);
 }
 
-static void blk_log_action(struct trace_iterator *iter, const char *act)
+static void blk_log_action(struct trace_iterator *iter, const char *act,
+	bool has_cg)
 {
 	char rwbs[RWBS_LEN];
 	const struct blk_io_trace *t = te_blk_io_trace(iter->ent);
 
 	fill_rwbs(rwbs, t);
-	trace_seq_printf(&iter->seq, "%3d,%-3d %2s %3s ",
-			 MAJOR(t->device), MINOR(t->device), act, rwbs);
+	if (has_cg) {
+		const union kernfs_node_id *id = cgid_start(iter->ent);
+
+		trace_seq_printf(&iter->seq, "%3d,%-3d %x,%-x %2s %3s ",
+				 MAJOR(t->device), MINOR(t->device),
+				 id->ino, id->generation, act, rwbs);
+	} else
+		trace_seq_printf(&iter->seq, "%3d,%-3d %2s %3s ",
+				 MAJOR(t->device), MINOR(t->device), act, rwbs);
 }
 
-static void blk_log_dump_pdu(struct trace_seq *s, const struct trace_entry *ent)
+static void blk_log_dump_pdu(struct trace_seq *s,
+	const struct trace_entry *ent, bool has_cg)
 {
 	const unsigned char *pdu_buf;
 	int pdu_len;
 	int i, end;
 
-	pdu_buf = pdu_start(ent);
-	pdu_len = te_blk_io_trace(ent)->pdu_len;
+	pdu_buf = pdu_start(ent, has_cg);
+	pdu_len = pdu_real_len(ent, has_cg);
 
 	if (!pdu_len)
 		return;
@@ -1179,7 +1260,7 @@ static void blk_log_dump_pdu(struct trace_seq *s, const struct trace_entry *ent)
 	trace_seq_puts(s, ") ");
 }
 
-static void blk_log_generic(struct trace_seq *s, const struct trace_entry *ent)
+static void blk_log_generic(struct trace_seq *s, const struct trace_entry *ent, bool has_cg)
 {
 	char cmd[TASK_COMM_LEN];
 
@@ -1187,7 +1268,7 @@ static void blk_log_generic(struct trace_seq *s, const struct trace_entry *ent)
 
 	if (t_action(ent) & BLK_TC_ACT(BLK_TC_PC)) {
 		trace_seq_printf(s, "%u ", t_bytes(ent));
-		blk_log_dump_pdu(s, ent);
+		blk_log_dump_pdu(s, ent, has_cg);
 		trace_seq_printf(s, "[%s]\n", cmd);
 	} else {
 		if (t_sec(ent))
@@ -1199,10 +1280,10 @@ static void blk_log_generic(struct trace_seq *s, const struct trace_entry *ent)
 }
 
 static void blk_log_with_error(struct trace_seq *s,
-			      const struct trace_entry *ent)
+			      const struct trace_entry *ent, bool has_cg)
 {
 	if (t_action(ent) & BLK_TC_ACT(BLK_TC_PC)) {
-		blk_log_dump_pdu(s, ent);
+		blk_log_dump_pdu(s, ent, has_cg);
 		trace_seq_printf(s, "[%d]\n", t_error(ent));
 	} else {
 		if (t_sec(ent))
@@ -1215,18 +1296,18 @@ static void blk_log_with_error(struct trace_seq *s,
 	}
 }
 
-static void blk_log_remap(struct trace_seq *s, const struct trace_entry *ent)
+static void blk_log_remap(struct trace_seq *s, const struct trace_entry *ent, bool has_cg)
 {
 	struct blk_io_trace_remap r = { .device_from = 0, };
 
-	get_pdu_remap(ent, &r);
+	get_pdu_remap(ent, &r, has_cg);
 	trace_seq_printf(s, "%llu + %u <- (%d,%d) %llu\n",
 			 t_sector(ent), t_sec(ent),
 			 MAJOR(r.device_from), MINOR(r.device_from),
 			 (unsigned long long)r.sector_from);
 }
 
-static void blk_log_plug(struct trace_seq *s, const struct trace_entry *ent)
+static void blk_log_plug(struct trace_seq *s, const struct trace_entry *ent, bool has_cg)
 {
 	char cmd[TASK_COMM_LEN];
 
@@ -1235,30 +1316,31 @@ static void blk_log_plug(struct trace_seq *s, const struct trace_entry *ent)
 	trace_seq_printf(s, "[%s]\n", cmd);
 }
 
-static void blk_log_unplug(struct trace_seq *s, const struct trace_entry *ent)
+static void blk_log_unplug(struct trace_seq *s, const struct trace_entry *ent, bool has_cg)
 {
 	char cmd[TASK_COMM_LEN];
 
 	trace_find_cmdline(ent->pid, cmd);
 
-	trace_seq_printf(s, "[%s] %llu\n", cmd, get_pdu_int(ent));
+	trace_seq_printf(s, "[%s] %llu\n", cmd, get_pdu_int(ent, has_cg));
 }
 
-static void blk_log_split(struct trace_seq *s, const struct trace_entry *ent)
+static void blk_log_split(struct trace_seq *s, const struct trace_entry *ent, bool has_cg)
 {
 	char cmd[TASK_COMM_LEN];
 
 	trace_find_cmdline(ent->pid, cmd);
 
 	trace_seq_printf(s, "%llu / %llu [%s]\n", t_sector(ent),
-			 get_pdu_int(ent), cmd);
+			 get_pdu_int(ent, has_cg), cmd);
 }
 
-static void blk_log_msg(struct trace_seq *s, const struct trace_entry *ent)
+static void blk_log_msg(struct trace_seq *s, const struct trace_entry *ent,
+			bool has_cg)
 {
-	const struct blk_io_trace *t = te_blk_io_trace(ent);
 
-	trace_seq_putmem(s, t + 1, t->pdu_len);
+	trace_seq_putmem(s, pdu_start(ent, has_cg),
+		pdu_real_len(ent, has_cg));
 	trace_seq_putc(s, '\n');
 }
 
@@ -1298,7 +1380,8 @@ static void blk_tracer_reset(struct trace_array *tr)
 
 static const struct {
 	const char *act[2];
-	void	   (*print)(struct trace_seq *s, const struct trace_entry *ent);
+	void	   (*print)(struct trace_seq *s, const struct trace_entry *ent,
+			    bool has_cg);
 } what2act[] = {
 	[__BLK_TA_QUEUE]	= {{  "Q", "queue" },	   blk_log_generic },
 	[__BLK_TA_BACKMERGE]	= {{  "M", "backmerge" },  blk_log_generic },
@@ -1326,23 +1409,25 @@ static enum print_line_t print_one_line(struct trace_iterator *iter,
 	u16 what;
 	bool long_act;
 	blk_log_action_t *log_action;
+	bool has_cg;
 
 	t	   = te_blk_io_trace(iter->ent);
-	what	   = t->action & ((1 << BLK_TC_SHIFT) - 1);
+	what	   = (t->action & ((1 << BLK_TC_SHIFT) - 1)) & ~__BLK_TA_CGROUP;
 	long_act   = !!(tr->trace_flags & TRACE_ITER_VERBOSE);
 	log_action = classic ? &blk_log_action_classic : &blk_log_action;
+	has_cg	   = t->action & __BLK_TA_CGROUP;
 
-	if (t->action == BLK_TN_MESSAGE) {
-		log_action(iter, long_act ? "message" : "m");
-		blk_log_msg(s, iter->ent);
+	if ((t->action & ~__BLK_TN_CGROUP) == BLK_TN_MESSAGE) {
+		log_action(iter, long_act ? "message" : "m", has_cg);
+		blk_log_msg(s, iter->ent, has_cg);
 		return trace_handle_return(s);
 	}
 
 	if (unlikely(what == 0 || what >= ARRAY_SIZE(what2act)))
 		trace_seq_printf(s, "Unknown action %x\n", what);
 	else {
-		log_action(iter, what2act[what].act[long_act]);
-		what2act[what].print(s, iter->ent);
+		log_action(iter, what2act[what].act[long_act], has_cg);
+		what2act[what].print(s, iter->ent, has_cg);
 	}
 
 	return trace_handle_return(s);
-- 
cgit v1.2.3-71-gd317


From 1a5f3da20bd966220931239fbd31e6ac6ff42251 Mon Sep 17 00:00:00 2001
From: Vidya Sagar Ravipati <vidya.chowdary@gmail.com>
Date: Thu, 27 Jul 2017 16:47:26 -0700
Subject: net: ethtool: add support for forward error correction modes

Forward Error Correction (FEC) modes i.e Base-R
and Reed-Solomon modes are introduced in 25G/40G/100G standards
for providing good BER at high speeds. Various networking devices
which support 25G/40G/100G provides ability to manage supported FEC
modes and the lack of FEC encoding control and reporting today is a
source for interoperability issues for many vendors.
FEC capability as well as specific FEC mode i.e. Base-R
or RS modes can be requested or advertised through bits D44:47 of
base link codeword.

This patch set intends to provide option under ethtool to manage
and report FEC encoding settings for networking devices as per
IEEE 802.3 bj, bm and by specs.

set-fec/show-fec option(s) are designed to provide control and
report the FEC encoding on the link.

SET FEC option:
root@tor: ethtool --set-fec  swp1 encoding [off | RS | BaseR | auto]

Encoding: Types of encoding
Off    :  Turning off any encoding
RS     :  enforcing RS-FEC encoding on supported speeds
BaseR  :  enforcing Base R encoding on supported speeds
Auto   :  IEEE defaults for the speed/medium combination

Here are a few examples of what we would expect if encoding=auto:
- if autoneg is on, we are  expecting FEC to be negotiated as on or off
  as long as protocol supports it
- if the hardware is capable of detecting the FEC encoding on it's
      receiver it will reconfigure its encoder to match
- in absence of the above, the configuration would be set to IEEE
  defaults.

>From our  understanding , this is essentially what most hardware/driver
combinations are doing today in the absence of a way for users to
control the behavior.

SHOW FEC option:
root@tor: ethtool --show-fec  swp1
FEC parameters for swp1:
Active FEC encodings: RS
Configured FEC encodings:  RS | BaseR

ETHTOOL DEVNAME output modification:

ethtool devname output:
root@tor:~# ethtool swp1
Settings for swp1:
root@hpe-7712-03:~# ethtool swp18
Settings for swp18:
    Supported ports: [ FIBRE ]
    Supported link modes:   40000baseCR4/Full
                            40000baseSR4/Full
                            40000baseLR4/Full
                            100000baseSR4/Full
                            100000baseCR4/Full
                            100000baseLR4_ER4/Full
    Supported pause frame use: No
    Supports auto-negotiation: Yes
    Supported FEC modes: [RS | BaseR | None | Not reported]
    Advertised link modes:  Not reported
    Advertised pause frame use: No
    Advertised auto-negotiation: No
    Advertised FEC modes: [RS | BaseR | None | Not reported]
<<<< One or more FEC modes
    Speed: 100000Mb/s
    Duplex: Full
    Port: FIBRE
    PHYAD: 106
    Transceiver: internal
    Auto-negotiation: off
    Link detected: yes

This patch includes following changes
a) New ETHTOOL_SFECPARAM/SFECPARAM API, handled by
  the new get_fecparam/set_fecparam callbacks, provides support
  for configuration of forward error correction modes.
b) Link mode bits for FEC modes i.e. None (No FEC mode), RS, BaseR/FC
  are defined so that users can configure these fec modes for supported
  and advertising fields as part of link autonegotiation.

Signed-off-by: Vidya Sagar Ravipati <vidya.chowdary@gmail.com>
Signed-off-by: Dustin Byford <dustin@cumulusnetworks.com>
Signed-off-by: Roopa Prabhu <roopa@cumulusnetworks.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/ethtool.h      |  4 ++++
 include/uapi/linux/ethtool.h | 48 +++++++++++++++++++++++++++++++++++++++++++-
 net/core/ethtool.c           | 34 +++++++++++++++++++++++++++++++
 3 files changed, 85 insertions(+), 1 deletion(-)

(limited to 'include/uapi/linux')

diff --git a/include/linux/ethtool.h b/include/linux/ethtool.h
index 83cc9863444b..afdbb701fdb4 100644
--- a/include/linux/ethtool.h
+++ b/include/linux/ethtool.h
@@ -374,5 +374,9 @@ struct ethtool_ops {
 				      struct ethtool_link_ksettings *);
 	int	(*set_link_ksettings)(struct net_device *,
 				      const struct ethtool_link_ksettings *);
+	int	(*get_fecparam)(struct net_device *,
+				      struct ethtool_fecparam *);
+	int	(*set_fecparam)(struct net_device *,
+				      struct ethtool_fecparam *);
 };
 #endif /* _LINUX_ETHTOOL_H */
diff --git a/include/uapi/linux/ethtool.h b/include/uapi/linux/ethtool.h
index 7d4a594d5d58..9c041dae8e2c 100644
--- a/include/uapi/linux/ethtool.h
+++ b/include/uapi/linux/ethtool.h
@@ -1238,6 +1238,47 @@ struct ethtool_per_queue_op {
 	char	data[];
 };
 
+/**
+ * struct ethtool_fecparam - Ethernet forward error correction(fec) parameters
+ * @cmd: Command number = %ETHTOOL_GFECPARAM or %ETHTOOL_SFECPARAM
+ * @active_fec: FEC mode which is active on porte
+ * @fec: Bitmask of supported/configured FEC modes
+ * @rsvd: Reserved for future extensions. i.e FEC bypass feature.
+ *
+ * Drivers should reject a non-zero setting of @autoneg when
+ * autoneogotiation is disabled (or not supported) for the link.
+ *
+ */
+struct ethtool_fecparam {
+	__u32   cmd;
+	/* bitmask of FEC modes */
+	__u32   active_fec;
+	__u32   fec;
+	__u32   reserved;
+};
+
+/**
+ * enum ethtool_fec_config_bits - flags definition of ethtool_fec_configuration
+ * @ETHTOOL_FEC_NONE: FEC mode configuration is not supported
+ * @ETHTOOL_FEC_AUTO: Default/Best FEC mode provided by driver
+ * @ETHTOOL_FEC_OFF: No FEC Mode
+ * @ETHTOOL_FEC_RS: Reed-Solomon Forward Error Detection mode
+ * @ETHTOOL_FEC_BASER: Base-R/Reed-Solomon Forward Error Detection mode
+ */
+enum ethtool_fec_config_bits {
+	ETHTOOL_FEC_NONE_BIT,
+	ETHTOOL_FEC_AUTO_BIT,
+	ETHTOOL_FEC_OFF_BIT,
+	ETHTOOL_FEC_RS_BIT,
+	ETHTOOL_FEC_BASER_BIT,
+};
+
+#define ETHTOOL_FEC_NONE		(1 << ETHTOOL_FEC_NONE_BIT)
+#define ETHTOOL_FEC_AUTO		(1 << ETHTOOL_FEC_AUTO_BIT)
+#define ETHTOOL_FEC_OFF			(1 << ETHTOOL_FEC_OFF_BIT)
+#define ETHTOOL_FEC_RS			(1 << ETHTOOL_FEC_RS_BIT)
+#define ETHTOOL_FEC_BASER		(1 << ETHTOOL_FEC_BASER_BIT)
+
 /* CMDs currently supported */
 #define ETHTOOL_GSET		0x00000001 /* DEPRECATED, Get settings.
 					    * Please use ETHTOOL_GLINKSETTINGS
@@ -1330,6 +1371,8 @@ struct ethtool_per_queue_op {
 #define ETHTOOL_SLINKSETTINGS	0x0000004d /* Set ethtool_link_settings */
 #define ETHTOOL_PHY_GTUNABLE	0x0000004e /* Get PHY tunable configuration */
 #define ETHTOOL_PHY_STUNABLE	0x0000004f /* Set PHY tunable configuration */
+#define ETHTOOL_GFECPARAM	0x00000050 /* Get FEC settings */
+#define ETHTOOL_SFECPARAM	0x00000051 /* Set FEC settings */
 
 /* compatibility with older code */
 #define SPARC_ETH_GSET		ETHTOOL_GSET
@@ -1387,6 +1430,9 @@ enum ethtool_link_mode_bit_indices {
 	ETHTOOL_LINK_MODE_2500baseT_Full_BIT	= 47,
 	ETHTOOL_LINK_MODE_5000baseT_Full_BIT	= 48,
 
+	ETHTOOL_LINK_MODE_FEC_NONE_BIT	= 49,
+	ETHTOOL_LINK_MODE_FEC_RS_BIT	= 50,
+	ETHTOOL_LINK_MODE_FEC_BASER_BIT	= 51,
 
 	/* Last allowed bit for __ETHTOOL_LINK_MODE_LEGACY_MASK is bit
 	 * 31. Please do NOT define any SUPPORTED_* or ADVERTISED_*
@@ -1395,7 +1441,7 @@ enum ethtool_link_mode_bit_indices {
 	 */
 
 	__ETHTOOL_LINK_MODE_LAST
-	  = ETHTOOL_LINK_MODE_5000baseT_Full_BIT,
+	  = ETHTOOL_LINK_MODE_FEC_BASER_BIT,
 };
 
 #define __ETHTOOL_LINK_MODE_LEGACY_MASK(base_name)	\
diff --git a/net/core/ethtool.c b/net/core/ethtool.c
index b987bc475fc8..6a582ae4c5d9 100644
--- a/net/core/ethtool.c
+++ b/net/core/ethtool.c
@@ -2512,6 +2512,33 @@ static int set_phy_tunable(struct net_device *dev, void __user *useraddr)
 	return ret;
 }
 
+static int ethtool_get_fecparam(struct net_device *dev, void __user *useraddr)
+{
+	struct ethtool_fecparam fecparam = { ETHTOOL_GFECPARAM };
+
+	if (!dev->ethtool_ops->get_fecparam)
+		return -EOPNOTSUPP;
+
+	dev->ethtool_ops->get_fecparam(dev, &fecparam);
+
+	if (copy_to_user(useraddr, &fecparam, sizeof(fecparam)))
+		return -EFAULT;
+	return 0;
+}
+
+static int ethtool_set_fecparam(struct net_device *dev, void __user *useraddr)
+{
+	struct ethtool_fecparam fecparam;
+
+	if (!dev->ethtool_ops->set_fecparam)
+		return -EOPNOTSUPP;
+
+	if (copy_from_user(&fecparam, useraddr, sizeof(fecparam)))
+		return -EFAULT;
+
+	return dev->ethtool_ops->set_fecparam(dev, &fecparam);
+}
+
 /* The main entry point in this file.  Called from net/core/dev_ioctl.c */
 
 int dev_ethtool(struct net *net, struct ifreq *ifr)
@@ -2570,6 +2597,7 @@ int dev_ethtool(struct net *net, struct ifreq *ifr)
 	case ETHTOOL_GTUNABLE:
 	case ETHTOOL_PHY_GTUNABLE:
 	case ETHTOOL_GLINKSETTINGS:
+	case ETHTOOL_GFECPARAM:
 		break;
 	default:
 		if (!ns_capable(net->user_ns, CAP_NET_ADMIN))
@@ -2779,6 +2807,12 @@ int dev_ethtool(struct net *net, struct ifreq *ifr)
 	case ETHTOOL_PHY_STUNABLE:
 		rc = set_phy_tunable(dev, useraddr);
 		break;
+	case ETHTOOL_GFECPARAM:
+		rc = ethtool_get_fecparam(dev, useraddr);
+		break;
+	case ETHTOOL_SFECPARAM:
+		rc = ethtool_set_fecparam(dev, useraddr);
+		break;
 	default:
 		rc = -EOPNOTSUPP;
 	}
-- 
cgit v1.2.3-71-gd317


From 64c83d837329531252a1a0f0dfdd4fd607e1d8e9 Mon Sep 17 00:00:00 2001
From: Jamal Hadi Salim <jhs@mojatatu.com>
Date: Sun, 30 Jul 2017 13:24:49 -0400
Subject: net netlink: Add new type NLA_BITFIELD32

Generic bitflags attribute content sent to the kernel by user.
With this netlink attr type the user can either set or unset a
flag in the kernel.

The value is a bitmap that defines the bit values being set
The selector is a bitmask that defines which value bit is to be
considered.

A check is made to ensure the rules that a kernel subsystem always
conforms to bitflags the kernel already knows about. i.e
if the user tries to set a bit flag that is not understood then
the _it will be rejected_.

In the most basic form, the user specifies the attribute policy as:
[ATTR_GOO] = { .type = NLA_BITFIELD32, .validation_data = &myvalidflags },

where myvalidflags is the bit mask of the flags the kernel understands.

If the user _does not_ provide myvalidflags then the attribute will
also be rejected.

Examples:
value = 0x0, and selector = 0x1
implies we are selecting bit 1 and we want to set its value to 0.

value = 0x2, and selector = 0x2
implies we are selecting bit 2 and we want to set its value to 1.

Suggested-by: Jiri Pirko <jiri@mellanox.com>
Signed-off-by: Jamal Hadi Salim <jhs@mojatatu.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/netlink.h        | 16 ++++++++++++++++
 include/uapi/linux/netlink.h | 17 +++++++++++++++++
 lib/nlattr.c                 | 30 ++++++++++++++++++++++++++++++
 3 files changed, 63 insertions(+)

(limited to 'include/uapi/linux')

diff --git a/include/net/netlink.h b/include/net/netlink.h
index ef8e6c3a80a6..82dd298b40c7 100644
--- a/include/net/netlink.h
+++ b/include/net/netlink.h
@@ -178,6 +178,7 @@ enum {
 	NLA_S16,
 	NLA_S32,
 	NLA_S64,
+	NLA_BITFIELD32,
 	__NLA_TYPE_MAX,
 };
 
@@ -206,6 +207,7 @@ enum {
  *    NLA_MSECS            Leaving the length field zero will verify the
  *                         given type fits, using it verifies minimum length
  *                         just like "All other"
+ *    NLA_BITFIELD32      A 32-bit bitmap/bitselector attribute
  *    All other            Minimum length of attribute payload
  *
  * Example:
@@ -213,11 +215,13 @@ enum {
  * 	[ATTR_FOO] = { .type = NLA_U16 },
  *	[ATTR_BAR] = { .type = NLA_STRING, .len = BARSIZ },
  *	[ATTR_BAZ] = { .len = sizeof(struct mystruct) },
+ *	[ATTR_GOO] = { .type = NLA_BITFIELD32, .validation_data = &myvalidflags },
  * };
  */
 struct nla_policy {
 	u16		type;
 	u16		len;
+	void            *validation_data;
 };
 
 /**
@@ -1202,6 +1206,18 @@ static inline struct in6_addr nla_get_in6_addr(const struct nlattr *nla)
 	return tmp;
 }
 
+/**
+ * nla_get_bitfield32 - return payload of 32 bitfield attribute
+ * @nla: nla_bitfield32 attribute
+ */
+static inline struct nla_bitfield32 nla_get_bitfield32(const struct nlattr *nla)
+{
+	struct nla_bitfield32 tmp;
+
+	nla_memcpy(&tmp, nla, sizeof(tmp));
+	return tmp;
+}
+
 /**
  * nla_memdup - duplicate attribute memory (kmemdup)
  * @src: netlink attribute to duplicate from
diff --git a/include/uapi/linux/netlink.h b/include/uapi/linux/netlink.h
index f86127a46cfc..f4fc9c9e123d 100644
--- a/include/uapi/linux/netlink.h
+++ b/include/uapi/linux/netlink.h
@@ -226,5 +226,22 @@ struct nlattr {
 #define NLA_ALIGN(len)		(((len) + NLA_ALIGNTO - 1) & ~(NLA_ALIGNTO - 1))
 #define NLA_HDRLEN		((int) NLA_ALIGN(sizeof(struct nlattr)))
 
+/* Generic 32 bitflags attribute content sent to the kernel.
+ *
+ * The value is a bitmap that defines the values being set
+ * The selector is a bitmask that defines which value is legit
+ *
+ * Examples:
+ *  value = 0x0, and selector = 0x1
+ *  implies we are selecting bit 1 and we want to set its value to 0.
+ *
+ *  value = 0x2, and selector = 0x2
+ *  implies we are selecting bit 2 and we want to set its value to 1.
+ *
+ */
+struct nla_bitfield32 {
+	__u32 value;
+	__u32 selector;
+};
 
 #endif /* _UAPI__LINUX_NETLINK_H */
diff --git a/lib/nlattr.c b/lib/nlattr.c
index fb52435be42d..ee79b7a3c6b0 100644
--- a/lib/nlattr.c
+++ b/lib/nlattr.c
@@ -27,6 +27,30 @@ static const u8 nla_attr_minlen[NLA_TYPE_MAX+1] = {
 	[NLA_S64]	= sizeof(s64),
 };
 
+static int validate_nla_bitfield32(const struct nlattr *nla,
+				   u32 *valid_flags_allowed)
+{
+	const struct nla_bitfield32 *bf = nla_data(nla);
+	u32 *valid_flags_mask = valid_flags_allowed;
+
+	if (!valid_flags_allowed)
+		return -EINVAL;
+
+	/*disallow invalid bit selector */
+	if (bf->selector & ~*valid_flags_mask)
+		return -EINVAL;
+
+	/*disallow invalid bit values */
+	if (bf->value & ~*valid_flags_mask)
+		return -EINVAL;
+
+	/*disallow valid bit values that are not selected*/
+	if (bf->value & ~bf->selector)
+		return -EINVAL;
+
+	return 0;
+}
+
 static int validate_nla(const struct nlattr *nla, int maxtype,
 			const struct nla_policy *policy)
 {
@@ -46,6 +70,12 @@ static int validate_nla(const struct nlattr *nla, int maxtype,
 			return -ERANGE;
 		break;
 
+	case NLA_BITFIELD32:
+		if (attrlen != sizeof(struct nla_bitfield32))
+			return -ERANGE;
+
+		return validate_nla_bitfield32(nla, pt->validation_data);
+
 	case NLA_NUL_STRING:
 		if (pt->len)
 			minlen = min_t(int, attrlen, pt->len + 1);
-- 
cgit v1.2.3-71-gd317


From 90825b23a887f06f6c05bdde77b200c5fe9b6217 Mon Sep 17 00:00:00 2001
From: Jamal Hadi Salim <jhs@mojatatu.com>
Date: Sun, 30 Jul 2017 13:24:51 -0400
Subject: net sched actions: dump more than TCA_ACT_MAX_PRIO actions per batch

When you dump hundreds of thousands of actions, getting only 32 per
dump batch even when the socket buffer and memory allocations allow
is inefficient.

With this change, the user will get as many as possibly fitting
within the given constraints available to the kernel.

The top level action TLV space is extended. An attribute
TCA_ROOT_FLAGS is used to carry flags; flag TCA_FLAG_LARGE_DUMP_ON
is set by the user indicating the user is capable of processing
these large dumps. Older user space which doesnt set this flag
doesnt get the large (than 32) batches.
The kernel uses the TCA_ROOT_COUNT attribute to tell the user how many
actions are put in a single batch. As such user space app knows how long
to iterate (independent of the type of action being dumped)
instead of hardcoded maximum of 32 thus maintaining backward compat.

Some results dumping 1.5M actions below:
first an unpatched tc which doesnt understand these features...

prompt$ time -p tc actions ls action gact | grep index | wc -l
1500000
real 1388.43
user 2.07
sys 1386.79

Now lets see a patched tc which sets the correct flags when requesting
a dump:

prompt$ time -p updatedtc actions ls action gact | grep index | wc -l
1500000
real 178.13
user 2.02
sys 176.96

That is about 8x performance improvement for tc app which sets its
receive buffer to about 32K.

Signed-off-by: Jamal Hadi Salim <jhs@mojatatu.com>
Reviewed-by: Jiri Pirko <jiri@mellanox.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/uapi/linux/rtnetlink.h | 22 +++++++++++++++++--
 net/sched/act_api.c            | 50 +++++++++++++++++++++++++++++++++---------
 2 files changed, 60 insertions(+), 12 deletions(-)

(limited to 'include/uapi/linux')

diff --git a/include/uapi/linux/rtnetlink.h b/include/uapi/linux/rtnetlink.h
index d148505010a7..bfa80a6164d9 100644
--- a/include/uapi/linux/rtnetlink.h
+++ b/include/uapi/linux/rtnetlink.h
@@ -683,10 +683,28 @@ struct tcamsg {
 	unsigned char	tca__pad1;
 	unsigned short	tca__pad2;
 };
+
+enum {
+	TCA_ROOT_UNSPEC,
+	TCA_ROOT_TAB,
+#define TCA_ACT_TAB TCA_ROOT_TAB
+#define TCAA_MAX TCA_ROOT_TAB
+	TCA_ROOT_FLAGS,
+	TCA_ROOT_COUNT,
+	__TCA_ROOT_MAX,
+#define	TCA_ROOT_MAX (__TCA_ROOT_MAX - 1)
+};
+
 #define TA_RTA(r)  ((struct rtattr*)(((char*)(r)) + NLMSG_ALIGN(sizeof(struct tcamsg))))
 #define TA_PAYLOAD(n) NLMSG_PAYLOAD(n,sizeof(struct tcamsg))
-#define TCA_ACT_TAB 1 /* attr type must be >=1 */	
-#define TCAA_MAX 1
+/* tcamsg flags stored in attribute TCA_ROOT_FLAGS
+ *
+ * TCA_FLAG_LARGE_DUMP_ON user->kernel to request for larger than TCA_ACT_MAX_PRIO
+ * actions in a dump. All dump responses will contain the number of actions
+ * being dumped stored in for user app's consumption in TCA_ROOT_COUNT
+ *
+ */
+#define TCA_FLAG_LARGE_DUMP_ON		(1 << 0)
 
 /* New extended info filters for IFLA_EXT_MASK */
 #define RTEXT_FILTER_VF		(1 << 0)
diff --git a/net/sched/act_api.c b/net/sched/act_api.c
index 848370e2fcca..d53653a73c4f 100644
--- a/net/sched/act_api.c
+++ b/net/sched/act_api.c
@@ -110,6 +110,7 @@ static int tcf_dump_walker(struct tcf_hashinfo *hinfo, struct sk_buff *skb,
 			   struct netlink_callback *cb)
 {
 	int err = 0, index = -1, i = 0, s_i = 0, n_i = 0;
+	u32 act_flags = cb->args[2];
 	struct nlattr *nest;
 
 	spin_lock_bh(&hinfo->lock);
@@ -138,14 +139,18 @@ static int tcf_dump_walker(struct tcf_hashinfo *hinfo, struct sk_buff *skb,
 			}
 			nla_nest_end(skb, nest);
 			n_i++;
-			if (n_i >= TCA_ACT_MAX_PRIO)
+			if (!(act_flags & TCA_FLAG_LARGE_DUMP_ON) &&
+			    n_i >= TCA_ACT_MAX_PRIO)
 				goto done;
 		}
 	}
 done:
 	spin_unlock_bh(&hinfo->lock);
-	if (n_i)
+	if (n_i) {
 		cb->args[0] += n_i;
+		if (act_flags & TCA_FLAG_LARGE_DUMP_ON)
+			cb->args[1] = n_i;
+	}
 	return n_i;
 
 nla_put_failure:
@@ -1068,11 +1073,17 @@ static int tcf_action_add(struct net *net, struct nlattr *nla,
 	return tcf_add_notify(net, n, &actions, portid);
 }
 
+static u32 tcaa_root_flags_allowed = TCA_FLAG_LARGE_DUMP_ON;
+static const struct nla_policy tcaa_policy[TCA_ROOT_MAX + 1] = {
+	[TCA_ROOT_FLAGS] = { .type = NLA_BITFIELD32,
+			     .validation_data = &tcaa_root_flags_allowed },
+};
+
 static int tc_ctl_action(struct sk_buff *skb, struct nlmsghdr *n,
 			 struct netlink_ext_ack *extack)
 {
 	struct net *net = sock_net(skb->sk);
-	struct nlattr *tca[TCAA_MAX + 1];
+	struct nlattr *tca[TCA_ROOT_MAX + 1];
 	u32 portid = skb ? NETLINK_CB(skb).portid : 0;
 	int ret = 0, ovr = 0;
 
@@ -1080,7 +1091,7 @@ static int tc_ctl_action(struct sk_buff *skb, struct nlmsghdr *n,
 	    !netlink_capable(skb, CAP_NET_ADMIN))
 		return -EPERM;
 
-	ret = nlmsg_parse(n, sizeof(struct tcamsg), tca, TCAA_MAX, NULL,
+	ret = nlmsg_parse(n, sizeof(struct tcamsg), tca, TCA_ROOT_MAX, NULL,
 			  extack);
 	if (ret < 0)
 		return ret;
@@ -1121,16 +1132,12 @@ replay:
 	return ret;
 }
 
-static struct nlattr *find_dump_kind(const struct nlmsghdr *n)
+static struct nlattr *find_dump_kind(struct nlattr **nla)
 {
 	struct nlattr *tb1, *tb2[TCA_ACT_MAX + 1];
 	struct nlattr *tb[TCA_ACT_MAX_PRIO + 1];
-	struct nlattr *nla[TCAA_MAX + 1];
 	struct nlattr *kind;
 
-	if (nlmsg_parse(n, sizeof(struct tcamsg), nla, TCAA_MAX,
-			NULL, NULL) < 0)
-		return NULL;
 	tb1 = nla[TCA_ACT_TAB];
 	if (tb1 == NULL)
 		return NULL;
@@ -1157,8 +1164,18 @@ static int tc_dump_action(struct sk_buff *skb, struct netlink_callback *cb)
 	struct tc_action_ops *a_o;
 	int ret = 0;
 	struct tcamsg *t = (struct tcamsg *) nlmsg_data(cb->nlh);
-	struct nlattr *kind = find_dump_kind(cb->nlh);
+	struct nlattr *tb[TCA_ROOT_MAX + 1];
+	struct nlattr *count_attr = NULL;
+	struct nlattr *kind = NULL;
+	struct nla_bitfield32 bf;
+	u32 act_count = 0;
+
+	ret = nlmsg_parse(cb->nlh, sizeof(struct tcamsg), tb, TCA_ROOT_MAX,
+			  tcaa_policy, NULL);
+	if (ret < 0)
+		return ret;
 
+	kind = find_dump_kind(tb);
 	if (kind == NULL) {
 		pr_info("tc_dump_action: action bad kind\n");
 		return 0;
@@ -1168,14 +1185,24 @@ static int tc_dump_action(struct sk_buff *skb, struct netlink_callback *cb)
 	if (a_o == NULL)
 		return 0;
 
+	cb->args[2] = 0;
+	if (tb[TCA_ROOT_FLAGS]) {
+		bf = nla_get_bitfield32(tb[TCA_ROOT_FLAGS]);
+		cb->args[2] = bf.value;
+	}
+
 	nlh = nlmsg_put(skb, NETLINK_CB(cb->skb).portid, cb->nlh->nlmsg_seq,
 			cb->nlh->nlmsg_type, sizeof(*t), 0);
 	if (!nlh)
 		goto out_module_put;
+
 	t = nlmsg_data(nlh);
 	t->tca_family = AF_UNSPEC;
 	t->tca__pad1 = 0;
 	t->tca__pad2 = 0;
+	count_attr = nla_reserve(skb, TCA_ROOT_COUNT, sizeof(u32));
+	if (!count_attr)
+		goto out_module_put;
 
 	nest = nla_nest_start(skb, TCA_ACT_TAB);
 	if (nest == NULL)
@@ -1188,6 +1215,9 @@ static int tc_dump_action(struct sk_buff *skb, struct netlink_callback *cb)
 	if (ret > 0) {
 		nla_nest_end(skb, nest);
 		ret = skb->len;
+		act_count = cb->args[1];
+		memcpy(nla_data(count_attr), &act_count, sizeof(u32));
+		cb->args[1] = 0;
 	} else
 		nlmsg_trim(skb, b);
 
-- 
cgit v1.2.3-71-gd317


From e62e484df04964ac947c679ef4f00c54ae5395aa Mon Sep 17 00:00:00 2001
From: Jamal Hadi Salim <jhs@mojatatu.com>
Date: Sun, 30 Jul 2017 13:24:52 -0400
Subject: net sched actions: add time filter for action dumping

This patch adds support for filtering based on time since last used.
When we are dumping a large number of actions it is useful to
have the option of filtering based on when the action was last
used to reduce the amount of data crossing to user space.

With this patch the user space app sets the TCA_ROOT_TIME_DELTA
attribute with the value in milliseconds with "time of interest
since now".  The kernel converts this to jiffies and does the
filtering comparison matching entries that have seen activity
since then and returns them to user space.
Old kernels and old tc continue to work in legacy mode since
they dont specify this attribute.

Some example (we have 400 actions bound to 400 filters); at
installation time. Using updated when tc setting the time of
interest to 120 seconds earlier (we see 400 actions):
prompt$ hackedtc actions ls action gact since 120000| grep index | wc -l
400

go get some coffee and wait for > 120 seconds and try again:

prompt$ hackedtc actions ls action gact since 120000 | grep index | wc -l
0

Lets see a filter bound to one of these actions:
....
filter pref 10 u32
filter pref 10 u32 fh 800: ht divisor 1
filter pref 10 u32 fh 800::800 order 2048 key ht 800 bkt 0 flowid 1:10  (rule hit 2 success 1)
  match 7f000002/ffffffff at 12 (success 1 )
    action order 1: gact action pass
     random type none pass val 0
     index 23 ref 2 bind 1 installed 1145 sec used 802 sec
    Action statistics:
    Sent 84 bytes 1 pkt (dropped 0, overlimits 0 requeues 0)
    backlog 0b 0p requeues 0
....

that coffee took long, no? It was good.

Now lets ping -c 1 127.0.0.2, then run the actions again:
prompt$ hackedtc actions ls action gact since 120 | grep index | wc -l
1

More details please:
prompt$ hackedtc -s actions ls action gact since 120000

    action order 0: gact action pass
     random type none pass val 0
     index 23 ref 2 bind 1 installed 1270 sec used 30 sec
    Action statistics:
    Sent 168 bytes 2 pkt (dropped 0, overlimits 0 requeues 0)
    backlog 0b 0p requeues 0

And the filter?

filter pref 10 u32
filter pref 10 u32 fh 800: ht divisor 1
filter pref 10 u32 fh 800::800 order 2048 key ht 800 bkt 0 flowid 1:10  (rule hit 4 success 2)
  match 7f000002/ffffffff at 12 (success 2 )
    action order 1: gact action pass
     random type none pass val 0
     index 23 ref 2 bind 1 installed 1324 sec used 84 sec
    Action statistics:
    Sent 168 bytes 2 pkt (dropped 0, overlimits 0 requeues 0)
    backlog 0b 0p requeues 0

Signed-off-by: Jamal Hadi Salim <jhs@mojatatu.com>
Reviewed-by: Jiri Pirko <jiri@mellanox.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/uapi/linux/rtnetlink.h |  1 +
 net/sched/act_api.c            | 21 ++++++++++++++++++++-
 2 files changed, 21 insertions(+), 1 deletion(-)

(limited to 'include/uapi/linux')

diff --git a/include/uapi/linux/rtnetlink.h b/include/uapi/linux/rtnetlink.h
index bfa80a6164d9..dab7dad9e01a 100644
--- a/include/uapi/linux/rtnetlink.h
+++ b/include/uapi/linux/rtnetlink.h
@@ -691,6 +691,7 @@ enum {
 #define TCAA_MAX TCA_ROOT_TAB
 	TCA_ROOT_FLAGS,
 	TCA_ROOT_COUNT,
+	TCA_ROOT_TIME_DELTA, /* in msecs */
 	__TCA_ROOT_MAX,
 #define	TCA_ROOT_MAX (__TCA_ROOT_MAX - 1)
 };
diff --git a/net/sched/act_api.c b/net/sched/act_api.c
index d53653a73c4f..f19b118df414 100644
--- a/net/sched/act_api.c
+++ b/net/sched/act_api.c
@@ -111,6 +111,7 @@ static int tcf_dump_walker(struct tcf_hashinfo *hinfo, struct sk_buff *skb,
 {
 	int err = 0, index = -1, i = 0, s_i = 0, n_i = 0;
 	u32 act_flags = cb->args[2];
+	unsigned long jiffy_since = cb->args[3];
 	struct nlattr *nest;
 
 	spin_lock_bh(&hinfo->lock);
@@ -128,6 +129,11 @@ static int tcf_dump_walker(struct tcf_hashinfo *hinfo, struct sk_buff *skb,
 			if (index < s_i)
 				continue;
 
+			if (jiffy_since &&
+			    time_after(jiffy_since,
+				       (unsigned long)p->tcfa_tm.lastuse))
+				continue;
+
 			nest = nla_nest_start(skb, n_i);
 			if (nest == NULL)
 				goto nla_put_failure;
@@ -145,9 +151,11 @@ static int tcf_dump_walker(struct tcf_hashinfo *hinfo, struct sk_buff *skb,
 		}
 	}
 done:
+	if (index >= 0)
+		cb->args[0] = index + 1;
+
 	spin_unlock_bh(&hinfo->lock);
 	if (n_i) {
-		cb->args[0] += n_i;
 		if (act_flags & TCA_FLAG_LARGE_DUMP_ON)
 			cb->args[1] = n_i;
 	}
@@ -1077,6 +1085,7 @@ static u32 tcaa_root_flags_allowed = TCA_FLAG_LARGE_DUMP_ON;
 static const struct nla_policy tcaa_policy[TCA_ROOT_MAX + 1] = {
 	[TCA_ROOT_FLAGS] = { .type = NLA_BITFIELD32,
 			     .validation_data = &tcaa_root_flags_allowed },
+	[TCA_ROOT_TIME_DELTA]      = { .type = NLA_U32 },
 };
 
 static int tc_ctl_action(struct sk_buff *skb, struct nlmsghdr *n,
@@ -1166,8 +1175,10 @@ static int tc_dump_action(struct sk_buff *skb, struct netlink_callback *cb)
 	struct tcamsg *t = (struct tcamsg *) nlmsg_data(cb->nlh);
 	struct nlattr *tb[TCA_ROOT_MAX + 1];
 	struct nlattr *count_attr = NULL;
+	unsigned long jiffy_since = 0;
 	struct nlattr *kind = NULL;
 	struct nla_bitfield32 bf;
+	u32 msecs_since = 0;
 	u32 act_count = 0;
 
 	ret = nlmsg_parse(cb->nlh, sizeof(struct tcamsg), tb, TCA_ROOT_MAX,
@@ -1191,15 +1202,23 @@ static int tc_dump_action(struct sk_buff *skb, struct netlink_callback *cb)
 		cb->args[2] = bf.value;
 	}
 
+	if (tb[TCA_ROOT_TIME_DELTA]) {
+		msecs_since = nla_get_u32(tb[TCA_ROOT_TIME_DELTA]);
+	}
+
 	nlh = nlmsg_put(skb, NETLINK_CB(cb->skb).portid, cb->nlh->nlmsg_seq,
 			cb->nlh->nlmsg_type, sizeof(*t), 0);
 	if (!nlh)
 		goto out_module_put;
 
+	if (msecs_since)
+		jiffy_since = jiffies - msecs_to_jiffies(msecs_since);
+
 	t = nlmsg_data(nlh);
 	t->tca_family = AF_UNSPEC;
 	t->tca__pad1 = 0;
 	t->tca__pad2 = 0;
+	cb->args[3] = jiffy_since;
 	count_attr = nla_reserve(skb, TCA_ROOT_COUNT, sizeof(u32));
 	if (!count_attr)
 		goto out_module_put;
-- 
cgit v1.2.3-71-gd317


From e46abbcc05aa8a16b0e7f5c94e86d11af9aa2770 Mon Sep 17 00:00:00 2001
From: Phil Sutter <phil@nwl.cc>
Date: Thu, 27 Jul 2017 16:56:41 +0200
Subject: netfilter: nf_tables: Allow table names of up to 255 chars

Allocate all table names dynamically to allow for arbitrary lengths but
introduce NFT_NAME_MAXLEN as an upper sanity boundary. It's value was
chosen to allow using a domain name as per RFC 1035.

Signed-off-by: Phil Sutter <phil@nwl.cc>
Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
---
 include/net/netfilter/nf_tables.h        |  2 +-
 include/uapi/linux/netfilter/nf_tables.h |  3 +-
 net/netfilter/nf_tables_api.c            | 49 +++++++++++++++++++++++---------
 net/netfilter/nf_tables_trace.c          |  2 +-
 4 files changed, 40 insertions(+), 16 deletions(-)

(limited to 'include/uapi/linux')

diff --git a/include/net/netfilter/nf_tables.h b/include/net/netfilter/nf_tables.h
index bd5be0d691d5..05ecf78ec078 100644
--- a/include/net/netfilter/nf_tables.h
+++ b/include/net/netfilter/nf_tables.h
@@ -957,7 +957,7 @@ struct nft_table {
 	u32				use;
 	u16				flags:14,
 					genmask:2;
-	char				name[NFT_TABLE_MAXNAMELEN];
+	char				*name;
 };
 
 enum nft_af_flags {
diff --git a/include/uapi/linux/netfilter/nf_tables.h b/include/uapi/linux/netfilter/nf_tables.h
index 6f0a950e21c3..0b94e572ef16 100644
--- a/include/uapi/linux/netfilter/nf_tables.h
+++ b/include/uapi/linux/netfilter/nf_tables.h
@@ -1,7 +1,8 @@
 #ifndef _LINUX_NF_TABLES_H
 #define _LINUX_NF_TABLES_H
 
-#define NFT_TABLE_MAXNAMELEN	32
+#define NFT_NAME_MAXLEN		256
+#define NFT_TABLE_MAXNAMELEN	NFT_NAME_MAXLEN
 #define NFT_CHAIN_MAXNAMELEN	32
 #define NFT_SET_MAXNAMELEN	32
 #define NFT_OBJ_MAXNAMELEN	32
diff --git a/net/netfilter/nf_tables_api.c b/net/netfilter/nf_tables_api.c
index b77ad0813564..c2e392d5e512 100644
--- a/net/netfilter/nf_tables_api.c
+++ b/net/netfilter/nf_tables_api.c
@@ -726,7 +726,10 @@ static int nf_tables_newtable(struct net *net, struct sock *nlsk,
 	if (table == NULL)
 		goto err2;
 
-	nla_strlcpy(table->name, name, NFT_TABLE_MAXNAMELEN);
+	table->name = nla_strdup(name, GFP_KERNEL);
+	if (table->name == NULL)
+		goto err3;
+
 	INIT_LIST_HEAD(&table->chains);
 	INIT_LIST_HEAD(&table->sets);
 	INIT_LIST_HEAD(&table->objects);
@@ -735,10 +738,12 @@ static int nf_tables_newtable(struct net *net, struct sock *nlsk,
 	nft_ctx_init(&ctx, net, skb, nlh, afi, table, NULL, nla);
 	err = nft_trans_table_add(&ctx, NFT_MSG_NEWTABLE);
 	if (err < 0)
-		goto err3;
+		goto err4;
 
 	list_add_tail_rcu(&table->list, &afi->tables);
 	return 0;
+err4:
+	kfree(table->name);
 err3:
 	kfree(table);
 err2:
@@ -865,6 +870,7 @@ static void nf_tables_table_destroy(struct nft_ctx *ctx)
 {
 	BUG_ON(ctx->table->use > 0);
 
+	kfree(ctx->table->name);
 	kfree(ctx->table);
 	module_put(ctx->afi->owner);
 }
@@ -1972,7 +1978,7 @@ err:
 }
 
 struct nft_rule_dump_ctx {
-	char table[NFT_TABLE_MAXNAMELEN];
+	char *table;
 	char chain[NFT_CHAIN_MAXNAMELEN];
 };
 
@@ -1997,7 +2003,7 @@ static int nf_tables_dump_rules(struct sk_buff *skb,
 			continue;
 
 		list_for_each_entry_rcu(table, &afi->tables, list) {
-			if (ctx && ctx->table[0] &&
+			if (ctx && ctx->table &&
 			    strcmp(ctx->table, table->name) != 0)
 				continue;
 
@@ -2037,7 +2043,12 @@ done:
 
 static int nf_tables_dump_rules_done(struct netlink_callback *cb)
 {
-	kfree(cb->data);
+	struct nft_rule_dump_ctx *ctx = cb->data;
+
+	if (ctx) {
+		kfree(ctx->table);
+		kfree(ctx);
+	}
 	return 0;
 }
 
@@ -2069,9 +2080,14 @@ static int nf_tables_getrule(struct net *net, struct sock *nlsk,
 			if (!ctx)
 				return -ENOMEM;
 
-			if (nla[NFTA_RULE_TABLE])
-				nla_strlcpy(ctx->table, nla[NFTA_RULE_TABLE],
-					    sizeof(ctx->table));
+			if (nla[NFTA_RULE_TABLE]) {
+				ctx->table = nla_strdup(nla[NFTA_RULE_TABLE],
+							GFP_KERNEL);
+				if (!ctx->table) {
+					kfree(ctx);
+					return -ENOMEM;
+				}
+			}
 			if (nla[NFTA_RULE_CHAIN])
 				nla_strlcpy(ctx->chain, nla[NFTA_RULE_CHAIN],
 					    sizeof(ctx->chain));
@@ -4410,7 +4426,7 @@ nla_put_failure:
 }
 
 struct nft_obj_filter {
-	char		table[NFT_OBJ_MAXNAMELEN];
+	char		*table;
 	u32		type;
 };
 
@@ -4475,7 +4491,10 @@ done:
 
 static int nf_tables_dump_obj_done(struct netlink_callback *cb)
 {
-	kfree(cb->data);
+	struct nft_obj_filter *filter = cb->data;
+
+	kfree(filter->table);
+	kfree(filter);
 
 	return 0;
 }
@@ -4489,9 +4508,13 @@ nft_obj_filter_alloc(const struct nlattr * const nla[])
 	if (!filter)
 		return ERR_PTR(-ENOMEM);
 
-	if (nla[NFTA_OBJ_TABLE])
-		nla_strlcpy(filter->table, nla[NFTA_OBJ_TABLE],
-			    NFT_TABLE_MAXNAMELEN);
+	if (nla[NFTA_OBJ_TABLE]) {
+		filter->table = nla_strdup(nla[NFTA_OBJ_TABLE], GFP_KERNEL);
+		if (!filter->table) {
+			kfree(filter);
+			return ERR_PTR(-ENOMEM);
+		}
+	}
 	if (nla[NFTA_OBJ_TYPE])
 		filter->type = ntohl(nla_get_be32(nla[NFTA_OBJ_TYPE]));
 
diff --git a/net/netfilter/nf_tables_trace.c b/net/netfilter/nf_tables_trace.c
index 0c3a0049e4aa..62787d985e9d 100644
--- a/net/netfilter/nf_tables_trace.c
+++ b/net/netfilter/nf_tables_trace.c
@@ -175,7 +175,7 @@ void nft_trace_notify(struct nft_traceinfo *info)
 		return;
 
 	size = nlmsg_total_size(sizeof(struct nfgenmsg)) +
-		nla_total_size(NFT_TABLE_MAXNAMELEN) +
+		nla_total_size(strlen(info->chain->table->name)) +
 		nla_total_size(NFT_CHAIN_MAXNAMELEN) +
 		nla_total_size_64bit(sizeof(__be64)) +	/* rule handle */
 		nla_total_size(sizeof(__be32)) +	/* trace type */
-- 
cgit v1.2.3-71-gd317


From b7263e071aba736cea9e71cdf2e76dfa7aebd039 Mon Sep 17 00:00:00 2001
From: Phil Sutter <phil@nwl.cc>
Date: Thu, 27 Jul 2017 16:56:42 +0200
Subject: netfilter: nf_tables: Allow chain name of up to 255 chars

Same conversion as for table names, use NFT_NAME_MAXLEN as upper
boundary as well.

Signed-off-by: Phil Sutter <phil@nwl.cc>
Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
---
 include/net/netfilter/nf_tables.h        |  4 ++--
 include/uapi/linux/netfilter/nf_tables.h |  2 +-
 net/netfilter/nf_tables_api.c            | 34 ++++++++++++++++++++++++--------
 net/netfilter/nf_tables_trace.c          | 27 +++++++++++++++++++++++--
 4 files changed, 54 insertions(+), 13 deletions(-)

(limited to 'include/uapi/linux')

diff --git a/include/net/netfilter/nf_tables.h b/include/net/netfilter/nf_tables.h
index 05ecf78ec078..be1610162ee0 100644
--- a/include/net/netfilter/nf_tables.h
+++ b/include/net/netfilter/nf_tables.h
@@ -859,7 +859,7 @@ struct nft_chain {
 	u16				level;
 	u8				flags:6,
 					genmask:2;
-	char				name[NFT_CHAIN_MAXNAMELEN];
+	char				*name;
 };
 
 enum nft_chain_type {
@@ -1272,7 +1272,7 @@ struct nft_trans_set {
 
 struct nft_trans_chain {
 	bool				update;
-	char				name[NFT_CHAIN_MAXNAMELEN];
+	char				*name;
 	struct nft_stats __percpu	*stats;
 	u8				policy;
 };
diff --git a/include/uapi/linux/netfilter/nf_tables.h b/include/uapi/linux/netfilter/nf_tables.h
index 0b94e572ef16..d9c03a8608ee 100644
--- a/include/uapi/linux/netfilter/nf_tables.h
+++ b/include/uapi/linux/netfilter/nf_tables.h
@@ -3,7 +3,7 @@
 
 #define NFT_NAME_MAXLEN		256
 #define NFT_TABLE_MAXNAMELEN	NFT_NAME_MAXLEN
-#define NFT_CHAIN_MAXNAMELEN	32
+#define NFT_CHAIN_MAXNAMELEN	NFT_NAME_MAXLEN
 #define NFT_SET_MAXNAMELEN	32
 #define NFT_OBJ_MAXNAMELEN	32
 #define NFT_USERDATA_MAXLEN	256
diff --git a/net/netfilter/nf_tables_api.c b/net/netfilter/nf_tables_api.c
index c2e392d5e512..747499039709 100644
--- a/net/netfilter/nf_tables_api.c
+++ b/net/netfilter/nf_tables_api.c
@@ -1250,8 +1250,10 @@ static void nf_tables_chain_destroy(struct nft_chain *chain)
 			static_branch_dec(&nft_counters_enabled);
 		if (basechain->ops[0].dev != NULL)
 			dev_put(basechain->ops[0].dev);
+		kfree(chain->name);
 		kfree(basechain);
 	} else {
+		kfree(chain->name);
 		kfree(chain);
 	}
 }
@@ -1476,8 +1478,13 @@ static int nf_tables_newchain(struct net *net, struct sock *nlsk,
 			nft_trans_chain_policy(trans) = -1;
 
 		if (nla[NFTA_CHAIN_HANDLE] && name) {
-			nla_strlcpy(nft_trans_chain_name(trans), name,
-				    NFT_CHAIN_MAXNAMELEN);
+			nft_trans_chain_name(trans) =
+				nla_strdup(name, GFP_KERNEL);
+			if (!nft_trans_chain_name(trans)) {
+				kfree(trans);
+				free_percpu(stats);
+				return -ENOMEM;
+			}
 		}
 		list_add_tail(&trans->list, &net->nft.commit_list);
 		return 0;
@@ -1544,7 +1551,11 @@ static int nf_tables_newchain(struct net *net, struct sock *nlsk,
 	INIT_LIST_HEAD(&chain->rules);
 	chain->handle = nf_tables_alloc_handle(table);
 	chain->table = table;
-	nla_strlcpy(chain->name, name, NFT_CHAIN_MAXNAMELEN);
+	chain->name = nla_strdup(name, GFP_KERNEL);
+	if (!chain->name) {
+		err = -ENOMEM;
+		goto err1;
+	}
 
 	err = nf_tables_register_hooks(net, table, chain, afi->nops);
 	if (err < 0)
@@ -1979,7 +1990,7 @@ err:
 
 struct nft_rule_dump_ctx {
 	char *table;
-	char chain[NFT_CHAIN_MAXNAMELEN];
+	char *chain;
 };
 
 static int nf_tables_dump_rules(struct sk_buff *skb,
@@ -2047,6 +2058,7 @@ static int nf_tables_dump_rules_done(struct netlink_callback *cb)
 
 	if (ctx) {
 		kfree(ctx->table);
+		kfree(ctx->chain);
 		kfree(ctx);
 	}
 	return 0;
@@ -2088,9 +2100,15 @@ static int nf_tables_getrule(struct net *net, struct sock *nlsk,
 					return -ENOMEM;
 				}
 			}
-			if (nla[NFTA_RULE_CHAIN])
-				nla_strlcpy(ctx->chain, nla[NFTA_RULE_CHAIN],
-					    sizeof(ctx->chain));
+			if (nla[NFTA_RULE_CHAIN]) {
+				ctx->chain = nla_strdup(nla[NFTA_RULE_CHAIN],
+							GFP_KERNEL);
+				if (!ctx->chain) {
+					kfree(ctx->table);
+					kfree(ctx);
+					return -ENOMEM;
+				}
+			}
 			c.data = ctx;
 		}
 
@@ -4863,7 +4881,7 @@ static void nft_chain_commit_update(struct nft_trans *trans)
 {
 	struct nft_base_chain *basechain;
 
-	if (nft_trans_chain_name(trans)[0])
+	if (nft_trans_chain_name(trans))
 		strcpy(trans->ctx.chain->name, nft_trans_chain_name(trans));
 
 	if (!nft_is_base_chain(trans->ctx.chain))
diff --git a/net/netfilter/nf_tables_trace.c b/net/netfilter/nf_tables_trace.c
index 62787d985e9d..e1dc527a493b 100644
--- a/net/netfilter/nf_tables_trace.c
+++ b/net/netfilter/nf_tables_trace.c
@@ -162,6 +162,27 @@ static int nf_trace_fill_rule_info(struct sk_buff *nlskb,
 			    NFTA_TRACE_PAD);
 }
 
+static bool nft_trace_have_verdict_chain(struct nft_traceinfo *info)
+{
+	switch (info->type) {
+	case NFT_TRACETYPE_RETURN:
+	case NFT_TRACETYPE_RULE:
+		break;
+	default:
+		return false;
+	}
+
+	switch (info->verdict->code) {
+	case NFT_JUMP:
+	case NFT_GOTO:
+		break;
+	default:
+		return false;
+	}
+
+	return true;
+}
+
 void nft_trace_notify(struct nft_traceinfo *info)
 {
 	const struct nft_pktinfo *pkt = info->pkt;
@@ -176,12 +197,11 @@ void nft_trace_notify(struct nft_traceinfo *info)
 
 	size = nlmsg_total_size(sizeof(struct nfgenmsg)) +
 		nla_total_size(strlen(info->chain->table->name)) +
-		nla_total_size(NFT_CHAIN_MAXNAMELEN) +
+		nla_total_size(strlen(info->chain->name)) +
 		nla_total_size_64bit(sizeof(__be64)) +	/* rule handle */
 		nla_total_size(sizeof(__be32)) +	/* trace type */
 		nla_total_size(0) +			/* VERDICT, nested */
 			nla_total_size(sizeof(u32)) +	/* verdict code */
-			nla_total_size(NFT_CHAIN_MAXNAMELEN) + /* jump target */
 		nla_total_size(sizeof(u32)) +		/* id */
 		nla_total_size(NFT_TRACETYPE_LL_HSIZE) +
 		nla_total_size(NFT_TRACETYPE_NETWORK_HSIZE) +
@@ -194,6 +214,9 @@ void nft_trace_notify(struct nft_traceinfo *info)
 		nla_total_size(sizeof(u32)) +		/* nfproto */
 		nla_total_size(sizeof(u32));		/* policy */
 
+	if (nft_trace_have_verdict_chain(info))
+		size += nla_total_size(strlen(info->verdict->chain->name)); /* jump target */
+
 	skb = nlmsg_new(size, GFP_ATOMIC);
 	if (!skb)
 		return;
-- 
cgit v1.2.3-71-gd317


From 387454901bd62022ac1b04e15bd8d4fcc60bbed4 Mon Sep 17 00:00:00 2001
From: Phil Sutter <phil@nwl.cc>
Date: Thu, 27 Jul 2017 16:56:43 +0200
Subject: netfilter: nf_tables: Allow set names of up to 255 chars

Same conversion as for table names, use NFT_NAME_MAXLEN as upper
boundary as well.

Signed-off-by: Phil Sutter <phil@nwl.cc>
Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
---
 include/net/netfilter/nf_tables.h        |  2 +-
 include/uapi/linux/netfilter/nf_tables.h |  2 +-
 net/netfilter/nf_tables_api.c            | 18 ++++++++++++++----
 3 files changed, 16 insertions(+), 6 deletions(-)

(limited to 'include/uapi/linux')

diff --git a/include/net/netfilter/nf_tables.h b/include/net/netfilter/nf_tables.h
index be1610162ee0..66ba62fa7d90 100644
--- a/include/net/netfilter/nf_tables.h
+++ b/include/net/netfilter/nf_tables.h
@@ -396,7 +396,7 @@ void nft_unregister_set(struct nft_set_type *type);
 struct nft_set {
 	struct list_head		list;
 	struct list_head		bindings;
-	char				name[NFT_SET_MAXNAMELEN];
+	char				*name;
 	u32				ktype;
 	u32				dtype;
 	u32				objtype;
diff --git a/include/uapi/linux/netfilter/nf_tables.h b/include/uapi/linux/netfilter/nf_tables.h
index d9c03a8608ee..b5e73e80b7b6 100644
--- a/include/uapi/linux/netfilter/nf_tables.h
+++ b/include/uapi/linux/netfilter/nf_tables.h
@@ -4,7 +4,7 @@
 #define NFT_NAME_MAXLEN		256
 #define NFT_TABLE_MAXNAMELEN	NFT_NAME_MAXLEN
 #define NFT_CHAIN_MAXNAMELEN	NFT_NAME_MAXLEN
-#define NFT_SET_MAXNAMELEN	32
+#define NFT_SET_MAXNAMELEN	NFT_NAME_MAXLEN
 #define NFT_OBJ_MAXNAMELEN	32
 #define NFT_USERDATA_MAXLEN	256
 
diff --git a/net/netfilter/nf_tables_api.c b/net/netfilter/nf_tables_api.c
index 747499039709..e6a07f27b1a3 100644
--- a/net/netfilter/nf_tables_api.c
+++ b/net/netfilter/nf_tables_api.c
@@ -2650,7 +2650,7 @@ static int nf_tables_set_alloc_name(struct nft_ctx *ctx, struct nft_set *set,
 	unsigned long *inuse;
 	unsigned int n = 0, min = 0;
 
-	p = strnchr(name, NFT_SET_MAXNAMELEN, '%');
+	p = strchr(name, '%');
 	if (p != NULL) {
 		if (p[1] != 'd' || strchr(p + 2, '%'))
 			return -EINVAL;
@@ -2681,7 +2681,10 @@ cont:
 		free_page((unsigned long)inuse);
 	}
 
-	snprintf(set->name, sizeof(set->name), name, min + n);
+	set->name = kasprintf(GFP_KERNEL, name, min + n);
+	if (!set->name)
+		return -ENOMEM;
+
 	list_for_each_entry(i, &ctx->table->sets, list) {
 		if (!nft_is_active_next(ctx->net, i))
 			continue;
@@ -2958,7 +2961,7 @@ static int nf_tables_newset(struct net *net, struct sock *nlsk,
 	struct nft_table *table;
 	struct nft_set *set;
 	struct nft_ctx ctx;
-	char name[NFT_SET_MAXNAMELEN];
+	char *name;
 	unsigned int size;
 	bool create;
 	u64 timeout;
@@ -3104,8 +3107,14 @@ static int nf_tables_newset(struct net *net, struct sock *nlsk,
 		goto err1;
 	}
 
-	nla_strlcpy(name, nla[NFTA_SET_NAME], sizeof(set->name));
+	name = nla_strdup(nla[NFTA_SET_NAME], GFP_KERNEL);
+	if (!name) {
+		err = -ENOMEM;
+		goto err2;
+	}
+
 	err = nf_tables_set_alloc_name(&ctx, set, name);
+	kfree(name);
 	if (err < 0)
 		goto err2;
 
@@ -3155,6 +3164,7 @@ static void nft_set_destroy(struct nft_set *set)
 {
 	set->ops->destroy(set);
 	module_put(set->ops->type->owner);
+	kfree(set->name);
 	kvfree(set);
 }
 
-- 
cgit v1.2.3-71-gd317


From 615095752100748e221028fc96163c2b78185ae4 Mon Sep 17 00:00:00 2001
From: Phil Sutter <phil@nwl.cc>
Date: Thu, 27 Jul 2017 16:56:44 +0200
Subject: netfilter: nf_tables: Allow object names of up to 255 chars

Same conversion as for table names, use NFT_NAME_MAXLEN as upper
boundary as well.

Signed-off-by: Phil Sutter <phil@nwl.cc>
Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
---
 include/net/netfilter/nf_tables.h        |  2 +-
 include/uapi/linux/netfilter/nf_tables.h |  2 +-
 net/netfilter/nf_tables_api.c            | 11 +++++++++--
 3 files changed, 11 insertions(+), 4 deletions(-)

(limited to 'include/uapi/linux')

diff --git a/include/net/netfilter/nf_tables.h b/include/net/netfilter/nf_tables.h
index 66ba62fa7d90..f9795fe394f3 100644
--- a/include/net/netfilter/nf_tables.h
+++ b/include/net/netfilter/nf_tables.h
@@ -1016,7 +1016,7 @@ int nft_verdict_dump(struct sk_buff *skb, int type,
  */
 struct nft_object {
 	struct list_head		list;
-	char				name[NFT_OBJ_MAXNAMELEN];
+	char				*name;
 	struct nft_table		*table;
 	u32				genmask:2,
 					use:30;
diff --git a/include/uapi/linux/netfilter/nf_tables.h b/include/uapi/linux/netfilter/nf_tables.h
index b5e73e80b7b6..be25cf69295b 100644
--- a/include/uapi/linux/netfilter/nf_tables.h
+++ b/include/uapi/linux/netfilter/nf_tables.h
@@ -5,7 +5,7 @@
 #define NFT_TABLE_MAXNAMELEN	NFT_NAME_MAXLEN
 #define NFT_CHAIN_MAXNAMELEN	NFT_NAME_MAXLEN
 #define NFT_SET_MAXNAMELEN	NFT_NAME_MAXLEN
-#define NFT_OBJ_MAXNAMELEN	32
+#define NFT_OBJ_MAXNAMELEN	NFT_NAME_MAXLEN
 #define NFT_USERDATA_MAXLEN	256
 
 /**
diff --git a/net/netfilter/nf_tables_api.c b/net/netfilter/nf_tables_api.c
index e6a07f27b1a3..149785ff1c7b 100644
--- a/net/netfilter/nf_tables_api.c
+++ b/net/netfilter/nf_tables_api.c
@@ -4402,15 +4402,21 @@ static int nf_tables_newobj(struct net *net, struct sock *nlsk,
 		goto err1;
 	}
 	obj->table = table;
-	nla_strlcpy(obj->name, nla[NFTA_OBJ_NAME], NFT_OBJ_MAXNAMELEN);
+	obj->name = nla_strdup(nla[NFTA_OBJ_NAME], GFP_KERNEL);
+	if (!obj->name) {
+		err = -ENOMEM;
+		goto err2;
+	}
 
 	err = nft_trans_obj_add(&ctx, NFT_MSG_NEWOBJ, obj);
 	if (err < 0)
-		goto err2;
+		goto err3;
 
 	list_add_tail_rcu(&obj->list, &table->objects);
 	table->use++;
 	return 0;
+err3:
+	kfree(obj->name);
 err2:
 	if (obj->type->destroy)
 		obj->type->destroy(obj);
@@ -4626,6 +4632,7 @@ static void nft_obj_destroy(struct nft_object *obj)
 		obj->type->destroy(obj);
 
 	module_put(obj->type->owner);
+	kfree(obj->name);
 	kfree(obj);
 }
 
-- 
cgit v1.2.3-71-gd317


From 3282e65558b3651e230ee985c174c35cb2fedaf1 Mon Sep 17 00:00:00 2001
From: Florian Westphal <fw@strlen.de>
Date: Sun, 30 Jul 2017 03:57:23 +0200
Subject: tcp: remove unused mib counters

was used by tcp prequeue and header prediction.
TCPFORWARDRETRANS use was removed in january.

Signed-off-by: Florian Westphal <fw@strlen.de>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/uapi/linux/snmp.h | 9 ---------
 net/ipv4/proc.c           | 9 ---------
 2 files changed, 18 deletions(-)

(limited to 'include/uapi/linux')

diff --git a/include/uapi/linux/snmp.h b/include/uapi/linux/snmp.h
index d85693295798..b3f346fb9fe3 100644
--- a/include/uapi/linux/snmp.h
+++ b/include/uapi/linux/snmp.h
@@ -184,14 +184,7 @@ enum
 	LINUX_MIB_DELAYEDACKLOST,		/* DelayedACKLost */
 	LINUX_MIB_LISTENOVERFLOWS,		/* ListenOverflows */
 	LINUX_MIB_LISTENDROPS,			/* ListenDrops */
-	LINUX_MIB_TCPPREQUEUED,			/* TCPPrequeued */
-	LINUX_MIB_TCPDIRECTCOPYFROMBACKLOG,	/* TCPDirectCopyFromBacklog */
-	LINUX_MIB_TCPDIRECTCOPYFROMPREQUEUE,	/* TCPDirectCopyFromPrequeue */
-	LINUX_MIB_TCPPREQUEUEDROPPED,		/* TCPPrequeueDropped */
-	LINUX_MIB_TCPHPHITS,			/* TCPHPHits */
-	LINUX_MIB_TCPHPHITSTOUSER,		/* TCPHPHitsToUser */
 	LINUX_MIB_TCPPUREACKS,			/* TCPPureAcks */
-	LINUX_MIB_TCPHPACKS,			/* TCPHPAcks */
 	LINUX_MIB_TCPRENORECOVERY,		/* TCPRenoRecovery */
 	LINUX_MIB_TCPSACKRECOVERY,		/* TCPSackRecovery */
 	LINUX_MIB_TCPSACKRENEGING,		/* TCPSACKReneging */
@@ -208,14 +201,12 @@ enum
 	LINUX_MIB_TCPSACKFAILURES,		/* TCPSackFailures */
 	LINUX_MIB_TCPLOSSFAILURES,		/* TCPLossFailures */
 	LINUX_MIB_TCPFASTRETRANS,		/* TCPFastRetrans */
-	LINUX_MIB_TCPFORWARDRETRANS,		/* TCPForwardRetrans */
 	LINUX_MIB_TCPSLOWSTARTRETRANS,		/* TCPSlowStartRetrans */
 	LINUX_MIB_TCPTIMEOUTS,			/* TCPTimeouts */
 	LINUX_MIB_TCPLOSSPROBES,		/* TCPLossProbes */
 	LINUX_MIB_TCPLOSSPROBERECOVERY,		/* TCPLossProbeRecovery */
 	LINUX_MIB_TCPRENORECOVERYFAIL,		/* TCPRenoRecoveryFail */
 	LINUX_MIB_TCPSACKRECOVERYFAIL,		/* TCPSackRecoveryFail */
-	LINUX_MIB_TCPSCHEDULERFAILED,		/* TCPSchedulerFailed */
 	LINUX_MIB_TCPRCVCOLLAPSED,		/* TCPRcvCollapsed */
 	LINUX_MIB_TCPDSACKOLDSENT,		/* TCPDSACKOldSent */
 	LINUX_MIB_TCPDSACKOFOSENT,		/* TCPDSACKOfoSent */
diff --git a/net/ipv4/proc.c b/net/ipv4/proc.c
index 43eb6567b3a0..b6d3fe03feb3 100644
--- a/net/ipv4/proc.c
+++ b/net/ipv4/proc.c
@@ -206,14 +206,7 @@ static const struct snmp_mib snmp4_net_list[] = {
 	SNMP_MIB_ITEM("DelayedACKLost", LINUX_MIB_DELAYEDACKLOST),
 	SNMP_MIB_ITEM("ListenOverflows", LINUX_MIB_LISTENOVERFLOWS),
 	SNMP_MIB_ITEM("ListenDrops", LINUX_MIB_LISTENDROPS),
-	SNMP_MIB_ITEM("TCPPrequeued", LINUX_MIB_TCPPREQUEUED),
-	SNMP_MIB_ITEM("TCPDirectCopyFromBacklog", LINUX_MIB_TCPDIRECTCOPYFROMBACKLOG),
-	SNMP_MIB_ITEM("TCPDirectCopyFromPrequeue", LINUX_MIB_TCPDIRECTCOPYFROMPREQUEUE),
-	SNMP_MIB_ITEM("TCPPrequeueDropped", LINUX_MIB_TCPPREQUEUEDROPPED),
-	SNMP_MIB_ITEM("TCPHPHits", LINUX_MIB_TCPHPHITS),
-	SNMP_MIB_ITEM("TCPHPHitsToUser", LINUX_MIB_TCPHPHITSTOUSER),
 	SNMP_MIB_ITEM("TCPPureAcks", LINUX_MIB_TCPPUREACKS),
-	SNMP_MIB_ITEM("TCPHPAcks", LINUX_MIB_TCPHPACKS),
 	SNMP_MIB_ITEM("TCPRenoRecovery", LINUX_MIB_TCPRENORECOVERY),
 	SNMP_MIB_ITEM("TCPSackRecovery", LINUX_MIB_TCPSACKRECOVERY),
 	SNMP_MIB_ITEM("TCPSACKReneging", LINUX_MIB_TCPSACKRENEGING),
@@ -230,14 +223,12 @@ static const struct snmp_mib snmp4_net_list[] = {
 	SNMP_MIB_ITEM("TCPSackFailures", LINUX_MIB_TCPSACKFAILURES),
 	SNMP_MIB_ITEM("TCPLossFailures", LINUX_MIB_TCPLOSSFAILURES),
 	SNMP_MIB_ITEM("TCPFastRetrans", LINUX_MIB_TCPFASTRETRANS),
-	SNMP_MIB_ITEM("TCPForwardRetrans", LINUX_MIB_TCPFORWARDRETRANS),
 	SNMP_MIB_ITEM("TCPSlowStartRetrans", LINUX_MIB_TCPSLOWSTARTRETRANS),
 	SNMP_MIB_ITEM("TCPTimeouts", LINUX_MIB_TCPTIMEOUTS),
 	SNMP_MIB_ITEM("TCPLossProbes", LINUX_MIB_TCPLOSSPROBES),
 	SNMP_MIB_ITEM("TCPLossProbeRecovery", LINUX_MIB_TCPLOSSPROBERECOVERY),
 	SNMP_MIB_ITEM("TCPRenoRecoveryFail", LINUX_MIB_TCPRENORECOVERYFAIL),
 	SNMP_MIB_ITEM("TCPSackRecoveryFail", LINUX_MIB_TCPSACKRECOVERYFAIL),
-	SNMP_MIB_ITEM("TCPSchedulerFailed", LINUX_MIB_TCPSCHEDULERFAILED),
 	SNMP_MIB_ITEM("TCPRcvCollapsed", LINUX_MIB_TCPRCVCOLLAPSED),
 	SNMP_MIB_ITEM("TCPDSACKOldSent", LINUX_MIB_TCPDSACKOLDSENT),
 	SNMP_MIB_ITEM("TCPDSACKOfoSent", LINUX_MIB_TCPDSACKOFOSENT),
-- 
cgit v1.2.3-71-gd317


From bb7c19f96012720b895111300b9d9f3f858c3a69 Mon Sep 17 00:00:00 2001
From: Wei Wang <weiwan@google.com>
Date: Fri, 28 Jul 2017 10:28:21 -0700
Subject: tcp: add related fields into SCM_TIMESTAMPING_OPT_STATS

Add the following stats into SCM_TIMESTAMPING_OPT_STATS control msg:
    TCP_NLA_PACING_RATE
    TCP_NLA_DELIVERY_RATE
    TCP_NLA_SND_CWND
    TCP_NLA_REORDERING
    TCP_NLA_MIN_RTT
    TCP_NLA_RECUR_RETRANS
    TCP_NLA_DELIVERY_RATE_APP_LMT

Signed-off-by: Wei Wang <weiwan@google.com>
Acked-by: Yuchung Cheng <ycheng@google.com>
Acked-by: Soheil Hassas Yeganeh <soheil@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/uapi/linux/tcp.h |  8 ++++++++
 net/ipv4/tcp.c           | 20 +++++++++++++++++++-
 2 files changed, 27 insertions(+), 1 deletion(-)

(limited to 'include/uapi/linux')

diff --git a/include/uapi/linux/tcp.h b/include/uapi/linux/tcp.h
index a5507c977497..030e594bab45 100644
--- a/include/uapi/linux/tcp.h
+++ b/include/uapi/linux/tcp.h
@@ -231,6 +231,14 @@ enum {
 	TCP_NLA_SNDBUF_LIMITED,	/* Time (usec) limited by send buffer */
 	TCP_NLA_DATA_SEGS_OUT,	/* Data pkts sent including retransmission */
 	TCP_NLA_TOTAL_RETRANS,	/* Data pkts retransmitted */
+	TCP_NLA_PACING_RATE,    /* Pacing rate in bytes per second */
+	TCP_NLA_DELIVERY_RATE,  /* Delivery rate in bytes per second */
+	TCP_NLA_SND_CWND,       /* Sending congestion window */
+	TCP_NLA_REORDERING,     /* Reordering metric */
+	TCP_NLA_MIN_RTT,        /* minimum RTT */
+	TCP_NLA_RECUR_RETRANS,  /* Recurring retransmits for the current pkt */
+	TCP_NLA_DELIVERY_RATE_APP_LMT, /* delivery rate application limited ? */
+
 };
 
 /* for TCP_MD5SIG socket option */
diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c
index acee7acdcba6..5326b50a3450 100644
--- a/net/ipv4/tcp.c
+++ b/net/ipv4/tcp.c
@@ -2840,8 +2840,12 @@ struct sk_buff *tcp_get_timestamping_opt_stats(const struct sock *sk)
 	const struct tcp_sock *tp = tcp_sk(sk);
 	struct sk_buff *stats;
 	struct tcp_info info;
+	u64 rate64;
+	u32 rate;
 
-	stats = alloc_skb(5 * nla_total_size_64bit(sizeof(u64)), GFP_ATOMIC);
+	stats = alloc_skb(7 * nla_total_size_64bit(sizeof(u64)) +
+			  3 * nla_total_size(sizeof(u32)) +
+			  2 * nla_total_size(sizeof(u8)), GFP_ATOMIC);
 	if (!stats)
 		return NULL;
 
@@ -2856,6 +2860,20 @@ struct sk_buff *tcp_get_timestamping_opt_stats(const struct sock *sk)
 			  tp->data_segs_out, TCP_NLA_PAD);
 	nla_put_u64_64bit(stats, TCP_NLA_TOTAL_RETRANS,
 			  tp->total_retrans, TCP_NLA_PAD);
+
+	rate = READ_ONCE(sk->sk_pacing_rate);
+	rate64 = rate != ~0U ? rate : ~0ULL;
+	nla_put_u64_64bit(stats, TCP_NLA_PACING_RATE, rate64, TCP_NLA_PAD);
+
+	rate64 = tcp_compute_delivery_rate(tp);
+	nla_put_u64_64bit(stats, TCP_NLA_DELIVERY_RATE, rate64, TCP_NLA_PAD);
+
+	nla_put_u32(stats, TCP_NLA_SND_CWND, tp->snd_cwnd);
+	nla_put_u32(stats, TCP_NLA_REORDERING, tp->reordering);
+	nla_put_u32(stats, TCP_NLA_MIN_RTT, tcp_min_rtt(tp));
+
+	nla_put_u8(stats, TCP_NLA_RECUR_RETRANS, inet_csk(sk)->icsk_retransmits);
+	nla_put_u8(stats, TCP_NLA_DELIVERY_RATE_APP_LMT, !!tp->rate_app_limited);
 	return stats;
 }
 
-- 
cgit v1.2.3-71-gd317


From 61e4d01e16acddadb9723143637a20417fa67ac9 Mon Sep 17 00:00:00 2001
From: Ido Schimmel <idosch@mellanox.com>
Date: Thu, 3 Aug 2017 13:28:20 +0200
Subject: ipv6: fib: Add offload indication to routes

Allow user space applications to see which routes are offloaded and
which aren't by setting the RTNH_F_OFFLOAD flag when dumping them.

To be consistent with IPv4, offload indication is provided on a
per-nexthop basis.

Signed-off-by: Ido Schimmel <idosch@mellanox.com>
Signed-off-by: Jiri Pirko <jiri@mellanox.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/uapi/linux/ipv6_route.h | 1 +
 net/ipv6/route.c                | 8 ++++++++
 2 files changed, 9 insertions(+)

(limited to 'include/uapi/linux')

diff --git a/include/uapi/linux/ipv6_route.h b/include/uapi/linux/ipv6_route.h
index d496c02e14bc..33e2a5732bd1 100644
--- a/include/uapi/linux/ipv6_route.h
+++ b/include/uapi/linux/ipv6_route.h
@@ -35,6 +35,7 @@
 #define RTF_PREF(pref)	((pref) << 27)
 #define RTF_PREF_MASK	0x18000000
 
+#define RTF_OFFLOAD	0x20000000	/* offloaded route		*/
 #define RTF_PCPU	0x40000000	/* read-only: can not be set by user */
 #define RTF_LOCAL	0x80000000
 
diff --git a/net/ipv6/route.c b/net/ipv6/route.c
index 4d30c96a819d..aba07fce67fb 100644
--- a/net/ipv6/route.c
+++ b/net/ipv6/route.c
@@ -1820,6 +1820,11 @@ static struct rt6_info *ip6_route_info_create(struct fib6_config *cfg,
 		goto out;
 	}
 
+	if (cfg->fc_flags & RTF_OFFLOAD) {
+		NL_SET_ERR_MSG(extack, "Userspace can not set RTF_OFFLOAD");
+		goto out;
+	}
+
 	if (cfg->fc_dst_len > 128) {
 		NL_SET_ERR_MSG(extack, "Invalid prefix length");
 		goto out;
@@ -3327,6 +3332,9 @@ static int rt6_nexthop_info(struct sk_buff *skb, struct rt6_info *rt,
 			goto nla_put_failure;
 	}
 
+	if (rt->rt6i_flags & RTF_OFFLOAD)
+		*flags |= RTNH_F_OFFLOAD;
+
 	/* not needed for multipath encoding b/c it has a rtnexthop struct */
 	if (!skip_oif && rt->dst.dev &&
 	    nla_put_u32(skb, RTA_OIF, rt->dst.dev->ifindex))
-- 
cgit v1.2.3-71-gd317


From 52267790ef52d7513879238ca9fac22c1733e0e3 Mon Sep 17 00:00:00 2001
From: Willem de Bruijn <willemb@google.com>
Date: Thu, 3 Aug 2017 16:29:39 -0400
Subject: sock: add MSG_ZEROCOPY

The kernel supports zerocopy sendmsg in virtio and tap. Expand the
infrastructure to support other socket types. Introduce a completion
notification channel over the socket error queue. Notifications are
returned with ee_origin SO_EE_ORIGIN_ZEROCOPY. ee_errno is 0 to avoid
blocking the send/recv path on receiving notifications.

Add reference counting, to support the skb split, merge, resize and
clone operations possible with SOCK_STREAM and other socket types.

The patch does not yet modify any datapaths.

Signed-off-by: Willem de Bruijn <willemb@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/skbuff.h        |  60 +++++++++++++++++++
 include/linux/socket.h        |   1 +
 include/net/sock.h            |   2 +
 include/uapi/linux/errqueue.h |   3 +
 net/core/datagram.c           |  55 ++++++++++-------
 net/core/skbuff.c             | 133 ++++++++++++++++++++++++++++++++++++++++++
 net/core/sock.c               |   2 +
 7 files changed, 235 insertions(+), 21 deletions(-)

(limited to 'include/uapi/linux')

diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h
index 2f64e2bbb592..59cff7aa494e 100644
--- a/include/linux/skbuff.h
+++ b/include/linux/skbuff.h
@@ -429,6 +429,7 @@ enum {
 	SKBTX_SCHED_TSTAMP = 1 << 6,
 };
 
+#define SKBTX_ZEROCOPY_FRAG	(SKBTX_DEV_ZEROCOPY | SKBTX_SHARED_FRAG)
 #define SKBTX_ANY_SW_TSTAMP	(SKBTX_SW_TSTAMP    | \
 				 SKBTX_SCHED_TSTAMP)
 #define SKBTX_ANY_TSTAMP	(SKBTX_HW_TSTAMP | SKBTX_ANY_SW_TSTAMP)
@@ -445,8 +446,28 @@ struct ubuf_info {
 	void (*callback)(struct ubuf_info *, bool zerocopy_success);
 	void *ctx;
 	unsigned long desc;
+	u16 zerocopy:1;
+	atomic_t refcnt;
 };
 
+#define skb_uarg(SKB)	((struct ubuf_info *)(skb_shinfo(SKB)->destructor_arg))
+
+struct ubuf_info *sock_zerocopy_alloc(struct sock *sk, size_t size);
+
+static inline void sock_zerocopy_get(struct ubuf_info *uarg)
+{
+	atomic_inc(&uarg->refcnt);
+}
+
+void sock_zerocopy_put(struct ubuf_info *uarg);
+void sock_zerocopy_put_abort(struct ubuf_info *uarg);
+
+void sock_zerocopy_callback(struct ubuf_info *uarg, bool success);
+
+int skb_zerocopy_iter_stream(struct sock *sk, struct sk_buff *skb,
+			     struct msghdr *msg, int len,
+			     struct ubuf_info *uarg);
+
 /* This data is invariant across clones and lives at
  * the end of the header data, ie. at skb->end.
  */
@@ -1214,6 +1235,45 @@ static inline struct skb_shared_hwtstamps *skb_hwtstamps(struct sk_buff *skb)
 	return &skb_shinfo(skb)->hwtstamps;
 }
 
+static inline struct ubuf_info *skb_zcopy(struct sk_buff *skb)
+{
+	bool is_zcopy = skb && skb_shinfo(skb)->tx_flags & SKBTX_DEV_ZEROCOPY;
+
+	return is_zcopy ? skb_uarg(skb) : NULL;
+}
+
+static inline void skb_zcopy_set(struct sk_buff *skb, struct ubuf_info *uarg)
+{
+	if (skb && uarg && !skb_zcopy(skb)) {
+		sock_zerocopy_get(uarg);
+		skb_shinfo(skb)->destructor_arg = uarg;
+		skb_shinfo(skb)->tx_flags |= SKBTX_ZEROCOPY_FRAG;
+	}
+}
+
+/* Release a reference on a zerocopy structure */
+static inline void skb_zcopy_clear(struct sk_buff *skb, bool zerocopy)
+{
+	struct ubuf_info *uarg = skb_zcopy(skb);
+
+	if (uarg) {
+		uarg->zerocopy = uarg->zerocopy && zerocopy;
+		sock_zerocopy_put(uarg);
+		skb_shinfo(skb)->tx_flags &= ~SKBTX_ZEROCOPY_FRAG;
+	}
+}
+
+/* Abort a zerocopy operation and revert zckey on error in send syscall */
+static inline void skb_zcopy_abort(struct sk_buff *skb)
+{
+	struct ubuf_info *uarg = skb_zcopy(skb);
+
+	if (uarg) {
+		sock_zerocopy_put_abort(uarg);
+		skb_shinfo(skb)->tx_flags &= ~SKBTX_ZEROCOPY_FRAG;
+	}
+}
+
 /**
  *	skb_queue_empty - check if a queue is empty
  *	@list: queue head
diff --git a/include/linux/socket.h b/include/linux/socket.h
index 8b13db5163cc..8ad963cdc88c 100644
--- a/include/linux/socket.h
+++ b/include/linux/socket.h
@@ -287,6 +287,7 @@ struct ucred {
 #define MSG_BATCH	0x40000 /* sendmmsg(): more messages coming */
 #define MSG_EOF         MSG_FIN
 
+#define MSG_ZEROCOPY	0x4000000	/* Use user data in kernel path */
 #define MSG_FASTOPEN	0x20000000	/* Send data in TCP SYN */
 #define MSG_CMSG_CLOEXEC 0x40000000	/* Set close_on_exec for file
 					   descriptor received through
diff --git a/include/net/sock.h b/include/net/sock.h
index 0f778d3c4300..fe1a0bc25cd3 100644
--- a/include/net/sock.h
+++ b/include/net/sock.h
@@ -294,6 +294,7 @@ struct sock_common {
   *	@sk_stamp: time stamp of last packet received
   *	@sk_tsflags: SO_TIMESTAMPING socket options
   *	@sk_tskey: counter to disambiguate concurrent tstamp requests
+  *	@sk_zckey: counter to order MSG_ZEROCOPY notifications
   *	@sk_socket: Identd and reporting IO signals
   *	@sk_user_data: RPC layer private data
   *	@sk_frag: cached page frag
@@ -462,6 +463,7 @@ struct sock {
 	u16			sk_tsflags;
 	u8			sk_shutdown;
 	u32			sk_tskey;
+	atomic_t		sk_zckey;
 	struct socket		*sk_socket;
 	void			*sk_user_data;
 #ifdef CONFIG_SECURITY
diff --git a/include/uapi/linux/errqueue.h b/include/uapi/linux/errqueue.h
index 07bdce1f444a..78fdf52d6b2f 100644
--- a/include/uapi/linux/errqueue.h
+++ b/include/uapi/linux/errqueue.h
@@ -18,10 +18,13 @@ struct sock_extended_err {
 #define SO_EE_ORIGIN_ICMP	2
 #define SO_EE_ORIGIN_ICMP6	3
 #define SO_EE_ORIGIN_TXSTATUS	4
+#define SO_EE_ORIGIN_ZEROCOPY	5
 #define SO_EE_ORIGIN_TIMESTAMPING SO_EE_ORIGIN_TXSTATUS
 
 #define SO_EE_OFFENDER(ee)	((struct sockaddr*)((ee)+1))
 
+#define SO_EE_CODE_ZEROCOPY_COPIED	1
+
 /**
  *	struct scm_timestamping - timestamps exposed through cmsg
  *
diff --git a/net/core/datagram.c b/net/core/datagram.c
index ee5647bd91b3..2f3277945d35 100644
--- a/net/core/datagram.c
+++ b/net/core/datagram.c
@@ -573,27 +573,12 @@ fault:
 }
 EXPORT_SYMBOL(skb_copy_datagram_from_iter);
 
-/**
- *	zerocopy_sg_from_iter - Build a zerocopy datagram from an iov_iter
- *	@skb: buffer to copy
- *	@from: the source to copy from
- *
- *	The function will first copy up to headlen, and then pin the userspace
- *	pages and build frags through them.
- *
- *	Returns 0, -EFAULT or -EMSGSIZE.
- */
-int zerocopy_sg_from_iter(struct sk_buff *skb, struct iov_iter *from)
+int __zerocopy_sg_from_iter(struct sock *sk, struct sk_buff *skb,
+			    struct iov_iter *from, size_t length)
 {
-	int len = iov_iter_count(from);
-	int copy = min_t(int, skb_headlen(skb), len);
-	int frag = 0;
+	int frag = skb_shinfo(skb)->nr_frags;
 
-	/* copy up to skb headlen */
-	if (skb_copy_datagram_from_iter(skb, 0, from, copy))
-		return -EFAULT;
-
-	while (iov_iter_count(from)) {
+	while (length && iov_iter_count(from)) {
 		struct page *pages[MAX_SKB_FRAGS];
 		size_t start;
 		ssize_t copied;
@@ -603,18 +588,24 @@ int zerocopy_sg_from_iter(struct sk_buff *skb, struct iov_iter *from)
 		if (frag == MAX_SKB_FRAGS)
 			return -EMSGSIZE;
 
-		copied = iov_iter_get_pages(from, pages, ~0U,
+		copied = iov_iter_get_pages(from, pages, length,
 					    MAX_SKB_FRAGS - frag, &start);
 		if (copied < 0)
 			return -EFAULT;
 
 		iov_iter_advance(from, copied);
+		length -= copied;
 
 		truesize = PAGE_ALIGN(copied + start);
 		skb->data_len += copied;
 		skb->len += copied;
 		skb->truesize += truesize;
-		refcount_add(truesize, &skb->sk->sk_wmem_alloc);
+		if (sk && sk->sk_type == SOCK_STREAM) {
+			sk->sk_wmem_queued += truesize;
+			sk_mem_charge(sk, truesize);
+		} else {
+			refcount_add(truesize, &skb->sk->sk_wmem_alloc);
+		}
 		while (copied) {
 			int size = min_t(int, copied, PAGE_SIZE - start);
 			skb_fill_page_desc(skb, frag++, pages[n], start, size);
@@ -625,6 +616,28 @@ int zerocopy_sg_from_iter(struct sk_buff *skb, struct iov_iter *from)
 	}
 	return 0;
 }
+EXPORT_SYMBOL(__zerocopy_sg_from_iter);
+
+/**
+ *	zerocopy_sg_from_iter - Build a zerocopy datagram from an iov_iter
+ *	@skb: buffer to copy
+ *	@from: the source to copy from
+ *
+ *	The function will first copy up to headlen, and then pin the userspace
+ *	pages and build frags through them.
+ *
+ *	Returns 0, -EFAULT or -EMSGSIZE.
+ */
+int zerocopy_sg_from_iter(struct sk_buff *skb, struct iov_iter *from)
+{
+	int copy = min_t(int, skb_headlen(skb), iov_iter_count(from));
+
+	/* copy up to skb headlen */
+	if (skb_copy_datagram_from_iter(skb, 0, from, copy))
+		return -EFAULT;
+
+	return __zerocopy_sg_from_iter(NULL, skb, from, ~0U);
+}
 EXPORT_SYMBOL(zerocopy_sg_from_iter);
 
 static int skb_copy_and_csum_datagram(const struct sk_buff *skb, int offset,
diff --git a/net/core/skbuff.c b/net/core/skbuff.c
index a95877a8ac8b..0603e44950da 100644
--- a/net/core/skbuff.c
+++ b/net/core/skbuff.c
@@ -915,6 +915,139 @@ struct sk_buff *skb_morph(struct sk_buff *dst, struct sk_buff *src)
 }
 EXPORT_SYMBOL_GPL(skb_morph);
 
+struct ubuf_info *sock_zerocopy_alloc(struct sock *sk, size_t size)
+{
+	struct ubuf_info *uarg;
+	struct sk_buff *skb;
+
+	WARN_ON_ONCE(!in_task());
+
+	skb = sock_omalloc(sk, 0, GFP_KERNEL);
+	if (!skb)
+		return NULL;
+
+	BUILD_BUG_ON(sizeof(*uarg) > sizeof(skb->cb));
+	uarg = (void *)skb->cb;
+
+	uarg->callback = sock_zerocopy_callback;
+	uarg->desc = atomic_inc_return(&sk->sk_zckey) - 1;
+	uarg->zerocopy = 1;
+	atomic_set(&uarg->refcnt, 0);
+	sock_hold(sk);
+
+	return uarg;
+}
+EXPORT_SYMBOL_GPL(sock_zerocopy_alloc);
+
+static inline struct sk_buff *skb_from_uarg(struct ubuf_info *uarg)
+{
+	return container_of((void *)uarg, struct sk_buff, cb);
+}
+
+void sock_zerocopy_callback(struct ubuf_info *uarg, bool success)
+{
+	struct sk_buff *skb = skb_from_uarg(uarg);
+	struct sock_exterr_skb *serr;
+	struct sock *sk = skb->sk;
+	u16 id = uarg->desc;
+
+	if (sock_flag(sk, SOCK_DEAD))
+		goto release;
+
+	serr = SKB_EXT_ERR(skb);
+	memset(serr, 0, sizeof(*serr));
+	serr->ee.ee_errno = 0;
+	serr->ee.ee_origin = SO_EE_ORIGIN_ZEROCOPY;
+	serr->ee.ee_data = id;
+	if (!success)
+		serr->ee.ee_code |= SO_EE_CODE_ZEROCOPY_COPIED;
+
+	skb_queue_tail(&sk->sk_error_queue, skb);
+	skb = NULL;
+
+	sk->sk_error_report(sk);
+
+release:
+	consume_skb(skb);
+	sock_put(sk);
+}
+EXPORT_SYMBOL_GPL(sock_zerocopy_callback);
+
+void sock_zerocopy_put(struct ubuf_info *uarg)
+{
+	if (uarg && atomic_dec_and_test(&uarg->refcnt)) {
+		if (uarg->callback)
+			uarg->callback(uarg, uarg->zerocopy);
+		else
+			consume_skb(skb_from_uarg(uarg));
+	}
+}
+EXPORT_SYMBOL_GPL(sock_zerocopy_put);
+
+void sock_zerocopy_put_abort(struct ubuf_info *uarg)
+{
+	if (uarg) {
+		struct sock *sk = skb_from_uarg(uarg)->sk;
+
+		atomic_dec(&sk->sk_zckey);
+
+		/* sock_zerocopy_put expects a ref. Most sockets take one per
+		 * skb, which is zero on abort. tcp_sendmsg holds one extra, to
+		 * avoid an skb send inside the main loop triggering uarg free.
+		 */
+		if (sk->sk_type != SOCK_STREAM)
+			atomic_inc(&uarg->refcnt);
+
+		sock_zerocopy_put(uarg);
+	}
+}
+EXPORT_SYMBOL_GPL(sock_zerocopy_put_abort);
+
+extern int __zerocopy_sg_from_iter(struct sock *sk, struct sk_buff *skb,
+				   struct iov_iter *from, size_t length);
+
+int skb_zerocopy_iter_stream(struct sock *sk, struct sk_buff *skb,
+			     struct msghdr *msg, int len,
+			     struct ubuf_info *uarg)
+{
+	struct iov_iter orig_iter = msg->msg_iter;
+	int err, orig_len = skb->len;
+
+	err = __zerocopy_sg_from_iter(sk, skb, &msg->msg_iter, len);
+	if (err == -EFAULT || (err == -EMSGSIZE && skb->len == orig_len)) {
+		/* Streams do not free skb on error. Reset to prev state. */
+		msg->msg_iter = orig_iter;
+		___pskb_trim(skb, orig_len);
+		return err;
+	}
+
+	skb_zcopy_set(skb, uarg);
+	return skb->len - orig_len;
+}
+EXPORT_SYMBOL_GPL(skb_zerocopy_iter_stream);
+
+/* unused only until next patch in the series; will remove attribute */
+static int __attribute__((unused))
+	   skb_zerocopy_clone(struct sk_buff *nskb, struct sk_buff *orig,
+			      gfp_t gfp_mask)
+{
+	if (skb_zcopy(orig)) {
+		if (skb_zcopy(nskb)) {
+			/* !gfp_mask callers are verified to !skb_zcopy(nskb) */
+			if (!gfp_mask) {
+				WARN_ON_ONCE(1);
+				return -ENOMEM;
+			}
+			if (skb_uarg(nskb) == skb_uarg(orig))
+				return 0;
+			if (skb_copy_ubufs(nskb, GFP_ATOMIC))
+				return -EIO;
+		}
+		skb_zcopy_set(nskb, skb_uarg(orig));
+	}
+	return 0;
+}
+
 /**
  *	skb_copy_ubufs	-	copy userspace skb frags buffers to kernel
  *	@skb: the skb to modify
diff --git a/net/core/sock.c b/net/core/sock.c
index 1261880bdcc8..e8b696858cad 100644
--- a/net/core/sock.c
+++ b/net/core/sock.c
@@ -1670,6 +1670,7 @@ struct sock *sk_clone_lock(const struct sock *sk, const gfp_t priority)
 		atomic_set(&newsk->sk_drops, 0);
 		newsk->sk_send_head	= NULL;
 		newsk->sk_userlocks	= sk->sk_userlocks & ~SOCK_BINDPORT_LOCK;
+		atomic_set(&newsk->sk_zckey, 0);
 
 		sock_reset_flag(newsk, SOCK_DONE);
 
@@ -2722,6 +2723,7 @@ void sock_init_data(struct socket *sock, struct sock *sk)
 	sk->sk_sndtimeo		=	MAX_SCHEDULE_TIMEOUT;
 
 	sk->sk_stamp = SK_DEFAULT_STAMP;
+	atomic_set(&sk->sk_zckey, 0);
 
 #ifdef CONFIG_NET_RX_BUSY_POLL
 	sk->sk_napi_id		=	0;
-- 
cgit v1.2.3-71-gd317


From 059cf566e123ca7eb7434285c6455d7afafb4e02 Mon Sep 17 00:00:00 2001
From: Jens Wiklander <jens.wiklander@linaro.org>
Date: Thu, 16 Feb 2017 09:07:02 +0100
Subject: tee: indicate privileged dev in gen_caps

Mirrors the TEE_DESC_PRIVILEGED bit of struct tee_desc:flags into struct
tee_ioctl_version_data:gen_caps as TEE_GEN_CAP_PRIVILEGED in
tee_ioctl_version()

Reviewed-by: Jerome Forissier <jerome.forissier@linaro.org>
Signed-off-by: Jens Wiklander <jens.wiklander@linaro.org>
---
 drivers/tee/tee_core.c   | 5 +++++
 include/uapi/linux/tee.h | 1 +
 2 files changed, 6 insertions(+)

(limited to 'include/uapi/linux')

diff --git a/drivers/tee/tee_core.c b/drivers/tee/tee_core.c
index 5c60bf4423e6..58a5009eacc3 100644
--- a/drivers/tee/tee_core.c
+++ b/drivers/tee/tee_core.c
@@ -90,8 +90,13 @@ static int tee_ioctl_version(struct tee_context *ctx,
 	struct tee_ioctl_version_data vers;
 
 	ctx->teedev->desc->ops->get_version(ctx->teedev, &vers);
+
+	if (ctx->teedev->desc->flags & TEE_DESC_PRIVILEGED)
+		vers.gen_caps |= TEE_GEN_CAP_PRIVILEGED;
+
 	if (copy_to_user(uvers, &vers, sizeof(vers)))
 		return -EFAULT;
+
 	return 0;
 }
 
diff --git a/include/uapi/linux/tee.h b/include/uapi/linux/tee.h
index 370d8845ab21..688782e90140 100644
--- a/include/uapi/linux/tee.h
+++ b/include/uapi/linux/tee.h
@@ -49,6 +49,7 @@
 #define TEE_MAX_ARG_SIZE	1024
 
 #define TEE_GEN_CAP_GP		(1 << 0)/* GlobalPlatform compliant TEE */
+#define TEE_GEN_CAP_PRIVILEGED	(1 << 1)/* Privileged device (for supplicant) */
 
 /*
  * TEE Implementation ID
-- 
cgit v1.2.3-71-gd317


From 56ce097c1caede1f9c191a7c9699b950e7c36ad9 Mon Sep 17 00:00:00 2001
From: John Fastabend <john.fastabend@gmail.com>
Date: Fri, 4 Aug 2017 08:24:05 -0700
Subject: net: comment fixes against BPF devmap helper calls

Update BPF comments to accurately reflect XDP usage.

Fixes: 97f91a7cf04ff ("bpf: add bpf_redirect_map helper routine")
Reported-by: Alexei Starovoitov <ast@kernel.org>
Signed-off-by: John Fastabend <john.fastabend@gmail.com>
Acked-by: Daniel Borkmann <daniel@iogearbox.net>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/uapi/linux/bpf.h | 16 +++++++++++-----
 1 file changed, 11 insertions(+), 5 deletions(-)

(limited to 'include/uapi/linux')

diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h
index 1106a8c4cd36..1d06be1569b1 100644
--- a/include/uapi/linux/bpf.h
+++ b/include/uapi/linux/bpf.h
@@ -345,14 +345,20 @@ union bpf_attr {
  * int bpf_redirect(ifindex, flags)
  *     redirect to another netdev
  *     @ifindex: ifindex of the net device
- *     @flags: bit 0 - if set, redirect to ingress instead of egress
- *             other bits - reserved
- *     Return: TC_ACT_REDIRECT
- * int bpf_redirect_map(key, map, flags)
+ *     @flags:
+ *	  cls_bpf:
+ *          bit 0 - if set, redirect to ingress instead of egress
+ *          other bits - reserved
+ *	  xdp_bpf:
+ *	    all bits - reserved
+ *     Return: cls_bpf: TC_ACT_REDIRECT on success or TC_ACT_SHOT on error
+ *	       xdp_bfp: XDP_REDIRECT on success or XDP_ABORT on error
+ * int bpf_redirect_map(map, key, flags)
  *     redirect to endpoint in map
+ *     @map: pointer to dev map
  *     @key: index in map to lookup
- *     @map: fd of map to do lookup in
  *     @flags: --
+ *     Return: XDP_REDIRECT on success or XDP_ABORT on error
  *
  * u32 bpf_get_route_realm(skb)
  *     retrieve a dst's tclassid
-- 
cgit v1.2.3-71-gd317


From 472b46c352c9ff0b6fa57dbf85d77c51901a3368 Mon Sep 17 00:00:00 2001
From: Mikko Rapeli <mikko.rapeli@iki.fi>
Date: Sun, 6 Aug 2017 18:44:27 +0200
Subject: uapi linux/kfd_ioctl.h: only use __u32 and __u64
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Include <drm/drm.h> instead of <linux/types.h> which on Linux includes
<linux/types.h> and on non-Linux platforms defines __u32 etc types.

Fixes user space compilation errors like:

linux/kfd_ioctl.h:33:2: error: unknown type name ‘uint32_t’
  uint32_t major_version; /* from KFD */
  ^~~~~~~~

Signed-off-by: Mikko Rapeli <mikko.rapeli@iki.fi>
Acked-by: Arnd Bergmann <arnd@arndb.de>
Signed-off-by: Oded Gabbay <oded.gabbay@gmail.com>
---
 include/uapi/linux/kfd_ioctl.h | 172 ++++++++++++++++++++---------------------
 1 file changed, 86 insertions(+), 86 deletions(-)

(limited to 'include/uapi/linux')

diff --git a/include/uapi/linux/kfd_ioctl.h b/include/uapi/linux/kfd_ioctl.h
index 7b4567bacfc2..26283fefdf5f 100644
--- a/include/uapi/linux/kfd_ioctl.h
+++ b/include/uapi/linux/kfd_ioctl.h
@@ -23,15 +23,15 @@
 #ifndef KFD_IOCTL_H_INCLUDED
 #define KFD_IOCTL_H_INCLUDED
 
-#include <linux/types.h>
+#include <drm/drm.h>
 #include <linux/ioctl.h>
 
 #define KFD_IOCTL_MAJOR_VERSION 1
 #define KFD_IOCTL_MINOR_VERSION 1
 
 struct kfd_ioctl_get_version_args {
-	uint32_t major_version;	/* from KFD */
-	uint32_t minor_version;	/* from KFD */
+	__u32 major_version;	/* from KFD */
+	__u32 minor_version;	/* from KFD */
 };
 
 /* For kfd_ioctl_create_queue_args.queue_type. */
@@ -43,36 +43,36 @@ struct kfd_ioctl_get_version_args {
 #define KFD_MAX_QUEUE_PRIORITY		15
 
 struct kfd_ioctl_create_queue_args {
-	uint64_t ring_base_address;	/* to KFD */
-	uint64_t write_pointer_address;	/* from KFD */
-	uint64_t read_pointer_address;	/* from KFD */
-	uint64_t doorbell_offset;	/* from KFD */
-
-	uint32_t ring_size;		/* to KFD */
-	uint32_t gpu_id;		/* to KFD */
-	uint32_t queue_type;		/* to KFD */
-	uint32_t queue_percentage;	/* to KFD */
-	uint32_t queue_priority;	/* to KFD */
-	uint32_t queue_id;		/* from KFD */
-
-	uint64_t eop_buffer_address;	/* to KFD */
-	uint64_t eop_buffer_size;	/* to KFD */
-	uint64_t ctx_save_restore_address; /* to KFD */
-	uint64_t ctx_save_restore_size;	/* to KFD */
+	__u64 ring_base_address;	/* to KFD */
+	__u64 write_pointer_address;	/* from KFD */
+	__u64 read_pointer_address;	/* from KFD */
+	__u64 doorbell_offset;	/* from KFD */
+
+	__u32 ring_size;		/* to KFD */
+	__u32 gpu_id;		/* to KFD */
+	__u32 queue_type;		/* to KFD */
+	__u32 queue_percentage;	/* to KFD */
+	__u32 queue_priority;	/* to KFD */
+	__u32 queue_id;		/* from KFD */
+
+	__u64 eop_buffer_address;	/* to KFD */
+	__u64 eop_buffer_size;	/* to KFD */
+	__u64 ctx_save_restore_address; /* to KFD */
+	__u64 ctx_save_restore_size;	/* to KFD */
 };
 
 struct kfd_ioctl_destroy_queue_args {
-	uint32_t queue_id;		/* to KFD */
-	uint32_t pad;
+	__u32 queue_id;		/* to KFD */
+	__u32 pad;
 };
 
 struct kfd_ioctl_update_queue_args {
-	uint64_t ring_base_address;	/* to KFD */
+	__u64 ring_base_address;	/* to KFD */
 
-	uint32_t queue_id;		/* to KFD */
-	uint32_t ring_size;		/* to KFD */
-	uint32_t queue_percentage;	/* to KFD */
-	uint32_t queue_priority;	/* to KFD */
+	__u32 queue_id;		/* to KFD */
+	__u32 ring_size;		/* to KFD */
+	__u32 queue_percentage;	/* to KFD */
+	__u32 queue_priority;	/* to KFD */
 };
 
 /* For kfd_ioctl_set_memory_policy_args.default_policy and alternate_policy */
@@ -80,13 +80,13 @@ struct kfd_ioctl_update_queue_args {
 #define KFD_IOC_CACHE_POLICY_NONCOHERENT 1
 
 struct kfd_ioctl_set_memory_policy_args {
-	uint64_t alternate_aperture_base;	/* to KFD */
-	uint64_t alternate_aperture_size;	/* to KFD */
+	__u64 alternate_aperture_base;	/* to KFD */
+	__u64 alternate_aperture_size;	/* to KFD */
 
-	uint32_t gpu_id;			/* to KFD */
-	uint32_t default_policy;		/* to KFD */
-	uint32_t alternate_policy;		/* to KFD */
-	uint32_t pad;
+	__u32 gpu_id;			/* to KFD */
+	__u32 default_policy;		/* to KFD */
+	__u32 alternate_policy;		/* to KFD */
+	__u32 pad;
 };
 
 /*
@@ -97,26 +97,26 @@ struct kfd_ioctl_set_memory_policy_args {
  */
 
 struct kfd_ioctl_get_clock_counters_args {
-	uint64_t gpu_clock_counter;	/* from KFD */
-	uint64_t cpu_clock_counter;	/* from KFD */
-	uint64_t system_clock_counter;	/* from KFD */
-	uint64_t system_clock_freq;	/* from KFD */
+	__u64 gpu_clock_counter;	/* from KFD */
+	__u64 cpu_clock_counter;	/* from KFD */
+	__u64 system_clock_counter;	/* from KFD */
+	__u64 system_clock_freq;	/* from KFD */
 
-	uint32_t gpu_id;		/* to KFD */
-	uint32_t pad;
+	__u32 gpu_id;		/* to KFD */
+	__u32 pad;
 };
 
 #define NUM_OF_SUPPORTED_GPUS 7
 
 struct kfd_process_device_apertures {
-	uint64_t lds_base;		/* from KFD */
-	uint64_t lds_limit;		/* from KFD */
-	uint64_t scratch_base;		/* from KFD */
-	uint64_t scratch_limit;		/* from KFD */
-	uint64_t gpuvm_base;		/* from KFD */
-	uint64_t gpuvm_limit;		/* from KFD */
-	uint32_t gpu_id;		/* from KFD */
-	uint32_t pad;
+	__u64 lds_base;		/* from KFD */
+	__u64 lds_limit;		/* from KFD */
+	__u64 scratch_base;		/* from KFD */
+	__u64 scratch_limit;		/* from KFD */
+	__u64 gpuvm_base;		/* from KFD */
+	__u64 gpuvm_limit;		/* from KFD */
+	__u32 gpu_id;		/* from KFD */
+	__u32 pad;
 };
 
 struct kfd_ioctl_get_process_apertures_args {
@@ -124,8 +124,8 @@ struct kfd_ioctl_get_process_apertures_args {
 			process_apertures[NUM_OF_SUPPORTED_GPUS];/* from KFD */
 
 	/* from KFD, should be in the range [1 - NUM_OF_SUPPORTED_GPUS] */
-	uint32_t num_of_nodes;
-	uint32_t pad;
+	__u32 num_of_nodes;
+	__u32 pad;
 };
 
 #define MAX_ALLOWED_NUM_POINTS    100
@@ -133,25 +133,25 @@ struct kfd_ioctl_get_process_apertures_args {
 #define MAX_ALLOWED_WAC_BUFF_SIZE  128
 
 struct kfd_ioctl_dbg_register_args {
-	uint32_t gpu_id;		/* to KFD */
-	uint32_t pad;
+	__u32 gpu_id;		/* to KFD */
+	__u32 pad;
 };
 
 struct kfd_ioctl_dbg_unregister_args {
-	uint32_t gpu_id;		/* to KFD */
-	uint32_t pad;
+	__u32 gpu_id;		/* to KFD */
+	__u32 pad;
 };
 
 struct kfd_ioctl_dbg_address_watch_args {
-	uint64_t content_ptr;		/* a pointer to the actual content */
-	uint32_t gpu_id;		/* to KFD */
-	uint32_t buf_size_in_bytes;	/*including gpu_id and buf_size */
+	__u64 content_ptr;		/* a pointer to the actual content */
+	__u32 gpu_id;		/* to KFD */
+	__u32 buf_size_in_bytes;	/*including gpu_id and buf_size */
 };
 
 struct kfd_ioctl_dbg_wave_control_args {
-	uint64_t content_ptr;		/* a pointer to the actual content */
-	uint32_t gpu_id;		/* to KFD */
-	uint32_t buf_size_in_bytes;	/*including gpu_id and buf_size */
+	__u64 content_ptr;		/* a pointer to the actual content */
+	__u32 gpu_id;		/* to KFD */
+	__u32 buf_size_in_bytes;	/*including gpu_id and buf_size */
 };
 
 /* Matching HSA_EVENTTYPE */
@@ -172,44 +172,44 @@ struct kfd_ioctl_dbg_wave_control_args {
 #define KFD_SIGNAL_EVENT_LIMIT			256
 
 struct kfd_ioctl_create_event_args {
-	uint64_t event_page_offset;	/* from KFD */
-	uint32_t event_trigger_data;	/* from KFD - signal events only */
-	uint32_t event_type;		/* to KFD */
-	uint32_t auto_reset;		/* to KFD */
-	uint32_t node_id;		/* to KFD - only valid for certain
+	__u64 event_page_offset;	/* from KFD */
+	__u32 event_trigger_data;	/* from KFD - signal events only */
+	__u32 event_type;		/* to KFD */
+	__u32 auto_reset;		/* to KFD */
+	__u32 node_id;		/* to KFD - only valid for certain
 							event types */
-	uint32_t event_id;		/* from KFD */
-	uint32_t event_slot_index;	/* from KFD */
+	__u32 event_id;		/* from KFD */
+	__u32 event_slot_index;	/* from KFD */
 };
 
 struct kfd_ioctl_destroy_event_args {
-	uint32_t event_id;		/* to KFD */
-	uint32_t pad;
+	__u32 event_id;		/* to KFD */
+	__u32 pad;
 };
 
 struct kfd_ioctl_set_event_args {
-	uint32_t event_id;		/* to KFD */
-	uint32_t pad;
+	__u32 event_id;		/* to KFD */
+	__u32 pad;
 };
 
 struct kfd_ioctl_reset_event_args {
-	uint32_t event_id;		/* to KFD */
-	uint32_t pad;
+	__u32 event_id;		/* to KFD */
+	__u32 pad;
 };
 
 struct kfd_memory_exception_failure {
-	uint32_t NotPresent;	/* Page not present or supervisor privilege */
-	uint32_t ReadOnly;	/* Write access to a read-only page */
-	uint32_t NoExecute;	/* Execute access to a page marked NX */
-	uint32_t pad;
+	__u32 NotPresent;	/* Page not present or supervisor privilege */
+	__u32 ReadOnly;	/* Write access to a read-only page */
+	__u32 NoExecute;	/* Execute access to a page marked NX */
+	__u32 pad;
 };
 
 /* memory exception data*/
 struct kfd_hsa_memory_exception_data {
 	struct kfd_memory_exception_failure failure;
-	uint64_t va;
-	uint32_t gpu_id;
-	uint32_t pad;
+	__u64 va;
+	__u32 gpu_id;
+	__u32 pad;
 };
 
 /* Event data*/
@@ -217,19 +217,19 @@ struct kfd_event_data {
 	union {
 		struct kfd_hsa_memory_exception_data memory_exception_data;
 	};				/* From KFD */
-	uint64_t kfd_event_data_ext;	/* pointer to an extension structure
+	__u64 kfd_event_data_ext;	/* pointer to an extension structure
 					   for future exception types */
-	uint32_t event_id;		/* to KFD */
-	uint32_t pad;
+	__u32 event_id;		/* to KFD */
+	__u32 pad;
 };
 
 struct kfd_ioctl_wait_events_args {
-	uint64_t events_ptr;		/* pointed to struct
+	__u64 events_ptr;		/* pointed to struct
 					   kfd_event_data array, to KFD */
-	uint32_t num_events;		/* to KFD */
-	uint32_t wait_for_all;		/* to KFD */
-	uint32_t timeout;		/* to KFD */
-	uint32_t wait_result;		/* from KFD */
+	__u32 num_events;		/* to KFD */
+	__u32 wait_for_all;		/* to KFD */
+	__u32 timeout;		/* to KFD */
+	__u32 wait_result;		/* from KFD */
 };
 
 struct kfd_ioctl_set_scratch_backing_va_args {
-- 
cgit v1.2.3-71-gd317


From f02a60924c221985fb8b634734d6610706fa779a Mon Sep 17 00:00:00 2001
From: Mikko Rapeli <mikko.rapeli@iki.fi>
Date: Sun, 6 Aug 2017 18:44:07 +0200
Subject: uapi linux/dlm_netlink.h: include linux/dlmconstants.h
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Fixes userspace compilation error:

error: ‘DLM_RESNAME_MAXLEN’ undeclared here (not in a function)
  char resource_name[DLM_RESNAME_MAXLEN];

Signed-off-by: Mikko Rapeli <mikko.rapeli@iki.fi>
Signed-off-by: David Teigland <teigland@redhat.com>
---
 include/uapi/linux/dlm_netlink.h | 1 +
 1 file changed, 1 insertion(+)

(limited to 'include/uapi/linux')

diff --git a/include/uapi/linux/dlm_netlink.h b/include/uapi/linux/dlm_netlink.h
index 647c8ef27227..ef1e2e08769a 100644
--- a/include/uapi/linux/dlm_netlink.h
+++ b/include/uapi/linux/dlm_netlink.h
@@ -10,6 +10,7 @@
 #define _DLM_NETLINK_H
 
 #include <linux/types.h>
+#include <linux/dlmconstants.h>
 
 enum {
 	DLM_STATUS_WAITING = 1,
-- 
cgit v1.2.3-71-gd317


From d1df6fd8a1d22d37cffa0075ab8ad423ce656777 Mon Sep 17 00:00:00 2001
From: David Lebrun <david.lebrun@uclouvain.be>
Date: Sat, 5 Aug 2017 12:38:26 +0200
Subject: ipv6: sr: define core operations for seg6local lightweight tunnel

This patch implements a new type of lightweight tunnel named seg6local.
A seg6local lwt is defined by a type of action and a set of parameters.
The action represents the operation to perform on the packets matching the
lwt's route, and is not necessarily an encapsulation. The set of parameters
are arguments for the processing function.

Each action is defined in a struct seg6_action_desc within
seg6_action_table[]. This structure contains the action, mandatory
attributes, the processing function, and a static headroom size required by
the action. The mandatory attributes are encoded as a bitmask field. The
static headroom is set to a non-zero value when the processing function
always add a constant number of bytes to the skb (e.g. the header size for
encapsulations).

To facilitate rtnetlink-related operations such as parsing, fill_encap,
and cmp_encap, each type of action parameter is associated to three
function pointers, in seg6_action_params[].

All actions defined in seg6_local.h are detailed in [1].

[1] https://tools.ietf.org/html/draft-filsfils-spring-srv6-network-programming-01

Signed-off-by: David Lebrun <david.lebrun@uclouvain.be>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/seg6_local.h      |   6 +
 include/net/seg6.h              |   2 +
 include/uapi/linux/lwtunnel.h   |   1 +
 include/uapi/linux/seg6_local.h |  68 +++++++++
 net/core/lwtunnel.c             |   2 +
 net/ipv6/Kconfig                |   3 +-
 net/ipv6/Makefile               |   2 +-
 net/ipv6/seg6.c                 |   5 +
 net/ipv6/seg6_local.c           | 320 ++++++++++++++++++++++++++++++++++++++++
 9 files changed, 407 insertions(+), 2 deletions(-)
 create mode 100644 include/linux/seg6_local.h
 create mode 100644 include/uapi/linux/seg6_local.h
 create mode 100644 net/ipv6/seg6_local.c

(limited to 'include/uapi/linux')

diff --git a/include/linux/seg6_local.h b/include/linux/seg6_local.h
new file mode 100644
index 000000000000..ee63e76fe0c7
--- /dev/null
+++ b/include/linux/seg6_local.h
@@ -0,0 +1,6 @@
+#ifndef _LINUX_SEG6_LOCAL_H
+#define _LINUX_SEG6_LOCAL_H
+
+#include <uapi/linux/seg6_local.h>
+
+#endif
diff --git a/include/net/seg6.h b/include/net/seg6.h
index a32abb040e1d..5379f550f521 100644
--- a/include/net/seg6.h
+++ b/include/net/seg6.h
@@ -56,6 +56,8 @@ extern int seg6_init(void);
 extern void seg6_exit(void);
 extern int seg6_iptunnel_init(void);
 extern void seg6_iptunnel_exit(void);
+extern int seg6_local_init(void);
+extern void seg6_local_exit(void);
 
 extern bool seg6_validate_srh(struct ipv6_sr_hdr *srh, int len);
 extern int seg6_do_srh_encap(struct sk_buff *skb, struct ipv6_sr_hdr *osrh);
diff --git a/include/uapi/linux/lwtunnel.h b/include/uapi/linux/lwtunnel.h
index 92724cba1eba..7fdd19ca7511 100644
--- a/include/uapi/linux/lwtunnel.h
+++ b/include/uapi/linux/lwtunnel.h
@@ -11,6 +11,7 @@ enum lwtunnel_encap_types {
 	LWTUNNEL_ENCAP_IP6,
 	LWTUNNEL_ENCAP_SEG6,
 	LWTUNNEL_ENCAP_BPF,
+	LWTUNNEL_ENCAP_SEG6_LOCAL,
 	__LWTUNNEL_ENCAP_MAX,
 };
 
diff --git a/include/uapi/linux/seg6_local.h b/include/uapi/linux/seg6_local.h
new file mode 100644
index 000000000000..ef2d8c3e76c1
--- /dev/null
+++ b/include/uapi/linux/seg6_local.h
@@ -0,0 +1,68 @@
+/*
+ *  SR-IPv6 implementation
+ *
+ *  Author:
+ *  David Lebrun <david.lebrun@uclouvain.be>
+ *
+ *
+ *  This program is free software; you can redistribute it and/or
+ *      modify it under the terms of the GNU General Public License
+ *      as published by the Free Software Foundation; either version
+ *      2 of the License, or (at your option) any later version.
+ */
+
+#ifndef _UAPI_LINUX_SEG6_LOCAL_H
+#define _UAPI_LINUX_SEG6_LOCAL_H
+
+#include <linux/seg6.h>
+
+enum {
+	SEG6_LOCAL_UNSPEC,
+	SEG6_LOCAL_ACTION,
+	SEG6_LOCAL_SRH,
+	SEG6_LOCAL_TABLE,
+	SEG6_LOCAL_NH4,
+	SEG6_LOCAL_NH6,
+	SEG6_LOCAL_IIF,
+	SEG6_LOCAL_OIF,
+	__SEG6_LOCAL_MAX,
+};
+#define SEG6_LOCAL_MAX (__SEG6_LOCAL_MAX - 1)
+
+enum {
+	SEG6_LOCAL_ACTION_UNSPEC	= 0,
+	/* node segment */
+	SEG6_LOCAL_ACTION_END		= 1,
+	/* adjacency segment (IPv6 cross-connect) */
+	SEG6_LOCAL_ACTION_END_X		= 2,
+	/* lookup of next seg NH in table */
+	SEG6_LOCAL_ACTION_END_T		= 3,
+	/* decap and L2 cross-connect */
+	SEG6_LOCAL_ACTION_END_DX2	= 4,
+	/* decap and IPv6 cross-connect */
+	SEG6_LOCAL_ACTION_END_DX6	= 5,
+	/* decap and IPv4 cross-connect */
+	SEG6_LOCAL_ACTION_END_DX4	= 6,
+	/* decap and lookup of DA in v6 table */
+	SEG6_LOCAL_ACTION_END_DT6	= 7,
+	/* decap and lookup of DA in v4 table */
+	SEG6_LOCAL_ACTION_END_DT4	= 8,
+	/* binding segment with insertion */
+	SEG6_LOCAL_ACTION_END_B6	= 9,
+	/* binding segment with encapsulation */
+	SEG6_LOCAL_ACTION_END_B6_ENCAP	= 10,
+	/* binding segment with MPLS encap */
+	SEG6_LOCAL_ACTION_END_BM	= 11,
+	/* lookup last seg in table */
+	SEG6_LOCAL_ACTION_END_S		= 12,
+	/* forward to SR-unaware VNF with static proxy */
+	SEG6_LOCAL_ACTION_END_AS	= 13,
+	/* forward to SR-unaware VNF with masquerading */
+	SEG6_LOCAL_ACTION_END_AM	= 14,
+
+	__SEG6_LOCAL_ACTION_MAX,
+};
+
+#define SEG6_LOCAL_ACTION_MAX (__SEG6_LOCAL_ACTION_MAX - 1)
+
+#endif
diff --git a/net/core/lwtunnel.c b/net/core/lwtunnel.c
index 435f35f9a61c..0b171756453c 100644
--- a/net/core/lwtunnel.c
+++ b/net/core/lwtunnel.c
@@ -44,6 +44,8 @@ static const char *lwtunnel_encap_str(enum lwtunnel_encap_types encap_type)
 		return "SEG6";
 	case LWTUNNEL_ENCAP_BPF:
 		return "BPF";
+	case LWTUNNEL_ENCAP_SEG6_LOCAL:
+		return "SEG6LOCAL";
 	case LWTUNNEL_ENCAP_IP6:
 	case LWTUNNEL_ENCAP_IP:
 	case LWTUNNEL_ENCAP_NONE:
diff --git a/net/ipv6/Kconfig b/net/ipv6/Kconfig
index 50181a96923e..0d722396dce6 100644
--- a/net/ipv6/Kconfig
+++ b/net/ipv6/Kconfig
@@ -311,7 +311,8 @@ config IPV6_SEG6_LWTUNNEL
 	---help---
 	  Support for encapsulation of packets within an outer IPv6
 	  header and a Segment Routing Header using the lightweight
-	  tunnels mechanism.
+	  tunnels mechanism. Also enable support for advanced local
+	  processing of SRv6 packets based on their active segment.
 
 	  If unsure, say N.
 
diff --git a/net/ipv6/Makefile b/net/ipv6/Makefile
index f8b24c2e0d77..10e342363793 100644
--- a/net/ipv6/Makefile
+++ b/net/ipv6/Makefile
@@ -23,7 +23,7 @@ ipv6-$(CONFIG_IPV6_MULTIPLE_TABLES) += fib6_rules.o
 ipv6-$(CONFIG_PROC_FS) += proc.o
 ipv6-$(CONFIG_SYN_COOKIES) += syncookies.o
 ipv6-$(CONFIG_NETLABEL) += calipso.o
-ipv6-$(CONFIG_IPV6_SEG6_LWTUNNEL) += seg6_iptunnel.o
+ipv6-$(CONFIG_IPV6_SEG6_LWTUNNEL) += seg6_iptunnel.o seg6_local.o
 ipv6-$(CONFIG_IPV6_SEG6_HMAC) += seg6_hmac.o
 
 ipv6-objs += $(ipv6-y)
diff --git a/net/ipv6/seg6.c b/net/ipv6/seg6.c
index 81c2339b3285..c81407770956 100644
--- a/net/ipv6/seg6.c
+++ b/net/ipv6/seg6.c
@@ -456,6 +456,10 @@ int __init seg6_init(void)
 	err = seg6_iptunnel_init();
 	if (err)
 		goto out_unregister_pernet;
+
+	err = seg6_local_init();
+	if (err)
+		goto out_unregister_pernet;
 #endif
 
 #ifdef CONFIG_IPV6_SEG6_HMAC
@@ -471,6 +475,7 @@ out:
 #ifdef CONFIG_IPV6_SEG6_HMAC
 out_unregister_iptun:
 #ifdef CONFIG_IPV6_SEG6_LWTUNNEL
+	seg6_local_exit();
 	seg6_iptunnel_exit();
 #endif
 #endif
diff --git a/net/ipv6/seg6_local.c b/net/ipv6/seg6_local.c
new file mode 100644
index 000000000000..53615d7e0723
--- /dev/null
+++ b/net/ipv6/seg6_local.c
@@ -0,0 +1,320 @@
+/*
+ *  SR-IPv6 implementation
+ *
+ *  Author:
+ *  David Lebrun <david.lebrun@uclouvain.be>
+ *
+ *
+ *  This program is free software; you can redistribute it and/or
+ *        modify it under the terms of the GNU General Public License
+ *        as published by the Free Software Foundation; either version
+ *        2 of the License, or (at your option) any later version.
+ */
+
+#include <linux/types.h>
+#include <linux/skbuff.h>
+#include <linux/net.h>
+#include <linux/module.h>
+#include <net/ip.h>
+#include <net/lwtunnel.h>
+#include <net/netevent.h>
+#include <net/netns/generic.h>
+#include <net/ip6_fib.h>
+#include <net/route.h>
+#include <net/seg6.h>
+#include <linux/seg6.h>
+#include <linux/seg6_local.h>
+#include <net/addrconf.h>
+#include <net/ip6_route.h>
+#include <net/dst_cache.h>
+#ifdef CONFIG_IPV6_SEG6_HMAC
+#include <net/seg6_hmac.h>
+#endif
+
+struct seg6_local_lwt;
+
+struct seg6_action_desc {
+	int action;
+	unsigned long attrs;
+	int (*input)(struct sk_buff *skb, struct seg6_local_lwt *slwt);
+	int static_headroom;
+};
+
+struct seg6_local_lwt {
+	int action;
+	struct ipv6_sr_hdr *srh;
+	int table;
+	struct in_addr nh4;
+	struct in6_addr nh6;
+	int iif;
+	int oif;
+
+	int headroom;
+	struct seg6_action_desc *desc;
+};
+
+static struct seg6_local_lwt *seg6_local_lwtunnel(struct lwtunnel_state *lwt)
+{
+	return (struct seg6_local_lwt *)lwt->data;
+}
+
+static struct seg6_action_desc seg6_action_table[] = {
+	{
+		.action		= SEG6_LOCAL_ACTION_END,
+		.attrs		= 0,
+	},
+};
+
+static struct seg6_action_desc *__get_action_desc(int action)
+{
+	struct seg6_action_desc *desc;
+	int i, count;
+
+	count = sizeof(seg6_action_table) / sizeof(struct seg6_action_desc);
+	for (i = 0; i < count; i++) {
+		desc = &seg6_action_table[i];
+		if (desc->action == action)
+			return desc;
+	}
+
+	return NULL;
+}
+
+static int seg6_local_input(struct sk_buff *skb)
+{
+	struct dst_entry *orig_dst = skb_dst(skb);
+	struct seg6_action_desc *desc;
+	struct seg6_local_lwt *slwt;
+
+	slwt = seg6_local_lwtunnel(orig_dst->lwtstate);
+	desc = slwt->desc;
+
+	return desc->input(skb, slwt);
+}
+
+static const struct nla_policy seg6_local_policy[SEG6_LOCAL_MAX + 1] = {
+	[SEG6_LOCAL_ACTION]	= { .type = NLA_U32 },
+	[SEG6_LOCAL_SRH]	= { .type = NLA_BINARY },
+	[SEG6_LOCAL_TABLE]	= { .type = NLA_U32 },
+	[SEG6_LOCAL_NH4]	= { .type = NLA_BINARY,
+				    .len = sizeof(struct in_addr) },
+	[SEG6_LOCAL_NH6]	= { .type = NLA_BINARY,
+				    .len = sizeof(struct in6_addr) },
+	[SEG6_LOCAL_IIF]	= { .type = NLA_U32 },
+	[SEG6_LOCAL_OIF]	= { .type = NLA_U32 },
+};
+
+struct seg6_action_param {
+	int (*parse)(struct nlattr **attrs, struct seg6_local_lwt *slwt);
+	int (*put)(struct sk_buff *skb, struct seg6_local_lwt *slwt);
+	int (*cmp)(struct seg6_local_lwt *a, struct seg6_local_lwt *b);
+};
+
+static struct seg6_action_param seg6_action_params[SEG6_LOCAL_MAX + 1] = {
+	[SEG6_LOCAL_SRH]	= { .parse = NULL,
+				    .put = NULL,
+				    .cmp = NULL },
+
+	[SEG6_LOCAL_TABLE]	= { .parse = NULL,
+				    .put = NULL,
+				    .cmp = NULL },
+
+	[SEG6_LOCAL_NH4]	= { .parse = NULL,
+				    .put = NULL,
+				    .cmp = NULL },
+
+	[SEG6_LOCAL_NH6]	= { .parse = NULL,
+				    .put = NULL,
+				    .cmp = NULL },
+
+	[SEG6_LOCAL_IIF]	= { .parse = NULL,
+				    .put = NULL,
+				    .cmp = NULL },
+
+	[SEG6_LOCAL_OIF]	= { .parse = NULL,
+				    .put = NULL,
+				    .cmp = NULL },
+};
+
+static int parse_nla_action(struct nlattr **attrs, struct seg6_local_lwt *slwt)
+{
+	struct seg6_action_param *param;
+	struct seg6_action_desc *desc;
+	int i, err;
+
+	desc = __get_action_desc(slwt->action);
+	if (!desc)
+		return -EINVAL;
+
+	if (!desc->input)
+		return -EOPNOTSUPP;
+
+	slwt->desc = desc;
+	slwt->headroom += desc->static_headroom;
+
+	for (i = 0; i < SEG6_LOCAL_MAX + 1; i++) {
+		if (desc->attrs & (1 << i)) {
+			if (!attrs[i])
+				return -EINVAL;
+
+			param = &seg6_action_params[i];
+
+			err = param->parse(attrs, slwt);
+			if (err < 0)
+				return err;
+		}
+	}
+
+	return 0;
+}
+
+static int seg6_local_build_state(struct nlattr *nla, unsigned int family,
+				  const void *cfg, struct lwtunnel_state **ts,
+				  struct netlink_ext_ack *extack)
+{
+	struct nlattr *tb[SEG6_LOCAL_MAX + 1];
+	struct lwtunnel_state *newts;
+	struct seg6_local_lwt *slwt;
+	int err;
+
+	err = nla_parse_nested(tb, SEG6_LOCAL_MAX, nla, seg6_local_policy,
+			       extack);
+
+	if (err < 0)
+		return err;
+
+	if (!tb[SEG6_LOCAL_ACTION])
+		return -EINVAL;
+
+	newts = lwtunnel_state_alloc(sizeof(*slwt));
+	if (!newts)
+		return -ENOMEM;
+
+	slwt = seg6_local_lwtunnel(newts);
+	slwt->action = nla_get_u32(tb[SEG6_LOCAL_ACTION]);
+
+	err = parse_nla_action(tb, slwt);
+	if (err < 0)
+		goto out_free;
+
+	newts->type = LWTUNNEL_ENCAP_SEG6_LOCAL;
+	newts->flags = LWTUNNEL_STATE_INPUT_REDIRECT;
+	newts->headroom = slwt->headroom;
+
+	*ts = newts;
+
+	return 0;
+
+out_free:
+	kfree(slwt->srh);
+	kfree(newts);
+	return err;
+}
+
+static void seg6_local_destroy_state(struct lwtunnel_state *lwt)
+{
+	struct seg6_local_lwt *slwt = seg6_local_lwtunnel(lwt);
+
+	kfree(slwt->srh);
+}
+
+static int seg6_local_fill_encap(struct sk_buff *skb,
+				 struct lwtunnel_state *lwt)
+{
+	struct seg6_local_lwt *slwt = seg6_local_lwtunnel(lwt);
+	struct seg6_action_param *param;
+	int i, err;
+
+	if (nla_put_u32(skb, SEG6_LOCAL_ACTION, slwt->action))
+		return -EMSGSIZE;
+
+	for (i = 0; i < SEG6_LOCAL_MAX + 1; i++) {
+		if (slwt->desc->attrs & (1 << i)) {
+			param = &seg6_action_params[i];
+			err = param->put(skb, slwt);
+			if (err < 0)
+				return err;
+		}
+	}
+
+	return 0;
+}
+
+static int seg6_local_get_encap_size(struct lwtunnel_state *lwt)
+{
+	struct seg6_local_lwt *slwt = seg6_local_lwtunnel(lwt);
+	unsigned long attrs;
+	int nlsize;
+
+	nlsize = nla_total_size(4); /* action */
+
+	attrs = slwt->desc->attrs;
+
+	if (attrs & (1 << SEG6_LOCAL_SRH))
+		nlsize += nla_total_size((slwt->srh->hdrlen + 1) << 3);
+
+	if (attrs & (1 << SEG6_LOCAL_TABLE))
+		nlsize += nla_total_size(4);
+
+	if (attrs & (1 << SEG6_LOCAL_NH4))
+		nlsize += nla_total_size(4);
+
+	if (attrs & (1 << SEG6_LOCAL_NH6))
+		nlsize += nla_total_size(16);
+
+	if (attrs & (1 << SEG6_LOCAL_IIF))
+		nlsize += nla_total_size(4);
+
+	if (attrs & (1 << SEG6_LOCAL_OIF))
+		nlsize += nla_total_size(4);
+
+	return nlsize;
+}
+
+static int seg6_local_cmp_encap(struct lwtunnel_state *a,
+				struct lwtunnel_state *b)
+{
+	struct seg6_local_lwt *slwt_a, *slwt_b;
+	struct seg6_action_param *param;
+	int i;
+
+	slwt_a = seg6_local_lwtunnel(a);
+	slwt_b = seg6_local_lwtunnel(b);
+
+	if (slwt_a->action != slwt_b->action)
+		return 1;
+
+	if (slwt_a->desc->attrs != slwt_b->desc->attrs)
+		return 1;
+
+	for (i = 0; i < SEG6_LOCAL_MAX + 1; i++) {
+		if (slwt_a->desc->attrs & (1 << i)) {
+			param = &seg6_action_params[i];
+			if (param->cmp(slwt_a, slwt_b))
+				return 1;
+		}
+	}
+
+	return 0;
+}
+
+static const struct lwtunnel_encap_ops seg6_local_ops = {
+	.build_state	= seg6_local_build_state,
+	.destroy_state	= seg6_local_destroy_state,
+	.input		= seg6_local_input,
+	.fill_encap	= seg6_local_fill_encap,
+	.get_encap_size	= seg6_local_get_encap_size,
+	.cmp_encap	= seg6_local_cmp_encap,
+	.owner		= THIS_MODULE,
+};
+
+int __init seg6_local_init(void)
+{
+	return lwtunnel_encap_add_ops(&seg6_local_ops,
+				      LWTUNNEL_ENCAP_SEG6_LOCAL);
+}
+
+void seg6_local_exit(void)
+{
+	lwtunnel_encap_del_ops(&seg6_local_ops, LWTUNNEL_ENCAP_SEG6_LOCAL);
+}
-- 
cgit v1.2.3-71-gd317


From 6c2c188f35c61c8eee71ec6d07524ce122c06539 Mon Sep 17 00:00:00 2001
From: Hans Verkuil <hans.verkuil@cisco.com>
Date: Fri, 28 Jul 2017 03:25:06 -0400
Subject: media: drop use of MEDIA_API_VERSION

Set media_version to LINUX_VERSION_CODE, just as we did for
driver_version.

Nobody ever rememebers to update the version number, but
LINUX_VERSION_CODE will always be updated.

Move the MEDIA_API_VERSION define to the ifndef __KERNEL__ section of the
media.h header. That way kernelspace can't accidentally start to use
it again.

Signed-off-by: Hans Verkuil <hans.verkuil@cisco.com>
Reviewed-by: Laurent Pinchart <laurent.pinchart@ideasonboard.com>
Signed-off-by: Mauro Carvalho Chehab <mchehab@s-opensource.com>
---
 drivers/media/media-device.c | 4 ++--
 include/uapi/linux/media.h   | 5 +++--
 2 files changed, 5 insertions(+), 4 deletions(-)

(limited to 'include/uapi/linux')

diff --git a/drivers/media/media-device.c b/drivers/media/media-device.c
index 979e4307d248..e79f72b8b858 100644
--- a/drivers/media/media-device.c
+++ b/drivers/media/media-device.c
@@ -69,9 +69,9 @@ static int media_device_get_info(struct media_device *dev,
 	strlcpy(info->serial, dev->serial, sizeof(info->serial));
 	strlcpy(info->bus_info, dev->bus_info, sizeof(info->bus_info));
 
-	info->media_version = MEDIA_API_VERSION;
+	info->media_version = LINUX_VERSION_CODE;
+	info->driver_version = info->media_version;
 	info->hw_revision = dev->hw_revision;
-	info->driver_version = LINUX_VERSION_CODE;
 
 	return 0;
 }
diff --git a/include/uapi/linux/media.h b/include/uapi/linux/media.h
index fac96c64fe51..4865f1e71339 100644
--- a/include/uapi/linux/media.h
+++ b/include/uapi/linux/media.h
@@ -30,8 +30,6 @@
 #include <linux/types.h>
 #include <linux/version.h>
 
-#define MEDIA_API_VERSION	KERNEL_VERSION(0, 1, 0)
-
 struct media_device_info {
 	char driver[16];
 	char model[32];
@@ -187,6 +185,9 @@ struct media_device_info {
 #define MEDIA_ENT_T_V4L2_SUBDEV_LENS	MEDIA_ENT_F_LENS
 #define MEDIA_ENT_T_V4L2_SUBDEV_DECODER	MEDIA_ENT_F_ATV_DECODER
 #define MEDIA_ENT_T_V4L2_SUBDEV_TUNER	MEDIA_ENT_F_TUNER
+
+/* Obsolete symbol for media_version, no longer used in the kernel */
+#define MEDIA_API_VERSION		KERNEL_VERSION(0, 1, 0)
 #endif
 
 /* Entity flags */
-- 
cgit v1.2.3-71-gd317


From 79bcd34ccfe009ad21d16100ae2aef9b378a512d Mon Sep 17 00:00:00 2001
From: Hans Verkuil <hverkuil@xs4all.nl>
Date: Tue, 1 Aug 2017 07:53:30 -0400
Subject: media: cec-funcs.h: cec_ops_report_features: set *dev_features to
 NULL

gcc can get confused by this code and it thinks dev_features can be
returned uninitialized. So initialize to NULL at the beginning to shut up
the warning.

Signed-off-by: Hans Verkuil <hans.verkuil@cisco.com>
Signed-off-by: Mauro Carvalho Chehab <mchehab@s-opensource.com>
---
 include/uapi/linux/cec-funcs.h | 1 +
 1 file changed, 1 insertion(+)

(limited to 'include/uapi/linux')

diff --git a/include/uapi/linux/cec-funcs.h b/include/uapi/linux/cec-funcs.h
index c451eec42a83..270b251a3d9b 100644
--- a/include/uapi/linux/cec-funcs.h
+++ b/include/uapi/linux/cec-funcs.h
@@ -895,6 +895,7 @@ static inline void cec_ops_report_features(const struct cec_msg *msg,
 	*cec_version = msg->msg[2];
 	*all_device_types = msg->msg[3];
 	*rc_profile = p;
+	*dev_features = NULL;
 	while (p < &msg->msg[14] && (*p & CEC_OP_FEAT_EXT))
 		p++;
 	if (!(*p & CEC_OP_FEAT_EXT)) {
-- 
cgit v1.2.3-71-gd317


From 92b31a9af73b3a3fc801899335d6c47966351830 Mon Sep 17 00:00:00 2001
From: Daniel Borkmann <daniel@iogearbox.net>
Date: Thu, 10 Aug 2017 01:39:55 +0200
Subject: bpf: add BPF_J{LT,LE,SLT,SLE} instructions

Currently, eBPF only understands BPF_JGT (>), BPF_JGE (>=),
BPF_JSGT (s>), BPF_JSGE (s>=) instructions, this means that
particularly *JLT/*JLE counterparts involving immediates need
to be rewritten from e.g. X < [IMM] by swapping arguments into
[IMM] > X, meaning the immediate first is required to be loaded
into a register Y := [IMM], such that then we can compare with
Y > X. Note that the destination operand is always required to
be a register.

This has the downside of having unnecessarily increased register
pressure, meaning complex program would need to spill other
registers temporarily to stack in order to obtain an unused
register for the [IMM]. Loading to registers will thus also
affect state pruning since we need to account for that register
use and potentially those registers that had to be spilled/filled
again. As a consequence slightly more stack space might have
been used due to spilling, and BPF programs are a bit longer
due to extra code involving the register load and potentially
required spill/fills.

Thus, add BPF_JLT (<), BPF_JLE (<=), BPF_JSLT (s<), BPF_JSLE (s<=)
counterparts to the eBPF instruction set. Modifying LLVM to
remove the NegateCC() workaround in a PoC patch at [1] and
allowing it to also emit the new instructions resulted in
cilium's BPF programs that are injected into the fast-path to
have a reduced program length in the range of 2-3% (e.g.
accumulated main and tail call sections from one of the object
file reduced from 4864 to 4729 insns), reduced complexity in
the range of 10-30% (e.g. accumulated sections reduced in one
of the cases from 116432 to 88428 insns), and reduced stack
usage in the range of 1-5% (e.g. accumulated sections from one
of the object files reduced from 824 to 784b).

The modification for LLVM will be incorporated in a backwards
compatible way. Plan is for LLVM to have i) a target specific
option to offer a possibility to explicitly enable the extension
by the user (as we have with -m target specific extensions today
for various CPU insns), and ii) have the kernel checked for
presence of the extensions and enable them transparently when
the user is selecting more aggressive options such as -march=native
in a bpf target context. (Other frontends generating BPF byte
code, e.g. ply can probe the kernel directly for its code
generation.)

  [1] https://github.com/borkmann/llvm/tree/bpf-insns

Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
Acked-by: Alexei Starovoitov <ast@kernel.org>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 Documentation/networking/filter.txt |   4 +
 include/uapi/linux/bpf.h            |   5 +
 kernel/bpf/core.c                   |  60 ++++++
 lib/test_bpf.c                      | 364 ++++++++++++++++++++++++++++++++++++
 net/core/filter.c                   |  21 ++-
 tools/include/uapi/linux/bpf.h      |   5 +
 6 files changed, 455 insertions(+), 4 deletions(-)

(limited to 'include/uapi/linux')

diff --git a/Documentation/networking/filter.txt b/Documentation/networking/filter.txt
index d0fdba7d66e2..6a0df8df6c43 100644
--- a/Documentation/networking/filter.txt
+++ b/Documentation/networking/filter.txt
@@ -906,6 +906,10 @@ If BPF_CLASS(code) == BPF_JMP, BPF_OP(code) is one of:
   BPF_JSGE  0x70  /* eBPF only: signed '>=' */
   BPF_CALL  0x80  /* eBPF only: function call */
   BPF_EXIT  0x90  /* eBPF only: function return */
+  BPF_JLT   0xa0  /* eBPF only: unsigned '<' */
+  BPF_JLE   0xb0  /* eBPF only: unsigned '<=' */
+  BPF_JSLT  0xc0  /* eBPF only: signed '<' */
+  BPF_JSLE  0xd0  /* eBPF only: signed '<=' */
 
 So BPF_ADD | BPF_X | BPF_ALU means 32-bit addition in both classic BPF
 and eBPF. There are only two registers in classic BPF, so it means A += X.
diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h
index 1d06be1569b1..91da8371a2d0 100644
--- a/include/uapi/linux/bpf.h
+++ b/include/uapi/linux/bpf.h
@@ -30,9 +30,14 @@
 #define BPF_FROM_LE	BPF_TO_LE
 #define BPF_FROM_BE	BPF_TO_BE
 
+/* jmp encodings */
 #define BPF_JNE		0x50	/* jump != */
+#define BPF_JLT		0xa0	/* LT is unsigned, '<' */
+#define BPF_JLE		0xb0	/* LE is unsigned, '<=' */
 #define BPF_JSGT	0x60	/* SGT is signed '>', GT in x86 */
 #define BPF_JSGE	0x70	/* SGE is signed '>=', GE in x86 */
+#define BPF_JSLT	0xc0	/* SLT is signed, '<' */
+#define BPF_JSLE	0xd0	/* SLE is signed, '<=' */
 #define BPF_CALL	0x80	/* function call */
 #define BPF_EXIT	0x90	/* function return */
 
diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c
index ad5f55922a13..c69e7f5bfde7 100644
--- a/kernel/bpf/core.c
+++ b/kernel/bpf/core.c
@@ -595,9 +595,13 @@ static int bpf_jit_blind_insn(const struct bpf_insn *from,
 	case BPF_JMP | BPF_JEQ  | BPF_K:
 	case BPF_JMP | BPF_JNE  | BPF_K:
 	case BPF_JMP | BPF_JGT  | BPF_K:
+	case BPF_JMP | BPF_JLT  | BPF_K:
 	case BPF_JMP | BPF_JGE  | BPF_K:
+	case BPF_JMP | BPF_JLE  | BPF_K:
 	case BPF_JMP | BPF_JSGT | BPF_K:
+	case BPF_JMP | BPF_JSLT | BPF_K:
 	case BPF_JMP | BPF_JSGE | BPF_K:
+	case BPF_JMP | BPF_JSLE | BPF_K:
 	case BPF_JMP | BPF_JSET | BPF_K:
 		/* Accommodate for extra offset in case of a backjump. */
 		off = from->off;
@@ -833,12 +837,20 @@ static unsigned int ___bpf_prog_run(u64 *regs, const struct bpf_insn *insn,
 		[BPF_JMP | BPF_JNE | BPF_K] = &&JMP_JNE_K,
 		[BPF_JMP | BPF_JGT | BPF_X] = &&JMP_JGT_X,
 		[BPF_JMP | BPF_JGT | BPF_K] = &&JMP_JGT_K,
+		[BPF_JMP | BPF_JLT | BPF_X] = &&JMP_JLT_X,
+		[BPF_JMP | BPF_JLT | BPF_K] = &&JMP_JLT_K,
 		[BPF_JMP | BPF_JGE | BPF_X] = &&JMP_JGE_X,
 		[BPF_JMP | BPF_JGE | BPF_K] = &&JMP_JGE_K,
+		[BPF_JMP | BPF_JLE | BPF_X] = &&JMP_JLE_X,
+		[BPF_JMP | BPF_JLE | BPF_K] = &&JMP_JLE_K,
 		[BPF_JMP | BPF_JSGT | BPF_X] = &&JMP_JSGT_X,
 		[BPF_JMP | BPF_JSGT | BPF_K] = &&JMP_JSGT_K,
+		[BPF_JMP | BPF_JSLT | BPF_X] = &&JMP_JSLT_X,
+		[BPF_JMP | BPF_JSLT | BPF_K] = &&JMP_JSLT_K,
 		[BPF_JMP | BPF_JSGE | BPF_X] = &&JMP_JSGE_X,
 		[BPF_JMP | BPF_JSGE | BPF_K] = &&JMP_JSGE_K,
+		[BPF_JMP | BPF_JSLE | BPF_X] = &&JMP_JSLE_X,
+		[BPF_JMP | BPF_JSLE | BPF_K] = &&JMP_JSLE_K,
 		[BPF_JMP | BPF_JSET | BPF_X] = &&JMP_JSET_X,
 		[BPF_JMP | BPF_JSET | BPF_K] = &&JMP_JSET_K,
 		/* Program return */
@@ -1073,6 +1085,18 @@ out:
 			CONT_JMP;
 		}
 		CONT;
+	JMP_JLT_X:
+		if (DST < SRC) {
+			insn += insn->off;
+			CONT_JMP;
+		}
+		CONT;
+	JMP_JLT_K:
+		if (DST < IMM) {
+			insn += insn->off;
+			CONT_JMP;
+		}
+		CONT;
 	JMP_JGE_X:
 		if (DST >= SRC) {
 			insn += insn->off;
@@ -1085,6 +1109,18 @@ out:
 			CONT_JMP;
 		}
 		CONT;
+	JMP_JLE_X:
+		if (DST <= SRC) {
+			insn += insn->off;
+			CONT_JMP;
+		}
+		CONT;
+	JMP_JLE_K:
+		if (DST <= IMM) {
+			insn += insn->off;
+			CONT_JMP;
+		}
+		CONT;
 	JMP_JSGT_X:
 		if (((s64) DST) > ((s64) SRC)) {
 			insn += insn->off;
@@ -1097,6 +1133,18 @@ out:
 			CONT_JMP;
 		}
 		CONT;
+	JMP_JSLT_X:
+		if (((s64) DST) < ((s64) SRC)) {
+			insn += insn->off;
+			CONT_JMP;
+		}
+		CONT;
+	JMP_JSLT_K:
+		if (((s64) DST) < ((s64) IMM)) {
+			insn += insn->off;
+			CONT_JMP;
+		}
+		CONT;
 	JMP_JSGE_X:
 		if (((s64) DST) >= ((s64) SRC)) {
 			insn += insn->off;
@@ -1109,6 +1157,18 @@ out:
 			CONT_JMP;
 		}
 		CONT;
+	JMP_JSLE_X:
+		if (((s64) DST) <= ((s64) SRC)) {
+			insn += insn->off;
+			CONT_JMP;
+		}
+		CONT;
+	JMP_JSLE_K:
+		if (((s64) DST) <= ((s64) IMM)) {
+			insn += insn->off;
+			CONT_JMP;
+		}
+		CONT;
 	JMP_JSET_X:
 		if (DST & SRC) {
 			insn += insn->off;
diff --git a/lib/test_bpf.c b/lib/test_bpf.c
index d9d5a410955c..aa8812ae6776 100644
--- a/lib/test_bpf.c
+++ b/lib/test_bpf.c
@@ -951,6 +951,32 @@ static struct bpf_test tests[] = {
 		{ 4, 4, 4, 3, 3 },
 		{ { 2, 0 }, { 3, 1 }, { 4, MAX_K } },
 	},
+	{
+		"JGE (jt 0), test 1",
+		.u.insns = {
+			BPF_STMT(BPF_LDX | BPF_LEN, 0),
+			BPF_STMT(BPF_LD | BPF_B | BPF_ABS, 2),
+			BPF_JUMP(BPF_JMP | BPF_JGE | BPF_X, 0, 0, 1),
+			BPF_STMT(BPF_RET | BPF_K, 1),
+			BPF_STMT(BPF_RET | BPF_K, MAX_K)
+		},
+		CLASSIC,
+		{ 4, 4, 4, 3, 3 },
+		{ { 2, 0 }, { 3, 1 }, { 4, 1 } },
+	},
+	{
+		"JGE (jt 0), test 2",
+		.u.insns = {
+			BPF_STMT(BPF_LDX | BPF_LEN, 0),
+			BPF_STMT(BPF_LD | BPF_B | BPF_ABS, 2),
+			BPF_JUMP(BPF_JMP | BPF_JGE | BPF_X, 0, 0, 1),
+			BPF_STMT(BPF_RET | BPF_K, 1),
+			BPF_STMT(BPF_RET | BPF_K, MAX_K)
+		},
+		CLASSIC,
+		{ 4, 4, 5, 3, 3 },
+		{ { 4, 1 }, { 5, 1 }, { 6, MAX_K } },
+	},
 	{
 		"JGE",
 		.u.insns = {
@@ -4492,6 +4518,35 @@ static struct bpf_test tests[] = {
 		{ },
 		{ { 0, 1 } },
 	},
+	/* BPF_JMP | BPF_JSLT | BPF_K */
+	{
+		"JMP_JSLT_K: Signed jump: if (-2 < -1) return 1",
+		.u.insns_int = {
+			BPF_ALU32_IMM(BPF_MOV, R0, 0),
+			BPF_LD_IMM64(R1, 0xfffffffffffffffeLL),
+			BPF_JMP_IMM(BPF_JSLT, R1, -1, 1),
+			BPF_EXIT_INSN(),
+			BPF_ALU32_IMM(BPF_MOV, R0, 1),
+			BPF_EXIT_INSN(),
+		},
+		INTERNAL,
+		{ },
+		{ { 0, 1 } },
+	},
+	{
+		"JMP_JSLT_K: Signed jump: if (-1 < -1) return 0",
+		.u.insns_int = {
+			BPF_ALU32_IMM(BPF_MOV, R0, 1),
+			BPF_LD_IMM64(R1, 0xffffffffffffffffLL),
+			BPF_JMP_IMM(BPF_JSLT, R1, -1, 1),
+			BPF_EXIT_INSN(),
+			BPF_ALU32_IMM(BPF_MOV, R0, 0),
+			BPF_EXIT_INSN(),
+		},
+		INTERNAL,
+		{ },
+		{ { 0, 1 } },
+	},
 	/* BPF_JMP | BPF_JSGT | BPF_K */
 	{
 		"JMP_JSGT_K: Signed jump: if (-1 > -2) return 1",
@@ -4521,6 +4576,73 @@ static struct bpf_test tests[] = {
 		{ },
 		{ { 0, 1 } },
 	},
+	/* BPF_JMP | BPF_JSLE | BPF_K */
+	{
+		"JMP_JSLE_K: Signed jump: if (-2 <= -1) return 1",
+		.u.insns_int = {
+			BPF_ALU32_IMM(BPF_MOV, R0, 0),
+			BPF_LD_IMM64(R1, 0xfffffffffffffffeLL),
+			BPF_JMP_IMM(BPF_JSLE, R1, -1, 1),
+			BPF_EXIT_INSN(),
+			BPF_ALU32_IMM(BPF_MOV, R0, 1),
+			BPF_EXIT_INSN(),
+		},
+		INTERNAL,
+		{ },
+		{ { 0, 1 } },
+	},
+	{
+		"JMP_JSLE_K: Signed jump: if (-1 <= -1) return 1",
+		.u.insns_int = {
+			BPF_ALU32_IMM(BPF_MOV, R0, 0),
+			BPF_LD_IMM64(R1, 0xffffffffffffffffLL),
+			BPF_JMP_IMM(BPF_JSLE, R1, -1, 1),
+			BPF_EXIT_INSN(),
+			BPF_ALU32_IMM(BPF_MOV, R0, 1),
+			BPF_EXIT_INSN(),
+		},
+		INTERNAL,
+		{ },
+		{ { 0, 1 } },
+	},
+	{
+		"JMP_JSLE_K: Signed jump: value walk 1",
+		.u.insns_int = {
+			BPF_ALU32_IMM(BPF_MOV, R0, 0),
+			BPF_LD_IMM64(R1, 3),
+			BPF_JMP_IMM(BPF_JSLE, R1, 0, 6),
+			BPF_ALU64_IMM(BPF_SUB, R1, 1),
+			BPF_JMP_IMM(BPF_JSLE, R1, 0, 4),
+			BPF_ALU64_IMM(BPF_SUB, R1, 1),
+			BPF_JMP_IMM(BPF_JSLE, R1, 0, 2),
+			BPF_ALU64_IMM(BPF_SUB, R1, 1),
+			BPF_JMP_IMM(BPF_JSLE, R1, 0, 1),
+			BPF_EXIT_INSN(),		/* bad exit */
+			BPF_ALU32_IMM(BPF_MOV, R0, 1),	/* good exit */
+			BPF_EXIT_INSN(),
+		},
+		INTERNAL,
+		{ },
+		{ { 0, 1 } },
+	},
+	{
+		"JMP_JSLE_K: Signed jump: value walk 2",
+		.u.insns_int = {
+			BPF_ALU32_IMM(BPF_MOV, R0, 0),
+			BPF_LD_IMM64(R1, 3),
+			BPF_JMP_IMM(BPF_JSLE, R1, 0, 4),
+			BPF_ALU64_IMM(BPF_SUB, R1, 2),
+			BPF_JMP_IMM(BPF_JSLE, R1, 0, 2),
+			BPF_ALU64_IMM(BPF_SUB, R1, 2),
+			BPF_JMP_IMM(BPF_JSLE, R1, 0, 1),
+			BPF_EXIT_INSN(),		/* bad exit */
+			BPF_ALU32_IMM(BPF_MOV, R0, 1),	/* good exit */
+			BPF_EXIT_INSN(),
+		},
+		INTERNAL,
+		{ },
+		{ { 0, 1 } },
+	},
 	/* BPF_JMP | BPF_JSGE | BPF_K */
 	{
 		"JMP_JSGE_K: Signed jump: if (-1 >= -2) return 1",
@@ -4617,6 +4739,35 @@ static struct bpf_test tests[] = {
 		{ },
 		{ { 0, 1 } },
 	},
+	/* BPF_JMP | BPF_JLT | BPF_K */
+	{
+		"JMP_JLT_K: if (2 < 3) return 1",
+		.u.insns_int = {
+			BPF_ALU32_IMM(BPF_MOV, R0, 0),
+			BPF_LD_IMM64(R1, 2),
+			BPF_JMP_IMM(BPF_JLT, R1, 3, 1),
+			BPF_EXIT_INSN(),
+			BPF_ALU32_IMM(BPF_MOV, R0, 1),
+			BPF_EXIT_INSN(),
+		},
+		INTERNAL,
+		{ },
+		{ { 0, 1 } },
+	},
+	{
+		"JMP_JGT_K: Unsigned jump: if (1 < -1) return 1",
+		.u.insns_int = {
+			BPF_ALU32_IMM(BPF_MOV, R0, 0),
+			BPF_LD_IMM64(R1, 1),
+			BPF_JMP_IMM(BPF_JLT, R1, -1, 1),
+			BPF_EXIT_INSN(),
+			BPF_ALU32_IMM(BPF_MOV, R0, 1),
+			BPF_EXIT_INSN(),
+		},
+		INTERNAL,
+		{ },
+		{ { 0, 1 } },
+	},
 	/* BPF_JMP | BPF_JGE | BPF_K */
 	{
 		"JMP_JGE_K: if (3 >= 2) return 1",
@@ -4632,6 +4783,21 @@ static struct bpf_test tests[] = {
 		{ },
 		{ { 0, 1 } },
 	},
+	/* BPF_JMP | BPF_JLE | BPF_K */
+	{
+		"JMP_JLE_K: if (2 <= 3) return 1",
+		.u.insns_int = {
+			BPF_ALU32_IMM(BPF_MOV, R0, 0),
+			BPF_LD_IMM64(R1, 2),
+			BPF_JMP_IMM(BPF_JLE, R1, 3, 1),
+			BPF_EXIT_INSN(),
+			BPF_ALU32_IMM(BPF_MOV, R0, 1),
+			BPF_EXIT_INSN(),
+		},
+		INTERNAL,
+		{ },
+		{ { 0, 1 } },
+	},
 	/* BPF_JMP | BPF_JGT | BPF_K jump backwards */
 	{
 		"JMP_JGT_K: if (3 > 2) return 1 (jump backwards)",
@@ -4662,6 +4828,36 @@ static struct bpf_test tests[] = {
 		{ },
 		{ { 0, 1 } },
 	},
+	/* BPF_JMP | BPF_JLT | BPF_K jump backwards */
+	{
+		"JMP_JGT_K: if (2 < 3) return 1 (jump backwards)",
+		.u.insns_int = {
+			BPF_JMP_IMM(BPF_JA, 0, 0, 2), /* goto start */
+			BPF_ALU32_IMM(BPF_MOV, R0, 1), /* out: */
+			BPF_EXIT_INSN(),
+			BPF_ALU32_IMM(BPF_MOV, R0, 0), /* start: */
+			BPF_LD_IMM64(R1, 2), /* note: this takes 2 insns */
+			BPF_JMP_IMM(BPF_JLT, R1, 3, -6), /* goto out */
+			BPF_EXIT_INSN(),
+		},
+		INTERNAL,
+		{ },
+		{ { 0, 1 } },
+	},
+	{
+		"JMP_JLE_K: if (3 <= 3) return 1",
+		.u.insns_int = {
+			BPF_ALU32_IMM(BPF_MOV, R0, 0),
+			BPF_LD_IMM64(R1, 3),
+			BPF_JMP_IMM(BPF_JLE, R1, 3, 1),
+			BPF_EXIT_INSN(),
+			BPF_ALU32_IMM(BPF_MOV, R0, 1),
+			BPF_EXIT_INSN(),
+		},
+		INTERNAL,
+		{ },
+		{ { 0, 1 } },
+	},
 	/* BPF_JMP | BPF_JNE | BPF_K */
 	{
 		"JMP_JNE_K: if (3 != 2) return 1",
@@ -4752,6 +4948,37 @@ static struct bpf_test tests[] = {
 		{ },
 		{ { 0, 1 } },
 	},
+	/* BPF_JMP | BPF_JSLT | BPF_X */
+	{
+		"JMP_JSLT_X: Signed jump: if (-2 < -1) return 1",
+		.u.insns_int = {
+			BPF_ALU32_IMM(BPF_MOV, R0, 0),
+			BPF_LD_IMM64(R1, -1),
+			BPF_LD_IMM64(R2, -2),
+			BPF_JMP_REG(BPF_JSLT, R2, R1, 1),
+			BPF_EXIT_INSN(),
+			BPF_ALU32_IMM(BPF_MOV, R0, 1),
+			BPF_EXIT_INSN(),
+		},
+		INTERNAL,
+		{ },
+		{ { 0, 1 } },
+	},
+	{
+		"JMP_JSLT_X: Signed jump: if (-1 < -1) return 0",
+		.u.insns_int = {
+			BPF_ALU32_IMM(BPF_MOV, R0, 1),
+			BPF_LD_IMM64(R1, -1),
+			BPF_LD_IMM64(R2, -1),
+			BPF_JMP_REG(BPF_JSLT, R1, R2, 1),
+			BPF_EXIT_INSN(),
+			BPF_ALU32_IMM(BPF_MOV, R0, 0),
+			BPF_EXIT_INSN(),
+		},
+		INTERNAL,
+		{ },
+		{ { 0, 1 } },
+	},
 	/* BPF_JMP | BPF_JSGE | BPF_X */
 	{
 		"JMP_JSGE_X: Signed jump: if (-1 >= -2) return 1",
@@ -4783,6 +5010,37 @@ static struct bpf_test tests[] = {
 		{ },
 		{ { 0, 1 } },
 	},
+	/* BPF_JMP | BPF_JSLE | BPF_X */
+	{
+		"JMP_JSLE_X: Signed jump: if (-2 <= -1) return 1",
+		.u.insns_int = {
+			BPF_ALU32_IMM(BPF_MOV, R0, 0),
+			BPF_LD_IMM64(R1, -1),
+			BPF_LD_IMM64(R2, -2),
+			BPF_JMP_REG(BPF_JSLE, R2, R1, 1),
+			BPF_EXIT_INSN(),
+			BPF_ALU32_IMM(BPF_MOV, R0, 1),
+			BPF_EXIT_INSN(),
+		},
+		INTERNAL,
+		{ },
+		{ { 0, 1 } },
+	},
+	{
+		"JMP_JSLE_X: Signed jump: if (-1 <= -1) return 1",
+		.u.insns_int = {
+			BPF_ALU32_IMM(BPF_MOV, R0, 0),
+			BPF_LD_IMM64(R1, -1),
+			BPF_LD_IMM64(R2, -1),
+			BPF_JMP_REG(BPF_JSLE, R1, R2, 1),
+			BPF_EXIT_INSN(),
+			BPF_ALU32_IMM(BPF_MOV, R0, 1),
+			BPF_EXIT_INSN(),
+		},
+		INTERNAL,
+		{ },
+		{ { 0, 1 } },
+	},
 	/* BPF_JMP | BPF_JGT | BPF_X */
 	{
 		"JMP_JGT_X: if (3 > 2) return 1",
@@ -4814,6 +5072,37 @@ static struct bpf_test tests[] = {
 		{ },
 		{ { 0, 1 } },
 	},
+	/* BPF_JMP | BPF_JLT | BPF_X */
+	{
+		"JMP_JLT_X: if (2 < 3) return 1",
+		.u.insns_int = {
+			BPF_ALU32_IMM(BPF_MOV, R0, 0),
+			BPF_LD_IMM64(R1, 3),
+			BPF_LD_IMM64(R2, 2),
+			BPF_JMP_REG(BPF_JLT, R2, R1, 1),
+			BPF_EXIT_INSN(),
+			BPF_ALU32_IMM(BPF_MOV, R0, 1),
+			BPF_EXIT_INSN(),
+		},
+		INTERNAL,
+		{ },
+		{ { 0, 1 } },
+	},
+	{
+		"JMP_JLT_X: Unsigned jump: if (1 < -1) return 1",
+		.u.insns_int = {
+			BPF_ALU32_IMM(BPF_MOV, R0, 0),
+			BPF_LD_IMM64(R1, -1),
+			BPF_LD_IMM64(R2, 1),
+			BPF_JMP_REG(BPF_JLT, R2, R1, 1),
+			BPF_EXIT_INSN(),
+			BPF_ALU32_IMM(BPF_MOV, R0, 1),
+			BPF_EXIT_INSN(),
+		},
+		INTERNAL,
+		{ },
+		{ { 0, 1 } },
+	},
 	/* BPF_JMP | BPF_JGE | BPF_X */
 	{
 		"JMP_JGE_X: if (3 >= 2) return 1",
@@ -4845,6 +5134,37 @@ static struct bpf_test tests[] = {
 		{ },
 		{ { 0, 1 } },
 	},
+	/* BPF_JMP | BPF_JLE | BPF_X */
+	{
+		"JMP_JLE_X: if (2 <= 3) return 1",
+		.u.insns_int = {
+			BPF_ALU32_IMM(BPF_MOV, R0, 0),
+			BPF_LD_IMM64(R1, 3),
+			BPF_LD_IMM64(R2, 2),
+			BPF_JMP_REG(BPF_JLE, R2, R1, 1),
+			BPF_EXIT_INSN(),
+			BPF_ALU32_IMM(BPF_MOV, R0, 1),
+			BPF_EXIT_INSN(),
+		},
+		INTERNAL,
+		{ },
+		{ { 0, 1 } },
+	},
+	{
+		"JMP_JLE_X: if (3 <= 3) return 1",
+		.u.insns_int = {
+			BPF_ALU32_IMM(BPF_MOV, R0, 0),
+			BPF_LD_IMM64(R1, 3),
+			BPF_LD_IMM64(R2, 3),
+			BPF_JMP_REG(BPF_JLE, R1, R2, 1),
+			BPF_EXIT_INSN(),
+			BPF_ALU32_IMM(BPF_MOV, R0, 1),
+			BPF_EXIT_INSN(),
+		},
+		INTERNAL,
+		{ },
+		{ { 0, 1 } },
+	},
 	{
 		/* Mainly testing JIT + imm64 here. */
 		"JMP_JGE_X: ldimm64 test 1",
@@ -4890,6 +5210,50 @@ static struct bpf_test tests[] = {
 		{ },
 		{ { 0, 1 } },
 	},
+	{
+		"JMP_JLE_X: ldimm64 test 1",
+		.u.insns_int = {
+			BPF_ALU32_IMM(BPF_MOV, R0, 0),
+			BPF_LD_IMM64(R1, 3),
+			BPF_LD_IMM64(R2, 2),
+			BPF_JMP_REG(BPF_JLE, R2, R1, 2),
+			BPF_LD_IMM64(R0, 0xffffffffffffffffULL),
+			BPF_LD_IMM64(R0, 0xeeeeeeeeeeeeeeeeULL),
+			BPF_EXIT_INSN(),
+		},
+		INTERNAL,
+		{ },
+		{ { 0, 0xeeeeeeeeU } },
+	},
+	{
+		"JMP_JLE_X: ldimm64 test 2",
+		.u.insns_int = {
+			BPF_ALU32_IMM(BPF_MOV, R0, 0),
+			BPF_LD_IMM64(R1, 3),
+			BPF_LD_IMM64(R2, 2),
+			BPF_JMP_REG(BPF_JLE, R2, R1, 0),
+			BPF_LD_IMM64(R0, 0xffffffffffffffffULL),
+			BPF_EXIT_INSN(),
+		},
+		INTERNAL,
+		{ },
+		{ { 0, 0xffffffffU } },
+	},
+	{
+		"JMP_JLE_X: ldimm64 test 3",
+		.u.insns_int = {
+			BPF_ALU32_IMM(BPF_MOV, R0, 1),
+			BPF_LD_IMM64(R1, 3),
+			BPF_LD_IMM64(R2, 2),
+			BPF_JMP_REG(BPF_JLE, R2, R1, 4),
+			BPF_LD_IMM64(R0, 0xffffffffffffffffULL),
+			BPF_LD_IMM64(R0, 0xeeeeeeeeeeeeeeeeULL),
+			BPF_EXIT_INSN(),
+		},
+		INTERNAL,
+		{ },
+		{ { 0, 1 } },
+	},
 	/* BPF_JMP | BPF_JNE | BPF_X */
 	{
 		"JMP_JNE_X: if (3 != 2) return 1",
diff --git a/net/core/filter.c b/net/core/filter.c
index 78d00933dbe7..5afe3ac191ec 100644
--- a/net/core/filter.c
+++ b/net/core/filter.c
@@ -514,14 +514,27 @@ do_pass:
 				break;
 			}
 
-			/* Convert JEQ into JNE when 'jump_true' is next insn. */
-			if (fp->jt == 0 && BPF_OP(fp->code) == BPF_JEQ) {
-				insn->code = BPF_JMP | BPF_JNE | bpf_src;
+			/* Convert some jumps when 'jump_true' is next insn. */
+			if (fp->jt == 0) {
+				switch (BPF_OP(fp->code)) {
+				case BPF_JEQ:
+					insn->code = BPF_JMP | BPF_JNE | bpf_src;
+					break;
+				case BPF_JGT:
+					insn->code = BPF_JMP | BPF_JLE | bpf_src;
+					break;
+				case BPF_JGE:
+					insn->code = BPF_JMP | BPF_JLT | bpf_src;
+					break;
+				default:
+					goto jmp_rest;
+				}
+
 				target = i + fp->jf + 1;
 				BPF_EMIT_JMP;
 				break;
 			}
-
+jmp_rest:
 			/* Other jumps are mapped into two insns: Jxx and JA. */
 			target = i + fp->jt + 1;
 			insn->code = BPF_JMP | BPF_OP(fp->code) | bpf_src;
diff --git a/tools/include/uapi/linux/bpf.h b/tools/include/uapi/linux/bpf.h
index 8d9bfcca3fe4..bf3b2e230455 100644
--- a/tools/include/uapi/linux/bpf.h
+++ b/tools/include/uapi/linux/bpf.h
@@ -30,9 +30,14 @@
 #define BPF_FROM_LE	BPF_TO_LE
 #define BPF_FROM_BE	BPF_TO_BE
 
+/* jmp encodings */
 #define BPF_JNE		0x50	/* jump != */
+#define BPF_JLT		0xa0	/* LT is unsigned, '<' */
+#define BPF_JLE		0xb0	/* LE is unsigned, '<=' */
 #define BPF_JSGT	0x60	/* SGT is signed '>', GT in x86 */
 #define BPF_JSGE	0x70	/* SGE is signed '>=', GE in x86 */
+#define BPF_JSLT	0xc0	/* SLT is signed, '<' */
+#define BPF_JSLE	0xd0	/* SLE is signed, '<=' */
 #define BPF_CALL	0x80	/* function call */
 #define BPF_EXIT	0x90	/* function return */
 
-- 
cgit v1.2.3-71-gd317


From 077fbac405bfc6d41419ad6c1725804ad4e9887c Mon Sep 17 00:00:00 2001
From: Lorenzo Colitti <lorenzo@google.com>
Date: Fri, 11 Aug 2017 02:11:33 +0900
Subject: net: xfrm: support setting an output mark.

On systems that use mark-based routing it may be necessary for
routing lookups to use marks in order for packets to be routed
correctly. An example of such a system is Android, which uses
socket marks to route packets via different networks.

Currently, routing lookups in tunnel mode always use a mark of
zero, making routing incorrect on such systems.

This patch adds a new output_mark element to the xfrm state and
a corresponding XFRMA_OUTPUT_MARK netlink attribute. The output
mark differs from the existing xfrm mark in two ways:

1. The xfrm mark is used to match xfrm policies and states, while
   the xfrm output mark is used to set the mark (and influence
   the routing) of the packets emitted by those states.
2. The existing mark is constrained to be a subset of the bits of
   the originating socket or transformed packet, but the output
   mark is arbitrary and depends only on the state.

The use of a separate mark provides additional flexibility. For
example:

- A packet subject to two transforms (e.g., transport mode inside
  tunnel mode) can have two different output marks applied to it,
  one for the transport mode SA and one for the tunnel mode SA.
- On a system where socket marks determine routing, the packets
  emitted by an IPsec tunnel can be routed based on a mark that
  is determined by the tunnel, not by the marks of the
  unencrypted packets.
- Support for setting the output marks can be introduced without
  breaking any existing setups that employ both mark-based
  routing and xfrm tunnel mode. Simply changing the code to use
  the xfrm mark for routing output packets could xfrm mark could
  change behaviour in a way that breaks these setups.

If the output mark is unspecified or set to zero, the mark is not
set or changed.

Tested: make allyesconfig; make -j64
Tested: https://android-review.googlesource.com/452776
Signed-off-by: Lorenzo Colitti <lorenzo@google.com>
Signed-off-by: Steffen Klassert <steffen.klassert@secunet.com>
---
 include/net/xfrm.h        |  9 ++++++---
 include/uapi/linux/xfrm.h |  1 +
 net/ipv4/xfrm4_policy.c   | 14 +++++++++-----
 net/ipv6/xfrm6_policy.c   |  9 ++++++---
 net/xfrm/xfrm_device.c    |  3 ++-
 net/xfrm/xfrm_output.c    |  3 +++
 net/xfrm/xfrm_policy.c    | 17 +++++++++--------
 net/xfrm/xfrm_user.c      | 11 +++++++++++
 8 files changed, 47 insertions(+), 20 deletions(-)

(limited to 'include/uapi/linux')

diff --git a/include/net/xfrm.h b/include/net/xfrm.h
index 18d7de34a5c3..9c7b70cce6d6 100644
--- a/include/net/xfrm.h
+++ b/include/net/xfrm.h
@@ -165,6 +165,7 @@ struct xfrm_state {
 		int		header_len;
 		int		trailer_len;
 		u32		extra_flags;
+		u32		output_mark;
 	} props;
 
 	struct xfrm_lifetime_cfg lft;
@@ -298,10 +299,12 @@ struct xfrm_policy_afinfo {
 	struct dst_entry	*(*dst_lookup)(struct net *net,
 					       int tos, int oif,
 					       const xfrm_address_t *saddr,
-					       const xfrm_address_t *daddr);
+					       const xfrm_address_t *daddr,
+					       u32 mark);
 	int			(*get_saddr)(struct net *net, int oif,
 					     xfrm_address_t *saddr,
-					     xfrm_address_t *daddr);
+					     xfrm_address_t *daddr,
+					     u32 mark);
 	void			(*decode_session)(struct sk_buff *skb,
 						  struct flowi *fl,
 						  int reverse);
@@ -1640,7 +1643,7 @@ static inline int xfrm4_udp_encap_rcv(struct sock *sk, struct sk_buff *skb)
 struct dst_entry *__xfrm_dst_lookup(struct net *net, int tos, int oif,
 				    const xfrm_address_t *saddr,
 				    const xfrm_address_t *daddr,
-				    int family);
+				    int family, u32 mark);
 
 struct xfrm_policy *xfrm_policy_alloc(struct net *net, gfp_t gfp);
 
diff --git a/include/uapi/linux/xfrm.h b/include/uapi/linux/xfrm.h
index 2b384ff09fa0..5fe7370a2bef 100644
--- a/include/uapi/linux/xfrm.h
+++ b/include/uapi/linux/xfrm.h
@@ -304,6 +304,7 @@ enum xfrm_attr_type_t {
 	XFRMA_ADDRESS_FILTER,	/* struct xfrm_address_filter */
 	XFRMA_PAD,
 	XFRMA_OFFLOAD_DEV,	/* struct xfrm_state_offload */
+	XFRMA_OUTPUT_MARK,	/* __u32 */
 	__XFRMA_MAX
 
 #define XFRMA_MAX (__XFRMA_MAX - 1)
diff --git a/net/ipv4/xfrm4_policy.c b/net/ipv4/xfrm4_policy.c
index 4aefb149fe0a..d7bf0b041885 100644
--- a/net/ipv4/xfrm4_policy.c
+++ b/net/ipv4/xfrm4_policy.c
@@ -20,7 +20,8 @@
 static struct dst_entry *__xfrm4_dst_lookup(struct net *net, struct flowi4 *fl4,
 					    int tos, int oif,
 					    const xfrm_address_t *saddr,
-					    const xfrm_address_t *daddr)
+					    const xfrm_address_t *daddr,
+					    u32 mark)
 {
 	struct rtable *rt;
 
@@ -28,6 +29,7 @@ static struct dst_entry *__xfrm4_dst_lookup(struct net *net, struct flowi4 *fl4,
 	fl4->daddr = daddr->a4;
 	fl4->flowi4_tos = tos;
 	fl4->flowi4_oif = l3mdev_master_ifindex_by_index(net, oif);
+	fl4->flowi4_mark = mark;
 	if (saddr)
 		fl4->saddr = saddr->a4;
 
@@ -42,20 +44,22 @@ static struct dst_entry *__xfrm4_dst_lookup(struct net *net, struct flowi4 *fl4,
 
 static struct dst_entry *xfrm4_dst_lookup(struct net *net, int tos, int oif,
 					  const xfrm_address_t *saddr,
-					  const xfrm_address_t *daddr)
+					  const xfrm_address_t *daddr,
+					  u32 mark)
 {
 	struct flowi4 fl4;
 
-	return __xfrm4_dst_lookup(net, &fl4, tos, oif, saddr, daddr);
+	return __xfrm4_dst_lookup(net, &fl4, tos, oif, saddr, daddr, mark);
 }
 
 static int xfrm4_get_saddr(struct net *net, int oif,
-			   xfrm_address_t *saddr, xfrm_address_t *daddr)
+			   xfrm_address_t *saddr, xfrm_address_t *daddr,
+			   u32 mark)
 {
 	struct dst_entry *dst;
 	struct flowi4 fl4;
 
-	dst = __xfrm4_dst_lookup(net, &fl4, 0, oif, NULL, daddr);
+	dst = __xfrm4_dst_lookup(net, &fl4, 0, oif, NULL, daddr, mark);
 	if (IS_ERR(dst))
 		return -EHOSTUNREACH;
 
diff --git a/net/ipv6/xfrm6_policy.c b/net/ipv6/xfrm6_policy.c
index f44b25a48478..11d1314ab6c5 100644
--- a/net/ipv6/xfrm6_policy.c
+++ b/net/ipv6/xfrm6_policy.c
@@ -27,7 +27,8 @@
 
 static struct dst_entry *xfrm6_dst_lookup(struct net *net, int tos, int oif,
 					  const xfrm_address_t *saddr,
-					  const xfrm_address_t *daddr)
+					  const xfrm_address_t *daddr,
+					  u32 mark)
 {
 	struct flowi6 fl6;
 	struct dst_entry *dst;
@@ -36,6 +37,7 @@ static struct dst_entry *xfrm6_dst_lookup(struct net *net, int tos, int oif,
 	memset(&fl6, 0, sizeof(fl6));
 	fl6.flowi6_oif = l3mdev_master_ifindex_by_index(net, oif);
 	fl6.flowi6_flags = FLOWI_FLAG_SKIP_NH_OIF;
+	fl6.flowi6_mark = mark;
 	memcpy(&fl6.daddr, daddr, sizeof(fl6.daddr));
 	if (saddr)
 		memcpy(&fl6.saddr, saddr, sizeof(fl6.saddr));
@@ -52,12 +54,13 @@ static struct dst_entry *xfrm6_dst_lookup(struct net *net, int tos, int oif,
 }
 
 static int xfrm6_get_saddr(struct net *net, int oif,
-			   xfrm_address_t *saddr, xfrm_address_t *daddr)
+			   xfrm_address_t *saddr, xfrm_address_t *daddr,
+			   u32 mark)
 {
 	struct dst_entry *dst;
 	struct net_device *dev;
 
-	dst = xfrm6_dst_lookup(net, 0, oif, NULL, daddr);
+	dst = xfrm6_dst_lookup(net, 0, oif, NULL, daddr, mark);
 	if (IS_ERR(dst))
 		return -EHOSTUNREACH;
 
diff --git a/net/xfrm/xfrm_device.c b/net/xfrm/xfrm_device.c
index 1904127f5fb8..acf00104ef31 100644
--- a/net/xfrm/xfrm_device.c
+++ b/net/xfrm/xfrm_device.c
@@ -79,7 +79,8 @@ int xfrm_dev_state_add(struct net *net, struct xfrm_state *x,
 			daddr = &x->props.saddr;
 		}
 
-		dst = __xfrm_dst_lookup(net, 0, 0, saddr, daddr, x->props.family);
+		dst = __xfrm_dst_lookup(net, 0, 0, saddr, daddr,
+					x->props.family, x->props.output_mark);
 		if (IS_ERR(dst))
 			return 0;
 
diff --git a/net/xfrm/xfrm_output.c b/net/xfrm/xfrm_output.c
index 8c0b6722aaa8..31a2e6d34dba 100644
--- a/net/xfrm/xfrm_output.c
+++ b/net/xfrm/xfrm_output.c
@@ -66,6 +66,9 @@ static int xfrm_output_one(struct sk_buff *skb, int err)
 			goto error_nolock;
 		}
 
+		if (x->props.output_mark)
+			skb->mark = x->props.output_mark;
+
 		err = x->outer_mode->output(x, skb);
 		if (err) {
 			XFRM_INC_STATS(net, LINUX_MIB_XFRMOUTSTATEMODEERROR);
diff --git a/net/xfrm/xfrm_policy.c b/net/xfrm/xfrm_policy.c
index 06c3bf7ab86b..1de52f36caf5 100644
--- a/net/xfrm/xfrm_policy.c
+++ b/net/xfrm/xfrm_policy.c
@@ -122,7 +122,7 @@ static const struct xfrm_policy_afinfo *xfrm_policy_get_afinfo(unsigned short fa
 struct dst_entry *__xfrm_dst_lookup(struct net *net, int tos, int oif,
 				    const xfrm_address_t *saddr,
 				    const xfrm_address_t *daddr,
-				    int family)
+				    int family, u32 mark)
 {
 	const struct xfrm_policy_afinfo *afinfo;
 	struct dst_entry *dst;
@@ -131,7 +131,7 @@ struct dst_entry *__xfrm_dst_lookup(struct net *net, int tos, int oif,
 	if (unlikely(afinfo == NULL))
 		return ERR_PTR(-EAFNOSUPPORT);
 
-	dst = afinfo->dst_lookup(net, tos, oif, saddr, daddr);
+	dst = afinfo->dst_lookup(net, tos, oif, saddr, daddr, mark);
 
 	rcu_read_unlock();
 
@@ -143,7 +143,7 @@ static inline struct dst_entry *xfrm_dst_lookup(struct xfrm_state *x,
 						int tos, int oif,
 						xfrm_address_t *prev_saddr,
 						xfrm_address_t *prev_daddr,
-						int family)
+						int family, u32 mark)
 {
 	struct net *net = xs_net(x);
 	xfrm_address_t *saddr = &x->props.saddr;
@@ -159,7 +159,7 @@ static inline struct dst_entry *xfrm_dst_lookup(struct xfrm_state *x,
 		daddr = x->coaddr;
 	}
 
-	dst = __xfrm_dst_lookup(net, tos, oif, saddr, daddr, family);
+	dst = __xfrm_dst_lookup(net, tos, oif, saddr, daddr, family, mark);
 
 	if (!IS_ERR(dst)) {
 		if (prev_saddr != saddr)
@@ -1340,14 +1340,14 @@ int __xfrm_sk_clone_policy(struct sock *sk, const struct sock *osk)
 
 static int
 xfrm_get_saddr(struct net *net, int oif, xfrm_address_t *local,
-	       xfrm_address_t *remote, unsigned short family)
+	       xfrm_address_t *remote, unsigned short family, u32 mark)
 {
 	int err;
 	const struct xfrm_policy_afinfo *afinfo = xfrm_policy_get_afinfo(family);
 
 	if (unlikely(afinfo == NULL))
 		return -EINVAL;
-	err = afinfo->get_saddr(net, oif, local, remote);
+	err = afinfo->get_saddr(net, oif, local, remote, mark);
 	rcu_read_unlock();
 	return err;
 }
@@ -1378,7 +1378,7 @@ xfrm_tmpl_resolve_one(struct xfrm_policy *policy, const struct flowi *fl,
 			if (xfrm_addr_any(local, tmpl->encap_family)) {
 				error = xfrm_get_saddr(net, fl->flowi_oif,
 						       &tmp, remote,
-						       tmpl->encap_family);
+						       tmpl->encap_family, 0);
 				if (error)
 					goto fail;
 				local = &tmp;
@@ -1598,7 +1598,8 @@ static struct dst_entry *xfrm_bundle_create(struct xfrm_policy *policy,
 		if (xfrm[i]->props.mode != XFRM_MODE_TRANSPORT) {
 			family = xfrm[i]->props.family;
 			dst = xfrm_dst_lookup(xfrm[i], tos, fl->flowi_oif,
-					      &saddr, &daddr, family);
+					      &saddr, &daddr, family,
+					      xfrm[i]->props.output_mark);
 			err = PTR_ERR(dst);
 			if (IS_ERR(dst))
 				goto put_states;
diff --git a/net/xfrm/xfrm_user.c b/net/xfrm/xfrm_user.c
index ffe8d5ef09eb..cc3268d814b4 100644
--- a/net/xfrm/xfrm_user.c
+++ b/net/xfrm/xfrm_user.c
@@ -584,6 +584,9 @@ static struct xfrm_state *xfrm_state_construct(struct net *net,
 
 	xfrm_mark_get(attrs, &x->mark);
 
+	if (attrs[XFRMA_OUTPUT_MARK])
+		x->props.output_mark = nla_get_u32(attrs[XFRMA_OUTPUT_MARK]);
+
 	err = __xfrm_init_state(x, false, attrs[XFRMA_OFFLOAD_DEV]);
 	if (err)
 		goto error;
@@ -899,6 +902,11 @@ static int copy_to_user_state_extra(struct xfrm_state *x,
 		goto out;
 	if (x->security)
 		ret = copy_sec_ctx(x->security, skb);
+	if (x->props.output_mark) {
+		ret = nla_put_u32(skb, XFRMA_OUTPUT_MARK, x->props.output_mark);
+		if (ret)
+			goto out;
+	}
 out:
 	return ret;
 }
@@ -2454,6 +2462,7 @@ static const struct nla_policy xfrma_policy[XFRMA_MAX+1] = {
 	[XFRMA_PROTO]		= { .type = NLA_U8 },
 	[XFRMA_ADDRESS_FILTER]	= { .len = sizeof(struct xfrm_address_filter) },
 	[XFRMA_OFFLOAD_DEV]	= { .len = sizeof(struct xfrm_user_offload) },
+	[XFRMA_OUTPUT_MARK]	= { .len = NLA_U32 },
 };
 
 static const struct nla_policy xfrma_spd_policy[XFRMA_SPD_MAX+1] = {
@@ -2673,6 +2682,8 @@ static inline size_t xfrm_sa_len(struct xfrm_state *x)
 		l += nla_total_size(sizeof(x->props.extra_flags));
 	if (x->xso.dev)
 		 l += nla_total_size(sizeof(x->xso));
+	if (x->props.output_mark)
+		l += nla_total_size(sizeof(x->props.output_mark));
 
 	/* Must count x->lastused as it may become non-zero behind our back. */
 	l += nla_total_size_64bit(sizeof(u64));
-- 
cgit v1.2.3-71-gd317


From 34fc75bfc616f1c1fbab56508c3f48f4b97c97ea Mon Sep 17 00:00:00 2001
From: Florian Weimer <fweimer@redhat.com>
Date: Fri, 11 Aug 2017 16:24:15 +0200
Subject: uapi/linux/quota.h: Do not include linux/errno.h

linux/errno.h is very sensitive to coordination with libc headers.
Nothing in linux/quota.h needs it, so this change allows using
this header in more contexts.

Signed-off-by: Florian Weimer <fweimer@redhat.com>
Signed-off-by: Jan Kara <jack@suse.cz>
---
 include/uapi/linux/quota.h | 1 -
 1 file changed, 1 deletion(-)

(limited to 'include/uapi/linux')

diff --git a/include/uapi/linux/quota.h b/include/uapi/linux/quota.h
index 4d2489ef6f10..f17c9636a859 100644
--- a/include/uapi/linux/quota.h
+++ b/include/uapi/linux/quota.h
@@ -33,7 +33,6 @@
 #ifndef _UAPI_LINUX_QUOTA_
 #define _UAPI_LINUX_QUOTA_
 
-#include <linux/errno.h>
 #include <linux/types.h>
 
 #define __DQUOT_VERSION__	"dquot_6.6.0"
-- 
cgit v1.2.3-71-gd317


From d612b1fd8010d0d67b5287fe146b8b55bcbb8655 Mon Sep 17 00:00:00 2001
From: Tyler Hicks <tyhicks@canonical.com>
Date: Fri, 11 Aug 2017 04:33:53 +0000
Subject: seccomp: Operation for checking if an action is available

Userspace code that needs to check if the kernel supports a given action
may not be able to use the /proc/sys/kernel/seccomp/actions_avail
sysctl. The process may be running in a sandbox and, therefore,
sufficient filesystem access may not be available. This patch adds an
operation to the seccomp(2) syscall that allows userspace code to ask
the kernel if a given action is available.

If the action is supported by the kernel, 0 is returned. If the action
is not supported by the kernel, -1 is returned with errno set to
-EOPNOTSUPP. If this check is attempted on a kernel that doesn't support
this new operation, -1 is returned with errno set to -EINVAL meaning
that userspace code will have the ability to differentiate between the
two error cases.

Signed-off-by: Tyler Hicks <tyhicks@canonical.com>
Suggested-by: Andy Lutomirski <luto@amacapital.net>
Signed-off-by: Kees Cook <keescook@chromium.org>
---
 include/uapi/linux/seccomp.h                  |  5 ++--
 kernel/seccomp.c                              | 26 +++++++++++++++++++
 tools/testing/selftests/seccomp/seccomp_bpf.c | 36 +++++++++++++++++++++++++++
 3 files changed, 65 insertions(+), 2 deletions(-)

(limited to 'include/uapi/linux')

diff --git a/include/uapi/linux/seccomp.h b/include/uapi/linux/seccomp.h
index 0f238a43ff1e..aaad61cc46bc 100644
--- a/include/uapi/linux/seccomp.h
+++ b/include/uapi/linux/seccomp.h
@@ -11,8 +11,9 @@
 #define SECCOMP_MODE_FILTER	2 /* uses user-supplied filter. */
 
 /* Valid operations for seccomp syscall. */
-#define SECCOMP_SET_MODE_STRICT	0
-#define SECCOMP_SET_MODE_FILTER	1
+#define SECCOMP_SET_MODE_STRICT		0
+#define SECCOMP_SET_MODE_FILTER		1
+#define SECCOMP_GET_ACTION_AVAIL	2
 
 /* Valid flags for SECCOMP_SET_MODE_FILTER */
 #define SECCOMP_FILTER_FLAG_TSYNC	1
diff --git a/kernel/seccomp.c b/kernel/seccomp.c
index 5f19f41e4e50..7a6089f66fed 100644
--- a/kernel/seccomp.c
+++ b/kernel/seccomp.c
@@ -808,6 +808,27 @@ static inline long seccomp_set_mode_filter(unsigned int flags,
 }
 #endif
 
+static long seccomp_get_action_avail(const char __user *uaction)
+{
+	u32 action;
+
+	if (copy_from_user(&action, uaction, sizeof(action)))
+		return -EFAULT;
+
+	switch (action) {
+	case SECCOMP_RET_KILL:
+	case SECCOMP_RET_TRAP:
+	case SECCOMP_RET_ERRNO:
+	case SECCOMP_RET_TRACE:
+	case SECCOMP_RET_ALLOW:
+		break;
+	default:
+		return -EOPNOTSUPP;
+	}
+
+	return 0;
+}
+
 /* Common entry point for both prctl and syscall. */
 static long do_seccomp(unsigned int op, unsigned int flags,
 		       const char __user *uargs)
@@ -819,6 +840,11 @@ static long do_seccomp(unsigned int op, unsigned int flags,
 		return seccomp_set_mode_strict();
 	case SECCOMP_SET_MODE_FILTER:
 		return seccomp_set_mode_filter(flags, uargs);
+	case SECCOMP_GET_ACTION_AVAIL:
+		if (flags != 0)
+			return -EINVAL;
+
+		return seccomp_get_action_avail(uargs);
 	default:
 		return -EINVAL;
 	}
diff --git a/tools/testing/selftests/seccomp/seccomp_bpf.c b/tools/testing/selftests/seccomp/seccomp_bpf.c
index 2fb49d99588d..1f2888f6678b 100644
--- a/tools/testing/selftests/seccomp/seccomp_bpf.c
+++ b/tools/testing/selftests/seccomp/seccomp_bpf.c
@@ -1731,6 +1731,10 @@ TEST_F_SIGNAL(TRACE_syscall, kill_after_ptrace, SIGSYS)
 #define SECCOMP_SET_MODE_FILTER 1
 #endif
 
+#ifndef SECCOMP_GET_ACTION_AVAIL
+#define SECCOMP_GET_ACTION_AVAIL 2
+#endif
+
 #ifndef SECCOMP_FILTER_FLAG_TSYNC
 #define SECCOMP_FILTER_FLAG_TSYNC 1
 #endif
@@ -2469,6 +2473,38 @@ TEST(syscall_restart)
 		_metadata->passed = 0;
 }
 
+TEST(get_action_avail)
+{
+	__u32 actions[] = { SECCOMP_RET_KILL,  SECCOMP_RET_TRAP,
+			    SECCOMP_RET_ERRNO, SECCOMP_RET_TRACE,
+			    SECCOMP_RET_ALLOW };
+	__u32 unknown_action = 0x10000000U;
+	int i;
+	long ret;
+
+	ret = seccomp(SECCOMP_GET_ACTION_AVAIL, 0, &actions[0]);
+	ASSERT_NE(ENOSYS, errno) {
+		TH_LOG("Kernel does not support seccomp syscall!");
+	}
+	ASSERT_NE(EINVAL, errno) {
+		TH_LOG("Kernel does not support SECCOMP_GET_ACTION_AVAIL operation!");
+	}
+	EXPECT_EQ(ret, 0);
+
+	for (i = 0; i < ARRAY_SIZE(actions); i++) {
+		ret = seccomp(SECCOMP_GET_ACTION_AVAIL, 0, &actions[i]);
+		EXPECT_EQ(ret, 0) {
+			TH_LOG("Expected action (0x%X) not available!",
+			       actions[i]);
+		}
+	}
+
+	/* Check that an unknown action is handled properly (EOPNOTSUPP) */
+	ret = seccomp(SECCOMP_GET_ACTION_AVAIL, 0, &unknown_action);
+	EXPECT_EQ(ret, -1);
+	EXPECT_EQ(errno, EOPNOTSUPP);
+}
+
 /*
  * TODO:
  * - add microbenchmarks
-- 
cgit v1.2.3-71-gd317


From e66a39977985b1e69e17c4042cb290768eca9b02 Mon Sep 17 00:00:00 2001
From: Tyler Hicks <tyhicks@canonical.com>
Date: Fri, 11 Aug 2017 04:33:56 +0000
Subject: seccomp: Filter flag to log all actions except SECCOMP_RET_ALLOW

Add a new filter flag, SECCOMP_FILTER_FLAG_LOG, that enables logging for
all actions except for SECCOMP_RET_ALLOW for the given filter.

SECCOMP_RET_KILL actions are always logged, when "kill" is in the
actions_logged sysctl, and SECCOMP_RET_ALLOW actions are never logged,
regardless of this flag.

This flag can be used to create noisy filters that result in all
non-allowed actions to be logged. A process may have one noisy filter,
which is loaded with this flag, as well as a quiet filter that's not
loaded with this flag. This allows for the actions in a set of filters
to be selectively conveyed to the admin.

Since a system could have a large number of allocated seccomp_filter
structs, struct packing was taken in consideration. On 64 bit x86, the
new log member takes up one byte of an existing four byte hole in the
struct. On 32 bit x86, the new log member creates a new four byte hole
(unavoidable) and consumes one of those bytes.

Unfortunately, the tests added for SECCOMP_FILTER_FLAG_LOG are not
capable of inspecting the audit log to verify that the actions taken in
the filter were logged.

With this patch, the logic for deciding if an action will be logged is:

if action == RET_ALLOW:
  do not log
else if action == RET_KILL && RET_KILL in actions_logged:
  log
else if filter-requests-logging && action in actions_logged:
  log
else if audit_enabled && process-is-being-audited:
  log
else:
  do not log

Signed-off-by: Tyler Hicks <tyhicks@canonical.com>
Signed-off-by: Kees Cook <keescook@chromium.org>
---
 include/linux/seccomp.h                       |  3 +-
 include/uapi/linux/seccomp.h                  |  1 +
 kernel/seccomp.c                              | 26 +++++++---
 tools/testing/selftests/seccomp/seccomp_bpf.c | 69 ++++++++++++++++++++++++++-
 4 files changed, 91 insertions(+), 8 deletions(-)

(limited to 'include/uapi/linux')

diff --git a/include/linux/seccomp.h b/include/linux/seccomp.h
index ecc296c137cd..c8bef436b61d 100644
--- a/include/linux/seccomp.h
+++ b/include/linux/seccomp.h
@@ -3,7 +3,8 @@
 
 #include <uapi/linux/seccomp.h>
 
-#define SECCOMP_FILTER_FLAG_MASK	(SECCOMP_FILTER_FLAG_TSYNC)
+#define SECCOMP_FILTER_FLAG_MASK	(SECCOMP_FILTER_FLAG_TSYNC | \
+					 SECCOMP_FILTER_FLAG_LOG)
 
 #ifdef CONFIG_SECCOMP
 
diff --git a/include/uapi/linux/seccomp.h b/include/uapi/linux/seccomp.h
index aaad61cc46bc..19a611d0712e 100644
--- a/include/uapi/linux/seccomp.h
+++ b/include/uapi/linux/seccomp.h
@@ -17,6 +17,7 @@
 
 /* Valid flags for SECCOMP_SET_MODE_FILTER */
 #define SECCOMP_FILTER_FLAG_TSYNC	1
+#define SECCOMP_FILTER_FLAG_LOG		2
 
 /*
  * All BPF programs must return a 32-bit value.
diff --git a/kernel/seccomp.c b/kernel/seccomp.c
index 54357e361aea..ed9fde418fc4 100644
--- a/kernel/seccomp.c
+++ b/kernel/seccomp.c
@@ -44,6 +44,7 @@
  *         get/put helpers should be used when accessing an instance
  *         outside of a lifetime-guarded section.  In general, this
  *         is only needed for handling filters shared across tasks.
+ * @log: true if all actions except for SECCOMP_RET_ALLOW should be logged
  * @prev: points to a previously installed, or inherited, filter
  * @prog: the BPF program to evaluate
  *
@@ -59,6 +60,7 @@
  */
 struct seccomp_filter {
 	refcount_t usage;
+	bool log;
 	struct seccomp_filter *prev;
 	struct bpf_prog *prog;
 };
@@ -452,6 +454,10 @@ static long seccomp_attach_filter(unsigned int flags,
 			return ret;
 	}
 
+	/* Set log flag, if present. */
+	if (flags & SECCOMP_FILTER_FLAG_LOG)
+		filter->log = true;
+
 	/*
 	 * If there is an existing filter, make it the prev and don't drop its
 	 * task reference.
@@ -532,15 +538,22 @@ static void seccomp_send_sigsys(int syscall, int reason)
 static u32 seccomp_actions_logged = SECCOMP_LOG_KILL  | SECCOMP_LOG_TRAP  |
 				    SECCOMP_LOG_ERRNO | SECCOMP_LOG_TRACE;
 
-static inline void seccomp_log(unsigned long syscall, long signr, u32 action)
+static inline void seccomp_log(unsigned long syscall, long signr, u32 action,
+			       bool requested)
 {
 	bool log = false;
 
 	switch (action) {
 	case SECCOMP_RET_ALLOW:
+		break;
 	case SECCOMP_RET_TRAP:
+		log = requested && seccomp_actions_logged & SECCOMP_LOG_TRAP;
+		break;
 	case SECCOMP_RET_ERRNO:
+		log = requested && seccomp_actions_logged & SECCOMP_LOG_ERRNO;
+		break;
 	case SECCOMP_RET_TRACE:
+		log = requested && seccomp_actions_logged & SECCOMP_LOG_TRACE;
 		break;
 	case SECCOMP_RET_KILL:
 	default:
@@ -548,8 +561,9 @@ static inline void seccomp_log(unsigned long syscall, long signr, u32 action)
 	}
 
 	/*
-	 * Force an audit message to be emitted when the action is RET_KILL and
-	 * the action is allowed to be logged by the admin.
+	 * Force an audit message to be emitted when the action is RET_KILL or
+	 * the FILTER_FLAG_LOG bit was set and the action is allowed to be
+	 * logged by the admin.
 	 */
 	if (log)
 		return __audit_seccomp(syscall, signr, action);
@@ -586,7 +600,7 @@ static void __secure_computing_strict(int this_syscall)
 #ifdef SECCOMP_DEBUG
 	dump_stack();
 #endif
-	seccomp_log(this_syscall, SIGKILL, SECCOMP_RET_KILL);
+	seccomp_log(this_syscall, SIGKILL, SECCOMP_RET_KILL, true);
 	do_exit(SIGKILL);
 }
 
@@ -695,7 +709,7 @@ static int __seccomp_filter(int this_syscall, const struct seccomp_data *sd,
 
 	case SECCOMP_RET_KILL:
 	default:
-		seccomp_log(this_syscall, SIGSYS, action);
+		seccomp_log(this_syscall, SIGSYS, action, true);
 		/* Dump core only if this is the last remaining thread. */
 		if (get_nr_threads(current) == 1) {
 			siginfo_t info;
@@ -712,7 +726,7 @@ static int __seccomp_filter(int this_syscall, const struct seccomp_data *sd,
 	unreachable();
 
 skip:
-	seccomp_log(this_syscall, 0, action);
+	seccomp_log(this_syscall, 0, action, match ? match->log : false);
 	return -1;
 }
 #else
diff --git a/tools/testing/selftests/seccomp/seccomp_bpf.c b/tools/testing/selftests/seccomp/seccomp_bpf.c
index abf708e09892..1c8c22ce7740 100644
--- a/tools/testing/selftests/seccomp/seccomp_bpf.c
+++ b/tools/testing/selftests/seccomp/seccomp_bpf.c
@@ -1739,6 +1739,10 @@ TEST_F_SIGNAL(TRACE_syscall, kill_after_ptrace, SIGSYS)
 #define SECCOMP_FILTER_FLAG_TSYNC 1
 #endif
 
+#ifndef SECCOMP_FILTER_FLAG_LOG
+#define SECCOMP_FILTER_FLAG_LOG 2
+#endif
+
 #ifndef seccomp
 int seccomp(unsigned int op, unsigned int flags, void *args)
 {
@@ -1844,7 +1848,8 @@ TEST(seccomp_syscall_mode_lock)
  */
 TEST(detect_seccomp_filter_flags)
 {
-	unsigned int flags[] = { SECCOMP_FILTER_FLAG_TSYNC };
+	unsigned int flags[] = { SECCOMP_FILTER_FLAG_TSYNC,
+				 SECCOMP_FILTER_FLAG_LOG };
 	unsigned int flag, all_flags;
 	int i;
 	long ret;
@@ -2533,6 +2538,67 @@ TEST(syscall_restart)
 		_metadata->passed = 0;
 }
 
+TEST_SIGNAL(filter_flag_log, SIGSYS)
+{
+	struct sock_filter allow_filter[] = {
+		BPF_STMT(BPF_RET|BPF_K, SECCOMP_RET_ALLOW),
+	};
+	struct sock_filter kill_filter[] = {
+		BPF_STMT(BPF_LD|BPF_W|BPF_ABS,
+			offsetof(struct seccomp_data, nr)),
+		BPF_JUMP(BPF_JMP|BPF_JEQ|BPF_K, __NR_getpid, 0, 1),
+		BPF_STMT(BPF_RET|BPF_K, SECCOMP_RET_KILL),
+		BPF_STMT(BPF_RET|BPF_K, SECCOMP_RET_ALLOW),
+	};
+	struct sock_fprog allow_prog = {
+		.len = (unsigned short)ARRAY_SIZE(allow_filter),
+		.filter = allow_filter,
+	};
+	struct sock_fprog kill_prog = {
+		.len = (unsigned short)ARRAY_SIZE(kill_filter),
+		.filter = kill_filter,
+	};
+	long ret;
+	pid_t parent = getppid();
+
+	ret = prctl(PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0);
+	ASSERT_EQ(0, ret);
+
+	/* Verify that the FILTER_FLAG_LOG flag isn't accepted in strict mode */
+	ret = seccomp(SECCOMP_SET_MODE_STRICT, SECCOMP_FILTER_FLAG_LOG,
+		      &allow_prog);
+	ASSERT_NE(ENOSYS, errno) {
+		TH_LOG("Kernel does not support seccomp syscall!");
+	}
+	EXPECT_NE(0, ret) {
+		TH_LOG("Kernel accepted FILTER_FLAG_LOG flag in strict mode!");
+	}
+	EXPECT_EQ(EINVAL, errno) {
+		TH_LOG("Kernel returned unexpected errno for FILTER_FLAG_LOG flag in strict mode!");
+	}
+
+	/* Verify that a simple, permissive filter can be added with no flags */
+	ret = seccomp(SECCOMP_SET_MODE_FILTER, 0, &allow_prog);
+	EXPECT_EQ(0, ret);
+
+	/* See if the same filter can be added with the FILTER_FLAG_LOG flag */
+	ret = seccomp(SECCOMP_SET_MODE_FILTER, SECCOMP_FILTER_FLAG_LOG,
+		      &allow_prog);
+	ASSERT_NE(EINVAL, errno) {
+		TH_LOG("Kernel does not support the FILTER_FLAG_LOG flag!");
+	}
+	EXPECT_EQ(0, ret);
+
+	/* Ensure that the kill filter works with the FILTER_FLAG_LOG flag */
+	ret = seccomp(SECCOMP_SET_MODE_FILTER, SECCOMP_FILTER_FLAG_LOG,
+		      &kill_prog);
+	EXPECT_EQ(0, ret);
+
+	EXPECT_EQ(parent, syscall(__NR_getppid));
+	/* getpid() should never return. */
+	EXPECT_EQ(0, syscall(__NR_getpid));
+}
+
 TEST(get_action_avail)
 {
 	__u32 actions[] = { SECCOMP_RET_KILL,  SECCOMP_RET_TRAP,
@@ -2573,6 +2639,7 @@ TEST(get_action_avail)
  * - endianness checking when appropriate
  * - 64-bit arg prodding
  * - arch value testing (x86 modes especially)
+ * - verify that FILTER_FLAG_LOG filters generate log messages
  * - ...
  */
 
-- 
cgit v1.2.3-71-gd317


From 59f5cf44a38284eb9e76270c786fb6cc62ef8ac4 Mon Sep 17 00:00:00 2001
From: Tyler Hicks <tyhicks@canonical.com>
Date: Fri, 11 Aug 2017 04:33:57 +0000
Subject: seccomp: Action to log before allowing

Add a new action, SECCOMP_RET_LOG, that logs a syscall before allowing
the syscall. At the implementation level, this action is identical to
the existing SECCOMP_RET_ALLOW action. However, it can be very useful when
initially developing a seccomp filter for an application. The developer
can set the default action to be SECCOMP_RET_LOG, maybe mark any
obviously needed syscalls with SECCOMP_RET_ALLOW, and then put the
application through its paces. A list of syscalls that triggered the
default action (SECCOMP_RET_LOG) can be easily gleaned from the logs and
that list can be used to build the syscall whitelist. Finally, the
developer can change the default action to the desired value.

This provides a more friendly experience than seeing the application get
killed, then updating the filter and rebuilding the app, seeing the
application get killed due to a different syscall, then updating the
filter and rebuilding the app, etc.

The functionality is similar to what's supported by the various LSMs.
SELinux has permissive mode, AppArmor has complain mode, SMACK has
bring-up mode, etc.

SECCOMP_RET_LOG is given a lower value than SECCOMP_RET_ALLOW as allow
while logging is slightly more restrictive than quietly allowing.

Unfortunately, the tests added for SECCOMP_RET_LOG are not capable of
inspecting the audit log to verify that the syscall was logged.

With this patch, the logic for deciding if an action will be logged is:

if action == RET_ALLOW:
  do not log
else if action == RET_KILL && RET_KILL in actions_logged:
  log
else if action == RET_LOG && RET_LOG in actions_logged:
  log
else if filter-requests-logging && action in actions_logged:
  log
else if audit_enabled && process-is-being-audited:
  log
else:
  do not log

Signed-off-by: Tyler Hicks <tyhicks@canonical.com>
Signed-off-by: Kees Cook <keescook@chromium.org>
---
 Documentation/userspace-api/seccomp_filter.rst |  9 +++
 include/uapi/linux/seccomp.h                   |  1 +
 kernel/seccomp.c                               | 23 ++++--
 tools/testing/selftests/seccomp/seccomp_bpf.c  | 98 +++++++++++++++++++++++++-
 4 files changed, 125 insertions(+), 6 deletions(-)

(limited to 'include/uapi/linux')

diff --git a/Documentation/userspace-api/seccomp_filter.rst b/Documentation/userspace-api/seccomp_filter.rst
index 2d1d8ab04ac5..f4977357daf2 100644
--- a/Documentation/userspace-api/seccomp_filter.rst
+++ b/Documentation/userspace-api/seccomp_filter.rst
@@ -141,6 +141,15 @@ In precedence order, they are:
 	allow use of ptrace, even of other sandboxed processes, without
 	extreme care; ptracers can use this mechanism to escape.)
 
+``SECCOMP_RET_LOG``:
+	Results in the system call being executed after it is logged. This
+	should be used by application developers to learn which syscalls their
+	application needs without having to iterate through multiple test and
+	development cycles to build the list.
+
+	This action will only be logged if "log" is present in the
+	actions_logged sysctl string.
+
 ``SECCOMP_RET_ALLOW``:
 	Results in the system call being executed.
 
diff --git a/include/uapi/linux/seccomp.h b/include/uapi/linux/seccomp.h
index 19a611d0712e..f94433263e4b 100644
--- a/include/uapi/linux/seccomp.h
+++ b/include/uapi/linux/seccomp.h
@@ -31,6 +31,7 @@
 #define SECCOMP_RET_TRAP	0x00030000U /* disallow and force a SIGSYS */
 #define SECCOMP_RET_ERRNO	0x00050000U /* returns an errno */
 #define SECCOMP_RET_TRACE	0x7ff00000U /* pass to a tracer or disallow */
+#define SECCOMP_RET_LOG		0x7ffc0000U /* allow after logging */
 #define SECCOMP_RET_ALLOW	0x7fff0000U /* allow */
 
 /* Masks for the return value sections. */
diff --git a/kernel/seccomp.c b/kernel/seccomp.c
index ed9fde418fc4..59cde2ed3b92 100644
--- a/kernel/seccomp.c
+++ b/kernel/seccomp.c
@@ -533,10 +533,12 @@ static void seccomp_send_sigsys(int syscall, int reason)
 #define SECCOMP_LOG_TRAP		(1 << 2)
 #define SECCOMP_LOG_ERRNO		(1 << 3)
 #define SECCOMP_LOG_TRACE		(1 << 4)
-#define SECCOMP_LOG_ALLOW		(1 << 5)
+#define SECCOMP_LOG_LOG			(1 << 5)
+#define SECCOMP_LOG_ALLOW		(1 << 6)
 
 static u32 seccomp_actions_logged = SECCOMP_LOG_KILL  | SECCOMP_LOG_TRAP  |
-				    SECCOMP_LOG_ERRNO | SECCOMP_LOG_TRACE;
+				    SECCOMP_LOG_ERRNO | SECCOMP_LOG_TRACE |
+				    SECCOMP_LOG_LOG;
 
 static inline void seccomp_log(unsigned long syscall, long signr, u32 action,
 			       bool requested)
@@ -555,15 +557,18 @@ static inline void seccomp_log(unsigned long syscall, long signr, u32 action,
 	case SECCOMP_RET_TRACE:
 		log = requested && seccomp_actions_logged & SECCOMP_LOG_TRACE;
 		break;
+	case SECCOMP_RET_LOG:
+		log = seccomp_actions_logged & SECCOMP_LOG_LOG;
+		break;
 	case SECCOMP_RET_KILL:
 	default:
 		log = seccomp_actions_logged & SECCOMP_LOG_KILL;
 	}
 
 	/*
-	 * Force an audit message to be emitted when the action is RET_KILL or
-	 * the FILTER_FLAG_LOG bit was set and the action is allowed to be
-	 * logged by the admin.
+	 * Force an audit message to be emitted when the action is RET_KILL,
+	 * RET_LOG, or the FILTER_FLAG_LOG bit was set and the action is
+	 * allowed to be logged by the admin.
 	 */
 	if (log)
 		return __audit_seccomp(syscall, signr, action);
@@ -699,6 +704,10 @@ static int __seccomp_filter(int this_syscall, const struct seccomp_data *sd,
 
 		return 0;
 
+	case SECCOMP_RET_LOG:
+		seccomp_log(this_syscall, 0, action, true);
+		return 0;
+
 	case SECCOMP_RET_ALLOW:
 		/*
 		 * Note that the "match" filter will always be NULL for
@@ -873,6 +882,7 @@ static long seccomp_get_action_avail(const char __user *uaction)
 	case SECCOMP_RET_TRAP:
 	case SECCOMP_RET_ERRNO:
 	case SECCOMP_RET_TRACE:
+	case SECCOMP_RET_LOG:
 	case SECCOMP_RET_ALLOW:
 		break;
 	default:
@@ -1023,12 +1033,14 @@ out:
 #define SECCOMP_RET_TRAP_NAME		"trap"
 #define SECCOMP_RET_ERRNO_NAME		"errno"
 #define SECCOMP_RET_TRACE_NAME		"trace"
+#define SECCOMP_RET_LOG_NAME		"log"
 #define SECCOMP_RET_ALLOW_NAME		"allow"
 
 static const char seccomp_actions_avail[] = SECCOMP_RET_KILL_NAME	" "
 					    SECCOMP_RET_TRAP_NAME	" "
 					    SECCOMP_RET_ERRNO_NAME	" "
 					    SECCOMP_RET_TRACE_NAME	" "
+					    SECCOMP_RET_LOG_NAME	" "
 					    SECCOMP_RET_ALLOW_NAME;
 
 struct seccomp_log_name {
@@ -1041,6 +1053,7 @@ static const struct seccomp_log_name seccomp_log_names[] = {
 	{ SECCOMP_LOG_TRAP, SECCOMP_RET_TRAP_NAME },
 	{ SECCOMP_LOG_ERRNO, SECCOMP_RET_ERRNO_NAME },
 	{ SECCOMP_LOG_TRACE, SECCOMP_RET_TRACE_NAME },
+	{ SECCOMP_LOG_LOG, SECCOMP_RET_LOG_NAME },
 	{ SECCOMP_LOG_ALLOW, SECCOMP_RET_ALLOW_NAME },
 	{ }
 };
diff --git a/tools/testing/selftests/seccomp/seccomp_bpf.c b/tools/testing/selftests/seccomp/seccomp_bpf.c
index 1c8c22ce7740..7372958eccb5 100644
--- a/tools/testing/selftests/seccomp/seccomp_bpf.c
+++ b/tools/testing/selftests/seccomp/seccomp_bpf.c
@@ -74,7 +74,12 @@
 #define SECCOMP_RET_ERRNO       0x00050000U /* returns an errno */
 #define SECCOMP_RET_TRACE       0x7ff00000U /* pass to a tracer or disallow */
 #define SECCOMP_RET_ALLOW       0x7fff0000U /* allow */
+#endif
+#ifndef SECCOMP_RET_LOG
+#define SECCOMP_RET_LOG       0x7ffc0000U /* allow after logging */
+#endif
 
+#ifndef SECCOMP_RET_ACTION
 /* Masks for the return value sections. */
 #define SECCOMP_RET_ACTION      0x7fff0000U
 #define SECCOMP_RET_DATA        0x0000ffffU
@@ -342,6 +347,28 @@ TEST(empty_prog)
 	EXPECT_EQ(EINVAL, errno);
 }
 
+TEST(log_all)
+{
+	struct sock_filter filter[] = {
+		BPF_STMT(BPF_RET|BPF_K, SECCOMP_RET_LOG),
+	};
+	struct sock_fprog prog = {
+		.len = (unsigned short)ARRAY_SIZE(filter),
+		.filter = filter,
+	};
+	long ret;
+	pid_t parent = getppid();
+
+	ret = prctl(PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0);
+	ASSERT_EQ(0, ret);
+
+	ret = prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, &prog);
+	ASSERT_EQ(0, ret);
+
+	/* getppid() should succeed and be logged (no check for logging) */
+	EXPECT_EQ(parent, syscall(__NR_getppid));
+}
+
 TEST_SIGNAL(unknown_ret_is_kill_inside, SIGSYS)
 {
 	struct sock_filter filter[] = {
@@ -756,6 +783,7 @@ TEST_F(TRAP, handler)
 
 FIXTURE_DATA(precedence) {
 	struct sock_fprog allow;
+	struct sock_fprog log;
 	struct sock_fprog trace;
 	struct sock_fprog error;
 	struct sock_fprog trap;
@@ -767,6 +795,13 @@ FIXTURE_SETUP(precedence)
 	struct sock_filter allow_insns[] = {
 		BPF_STMT(BPF_RET|BPF_K, SECCOMP_RET_ALLOW),
 	};
+	struct sock_filter log_insns[] = {
+		BPF_STMT(BPF_LD|BPF_W|BPF_ABS,
+			offsetof(struct seccomp_data, nr)),
+		BPF_JUMP(BPF_JMP|BPF_JEQ|BPF_K, __NR_getpid, 1, 0),
+		BPF_STMT(BPF_RET|BPF_K, SECCOMP_RET_ALLOW),
+		BPF_STMT(BPF_RET|BPF_K, SECCOMP_RET_LOG),
+	};
 	struct sock_filter trace_insns[] = {
 		BPF_STMT(BPF_LD|BPF_W|BPF_ABS,
 			offsetof(struct seccomp_data, nr)),
@@ -803,6 +838,7 @@ FIXTURE_SETUP(precedence)
 	memcpy(self->_x.filter, &_x##_insns, sizeof(_x##_insns)); \
 	self->_x.len = (unsigned short)ARRAY_SIZE(_x##_insns)
 	FILTER_ALLOC(allow);
+	FILTER_ALLOC(log);
 	FILTER_ALLOC(trace);
 	FILTER_ALLOC(error);
 	FILTER_ALLOC(trap);
@@ -813,6 +849,7 @@ FIXTURE_TEARDOWN(precedence)
 {
 #define FILTER_FREE(_x) if (self->_x.filter) free(self->_x.filter)
 	FILTER_FREE(allow);
+	FILTER_FREE(log);
 	FILTER_FREE(trace);
 	FILTER_FREE(error);
 	FILTER_FREE(trap);
@@ -830,6 +867,8 @@ TEST_F(precedence, allow_ok)
 
 	ret = prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, &self->allow);
 	ASSERT_EQ(0, ret);
+	ret = prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, &self->log);
+	ASSERT_EQ(0, ret);
 	ret = prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, &self->trace);
 	ASSERT_EQ(0, ret);
 	ret = prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, &self->error);
@@ -854,6 +893,8 @@ TEST_F_SIGNAL(precedence, kill_is_highest, SIGSYS)
 
 	ret = prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, &self->allow);
 	ASSERT_EQ(0, ret);
+	ret = prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, &self->log);
+	ASSERT_EQ(0, ret);
 	ret = prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, &self->trace);
 	ASSERT_EQ(0, ret);
 	ret = prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, &self->error);
@@ -885,6 +926,8 @@ TEST_F_SIGNAL(precedence, kill_is_highest_in_any_order, SIGSYS)
 	ASSERT_EQ(0, ret);
 	ret = prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, &self->error);
 	ASSERT_EQ(0, ret);
+	ret = prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, &self->log);
+	ASSERT_EQ(0, ret);
 	ret = prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, &self->trace);
 	ASSERT_EQ(0, ret);
 	ret = prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, &self->trap);
@@ -906,6 +949,8 @@ TEST_F_SIGNAL(precedence, trap_is_second, SIGSYS)
 
 	ret = prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, &self->allow);
 	ASSERT_EQ(0, ret);
+	ret = prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, &self->log);
+	ASSERT_EQ(0, ret);
 	ret = prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, &self->trace);
 	ASSERT_EQ(0, ret);
 	ret = prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, &self->error);
@@ -931,6 +976,8 @@ TEST_F_SIGNAL(precedence, trap_is_second_in_any_order, SIGSYS)
 	ASSERT_EQ(0, ret);
 	ret = prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, &self->trap);
 	ASSERT_EQ(0, ret);
+	ret = prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, &self->log);
+	ASSERT_EQ(0, ret);
 	ret = prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, &self->trace);
 	ASSERT_EQ(0, ret);
 	ret = prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, &self->error);
@@ -952,6 +999,8 @@ TEST_F(precedence, errno_is_third)
 
 	ret = prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, &self->allow);
 	ASSERT_EQ(0, ret);
+	ret = prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, &self->log);
+	ASSERT_EQ(0, ret);
 	ret = prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, &self->trace);
 	ASSERT_EQ(0, ret);
 	ret = prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, &self->error);
@@ -970,6 +1019,8 @@ TEST_F(precedence, errno_is_third_in_any_order)
 	ret = prctl(PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0);
 	ASSERT_EQ(0, ret);
 
+	ret = prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, &self->log);
+	ASSERT_EQ(0, ret);
 	ret = prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, &self->error);
 	ASSERT_EQ(0, ret);
 	ret = prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, &self->trace);
@@ -992,6 +1043,8 @@ TEST_F(precedence, trace_is_fourth)
 
 	ret = prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, &self->allow);
 	ASSERT_EQ(0, ret);
+	ret = prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, &self->log);
+	ASSERT_EQ(0, ret);
 	ret = prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, &self->trace);
 	ASSERT_EQ(0, ret);
 	/* Should work just fine. */
@@ -1013,12 +1066,54 @@ TEST_F(precedence, trace_is_fourth_in_any_order)
 	ASSERT_EQ(0, ret);
 	ret = prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, &self->allow);
 	ASSERT_EQ(0, ret);
+	ret = prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, &self->log);
+	ASSERT_EQ(0, ret);
 	/* Should work just fine. */
 	EXPECT_EQ(parent, syscall(__NR_getppid));
 	/* No ptracer */
 	EXPECT_EQ(-1, syscall(__NR_getpid));
 }
 
+TEST_F(precedence, log_is_fifth)
+{
+	pid_t mypid, parent;
+	long ret;
+
+	mypid = getpid();
+	parent = getppid();
+	ret = prctl(PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0);
+	ASSERT_EQ(0, ret);
+
+	ret = prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, &self->allow);
+	ASSERT_EQ(0, ret);
+	ret = prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, &self->log);
+	ASSERT_EQ(0, ret);
+	/* Should work just fine. */
+	EXPECT_EQ(parent, syscall(__NR_getppid));
+	/* Should also work just fine */
+	EXPECT_EQ(mypid, syscall(__NR_getpid));
+}
+
+TEST_F(precedence, log_is_fifth_in_any_order)
+{
+	pid_t mypid, parent;
+	long ret;
+
+	mypid = getpid();
+	parent = getppid();
+	ret = prctl(PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0);
+	ASSERT_EQ(0, ret);
+
+	ret = prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, &self->log);
+	ASSERT_EQ(0, ret);
+	ret = prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, &self->allow);
+	ASSERT_EQ(0, ret);
+	/* Should work just fine. */
+	EXPECT_EQ(parent, syscall(__NR_getppid));
+	/* Should also work just fine */
+	EXPECT_EQ(mypid, syscall(__NR_getpid));
+}
+
 #ifndef PTRACE_O_TRACESECCOMP
 #define PTRACE_O_TRACESECCOMP	0x00000080
 #endif
@@ -2603,7 +2698,7 @@ TEST(get_action_avail)
 {
 	__u32 actions[] = { SECCOMP_RET_KILL,  SECCOMP_RET_TRAP,
 			    SECCOMP_RET_ERRNO, SECCOMP_RET_TRACE,
-			    SECCOMP_RET_ALLOW };
+			    SECCOMP_RET_LOG,   SECCOMP_RET_ALLOW };
 	__u32 unknown_action = 0x10000000U;
 	int i;
 	long ret;
@@ -2640,6 +2735,7 @@ TEST(get_action_avail)
  * - 64-bit arg prodding
  * - arch value testing (x86 modes especially)
  * - verify that FILTER_FLAG_LOG filters generate log messages
+ * - verify that RET_LOG generates log messages
  * - ...
  */
 
-- 
cgit v1.2.3-71-gd317


From fd76875ca289a3d4722f266fd2d5532a27083903 Mon Sep 17 00:00:00 2001
From: Kees Cook <keescook@chromium.org>
Date: Fri, 11 Aug 2017 12:53:18 -0700
Subject: seccomp: Rename SECCOMP_RET_KILL to SECCOMP_RET_KILL_THREAD

In preparation for adding SECCOMP_RET_KILL_PROCESS, rename SECCOMP_RET_KILL
to the more accurate SECCOMP_RET_KILL_THREAD.

The existing selftest values are intentionally left as SECCOMP_RET_KILL
just to be sure we're exercising the alias.

Signed-off-by: Kees Cook <keescook@chromium.org>
---
 Documentation/networking/filter.txt            |  2 +-
 Documentation/userspace-api/seccomp_filter.rst |  4 +--
 include/uapi/linux/seccomp.h                   |  3 +-
 kernel/seccomp.c                               | 39 ++++++++++++++------------
 samples/seccomp/bpf-direct.c                   |  4 +--
 samples/seccomp/bpf-helper.h                   |  2 +-
 tools/testing/selftests/seccomp/seccomp_bpf.c  | 17 ++++++-----
 7 files changed, 39 insertions(+), 32 deletions(-)

(limited to 'include/uapi/linux')

diff --git a/Documentation/networking/filter.txt b/Documentation/networking/filter.txt
index b69b205501de..73aa0f12156d 100644
--- a/Documentation/networking/filter.txt
+++ b/Documentation/networking/filter.txt
@@ -337,7 +337,7 @@ Examples for low-level BPF:
   jeq #14, good           /* __NR_rt_sigprocmask */
   jeq #13, good           /* __NR_rt_sigaction */
   jeq #35, good           /* __NR_nanosleep */
-  bad: ret #0             /* SECCOMP_RET_KILL */
+  bad: ret #0             /* SECCOMP_RET_KILL_THREAD */
   good: ret #0x7fff0000   /* SECCOMP_RET_ALLOW */
 
 The above example code can be placed into a file (here called "foo"), and
diff --git a/Documentation/userspace-api/seccomp_filter.rst b/Documentation/userspace-api/seccomp_filter.rst
index f4977357daf2..d76396f2d8ed 100644
--- a/Documentation/userspace-api/seccomp_filter.rst
+++ b/Documentation/userspace-api/seccomp_filter.rst
@@ -87,11 +87,11 @@ Return values
 A seccomp filter may return any of the following values. If multiple
 filters exist, the return value for the evaluation of a given system
 call will always use the highest precedent value. (For example,
-``SECCOMP_RET_KILL`` will always take precedence.)
+``SECCOMP_RET_KILL_THREAD`` will always take precedence.)
 
 In precedence order, they are:
 
-``SECCOMP_RET_KILL``:
+``SECCOMP_RET_KILL_THREAD``:
 	Results in the task exiting immediately without executing the
 	system call.  The exit status of the task (``status & 0x7f``) will
 	be ``SIGSYS``, not ``SIGKILL``.
diff --git a/include/uapi/linux/seccomp.h b/include/uapi/linux/seccomp.h
index f94433263e4b..5a03f699eb17 100644
--- a/include/uapi/linux/seccomp.h
+++ b/include/uapi/linux/seccomp.h
@@ -27,7 +27,8 @@
  * The ordering ensures that a min_t() over composed return values always
  * selects the least permissive choice.
  */
-#define SECCOMP_RET_KILL	0x00000000U /* kill the task immediately */
+#define SECCOMP_RET_KILL_THREAD	0x00000000U /* kill the thread */
+#define SECCOMP_RET_KILL	SECCOMP_RET_KILL_THREAD
 #define SECCOMP_RET_TRAP	0x00030000U /* disallow and force a SIGSYS */
 #define SECCOMP_RET_ERRNO	0x00050000U /* returns an errno */
 #define SECCOMP_RET_TRACE	0x7ff00000U /* pass to a tracer or disallow */
diff --git a/kernel/seccomp.c b/kernel/seccomp.c
index 59cde2ed3b92..95ac54cff00f 100644
--- a/kernel/seccomp.c
+++ b/kernel/seccomp.c
@@ -192,7 +192,7 @@ static u32 seccomp_run_filters(const struct seccomp_data *sd,
 
 	/* Ensure unexpected behavior doesn't result in failing open. */
 	if (unlikely(WARN_ON(f == NULL)))
-		return SECCOMP_RET_KILL;
+		return SECCOMP_RET_KILL_THREAD;
 
 	if (!sd) {
 		populate_seccomp_data(&sd_local);
@@ -529,15 +529,17 @@ static void seccomp_send_sigsys(int syscall, int reason)
 #endif	/* CONFIG_SECCOMP_FILTER */
 
 /* For use with seccomp_actions_logged */
-#define SECCOMP_LOG_KILL		(1 << 0)
+#define SECCOMP_LOG_KILL_THREAD		(1 << 0)
 #define SECCOMP_LOG_TRAP		(1 << 2)
 #define SECCOMP_LOG_ERRNO		(1 << 3)
 #define SECCOMP_LOG_TRACE		(1 << 4)
 #define SECCOMP_LOG_LOG			(1 << 5)
 #define SECCOMP_LOG_ALLOW		(1 << 6)
 
-static u32 seccomp_actions_logged = SECCOMP_LOG_KILL  | SECCOMP_LOG_TRAP  |
-				    SECCOMP_LOG_ERRNO | SECCOMP_LOG_TRACE |
+static u32 seccomp_actions_logged = SECCOMP_LOG_KILL_THREAD |
+				    SECCOMP_LOG_TRAP  |
+				    SECCOMP_LOG_ERRNO |
+				    SECCOMP_LOG_TRACE |
 				    SECCOMP_LOG_LOG;
 
 static inline void seccomp_log(unsigned long syscall, long signr, u32 action,
@@ -560,13 +562,13 @@ static inline void seccomp_log(unsigned long syscall, long signr, u32 action,
 	case SECCOMP_RET_LOG:
 		log = seccomp_actions_logged & SECCOMP_LOG_LOG;
 		break;
-	case SECCOMP_RET_KILL:
+	case SECCOMP_RET_KILL_THREAD:
 	default:
-		log = seccomp_actions_logged & SECCOMP_LOG_KILL;
+		log = seccomp_actions_logged & SECCOMP_LOG_KILL_THREAD;
 	}
 
 	/*
-	 * Force an audit message to be emitted when the action is RET_KILL,
+	 * Force an audit message to be emitted when the action is RET_KILL_*,
 	 * RET_LOG, or the FILTER_FLAG_LOG bit was set and the action is
 	 * allowed to be logged by the admin.
 	 */
@@ -605,7 +607,7 @@ static void __secure_computing_strict(int this_syscall)
 #ifdef SECCOMP_DEBUG
 	dump_stack();
 #endif
-	seccomp_log(this_syscall, SIGKILL, SECCOMP_RET_KILL, true);
+	seccomp_log(this_syscall, SIGKILL, SECCOMP_RET_KILL_THREAD, true);
 	do_exit(SIGKILL);
 }
 
@@ -716,7 +718,7 @@ static int __seccomp_filter(int this_syscall, const struct seccomp_data *sd,
 		 */
 		return 0;
 
-	case SECCOMP_RET_KILL:
+	case SECCOMP_RET_KILL_THREAD:
 	default:
 		seccomp_log(this_syscall, SIGSYS, action, true);
 		/* Dump core only if this is the last remaining thread. */
@@ -878,7 +880,7 @@ static long seccomp_get_action_avail(const char __user *uaction)
 		return -EFAULT;
 
 	switch (action) {
-	case SECCOMP_RET_KILL:
+	case SECCOMP_RET_KILL_THREAD:
 	case SECCOMP_RET_TRAP:
 	case SECCOMP_RET_ERRNO:
 	case SECCOMP_RET_TRACE:
@@ -1029,19 +1031,20 @@ out:
 #ifdef CONFIG_SYSCTL
 
 /* Human readable action names for friendly sysctl interaction */
-#define SECCOMP_RET_KILL_NAME		"kill"
+#define SECCOMP_RET_KILL_THREAD_NAME	"kill_thread"
 #define SECCOMP_RET_TRAP_NAME		"trap"
 #define SECCOMP_RET_ERRNO_NAME		"errno"
 #define SECCOMP_RET_TRACE_NAME		"trace"
 #define SECCOMP_RET_LOG_NAME		"log"
 #define SECCOMP_RET_ALLOW_NAME		"allow"
 
-static const char seccomp_actions_avail[] = SECCOMP_RET_KILL_NAME	" "
-					    SECCOMP_RET_TRAP_NAME	" "
-					    SECCOMP_RET_ERRNO_NAME	" "
-					    SECCOMP_RET_TRACE_NAME	" "
-					    SECCOMP_RET_LOG_NAME	" "
-					    SECCOMP_RET_ALLOW_NAME;
+static const char seccomp_actions_avail[] =
+				SECCOMP_RET_KILL_THREAD_NAME	" "
+				SECCOMP_RET_TRAP_NAME		" "
+				SECCOMP_RET_ERRNO_NAME		" "
+				SECCOMP_RET_TRACE_NAME		" "
+				SECCOMP_RET_LOG_NAME		" "
+				SECCOMP_RET_ALLOW_NAME;
 
 struct seccomp_log_name {
 	u32		log;
@@ -1049,7 +1052,7 @@ struct seccomp_log_name {
 };
 
 static const struct seccomp_log_name seccomp_log_names[] = {
-	{ SECCOMP_LOG_KILL, SECCOMP_RET_KILL_NAME },
+	{ SECCOMP_LOG_KILL_THREAD, SECCOMP_RET_KILL_THREAD_NAME },
 	{ SECCOMP_LOG_TRAP, SECCOMP_RET_TRAP_NAME },
 	{ SECCOMP_LOG_ERRNO, SECCOMP_RET_ERRNO_NAME },
 	{ SECCOMP_LOG_TRACE, SECCOMP_RET_TRACE_NAME },
diff --git a/samples/seccomp/bpf-direct.c b/samples/seccomp/bpf-direct.c
index 151ec3f52189..235ce3c49ee9 100644
--- a/samples/seccomp/bpf-direct.c
+++ b/samples/seccomp/bpf-direct.c
@@ -129,7 +129,7 @@ static int install_filter(void)
 		/* Check that read is only using stdin. */
 		BPF_STMT(BPF_LD+BPF_W+BPF_ABS, syscall_arg(0)),
 		BPF_JUMP(BPF_JMP+BPF_JEQ+BPF_K, STDIN_FILENO, 4, 0),
-		BPF_STMT(BPF_RET+BPF_K, SECCOMP_RET_KILL),
+		BPF_STMT(BPF_RET+BPF_K, SECCOMP_RET_KILL_THREAD),
 
 		/* Check that write is only using stdout */
 		BPF_STMT(BPF_LD+BPF_W+BPF_ABS, syscall_arg(0)),
@@ -139,7 +139,7 @@ static int install_filter(void)
 
 		BPF_STMT(BPF_RET+BPF_K, SECCOMP_RET_ALLOW),
 		BPF_STMT(BPF_RET+BPF_K, SECCOMP_RET_TRAP),
-		BPF_STMT(BPF_RET+BPF_K, SECCOMP_RET_KILL),
+		BPF_STMT(BPF_RET+BPF_K, SECCOMP_RET_KILL_THREAD),
 	};
 	struct sock_fprog prog = {
 		.len = (unsigned short)(sizeof(filter)/sizeof(filter[0])),
diff --git a/samples/seccomp/bpf-helper.h b/samples/seccomp/bpf-helper.h
index 1d8de9edd858..83dbe79cbe2c 100644
--- a/samples/seccomp/bpf-helper.h
+++ b/samples/seccomp/bpf-helper.h
@@ -44,7 +44,7 @@ void seccomp_bpf_print(struct sock_filter *filter, size_t count);
 #define ALLOW \
 	BPF_STMT(BPF_RET+BPF_K, SECCOMP_RET_ALLOW)
 #define DENY \
-	BPF_STMT(BPF_RET+BPF_K, SECCOMP_RET_KILL)
+	BPF_STMT(BPF_RET+BPF_K, SECCOMP_RET_KILL_THREAD)
 #define JUMP(labels, label) \
 	BPF_JUMP(BPF_JMP+BPF_JA, FIND_LABEL((labels), (label)), \
 		 JUMP_JT, JUMP_JF)
diff --git a/tools/testing/selftests/seccomp/seccomp_bpf.c b/tools/testing/selftests/seccomp/seccomp_bpf.c
index 7372958eccb5..a3ba39a32449 100644
--- a/tools/testing/selftests/seccomp/seccomp_bpf.c
+++ b/tools/testing/selftests/seccomp/seccomp_bpf.c
@@ -68,15 +68,18 @@
 #define SECCOMP_MODE_FILTER 2
 #endif
 
+#ifndef SECCOMP_RET_KILL_THREAD
+#define SECCOMP_RET_KILL_THREAD	 0x00000000U /* kill the thread */
+#endif
 #ifndef SECCOMP_RET_KILL
-#define SECCOMP_RET_KILL        0x00000000U /* kill the task immediately */
-#define SECCOMP_RET_TRAP        0x00030000U /* disallow and force a SIGSYS */
-#define SECCOMP_RET_ERRNO       0x00050000U /* returns an errno */
-#define SECCOMP_RET_TRACE       0x7ff00000U /* pass to a tracer or disallow */
-#define SECCOMP_RET_ALLOW       0x7fff0000U /* allow */
+#define SECCOMP_RET_KILL	 SECCOMP_RET_KILL_THREAD
+#define SECCOMP_RET_TRAP	 0x00030000U /* disallow and force a SIGSYS */
+#define SECCOMP_RET_ERRNO	 0x00050000U /* returns an errno */
+#define SECCOMP_RET_TRACE	 0x7ff00000U /* pass to a tracer or disallow */
+#define SECCOMP_RET_ALLOW	 0x7fff0000U /* allow */
 #endif
 #ifndef SECCOMP_RET_LOG
-#define SECCOMP_RET_LOG       0x7ffc0000U /* allow after logging */
+#define SECCOMP_RET_LOG		 0x7ffc0000U /* allow after logging */
 #endif
 
 #ifndef SECCOMP_RET_ACTION
@@ -2696,7 +2699,7 @@ TEST_SIGNAL(filter_flag_log, SIGSYS)
 
 TEST(get_action_avail)
 {
-	__u32 actions[] = { SECCOMP_RET_KILL,  SECCOMP_RET_TRAP,
+	__u32 actions[] = { SECCOMP_RET_KILL_THREAD, SECCOMP_RET_TRAP,
 			    SECCOMP_RET_ERRNO, SECCOMP_RET_TRACE,
 			    SECCOMP_RET_LOG,   SECCOMP_RET_ALLOW };
 	__u32 unknown_action = 0x10000000U;
-- 
cgit v1.2.3-71-gd317


From 4d3b0b05aae9ee9ce0970dc4cc0fb3fad5e85945 Mon Sep 17 00:00:00 2001
From: Kees Cook <keescook@chromium.org>
Date: Fri, 11 Aug 2017 13:01:39 -0700
Subject: seccomp: Introduce SECCOMP_RET_KILL_PROCESS

This introduces the BPF return value for SECCOMP_RET_KILL_PROCESS to kill
an entire process. This cannot yet be reached by seccomp, but it changes
the default-kill behavior (for unknown return values) from kill-thread to
kill-process.

Signed-off-by: Kees Cook <keescook@chromium.org>
---
 include/uapi/linux/seccomp.h | 18 ++++++++++--------
 kernel/seccomp.c             | 22 ++++++++++++++++------
 2 files changed, 26 insertions(+), 14 deletions(-)

(limited to 'include/uapi/linux')

diff --git a/include/uapi/linux/seccomp.h b/include/uapi/linux/seccomp.h
index 5a03f699eb17..7e77c92df78a 100644
--- a/include/uapi/linux/seccomp.h
+++ b/include/uapi/linux/seccomp.h
@@ -22,18 +22,20 @@
 /*
  * All BPF programs must return a 32-bit value.
  * The bottom 16-bits are for optional return data.
- * The upper 16-bits are ordered from least permissive values to most.
+ * The upper 16-bits are ordered from least permissive values to most,
+ * as a signed value (so 0x8000000 is negative).
  *
  * The ordering ensures that a min_t() over composed return values always
  * selects the least permissive choice.
  */
-#define SECCOMP_RET_KILL_THREAD	0x00000000U /* kill the thread */
-#define SECCOMP_RET_KILL	SECCOMP_RET_KILL_THREAD
-#define SECCOMP_RET_TRAP	0x00030000U /* disallow and force a SIGSYS */
-#define SECCOMP_RET_ERRNO	0x00050000U /* returns an errno */
-#define SECCOMP_RET_TRACE	0x7ff00000U /* pass to a tracer or disallow */
-#define SECCOMP_RET_LOG		0x7ffc0000U /* allow after logging */
-#define SECCOMP_RET_ALLOW	0x7fff0000U /* allow */
+#define SECCOMP_RET_KILL_PROCESS 0x80000000U /* kill the process */
+#define SECCOMP_RET_KILL_THREAD	 0x00000000U /* kill the thread */
+#define SECCOMP_RET_KILL	 SECCOMP_RET_KILL_THREAD
+#define SECCOMP_RET_TRAP	 0x00030000U /* disallow and force a SIGSYS */
+#define SECCOMP_RET_ERRNO	 0x00050000U /* returns an errno */
+#define SECCOMP_RET_TRACE	 0x7ff00000U /* pass to a tracer or disallow */
+#define SECCOMP_RET_LOG		 0x7ffc0000U /* allow after logging */
+#define SECCOMP_RET_ALLOW	 0x7fff0000U /* allow */
 
 /* Masks for the return value sections. */
 #define SECCOMP_RET_ACTION	0x7fff0000U
diff --git a/kernel/seccomp.c b/kernel/seccomp.c
index 95ac54cff00f..5c7299b9d953 100644
--- a/kernel/seccomp.c
+++ b/kernel/seccomp.c
@@ -192,7 +192,7 @@ static u32 seccomp_run_filters(const struct seccomp_data *sd,
 
 	/* Ensure unexpected behavior doesn't result in failing open. */
 	if (unlikely(WARN_ON(f == NULL)))
-		return SECCOMP_RET_KILL_THREAD;
+		return SECCOMP_RET_KILL_PROCESS;
 
 	if (!sd) {
 		populate_seccomp_data(&sd_local);
@@ -529,14 +529,16 @@ static void seccomp_send_sigsys(int syscall, int reason)
 #endif	/* CONFIG_SECCOMP_FILTER */
 
 /* For use with seccomp_actions_logged */
-#define SECCOMP_LOG_KILL_THREAD		(1 << 0)
+#define SECCOMP_LOG_KILL_PROCESS	(1 << 0)
+#define SECCOMP_LOG_KILL_THREAD		(1 << 1)
 #define SECCOMP_LOG_TRAP		(1 << 2)
 #define SECCOMP_LOG_ERRNO		(1 << 3)
 #define SECCOMP_LOG_TRACE		(1 << 4)
 #define SECCOMP_LOG_LOG			(1 << 5)
 #define SECCOMP_LOG_ALLOW		(1 << 6)
 
-static u32 seccomp_actions_logged = SECCOMP_LOG_KILL_THREAD |
+static u32 seccomp_actions_logged = SECCOMP_LOG_KILL_PROCESS |
+				    SECCOMP_LOG_KILL_THREAD  |
 				    SECCOMP_LOG_TRAP  |
 				    SECCOMP_LOG_ERRNO |
 				    SECCOMP_LOG_TRACE |
@@ -563,8 +565,11 @@ static inline void seccomp_log(unsigned long syscall, long signr, u32 action,
 		log = seccomp_actions_logged & SECCOMP_LOG_LOG;
 		break;
 	case SECCOMP_RET_KILL_THREAD:
-	default:
 		log = seccomp_actions_logged & SECCOMP_LOG_KILL_THREAD;
+		break;
+	case SECCOMP_RET_KILL_PROCESS:
+	default:
+		log = seccomp_actions_logged & SECCOMP_LOG_KILL_PROCESS;
 	}
 
 	/*
@@ -719,10 +724,12 @@ static int __seccomp_filter(int this_syscall, const struct seccomp_data *sd,
 		return 0;
 
 	case SECCOMP_RET_KILL_THREAD:
+	case SECCOMP_RET_KILL_PROCESS:
 	default:
 		seccomp_log(this_syscall, SIGSYS, action, true);
 		/* Dump core only if this is the last remaining thread. */
-		if (get_nr_threads(current) == 1) {
+		if (action == SECCOMP_RET_KILL_PROCESS ||
+		    get_nr_threads(current) == 1) {
 			siginfo_t info;
 
 			/* Show the original registers in the dump. */
@@ -731,7 +738,10 @@ static int __seccomp_filter(int this_syscall, const struct seccomp_data *sd,
 			seccomp_init_siginfo(&info, this_syscall, data);
 			do_coredump(&info);
 		}
-		do_exit(SIGSYS);
+		if (action == SECCOMP_RET_KILL_PROCESS)
+			do_group_exit(SIGSYS);
+		else
+			do_exit(SIGSYS);
 	}
 
 	unreachable();
-- 
cgit v1.2.3-71-gd317


From 0466bdb99e8744bc9befa8d62a317f0fd7fd7421 Mon Sep 17 00:00:00 2001
From: Kees Cook <keescook@chromium.org>
Date: Fri, 11 Aug 2017 13:12:11 -0700
Subject: seccomp: Implement SECCOMP_RET_KILL_PROCESS action
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Right now, SECCOMP_RET_KILL_THREAD (neé SECCOMP_RET_KILL) kills the
current thread. There have been a few requests for this to kill the entire
process (the thread group). This cannot be just changed (discovered when
adding coredump support since coredumping kills the entire process)
because there are userspace programs depending on the thread-kill
behavior.

Instead, implement SECCOMP_RET_KILL_PROCESS, which is 0x80000000, and can
be processed as "-1" by the kernel, below the existing RET_KILL that is
ABI-set to "0". For userspace, SECCOMP_RET_ACTION_FULL is added to expand
the mask to the signed bit. Old userspace using the SECCOMP_RET_ACTION
mask will see SECCOMP_RET_KILL_PROCESS as 0 still, but this would only
be visible when examining the siginfo in a core dump from a RET_KILL_*,
where it will think it was thread-killed instead of process-killed.

Attempts to introduce this behavior via other ways (filter flags,
seccomp struct flags, masked RET_DATA bits) all come with weird
side-effects and baggage. This change preserves the central behavioral
expectations of the seccomp filter engine without putting too great
a burden on changes needed in userspace to use the new action.

The new action is discoverable by userspace through either the new
actions_avail sysctl or through the SECCOMP_GET_ACTION_AVAIL seccomp
operation. If used without checking for availability, old kernels
will treat RET_KILL_PROCESS as RET_KILL_THREAD (since the old mask
will produce RET_KILL_THREAD).

Cc: Paul Moore <paul@paul-moore.com>
Cc: Fabricio Voznika <fvoznika@google.com>
Signed-off-by: Kees Cook <keescook@chromium.org>
---
 Documentation/userspace-api/seccomp_filter.rst | 7 ++++++-
 include/uapi/linux/seccomp.h                   | 1 +
 kernel/seccomp.c                               | 9 +++++++--
 3 files changed, 14 insertions(+), 3 deletions(-)

(limited to 'include/uapi/linux')

diff --git a/Documentation/userspace-api/seccomp_filter.rst b/Documentation/userspace-api/seccomp_filter.rst
index d76396f2d8ed..099c412951d6 100644
--- a/Documentation/userspace-api/seccomp_filter.rst
+++ b/Documentation/userspace-api/seccomp_filter.rst
@@ -87,10 +87,15 @@ Return values
 A seccomp filter may return any of the following values. If multiple
 filters exist, the return value for the evaluation of a given system
 call will always use the highest precedent value. (For example,
-``SECCOMP_RET_KILL_THREAD`` will always take precedence.)
+``SECCOMP_RET_KILL_PROCESS`` will always take precedence.)
 
 In precedence order, they are:
 
+``SECCOMP_RET_KILL_PROCESS``:
+	Results in the entire process exiting immediately without executing
+	the system call.  The exit status of the task (``status & 0x7f``)
+	will be ``SIGSYS``, not ``SIGKILL``.
+
 ``SECCOMP_RET_KILL_THREAD``:
 	Results in the task exiting immediately without executing the
 	system call.  The exit status of the task (``status & 0x7f``) will
diff --git a/include/uapi/linux/seccomp.h b/include/uapi/linux/seccomp.h
index 7e77c92df78a..f6bc1dea3247 100644
--- a/include/uapi/linux/seccomp.h
+++ b/include/uapi/linux/seccomp.h
@@ -38,6 +38,7 @@
 #define SECCOMP_RET_ALLOW	 0x7fff0000U /* allow */
 
 /* Masks for the return value sections. */
+#define SECCOMP_RET_ACTION_FULL	0xffff0000U
 #define SECCOMP_RET_ACTION	0x7fff0000U
 #define SECCOMP_RET_DATA	0x0000ffffU
 
diff --git a/kernel/seccomp.c b/kernel/seccomp.c
index 5c7299b9d953..c24579dfa7a1 100644
--- a/kernel/seccomp.c
+++ b/kernel/seccomp.c
@@ -181,6 +181,7 @@ static int seccomp_check_filter(struct sock_filter *filter, unsigned int flen)
  *
  * Returns valid seccomp BPF response codes.
  */
+#define ACTION_ONLY(ret) ((s32)((ret) & (SECCOMP_RET_ACTION_FULL)))
 static u32 seccomp_run_filters(const struct seccomp_data *sd,
 			       struct seccomp_filter **match)
 {
@@ -206,7 +207,7 @@ static u32 seccomp_run_filters(const struct seccomp_data *sd,
 	for (; f; f = f->prev) {
 		u32 cur_ret = BPF_PROG_RUN(f->prog, sd);
 
-		if ((cur_ret & SECCOMP_RET_ACTION) < (ret & SECCOMP_RET_ACTION)) {
+		if (ACTION_ONLY(cur_ret) < ACTION_ONLY(ret)) {
 			ret = cur_ret;
 			*match = f;
 		}
@@ -650,7 +651,7 @@ static int __seccomp_filter(int this_syscall, const struct seccomp_data *sd,
 
 	filter_ret = seccomp_run_filters(sd, &match);
 	data = filter_ret & SECCOMP_RET_DATA;
-	action = filter_ret & SECCOMP_RET_ACTION;
+	action = filter_ret & SECCOMP_RET_ACTION_FULL;
 
 	switch (action) {
 	case SECCOMP_RET_ERRNO:
@@ -890,6 +891,7 @@ static long seccomp_get_action_avail(const char __user *uaction)
 		return -EFAULT;
 
 	switch (action) {
+	case SECCOMP_RET_KILL_PROCESS:
 	case SECCOMP_RET_KILL_THREAD:
 	case SECCOMP_RET_TRAP:
 	case SECCOMP_RET_ERRNO:
@@ -1041,6 +1043,7 @@ out:
 #ifdef CONFIG_SYSCTL
 
 /* Human readable action names for friendly sysctl interaction */
+#define SECCOMP_RET_KILL_PROCESS_NAME	"kill_process"
 #define SECCOMP_RET_KILL_THREAD_NAME	"kill_thread"
 #define SECCOMP_RET_TRAP_NAME		"trap"
 #define SECCOMP_RET_ERRNO_NAME		"errno"
@@ -1049,6 +1052,7 @@ out:
 #define SECCOMP_RET_ALLOW_NAME		"allow"
 
 static const char seccomp_actions_avail[] =
+				SECCOMP_RET_KILL_PROCESS_NAME	" "
 				SECCOMP_RET_KILL_THREAD_NAME	" "
 				SECCOMP_RET_TRAP_NAME		" "
 				SECCOMP_RET_ERRNO_NAME		" "
@@ -1062,6 +1066,7 @@ struct seccomp_log_name {
 };
 
 static const struct seccomp_log_name seccomp_log_names[] = {
+	{ SECCOMP_LOG_KILL_PROCESS, SECCOMP_RET_KILL_PROCESS_NAME },
 	{ SECCOMP_LOG_KILL_THREAD, SECCOMP_RET_KILL_THREAD_NAME },
 	{ SECCOMP_LOG_TRAP, SECCOMP_RET_TRAP_NAME },
 	{ SECCOMP_LOG_ERRNO, SECCOMP_RET_ERRNO_NAME },
-- 
cgit v1.2.3-71-gd317


From 44dd8a989c787e9077745417140aa132bfe45bf5 Mon Sep 17 00:00:00 2001
From: Baolin Wang <baolin.wang@linaro.org>
Date: Tue, 15 Aug 2017 19:07:53 +0800
Subject: include: uapi: usb: Introduce USB charger type and state definition

Introducing USB charger type and state definition can help
to support USB charging which will be added in USB phy core.

Signed-off-by: Baolin Wang <baolin.wang@linaro.org>
Signed-off-by: Felipe Balbi <felipe.balbi@linux.intel.com>
---
 include/uapi/linux/usb/charger.h | 31 +++++++++++++++++++++++++++++++
 1 file changed, 31 insertions(+)
 create mode 100644 include/uapi/linux/usb/charger.h

(limited to 'include/uapi/linux')

diff --git a/include/uapi/linux/usb/charger.h b/include/uapi/linux/usb/charger.h
new file mode 100644
index 000000000000..5f72af35b3ed
--- /dev/null
+++ b/include/uapi/linux/usb/charger.h
@@ -0,0 +1,31 @@
+/*
+ * This file defines the USB charger type and state that are needed for
+ * USB device APIs.
+ */
+
+#ifndef _UAPI__LINUX_USB_CHARGER_H
+#define _UAPI__LINUX_USB_CHARGER_H
+
+/*
+ * USB charger type:
+ * SDP (Standard Downstream Port)
+ * DCP (Dedicated Charging Port)
+ * CDP (Charging Downstream Port)
+ * ACA (Accessory Charger Adapters)
+ */
+enum usb_charger_type {
+	UNKNOWN_TYPE,
+	SDP_TYPE,
+	DCP_TYPE,
+	CDP_TYPE,
+	ACA_TYPE,
+};
+
+/* USB charger state */
+enum usb_charger_state {
+	USB_CHARGER_DEFAULT,
+	USB_CHARGER_PRESENT,
+	USB_CHARGER_ABSENT,
+};
+
+#endif /* _UAPI__LINUX_USB_CHARGER_H */
-- 
cgit v1.2.3-71-gd317


From 5c1aab1dd5445ed8bdcdbb575abc1b0d7ee5b2e7 Mon Sep 17 00:00:00 2001
From: Nick Terrell <terrelln@fb.com>
Date: Wed, 9 Aug 2017 19:39:02 -0700
Subject: btrfs: Add zstd support

Add zstd compression and decompression support to BtrFS. zstd at its
fastest level compresses almost as well as zlib, while offering much
faster compression and decompression, approaching lzo speeds.

I benchmarked btrfs with zstd compression against no compression, lzo
compression, and zlib compression. I benchmarked two scenarios. Copying
a set of files to btrfs, and then reading the files. Copying a tarball
to btrfs, extracting it to btrfs, and then reading the extracted files.
After every operation, I call `sync` and include the sync time.
Between every pair of operations I unmount and remount the filesystem
to avoid caching. The benchmark files can be found in the upstream
zstd source repository under
`contrib/linux-kernel/{btrfs-benchmark.sh,btrfs-extract-benchmark.sh}`
[1] [2].

I ran the benchmarks on a Ubuntu 14.04 VM with 2 cores and 4 GiB of RAM.
The VM is running on a MacBook Pro with a 3.1 GHz Intel Core i7 processor,
16 GB of RAM, and a SSD.

The first compression benchmark is copying 10 copies of the unzipped
Silesia corpus [3] into a BtrFS filesystem mounted with
`-o compress-force=Method`. The decompression benchmark times how long
it takes to `tar` all 10 copies into `/dev/null`. The compression ratio is
measured by comparing the output of `df` and `du`. See the benchmark file
[1] for details. I benchmarked multiple zstd compression levels, although
the patch uses zstd level 1.

| Method  | Ratio | Compression MB/s | Decompression speed |
|---------|-------|------------------|---------------------|
| None    |  0.99 |              504 |                 686 |
| lzo     |  1.66 |              398 |                 442 |
| zlib    |  2.58 |               65 |                 241 |
| zstd 1  |  2.57 |              260 |                 383 |
| zstd 3  |  2.71 |              174 |                 408 |
| zstd 6  |  2.87 |               70 |                 398 |
| zstd 9  |  2.92 |               43 |                 406 |
| zstd 12 |  2.93 |               21 |                 408 |
| zstd 15 |  3.01 |               11 |                 354 |

The next benchmark first copies `linux-4.11.6.tar` [4] to btrfs. Then it
measures the compression ratio, extracts the tar, and deletes the tar.
Then it measures the compression ratio again, and `tar`s the extracted
files into `/dev/null`. See the benchmark file [2] for details.

| Method | Tar Ratio | Extract Ratio | Copy (s) | Extract (s)| Read (s) |
|--------|-----------|---------------|----------|------------|----------|
| None   |      0.97 |          0.78 |    0.981 |      5.501 |    8.807 |
| lzo    |      2.06 |          1.38 |    1.631 |      8.458 |    8.585 |
| zlib   |      3.40 |          1.86 |    7.750 |     21.544 |   11.744 |
| zstd 1 |      3.57 |          1.85 |    2.579 |     11.479 |    9.389 |

[1] https://github.com/facebook/zstd/blob/dev/contrib/linux-kernel/btrfs-benchmark.sh
[2] https://github.com/facebook/zstd/blob/dev/contrib/linux-kernel/btrfs-extract-benchmark.sh
[3] http://sun.aei.polsl.pl/~sdeor/index.php?page=silesia
[4] https://cdn.kernel.org/pub/linux/kernel/v4.x/linux-4.11.6.tar.xz

zstd source repository: https://github.com/facebook/zstd

Signed-off-by: Nick Terrell <terrelln@fb.com>
Signed-off-by: Chris Mason <clm@fb.com>
---
 fs/btrfs/Kconfig           |   2 +
 fs/btrfs/Makefile          |   2 +-
 fs/btrfs/compression.c     |   1 +
 fs/btrfs/compression.h     |   6 +-
 fs/btrfs/ctree.h           |   1 +
 fs/btrfs/disk-io.c         |   2 +
 fs/btrfs/ioctl.c           |   6 +-
 fs/btrfs/props.c           |   6 +
 fs/btrfs/super.c           |  12 +-
 fs/btrfs/sysfs.c           |   2 +
 fs/btrfs/zstd.c            | 432 +++++++++++++++++++++++++++++++++++++++++++++
 include/uapi/linux/btrfs.h |   8 +-
 12 files changed, 468 insertions(+), 12 deletions(-)
 create mode 100644 fs/btrfs/zstd.c

(limited to 'include/uapi/linux')

diff --git a/fs/btrfs/Kconfig b/fs/btrfs/Kconfig
index 80e9c18ea64f..a26c63b4ad68 100644
--- a/fs/btrfs/Kconfig
+++ b/fs/btrfs/Kconfig
@@ -6,6 +6,8 @@ config BTRFS_FS
 	select ZLIB_DEFLATE
 	select LZO_COMPRESS
 	select LZO_DECOMPRESS
+	select ZSTD_COMPRESS
+	select ZSTD_DECOMPRESS
 	select RAID6_PQ
 	select XOR_BLOCKS
 	select SRCU
diff --git a/fs/btrfs/Makefile b/fs/btrfs/Makefile
index 128ce17a80b0..962a95aefb81 100644
--- a/fs/btrfs/Makefile
+++ b/fs/btrfs/Makefile
@@ -6,7 +6,7 @@ btrfs-y += super.o ctree.o extent-tree.o print-tree.o root-tree.o dir-item.o \
 	   transaction.o inode.o file.o tree-defrag.o \
 	   extent_map.o sysfs.o struct-funcs.o xattr.o ordered-data.o \
 	   extent_io.o volumes.o async-thread.o ioctl.o locking.o orphan.o \
-	   export.o tree-log.o free-space-cache.o zlib.o lzo.o \
+	   export.o tree-log.o free-space-cache.o zlib.o lzo.o zstd.o \
 	   compression.o delayed-ref.o relocation.o delayed-inode.o scrub.o \
 	   reada.o backref.o ulist.o qgroup.o send.o dev-replace.o raid56.o \
 	   uuid-tree.o props.o hash.o free-space-tree.o
diff --git a/fs/btrfs/compression.c b/fs/btrfs/compression.c
index d2ef9ac2a630..4ff42d18a64d 100644
--- a/fs/btrfs/compression.c
+++ b/fs/btrfs/compression.c
@@ -704,6 +704,7 @@ static struct {
 static const struct btrfs_compress_op * const btrfs_compress_op[] = {
 	&btrfs_zlib_compress,
 	&btrfs_lzo_compress,
+	&btrfs_zstd_compress,
 };
 
 void __init btrfs_init_compress(void)
diff --git a/fs/btrfs/compression.h b/fs/btrfs/compression.h
index 87f6d3332163..2269e00854d8 100644
--- a/fs/btrfs/compression.h
+++ b/fs/btrfs/compression.h
@@ -99,8 +99,9 @@ enum btrfs_compression_type {
 	BTRFS_COMPRESS_NONE  = 0,
 	BTRFS_COMPRESS_ZLIB  = 1,
 	BTRFS_COMPRESS_LZO   = 2,
-	BTRFS_COMPRESS_TYPES = 2,
-	BTRFS_COMPRESS_LAST  = 3,
+	BTRFS_COMPRESS_ZSTD  = 3,
+	BTRFS_COMPRESS_TYPES = 3,
+	BTRFS_COMPRESS_LAST  = 4,
 };
 
 struct btrfs_compress_op {
@@ -128,5 +129,6 @@ struct btrfs_compress_op {
 
 extern const struct btrfs_compress_op btrfs_zlib_compress;
 extern const struct btrfs_compress_op btrfs_lzo_compress;
+extern const struct btrfs_compress_op btrfs_zstd_compress;
 
 #endif
diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h
index 3f3eb7b17cac..845d77c097d6 100644
--- a/fs/btrfs/ctree.h
+++ b/fs/btrfs/ctree.h
@@ -270,6 +270,7 @@ struct btrfs_super_block {
 	 BTRFS_FEATURE_INCOMPAT_MIXED_GROUPS |		\
 	 BTRFS_FEATURE_INCOMPAT_BIG_METADATA |		\
 	 BTRFS_FEATURE_INCOMPAT_COMPRESS_LZO |		\
+	 BTRFS_FEATURE_INCOMPAT_COMPRESS_ZSTD |		\
 	 BTRFS_FEATURE_INCOMPAT_RAID56 |		\
 	 BTRFS_FEATURE_INCOMPAT_EXTENDED_IREF |		\
 	 BTRFS_FEATURE_INCOMPAT_SKINNY_METADATA |	\
diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c
index 080e2ebb8aa0..04632f4de933 100644
--- a/fs/btrfs/disk-io.c
+++ b/fs/btrfs/disk-io.c
@@ -2828,6 +2828,8 @@ int open_ctree(struct super_block *sb,
 	features |= BTRFS_FEATURE_INCOMPAT_MIXED_BACKREF;
 	if (fs_info->compress_type == BTRFS_COMPRESS_LZO)
 		features |= BTRFS_FEATURE_INCOMPAT_COMPRESS_LZO;
+	else if (fs_info->compress_type == BTRFS_COMPRESS_ZSTD)
+		features |= BTRFS_FEATURE_INCOMPAT_COMPRESS_ZSTD;
 
 	if (features & BTRFS_FEATURE_INCOMPAT_SKINNY_METADATA)
 		btrfs_info(fs_info, "has skinny extents");
diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c
index fa1b78cf25f6..b9963d94d727 100644
--- a/fs/btrfs/ioctl.c
+++ b/fs/btrfs/ioctl.c
@@ -327,8 +327,10 @@ static int btrfs_ioctl_setflags(struct file *file, void __user *arg)
 
 		if (fs_info->compress_type == BTRFS_COMPRESS_LZO)
 			comp = "lzo";
-		else
+		else if (fs_info->compress_type == BTRFS_COMPRESS_ZLIB)
 			comp = "zlib";
+		else
+			comp = "zstd";
 		ret = btrfs_set_prop(inode, "btrfs.compression",
 				     comp, strlen(comp), 0);
 		if (ret)
@@ -1466,6 +1468,8 @@ int btrfs_defrag_file(struct inode *inode, struct file *file,
 
 	if (range->compress_type == BTRFS_COMPRESS_LZO) {
 		btrfs_set_fs_incompat(fs_info, COMPRESS_LZO);
+	} else if (range->compress_type == BTRFS_COMPRESS_ZSTD) {
+		btrfs_set_fs_incompat(fs_info, COMPRESS_ZSTD);
 	}
 
 	ret = defrag_count;
diff --git a/fs/btrfs/props.c b/fs/btrfs/props.c
index 4b23ae5d0e5c..20631e9273a0 100644
--- a/fs/btrfs/props.c
+++ b/fs/btrfs/props.c
@@ -390,6 +390,8 @@ static int prop_compression_validate(const char *value, size_t len)
 		return 0;
 	else if (!strncmp("zlib", value, len))
 		return 0;
+	else if (!strncmp("zstd", value, len))
+		return 0;
 
 	return -EINVAL;
 }
@@ -412,6 +414,8 @@ static int prop_compression_apply(struct inode *inode,
 		type = BTRFS_COMPRESS_LZO;
 	else if (!strncmp("zlib", value, len))
 		type = BTRFS_COMPRESS_ZLIB;
+	else if (!strncmp("zstd", value, len))
+		type = BTRFS_COMPRESS_ZSTD;
 	else
 		return -EINVAL;
 
@@ -429,6 +433,8 @@ static const char *prop_compression_extract(struct inode *inode)
 		return "zlib";
 	case BTRFS_COMPRESS_LZO:
 		return "lzo";
+	case BTRFS_COMPRESS_ZSTD:
+		return "zstd";
 	}
 
 	return NULL;
diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c
index 12540b6104b5..c370deadb790 100644
--- a/fs/btrfs/super.c
+++ b/fs/btrfs/super.c
@@ -513,6 +513,14 @@ int btrfs_parse_options(struct btrfs_fs_info *info, char *options,
 				btrfs_clear_opt(info->mount_opt, NODATASUM);
 				btrfs_set_fs_incompat(info, COMPRESS_LZO);
 				no_compress = 0;
+			} else if (strcmp(args[0].from, "zstd") == 0) {
+				compress_type = "zstd";
+				info->compress_type = BTRFS_COMPRESS_ZSTD;
+				btrfs_set_opt(info->mount_opt, COMPRESS);
+				btrfs_clear_opt(info->mount_opt, NODATACOW);
+				btrfs_clear_opt(info->mount_opt, NODATASUM);
+				btrfs_set_fs_incompat(info, COMPRESS_ZSTD);
+				no_compress = 0;
 			} else if (strncmp(args[0].from, "no", 2) == 0) {
 				compress_type = "no";
 				btrfs_clear_opt(info->mount_opt, COMPRESS);
@@ -1227,8 +1235,10 @@ static int btrfs_show_options(struct seq_file *seq, struct dentry *dentry)
 	if (btrfs_test_opt(info, COMPRESS)) {
 		if (info->compress_type == BTRFS_COMPRESS_ZLIB)
 			compress_type = "zlib";
-		else
+		else if (info->compress_type == BTRFS_COMPRESS_LZO)
 			compress_type = "lzo";
+		else
+			compress_type = "zstd";
 		if (btrfs_test_opt(info, FORCE_COMPRESS))
 			seq_printf(seq, ",compress-force=%s", compress_type);
 		else
diff --git a/fs/btrfs/sysfs.c b/fs/btrfs/sysfs.c
index c2d5f3580b4c..2b6d37c09a81 100644
--- a/fs/btrfs/sysfs.c
+++ b/fs/btrfs/sysfs.c
@@ -200,6 +200,7 @@ BTRFS_FEAT_ATTR_INCOMPAT(mixed_backref, MIXED_BACKREF);
 BTRFS_FEAT_ATTR_INCOMPAT(default_subvol, DEFAULT_SUBVOL);
 BTRFS_FEAT_ATTR_INCOMPAT(mixed_groups, MIXED_GROUPS);
 BTRFS_FEAT_ATTR_INCOMPAT(compress_lzo, COMPRESS_LZO);
+BTRFS_FEAT_ATTR_INCOMPAT(compress_zstd, COMPRESS_ZSTD);
 BTRFS_FEAT_ATTR_INCOMPAT(big_metadata, BIG_METADATA);
 BTRFS_FEAT_ATTR_INCOMPAT(extended_iref, EXTENDED_IREF);
 BTRFS_FEAT_ATTR_INCOMPAT(raid56, RAID56);
@@ -212,6 +213,7 @@ static struct attribute *btrfs_supported_feature_attrs[] = {
 	BTRFS_FEAT_ATTR_PTR(default_subvol),
 	BTRFS_FEAT_ATTR_PTR(mixed_groups),
 	BTRFS_FEAT_ATTR_PTR(compress_lzo),
+	BTRFS_FEAT_ATTR_PTR(compress_zstd),
 	BTRFS_FEAT_ATTR_PTR(big_metadata),
 	BTRFS_FEAT_ATTR_PTR(extended_iref),
 	BTRFS_FEAT_ATTR_PTR(raid56),
diff --git a/fs/btrfs/zstd.c b/fs/btrfs/zstd.c
new file mode 100644
index 000000000000..607ce47b483a
--- /dev/null
+++ b/fs/btrfs/zstd.c
@@ -0,0 +1,432 @@
+/*
+ * Copyright (c) 2016-present, Facebook, Inc.
+ * All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public
+ * License v2 as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License for more details.
+ */
+#include <linux/bio.h>
+#include <linux/err.h>
+#include <linux/init.h>
+#include <linux/kernel.h>
+#include <linux/mm.h>
+#include <linux/pagemap.h>
+#include <linux/refcount.h>
+#include <linux/sched.h>
+#include <linux/slab.h>
+#include <linux/zstd.h>
+#include "compression.h"
+
+#define ZSTD_BTRFS_MAX_WINDOWLOG 17
+#define ZSTD_BTRFS_MAX_INPUT (1 << ZSTD_BTRFS_MAX_WINDOWLOG)
+#define ZSTD_BTRFS_DEFAULT_LEVEL 3
+
+static ZSTD_parameters zstd_get_btrfs_parameters(size_t src_len)
+{
+	ZSTD_parameters params = ZSTD_getParams(ZSTD_BTRFS_DEFAULT_LEVEL,
+						src_len, 0);
+
+	if (params.cParams.windowLog > ZSTD_BTRFS_MAX_WINDOWLOG)
+		params.cParams.windowLog = ZSTD_BTRFS_MAX_WINDOWLOG;
+	WARN_ON(src_len > ZSTD_BTRFS_MAX_INPUT);
+	return params;
+}
+
+struct workspace {
+	void *mem;
+	size_t size;
+	char *buf;
+	struct list_head list;
+};
+
+static void zstd_free_workspace(struct list_head *ws)
+{
+	struct workspace *workspace = list_entry(ws, struct workspace, list);
+
+	kvfree(workspace->mem);
+	kfree(workspace->buf);
+	kfree(workspace);
+}
+
+static struct list_head *zstd_alloc_workspace(void)
+{
+	ZSTD_parameters params =
+			zstd_get_btrfs_parameters(ZSTD_BTRFS_MAX_INPUT);
+	struct workspace *workspace;
+
+	workspace = kzalloc(sizeof(*workspace), GFP_KERNEL);
+	if (!workspace)
+		return ERR_PTR(-ENOMEM);
+
+	workspace->size = max_t(size_t,
+			ZSTD_CStreamWorkspaceBound(params.cParams),
+			ZSTD_DStreamWorkspaceBound(ZSTD_BTRFS_MAX_INPUT));
+	workspace->mem = kvmalloc(workspace->size, GFP_KERNEL);
+	workspace->buf = kmalloc(PAGE_SIZE, GFP_KERNEL);
+	if (!workspace->mem || !workspace->buf)
+		goto fail;
+
+	INIT_LIST_HEAD(&workspace->list);
+
+	return &workspace->list;
+fail:
+	zstd_free_workspace(&workspace->list);
+	return ERR_PTR(-ENOMEM);
+}
+
+static int zstd_compress_pages(struct list_head *ws,
+		struct address_space *mapping,
+		u64 start,
+		struct page **pages,
+		unsigned long *out_pages,
+		unsigned long *total_in,
+		unsigned long *total_out)
+{
+	struct workspace *workspace = list_entry(ws, struct workspace, list);
+	ZSTD_CStream *stream;
+	int ret = 0;
+	int nr_pages = 0;
+	struct page *in_page = NULL;  /* The current page to read */
+	struct page *out_page = NULL; /* The current page to write to */
+	ZSTD_inBuffer in_buf = { NULL, 0, 0 };
+	ZSTD_outBuffer out_buf = { NULL, 0, 0 };
+	unsigned long tot_in = 0;
+	unsigned long tot_out = 0;
+	unsigned long len = *total_out;
+	const unsigned long nr_dest_pages = *out_pages;
+	unsigned long max_out = nr_dest_pages * PAGE_SIZE;
+	ZSTD_parameters params = zstd_get_btrfs_parameters(len);
+
+	*out_pages = 0;
+	*total_out = 0;
+	*total_in = 0;
+
+	/* Initialize the stream */
+	stream = ZSTD_initCStream(params, len, workspace->mem,
+			workspace->size);
+	if (!stream) {
+		pr_warn("BTRFS: ZSTD_initCStream failed\n");
+		ret = -EIO;
+		goto out;
+	}
+
+	/* map in the first page of input data */
+	in_page = find_get_page(mapping, start >> PAGE_SHIFT);
+	in_buf.src = kmap(in_page);
+	in_buf.pos = 0;
+	in_buf.size = min_t(size_t, len, PAGE_SIZE);
+
+
+	/* Allocate and map in the output buffer */
+	out_page = alloc_page(GFP_NOFS | __GFP_HIGHMEM);
+	if (out_page == NULL) {
+		ret = -ENOMEM;
+		goto out;
+	}
+	pages[nr_pages++] = out_page;
+	out_buf.dst = kmap(out_page);
+	out_buf.pos = 0;
+	out_buf.size = min_t(size_t, max_out, PAGE_SIZE);
+
+	while (1) {
+		size_t ret2;
+
+		ret2 = ZSTD_compressStream(stream, &out_buf, &in_buf);
+		if (ZSTD_isError(ret2)) {
+			pr_debug("BTRFS: ZSTD_compressStream returned %d\n",
+					ZSTD_getErrorCode(ret2));
+			ret = -EIO;
+			goto out;
+		}
+
+		/* Check to see if we are making it bigger */
+		if (tot_in + in_buf.pos > 8192 &&
+				tot_in + in_buf.pos <
+				tot_out + out_buf.pos) {
+			ret = -E2BIG;
+			goto out;
+		}
+
+		/* We've reached the end of our output range */
+		if (out_buf.pos >= max_out) {
+			tot_out += out_buf.pos;
+			ret = -E2BIG;
+			goto out;
+		}
+
+		/* Check if we need more output space */
+		if (out_buf.pos == out_buf.size) {
+			tot_out += PAGE_SIZE;
+			max_out -= PAGE_SIZE;
+			kunmap(out_page);
+			if (nr_pages == nr_dest_pages) {
+				out_page = NULL;
+				ret = -E2BIG;
+				goto out;
+			}
+			out_page = alloc_page(GFP_NOFS | __GFP_HIGHMEM);
+			if (out_page == NULL) {
+				ret = -ENOMEM;
+				goto out;
+			}
+			pages[nr_pages++] = out_page;
+			out_buf.dst = kmap(out_page);
+			out_buf.pos = 0;
+			out_buf.size = min_t(size_t, max_out, PAGE_SIZE);
+		}
+
+		/* We've reached the end of the input */
+		if (in_buf.pos >= len) {
+			tot_in += in_buf.pos;
+			break;
+		}
+
+		/* Check if we need more input */
+		if (in_buf.pos == in_buf.size) {
+			tot_in += PAGE_SIZE;
+			kunmap(in_page);
+			put_page(in_page);
+
+			start += PAGE_SIZE;
+			len -= PAGE_SIZE;
+			in_page = find_get_page(mapping, start >> PAGE_SHIFT);
+			in_buf.src = kmap(in_page);
+			in_buf.pos = 0;
+			in_buf.size = min_t(size_t, len, PAGE_SIZE);
+		}
+	}
+	while (1) {
+		size_t ret2;
+
+		ret2 = ZSTD_endStream(stream, &out_buf);
+		if (ZSTD_isError(ret2)) {
+			pr_debug("BTRFS: ZSTD_endStream returned %d\n",
+					ZSTD_getErrorCode(ret2));
+			ret = -EIO;
+			goto out;
+		}
+		if (ret2 == 0) {
+			tot_out += out_buf.pos;
+			break;
+		}
+		if (out_buf.pos >= max_out) {
+			tot_out += out_buf.pos;
+			ret = -E2BIG;
+			goto out;
+		}
+
+		tot_out += PAGE_SIZE;
+		max_out -= PAGE_SIZE;
+		kunmap(out_page);
+		if (nr_pages == nr_dest_pages) {
+			out_page = NULL;
+			ret = -E2BIG;
+			goto out;
+		}
+		out_page = alloc_page(GFP_NOFS | __GFP_HIGHMEM);
+		if (out_page == NULL) {
+			ret = -ENOMEM;
+			goto out;
+		}
+		pages[nr_pages++] = out_page;
+		out_buf.dst = kmap(out_page);
+		out_buf.pos = 0;
+		out_buf.size = min_t(size_t, max_out, PAGE_SIZE);
+	}
+
+	if (tot_out >= tot_in) {
+		ret = -E2BIG;
+		goto out;
+	}
+
+	ret = 0;
+	*total_in = tot_in;
+	*total_out = tot_out;
+out:
+	*out_pages = nr_pages;
+	/* Cleanup */
+	if (in_page) {
+		kunmap(in_page);
+		put_page(in_page);
+	}
+	if (out_page)
+		kunmap(out_page);
+	return ret;
+}
+
+static int zstd_decompress_bio(struct list_head *ws, struct compressed_bio *cb)
+{
+	struct workspace *workspace = list_entry(ws, struct workspace, list);
+	struct page **pages_in = cb->compressed_pages;
+	u64 disk_start = cb->start;
+	struct bio *orig_bio = cb->orig_bio;
+	size_t srclen = cb->compressed_len;
+	ZSTD_DStream *stream;
+	int ret = 0;
+	unsigned long page_in_index = 0;
+	unsigned long total_pages_in = DIV_ROUND_UP(srclen, PAGE_SIZE);
+	unsigned long buf_start;
+	unsigned long total_out = 0;
+	ZSTD_inBuffer in_buf = { NULL, 0, 0 };
+	ZSTD_outBuffer out_buf = { NULL, 0, 0 };
+
+	stream = ZSTD_initDStream(
+			ZSTD_BTRFS_MAX_INPUT, workspace->mem, workspace->size);
+	if (!stream) {
+		pr_debug("BTRFS: ZSTD_initDStream failed\n");
+		ret = -EIO;
+		goto done;
+	}
+
+	in_buf.src = kmap(pages_in[page_in_index]);
+	in_buf.pos = 0;
+	in_buf.size = min_t(size_t, srclen, PAGE_SIZE);
+
+	out_buf.dst = workspace->buf;
+	out_buf.pos = 0;
+	out_buf.size = PAGE_SIZE;
+
+	while (1) {
+		size_t ret2;
+
+		ret2 = ZSTD_decompressStream(stream, &out_buf, &in_buf);
+		if (ZSTD_isError(ret2)) {
+			pr_debug("BTRFS: ZSTD_decompressStream returned %d\n",
+					ZSTD_getErrorCode(ret2));
+			ret = -EIO;
+			goto done;
+		}
+		buf_start = total_out;
+		total_out += out_buf.pos;
+		out_buf.pos = 0;
+
+		ret = btrfs_decompress_buf2page(out_buf.dst, buf_start,
+				total_out, disk_start, orig_bio);
+		if (ret == 0)
+			break;
+
+		if (in_buf.pos >= srclen)
+			break;
+
+		/* Check if we've hit the end of a frame */
+		if (ret2 == 0)
+			break;
+
+		if (in_buf.pos == in_buf.size) {
+			kunmap(pages_in[page_in_index++]);
+			if (page_in_index >= total_pages_in) {
+				in_buf.src = NULL;
+				ret = -EIO;
+				goto done;
+			}
+			srclen -= PAGE_SIZE;
+			in_buf.src = kmap(pages_in[page_in_index]);
+			in_buf.pos = 0;
+			in_buf.size = min_t(size_t, srclen, PAGE_SIZE);
+		}
+	}
+	ret = 0;
+	zero_fill_bio(orig_bio);
+done:
+	if (in_buf.src)
+		kunmap(pages_in[page_in_index]);
+	return ret;
+}
+
+static int zstd_decompress(struct list_head *ws, unsigned char *data_in,
+		struct page *dest_page,
+		unsigned long start_byte,
+		size_t srclen, size_t destlen)
+{
+	struct workspace *workspace = list_entry(ws, struct workspace, list);
+	ZSTD_DStream *stream;
+	int ret = 0;
+	size_t ret2;
+	ZSTD_inBuffer in_buf = { NULL, 0, 0 };
+	ZSTD_outBuffer out_buf = { NULL, 0, 0 };
+	unsigned long total_out = 0;
+	unsigned long pg_offset = 0;
+	char *kaddr;
+
+	stream = ZSTD_initDStream(
+			ZSTD_BTRFS_MAX_INPUT, workspace->mem, workspace->size);
+	if (!stream) {
+		pr_warn("BTRFS: ZSTD_initDStream failed\n");
+		ret = -EIO;
+		goto finish;
+	}
+
+	destlen = min_t(size_t, destlen, PAGE_SIZE);
+
+	in_buf.src = data_in;
+	in_buf.pos = 0;
+	in_buf.size = srclen;
+
+	out_buf.dst = workspace->buf;
+	out_buf.pos = 0;
+	out_buf.size = PAGE_SIZE;
+
+	ret2 = 1;
+	while (pg_offset < destlen && in_buf.pos < in_buf.size) {
+		unsigned long buf_start;
+		unsigned long buf_offset;
+		unsigned long bytes;
+
+		/* Check if the frame is over and we still need more input */
+		if (ret2 == 0) {
+			pr_debug("BTRFS: ZSTD_decompressStream ended early\n");
+			ret = -EIO;
+			goto finish;
+		}
+		ret2 = ZSTD_decompressStream(stream, &out_buf, &in_buf);
+		if (ZSTD_isError(ret2)) {
+			pr_debug("BTRFS: ZSTD_decompressStream returned %d\n",
+					ZSTD_getErrorCode(ret2));
+			ret = -EIO;
+			goto finish;
+		}
+
+		buf_start = total_out;
+		total_out += out_buf.pos;
+		out_buf.pos = 0;
+
+		if (total_out <= start_byte)
+			continue;
+
+		if (total_out > start_byte && buf_start < start_byte)
+			buf_offset = start_byte - buf_start;
+		else
+			buf_offset = 0;
+
+		bytes = min_t(unsigned long, destlen - pg_offset,
+				out_buf.size - buf_offset);
+
+		kaddr = kmap_atomic(dest_page);
+		memcpy(kaddr + pg_offset, out_buf.dst + buf_offset, bytes);
+		kunmap_atomic(kaddr);
+
+		pg_offset += bytes;
+	}
+	ret = 0;
+finish:
+	if (pg_offset < destlen) {
+		kaddr = kmap_atomic(dest_page);
+		memset(kaddr + pg_offset, 0, destlen - pg_offset);
+		kunmap_atomic(kaddr);
+	}
+	return ret;
+}
+
+const struct btrfs_compress_op btrfs_zstd_compress = {
+	.alloc_workspace = zstd_alloc_workspace,
+	.free_workspace = zstd_free_workspace,
+	.compress_pages = zstd_compress_pages,
+	.decompress_bio = zstd_decompress_bio,
+	.decompress = zstd_decompress,
+};
diff --git a/include/uapi/linux/btrfs.h b/include/uapi/linux/btrfs.h
index 9aa74f317747..378230c163d5 100644
--- a/include/uapi/linux/btrfs.h
+++ b/include/uapi/linux/btrfs.h
@@ -255,13 +255,7 @@ struct btrfs_ioctl_fs_info_args {
 #define BTRFS_FEATURE_INCOMPAT_DEFAULT_SUBVOL	(1ULL << 1)
 #define BTRFS_FEATURE_INCOMPAT_MIXED_GROUPS	(1ULL << 2)
 #define BTRFS_FEATURE_INCOMPAT_COMPRESS_LZO	(1ULL << 3)
-/*
- * some patches floated around with a second compression method
- * lets save that incompat here for when they do get in
- * Note we don't actually support it, we're just reserving the
- * number
- */
-#define BTRFS_FEATURE_INCOMPAT_COMPRESS_LZOv2	(1ULL << 4)
+#define BTRFS_FEATURE_INCOMPAT_COMPRESS_ZSTD	(1ULL << 4)
 
 /*
  * older kernels tried to do bigger metadata blocks, but the
-- 
cgit v1.2.3-71-gd317


From fe4007999599c02598c17b643e8de43e487d48e8 Mon Sep 17 00:00:00 2001
From: Ido Schimmel <idosch@mellanox.com>
Date: Tue, 15 Aug 2017 09:09:49 +0200
Subject: ipv6: fib: Provide offload indication using nexthop flags

IPv6 routes currently lack nexthop flags as in IPv4. This has several
implications.

In the forwarding path, it requires us to check the carrier state of the
nexthop device and potentially ignore a linkdown route, instead of
checking for RTNH_F_LINKDOWN.

It also requires capable drivers to use the user facing IPv6-specific
route flags to provide offload indication, instead of using the nexthop
flags as in IPv4.

Add nexthop flags to IPv6 routes in the 40 bytes hole and use it to
provide offload indication instead of the RTF_OFFLOAD flag, which is
removed while it's still not part of any official kernel release.

In the near future we would like to use the field for the
RTNH_F_{LINKDOWN,DEAD} flags, but this change is more involved and might
not be ready in time for the current cycle.

Signed-off-by: Ido Schimmel <idosch@mellanox.com>
Signed-off-by: Jiri Pirko <jiri@mellanox.com>
Acked-by: David Ahern <dsahern@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/ethernet/mellanox/mlxsw/spectrum_router.c | 8 ++++----
 include/net/ip6_fib.h                                 | 2 ++
 include/uapi/linux/ipv6_route.h                       | 1 -
 net/ipv6/route.c                                      | 7 +------
 4 files changed, 7 insertions(+), 11 deletions(-)

(limited to 'include/uapi/linux')

diff --git a/drivers/net/ethernet/mellanox/mlxsw/spectrum_router.c b/drivers/net/ethernet/mellanox/mlxsw/spectrum_router.c
index 16676fffbf70..4895d5b8942b 100644
--- a/drivers/net/ethernet/mellanox/mlxsw/spectrum_router.c
+++ b/drivers/net/ethernet/mellanox/mlxsw/spectrum_router.c
@@ -2397,7 +2397,7 @@ mlxsw_sp_fib6_entry_offload_set(struct mlxsw_sp_fib_entry *fib_entry)
 
 	if (fib_entry->type == MLXSW_SP_FIB_ENTRY_TYPE_LOCAL) {
 		list_first_entry(&fib6_entry->rt6_list, struct mlxsw_sp_rt6,
-				 list)->rt->rt6i_flags |= RTF_OFFLOAD;
+				 list)->rt->rt6i_nh_flags |= RTNH_F_OFFLOAD;
 		return;
 	}
 
@@ -2407,9 +2407,9 @@ mlxsw_sp_fib6_entry_offload_set(struct mlxsw_sp_fib_entry *fib_entry)
 
 		nh = mlxsw_sp_rt6_nexthop(nh_grp, mlxsw_sp_rt6);
 		if (nh && nh->offloaded)
-			mlxsw_sp_rt6->rt->rt6i_flags |= RTF_OFFLOAD;
+			mlxsw_sp_rt6->rt->rt6i_nh_flags |= RTNH_F_OFFLOAD;
 		else
-			mlxsw_sp_rt6->rt->rt6i_flags &= ~RTF_OFFLOAD;
+			mlxsw_sp_rt6->rt->rt6i_nh_flags &= ~RTNH_F_OFFLOAD;
 	}
 }
 
@@ -2424,7 +2424,7 @@ mlxsw_sp_fib6_entry_offload_unset(struct mlxsw_sp_fib_entry *fib_entry)
 	list_for_each_entry(mlxsw_sp_rt6, &fib6_entry->rt6_list, list) {
 		struct rt6_info *rt = mlxsw_sp_rt6->rt;
 
-		rt->rt6i_flags &= ~RTF_OFFLOAD;
+		rt->rt6i_nh_flags &= ~RTNH_F_OFFLOAD;
 	}
 }
 
diff --git a/include/net/ip6_fib.h b/include/net/ip6_fib.h
index 1d790ea40ea7..71c1646298ae 100644
--- a/include/net/ip6_fib.h
+++ b/include/net/ip6_fib.h
@@ -120,6 +120,8 @@ struct rt6_info {
 
 	atomic_t			rt6i_ref;
 
+	unsigned int			rt6i_nh_flags;
+
 	/* These are in a separate cache line. */
 	struct rt6key			rt6i_dst ____cacheline_aligned_in_smp;
 	u32				rt6i_flags;
diff --git a/include/uapi/linux/ipv6_route.h b/include/uapi/linux/ipv6_route.h
index 33e2a5732bd1..d496c02e14bc 100644
--- a/include/uapi/linux/ipv6_route.h
+++ b/include/uapi/linux/ipv6_route.h
@@ -35,7 +35,6 @@
 #define RTF_PREF(pref)	((pref) << 27)
 #define RTF_PREF_MASK	0x18000000
 
-#define RTF_OFFLOAD	0x20000000	/* offloaded route		*/
 #define RTF_PCPU	0x40000000	/* read-only: can not be set by user */
 #define RTF_LOCAL	0x80000000
 
diff --git a/net/ipv6/route.c b/net/ipv6/route.c
index 035762fed07d..6793135d49db 100644
--- a/net/ipv6/route.c
+++ b/net/ipv6/route.c
@@ -1820,11 +1820,6 @@ static struct rt6_info *ip6_route_info_create(struct fib6_config *cfg,
 		goto out;
 	}
 
-	if (cfg->fc_flags & RTF_OFFLOAD) {
-		NL_SET_ERR_MSG(extack, "Userspace can not set RTF_OFFLOAD");
-		goto out;
-	}
-
 	if (cfg->fc_dst_len > 128) {
 		NL_SET_ERR_MSG(extack, "Invalid prefix length");
 		goto out;
@@ -3335,7 +3330,7 @@ static int rt6_nexthop_info(struct sk_buff *skb, struct rt6_info *rt,
 			goto nla_put_failure;
 	}
 
-	if (rt->rt6i_flags & RTF_OFFLOAD)
+	if (rt->rt6i_nh_flags & RTNH_F_OFFLOAD)
 		*flags |= RTNH_F_OFFLOAD;
 
 	/* not needed for multipath encoding b/c it has a rtnexthop struct */
-- 
cgit v1.2.3-71-gd317


From b005fd189cec9407b700599e1e80e0552446ee79 Mon Sep 17 00:00:00 2001
From: John Fastabend <john.fastabend@gmail.com>
Date: Tue, 15 Aug 2017 22:31:58 -0700
Subject: bpf: introduce new program type for skbs on sockets

A class of programs, run from strparser and soon from a new map type
called sock map, are used with skb as the context but on established
sockets. By creating a specific program type for these we can use
bpf helpers that expect full sockets and get the verifier to ensure
these helpers are not used out of context.

The new type is BPF_PROG_TYPE_SK_SKB. This patch introduces the
infrastructure and type.

Signed-off-by: John Fastabend <john.fastabend@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/bpf_types.h |  1 +
 include/uapi/linux/bpf.h  |  1 +
 net/core/filter.c         | 36 ++++++++++++++++++++++++++++++++++++
 3 files changed, 38 insertions(+)

(limited to 'include/uapi/linux')

diff --git a/include/linux/bpf_types.h b/include/linux/bpf_types.h
index b1e1035ca24b..4b72db30dacf 100644
--- a/include/linux/bpf_types.h
+++ b/include/linux/bpf_types.h
@@ -11,6 +11,7 @@ BPF_PROG_TYPE(BPF_PROG_TYPE_LWT_IN, lwt_inout_prog_ops)
 BPF_PROG_TYPE(BPF_PROG_TYPE_LWT_OUT, lwt_inout_prog_ops)
 BPF_PROG_TYPE(BPF_PROG_TYPE_LWT_XMIT, lwt_xmit_prog_ops)
 BPF_PROG_TYPE(BPF_PROG_TYPE_SOCK_OPS, sock_ops_prog_ops)
+BPF_PROG_TYPE(BPF_PROG_TYPE_SK_SKB, sk_skb_prog_ops)
 #endif
 #ifdef CONFIG_BPF_EVENTS
 BPF_PROG_TYPE(BPF_PROG_TYPE_KPROBE, kprobe_prog_ops)
diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h
index 91da8371a2d0..2e796e384aeb 100644
--- a/include/uapi/linux/bpf.h
+++ b/include/uapi/linux/bpf.h
@@ -127,6 +127,7 @@ enum bpf_prog_type {
 	BPF_PROG_TYPE_LWT_OUT,
 	BPF_PROG_TYPE_LWT_XMIT,
 	BPF_PROG_TYPE_SOCK_OPS,
+	BPF_PROG_TYPE_SK_SKB,
 };
 
 enum bpf_attach_type {
diff --git a/net/core/filter.c b/net/core/filter.c
index e0688a855c47..46321033ae0e 100644
--- a/net/core/filter.c
+++ b/net/core/filter.c
@@ -3234,6 +3234,20 @@ static const struct bpf_func_proto *
 	}
 }
 
+static const struct bpf_func_proto *sk_skb_func_proto(enum bpf_func_id func_id)
+{
+	switch (func_id) {
+	case BPF_FUNC_skb_load_bytes:
+		return &bpf_skb_load_bytes_proto;
+	case BPF_FUNC_get_socket_cookie:
+		return &bpf_get_socket_cookie_proto;
+	case BPF_FUNC_get_socket_uid:
+		return &bpf_get_socket_uid_proto;
+	default:
+		return bpf_base_func_proto(func_id);
+	}
+}
+
 static const struct bpf_func_proto *
 lwt_xmit_func_proto(enum bpf_func_id func_id)
 {
@@ -3525,6 +3539,22 @@ static bool sock_ops_is_valid_access(int off, int size,
 	return __is_valid_sock_ops_access(off, size);
 }
 
+static bool sk_skb_is_valid_access(int off, int size,
+				   enum bpf_access_type type,
+				   struct bpf_insn_access_aux *info)
+{
+	switch (off) {
+	case bpf_ctx_range(struct __sk_buff, data):
+		info->reg_type = PTR_TO_PACKET;
+		break;
+	case bpf_ctx_range(struct __sk_buff, data_end):
+		info->reg_type = PTR_TO_PACKET_END;
+		break;
+	}
+
+	return bpf_skb_is_valid_access(off, size, type, info);
+}
+
 static u32 bpf_convert_ctx_access(enum bpf_access_type type,
 				  const struct bpf_insn *si,
 				  struct bpf_insn *insn_buf,
@@ -3994,6 +4024,12 @@ const struct bpf_verifier_ops sock_ops_prog_ops = {
 	.convert_ctx_access	= sock_ops_convert_ctx_access,
 };
 
+const struct bpf_verifier_ops sk_skb_prog_ops = {
+	.get_func_proto		= sk_skb_func_proto,
+	.is_valid_access	= sk_skb_is_valid_access,
+	.convert_ctx_access	= bpf_convert_ctx_access,
+};
+
 int sk_detach_filter(struct sock *sk)
 {
 	int ret = -ENOENT;
-- 
cgit v1.2.3-71-gd317


From 174a79ff9515f400b9a6115643dafd62a635b7e6 Mon Sep 17 00:00:00 2001
From: John Fastabend <john.fastabend@gmail.com>
Date: Tue, 15 Aug 2017 22:32:47 -0700
Subject: bpf: sockmap with sk redirect support

Recently we added a new map type called dev map used to forward XDP
packets between ports (6093ec2dc313). This patches introduces a
similar notion for sockets.

A sockmap allows users to add participating sockets to a map. When
sockets are added to the map enough context is stored with the
map entry to use the entry with a new helper

  bpf_sk_redirect_map(map, key, flags)

This helper (analogous to bpf_redirect_map in XDP) is given the map
and an entry in the map. When called from a sockmap program, discussed
below, the skb will be sent on the socket using skb_send_sock().

With the above we need a bpf program to call the helper from that will
then implement the send logic. The initial site implemented in this
series is the recv_sock hook. For this to work we implemented a map
attach command to add attributes to a map. In sockmap we add two
programs a parse program and a verdict program. The parse program
uses strparser to build messages and pass them to the verdict program.
The parse programs use the normal strparser semantics. The verdict
program is of type SK_SKB.

The verdict program returns a verdict SK_DROP, or  SK_REDIRECT for
now. Additional actions may be added later. When SK_REDIRECT is
returned, expected when bpf program uses bpf_sk_redirect_map(), the
sockmap logic will consult per cpu variables set by the helper routine
and pull the sock entry out of the sock map. This pattern follows the
existing redirect logic in cls and xdp programs.

This gives the flow,

 recv_sock -> str_parser (parse_prog) -> verdict_prog -> skb_send_sock
                                                     \
                                                      -> kfree_skb

As an example use case a message based load balancer may use specific
logic in the verdict program to select the sock to send on.

Sample programs are provided in future patches that hopefully illustrate
the user interfaces. Also selftests are in follow-on patches.

Signed-off-by: John Fastabend <john.fastabend@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/bpf.h       |   7 +-
 include/linux/bpf_types.h |   1 +
 include/linux/filter.h    |   2 +
 include/uapi/linux/bpf.h  |  33 +-
 kernel/bpf/Makefile       |   2 +-
 kernel/bpf/sockmap.c      | 792 ++++++++++++++++++++++++++++++++++++++++++++++
 kernel/bpf/syscall.c      |  51 ++-
 kernel/bpf/verifier.c     |  14 +
 net/core/filter.c         |  43 +++
 9 files changed, 940 insertions(+), 5 deletions(-)
 create mode 100644 kernel/bpf/sockmap.c

(limited to 'include/uapi/linux')

diff --git a/include/linux/bpf.h b/include/linux/bpf.h
index d6e1de8ce0fc..a4145e9c74b5 100644
--- a/include/linux/bpf.h
+++ b/include/linux/bpf.h
@@ -16,6 +16,7 @@
 #include <linux/rbtree_latch.h>
 
 struct perf_event;
+struct bpf_prog;
 struct bpf_map;
 
 /* map is generic key/value storage optionally accesible by eBPF programs */
@@ -37,6 +38,8 @@ struct bpf_map_ops {
 	void (*map_fd_put_ptr)(void *ptr);
 	u32 (*map_gen_lookup)(struct bpf_map *map, struct bpf_insn *insn_buf);
 	u32 (*map_fd_sys_lookup_elem)(void *ptr);
+	int (*map_attach)(struct bpf_map *map,
+			  struct bpf_prog *p1, struct bpf_prog *p2);
 };
 
 struct bpf_map {
@@ -138,8 +141,6 @@ enum bpf_reg_type {
 	PTR_TO_PACKET_END,	 /* skb->data + headlen */
 };
 
-struct bpf_prog;
-
 /* The information passed from prog-specific *_is_valid_access
  * back to the verifier.
  */
@@ -312,6 +313,7 @@ int bpf_check(struct bpf_prog **fp, union bpf_attr *attr);
 
 /* Map specifics */
 struct net_device  *__dev_map_lookup_elem(struct bpf_map *map, u32 key);
+struct sock  *__sock_map_lookup_elem(struct bpf_map *map, u32 key);
 void __dev_map_insert_ctx(struct bpf_map *map, u32 index);
 void __dev_map_flush(struct bpf_map *map);
 
@@ -391,6 +393,7 @@ extern const struct bpf_func_proto bpf_get_current_comm_proto;
 extern const struct bpf_func_proto bpf_skb_vlan_push_proto;
 extern const struct bpf_func_proto bpf_skb_vlan_pop_proto;
 extern const struct bpf_func_proto bpf_get_stackid_proto;
+extern const struct bpf_func_proto bpf_sock_map_update_proto;
 
 /* Shared helpers among cBPF and eBPF. */
 void bpf_user_rnd_init_once(void);
diff --git a/include/linux/bpf_types.h b/include/linux/bpf_types.h
index 4b72db30dacf..fa805074d168 100644
--- a/include/linux/bpf_types.h
+++ b/include/linux/bpf_types.h
@@ -38,4 +38,5 @@ BPF_MAP_TYPE(BPF_MAP_TYPE_ARRAY_OF_MAPS, array_of_maps_map_ops)
 BPF_MAP_TYPE(BPF_MAP_TYPE_HASH_OF_MAPS, htab_of_maps_map_ops)
 #ifdef CONFIG_NET
 BPF_MAP_TYPE(BPF_MAP_TYPE_DEVMAP, dev_map_ops)
+BPF_MAP_TYPE(BPF_MAP_TYPE_SOCKMAP, sock_map_ops)
 #endif
diff --git a/include/linux/filter.h b/include/linux/filter.h
index d19ed3c15e1e..7015116331af 100644
--- a/include/linux/filter.h
+++ b/include/linux/filter.h
@@ -727,6 +727,8 @@ void xdp_do_flush_map(void);
 void bpf_warn_invalid_xdp_action(u32 act);
 void bpf_warn_invalid_xdp_redirect(u32 ifindex);
 
+struct sock *do_sk_redirect_map(void);
+
 #ifdef CONFIG_BPF_JIT
 extern int bpf_jit_enable;
 extern int bpf_jit_harden;
diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h
index 2e796e384aeb..7f774769e3f5 100644
--- a/include/uapi/linux/bpf.h
+++ b/include/uapi/linux/bpf.h
@@ -110,6 +110,7 @@ enum bpf_map_type {
 	BPF_MAP_TYPE_ARRAY_OF_MAPS,
 	BPF_MAP_TYPE_HASH_OF_MAPS,
 	BPF_MAP_TYPE_DEVMAP,
+	BPF_MAP_TYPE_SOCKMAP,
 };
 
 enum bpf_prog_type {
@@ -135,11 +136,15 @@ enum bpf_attach_type {
 	BPF_CGROUP_INET_EGRESS,
 	BPF_CGROUP_INET_SOCK_CREATE,
 	BPF_CGROUP_SOCK_OPS,
+	BPF_CGROUP_SMAP_INGRESS,
 	__MAX_BPF_ATTACH_TYPE
 };
 
 #define MAX_BPF_ATTACH_TYPE __MAX_BPF_ATTACH_TYPE
 
+/* If BPF_SOCKMAP_STRPARSER is used sockmap will use strparser on receive */
+#define BPF_SOCKMAP_STRPARSER	(1U << 0)
+
 /* If BPF_F_ALLOW_OVERRIDE flag is used in BPF_PROG_ATTACH command
  * to the given target_fd cgroup the descendent cgroup will be able to
  * override effective bpf program that was inherited from this cgroup
@@ -211,6 +216,7 @@ union bpf_attr {
 		__u32		attach_bpf_fd;	/* eBPF program to attach */
 		__u32		attach_type;
 		__u32		attach_flags;
+		__u32		attach_bpf_fd2;
 	};
 
 	struct { /* anonymous struct used by BPF_PROG_TEST_RUN command */
@@ -557,6 +563,23 @@ union bpf_attr {
  *     @mode: operation mode (enum bpf_adj_room_mode)
  *     @flags: reserved for future use
  *     Return: 0 on success or negative error code
+ *
+ * int bpf_sk_redirect_map(map, key, flags)
+ *     Redirect skb to a sock in map using key as a lookup key for the
+ *     sock in map.
+ *     @map: pointer to sockmap
+ *     @key: key to lookup sock in map
+ *     @flags: reserved for future use
+ *     Return: SK_REDIRECT
+ *
+ * int bpf_sock_map_update(skops, map, key, flags, map_flags)
+ *	@skops: pointer to bpf_sock_ops
+ *	@map: pointer to sockmap to update
+ *	@key: key to insert/update sock in map
+ *	@flags: same flags as map update elem
+ *	@map_flags: sock map specific flags
+ *	   bit 1: Enable strparser
+ *	   other bits: reserved
  */
 #define __BPF_FUNC_MAPPER(FN)		\
 	FN(unspec),			\
@@ -610,7 +633,9 @@ union bpf_attr {
 	FN(set_hash),			\
 	FN(setsockopt),			\
 	FN(skb_adjust_room),		\
-	FN(redirect_map),
+	FN(redirect_map),		\
+	FN(sk_redirect_map),		\
+	FN(sock_map_update),		\
 
 /* integer value in 'imm' field of BPF_CALL instruction selects which helper
  * function eBPF program intends to call
@@ -747,6 +772,12 @@ struct xdp_md {
 	__u32 data_end;
 };
 
+enum sk_action {
+	SK_ABORTED = 0,
+	SK_DROP,
+	SK_REDIRECT,
+};
+
 #define BPF_TAG_SIZE	8
 
 struct bpf_prog_info {
diff --git a/kernel/bpf/Makefile b/kernel/bpf/Makefile
index 2f0bcda40e90..aa24287db888 100644
--- a/kernel/bpf/Makefile
+++ b/kernel/bpf/Makefile
@@ -3,7 +3,7 @@ obj-y := core.o
 obj-$(CONFIG_BPF_SYSCALL) += syscall.o verifier.o inode.o helpers.o tnum.o
 obj-$(CONFIG_BPF_SYSCALL) += hashtab.o arraymap.o percpu_freelist.o bpf_lru_list.o lpm_trie.o map_in_map.o
 ifeq ($(CONFIG_NET),y)
-obj-$(CONFIG_BPF_SYSCALL) += devmap.o
+obj-$(CONFIG_BPF_SYSCALL) += devmap.o sockmap.o
 endif
 ifeq ($(CONFIG_PERF_EVENTS),y)
 obj-$(CONFIG_BPF_SYSCALL) += stackmap.o
diff --git a/kernel/bpf/sockmap.c b/kernel/bpf/sockmap.c
new file mode 100644
index 000000000000..792f0addfafa
--- /dev/null
+++ b/kernel/bpf/sockmap.c
@@ -0,0 +1,792 @@
+/* Copyright (c) 2017 Covalent IO, Inc. http://covalent.io
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of version 2 of the GNU General Public
+ * License as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License for more details.
+ */
+
+/* A BPF sock_map is used to store sock objects. This is primarly used
+ * for doing socket redirect with BPF helper routines.
+ *
+ * A sock map may have two BPF programs attached to it, a program used
+ * to parse packets and a program to provide a verdict and redirect
+ * decision on the packet. If no BPF parse program is provided it is
+ * assumed that every skb is a "message" (skb->len). Otherwise the
+ * parse program is attached to strparser and used to build messages
+ * that may span multiple skbs. The verdict program will either select
+ * a socket to send/receive the skb on or provide the drop code indicating
+ * the skb should be dropped. More actions may be added later as needed.
+ * The default program will drop packets.
+ *
+ * For reference this program is similar to devmap used in XDP context
+ * reviewing these together may be useful. For an example please review
+ * ./samples/bpf/sockmap/.
+ */
+#include <linux/bpf.h>
+#include <net/sock.h>
+#include <linux/filter.h>
+#include <linux/errno.h>
+#include <linux/file.h>
+#include <linux/kernel.h>
+#include <linux/net.h>
+#include <linux/skbuff.h>
+#include <linux/workqueue.h>
+#include <linux/list.h>
+#include <net/strparser.h>
+
+struct bpf_stab {
+	struct bpf_map map;
+	struct sock **sock_map;
+	struct bpf_prog *bpf_parse;
+	struct bpf_prog *bpf_verdict;
+	refcount_t refcnt;
+};
+
+enum smap_psock_state {
+	SMAP_TX_RUNNING,
+};
+
+struct smap_psock {
+	struct rcu_head	rcu;
+
+	/* datapath variables */
+	struct sk_buff_head rxqueue;
+	bool strp_enabled;
+
+	/* datapath error path cache across tx work invocations */
+	int save_rem;
+	int save_off;
+	struct sk_buff *save_skb;
+
+	struct strparser strp;
+	struct bpf_prog *bpf_parse;
+	struct bpf_prog *bpf_verdict;
+	struct bpf_stab *stab;
+
+	/* Back reference used when sock callback trigger sockmap operations */
+	int key;
+	struct sock *sock;
+	unsigned long state;
+
+	struct work_struct tx_work;
+	struct work_struct gc_work;
+
+	void (*save_data_ready)(struct sock *sk);
+	void (*save_write_space)(struct sock *sk);
+	void (*save_state_change)(struct sock *sk);
+};
+
+static inline struct smap_psock *smap_psock_sk(const struct sock *sk)
+{
+	return (struct smap_psock *)rcu_dereference_sk_user_data(sk);
+}
+
+static int smap_verdict_func(struct smap_psock *psock, struct sk_buff *skb)
+{
+	struct bpf_prog *prog = READ_ONCE(psock->bpf_verdict);
+	int rc;
+
+	if (unlikely(!prog))
+		return SK_DROP;
+
+	skb_orphan(skb);
+	skb->sk = psock->sock;
+	bpf_compute_data_end(skb);
+	rc = (*prog->bpf_func)(skb, prog->insnsi);
+	skb->sk = NULL;
+
+	return rc;
+}
+
+static void smap_do_verdict(struct smap_psock *psock, struct sk_buff *skb)
+{
+	struct sock *sock;
+	int rc;
+
+	/* Because we use per cpu values to feed input from sock redirect
+	 * in BPF program to do_sk_redirect_map() call we need to ensure we
+	 * are not preempted. RCU read lock is not sufficient in this case
+	 * with CONFIG_PREEMPT_RCU enabled so we must be explicit here.
+	 */
+	preempt_disable();
+	rc = smap_verdict_func(psock, skb);
+	switch (rc) {
+	case SK_REDIRECT:
+		sock = do_sk_redirect_map();
+		preempt_enable();
+		if (likely(sock)) {
+			struct smap_psock *peer = smap_psock_sk(sock);
+
+			if (likely(peer &&
+				   test_bit(SMAP_TX_RUNNING, &peer->state) &&
+				   sk_stream_memory_free(peer->sock))) {
+				peer->sock->sk_wmem_queued += skb->truesize;
+				sk_mem_charge(peer->sock, skb->truesize);
+				skb_queue_tail(&peer->rxqueue, skb);
+				schedule_work(&peer->tx_work);
+				break;
+			}
+		}
+	/* Fall through and free skb otherwise */
+	case SK_DROP:
+	default:
+		preempt_enable();
+		kfree_skb(skb);
+	}
+}
+
+static void smap_report_sk_error(struct smap_psock *psock, int err)
+{
+	struct sock *sk = psock->sock;
+
+	sk->sk_err = err;
+	sk->sk_error_report(sk);
+}
+
+static void smap_release_sock(struct sock *sock);
+
+/* Called with lock_sock(sk) held */
+static void smap_state_change(struct sock *sk)
+{
+	struct smap_psock *psock;
+	struct sock *osk;
+
+	rcu_read_lock();
+
+	/* Allowing transitions into an established syn_recv states allows
+	 * for early binding sockets to a smap object before the connection
+	 * is established.
+	 */
+	switch (sk->sk_state) {
+	case TCP_SYN_RECV:
+	case TCP_ESTABLISHED:
+		break;
+	case TCP_CLOSE_WAIT:
+	case TCP_CLOSING:
+	case TCP_LAST_ACK:
+	case TCP_FIN_WAIT1:
+	case TCP_FIN_WAIT2:
+	case TCP_LISTEN:
+		break;
+	case TCP_CLOSE:
+		/* Only release if the map entry is in fact the sock in
+		 * question. There is a case where the operator deletes
+		 * the sock from the map, but the TCP sock is closed before
+		 * the psock is detached. Use cmpxchg to verify correct
+		 * sock is removed.
+		 */
+		psock = smap_psock_sk(sk);
+		if (unlikely(!psock))
+			break;
+		osk = cmpxchg(&psock->stab->sock_map[psock->key], sk, NULL);
+		if (osk == sk)
+			smap_release_sock(sk);
+		break;
+	default:
+		smap_report_sk_error(psock, EPIPE);
+		break;
+	}
+	rcu_read_unlock();
+}
+
+static void smap_read_sock_strparser(struct strparser *strp,
+				     struct sk_buff *skb)
+{
+	struct smap_psock *psock;
+
+	rcu_read_lock();
+	psock = container_of(strp, struct smap_psock, strp);
+	smap_do_verdict(psock, skb);
+	rcu_read_unlock();
+}
+
+/* Called with lock held on socket */
+static void smap_data_ready(struct sock *sk)
+{
+	struct smap_psock *psock;
+
+	write_lock_bh(&sk->sk_callback_lock);
+	psock = smap_psock_sk(sk);
+	if (likely(psock))
+		strp_data_ready(&psock->strp);
+	write_unlock_bh(&sk->sk_callback_lock);
+}
+
+static void smap_tx_work(struct work_struct *w)
+{
+	struct smap_psock *psock;
+	struct sk_buff *skb;
+	int rem, off, n;
+
+	psock = container_of(w, struct smap_psock, tx_work);
+
+	/* lock sock to avoid losing sk_socket at some point during loop */
+	lock_sock(psock->sock);
+	if (psock->save_skb) {
+		skb = psock->save_skb;
+		rem = psock->save_rem;
+		off = psock->save_off;
+		psock->save_skb = NULL;
+		goto start;
+	}
+
+	while ((skb = skb_dequeue(&psock->rxqueue))) {
+		rem = skb->len;
+		off = 0;
+start:
+		do {
+			if (likely(psock->sock->sk_socket))
+				n = skb_send_sock_locked(psock->sock,
+							 skb, off, rem);
+			else
+				n = -EINVAL;
+			if (n <= 0) {
+				if (n == -EAGAIN) {
+					/* Retry when space is available */
+					psock->save_skb = skb;
+					psock->save_rem = rem;
+					psock->save_off = off;
+					goto out;
+				}
+				/* Hard errors break pipe and stop xmit */
+				smap_report_sk_error(psock, n ? -n : EPIPE);
+				clear_bit(SMAP_TX_RUNNING, &psock->state);
+				sk_mem_uncharge(psock->sock, skb->truesize);
+				psock->sock->sk_wmem_queued -= skb->truesize;
+				kfree_skb(skb);
+				goto out;
+			}
+			rem -= n;
+			off += n;
+		} while (rem);
+		sk_mem_uncharge(psock->sock, skb->truesize);
+		psock->sock->sk_wmem_queued -= skb->truesize;
+		kfree_skb(skb);
+	}
+out:
+	release_sock(psock->sock);
+}
+
+static void smap_write_space(struct sock *sk)
+{
+	struct smap_psock *psock;
+
+	rcu_read_lock();
+	psock = smap_psock_sk(sk);
+	if (likely(psock && test_bit(SMAP_TX_RUNNING, &psock->state)))
+		schedule_work(&psock->tx_work);
+	rcu_read_unlock();
+}
+
+static void smap_stop_sock(struct smap_psock *psock, struct sock *sk)
+{
+	write_lock_bh(&sk->sk_callback_lock);
+	if (!psock->strp_enabled)
+		goto out;
+	sk->sk_data_ready = psock->save_data_ready;
+	sk->sk_write_space = psock->save_write_space;
+	sk->sk_state_change = psock->save_state_change;
+	psock->save_data_ready = NULL;
+	psock->save_write_space = NULL;
+	psock->save_state_change = NULL;
+	strp_stop(&psock->strp);
+	psock->strp_enabled = false;
+out:
+	write_unlock_bh(&sk->sk_callback_lock);
+}
+
+static void smap_destroy_psock(struct rcu_head *rcu)
+{
+	struct smap_psock *psock = container_of(rcu,
+						  struct smap_psock, rcu);
+
+	/* Now that a grace period has passed there is no longer
+	 * any reference to this sock in the sockmap so we can
+	 * destroy the psock, strparser, and bpf programs. But,
+	 * because we use workqueue sync operations we can not
+	 * do it in rcu context
+	 */
+	schedule_work(&psock->gc_work);
+}
+
+static void smap_release_sock(struct sock *sock)
+{
+	struct smap_psock *psock = smap_psock_sk(sock);
+
+	smap_stop_sock(psock, sock);
+	clear_bit(SMAP_TX_RUNNING, &psock->state);
+	rcu_assign_sk_user_data(sock, NULL);
+	call_rcu_sched(&psock->rcu, smap_destroy_psock);
+}
+
+static int smap_parse_func_strparser(struct strparser *strp,
+				       struct sk_buff *skb)
+{
+	struct smap_psock *psock;
+	struct bpf_prog *prog;
+	int rc;
+
+	rcu_read_lock();
+	psock = container_of(strp, struct smap_psock, strp);
+	prog = READ_ONCE(psock->bpf_parse);
+
+	if (unlikely(!prog)) {
+		rcu_read_unlock();
+		return skb->len;
+	}
+
+	/* Attach socket for bpf program to use if needed we can do this
+	 * because strparser clones the skb before handing it to a upper
+	 * layer, meaning skb_orphan has been called. We NULL sk on the
+	 * way out to ensure we don't trigger a BUG_ON in skb/sk operations
+	 * later and because we are not charging the memory of this skb to
+	 * any socket yet.
+	 */
+	skb->sk = psock->sock;
+	bpf_compute_data_end(skb);
+	rc = (*prog->bpf_func)(skb, prog->insnsi);
+	skb->sk = NULL;
+	rcu_read_unlock();
+	return rc;
+}
+
+
+static int smap_read_sock_done(struct strparser *strp, int err)
+{
+	return err;
+}
+
+static int smap_init_sock(struct smap_psock *psock,
+			  struct sock *sk)
+{
+	struct strp_callbacks cb;
+
+	memset(&cb, 0, sizeof(cb));
+	cb.rcv_msg = smap_read_sock_strparser;
+	cb.parse_msg = smap_parse_func_strparser;
+	cb.read_sock_done = smap_read_sock_done;
+	return strp_init(&psock->strp, sk, &cb);
+}
+
+static void smap_init_progs(struct smap_psock *psock,
+			    struct bpf_stab *stab,
+			    struct bpf_prog *verdict,
+			    struct bpf_prog *parse)
+{
+	struct bpf_prog *orig_parse, *orig_verdict;
+
+	orig_parse = xchg(&psock->bpf_parse, parse);
+	orig_verdict = xchg(&psock->bpf_verdict, verdict);
+
+	if (orig_verdict)
+		bpf_prog_put(orig_verdict);
+	if (orig_parse)
+		bpf_prog_put(orig_parse);
+}
+
+static void smap_start_sock(struct smap_psock *psock, struct sock *sk)
+{
+	if (sk->sk_data_ready == smap_data_ready)
+		return;
+	psock->save_data_ready = sk->sk_data_ready;
+	psock->save_write_space = sk->sk_write_space;
+	psock->save_state_change = sk->sk_state_change;
+	sk->sk_data_ready = smap_data_ready;
+	sk->sk_write_space = smap_write_space;
+	sk->sk_state_change = smap_state_change;
+	psock->strp_enabled = true;
+}
+
+static void sock_map_remove_complete(struct bpf_stab *stab)
+{
+	bpf_map_area_free(stab->sock_map);
+	kfree(stab);
+}
+
+static void smap_gc_work(struct work_struct *w)
+{
+	struct smap_psock *psock;
+
+	psock = container_of(w, struct smap_psock, gc_work);
+
+	/* no callback lock needed because we already detached sockmap ops */
+	if (psock->strp_enabled)
+		strp_done(&psock->strp);
+
+	cancel_work_sync(&psock->tx_work);
+	__skb_queue_purge(&psock->rxqueue);
+
+	/* At this point all strparser and xmit work must be complete */
+	if (psock->bpf_parse)
+		bpf_prog_put(psock->bpf_parse);
+	if (psock->bpf_verdict)
+		bpf_prog_put(psock->bpf_verdict);
+
+	if (refcount_dec_and_test(&psock->stab->refcnt))
+		sock_map_remove_complete(psock->stab);
+
+	sock_put(psock->sock);
+	kfree(psock);
+}
+
+static struct smap_psock *smap_init_psock(struct sock *sock,
+					  struct bpf_stab *stab)
+{
+	struct smap_psock *psock;
+
+	psock = kzalloc(sizeof(struct smap_psock), GFP_ATOMIC | __GFP_NOWARN);
+	if (!psock)
+		return ERR_PTR(-ENOMEM);
+
+	psock->sock = sock;
+	skb_queue_head_init(&psock->rxqueue);
+	INIT_WORK(&psock->tx_work, smap_tx_work);
+	INIT_WORK(&psock->gc_work, smap_gc_work);
+
+	rcu_assign_sk_user_data(sock, psock);
+	sock_hold(sock);
+	return psock;
+}
+
+static struct bpf_map *sock_map_alloc(union bpf_attr *attr)
+{
+	struct bpf_stab *stab;
+	int err = -EINVAL;
+	u64 cost;
+
+	/* check sanity of attributes */
+	if (attr->max_entries == 0 || attr->key_size != 4 ||
+	    attr->value_size != 4 || attr->map_flags)
+		return ERR_PTR(-EINVAL);
+
+	if (attr->value_size > KMALLOC_MAX_SIZE)
+		return ERR_PTR(-E2BIG);
+
+	stab = kzalloc(sizeof(*stab), GFP_USER);
+	if (!stab)
+		return ERR_PTR(-ENOMEM);
+
+	/* mandatory map attributes */
+	stab->map.map_type = attr->map_type;
+	stab->map.key_size = attr->key_size;
+	stab->map.value_size = attr->value_size;
+	stab->map.max_entries = attr->max_entries;
+	stab->map.map_flags = attr->map_flags;
+
+	/* make sure page count doesn't overflow */
+	cost = (u64) stab->map.max_entries * sizeof(struct sock *);
+	if (cost >= U32_MAX - PAGE_SIZE)
+		goto free_stab;
+
+	stab->map.pages = round_up(cost, PAGE_SIZE) >> PAGE_SHIFT;
+
+	/* if map size is larger than memlock limit, reject it early */
+	err = bpf_map_precharge_memlock(stab->map.pages);
+	if (err)
+		goto free_stab;
+
+	stab->sock_map = bpf_map_area_alloc(stab->map.max_entries *
+					    sizeof(struct sock *));
+	if (!stab->sock_map)
+		goto free_stab;
+
+	refcount_set(&stab->refcnt, 1);
+	return &stab->map;
+free_stab:
+	kfree(stab);
+	return ERR_PTR(err);
+}
+
+static void sock_map_free(struct bpf_map *map)
+{
+	struct bpf_stab *stab = container_of(map, struct bpf_stab, map);
+	int i;
+
+	synchronize_rcu();
+
+	/* At this point no update, lookup or delete operations can happen.
+	 * However, be aware we can still get a socket state event updates,
+	 * and data ready callabacks that reference the psock from sk_user_data
+	 * Also psock worker threads are still in-flight. So smap_release_sock
+	 * will only free the psock after cancel_sync on the worker threads
+	 * and a grace period expire to ensure psock is really safe to remove.
+	 */
+	rcu_read_lock();
+	for (i = 0; i < stab->map.max_entries; i++) {
+		struct sock *sock;
+
+		sock = xchg(&stab->sock_map[i], NULL);
+		if (!sock)
+			continue;
+
+		smap_release_sock(sock);
+	}
+	rcu_read_unlock();
+
+	if (stab->bpf_verdict)
+		bpf_prog_put(stab->bpf_verdict);
+	if (stab->bpf_parse)
+		bpf_prog_put(stab->bpf_parse);
+
+	if (refcount_dec_and_test(&stab->refcnt))
+		sock_map_remove_complete(stab);
+}
+
+static int sock_map_get_next_key(struct bpf_map *map, void *key, void *next_key)
+{
+	struct bpf_stab *stab = container_of(map, struct bpf_stab, map);
+	u32 i = key ? *(u32 *)key : U32_MAX;
+	u32 *next = (u32 *)next_key;
+
+	if (i >= stab->map.max_entries) {
+		*next = 0;
+		return 0;
+	}
+
+	if (i == stab->map.max_entries - 1)
+		return -ENOENT;
+
+	*next = i + 1;
+	return 0;
+}
+
+struct sock  *__sock_map_lookup_elem(struct bpf_map *map, u32 key)
+{
+	struct bpf_stab *stab = container_of(map, struct bpf_stab, map);
+
+	if (key >= map->max_entries)
+		return NULL;
+
+	return READ_ONCE(stab->sock_map[key]);
+}
+
+static int sock_map_delete_elem(struct bpf_map *map, void *key)
+{
+	struct bpf_stab *stab = container_of(map, struct bpf_stab, map);
+	int k = *(u32 *)key;
+	struct sock *sock;
+
+	if (k >= map->max_entries)
+		return -EINVAL;
+
+	sock = xchg(&stab->sock_map[k], NULL);
+	if (!sock)
+		return -EINVAL;
+
+	smap_release_sock(sock);
+	return 0;
+}
+
+/* Locking notes: Concurrent updates, deletes, and lookups are allowed and are
+ * done inside rcu critical sections. This ensures on updates that the psock
+ * will not be released via smap_release_sock() until concurrent updates/deletes
+ * complete. All operations operate on sock_map using cmpxchg and xchg
+ * operations to ensure we do not get stale references. Any reads into the
+ * map must be done with READ_ONCE() because of this.
+ *
+ * A psock is destroyed via call_rcu and after any worker threads are cancelled
+ * and syncd so we are certain all references from the update/lookup/delete
+ * operations as well as references in the data path are no longer in use.
+ *
+ * A psock object holds a refcnt on the sockmap it is attached to and this is
+ * not decremented until after a RCU grace period and garbage collection occurs.
+ * This ensures the map is not free'd until psocks linked to it are removed. The
+ * map link is used when the independent sock events trigger map deletion.
+ *
+ * Psocks may only participate in one sockmap at a time. Users that try to
+ * join a single sock to multiple maps will get an error.
+ *
+ * Last, but not least, it is possible the socket is closed while running
+ * an update on an existing psock. This will release the psock, but again
+ * not until the update has completed due to rcu grace period rules.
+ */
+static int sock_map_ctx_update_elem(struct bpf_sock_ops_kern *skops,
+				    struct bpf_map *map,
+				    void *key, u64 flags, u64 map_flags)
+{
+	struct bpf_stab *stab = container_of(map, struct bpf_stab, map);
+	struct bpf_prog *verdict, *parse;
+	struct smap_psock *psock = NULL;
+	struct sock *old_sock, *sock;
+	u32 i = *(u32 *)key;
+	bool update = false;
+	int err = 0;
+
+	if (unlikely(flags > BPF_EXIST))
+		return -EINVAL;
+
+	if (unlikely(i >= stab->map.max_entries))
+		return -E2BIG;
+
+	if (unlikely(map_flags > BPF_SOCKMAP_STRPARSER))
+		return -EINVAL;
+
+	verdict = parse = NULL;
+	sock = READ_ONCE(stab->sock_map[i]);
+
+	if (flags == BPF_EXIST || flags == BPF_ANY) {
+		if (!sock && flags == BPF_EXIST) {
+			return -ENOENT;
+		} else if (sock && sock != skops->sk) {
+			return -EINVAL;
+		} else if (sock) {
+			psock = smap_psock_sk(sock);
+			if (unlikely(!psock))
+				return -EBUSY;
+			update = true;
+		}
+	} else if (sock && BPF_NOEXIST) {
+		return -EEXIST;
+	}
+
+	/* reserve BPF programs early so can abort easily on failures */
+	if (map_flags & BPF_SOCKMAP_STRPARSER) {
+		verdict = READ_ONCE(stab->bpf_verdict);
+		parse = READ_ONCE(stab->bpf_parse);
+
+		if (!verdict || !parse)
+			return -ENOENT;
+
+		/* bpf prog refcnt may be zero if a concurrent attach operation
+		 * removes the program after the above READ_ONCE() but before
+		 * we increment the refcnt. If this is the case abort with an
+		 * error.
+		 */
+		verdict = bpf_prog_inc_not_zero(stab->bpf_verdict);
+		if (IS_ERR(verdict))
+			return PTR_ERR(verdict);
+
+		parse = bpf_prog_inc_not_zero(stab->bpf_parse);
+		if (IS_ERR(parse)) {
+			bpf_prog_put(verdict);
+			return PTR_ERR(parse);
+		}
+	}
+
+	if (!psock) {
+		sock = skops->sk;
+		if (rcu_dereference_sk_user_data(sock))
+			return -EEXIST;
+		psock = smap_init_psock(sock, stab);
+		if (IS_ERR(psock)) {
+			if (verdict)
+				bpf_prog_put(verdict);
+			if (parse)
+				bpf_prog_put(parse);
+			return PTR_ERR(psock);
+		}
+		psock->key = i;
+		psock->stab = stab;
+		refcount_inc(&stab->refcnt);
+		set_bit(SMAP_TX_RUNNING, &psock->state);
+	}
+
+	if (map_flags & BPF_SOCKMAP_STRPARSER) {
+		write_lock_bh(&sock->sk_callback_lock);
+		if (psock->strp_enabled)
+			goto start_done;
+		err = smap_init_sock(psock, sock);
+		if (err)
+			goto out;
+		smap_init_progs(psock, stab, verdict, parse);
+		smap_start_sock(psock, sock);
+start_done:
+		write_unlock_bh(&sock->sk_callback_lock);
+	} else if (update) {
+		smap_stop_sock(psock, sock);
+	}
+
+	if (!update) {
+		old_sock = xchg(&stab->sock_map[i], skops->sk);
+		if (old_sock)
+			smap_release_sock(old_sock);
+	}
+
+	return 0;
+out:
+	write_unlock_bh(&sock->sk_callback_lock);
+	if (!update)
+		smap_release_sock(sock);
+	return err;
+}
+
+static int sock_map_attach_prog(struct bpf_map *map,
+				struct bpf_prog *parse,
+				struct bpf_prog *verdict)
+{
+	struct bpf_stab *stab = container_of(map, struct bpf_stab, map);
+	struct bpf_prog *_parse, *_verdict;
+
+	_parse = xchg(&stab->bpf_parse, parse);
+	_verdict = xchg(&stab->bpf_verdict, verdict);
+
+	if (_parse)
+		bpf_prog_put(_parse);
+	if (_verdict)
+		bpf_prog_put(_verdict);
+
+	return 0;
+}
+
+static void *sock_map_lookup(struct bpf_map *map, void *key)
+{
+	return NULL;
+}
+
+static int sock_map_update_elem(struct bpf_map *map,
+				void *key, void *value, u64 flags)
+{
+	struct bpf_sock_ops_kern skops;
+	u32 fd = *(u32 *)value;
+	struct socket *socket;
+	int err;
+
+	socket = sockfd_lookup(fd, &err);
+	if (!socket)
+		return err;
+
+	skops.sk = socket->sk;
+	if (!skops.sk) {
+		fput(socket->file);
+		return -EINVAL;
+	}
+
+	err = sock_map_ctx_update_elem(&skops, map, key,
+				       flags, BPF_SOCKMAP_STRPARSER);
+	fput(socket->file);
+	return err;
+}
+
+const struct bpf_map_ops sock_map_ops = {
+	.map_alloc = sock_map_alloc,
+	.map_free = sock_map_free,
+	.map_lookup_elem = sock_map_lookup,
+	.map_get_next_key = sock_map_get_next_key,
+	.map_update_elem = sock_map_update_elem,
+	.map_delete_elem = sock_map_delete_elem,
+	.map_attach = sock_map_attach_prog,
+};
+
+BPF_CALL_5(bpf_sock_map_update, struct bpf_sock_ops_kern *, bpf_sock,
+	   struct bpf_map *, map, void *, key, u64, flags, u64, map_flags)
+{
+	WARN_ON_ONCE(!rcu_read_lock_held());
+	return sock_map_ctx_update_elem(bpf_sock, map, key, flags, map_flags);
+}
+
+const struct bpf_func_proto bpf_sock_map_update_proto = {
+	.func		= bpf_sock_map_update,
+	.gpl_only	= false,
+	.pkt_access	= true,
+	.ret_type	= RET_INTEGER,
+	.arg1_type	= ARG_PTR_TO_CTX,
+	.arg2_type	= ARG_CONST_MAP_PTR,
+	.arg3_type	= ARG_PTR_TO_MAP_KEY,
+	.arg4_type	= ARG_ANYTHING,
+	.arg5_type	= ARG_ANYTHING,
+};
diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c
index 17e29f596de1..d2f2bdf71ffa 100644
--- a/kernel/bpf/syscall.c
+++ b/kernel/bpf/syscall.c
@@ -1087,7 +1087,50 @@ static int bpf_obj_get(const union bpf_attr *attr)
 
 #ifdef CONFIG_CGROUP_BPF
 
-#define BPF_PROG_ATTACH_LAST_FIELD attach_flags
+#define BPF_PROG_ATTACH_LAST_FIELD attach_bpf_fd2
+
+static int sockmap_get_from_fd(const union bpf_attr *attr, int ptype)
+{
+	struct bpf_prog *prog1, *prog2;
+	int ufd = attr->target_fd;
+	struct bpf_map *map;
+	struct fd f;
+	int err;
+
+	f = fdget(ufd);
+	map = __bpf_map_get(f);
+	if (IS_ERR(map))
+		return PTR_ERR(map);
+
+	if (!map->ops->map_attach) {
+		fdput(f);
+		return -EOPNOTSUPP;
+	}
+
+	prog1 = bpf_prog_get_type(attr->attach_bpf_fd, ptype);
+	if (IS_ERR(prog1)) {
+		fdput(f);
+		return PTR_ERR(prog1);
+	}
+
+	prog2 = bpf_prog_get_type(attr->attach_bpf_fd2, ptype);
+	if (IS_ERR(prog2)) {
+		fdput(f);
+		bpf_prog_put(prog1);
+		return PTR_ERR(prog2);
+	}
+
+	err = map->ops->map_attach(map, prog1, prog2);
+	if (err) {
+		fdput(f);
+		bpf_prog_put(prog1);
+		bpf_prog_put(prog2);
+		return PTR_ERR(map);
+	}
+
+	fdput(f);
+	return err;
+}
 
 static int bpf_prog_attach(const union bpf_attr *attr)
 {
@@ -1116,10 +1159,16 @@ static int bpf_prog_attach(const union bpf_attr *attr)
 	case BPF_CGROUP_SOCK_OPS:
 		ptype = BPF_PROG_TYPE_SOCK_OPS;
 		break;
+	case BPF_CGROUP_SMAP_INGRESS:
+		ptype = BPF_PROG_TYPE_SK_SKB;
+		break;
 	default:
 		return -EINVAL;
 	}
 
+	if (attr->attach_type == BPF_CGROUP_SMAP_INGRESS)
+		return sockmap_get_from_fd(attr, ptype);
+
 	prog = bpf_prog_get_type(attr->attach_bpf_fd, ptype);
 	if (IS_ERR(prog))
 		return PTR_ERR(prog);
diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
index 7dd96d064be1..a71bc0996572 100644
--- a/kernel/bpf/verifier.c
+++ b/kernel/bpf/verifier.c
@@ -1522,6 +1522,12 @@ static int check_map_func_compatibility(struct bpf_map *map, int func_id)
 	case BPF_MAP_TYPE_HASH_OF_MAPS:
 		if (func_id != BPF_FUNC_map_lookup_elem)
 			goto error;
+	case BPF_MAP_TYPE_SOCKMAP:
+		if (func_id != BPF_FUNC_sk_redirect_map &&
+		    func_id != BPF_FUNC_sock_map_update &&
+		    func_id != BPF_FUNC_map_delete_elem)
+			goto error;
+		break;
 	default:
 		break;
 	}
@@ -1550,6 +1556,14 @@ static int check_map_func_compatibility(struct bpf_map *map, int func_id)
 		if (map->map_type != BPF_MAP_TYPE_DEVMAP)
 			goto error;
 		break;
+	case BPF_FUNC_sk_redirect_map:
+		if (map->map_type != BPF_MAP_TYPE_SOCKMAP)
+			goto error;
+		break;
+	case BPF_FUNC_sock_map_update:
+		if (map->map_type != BPF_MAP_TYPE_SOCKMAP)
+			goto error;
+		break;
 	default:
 		break;
 	}
diff --git a/net/core/filter.c b/net/core/filter.c
index 46321033ae0e..8e136578488c 100644
--- a/net/core/filter.c
+++ b/net/core/filter.c
@@ -1858,6 +1858,45 @@ static const struct bpf_func_proto bpf_redirect_map_proto = {
 	.arg3_type      = ARG_ANYTHING,
 };
 
+BPF_CALL_3(bpf_sk_redirect_map, struct bpf_map *, map, u32, key, u64, flags)
+{
+	struct redirect_info *ri = this_cpu_ptr(&redirect_info);
+
+	if (unlikely(flags))
+		return SK_ABORTED;
+
+	ri->ifindex = key;
+	ri->flags = flags;
+	ri->map = map;
+
+	return SK_REDIRECT;
+}
+
+struct sock *do_sk_redirect_map(void)
+{
+	struct redirect_info *ri = this_cpu_ptr(&redirect_info);
+	struct sock *sk = NULL;
+
+	if (ri->map) {
+		sk = __sock_map_lookup_elem(ri->map, ri->ifindex);
+
+		ri->ifindex = 0;
+		ri->map = NULL;
+		/* we do not clear flags for future lookup */
+	}
+
+	return sk;
+}
+
+static const struct bpf_func_proto bpf_sk_redirect_map_proto = {
+	.func           = bpf_sk_redirect_map,
+	.gpl_only       = false,
+	.ret_type       = RET_INTEGER,
+	.arg1_type      = ARG_CONST_MAP_PTR,
+	.arg2_type      = ARG_ANYTHING,
+	.arg3_type      = ARG_ANYTHING,
+};
+
 BPF_CALL_1(bpf_get_cgroup_classid, const struct sk_buff *, skb)
 {
 	return task_get_classid(skb);
@@ -3229,6 +3268,8 @@ static const struct bpf_func_proto *
 	switch (func_id) {
 	case BPF_FUNC_setsockopt:
 		return &bpf_setsockopt_proto;
+	case BPF_FUNC_sock_map_update:
+		return &bpf_sock_map_update_proto;
 	default:
 		return bpf_base_func_proto(func_id);
 	}
@@ -3243,6 +3284,8 @@ static const struct bpf_func_proto *sk_skb_func_proto(enum bpf_func_id func_id)
 		return &bpf_get_socket_cookie_proto;
 	case BPF_FUNC_get_socket_uid:
 		return &bpf_get_socket_uid_proto;
+	case BPF_FUNC_sk_redirect_map:
+		return &bpf_sk_redirect_map_proto;
 	default:
 		return bpf_base_func_proto(func_id);
 	}
-- 
cgit v1.2.3-71-gd317


From 8a31db5615667956c513d205cfb06885c3ec6d0b Mon Sep 17 00:00:00 2001
From: John Fastabend <john.fastabend@gmail.com>
Date: Tue, 15 Aug 2017 22:33:09 -0700
Subject: bpf: add access to sock fields and pkt data from sk_skb programs

Signed-off-by: John Fastabend <john.fastabend@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/uapi/linux/bpf.h |   9 +++
 kernel/bpf/verifier.c    |   1 +
 net/core/filter.c        | 169 +++++++++++++++++++++++++++++++++++++++++++++++
 3 files changed, 179 insertions(+)

(limited to 'include/uapi/linux')

diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h
index 7f774769e3f5..5ecbe812a2cc 100644
--- a/include/uapi/linux/bpf.h
+++ b/include/uapi/linux/bpf.h
@@ -712,6 +712,15 @@ struct __sk_buff {
 	__u32 data;
 	__u32 data_end;
 	__u32 napi_id;
+
+	/* accessed by BPF_PROG_TYPE_sk_skb types */
+	__u32 family;
+	__u32 remote_ip4;	/* Stored in network byte order */
+	__u32 local_ip4;	/* Stored in network byte order */
+	__u32 remote_ip6[4];	/* Stored in network byte order */
+	__u32 local_ip6[4];	/* Stored in network byte order */
+	__u32 remote_port;	/* Stored in network byte order */
+	__u32 local_port;	/* stored in host byte order */
 };
 
 struct bpf_tunnel_key {
diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
index a71bc0996572..958ba84a9995 100644
--- a/kernel/bpf/verifier.c
+++ b/kernel/bpf/verifier.c
@@ -886,6 +886,7 @@ static bool may_access_direct_pkt_data(struct bpf_verifier_env *env,
 	case BPF_PROG_TYPE_SCHED_ACT:
 	case BPF_PROG_TYPE_XDP:
 	case BPF_PROG_TYPE_LWT_XMIT:
+	case BPF_PROG_TYPE_SK_SKB:
 		if (meta)
 			return meta->pkt_access;
 
diff --git a/net/core/filter.c b/net/core/filter.c
index 8e136578488c..e9f8dcef6c57 100644
--- a/net/core/filter.c
+++ b/net/core/filter.c
@@ -3278,8 +3278,16 @@ static const struct bpf_func_proto *
 static const struct bpf_func_proto *sk_skb_func_proto(enum bpf_func_id func_id)
 {
 	switch (func_id) {
+	case BPF_FUNC_skb_store_bytes:
+		return &bpf_skb_store_bytes_proto;
 	case BPF_FUNC_skb_load_bytes:
 		return &bpf_skb_load_bytes_proto;
+	case BPF_FUNC_skb_pull_data:
+		return &bpf_skb_pull_data_proto;
+	case BPF_FUNC_skb_change_tail:
+		return &bpf_skb_change_tail_proto;
+	case BPF_FUNC_skb_change_head:
+		return &bpf_skb_change_head_proto;
 	case BPF_FUNC_get_socket_cookie:
 		return &bpf_get_socket_cookie_proto;
 	case BPF_FUNC_get_socket_uid:
@@ -3343,6 +3351,10 @@ static bool bpf_skb_is_valid_access(int off, int size, enum bpf_access_type type
 		if (off + size > offsetofend(struct __sk_buff, cb[4]))
 			return false;
 		break;
+	case bpf_ctx_range_till(struct __sk_buff, remote_ip6[0], remote_ip6[3]):
+	case bpf_ctx_range_till(struct __sk_buff, local_ip6[0], local_ip6[3]):
+	case bpf_ctx_range_till(struct __sk_buff, remote_ip4, remote_ip4):
+	case bpf_ctx_range_till(struct __sk_buff, local_ip4, local_ip4):
 	case bpf_ctx_range(struct __sk_buff, data):
 	case bpf_ctx_range(struct __sk_buff, data_end):
 		if (size != size_default)
@@ -3371,6 +3383,7 @@ static bool sk_filter_is_valid_access(int off, int size,
 	case bpf_ctx_range(struct __sk_buff, tc_classid):
 	case bpf_ctx_range(struct __sk_buff, data):
 	case bpf_ctx_range(struct __sk_buff, data_end):
+	case bpf_ctx_range_till(struct __sk_buff, family, local_port):
 		return false;
 	}
 
@@ -3392,6 +3405,7 @@ static bool lwt_is_valid_access(int off, int size,
 {
 	switch (off) {
 	case bpf_ctx_range(struct __sk_buff, tc_classid):
+	case bpf_ctx_range_till(struct __sk_buff, family, local_port):
 		return false;
 	}
 
@@ -3505,6 +3519,8 @@ static bool tc_cls_act_is_valid_access(int off, int size,
 	case bpf_ctx_range(struct __sk_buff, data_end):
 		info->reg_type = PTR_TO_PACKET_END;
 		break;
+	case bpf_ctx_range_till(struct __sk_buff, family, local_port):
+		return false;
 	}
 
 	return bpf_skb_is_valid_access(off, size, type, info);
@@ -3582,11 +3598,63 @@ static bool sock_ops_is_valid_access(int off, int size,
 	return __is_valid_sock_ops_access(off, size);
 }
 
+static int sk_skb_prologue(struct bpf_insn *insn_buf, bool direct_write,
+			   const struct bpf_prog *prog)
+{
+	struct bpf_insn *insn = insn_buf;
+
+	if (!direct_write)
+		return 0;
+
+	/* if (!skb->cloned)
+	 *       goto start;
+	 *
+	 * (Fast-path, otherwise approximation that we might be
+	 *  a clone, do the rest in helper.)
+	 */
+	*insn++ = BPF_LDX_MEM(BPF_B, BPF_REG_6, BPF_REG_1, CLONED_OFFSET());
+	*insn++ = BPF_ALU32_IMM(BPF_AND, BPF_REG_6, CLONED_MASK);
+	*insn++ = BPF_JMP_IMM(BPF_JEQ, BPF_REG_6, 0, 7);
+
+	/* ret = bpf_skb_pull_data(skb, 0); */
+	*insn++ = BPF_MOV64_REG(BPF_REG_6, BPF_REG_1);
+	*insn++ = BPF_ALU64_REG(BPF_XOR, BPF_REG_2, BPF_REG_2);
+	*insn++ = BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 0, 0,
+			       BPF_FUNC_skb_pull_data);
+	/* if (!ret)
+	 *      goto restore;
+	 * return SK_DROP;
+	 */
+	*insn++ = BPF_JMP_IMM(BPF_JEQ, BPF_REG_0, 0, 2);
+	*insn++ = BPF_ALU32_IMM(BPF_MOV, BPF_REG_0, SK_DROP);
+	*insn++ = BPF_EXIT_INSN();
+
+	/* restore: */
+	*insn++ = BPF_MOV64_REG(BPF_REG_1, BPF_REG_6);
+	/* start: */
+	*insn++ = prog->insnsi[0];
+
+	return insn - insn_buf;
+}
+
 static bool sk_skb_is_valid_access(int off, int size,
 				   enum bpf_access_type type,
 				   struct bpf_insn_access_aux *info)
 {
+	if (type == BPF_WRITE) {
+		switch (off) {
+		case bpf_ctx_range(struct __sk_buff, mark):
+		case bpf_ctx_range(struct __sk_buff, tc_index):
+		case bpf_ctx_range(struct __sk_buff, priority):
+			break;
+		default:
+			return false;
+		}
+	}
+
 	switch (off) {
+	case bpf_ctx_range(struct __sk_buff, tc_classid):
+		return false;
 	case bpf_ctx_range(struct __sk_buff, data):
 		info->reg_type = PTR_TO_PACKET;
 		break;
@@ -3783,6 +3851,106 @@ static u32 bpf_convert_ctx_access(enum bpf_access_type type,
 		*insn++ = BPF_MOV64_IMM(si->dst_reg, 0);
 #endif
 		break;
+	case offsetof(struct __sk_buff, family):
+		BUILD_BUG_ON(FIELD_SIZEOF(struct sock_common, skc_family) != 2);
+
+		*insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct sk_buff, sk),
+				      si->dst_reg, si->src_reg,
+				      offsetof(struct sk_buff, sk));
+		*insn++ = BPF_LDX_MEM(BPF_H, si->dst_reg, si->dst_reg,
+				      bpf_target_off(struct sock_common,
+						     skc_family,
+						     2, target_size));
+		break;
+	case offsetof(struct __sk_buff, remote_ip4):
+		BUILD_BUG_ON(FIELD_SIZEOF(struct sock_common, skc_daddr) != 4);
+
+		*insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct sk_buff, sk),
+				      si->dst_reg, si->src_reg,
+				      offsetof(struct sk_buff, sk));
+		*insn++ = BPF_LDX_MEM(BPF_W, si->dst_reg, si->dst_reg,
+				      bpf_target_off(struct sock_common,
+						     skc_daddr,
+						     4, target_size));
+		break;
+	case offsetof(struct __sk_buff, local_ip4):
+		BUILD_BUG_ON(FIELD_SIZEOF(struct sock_common,
+					  skc_rcv_saddr) != 4);
+
+		*insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct sk_buff, sk),
+				      si->dst_reg, si->src_reg,
+				      offsetof(struct sk_buff, sk));
+		*insn++ = BPF_LDX_MEM(BPF_W, si->dst_reg, si->dst_reg,
+				      bpf_target_off(struct sock_common,
+						     skc_rcv_saddr,
+						     4, target_size));
+		break;
+	case offsetof(struct __sk_buff, remote_ip6[0]) ...
+	     offsetof(struct __sk_buff, remote_ip6[3]):
+#if IS_ENABLED(CONFIG_IPV6)
+		BUILD_BUG_ON(FIELD_SIZEOF(struct sock_common,
+					  skc_v6_daddr.s6_addr32[0]) != 4);
+
+		off = si->off;
+		off -= offsetof(struct __sk_buff, remote_ip6[0]);
+
+		*insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct sk_buff, sk),
+				      si->dst_reg, si->src_reg,
+				      offsetof(struct sk_buff, sk));
+		*insn++ = BPF_LDX_MEM(BPF_W, si->dst_reg, si->dst_reg,
+				      offsetof(struct sock_common,
+					       skc_v6_daddr.s6_addr32[0]) +
+				      off);
+#else
+		*insn++ = BPF_MOV32_IMM(si->dst_reg, 0);
+#endif
+		break;
+	case offsetof(struct __sk_buff, local_ip6[0]) ...
+	     offsetof(struct __sk_buff, local_ip6[3]):
+#if IS_ENABLED(CONFIG_IPV6)
+		BUILD_BUG_ON(FIELD_SIZEOF(struct sock_common,
+					  skc_v6_rcv_saddr.s6_addr32[0]) != 4);
+
+		off = si->off;
+		off -= offsetof(struct __sk_buff, local_ip6[0]);
+
+		*insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct sk_buff, sk),
+				      si->dst_reg, si->src_reg,
+				      offsetof(struct sk_buff, sk));
+		*insn++ = BPF_LDX_MEM(BPF_W, si->dst_reg, si->dst_reg,
+				      offsetof(struct sock_common,
+					       skc_v6_rcv_saddr.s6_addr32[0]) +
+				      off);
+#else
+		*insn++ = BPF_MOV32_IMM(si->dst_reg, 0);
+#endif
+		break;
+
+	case offsetof(struct __sk_buff, remote_port):
+		BUILD_BUG_ON(FIELD_SIZEOF(struct sock_common, skc_dport) != 2);
+
+		*insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct sk_buff, sk),
+				      si->dst_reg, si->src_reg,
+				      offsetof(struct sk_buff, sk));
+		*insn++ = BPF_LDX_MEM(BPF_H, si->dst_reg, si->dst_reg,
+				      bpf_target_off(struct sock_common,
+						     skc_dport,
+						     2, target_size));
+#ifndef __BIG_ENDIAN_BITFIELD
+		*insn++ = BPF_ALU32_IMM(BPF_LSH, si->dst_reg, 16);
+#endif
+		break;
+
+	case offsetof(struct __sk_buff, local_port):
+		BUILD_BUG_ON(FIELD_SIZEOF(struct sock_common, skc_num) != 2);
+
+		*insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct sk_buff, sk),
+				      si->dst_reg, si->src_reg,
+				      offsetof(struct sk_buff, sk));
+		*insn++ = BPF_LDX_MEM(BPF_H, si->dst_reg, si->dst_reg,
+				      bpf_target_off(struct sock_common,
+						     skc_num, 2, target_size));
+		break;
 	}
 
 	return insn - insn_buf;
@@ -4071,6 +4239,7 @@ const struct bpf_verifier_ops sk_skb_prog_ops = {
 	.get_func_proto		= sk_skb_func_proto,
 	.is_valid_access	= sk_skb_is_valid_access,
 	.convert_ctx_access	= bpf_convert_ctx_access,
+	.gen_prologue		= sk_skb_prologue,
 };
 
 int sk_detach_filter(struct sock *sk)
-- 
cgit v1.2.3-71-gd317


From 22e4ebb975822833b083533035233d128b30e98f Mon Sep 17 00:00:00 2001
From: Mathieu Desnoyers <mathieu.desnoyers@efficios.com>
Date: Fri, 28 Jul 2017 16:40:40 -0400
Subject: membarrier: Provide expedited private command

Implement MEMBARRIER_CMD_PRIVATE_EXPEDITED with IPIs using cpumask built
from all runqueues for which current thread's mm is the same as the
thread calling sys_membarrier. It executes faster than the non-expedited
variant (no blocking). It also works on NOHZ_FULL configurations.

Scheduler-wise, it requires a memory barrier before and after context
switching between processes (which have different mm). The memory
barrier before context switch is already present. For the barrier after
context switch:

* Our TSO archs can do RELEASE without being a full barrier. Look at
  x86 spin_unlock() being a regular STORE for example.  But for those
  archs, all atomics imply smp_mb and all of them have atomic ops in
  switch_mm() for mm_cpumask(), and on x86 the CR3 load acts as a full
  barrier.

* From all weakly ordered machines, only ARM64 and PPC can do RELEASE,
  the rest does indeed do smp_mb(), so there the spin_unlock() is a full
  barrier and we're good.

* ARM64 has a very heavy barrier in switch_to(), which suffices.

* PPC just removed its barrier from switch_to(), but appears to be
  talking about adding something to switch_mm(). So add a
  smp_mb__after_unlock_lock() for now, until this is settled on the PPC
  side.

Changes since v3:
- Properly document the memory barriers provided by each architecture.

Changes since v2:
- Address comments from Peter Zijlstra,
- Add smp_mb__after_unlock_lock() after finish_lock_switch() in
  finish_task_switch() to add the memory barrier we need after storing
  to rq->curr. This is much simpler than the previous approach relying
  on atomic_dec_and_test() in mmdrop(), which actually added a memory
  barrier in the common case of switching between userspace processes.
- Return -EINVAL when MEMBARRIER_CMD_SHARED is used on a nohz_full
  kernel, rather than having the whole membarrier system call returning
  -ENOSYS. Indeed, CMD_PRIVATE_EXPEDITED is compatible with nohz_full.
  Adapt the CMD_QUERY mask accordingly.

Changes since v1:
- move membarrier code under kernel/sched/ because it uses the
  scheduler runqueue,
- only add the barrier when we switch from a kernel thread. The case
  where we switch from a user-space thread is already handled by
  the atomic_dec_and_test() in mmdrop().
- add a comment to mmdrop() documenting the requirement on the implicit
  memory barrier.

CC: Peter Zijlstra <peterz@infradead.org>
CC: Paul E. McKenney <paulmck@linux.vnet.ibm.com>
CC: Boqun Feng <boqun.feng@gmail.com>
CC: Andrew Hunter <ahh@google.com>
CC: Maged Michael <maged.michael@gmail.com>
CC: gromer@google.com
CC: Avi Kivity <avi@scylladb.com>
CC: Benjamin Herrenschmidt <benh@kernel.crashing.org>
CC: Paul Mackerras <paulus@samba.org>
CC: Michael Ellerman <mpe@ellerman.id.au>
Signed-off-by: Mathieu Desnoyers <mathieu.desnoyers@efficios.com>
Signed-off-by: Paul E. McKenney <paulmck@linux.vnet.ibm.com>
Tested-by: Dave Watson <davejwatson@fb.com>
---
 MAINTAINERS                     |   2 +-
 arch/arm64/kernel/process.c     |   2 +
 include/uapi/linux/membarrier.h |  23 +++++-
 kernel/Makefile                 |   1 -
 kernel/membarrier.c             |  70 ------------------
 kernel/sched/Makefile           |   1 +
 kernel/sched/core.c             |  25 +++++++
 kernel/sched/membarrier.c       | 152 ++++++++++++++++++++++++++++++++++++++++
 8 files changed, 202 insertions(+), 74 deletions(-)
 delete mode 100644 kernel/membarrier.c
 create mode 100644 kernel/sched/membarrier.c

(limited to 'include/uapi/linux')

diff --git a/MAINTAINERS b/MAINTAINERS
index f66488dfdbc9..3b035584272f 100644
--- a/MAINTAINERS
+++ b/MAINTAINERS
@@ -8621,7 +8621,7 @@ M:	Mathieu Desnoyers <mathieu.desnoyers@efficios.com>
 M:	"Paul E. McKenney" <paulmck@linux.vnet.ibm.com>
 L:	linux-kernel@vger.kernel.org
 S:	Supported
-F:	kernel/membarrier.c
+F:	kernel/sched/membarrier.c
 F:	include/uapi/linux/membarrier.h
 
 MEMORY MANAGEMENT
diff --git a/arch/arm64/kernel/process.c b/arch/arm64/kernel/process.c
index 659ae8094ed5..c8f7d98d8cb9 100644
--- a/arch/arm64/kernel/process.c
+++ b/arch/arm64/kernel/process.c
@@ -360,6 +360,8 @@ __notrace_funcgraph struct task_struct *__switch_to(struct task_struct *prev,
 	/*
 	 * Complete any pending TLB or cache maintenance on this CPU in case
 	 * the thread migrates to a different CPU.
+	 * This full barrier is also required by the membarrier system
+	 * call.
 	 */
 	dsb(ish);
 
diff --git a/include/uapi/linux/membarrier.h b/include/uapi/linux/membarrier.h
index e0b108bd2624..6d47b3249d8a 100644
--- a/include/uapi/linux/membarrier.h
+++ b/include/uapi/linux/membarrier.h
@@ -40,14 +40,33 @@
  *                          (non-running threads are de facto in such a
  *                          state). This covers threads from all processes
  *                          running on the system. This command returns 0.
+ * @MEMBARRIER_CMD_PRIVATE_EXPEDITED:
+ *                          Execute a memory barrier on each running
+ *                          thread belonging to the same process as the current
+ *                          thread. Upon return from system call, the
+ *                          caller thread is ensured that all its running
+ *                          threads siblings have passed through a state
+ *                          where all memory accesses to user-space
+ *                          addresses match program order between entry
+ *                          to and return from the system call
+ *                          (non-running threads are de facto in such a
+ *                          state). This only covers threads from the
+ *                          same processes as the caller thread. This
+ *                          command returns 0. The "expedited" commands
+ *                          complete faster than the non-expedited ones,
+ *                          they never block, but have the downside of
+ *                          causing extra overhead.
  *
  * Command to be passed to the membarrier system call. The commands need to
  * be a single bit each, except for MEMBARRIER_CMD_QUERY which is assigned to
  * the value 0.
  */
 enum membarrier_cmd {
-	MEMBARRIER_CMD_QUERY = 0,
-	MEMBARRIER_CMD_SHARED = (1 << 0),
+	MEMBARRIER_CMD_QUERY			= 0,
+	MEMBARRIER_CMD_SHARED			= (1 << 0),
+	/* reserved for MEMBARRIER_CMD_SHARED_EXPEDITED (1 << 1) */
+	/* reserved for MEMBARRIER_CMD_PRIVATE (1 << 2) */
+	MEMBARRIER_CMD_PRIVATE_EXPEDITED	= (1 << 3),
 };
 
 #endif /* _UAPI_LINUX_MEMBARRIER_H */
diff --git a/kernel/Makefile b/kernel/Makefile
index 4cb8e8b23c6e..9c323a6daa46 100644
--- a/kernel/Makefile
+++ b/kernel/Makefile
@@ -108,7 +108,6 @@ obj-$(CONFIG_CRASH_DUMP) += crash_dump.o
 obj-$(CONFIG_JUMP_LABEL) += jump_label.o
 obj-$(CONFIG_CONTEXT_TRACKING) += context_tracking.o
 obj-$(CONFIG_TORTURE_TEST) += torture.o
-obj-$(CONFIG_MEMBARRIER) += membarrier.o
 
 obj-$(CONFIG_HAS_IOMEM) += memremap.o
 
diff --git a/kernel/membarrier.c b/kernel/membarrier.c
deleted file mode 100644
index 9f9284f37f8d..000000000000
--- a/kernel/membarrier.c
+++ /dev/null
@@ -1,70 +0,0 @@
-/*
- * Copyright (C) 2010, 2015 Mathieu Desnoyers <mathieu.desnoyers@efficios.com>
- *
- * membarrier system call
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License as published by
- * the Free Software Foundation; either version 2 of the License, or
- * (at your option) any later version.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- * GNU General Public License for more details.
- */
-
-#include <linux/syscalls.h>
-#include <linux/membarrier.h>
-#include <linux/tick.h>
-
-/*
- * Bitmask made from a "or" of all commands within enum membarrier_cmd,
- * except MEMBARRIER_CMD_QUERY.
- */
-#define MEMBARRIER_CMD_BITMASK	(MEMBARRIER_CMD_SHARED)
-
-/**
- * sys_membarrier - issue memory barriers on a set of threads
- * @cmd:   Takes command values defined in enum membarrier_cmd.
- * @flags: Currently needs to be 0. For future extensions.
- *
- * If this system call is not implemented, -ENOSYS is returned. If the
- * command specified does not exist, or if the command argument is invalid,
- * this system call returns -EINVAL. For a given command, with flags argument
- * set to 0, this system call is guaranteed to always return the same value
- * until reboot.
- *
- * All memory accesses performed in program order from each targeted thread
- * is guaranteed to be ordered with respect to sys_membarrier(). If we use
- * the semantic "barrier()" to represent a compiler barrier forcing memory
- * accesses to be performed in program order across the barrier, and
- * smp_mb() to represent explicit memory barriers forcing full memory
- * ordering across the barrier, we have the following ordering table for
- * each pair of barrier(), sys_membarrier() and smp_mb():
- *
- * The pair ordering is detailed as (O: ordered, X: not ordered):
- *
- *                        barrier()   smp_mb() sys_membarrier()
- *        barrier()          X           X            O
- *        smp_mb()           X           O            O
- *        sys_membarrier()   O           O            O
- */
-SYSCALL_DEFINE2(membarrier, int, cmd, int, flags)
-{
-	/* MEMBARRIER_CMD_SHARED is not compatible with nohz_full. */
-	if (tick_nohz_full_enabled())
-		return -ENOSYS;
-	if (unlikely(flags))
-		return -EINVAL;
-	switch (cmd) {
-	case MEMBARRIER_CMD_QUERY:
-		return MEMBARRIER_CMD_BITMASK;
-	case MEMBARRIER_CMD_SHARED:
-		if (num_online_cpus() > 1)
-			synchronize_sched();
-		return 0;
-	default:
-		return -EINVAL;
-	}
-}
diff --git a/kernel/sched/Makefile b/kernel/sched/Makefile
index 53f0164ed362..78f54932ea1d 100644
--- a/kernel/sched/Makefile
+++ b/kernel/sched/Makefile
@@ -25,3 +25,4 @@ obj-$(CONFIG_SCHED_DEBUG) += debug.o
 obj-$(CONFIG_CGROUP_CPUACCT) += cpuacct.o
 obj-$(CONFIG_CPU_FREQ) += cpufreq.o
 obj-$(CONFIG_CPU_FREQ_GOV_SCHEDUTIL) += cpufreq_schedutil.o
+obj-$(CONFIG_MEMBARRIER) += membarrier.o
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index bfee6ea7db49..f77269c6c2f8 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -2640,6 +2640,16 @@ static struct rq *finish_task_switch(struct task_struct *prev)
 	prev_state = prev->state;
 	vtime_task_switch(prev);
 	perf_event_task_sched_in(prev, current);
+	/*
+	 * The membarrier system call requires a full memory barrier
+	 * after storing to rq->curr, before going back to user-space.
+	 *
+	 * TODO: This smp_mb__after_unlock_lock can go away if PPC end
+	 * up adding a full barrier to switch_mm(), or we should figure
+	 * out if a smp_mb__after_unlock_lock is really the proper API
+	 * to use.
+	 */
+	smp_mb__after_unlock_lock();
 	finish_lock_switch(rq, prev);
 	finish_arch_post_lock_switch();
 
@@ -3329,6 +3339,21 @@ static void __sched notrace __schedule(bool preempt)
 	if (likely(prev != next)) {
 		rq->nr_switches++;
 		rq->curr = next;
+		/*
+		 * The membarrier system call requires each architecture
+		 * to have a full memory barrier after updating
+		 * rq->curr, before returning to user-space. For TSO
+		 * (e.g. x86), the architecture must provide its own
+		 * barrier in switch_mm(). For weakly ordered machines
+		 * for which spin_unlock() acts as a full memory
+		 * barrier, finish_lock_switch() in common code takes
+		 * care of this barrier. For weakly ordered machines for
+		 * which spin_unlock() acts as a RELEASE barrier (only
+		 * arm64 and PowerPC), arm64 has a full barrier in
+		 * switch_to(), and PowerPC has
+		 * smp_mb__after_unlock_lock() before
+		 * finish_lock_switch().
+		 */
 		++*switch_count;
 
 		trace_sched_switch(preempt, prev, next);
diff --git a/kernel/sched/membarrier.c b/kernel/sched/membarrier.c
new file mode 100644
index 000000000000..a92fddc22747
--- /dev/null
+++ b/kernel/sched/membarrier.c
@@ -0,0 +1,152 @@
+/*
+ * Copyright (C) 2010-2017 Mathieu Desnoyers <mathieu.desnoyers@efficios.com>
+ *
+ * membarrier system call
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ */
+
+#include <linux/syscalls.h>
+#include <linux/membarrier.h>
+#include <linux/tick.h>
+#include <linux/cpumask.h>
+
+#include "sched.h"	/* for cpu_rq(). */
+
+/*
+ * Bitmask made from a "or" of all commands within enum membarrier_cmd,
+ * except MEMBARRIER_CMD_QUERY.
+ */
+#define MEMBARRIER_CMD_BITMASK	\
+	(MEMBARRIER_CMD_SHARED | MEMBARRIER_CMD_PRIVATE_EXPEDITED)
+
+static void ipi_mb(void *info)
+{
+	smp_mb();	/* IPIs should be serializing but paranoid. */
+}
+
+static void membarrier_private_expedited(void)
+{
+	int cpu;
+	bool fallback = false;
+	cpumask_var_t tmpmask;
+
+	if (num_online_cpus() == 1)
+		return;
+
+	/*
+	 * Matches memory barriers around rq->curr modification in
+	 * scheduler.
+	 */
+	smp_mb();	/* system call entry is not a mb. */
+
+	/*
+	 * Expedited membarrier commands guarantee that they won't
+	 * block, hence the GFP_NOWAIT allocation flag and fallback
+	 * implementation.
+	 */
+	if (!zalloc_cpumask_var(&tmpmask, GFP_NOWAIT)) {
+		/* Fallback for OOM. */
+		fallback = true;
+	}
+
+	cpus_read_lock();
+	for_each_online_cpu(cpu) {
+		struct task_struct *p;
+
+		/*
+		 * Skipping the current CPU is OK even through we can be
+		 * migrated at any point. The current CPU, at the point
+		 * where we read raw_smp_processor_id(), is ensured to
+		 * be in program order with respect to the caller
+		 * thread. Therefore, we can skip this CPU from the
+		 * iteration.
+		 */
+		if (cpu == raw_smp_processor_id())
+			continue;
+		rcu_read_lock();
+		p = task_rcu_dereference(&cpu_rq(cpu)->curr);
+		if (p && p->mm == current->mm) {
+			if (!fallback)
+				__cpumask_set_cpu(cpu, tmpmask);
+			else
+				smp_call_function_single(cpu, ipi_mb, NULL, 1);
+		}
+		rcu_read_unlock();
+	}
+	if (!fallback) {
+		smp_call_function_many(tmpmask, ipi_mb, NULL, 1);
+		free_cpumask_var(tmpmask);
+	}
+	cpus_read_unlock();
+
+	/*
+	 * Memory barrier on the caller thread _after_ we finished
+	 * waiting for the last IPI. Matches memory barriers around
+	 * rq->curr modification in scheduler.
+	 */
+	smp_mb();	/* exit from system call is not a mb */
+}
+
+/**
+ * sys_membarrier - issue memory barriers on a set of threads
+ * @cmd:   Takes command values defined in enum membarrier_cmd.
+ * @flags: Currently needs to be 0. For future extensions.
+ *
+ * If this system call is not implemented, -ENOSYS is returned. If the
+ * command specified does not exist, not available on the running
+ * kernel, or if the command argument is invalid, this system call
+ * returns -EINVAL. For a given command, with flags argument set to 0,
+ * this system call is guaranteed to always return the same value until
+ * reboot.
+ *
+ * All memory accesses performed in program order from each targeted thread
+ * is guaranteed to be ordered with respect to sys_membarrier(). If we use
+ * the semantic "barrier()" to represent a compiler barrier forcing memory
+ * accesses to be performed in program order across the barrier, and
+ * smp_mb() to represent explicit memory barriers forcing full memory
+ * ordering across the barrier, we have the following ordering table for
+ * each pair of barrier(), sys_membarrier() and smp_mb():
+ *
+ * The pair ordering is detailed as (O: ordered, X: not ordered):
+ *
+ *                        barrier()   smp_mb() sys_membarrier()
+ *        barrier()          X           X            O
+ *        smp_mb()           X           O            O
+ *        sys_membarrier()   O           O            O
+ */
+SYSCALL_DEFINE2(membarrier, int, cmd, int, flags)
+{
+	if (unlikely(flags))
+		return -EINVAL;
+	switch (cmd) {
+	case MEMBARRIER_CMD_QUERY:
+	{
+		int cmd_mask = MEMBARRIER_CMD_BITMASK;
+
+		if (tick_nohz_full_enabled())
+			cmd_mask &= ~MEMBARRIER_CMD_SHARED;
+		return cmd_mask;
+	}
+	case MEMBARRIER_CMD_SHARED:
+		/* MEMBARRIER_CMD_SHARED is not compatible with nohz_full. */
+		if (tick_nohz_full_enabled())
+			return -EINVAL;
+		if (num_online_cpus() > 1)
+			synchronize_sched();
+		return 0;
+	case MEMBARRIER_CMD_PRIVATE_EXPEDITED:
+		membarrier_private_expedited();
+		return 0;
+	default:
+		return -EINVAL;
+	}
+}
-- 
cgit v1.2.3-71-gd317


From 0888e372c37fa31882c8ed89fb2f8188b08b6718 Mon Sep 17 00:00:00 2001
From: "Levin, Alexander (Sasha Levin)" <alexander.levin@one.verizon.com>
Date: Thu, 17 Aug 2017 00:35:11 +0000
Subject: net: inet: diag: expose sockets cgroup classid

This is useful for directly looking up a task based on class id rather than
having to scan through all open file descriptors.

Signed-off-by: Sasha Levin <alexander.levin@verizon.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/uapi/linux/inet_diag.h |  1 +
 net/ipv4/inet_diag.c           | 11 +++++++++++
 2 files changed, 12 insertions(+)

(limited to 'include/uapi/linux')

diff --git a/include/uapi/linux/inet_diag.h b/include/uapi/linux/inet_diag.h
index bbe201047df6..678496897a68 100644
--- a/include/uapi/linux/inet_diag.h
+++ b/include/uapi/linux/inet_diag.h
@@ -142,6 +142,7 @@ enum {
 	INET_DIAG_PAD,
 	INET_DIAG_MARK,
 	INET_DIAG_BBRINFO,
+	INET_DIAG_CLASS_ID,
 	__INET_DIAG_MAX,
 };
 
diff --git a/net/ipv4/inet_diag.c b/net/ipv4/inet_diag.c
index 3828b3a805cd..67325d5832d7 100644
--- a/net/ipv4/inet_diag.c
+++ b/net/ipv4/inet_diag.c
@@ -274,6 +274,17 @@ int inet_sk_diag_fill(struct sock *sk, struct inet_connection_sock *icsk,
 			goto errout;
 	}
 
+	if (ext & (1 << (INET_DIAG_CLASS_ID - 1))) {
+		u32 classid = 0;
+
+#ifdef CONFIG_SOCK_CGROUP_DATA
+		classid = sock_cgroup_classid(&sk->sk_cgrp_data);
+#endif
+
+		if (nla_put_u32(skb, INET_DIAG_CLASS_ID, classid))
+			goto errout;
+	}
+
 out:
 	nlmsg_end(skb, nlh);
 	return 0;
-- 
cgit v1.2.3-71-gd317


From 99d1712bc41c7c9a5a473c104a4ad15427757b22 Mon Sep 17 00:00:00 2001
From: Florian Westphal <fw@strlen.de>
Date: Tue, 8 Aug 2017 15:15:29 +0200
Subject: netfilter: exthdr: tcp option set support

This allows setting 2 and 4 byte quantities in the tcp option space.
Main purpose is to allow native replacement for xt_TCPMSS to
work around pmtu blackholes.

Writes to kind and len are now allowed at the moment, it does not seem
useful to do this as it causes corruption of the tcp option space.

We can always lift this restriction later if a use-case appears.

Signed-off-by: Florian Westphal <fw@strlen.de>
Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
---
 include/uapi/linux/netfilter/nf_tables.h |   4 +-
 net/netfilter/nft_exthdr.c               | 164 ++++++++++++++++++++++++++++++-
 2 files changed, 165 insertions(+), 3 deletions(-)

(limited to 'include/uapi/linux')

diff --git a/include/uapi/linux/netfilter/nf_tables.h b/include/uapi/linux/netfilter/nf_tables.h
index be25cf69295b..40fd199f7531 100644
--- a/include/uapi/linux/netfilter/nf_tables.h
+++ b/include/uapi/linux/netfilter/nf_tables.h
@@ -732,7 +732,8 @@ enum nft_exthdr_op {
  * @NFTA_EXTHDR_OFFSET: extension header offset (NLA_U32)
  * @NFTA_EXTHDR_LEN: extension header length (NLA_U32)
  * @NFTA_EXTHDR_FLAGS: extension header flags (NLA_U32)
- * @NFTA_EXTHDR_OP: option match type (NLA_U8)
+ * @NFTA_EXTHDR_OP: option match type (NLA_U32)
+ * @NFTA_EXTHDR_SREG: option match type (NLA_U32)
  */
 enum nft_exthdr_attributes {
 	NFTA_EXTHDR_UNSPEC,
@@ -742,6 +743,7 @@ enum nft_exthdr_attributes {
 	NFTA_EXTHDR_LEN,
 	NFTA_EXTHDR_FLAGS,
 	NFTA_EXTHDR_OP,
+	NFTA_EXTHDR_SREG,
 	__NFTA_EXTHDR_MAX
 };
 #define NFTA_EXTHDR_MAX		(__NFTA_EXTHDR_MAX - 1)
diff --git a/net/netfilter/nft_exthdr.c b/net/netfilter/nft_exthdr.c
index e3a6eebe7e0c..f5a0bf5e3bdd 100644
--- a/net/netfilter/nft_exthdr.c
+++ b/net/netfilter/nft_exthdr.c
@@ -8,6 +8,7 @@
  * Development of this code funded by Astaro AG (http://www.astaro.com/)
  */
 
+#include <asm/unaligned.h>
 #include <linux/kernel.h>
 #include <linux/init.h>
 #include <linux/module.h>
@@ -23,6 +24,7 @@ struct nft_exthdr {
 	u8			len;
 	u8			op;
 	enum nft_registers	dreg:8;
+	enum nft_registers	sreg:8;
 	u8			flags;
 };
 
@@ -124,6 +126,88 @@ err:
 		regs->verdict.code = NFT_BREAK;
 }
 
+static void nft_exthdr_tcp_set_eval(const struct nft_expr *expr,
+				    struct nft_regs *regs,
+				    const struct nft_pktinfo *pkt)
+{
+	u8 buff[sizeof(struct tcphdr) + MAX_TCP_OPTION_SPACE];
+	struct nft_exthdr *priv = nft_expr_priv(expr);
+	unsigned int i, optl, tcphdr_len, offset;
+	struct tcphdr *tcph;
+	u8 *opt;
+	u32 src;
+
+	tcph = nft_tcp_header_pointer(pkt, sizeof(buff), buff, &tcphdr_len);
+	if (!tcph)
+		return;
+
+	opt = (u8 *)tcph;
+	for (i = sizeof(*tcph); i < tcphdr_len - 1; i += optl) {
+		union {
+			u8 octet;
+			__be16 v16;
+			__be32 v32;
+		} old, new;
+
+		optl = optlen(opt, i);
+
+		if (priv->type != opt[i])
+			continue;
+
+		if (i + optl > tcphdr_len || priv->len + priv->offset > optl)
+			return;
+
+		if (!skb_make_writable(pkt->skb, pkt->xt.thoff + i + priv->len))
+			return;
+
+		tcph = nft_tcp_header_pointer(pkt, sizeof(buff), buff,
+					      &tcphdr_len);
+		if (!tcph)
+			return;
+
+		src = regs->data[priv->sreg];
+		offset = i + priv->offset;
+
+		switch (priv->len) {
+		case 2:
+			old.v16 = get_unaligned((u16 *)(opt + offset));
+			new.v16 = src;
+
+			switch (priv->type) {
+			case TCPOPT_MSS:
+				/* increase can cause connection to stall */
+				if (ntohs(old.v16) <= ntohs(new.v16))
+					return;
+			break;
+			}
+
+			if (old.v16 == new.v16)
+				return;
+
+			put_unaligned(new.v16, (u16*)(opt + offset));
+			inet_proto_csum_replace2(&tcph->check, pkt->skb,
+						 old.v16, new.v16, false);
+			break;
+		case 4:
+			new.v32 = src;
+			old.v32 = get_unaligned((u32 *)(opt + offset));
+
+			if (old.v32 == new.v32)
+				return;
+
+			put_unaligned(new.v32, (u32*)(opt + offset));
+			inet_proto_csum_replace4(&tcph->check, pkt->skb,
+						 old.v32, new.v32, false);
+			break;
+		default:
+			WARN_ON_ONCE(1);
+			break;
+		}
+
+		return;
+	}
+}
+
 static const struct nla_policy nft_exthdr_policy[NFTA_EXTHDR_MAX + 1] = {
 	[NFTA_EXTHDR_DREG]		= { .type = NLA_U32 },
 	[NFTA_EXTHDR_TYPE]		= { .type = NLA_U8 },
@@ -180,6 +264,55 @@ static int nft_exthdr_init(const struct nft_ctx *ctx,
 					   NFT_DATA_VALUE, priv->len);
 }
 
+static int nft_exthdr_tcp_set_init(const struct nft_ctx *ctx,
+				   const struct nft_expr *expr,
+				   const struct nlattr * const tb[])
+{
+	struct nft_exthdr *priv = nft_expr_priv(expr);
+	u32 offset, len, flags = 0, op = NFT_EXTHDR_OP_IPV6;
+	int err;
+
+	if (!tb[NFTA_EXTHDR_SREG] ||
+	    !tb[NFTA_EXTHDR_TYPE] ||
+	    !tb[NFTA_EXTHDR_OFFSET] ||
+	    !tb[NFTA_EXTHDR_LEN])
+		return -EINVAL;
+
+	if (tb[NFTA_EXTHDR_DREG] || tb[NFTA_EXTHDR_FLAGS])
+		return -EINVAL;
+
+	err = nft_parse_u32_check(tb[NFTA_EXTHDR_OFFSET], U8_MAX, &offset);
+	if (err < 0)
+		return err;
+
+	err = nft_parse_u32_check(tb[NFTA_EXTHDR_LEN], U8_MAX, &len);
+	if (err < 0)
+		return err;
+
+	if (offset < 2)
+		return -EOPNOTSUPP;
+
+	switch (len) {
+	case 2: break;
+	case 4: break;
+	default:
+		return -EOPNOTSUPP;
+	}
+
+	err = nft_parse_u32_check(tb[NFTA_EXTHDR_OP], U8_MAX, &op);
+	if (err < 0)
+		return err;
+
+	priv->type   = nla_get_u8(tb[NFTA_EXTHDR_TYPE]);
+	priv->offset = offset;
+	priv->len    = len;
+	priv->sreg   = nft_parse_register(tb[NFTA_EXTHDR_SREG]);
+	priv->flags  = flags;
+	priv->op     = op;
+
+	return nft_validate_register_load(priv->sreg, priv->len);
+}
+
 static int nft_exthdr_dump_common(struct sk_buff *skb, const struct nft_exthdr *priv)
 {
 	if (nla_put_u8(skb, NFTA_EXTHDR_TYPE, priv->type))
@@ -208,6 +341,16 @@ static int nft_exthdr_dump(struct sk_buff *skb, const struct nft_expr *expr)
 	return nft_exthdr_dump_common(skb, priv);
 }
 
+static int nft_exthdr_dump_set(struct sk_buff *skb, const struct nft_expr *expr)
+{
+	const struct nft_exthdr *priv = nft_expr_priv(expr);
+
+	if (nft_dump_register(skb, NFTA_EXTHDR_SREG, priv->sreg))
+		return -1;
+
+	return nft_exthdr_dump_common(skb, priv);
+}
+
 static struct nft_expr_type nft_exthdr_type;
 static const struct nft_expr_ops nft_exthdr_ipv6_ops = {
 	.type		= &nft_exthdr_type,
@@ -225,6 +368,14 @@ static const struct nft_expr_ops nft_exthdr_tcp_ops = {
 	.dump		= nft_exthdr_dump,
 };
 
+static const struct nft_expr_ops nft_exthdr_tcp_set_ops = {
+	.type		= &nft_exthdr_type,
+	.size		= NFT_EXPR_SIZE(sizeof(struct nft_exthdr)),
+	.eval		= nft_exthdr_tcp_set_eval,
+	.init		= nft_exthdr_tcp_set_init,
+	.dump		= nft_exthdr_dump_set,
+};
+
 static const struct nft_expr_ops *
 nft_exthdr_select_ops(const struct nft_ctx *ctx,
 		      const struct nlattr * const tb[])
@@ -234,12 +385,21 @@ nft_exthdr_select_ops(const struct nft_ctx *ctx,
 	if (!tb[NFTA_EXTHDR_OP])
 		return &nft_exthdr_ipv6_ops;
 
+	if (tb[NFTA_EXTHDR_SREG] && tb[NFTA_EXTHDR_DREG])
+		return ERR_PTR(-EOPNOTSUPP);
+
 	op = ntohl(nla_get_u32(tb[NFTA_EXTHDR_OP]));
 	switch (op) {
 	case NFT_EXTHDR_OP_TCPOPT:
-		return &nft_exthdr_tcp_ops;
+		if (tb[NFTA_EXTHDR_SREG])
+			return &nft_exthdr_tcp_set_ops;
+		if (tb[NFTA_EXTHDR_DREG])
+			return &nft_exthdr_tcp_ops;
+		break;
 	case NFT_EXTHDR_OP_IPV6:
-		return &nft_exthdr_ipv6_ops;
+		if (tb[NFTA_EXTHDR_DREG])
+			return &nft_exthdr_ipv6_ops;
+		break;
 	}
 
 	return ERR_PTR(-EOPNOTSUPP);
-- 
cgit v1.2.3-71-gd317


From 6b5dc98e8fac041a3decfc3186e08c1c570ea691 Mon Sep 17 00:00:00 2001
From: Florian Westphal <fw@strlen.de>
Date: Tue, 8 Aug 2017 15:48:04 +0200
Subject: netfilter: rt: add support to fetch path mss

to be used in combination with tcp option set support to mimic
iptables TCPMSS --clamp-mss-to-pmtu.

v2: Eric Dumazet points out dst must be initialized.

Signed-off-by: Florian Westphal <fw@strlen.de>
Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
---
 include/uapi/linux/netfilter/nf_tables.h |  2 +
 net/netfilter/nft_rt.c                   | 66 ++++++++++++++++++++++++++++++++
 2 files changed, 68 insertions(+)

(limited to 'include/uapi/linux')

diff --git a/include/uapi/linux/netfilter/nf_tables.h b/include/uapi/linux/netfilter/nf_tables.h
index 40fd199f7531..b49da72efa68 100644
--- a/include/uapi/linux/netfilter/nf_tables.h
+++ b/include/uapi/linux/netfilter/nf_tables.h
@@ -811,11 +811,13 @@ enum nft_meta_keys {
  * @NFT_RT_CLASSID: realm value of packet's route (skb->dst->tclassid)
  * @NFT_RT_NEXTHOP4: routing nexthop for IPv4
  * @NFT_RT_NEXTHOP6: routing nexthop for IPv6
+ * @NFT_RT_TCPMSS: fetch current path tcp mss
  */
 enum nft_rt_keys {
 	NFT_RT_CLASSID,
 	NFT_RT_NEXTHOP4,
 	NFT_RT_NEXTHOP6,
+	NFT_RT_TCPMSS,
 };
 
 /**
diff --git a/net/netfilter/nft_rt.c b/net/netfilter/nft_rt.c
index c7383d8f88d0..e142e65d3176 100644
--- a/net/netfilter/nft_rt.c
+++ b/net/netfilter/nft_rt.c
@@ -23,6 +23,42 @@ struct nft_rt {
 	enum nft_registers	dreg:8;
 };
 
+static u16 get_tcpmss(const struct nft_pktinfo *pkt, const struct dst_entry *skbdst)
+{
+	u32 minlen = sizeof(struct ipv6hdr), mtu = dst_mtu(skbdst);
+	const struct sk_buff *skb = pkt->skb;
+	const struct nf_afinfo *ai;
+	struct flowi fl;
+
+	memset(&fl, 0, sizeof(fl));
+
+	switch (nft_pf(pkt)) {
+	case NFPROTO_IPV4:
+		fl.u.ip4.daddr = ip_hdr(skb)->saddr;
+		minlen = sizeof(struct iphdr);
+		break;
+	case NFPROTO_IPV6:
+		fl.u.ip6.daddr = ipv6_hdr(skb)->saddr;
+		break;
+	}
+
+	ai = nf_get_afinfo(nft_pf(pkt));
+	if (ai) {
+		struct dst_entry *dst = NULL;
+
+		ai->route(nft_net(pkt), &dst, &fl, false);
+		if (dst) {
+			mtu = min(mtu, dst_mtu(dst));
+			dst_release(dst);
+		}
+	}
+
+	if (mtu <= minlen || mtu > 0xffff)
+		return TCP_MSS_DEFAULT;
+
+	return mtu - minlen;
+}
+
 static void nft_rt_get_eval(const struct nft_expr *expr,
 			    struct nft_regs *regs,
 			    const struct nft_pktinfo *pkt)
@@ -57,6 +93,9 @@ static void nft_rt_get_eval(const struct nft_expr *expr,
 					 &ipv6_hdr(skb)->daddr),
 		       sizeof(struct in6_addr));
 		break;
+	case NFT_RT_TCPMSS:
+		nft_reg_store16(dest, get_tcpmss(pkt, dst));
+		break;
 	default:
 		WARN_ON(1);
 		goto err;
@@ -94,6 +133,9 @@ static int nft_rt_get_init(const struct nft_ctx *ctx,
 	case NFT_RT_NEXTHOP6:
 		len = sizeof(struct in6_addr);
 		break;
+	case NFT_RT_TCPMSS:
+		len = sizeof(u16);
+		break;
 	default:
 		return -EOPNOTSUPP;
 	}
@@ -118,6 +160,29 @@ nla_put_failure:
 	return -1;
 }
 
+static int nft_rt_validate(const struct nft_ctx *ctx, const struct nft_expr *expr,
+			   const struct nft_data **data)
+{
+	const struct nft_rt *priv = nft_expr_priv(expr);
+	unsigned int hooks;
+
+	switch (priv->key) {
+	case NFT_RT_NEXTHOP4:
+	case NFT_RT_NEXTHOP6:
+	case NFT_RT_CLASSID:
+		return 0;
+	case NFT_RT_TCPMSS:
+		hooks = (1 << NF_INET_FORWARD) |
+			(1 << NF_INET_LOCAL_OUT) |
+			(1 << NF_INET_POST_ROUTING);
+		break;
+	default:
+		return -EINVAL;
+	}
+
+	return nft_chain_validate_hooks(ctx->chain, hooks);
+}
+
 static struct nft_expr_type nft_rt_type;
 static const struct nft_expr_ops nft_rt_get_ops = {
 	.type		= &nft_rt_type,
@@ -125,6 +190,7 @@ static const struct nft_expr_ops nft_rt_get_ops = {
 	.eval		= nft_rt_get_eval,
 	.init		= nft_rt_get_init,
 	.dump		= nft_rt_get_dump,
+	.validate	= nft_rt_validate,
 };
 
 static struct nft_expr_type nft_rt_type __read_mostly = {
-- 
cgit v1.2.3-71-gd317


From 96eabe7a40aa17e613cf3db2c742ee8b1fc764d0 Mon Sep 17 00:00:00 2001
From: Martin KaFai Lau <kafai@fb.com>
Date: Fri, 18 Aug 2017 11:28:00 -0700
Subject: bpf: Allow selecting numa node during map creation

The current map creation API does not allow to provide the numa-node
preference.  The memory usually comes from where the map-creation-process
is running.  The performance is not ideal if the bpf_prog is known to
always run in a numa node different from the map-creation-process.

One of the use case is sharding on CPU to different LRU maps (i.e.
an array of LRU maps).  Here is the test result of map_perf_test on
the INNER_LRU_HASH_PREALLOC test if we force the lru map used by
CPU0 to be allocated from a remote numa node:

[ The machine has 20 cores. CPU0-9 at node 0. CPU10-19 at node 1 ]

># taskset -c 10 ./map_perf_test 512 8 1260000 8000000
5:inner_lru_hash_map_perf pre-alloc 1628380 events per sec
4:inner_lru_hash_map_perf pre-alloc 1626396 events per sec
3:inner_lru_hash_map_perf pre-alloc 1626144 events per sec
6:inner_lru_hash_map_perf pre-alloc 1621657 events per sec
2:inner_lru_hash_map_perf pre-alloc 1621534 events per sec
1:inner_lru_hash_map_perf pre-alloc 1620292 events per sec
7:inner_lru_hash_map_perf pre-alloc 1613305 events per sec
0:inner_lru_hash_map_perf pre-alloc 1239150 events per sec  #<<<

After specifying numa node:
># taskset -c 10 ./map_perf_test 512 8 1260000 8000000
5:inner_lru_hash_map_perf pre-alloc 1629627 events per sec
3:inner_lru_hash_map_perf pre-alloc 1628057 events per sec
1:inner_lru_hash_map_perf pre-alloc 1623054 events per sec
6:inner_lru_hash_map_perf pre-alloc 1616033 events per sec
2:inner_lru_hash_map_perf pre-alloc 1614630 events per sec
4:inner_lru_hash_map_perf pre-alloc 1612651 events per sec
7:inner_lru_hash_map_perf pre-alloc 1609337 events per sec
0:inner_lru_hash_map_perf pre-alloc 1619340 events per sec #<<<

This patch adds one field, numa_node, to the bpf_attr.  Since numa node 0
is a valid node, a new flag BPF_F_NUMA_NODE is also added.  The numa_node
field is honored if and only if the BPF_F_NUMA_NODE flag is set.

Numa node selection is not supported for percpu map.

This patch does not change all the kmalloc.  F.e.
'htab = kzalloc()' is not changed since the object
is small enough to stay in the cache.

Signed-off-by: Martin KaFai Lau <kafai@fb.com>
Acked-by: Daniel Borkmann <daniel@iogearbox.net>
Acked-by: Alexei Starovoitov <ast@fb.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/bpf.h      | 10 +++++++++-
 include/uapi/linux/bpf.h | 10 +++++++++-
 kernel/bpf/arraymap.c    |  7 +++++--
 kernel/bpf/devmap.c      |  9 ++++++---
 kernel/bpf/hashtab.c     | 19 +++++++++++++++----
 kernel/bpf/lpm_trie.c    |  9 +++++++--
 kernel/bpf/sockmap.c     | 10 +++++++---
 kernel/bpf/stackmap.c    |  8 +++++---
 kernel/bpf/syscall.c     | 14 ++++++++++----
 9 files changed, 73 insertions(+), 23 deletions(-)

(limited to 'include/uapi/linux')

diff --git a/include/linux/bpf.h b/include/linux/bpf.h
index 1cc6c5ff61ec..55b88e329804 100644
--- a/include/linux/bpf.h
+++ b/include/linux/bpf.h
@@ -51,6 +51,7 @@ struct bpf_map {
 	u32 map_flags;
 	u32 pages;
 	u32 id;
+	int numa_node;
 	struct user_struct *user;
 	const struct bpf_map_ops *ops;
 	struct work_struct work;
@@ -264,7 +265,7 @@ struct bpf_map * __must_check bpf_map_inc(struct bpf_map *map, bool uref);
 void bpf_map_put_with_uref(struct bpf_map *map);
 void bpf_map_put(struct bpf_map *map);
 int bpf_map_precharge_memlock(u32 pages);
-void *bpf_map_area_alloc(size_t size);
+void *bpf_map_area_alloc(size_t size, int numa_node);
 void bpf_map_area_free(void *base);
 
 extern int sysctl_unprivileged_bpf_disabled;
@@ -316,6 +317,13 @@ struct net_device  *__dev_map_lookup_elem(struct bpf_map *map, u32 key);
 void __dev_map_insert_ctx(struct bpf_map *map, u32 index);
 void __dev_map_flush(struct bpf_map *map);
 
+/* Return map's numa specified by userspace */
+static inline int bpf_map_attr_numa_node(const union bpf_attr *attr)
+{
+	return (attr->map_flags & BPF_F_NUMA_NODE) ?
+		attr->numa_node : NUMA_NO_NODE;
+}
+
 #else
 static inline struct bpf_prog *bpf_prog_get(u32 ufd)
 {
diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h
index 5ecbe812a2cc..843818dff96d 100644
--- a/include/uapi/linux/bpf.h
+++ b/include/uapi/linux/bpf.h
@@ -165,6 +165,7 @@ enum bpf_attach_type {
 #define BPF_NOEXIST	1 /* create new element if it didn't exist */
 #define BPF_EXIST	2 /* update existing element */
 
+/* flags for BPF_MAP_CREATE command */
 #define BPF_F_NO_PREALLOC	(1U << 0)
 /* Instead of having one common LRU list in the
  * BPF_MAP_TYPE_LRU_[PERCPU_]HASH map, use a percpu LRU list
@@ -173,6 +174,8 @@ enum bpf_attach_type {
  * across different LRU lists.
  */
 #define BPF_F_NO_COMMON_LRU	(1U << 1)
+/* Specify numa node during map creation */
+#define BPF_F_NUMA_NODE		(1U << 2)
 
 union bpf_attr {
 	struct { /* anonymous struct used by BPF_MAP_CREATE command */
@@ -180,8 +183,13 @@ union bpf_attr {
 		__u32	key_size;	/* size of key in bytes */
 		__u32	value_size;	/* size of value in bytes */
 		__u32	max_entries;	/* max number of entries in a map */
-		__u32	map_flags;	/* prealloc or not */
+		__u32	map_flags;	/* BPF_MAP_CREATE related
+					 * flags defined above.
+					 */
 		__u32	inner_map_fd;	/* fd pointing to the inner map */
+		__u32	numa_node;	/* numa node (effective only if
+					 * BPF_F_NUMA_NODE is set).
+					 */
 	};
 
 	struct { /* anonymous struct used by BPF_MAP_*_ELEM commands */
diff --git a/kernel/bpf/arraymap.c b/kernel/bpf/arraymap.c
index d771a3872500..96e9c5c1dfc9 100644
--- a/kernel/bpf/arraymap.c
+++ b/kernel/bpf/arraymap.c
@@ -49,13 +49,15 @@ static int bpf_array_alloc_percpu(struct bpf_array *array)
 static struct bpf_map *array_map_alloc(union bpf_attr *attr)
 {
 	bool percpu = attr->map_type == BPF_MAP_TYPE_PERCPU_ARRAY;
+	int numa_node = bpf_map_attr_numa_node(attr);
 	struct bpf_array *array;
 	u64 array_size;
 	u32 elem_size;
 
 	/* check sanity of attributes */
 	if (attr->max_entries == 0 || attr->key_size != 4 ||
-	    attr->value_size == 0 || attr->map_flags)
+	    attr->value_size == 0 || attr->map_flags & ~BPF_F_NUMA_NODE ||
+	    (percpu && numa_node != NUMA_NO_NODE))
 		return ERR_PTR(-EINVAL);
 
 	if (attr->value_size > KMALLOC_MAX_SIZE)
@@ -77,7 +79,7 @@ static struct bpf_map *array_map_alloc(union bpf_attr *attr)
 		return ERR_PTR(-ENOMEM);
 
 	/* allocate all map elements and zero-initialize them */
-	array = bpf_map_area_alloc(array_size);
+	array = bpf_map_area_alloc(array_size, numa_node);
 	if (!array)
 		return ERR_PTR(-ENOMEM);
 
@@ -87,6 +89,7 @@ static struct bpf_map *array_map_alloc(union bpf_attr *attr)
 	array->map.value_size = attr->value_size;
 	array->map.max_entries = attr->max_entries;
 	array->map.map_flags = attr->map_flags;
+	array->map.numa_node = numa_node;
 	array->elem_size = elem_size;
 
 	if (!percpu)
diff --git a/kernel/bpf/devmap.c b/kernel/bpf/devmap.c
index 18a72a8add43..67f4f00ce33a 100644
--- a/kernel/bpf/devmap.c
+++ b/kernel/bpf/devmap.c
@@ -80,7 +80,7 @@ static struct bpf_map *dev_map_alloc(union bpf_attr *attr)
 
 	/* check sanity of attributes */
 	if (attr->max_entries == 0 || attr->key_size != 4 ||
-	    attr->value_size != 4 || attr->map_flags)
+	    attr->value_size != 4 || attr->map_flags & ~BPF_F_NUMA_NODE)
 		return ERR_PTR(-EINVAL);
 
 	dtab = kzalloc(sizeof(*dtab), GFP_USER);
@@ -93,6 +93,7 @@ static struct bpf_map *dev_map_alloc(union bpf_attr *attr)
 	dtab->map.value_size = attr->value_size;
 	dtab->map.max_entries = attr->max_entries;
 	dtab->map.map_flags = attr->map_flags;
+	dtab->map.numa_node = bpf_map_attr_numa_node(attr);
 
 	err = -ENOMEM;
 
@@ -119,7 +120,8 @@ static struct bpf_map *dev_map_alloc(union bpf_attr *attr)
 		goto free_dtab;
 
 	dtab->netdev_map = bpf_map_area_alloc(dtab->map.max_entries *
-					      sizeof(struct bpf_dtab_netdev *));
+					      sizeof(struct bpf_dtab_netdev *),
+					      dtab->map.numa_node);
 	if (!dtab->netdev_map)
 		goto free_dtab;
 
@@ -344,7 +346,8 @@ static int dev_map_update_elem(struct bpf_map *map, void *key, void *value,
 	if (!ifindex) {
 		dev = NULL;
 	} else {
-		dev = kmalloc(sizeof(*dev), GFP_ATOMIC | __GFP_NOWARN);
+		dev = kmalloc_node(sizeof(*dev), GFP_ATOMIC | __GFP_NOWARN,
+				   map->numa_node);
 		if (!dev)
 			return -ENOMEM;
 
diff --git a/kernel/bpf/hashtab.c b/kernel/bpf/hashtab.c
index 4fb463172aa8..47ae748c3a49 100644
--- a/kernel/bpf/hashtab.c
+++ b/kernel/bpf/hashtab.c
@@ -18,6 +18,9 @@
 #include "bpf_lru_list.h"
 #include "map_in_map.h"
 
+#define HTAB_CREATE_FLAG_MASK \
+	(BPF_F_NO_PREALLOC | BPF_F_NO_COMMON_LRU | BPF_F_NUMA_NODE)
+
 struct bucket {
 	struct hlist_nulls_head head;
 	raw_spinlock_t lock;
@@ -138,7 +141,8 @@ static int prealloc_init(struct bpf_htab *htab)
 	if (!htab_is_percpu(htab) && !htab_is_lru(htab))
 		num_entries += num_possible_cpus();
 
-	htab->elems = bpf_map_area_alloc(htab->elem_size * num_entries);
+	htab->elems = bpf_map_area_alloc(htab->elem_size * num_entries,
+					 htab->map.numa_node);
 	if (!htab->elems)
 		return -ENOMEM;
 
@@ -233,6 +237,7 @@ static struct bpf_map *htab_map_alloc(union bpf_attr *attr)
 	 */
 	bool percpu_lru = (attr->map_flags & BPF_F_NO_COMMON_LRU);
 	bool prealloc = !(attr->map_flags & BPF_F_NO_PREALLOC);
+	int numa_node = bpf_map_attr_numa_node(attr);
 	struct bpf_htab *htab;
 	int err, i;
 	u64 cost;
@@ -248,7 +253,7 @@ static struct bpf_map *htab_map_alloc(union bpf_attr *attr)
 		 */
 		return ERR_PTR(-EPERM);
 
-	if (attr->map_flags & ~(BPF_F_NO_PREALLOC | BPF_F_NO_COMMON_LRU))
+	if (attr->map_flags & ~HTAB_CREATE_FLAG_MASK)
 		/* reserved bits should not be used */
 		return ERR_PTR(-EINVAL);
 
@@ -258,6 +263,9 @@ static struct bpf_map *htab_map_alloc(union bpf_attr *attr)
 	if (lru && !prealloc)
 		return ERR_PTR(-ENOTSUPP);
 
+	if (numa_node != NUMA_NO_NODE && (percpu || percpu_lru))
+		return ERR_PTR(-EINVAL);
+
 	htab = kzalloc(sizeof(*htab), GFP_USER);
 	if (!htab)
 		return ERR_PTR(-ENOMEM);
@@ -268,6 +276,7 @@ static struct bpf_map *htab_map_alloc(union bpf_attr *attr)
 	htab->map.value_size = attr->value_size;
 	htab->map.max_entries = attr->max_entries;
 	htab->map.map_flags = attr->map_flags;
+	htab->map.numa_node = numa_node;
 
 	/* check sanity of attributes.
 	 * value_size == 0 may be allowed in the future to use map as a set
@@ -346,7 +355,8 @@ static struct bpf_map *htab_map_alloc(union bpf_attr *attr)
 
 	err = -ENOMEM;
 	htab->buckets = bpf_map_area_alloc(htab->n_buckets *
-					   sizeof(struct bucket));
+					   sizeof(struct bucket),
+					   htab->map.numa_node);
 	if (!htab->buckets)
 		goto free_htab;
 
@@ -689,7 +699,8 @@ static struct htab_elem *alloc_htab_elem(struct bpf_htab *htab, void *key,
 				atomic_dec(&htab->count);
 				return ERR_PTR(-E2BIG);
 			}
-		l_new = kmalloc(htab->elem_size, GFP_ATOMIC | __GFP_NOWARN);
+		l_new = kmalloc_node(htab->elem_size, GFP_ATOMIC | __GFP_NOWARN,
+				     htab->map.numa_node);
 		if (!l_new)
 			return ERR_PTR(-ENOMEM);
 	}
diff --git a/kernel/bpf/lpm_trie.c b/kernel/bpf/lpm_trie.c
index b09185f0f17d..1b767844a76f 100644
--- a/kernel/bpf/lpm_trie.c
+++ b/kernel/bpf/lpm_trie.c
@@ -244,7 +244,8 @@ static struct lpm_trie_node *lpm_trie_node_alloc(const struct lpm_trie *trie,
 	if (value)
 		size += trie->map.value_size;
 
-	node = kmalloc(size, GFP_ATOMIC | __GFP_NOWARN);
+	node = kmalloc_node(size, GFP_ATOMIC | __GFP_NOWARN,
+			    trie->map.numa_node);
 	if (!node)
 		return NULL;
 
@@ -405,6 +406,8 @@ static int trie_delete_elem(struct bpf_map *map, void *key)
 #define LPM_KEY_SIZE_MAX	LPM_KEY_SIZE(LPM_DATA_SIZE_MAX)
 #define LPM_KEY_SIZE_MIN	LPM_KEY_SIZE(LPM_DATA_SIZE_MIN)
 
+#define LPM_CREATE_FLAG_MASK	(BPF_F_NO_PREALLOC | BPF_F_NUMA_NODE)
+
 static struct bpf_map *trie_alloc(union bpf_attr *attr)
 {
 	struct lpm_trie *trie;
@@ -416,7 +419,8 @@ static struct bpf_map *trie_alloc(union bpf_attr *attr)
 
 	/* check sanity of attributes */
 	if (attr->max_entries == 0 ||
-	    attr->map_flags != BPF_F_NO_PREALLOC ||
+	    !(attr->map_flags & BPF_F_NO_PREALLOC) ||
+	    attr->map_flags & ~LPM_CREATE_FLAG_MASK ||
 	    attr->key_size < LPM_KEY_SIZE_MIN ||
 	    attr->key_size > LPM_KEY_SIZE_MAX ||
 	    attr->value_size < LPM_VAL_SIZE_MIN ||
@@ -433,6 +437,7 @@ static struct bpf_map *trie_alloc(union bpf_attr *attr)
 	trie->map.value_size = attr->value_size;
 	trie->map.max_entries = attr->max_entries;
 	trie->map.map_flags = attr->map_flags;
+	trie->map.numa_node = bpf_map_attr_numa_node(attr);
 	trie->data_size = attr->key_size -
 			  offsetof(struct bpf_lpm_trie_key, data);
 	trie->max_prefixlen = trie->data_size * 8;
diff --git a/kernel/bpf/sockmap.c b/kernel/bpf/sockmap.c
index 39de541fbcdc..78b2bb9370ac 100644
--- a/kernel/bpf/sockmap.c
+++ b/kernel/bpf/sockmap.c
@@ -443,7 +443,9 @@ static struct smap_psock *smap_init_psock(struct sock *sock,
 {
 	struct smap_psock *psock;
 
-	psock = kzalloc(sizeof(struct smap_psock), GFP_ATOMIC | __GFP_NOWARN);
+	psock = kzalloc_node(sizeof(struct smap_psock),
+			     GFP_ATOMIC | __GFP_NOWARN,
+			     stab->map.numa_node);
 	if (!psock)
 		return ERR_PTR(-ENOMEM);
 
@@ -465,7 +467,7 @@ static struct bpf_map *sock_map_alloc(union bpf_attr *attr)
 
 	/* check sanity of attributes */
 	if (attr->max_entries == 0 || attr->key_size != 4 ||
-	    attr->value_size != 4 || attr->map_flags)
+	    attr->value_size != 4 || attr->map_flags & ~BPF_F_NUMA_NODE)
 		return ERR_PTR(-EINVAL);
 
 	if (attr->value_size > KMALLOC_MAX_SIZE)
@@ -481,6 +483,7 @@ static struct bpf_map *sock_map_alloc(union bpf_attr *attr)
 	stab->map.value_size = attr->value_size;
 	stab->map.max_entries = attr->max_entries;
 	stab->map.map_flags = attr->map_flags;
+	stab->map.numa_node = bpf_map_attr_numa_node(attr);
 
 	/* make sure page count doesn't overflow */
 	cost = (u64) stab->map.max_entries * sizeof(struct sock *);
@@ -495,7 +498,8 @@ static struct bpf_map *sock_map_alloc(union bpf_attr *attr)
 		goto free_stab;
 
 	stab->sock_map = bpf_map_area_alloc(stab->map.max_entries *
-					    sizeof(struct sock *));
+					    sizeof(struct sock *),
+					    stab->map.numa_node);
 	if (!stab->sock_map)
 		goto free_stab;
 
diff --git a/kernel/bpf/stackmap.c b/kernel/bpf/stackmap.c
index 31147d730abf..135be433e9a0 100644
--- a/kernel/bpf/stackmap.c
+++ b/kernel/bpf/stackmap.c
@@ -31,7 +31,8 @@ static int prealloc_elems_and_freelist(struct bpf_stack_map *smap)
 	u32 elem_size = sizeof(struct stack_map_bucket) + smap->map.value_size;
 	int err;
 
-	smap->elems = bpf_map_area_alloc(elem_size * smap->map.max_entries);
+	smap->elems = bpf_map_area_alloc(elem_size * smap->map.max_entries,
+					 smap->map.numa_node);
 	if (!smap->elems)
 		return -ENOMEM;
 
@@ -59,7 +60,7 @@ static struct bpf_map *stack_map_alloc(union bpf_attr *attr)
 	if (!capable(CAP_SYS_ADMIN))
 		return ERR_PTR(-EPERM);
 
-	if (attr->map_flags)
+	if (attr->map_flags & ~BPF_F_NUMA_NODE)
 		return ERR_PTR(-EINVAL);
 
 	/* check sanity of attributes */
@@ -75,7 +76,7 @@ static struct bpf_map *stack_map_alloc(union bpf_attr *attr)
 	if (cost >= U32_MAX - PAGE_SIZE)
 		return ERR_PTR(-E2BIG);
 
-	smap = bpf_map_area_alloc(cost);
+	smap = bpf_map_area_alloc(cost, bpf_map_attr_numa_node(attr));
 	if (!smap)
 		return ERR_PTR(-ENOMEM);
 
@@ -91,6 +92,7 @@ static struct bpf_map *stack_map_alloc(union bpf_attr *attr)
 	smap->map.map_flags = attr->map_flags;
 	smap->n_buckets = n_buckets;
 	smap->map.pages = round_up(cost, PAGE_SIZE) >> PAGE_SHIFT;
+	smap->map.numa_node = bpf_map_attr_numa_node(attr);
 
 	err = bpf_map_precharge_memlock(smap->map.pages);
 	if (err)
diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c
index b8cb1b3c9bfb..9378f3ba2cbf 100644
--- a/kernel/bpf/syscall.c
+++ b/kernel/bpf/syscall.c
@@ -105,7 +105,7 @@ static struct bpf_map *find_and_alloc_map(union bpf_attr *attr)
 	return map;
 }
 
-void *bpf_map_area_alloc(size_t size)
+void *bpf_map_area_alloc(size_t size, int numa_node)
 {
 	/* We definitely need __GFP_NORETRY, so OOM killer doesn't
 	 * trigger under memory pressure as we really just want to
@@ -115,12 +115,13 @@ void *bpf_map_area_alloc(size_t size)
 	void *area;
 
 	if (size <= (PAGE_SIZE << PAGE_ALLOC_COSTLY_ORDER)) {
-		area = kmalloc(size, GFP_USER | flags);
+		area = kmalloc_node(size, GFP_USER | flags, numa_node);
 		if (area != NULL)
 			return area;
 	}
 
-	return __vmalloc(size, GFP_KERNEL | flags, PAGE_KERNEL);
+	return __vmalloc_node_flags_caller(size, numa_node, GFP_KERNEL | flags,
+					   __builtin_return_address(0));
 }
 
 void bpf_map_area_free(void *area)
@@ -309,10 +310,11 @@ int bpf_map_new_fd(struct bpf_map *map)
 		   offsetof(union bpf_attr, CMD##_LAST_FIELD) - \
 		   sizeof(attr->CMD##_LAST_FIELD)) != NULL
 
-#define BPF_MAP_CREATE_LAST_FIELD inner_map_fd
+#define BPF_MAP_CREATE_LAST_FIELD numa_node
 /* called via syscall */
 static int map_create(union bpf_attr *attr)
 {
+	int numa_node = bpf_map_attr_numa_node(attr);
 	struct bpf_map *map;
 	int err;
 
@@ -320,6 +322,10 @@ static int map_create(union bpf_attr *attr)
 	if (err)
 		return -EINVAL;
 
+	if (numa_node != NUMA_NO_NODE &&
+	    (numa_node >= nr_node_ids || !node_online(numa_node)))
+		return -EINVAL;
+
 	/* find map type and init map: hashtable vs rbtree vs bloom vs ... */
 	map = find_and_alloc_map(attr);
 	if (IS_ERR(map))
-- 
cgit v1.2.3-71-gd317


From 9a6b2a87405a5022660022722d4a830b768e8033 Mon Sep 17 00:00:00 2001
From: Hans Verkuil <hverkuil@xs4all.nl>
Date: Tue, 15 Aug 2017 15:26:25 -0400
Subject: media: cec: rename pin events/function

The CEC_EVENT_PIN_LOW/HIGH defines and the cec_queue_pin_event() function
did not specify that these were about CEC pin events.

Since in the future there will also be HPD pin events it is wise to rename
the event defines and function to CEC_EVENT_PIN_CEC_LOW/HIGH and
cec_queue_pin_cec_event() now before these become part of the ABI.

Signed-off-by: Hans Verkuil <hans.verkuil@cisco.com>
Signed-off-by: Mauro Carvalho Chehab <mchehab@s-opensource.com>
---
 Documentation/media/uapi/cec/cec-ioc-adap-g-caps.rst | 2 +-
 Documentation/media/uapi/cec/cec-ioc-dqevent.rst     | 8 ++++----
 Documentation/media/uapi/cec/cec-ioc-g-mode.rst      | 2 +-
 drivers/media/cec/cec-adap.c                         | 7 ++++---
 drivers/media/cec/cec-api.c                          | 4 ++--
 drivers/media/cec/cec-pin.c                          | 5 +++--
 include/media/cec.h                                  | 9 +++++----
 include/uapi/linux/cec.h                             | 4 ++--
 8 files changed, 22 insertions(+), 19 deletions(-)

(limited to 'include/uapi/linux')

diff --git a/Documentation/media/uapi/cec/cec-ioc-adap-g-caps.rst b/Documentation/media/uapi/cec/cec-ioc-adap-g-caps.rst
index 0a7aa21f24f4..6c1f6efb822e 100644
--- a/Documentation/media/uapi/cec/cec-ioc-adap-g-caps.rst
+++ b/Documentation/media/uapi/cec/cec-ioc-adap-g-caps.rst
@@ -127,7 +127,7 @@ returns the information to the application. The ioctl never fails.
       - 0x00000080
       - The CEC hardware can monitor CEC pin changes from low to high voltage
         and vice versa. When in pin monitoring mode the application will
-	receive ``CEC_EVENT_PIN_LOW`` and ``CEC_EVENT_PIN_HIGH`` events.
+	receive ``CEC_EVENT_PIN_CEC_LOW`` and ``CEC_EVENT_PIN_CEC_HIGH`` events.
 
 
diff --git a/Documentation/media/uapi/cec/cec-ioc-dqevent.rst b/Documentation/media/uapi/cec/cec-ioc-dqevent.rst
index 766d8b0ce431..db615e3405c0 100644
--- a/Documentation/media/uapi/cec/cec-ioc-dqevent.rst
+++ b/Documentation/media/uapi/cec/cec-ioc-dqevent.rst
@@ -146,16 +146,16 @@ it is guaranteed that the state did change in between the two events.
       - 2
       - Generated if one or more CEC messages were lost because the
 	application didn't dequeue CEC messages fast enough.
-    * .. _`CEC-EVENT-PIN-LOW`:
+    * .. _`CEC-EVENT-PIN-CEC-LOW`:
 
-      - ``CEC_EVENT_PIN_LOW``
+      - ``CEC_EVENT_PIN_CEC_LOW``
       - 3
       - Generated if the CEC pin goes from a high voltage to a low voltage.
         Only applies to adapters that have the ``CEC_CAP_MONITOR_PIN``
 	capability set.
-    * .. _`CEC-EVENT-PIN-HIGH`:
+    * .. _`CEC-EVENT-PIN-CEC-HIGH`:
 
-      - ``CEC_EVENT_PIN_HIGH``
+      - ``CEC_EVENT_PIN_CEC_HIGH``
       - 4
       - Generated if the CEC pin goes from a low voltage to a high voltage.
         Only applies to adapters that have the ``CEC_CAP_MONITOR_PIN``
diff --git a/Documentation/media/uapi/cec/cec-ioc-g-mode.rst b/Documentation/media/uapi/cec/cec-ioc-g-mode.rst
index 494154e9d449..4d8e0647e832 100644
--- a/Documentation/media/uapi/cec/cec-ioc-g-mode.rst
+++ b/Documentation/media/uapi/cec/cec-ioc-g-mode.rst
@@ -159,7 +159,7 @@ Available follower modes are:
 	This mode requires that the :ref:`CEC_CAP_MONITOR_PIN <CEC-CAP-MONITOR-PIN>`
 	capability is set, otherwise the ``EINVAL`` error code is returned.
 	While in pin monitoring mode this file descriptor can receive the
-	``CEC_EVENT_PIN_LOW`` and ``CEC_EVENT_PIN_HIGH`` events to see the
+	``CEC_EVENT_PIN_CEC_LOW`` and ``CEC_EVENT_PIN_CEC_HIGH`` events to see the
 	low-level CEC pin transitions. This is very useful for debugging.
 	This mode is only allowed if the process has the ``CAP_NET_ADMIN``
 	capability. If that is not set, then the ``EPERM`` error code is returned.
diff --git a/drivers/media/cec/cec-adap.c b/drivers/media/cec/cec-adap.c
index 8ac39ddf892c..d9adeb505c09 100644
--- a/drivers/media/cec/cec-adap.c
+++ b/drivers/media/cec/cec-adap.c
@@ -154,10 +154,11 @@ static void cec_queue_event(struct cec_adapter *adap,
 }
 
 /* Notify userspace that the CEC pin changed state at the given time. */
-void cec_queue_pin_event(struct cec_adapter *adap, bool is_high, ktime_t ts)
+void cec_queue_pin_cec_event(struct cec_adapter *adap, bool is_high, ktime_t ts)
 {
 	struct cec_event ev = {
-		.event = is_high ? CEC_EVENT_PIN_HIGH : CEC_EVENT_PIN_LOW,
+		.event = is_high ? CEC_EVENT_PIN_CEC_HIGH :
+				   CEC_EVENT_PIN_CEC_LOW,
 	};
 	struct cec_fh *fh;
 
@@ -167,7 +168,7 @@ void cec_queue_pin_event(struct cec_adapter *adap, bool is_high, ktime_t ts)
 			cec_queue_event_fh(fh, &ev, ktime_to_ns(ts));
 	mutex_unlock(&adap->devnode.lock);
 }
-EXPORT_SYMBOL_GPL(cec_queue_pin_event);
+EXPORT_SYMBOL_GPL(cec_queue_pin_cec_event);
 
 /*
  * Queue a new message for this filehandle.
diff --git a/drivers/media/cec/cec-api.c b/drivers/media/cec/cec-api.c
index 00d43d74020f..87649b0c6381 100644
--- a/drivers/media/cec/cec-api.c
+++ b/drivers/media/cec/cec-api.c
@@ -449,8 +449,8 @@ static long cec_s_mode(struct cec_adapter *adap, struct cec_fh *fh,
 			.flags = CEC_EVENT_FL_INITIAL_STATE,
 		};
 
-		ev.event = adap->pin->cur_value ? CEC_EVENT_PIN_HIGH :
-						  CEC_EVENT_PIN_LOW;
+		ev.event = adap->pin->cur_value ? CEC_EVENT_PIN_CEC_HIGH :
+						  CEC_EVENT_PIN_CEC_LOW;
 		cec_queue_event_fh(fh, &ev, 0);
 #endif
 		adap->monitor_pin_cnt++;
diff --git a/drivers/media/cec/cec-pin.c b/drivers/media/cec/cec-pin.c
index 03f800e5ec1f..31a26d3b8bd8 100644
--- a/drivers/media/cec/cec-pin.c
+++ b/drivers/media/cec/cec-pin.c
@@ -609,8 +609,9 @@ static int cec_pin_thread_func(void *_adap)
 		while (atomic_read(&pin->work_pin_events)) {
 			unsigned int idx = pin->work_pin_events_rd;
 
-			cec_queue_pin_event(adap, pin->work_pin_is_high[idx],
-					    pin->work_pin_ts[idx]);
+			cec_queue_pin_cec_event(adap,
+						pin->work_pin_is_high[idx],
+						pin->work_pin_ts[idx]);
 			pin->work_pin_events_rd = (idx + 1) % CEC_NUM_PIN_EVENTS;
 			atomic_dec(&pin->work_pin_events);
 		}
diff --git a/include/media/cec.h b/include/media/cec.h
index 1bec7bde4792..224359c9941a 100644
--- a/include/media/cec.h
+++ b/include/media/cec.h
@@ -91,7 +91,7 @@ struct cec_event_entry {
 };
 
 #define CEC_NUM_CORE_EVENTS 2
-#define CEC_NUM_EVENTS CEC_EVENT_PIN_HIGH
+#define CEC_NUM_EVENTS CEC_EVENT_PIN_CEC_HIGH
 
 struct cec_fh {
 	struct list_head	list;
@@ -280,14 +280,15 @@ static inline void cec_received_msg(struct cec_adapter *adap,
 }
 
 /**
- * cec_queue_pin_event() - queue a pin event with a given timestamp.
+ * cec_queue_pin_cec_event() - queue a CEC pin event with a given timestamp.
  *
  * @adap:	pointer to the cec adapter
- * @is_high:	when true the pin is high, otherwise it is low
+ * @is_high:	when true the CEC pin is high, otherwise it is low
  * @ts:		the timestamp for this event
  *
  */
-void cec_queue_pin_event(struct cec_adapter *adap, bool is_high, ktime_t ts);
+void cec_queue_pin_cec_event(struct cec_adapter *adap,
+			     bool is_high, ktime_t ts);
 
 /**
  * cec_get_edid_phys_addr() - find and return the physical address
diff --git a/include/uapi/linux/cec.h b/include/uapi/linux/cec.h
index d87a67b0bb06..4351c3481aea 100644
--- a/include/uapi/linux/cec.h
+++ b/include/uapi/linux/cec.h
@@ -408,8 +408,8 @@ struct cec_log_addrs {
  * didn't empty the message queue in time
  */
 #define CEC_EVENT_LOST_MSGS		2
-#define CEC_EVENT_PIN_LOW		3
-#define CEC_EVENT_PIN_HIGH		4
+#define CEC_EVENT_PIN_CEC_LOW		3
+#define CEC_EVENT_PIN_CEC_HIGH		4
 
 #define CEC_EVENT_FL_INITIAL_STATE	(1 << 0)
 #define CEC_EVENT_FL_DROPPED_EVENTS	(1 << 1)
-- 
cgit v1.2.3-71-gd317


From 84e54fe0a5eaed696dee4019c396f8396f5a908b Mon Sep 17 00:00:00 2001
From: William Tu <u9012063@gmail.com>
Date: Tue, 22 Aug 2017 09:40:28 -0700
Subject: gre: introduce native tunnel support for ERSPAN

The patch adds ERSPAN type II tunnel support.  The implementation
is based on the draft at [1].  One of the purposes is for Linux
box to be able to receive ERSPAN monitoring traffic sent from
the Cisco switch, by creating a ERSPAN tunnel device.
In addition, the patch also adds ERSPAN TX, so Linux virtual
switch can redirect monitored traffic to the ERSPAN tunnel device.
The traffic will be encapsulated into ERSPAN and sent out.

The implementation reuses tunnel key as ERSPAN session ID, and
field 'erspan' as ERSPAN Index fields:
./ip link add dev ers11 type erspan seq key 100 erspan 123 \
			local 172.16.1.200 remote 172.16.1.100

To use the above device as ERSPAN receiver, configure
Nexus 5000 switch as below:

monitor session 100 type erspan-source
  erspan-id 123
  vrf default
  destination ip 172.16.1.200
  source interface Ethernet1/11 both
  source interface Ethernet1/12 both
  no shut
monitor erspan origin ip-address 172.16.1.100 global

[1] https://tools.ietf.org/html/draft-foschiano-erspan-01
[2] iproute2 patch: http://marc.info/?l=linux-netdev&m=150306086924951&w=2
[3] test script: http://marc.info/?l=linux-netdev&m=150231021807304&w=2

Signed-off-by: William Tu <u9012063@gmail.com>
Signed-off-by: Meenakshi Vohra <mvohra@vmware.com>
Cc: Alexey Kuznetsov <kuznet@ms2.inr.ac.ru>
Cc: Hideaki YOSHIFUJI <yoshfuji@linux-ipv6.org>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/erspan.h           |  61 ++++++++++
 include/net/ip_tunnels.h       |   3 +
 include/uapi/linux/if_ether.h  |   1 +
 include/uapi/linux/if_tunnel.h |   1 +
 net/ipv4/ip_gre.c              | 269 +++++++++++++++++++++++++++++++++++++++++
 5 files changed, 335 insertions(+)
 create mode 100644 include/net/erspan.h

(limited to 'include/uapi/linux')

diff --git a/include/net/erspan.h b/include/net/erspan.h
new file mode 100644
index 000000000000..ca94fc86865e
--- /dev/null
+++ b/include/net/erspan.h
@@ -0,0 +1,61 @@
+#ifndef __LINUX_ERSPAN_H
+#define __LINUX_ERSPAN_H
+
+/*
+ * GRE header for ERSPAN encapsulation (8 octets [34:41]) -- 8 bytes
+ *       0                   1                   2                   3
+ *      0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1
+ *     +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+ *     |0|0|0|1|0|00000|000000000|00000|    Protocol Type for ERSPAN   |
+ *     +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+ *     |      Sequence Number (increments per packet per session)      |
+ *     +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+ *
+ *  Note that in the above GRE header [RFC1701] out of the C, R, K, S,
+ *  s, Recur, Flags, Version fields only S (bit 03) is set to 1. The
+ *  other fields are set to zero, so only a sequence number follows.
+ *
+ *  ERSPAN Type II header (8 octets [42:49])
+ *  0                   1                   2                   3
+ *  0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1
+ * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+ * |  Ver  |          VLAN         | COS | En|T|    Session ID     |
+ * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+ * |      Reserved         |                  Index                |
+ * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+ *
+ * GRE proto ERSPAN type II = 0x88BE, type III = 0x22EB
+ */
+
+#define ERSPAN_VERSION	0x1
+
+#define VER_MASK	0xf000
+#define VLAN_MASK	0x0fff
+#define COS_MASK	0xe000
+#define EN_MASK		0x1800
+#define T_MASK		0x0400
+#define ID_MASK		0x03ff
+#define INDEX_MASK	0xfffff
+
+enum erspan_encap_type {
+	ERSPAN_ENCAP_NOVLAN = 0x0,	/* originally without VLAN tag */
+	ERSPAN_ENCAP_ISL = 0x1,		/* originally ISL encapsulated */
+	ERSPAN_ENCAP_8021Q = 0x2,	/* originally 802.1Q encapsulated */
+	ERSPAN_ENCAP_INFRAME = 0x3,	/* VLAN tag perserved in frame */
+};
+
+struct erspan_metadata {
+	__be32 index;   /* type II */
+};
+
+struct erspanhdr {
+	__be16 ver_vlan;
+#define VER_OFFSET  12
+	__be16 session_id;
+#define COS_OFFSET  13
+#define EN_OFFSET   11
+#define T_OFFSET    10
+	struct erspan_metadata md;
+};
+
+#endif
diff --git a/include/net/ip_tunnels.h b/include/net/ip_tunnels.h
index 520809912f03..625c29329372 100644
--- a/include/net/ip_tunnels.h
+++ b/include/net/ip_tunnels.h
@@ -115,6 +115,9 @@ struct ip_tunnel {
 	u32		o_seqno;	/* The last output seqno */
 	int		tun_hlen;	/* Precalculated header length */
 
+	/* This field used only by ERSPAN */
+	u32		index;		/* ERSPAN type II index */
+
 	struct dst_cache dst_cache;
 
 	struct ip_tunnel_parm parms;
diff --git a/include/uapi/linux/if_ether.h b/include/uapi/linux/if_ether.h
index 5bc9bfd816b7..efeb1190c2ca 100644
--- a/include/uapi/linux/if_ether.h
+++ b/include/uapi/linux/if_ether.h
@@ -66,6 +66,7 @@
 #define ETH_P_ATALK	0x809B		/* Appletalk DDP		*/
 #define ETH_P_AARP	0x80F3		/* Appletalk AARP		*/
 #define ETH_P_8021Q	0x8100          /* 802.1Q VLAN Extended Header  */
+#define ETH_P_ERSPAN	0x88BE		/* ERSPAN type II		*/
 #define ETH_P_IPX	0x8137		/* IPX over DIX			*/
 #define ETH_P_IPV6	0x86DD		/* IPv6 over bluebook		*/
 #define ETH_P_PAUSE	0x8808		/* IEEE Pause frames. See 802.3 31B */
diff --git a/include/uapi/linux/if_tunnel.h b/include/uapi/linux/if_tunnel.h
index 6792d1967d31..2e520883c054 100644
--- a/include/uapi/linux/if_tunnel.h
+++ b/include/uapi/linux/if_tunnel.h
@@ -134,6 +134,7 @@ enum {
 	IFLA_GRE_COLLECT_METADATA,
 	IFLA_GRE_IGNORE_DF,
 	IFLA_GRE_FWMARK,
+	IFLA_GRE_ERSPAN_INDEX,
 	__IFLA_GRE_MAX,
 };
 
diff --git a/net/ipv4/ip_gre.c b/net/ipv4/ip_gre.c
index 7a7829e839c2..6e8a62289e03 100644
--- a/net/ipv4/ip_gre.c
+++ b/net/ipv4/ip_gre.c
@@ -48,6 +48,7 @@
 #include <net/rtnetlink.h>
 #include <net/gre.h>
 #include <net/dst_metadata.h>
+#include <net/erspan.h>
 
 /*
    Problems & solutions
@@ -115,6 +116,7 @@ static int ipgre_tunnel_init(struct net_device *dev);
 
 static unsigned int ipgre_net_id __read_mostly;
 static unsigned int gre_tap_net_id __read_mostly;
+static unsigned int erspan_net_id __read_mostly;
 
 static void ipgre_err(struct sk_buff *skb, u32 info,
 		      const struct tnl_ptk_info *tpi)
@@ -246,6 +248,56 @@ static void gre_err(struct sk_buff *skb, u32 info)
 	ipgre_err(skb, info, &tpi);
 }
 
+static int erspan_rcv(struct sk_buff *skb, struct tnl_ptk_info *tpi,
+		      int gre_hdr_len)
+{
+	struct net *net = dev_net(skb->dev);
+	struct metadata_dst *tun_dst = NULL;
+	struct ip_tunnel_net *itn;
+	struct ip_tunnel *tunnel;
+	struct erspanhdr *ershdr;
+	const struct iphdr *iph;
+	__be32 session_id;
+	__be32 index;
+	int len;
+
+	itn = net_generic(net, erspan_net_id);
+	iph = ip_hdr(skb);
+	len = gre_hdr_len + sizeof(*ershdr);
+
+	if (unlikely(!pskb_may_pull(skb, len)))
+		return -ENOMEM;
+
+	iph = ip_hdr(skb);
+	ershdr = (struct erspanhdr *)(skb->data + gre_hdr_len);
+
+	/* The original GRE header does not have key field,
+	 * Use ERSPAN 10-bit session ID as key.
+	 */
+	session_id = cpu_to_be32(ntohs(ershdr->session_id));
+	tpi->key = session_id;
+	index = ershdr->md.index;
+	tunnel = ip_tunnel_lookup(itn, skb->dev->ifindex,
+				  tpi->flags | TUNNEL_KEY,
+				  iph->saddr, iph->daddr, tpi->key);
+
+	if (tunnel) {
+		if (__iptunnel_pull_header(skb,
+					   gre_hdr_len + sizeof(*ershdr),
+					   htons(ETH_P_TEB),
+					   false, false) < 0)
+			goto drop;
+
+		tunnel->index = ntohl(index);
+		skb_reset_mac_header(skb);
+		ip_tunnel_rcv(tunnel, skb, tpi, tun_dst, log_ecn_error);
+		return PACKET_RCVD;
+	}
+drop:
+	kfree_skb(skb);
+	return PACKET_RCVD;
+}
+
 static int __ipgre_rcv(struct sk_buff *skb, const struct tnl_ptk_info *tpi,
 		       struct ip_tunnel_net *itn, int hdr_len, bool raw_proto)
 {
@@ -328,6 +380,11 @@ static int gre_rcv(struct sk_buff *skb)
 	if (hdr_len < 0)
 		goto drop;
 
+	if (unlikely(tpi.proto == htons(ETH_P_ERSPAN))) {
+		if (erspan_rcv(skb, &tpi, hdr_len) == PACKET_RCVD)
+			return 0;
+	}
+
 	if (ipgre_rcv(skb, &tpi, hdr_len) == PACKET_RCVD)
 		return 0;
 
@@ -503,6 +560,81 @@ free_skb:
 	return NETDEV_TX_OK;
 }
 
+static inline u8 tos_to_cos(u8 tos)
+{
+	u8 dscp, cos;
+
+	dscp = tos >> 2;
+	cos = dscp >> 3;
+	return cos;
+}
+
+static void erspan_build_header(struct sk_buff *skb,
+				__be32 id, u32 index, bool truncate)
+{
+	struct iphdr *iphdr = ip_hdr(skb);
+	struct ethhdr *eth = eth_hdr(skb);
+	enum erspan_encap_type enc_type;
+	struct erspanhdr *ershdr;
+	struct qtag_prefix {
+		__be16 eth_type;
+		__be16 tci;
+	} *qp;
+	u16 vlan_tci = 0;
+
+	enc_type = ERSPAN_ENCAP_NOVLAN;
+
+	/* If mirrored packet has vlan tag, extract tci and
+	 *  perserve vlan header in the mirrored frame.
+	 */
+	if (eth->h_proto == htons(ETH_P_8021Q)) {
+		qp = (struct qtag_prefix *)(skb->data + 2 * ETH_ALEN);
+		vlan_tci = ntohs(qp->tci);
+		enc_type = ERSPAN_ENCAP_INFRAME;
+	}
+
+	skb_push(skb, sizeof(*ershdr));
+	ershdr = (struct erspanhdr *)skb->data;
+	memset(ershdr, 0, sizeof(*ershdr));
+
+	ershdr->ver_vlan = htons((vlan_tci & VLAN_MASK) |
+				 (ERSPAN_VERSION << VER_OFFSET));
+	ershdr->session_id = htons((u16)(ntohl(id) & ID_MASK) |
+			   ((tos_to_cos(iphdr->tos) << COS_OFFSET) & COS_MASK) |
+			   (enc_type << EN_OFFSET & EN_MASK) |
+			   ((truncate << T_OFFSET) & T_MASK));
+	ershdr->md.index = htonl(index & INDEX_MASK);
+}
+
+static netdev_tx_t erspan_xmit(struct sk_buff *skb,
+			       struct net_device *dev)
+{
+	struct ip_tunnel *tunnel = netdev_priv(dev);
+	bool truncate = false;
+
+	if (gre_handle_offloads(skb, false))
+		goto free_skb;
+
+	if (skb_cow_head(skb, dev->needed_headroom))
+		goto free_skb;
+
+	if (skb->len > dev->mtu) {
+		pskb_trim(skb, dev->mtu);
+		truncate = true;
+	}
+
+	/* Push ERSPAN header */
+	erspan_build_header(skb, tunnel->parms.o_key, tunnel->index, truncate);
+	tunnel->parms.o_flags &= ~TUNNEL_KEY;
+	__gre_xmit(skb, dev, &tunnel->parms.iph, htons(ETH_P_ERSPAN));
+	return NETDEV_TX_OK;
+
+free_skb:
+	kfree_skb(skb);
+	dev->stats.tx_dropped++;
+	return NETDEV_TX_OK;
+}
+
 static netdev_tx_t gre_tap_xmit(struct sk_buff *skb,
 				struct net_device *dev)
 {
@@ -828,6 +960,39 @@ out:
 	return ipgre_tunnel_validate(tb, data, extack);
 }
 
+static int erspan_validate(struct nlattr *tb[], struct nlattr *data[],
+			   struct netlink_ext_ack *extack)
+{
+	__be16 flags = 0;
+	int ret;
+
+	if (!data)
+		return 0;
+
+	ret = ipgre_tap_validate(tb, data, extack);
+	if (ret)
+		return ret;
+
+	/* ERSPAN should only have GRE sequence and key flag */
+	flags |= nla_get_be16(data[IFLA_GRE_OFLAGS]);
+	flags |= nla_get_be16(data[IFLA_GRE_IFLAGS]);
+	if (flags != (GRE_SEQ | GRE_KEY))
+		return -EINVAL;
+
+	/* ERSPAN Session ID only has 10-bit. Since we reuse
+	 * 32-bit key field as ID, check it's range.
+	 */
+	if (data[IFLA_GRE_IKEY] &&
+	    (ntohl(nla_get_be32(data[IFLA_GRE_IKEY])) & ~ID_MASK))
+		return -EINVAL;
+
+	if (data[IFLA_GRE_OKEY] &&
+	    (ntohl(nla_get_be32(data[IFLA_GRE_OKEY])) & ~ID_MASK))
+		return -EINVAL;
+
+	return 0;
+}
+
 static int ipgre_netlink_parms(struct net_device *dev,
 				struct nlattr *data[],
 				struct nlattr *tb[],
@@ -892,6 +1057,13 @@ static int ipgre_netlink_parms(struct net_device *dev,
 	if (data[IFLA_GRE_FWMARK])
 		*fwmark = nla_get_u32(data[IFLA_GRE_FWMARK]);
 
+	if (data[IFLA_GRE_ERSPAN_INDEX]) {
+		t->index = nla_get_u32(data[IFLA_GRE_ERSPAN_INDEX]);
+
+		if (t->index & ~INDEX_MASK)
+			return -EINVAL;
+	}
+
 	return 0;
 }
 
@@ -949,6 +1121,36 @@ static const struct net_device_ops gre_tap_netdev_ops = {
 	.ndo_fill_metadata_dst	= gre_fill_metadata_dst,
 };
 
+static int erspan_tunnel_init(struct net_device *dev)
+{
+	struct ip_tunnel *tunnel = netdev_priv(dev);
+	int t_hlen;
+
+	tunnel->tun_hlen = 8;
+	tunnel->parms.iph.protocol = IPPROTO_GRE;
+	t_hlen = tunnel->hlen + sizeof(struct iphdr) + sizeof(struct erspanhdr);
+
+	dev->needed_headroom = LL_MAX_HEADER + t_hlen + 4;
+	dev->mtu = ETH_DATA_LEN - t_hlen - 4;
+	dev->features		|= GRE_FEATURES;
+	dev->hw_features	|= GRE_FEATURES;
+	dev->priv_flags		|= IFF_LIVE_ADDR_CHANGE;
+
+	return ip_tunnel_init(dev);
+}
+
+static const struct net_device_ops erspan_netdev_ops = {
+	.ndo_init		= erspan_tunnel_init,
+	.ndo_uninit		= ip_tunnel_uninit,
+	.ndo_start_xmit		= erspan_xmit,
+	.ndo_set_mac_address	= eth_mac_addr,
+	.ndo_validate_addr	= eth_validate_addr,
+	.ndo_change_mtu		= ip_tunnel_change_mtu,
+	.ndo_get_stats64	= ip_tunnel_get_stats64,
+	.ndo_get_iflink		= ip_tunnel_get_iflink,
+	.ndo_fill_metadata_dst	= gre_fill_metadata_dst,
+};
+
 static void ipgre_tap_setup(struct net_device *dev)
 {
 	ether_setup(dev);
@@ -1041,6 +1243,8 @@ static size_t ipgre_get_size(const struct net_device *dev)
 		nla_total_size(1) +
 		/* IFLA_GRE_FWMARK */
 		nla_total_size(4) +
+		/* IFLA_GRE_ERSPAN_INDEX */
+		nla_total_size(4) +
 		0;
 }
 
@@ -1083,12 +1287,25 @@ static int ipgre_fill_info(struct sk_buff *skb, const struct net_device *dev)
 			goto nla_put_failure;
 	}
 
+	if (t->index)
+		if (nla_put_u32(skb, IFLA_GRE_ERSPAN_INDEX, t->index))
+			goto nla_put_failure;
+
 	return 0;
 
 nla_put_failure:
 	return -EMSGSIZE;
 }
 
+static void erspan_setup(struct net_device *dev)
+{
+	ether_setup(dev);
+	dev->netdev_ops = &erspan_netdev_ops;
+	dev->priv_flags &= ~IFF_TX_SKB_SHARING;
+	dev->priv_flags |= IFF_LIVE_ADDR_CHANGE;
+	ip_tunnel_setup(dev, erspan_net_id);
+}
+
 static const struct nla_policy ipgre_policy[IFLA_GRE_MAX + 1] = {
 	[IFLA_GRE_LINK]		= { .type = NLA_U32 },
 	[IFLA_GRE_IFLAGS]	= { .type = NLA_U16 },
@@ -1107,6 +1324,7 @@ static const struct nla_policy ipgre_policy[IFLA_GRE_MAX + 1] = {
 	[IFLA_GRE_COLLECT_METADATA]	= { .type = NLA_FLAG },
 	[IFLA_GRE_IGNORE_DF]	= { .type = NLA_U8 },
 	[IFLA_GRE_FWMARK]	= { .type = NLA_U32 },
+	[IFLA_GRE_ERSPAN_INDEX]	= { .type = NLA_U32 },
 };
 
 static struct rtnl_link_ops ipgre_link_ops __read_mostly = {
@@ -1139,6 +1357,21 @@ static struct rtnl_link_ops ipgre_tap_ops __read_mostly = {
 	.get_link_net	= ip_tunnel_get_link_net,
 };
 
+static struct rtnl_link_ops erspan_link_ops __read_mostly = {
+	.kind		= "erspan",
+	.maxtype	= IFLA_GRE_MAX,
+	.policy		= ipgre_policy,
+	.priv_size	= sizeof(struct ip_tunnel),
+	.setup		= erspan_setup,
+	.validate	= erspan_validate,
+	.newlink	= ipgre_newlink,
+	.changelink	= ipgre_changelink,
+	.dellink	= ip_tunnel_dellink,
+	.get_size	= ipgre_get_size,
+	.fill_info	= ipgre_fill_info,
+	.get_link_net	= ip_tunnel_get_link_net,
+};
+
 struct net_device *gretap_fb_dev_create(struct net *net, const char *name,
 					u8 name_assign_type)
 {
@@ -1202,6 +1435,26 @@ static struct pernet_operations ipgre_tap_net_ops = {
 	.size = sizeof(struct ip_tunnel_net),
 };
 
+static int __net_init erspan_init_net(struct net *net)
+{
+	return ip_tunnel_init_net(net, erspan_net_id,
+				  &erspan_link_ops, "erspan0");
+}
+
+static void __net_exit erspan_exit_net(struct net *net)
+{
+	struct ip_tunnel_net *itn = net_generic(net, erspan_net_id);
+
+	ip_tunnel_delete_net(itn, &erspan_link_ops);
+}
+
+static struct pernet_operations erspan_net_ops = {
+	.init = erspan_init_net,
+	.exit = erspan_exit_net,
+	.id   = &erspan_net_id,
+	.size = sizeof(struct ip_tunnel_net),
+};
+
 static int __init ipgre_init(void)
 {
 	int err;
@@ -1216,6 +1469,10 @@ static int __init ipgre_init(void)
 	if (err < 0)
 		goto pnet_tap_faied;
 
+	err = register_pernet_device(&erspan_net_ops);
+	if (err < 0)
+		goto pnet_erspan_failed;
+
 	err = gre_add_protocol(&ipgre_protocol, GREPROTO_CISCO);
 	if (err < 0) {
 		pr_info("%s: can't add protocol\n", __func__);
@@ -1230,13 +1487,21 @@ static int __init ipgre_init(void)
 	if (err < 0)
 		goto tap_ops_failed;
 
+	err = rtnl_link_register(&erspan_link_ops);
+	if (err < 0)
+		goto erspan_link_failed;
+
 	return 0;
 
+erspan_link_failed:
+	rtnl_link_unregister(&ipgre_tap_ops);
 tap_ops_failed:
 	rtnl_link_unregister(&ipgre_link_ops);
 rtnl_link_failed:
 	gre_del_protocol(&ipgre_protocol, GREPROTO_CISCO);
 add_proto_failed:
+	unregister_pernet_device(&erspan_net_ops);
+pnet_erspan_failed:
 	unregister_pernet_device(&ipgre_tap_net_ops);
 pnet_tap_faied:
 	unregister_pernet_device(&ipgre_net_ops);
@@ -1247,9 +1512,11 @@ static void __exit ipgre_fini(void)
 {
 	rtnl_link_unregister(&ipgre_tap_ops);
 	rtnl_link_unregister(&ipgre_link_ops);
+	rtnl_link_unregister(&erspan_link_ops);
 	gre_del_protocol(&ipgre_protocol, GREPROTO_CISCO);
 	unregister_pernet_device(&ipgre_tap_net_ops);
 	unregister_pernet_device(&ipgre_net_ops);
+	unregister_pernet_device(&erspan_net_ops);
 }
 
 module_init(ipgre_init);
@@ -1257,5 +1524,7 @@ module_exit(ipgre_fini);
 MODULE_LICENSE("GPL");
 MODULE_ALIAS_RTNL_LINK("gre");
 MODULE_ALIAS_RTNL_LINK("gretap");
+MODULE_ALIAS_RTNL_LINK("erspan");
 MODULE_ALIAS_NETDEV("gre0");
 MODULE_ALIAS_NETDEV("gretap0");
+MODULE_ALIAS_NETDEV("erspan0");
-- 
cgit v1.2.3-71-gd317


From 5cdcf4c6a638591ec0e98c57404a19e7f9997567 Mon Sep 17 00:00:00 2001
From: Martijn Coenen <maco@android.com>
Date: Fri, 28 Jul 2017 13:56:06 +0200
Subject: ANDROID: binder: add padding to binder_fd_array_object.

binder_fd_array_object starts with a 4-byte header,
followed by a few fields that are 8 bytes when
ANDROID_BINDER_IPC_32BIT=N.

This can cause alignment issues in a 64-bit kernel
with a 32-bit userspace, as on x86_32 an 8-byte primitive
may be aligned to a 4-byte address. Pad with a __u32
to fix this.

Signed-off-by: Martijn Coenen <maco@android.com>
Cc: stable <stable@vger.kernel.org> # 4.11+
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 include/uapi/linux/android/binder.h | 2 ++
 1 file changed, 2 insertions(+)

(limited to 'include/uapi/linux')

diff --git a/include/uapi/linux/android/binder.h b/include/uapi/linux/android/binder.h
index 51f891fb1b18..7668b5791c91 100644
--- a/include/uapi/linux/android/binder.h
+++ b/include/uapi/linux/android/binder.h
@@ -132,6 +132,7 @@ enum {
 
 /* struct binder_fd_array_object - object describing an array of fds in a buffer
  * @hdr:		common header structure
+ * @pad:		padding to ensure correct alignment
  * @num_fds:		number of file descriptors in the buffer
  * @parent:		index in offset array to buffer holding the fd array
  * @parent_offset:	start offset of fd array in the buffer
@@ -152,6 +153,7 @@ enum {
  */
 struct binder_fd_array_object {
 	struct binder_object_header	hdr;
+	__u32				pad;
 	binder_size_t			num_fds;
 	binder_size_t			parent;
 	binder_size_t			parent_offset;
-- 
cgit v1.2.3-71-gd317


From 1e6ec9ea89d30739b9447c1860fcb07fc29f3aef Mon Sep 17 00:00:00 2001
From: Omar Sandoval <osandov@fb.com>
Date: Wed, 23 Aug 2017 14:54:59 -0700
Subject: Revert "loop: support 4k physical blocksize"

There's some stuff still up in the air, let's not get stuck with a
subpar ABI. I'll follow up with something better for 4.14.

Signed-off-by: Omar Sandoval <osandov@fb.com>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 drivers/block/loop.c      | 42 ++++++------------------------------------
 drivers/block/loop.h      |  1 -
 include/uapi/linux/loop.h |  3 ---
 3 files changed, 6 insertions(+), 40 deletions(-)

(limited to 'include/uapi/linux')

diff --git a/drivers/block/loop.c b/drivers/block/loop.c
index ef8334949b42..f321b96405f5 100644
--- a/drivers/block/loop.c
+++ b/drivers/block/loop.c
@@ -221,8 +221,7 @@ static void __loop_update_dio(struct loop_device *lo, bool dio)
 }
 
 static int
-figure_loop_size(struct loop_device *lo, loff_t offset, loff_t sizelimit,
-		 loff_t logical_blocksize)
+figure_loop_size(struct loop_device *lo, loff_t offset, loff_t sizelimit)
 {
 	loff_t size = get_size(offset, sizelimit, lo->lo_backing_file);
 	sector_t x = (sector_t)size;
@@ -234,12 +233,6 @@ figure_loop_size(struct loop_device *lo, loff_t offset, loff_t sizelimit,
 		lo->lo_offset = offset;
 	if (lo->lo_sizelimit != sizelimit)
 		lo->lo_sizelimit = sizelimit;
-	if (lo->lo_flags & LO_FLAGS_BLOCKSIZE) {
-		lo->lo_logical_blocksize = logical_blocksize;
-		blk_queue_physical_block_size(lo->lo_queue, lo->lo_blocksize);
-		blk_queue_logical_block_size(lo->lo_queue,
-					     lo->lo_logical_blocksize);
-	}
 	set_capacity(lo->lo_disk, x);
 	bd_set_size(bdev, (loff_t)get_capacity(bdev->bd_disk) << 9);
 	/* let user-space know about the new size */
@@ -820,7 +813,6 @@ static void loop_config_discard(struct loop_device *lo)
 	struct file *file = lo->lo_backing_file;
 	struct inode *inode = file->f_mapping->host;
 	struct request_queue *q = lo->lo_queue;
-	int lo_bits = 9;
 
 	/*
 	 * We use punch hole to reclaim the free space used by the
@@ -840,11 +832,9 @@ static void loop_config_discard(struct loop_device *lo)
 
 	q->limits.discard_granularity = inode->i_sb->s_blocksize;
 	q->limits.discard_alignment = 0;
-	if (lo->lo_flags & LO_FLAGS_BLOCKSIZE)
-		lo_bits = blksize_bits(lo->lo_logical_blocksize);
 
-	blk_queue_max_discard_sectors(q, UINT_MAX >> lo_bits);
-	blk_queue_max_write_zeroes_sectors(q, UINT_MAX >> lo_bits);
+	blk_queue_max_discard_sectors(q, UINT_MAX >> 9);
+	blk_queue_max_write_zeroes_sectors(q, UINT_MAX >> 9);
 	queue_flag_set_unlocked(QUEUE_FLAG_DISCARD, q);
 }
 
@@ -938,7 +928,6 @@ static int loop_set_fd(struct loop_device *lo, fmode_t mode,
 
 	lo->use_dio = false;
 	lo->lo_blocksize = lo_blocksize;
-	lo->lo_logical_blocksize = 512;
 	lo->lo_device = bdev;
 	lo->lo_flags = lo_flags;
 	lo->lo_backing_file = file;
@@ -1104,7 +1093,6 @@ loop_set_status(struct loop_device *lo, const struct loop_info64 *info)
 	int err;
 	struct loop_func_table *xfer;
 	kuid_t uid = current_uid();
-	int lo_flags = lo->lo_flags;
 
 	if (lo->lo_encrypt_key_size &&
 	    !uid_eq(lo->lo_key_owner, uid) &&
@@ -1137,26 +1125,9 @@ loop_set_status(struct loop_device *lo, const struct loop_info64 *info)
 	if (err)
 		goto exit;
 
-	if (info->lo_flags & LO_FLAGS_BLOCKSIZE) {
-		if (!(lo->lo_flags & LO_FLAGS_BLOCKSIZE))
-			lo->lo_logical_blocksize = 512;
-		lo->lo_flags |= LO_FLAGS_BLOCKSIZE;
-		if (LO_INFO_BLOCKSIZE(info) != 512 &&
-		    LO_INFO_BLOCKSIZE(info) != 1024 &&
-		    LO_INFO_BLOCKSIZE(info) != 2048 &&
-		    LO_INFO_BLOCKSIZE(info) != 4096)
-			return -EINVAL;
-		if (LO_INFO_BLOCKSIZE(info) > lo->lo_blocksize)
-			return -EINVAL;
-	}
-
 	if (lo->lo_offset != info->lo_offset ||
-	    lo->lo_sizelimit != info->lo_sizelimit ||
-	    lo->lo_flags != lo_flags ||
-	    ((lo->lo_flags & LO_FLAGS_BLOCKSIZE) &&
-	     lo->lo_logical_blocksize != LO_INFO_BLOCKSIZE(info))) {
-		if (figure_loop_size(lo, info->lo_offset, info->lo_sizelimit,
-				     LO_INFO_BLOCKSIZE(info))) {
+	    lo->lo_sizelimit != info->lo_sizelimit) {
+		if (figure_loop_size(lo, info->lo_offset, info->lo_sizelimit)) {
 			err = -EFBIG;
 			goto exit;
 		}
@@ -1348,8 +1319,7 @@ static int loop_set_capacity(struct loop_device *lo)
 	if (unlikely(lo->lo_state != Lo_bound))
 		return -ENXIO;
 
-	return figure_loop_size(lo, lo->lo_offset, lo->lo_sizelimit,
-				lo->lo_logical_blocksize);
+	return figure_loop_size(lo, lo->lo_offset, lo->lo_sizelimit);
 }
 
 static int loop_set_dio(struct loop_device *lo, unsigned long arg)
diff --git a/drivers/block/loop.h b/drivers/block/loop.h
index 2c096b9a17b8..fecd3f97ef8c 100644
--- a/drivers/block/loop.h
+++ b/drivers/block/loop.h
@@ -49,7 +49,6 @@ struct loop_device {
 	struct file *	lo_backing_file;
 	struct block_device *lo_device;
 	unsigned	lo_blocksize;
-	unsigned	lo_logical_blocksize;
 	void		*key_data; 
 
 	gfp_t		old_gfp_mask;
diff --git a/include/uapi/linux/loop.h b/include/uapi/linux/loop.h
index a3960f98679c..c8125ec1f4f2 100644
--- a/include/uapi/linux/loop.h
+++ b/include/uapi/linux/loop.h
@@ -22,7 +22,6 @@ enum {
 	LO_FLAGS_AUTOCLEAR	= 4,
 	LO_FLAGS_PARTSCAN	= 8,
 	LO_FLAGS_DIRECT_IO	= 16,
-	LO_FLAGS_BLOCKSIZE	= 32,
 };
 
 #include <asm/posix_types.h>	/* for __kernel_old_dev_t */
@@ -60,8 +59,6 @@ struct loop_info64 {
 	__u64		   lo_init[2];
 };
 
-#define LO_INFO_BLOCKSIZE(l) (l)->lo_init[0]
-
 /*
  * Loop filter types
  */
-- 
cgit v1.2.3-71-gd317


From ecda85e70277ef24e44a1f6bc00243cebd19f985 Mon Sep 17 00:00:00 2001
From: Juergen Gross <jgross@suse.com>
Date: Wed, 16 Aug 2017 19:31:57 +0200
Subject: x86/lguest: Remove lguest support

Lguest seems to be rather unused these days. It has seen only patches
ensuring it still builds the last two years and its official state is
"Odd Fixes".

Remove it in order to be able to clean up the paravirt code.

Signed-off-by: Juergen Gross <jgross@suse.com>
Acked-by: Rusty Russell <rusty@rustcorp.com.au>
Acked-by: Thomas Gleixner <tglx@linutronix.de>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: boris.ostrovsky@oracle.com
Cc: lguest@lists.ozlabs.org
Cc: rusty@rustcorp.com.au
Cc: xen-devel@lists.xenproject.org
Link: http://lkml.kernel.org/r/20170816173157.8633-3-jgross@suse.com
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 MAINTAINERS                           |   11 -
 arch/x86/Kbuild                       |    3 -
 arch/x86/Kconfig                      |    2 -
 arch/x86/include/asm/lguest.h         |   91 -
 arch/x86/include/asm/lguest_hcall.h   |   74 -
 arch/x86/include/asm/processor.h      |    2 +-
 arch/x86/include/uapi/asm/bootparam.h |    2 +-
 arch/x86/kernel/asm-offsets_32.c      |   20 -
 arch/x86/kernel/head_32.S             |    2 -
 arch/x86/kernel/platform-quirks.c     |    1 -
 arch/x86/kvm/Kconfig                  |    1 -
 arch/x86/lguest/Kconfig               |   14 -
 arch/x86/lguest/Makefile              |    2 -
 arch/x86/lguest/boot.c                | 1558 ---------------
 arch/x86/lguest/head_32.S             |  192 --
 drivers/Makefile                      |    1 -
 drivers/block/Kconfig                 |    2 +-
 drivers/char/Kconfig                  |    2 +-
 drivers/char/virtio_console.c         |    2 +-
 drivers/lguest/Kconfig                |   13 -
 drivers/lguest/Makefile               |   26 -
 drivers/lguest/README                 |   47 -
 drivers/lguest/core.c                 |  398 ----
 drivers/lguest/hypercalls.c           |  304 ---
 drivers/lguest/interrupts_and_traps.c |  706 -------
 drivers/lguest/lg.h                   |  258 ---
 drivers/lguest/lguest_user.c          |  446 -----
 drivers/lguest/page_tables.c          | 1239 ------------
 drivers/lguest/segments.c             |  228 ---
 drivers/lguest/x86/core.c             |  724 -------
 drivers/lguest/x86/switcher_32.S      |  388 ----
 drivers/net/Kconfig                   |    2 +-
 drivers/tty/hvc/Kconfig               |    2 +-
 drivers/virtio/Kconfig                |    4 +-
 include/linux/lguest.h                |   73 -
 include/linux/lguest_launcher.h       |   44 -
 include/uapi/linux/virtio_ring.h      |    4 +-
 tools/Makefile                        |   11 +-
 tools/lguest/.gitignore               |    2 -
 tools/lguest/Makefile                 |   14 -
 tools/lguest/extract                  |   58 -
 tools/lguest/lguest.c                 | 3420 ---------------------------------
 tools/lguest/lguest.txt               |  125 --
 43 files changed, 16 insertions(+), 10502 deletions(-)
 delete mode 100644 arch/x86/include/asm/lguest.h
 delete mode 100644 arch/x86/include/asm/lguest_hcall.h
 delete mode 100644 arch/x86/lguest/Kconfig
 delete mode 100644 arch/x86/lguest/Makefile
 delete mode 100644 arch/x86/lguest/boot.c
 delete mode 100644 arch/x86/lguest/head_32.S
 delete mode 100644 drivers/lguest/Kconfig
 delete mode 100644 drivers/lguest/Makefile
 delete mode 100644 drivers/lguest/README
 delete mode 100644 drivers/lguest/core.c
 delete mode 100644 drivers/lguest/hypercalls.c
 delete mode 100644 drivers/lguest/interrupts_and_traps.c
 delete mode 100644 drivers/lguest/lg.h
 delete mode 100644 drivers/lguest/lguest_user.c
 delete mode 100644 drivers/lguest/page_tables.c
 delete mode 100644 drivers/lguest/segments.c
 delete mode 100644 drivers/lguest/x86/core.c
 delete mode 100644 drivers/lguest/x86/switcher_32.S
 delete mode 100644 include/linux/lguest.h
 delete mode 100644 include/linux/lguest_launcher.h
 delete mode 100644 tools/lguest/.gitignore
 delete mode 100644 tools/lguest/Makefile
 delete mode 100644 tools/lguest/extract
 delete mode 100644 tools/lguest/lguest.c
 delete mode 100644 tools/lguest/lguest.txt

(limited to 'include/uapi/linux')

diff --git a/MAINTAINERS b/MAINTAINERS
index 84d6a8277cbd..6c8b66d2adcb 100644
--- a/MAINTAINERS
+++ b/MAINTAINERS
@@ -7640,17 +7640,6 @@ T:	git git://linuxtv.org/mkrufky/tuners.git
 S:	Maintained
 F:	drivers/media/dvb-frontends/lgdt3305.*
 
-LGUEST
-M:	Rusty Russell <rusty@rustcorp.com.au>
-L:	lguest@lists.ozlabs.org
-W:	http://lguest.ozlabs.org/
-S:	Odd Fixes
-F:	arch/x86/include/asm/lguest*.h
-F:	arch/x86/lguest/
-F:	drivers/lguest/
-F:	include/linux/lguest*.h
-F:	tools/lguest/
-
 LIBATA PATA ARASAN COMPACT FLASH CONTROLLER
 M:	Viresh Kumar <vireshk@kernel.org>
 L:	linux-ide@vger.kernel.org
diff --git a/arch/x86/Kbuild b/arch/x86/Kbuild
index 586b786b3edf..f65a804b86f0 100644
--- a/arch/x86/Kbuild
+++ b/arch/x86/Kbuild
@@ -10,9 +10,6 @@ obj-$(CONFIG_XEN) += xen/
 # Hyper-V paravirtualization support
 obj-$(CONFIG_HYPERVISOR_GUEST) += hyperv/
 
-# lguest paravirtualization support
-obj-$(CONFIG_LGUEST_GUEST) += lguest/
-
 obj-y += realmode/
 obj-y += kernel/
 obj-y += mm/
diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig
index 9b302121584d..651021713385 100644
--- a/arch/x86/Kconfig
+++ b/arch/x86/Kconfig
@@ -777,8 +777,6 @@ config KVM_DEBUG_FS
 	  Statistics are displayed in debugfs filesystem. Enabling this option
 	  may incur significant overhead.
 
-source "arch/x86/lguest/Kconfig"
-
 config PARAVIRT_TIME_ACCOUNTING
 	bool "Paravirtual steal time accounting"
 	depends on PARAVIRT
diff --git a/arch/x86/include/asm/lguest.h b/arch/x86/include/asm/lguest.h
deleted file mode 100644
index 73d0c9b92087..000000000000
--- a/arch/x86/include/asm/lguest.h
+++ /dev/null
@@ -1,91 +0,0 @@
-#ifndef _ASM_X86_LGUEST_H
-#define _ASM_X86_LGUEST_H
-
-#define GDT_ENTRY_LGUEST_CS	10
-#define GDT_ENTRY_LGUEST_DS	11
-#define LGUEST_CS		(GDT_ENTRY_LGUEST_CS * 8)
-#define LGUEST_DS		(GDT_ENTRY_LGUEST_DS * 8)
-
-#ifndef __ASSEMBLY__
-#include <asm/desc.h>
-
-#define GUEST_PL 1
-
-/* Page for Switcher text itself, then two pages per cpu */
-#define SWITCHER_TEXT_PAGES (1)
-#define SWITCHER_STACK_PAGES (2 * nr_cpu_ids)
-#define TOTAL_SWITCHER_PAGES (SWITCHER_TEXT_PAGES + SWITCHER_STACK_PAGES)
-
-/* Where we map the Switcher, in both Host and Guest. */
-extern unsigned long switcher_addr;
-
-/* Found in switcher.S */
-extern unsigned long default_idt_entries[];
-
-/* Declarations for definitions in arch/x86/lguest/head_32.S */
-extern char lguest_noirq_iret[];
-extern const char lgstart_cli[], lgend_cli[];
-extern const char lgstart_pushf[], lgend_pushf[];
-
-extern void lguest_iret(void);
-extern void lguest_init(void);
-
-struct lguest_regs {
-	/* Manually saved part. */
-	unsigned long eax, ebx, ecx, edx;
-	unsigned long esi, edi, ebp;
-	unsigned long gs;
-	unsigned long fs, ds, es;
-	unsigned long trapnum, errcode;
-	/* Trap pushed part */
-	unsigned long eip;
-	unsigned long cs;
-	unsigned long eflags;
-	unsigned long esp;
-	unsigned long ss;
-};
-
-/* This is a guest-specific page (mapped ro) into the guest. */
-struct lguest_ro_state {
-	/* Host information we need to restore when we switch back. */
-	u32 host_cr3;
-	struct desc_ptr host_idt_desc;
-	struct desc_ptr host_gdt_desc;
-	u32 host_sp;
-
-	/* Fields which are used when guest is running. */
-	struct desc_ptr guest_idt_desc;
-	struct desc_ptr guest_gdt_desc;
-	struct x86_hw_tss guest_tss;
-	struct desc_struct guest_idt[IDT_ENTRIES];
-	struct desc_struct guest_gdt[GDT_ENTRIES];
-};
-
-struct lg_cpu_arch {
-	/* The GDT entries copied into lguest_ro_state when running. */
-	struct desc_struct gdt[GDT_ENTRIES];
-
-	/* The IDT entries: some copied into lguest_ro_state when running. */
-	struct desc_struct idt[IDT_ENTRIES];
-
-	/* The address of the last guest-visible pagefault (ie. cr2). */
-	unsigned long last_pagefault;
-};
-
-static inline void lguest_set_ts(void)
-{
-	u32 cr0;
-
-	cr0 = read_cr0();
-	if (!(cr0 & 8))
-		write_cr0(cr0 | 8);
-}
-
-/* Full 4G segment descriptors, suitable for CS and DS. */
-#define FULL_EXEC_SEGMENT \
-	((struct desc_struct)GDT_ENTRY_INIT(0xc09b, 0, 0xfffff))
-#define FULL_SEGMENT ((struct desc_struct)GDT_ENTRY_INIT(0xc093, 0, 0xfffff))
-
-#endif /* __ASSEMBLY__ */
-
-#endif /* _ASM_X86_LGUEST_H */
diff --git a/arch/x86/include/asm/lguest_hcall.h b/arch/x86/include/asm/lguest_hcall.h
deleted file mode 100644
index 6c119cfae218..000000000000
--- a/arch/x86/include/asm/lguest_hcall.h
+++ /dev/null
@@ -1,74 +0,0 @@
-/* Architecture specific portion of the lguest hypercalls */
-#ifndef _ASM_X86_LGUEST_HCALL_H
-#define _ASM_X86_LGUEST_HCALL_H
-
-#define LHCALL_FLUSH_ASYNC	0
-#define LHCALL_LGUEST_INIT	1
-#define LHCALL_SHUTDOWN		2
-#define LHCALL_NEW_PGTABLE	4
-#define LHCALL_FLUSH_TLB	5
-#define LHCALL_LOAD_IDT_ENTRY	6
-#define LHCALL_SET_STACK	7
-#define LHCALL_SET_CLOCKEVENT	9
-#define LHCALL_HALT		10
-#define LHCALL_SET_PMD		13
-#define LHCALL_SET_PTE		14
-#define LHCALL_SET_PGD		15
-#define LHCALL_LOAD_TLS		16
-#define LHCALL_LOAD_GDT_ENTRY	18
-#define LHCALL_SEND_INTERRUPTS	19
-
-#define LGUEST_TRAP_ENTRY 0x1F
-
-/* Argument number 3 to LHCALL_LGUEST_SHUTDOWN */
-#define LGUEST_SHUTDOWN_POWEROFF	1
-#define LGUEST_SHUTDOWN_RESTART		2
-
-#ifndef __ASSEMBLY__
-#include <asm/hw_irq.h>
-
-/*G:030
- * But first, how does our Guest contact the Host to ask for privileged
- * operations?  There are two ways: the direct way is to make a "hypercall",
- * to make requests of the Host Itself.
- *
- * Our hypercall mechanism uses the highest unused trap code (traps 32 and
- * above are used by real hardware interrupts).  Seventeen hypercalls are
- * available: the hypercall number is put in the %eax register, and the
- * arguments (when required) are placed in %ebx, %ecx, %edx and %esi.
- * If a return value makes sense, it's returned in %eax.
- *
- * Grossly invalid calls result in Sudden Death at the hands of the vengeful
- * Host, rather than returning failure.  This reflects Winston Churchill's
- * definition of a gentleman: "someone who is only rude intentionally".
- */
-static inline unsigned long
-hcall(unsigned long call,
-      unsigned long arg1, unsigned long arg2, unsigned long arg3,
-      unsigned long arg4)
-{
-	/* "int" is the Intel instruction to trigger a trap. */
-	asm volatile("int $" __stringify(LGUEST_TRAP_ENTRY)
-		     /* The call in %eax (aka "a") might be overwritten */
-		     : "=a"(call)
-		       /* The arguments are in %eax, %ebx, %ecx, %edx & %esi */
-		     : "a"(call), "b"(arg1), "c"(arg2), "d"(arg3), "S"(arg4)
-		       /* "memory" means this might write somewhere in memory.
-			* This isn't true for all calls, but it's safe to tell
-			* gcc that it might happen so it doesn't get clever. */
-		     : "memory");
-	return call;
-}
-/*:*/
-
-/* Can't use our min() macro here: needs to be a constant */
-#define LGUEST_IRQS (NR_IRQS < 32 ? NR_IRQS: 32)
-
-#define LHCALL_RING_SIZE 64
-struct hcall_args {
-	/* These map directly onto eax/ebx/ecx/edx/esi in struct lguest_regs */
-	unsigned long arg0, arg1, arg2, arg3, arg4;
-};
-
-#endif /* !__ASSEMBLY__ */
-#endif /* _ASM_X86_LGUEST_HCALL_H */
diff --git a/arch/x86/include/asm/processor.h b/arch/x86/include/asm/processor.h
index 0b03d655db7c..abc99b9c7ffd 100644
--- a/arch/x86/include/asm/processor.h
+++ b/arch/x86/include/asm/processor.h
@@ -662,7 +662,7 @@ static inline void sync_core(void)
 	 * In case NMI unmasking or performance ever becomes a problem,
 	 * the next best option appears to be MOV-to-CR2 and an
 	 * unconditional jump.  That sequence also works on all CPUs,
-	 * but it will fault at CPL3 (i.e. Xen PV and lguest).
+	 * but it will fault at CPL3 (i.e. Xen PV).
 	 *
 	 * CPUID is the conventional way, but it's nasty: it doesn't
 	 * exist on some 486-like CPUs, and it usually exits to a
diff --git a/arch/x86/include/uapi/asm/bootparam.h b/arch/x86/include/uapi/asm/bootparam.h
index ddef37b16af2..66b8f93333d1 100644
--- a/arch/x86/include/uapi/asm/bootparam.h
+++ b/arch/x86/include/uapi/asm/bootparam.h
@@ -201,7 +201,7 @@ struct boot_params {
  *
  * @X86_SUBARCH_PC: Should be used if the hardware is enumerable using standard
  *	PC mechanisms (PCI, ACPI) and doesn't need a special boot flow.
- * @X86_SUBARCH_LGUEST: Used for x86 hypervisor demo, lguest
+ * @X86_SUBARCH_LGUEST: Used for x86 hypervisor demo, lguest, deprecated
  * @X86_SUBARCH_XEN: Used for Xen guest types which follow the PV boot path,
  * 	which start at asm startup_xen() entry point and later jump to the C
  * 	xen_start_kernel() entry point. Both domU and dom0 type of guests are
diff --git a/arch/x86/kernel/asm-offsets_32.c b/arch/x86/kernel/asm-offsets_32.c
index 880aa093268d..710edab9e644 100644
--- a/arch/x86/kernel/asm-offsets_32.c
+++ b/arch/x86/kernel/asm-offsets_32.c
@@ -4,9 +4,6 @@
 
 #include <asm/ucontext.h>
 
-#include <linux/lguest.h>
-#include "../../../drivers/lguest/lg.h"
-
 #define __SYSCALL_I386(nr, sym, qual) [nr] = 1,
 static char syscalls[] = {
 #include <asm/syscalls_32.h>
@@ -62,23 +59,6 @@ void foo(void)
 	OFFSET(stack_canary_offset, stack_canary, canary);
 #endif
 
-#if defined(CONFIG_LGUEST) || defined(CONFIG_LGUEST_GUEST) || defined(CONFIG_LGUEST_MODULE)
-	BLANK();
-	OFFSET(LGUEST_DATA_irq_enabled, lguest_data, irq_enabled);
-	OFFSET(LGUEST_DATA_irq_pending, lguest_data, irq_pending);
-
-	BLANK();
-	OFFSET(LGUEST_PAGES_host_gdt_desc, lguest_pages, state.host_gdt_desc);
-	OFFSET(LGUEST_PAGES_host_idt_desc, lguest_pages, state.host_idt_desc);
-	OFFSET(LGUEST_PAGES_host_cr3, lguest_pages, state.host_cr3);
-	OFFSET(LGUEST_PAGES_host_sp, lguest_pages, state.host_sp);
-	OFFSET(LGUEST_PAGES_guest_gdt_desc, lguest_pages,state.guest_gdt_desc);
-	OFFSET(LGUEST_PAGES_guest_idt_desc, lguest_pages,state.guest_idt_desc);
-	OFFSET(LGUEST_PAGES_guest_gdt, lguest_pages, state.guest_gdt);
-	OFFSET(LGUEST_PAGES_regs_trapnum, lguest_pages, regs.trapnum);
-	OFFSET(LGUEST_PAGES_regs_errcode, lguest_pages, regs.errcode);
-	OFFSET(LGUEST_PAGES_regs, lguest_pages, regs);
-#endif
 	BLANK();
 	DEFINE(__NR_syscall_max, sizeof(syscalls) - 1);
 	DEFINE(NR_syscalls, sizeof(syscalls));
diff --git a/arch/x86/kernel/head_32.S b/arch/x86/kernel/head_32.S
index 0332664eb158..29da9599fec0 100644
--- a/arch/x86/kernel/head_32.S
+++ b/arch/x86/kernel/head_32.S
@@ -155,7 +155,6 @@ ENTRY(startup_32)
 	jmp *%eax
 
 .Lbad_subarch:
-WEAK(lguest_entry)
 WEAK(xen_entry)
 	/* Unknown implementation; there's really
 	   nothing we can do at this point. */
@@ -165,7 +164,6 @@ WEAK(xen_entry)
 
 subarch_entries:
 	.long .Ldefault_entry		/* normal x86/PC */
-	.long lguest_entry		/* lguest hypervisor */
 	.long xen_entry			/* Xen hypervisor */
 	.long .Ldefault_entry		/* Moorestown MID */
 num_subarch_entries = (. - subarch_entries) / 4
diff --git a/arch/x86/kernel/platform-quirks.c b/arch/x86/kernel/platform-quirks.c
index 91271122f0df..502a77d0adb0 100644
--- a/arch/x86/kernel/platform-quirks.c
+++ b/arch/x86/kernel/platform-quirks.c
@@ -16,7 +16,6 @@ void __init x86_early_init_platform_quirks(void)
 		x86_platform.legacy.reserve_bios_regions = 1;
 		break;
 	case X86_SUBARCH_XEN:
-	case X86_SUBARCH_LGUEST:
 		x86_platform.legacy.devices.pnpbios = 0;
 		x86_platform.legacy.rtc = 0;
 		break;
diff --git a/arch/x86/kvm/Kconfig b/arch/x86/kvm/Kconfig
index 2688c7dc5323..3ea624452f93 100644
--- a/arch/x86/kvm/Kconfig
+++ b/arch/x86/kvm/Kconfig
@@ -89,6 +89,5 @@ config KVM_MMU_AUDIT
 # OK, it's a little counter-intuitive to do this, but it puts it neatly under
 # the virtualization menu.
 source drivers/vhost/Kconfig
-source drivers/lguest/Kconfig
 
 endif # VIRTUALIZATION
diff --git a/arch/x86/lguest/Kconfig b/arch/x86/lguest/Kconfig
deleted file mode 100644
index 08f41caada45..000000000000
--- a/arch/x86/lguest/Kconfig
+++ /dev/null
@@ -1,14 +0,0 @@
-config LGUEST_GUEST
-	bool "Lguest guest support"
-	depends on X86_32 && PARAVIRT && PCI
-	select TTY
-	select VIRTUALIZATION
-	select VIRTIO
-	select VIRTIO_CONSOLE
-	help
-	  Lguest is a tiny in-kernel hypervisor.  Selecting this will
-	  allow your kernel to boot under lguest.  This option will increase
-	  your kernel size by about 10k.  If in doubt, say N.
-
-	  If you say Y here, make sure you say Y (or M) to the virtio block
-	  and net drivers which lguest needs.
diff --git a/arch/x86/lguest/Makefile b/arch/x86/lguest/Makefile
deleted file mode 100644
index 8f38d577a2fa..000000000000
--- a/arch/x86/lguest/Makefile
+++ /dev/null
@@ -1,2 +0,0 @@
-obj-y		:= head_32.o boot.o
-CFLAGS_boot.o	:= $(call cc-option, -fno-stack-protector)
diff --git a/arch/x86/lguest/boot.c b/arch/x86/lguest/boot.c
deleted file mode 100644
index 99472698c931..000000000000
--- a/arch/x86/lguest/boot.c
+++ /dev/null
@@ -1,1558 +0,0 @@
-/*P:010
- * A hypervisor allows multiple Operating Systems to run on a single machine.
- * To quote David Wheeler: "Any problem in computer science can be solved with
- * another layer of indirection."
- *
- * We keep things simple in two ways.  First, we start with a normal Linux
- * kernel and insert a module (lg.ko) which allows us to run other Linux
- * kernels the same way we'd run processes.  We call the first kernel the Host,
- * and the others the Guests.  The program which sets up and configures Guests
- * (such as the example in tools/lguest/lguest.c) is called the Launcher.
- *
- * Secondly, we only run specially modified Guests, not normal kernels: setting
- * CONFIG_LGUEST_GUEST to "y" compiles this file into the kernel so it knows
- * how to be a Guest at boot time.  This means that you can use the same kernel
- * you boot normally (ie. as a Host) as a Guest.
- *
- * These Guests know that they cannot do privileged operations, such as disable
- * interrupts, and that they have to ask the Host to do such things explicitly.
- * This file consists of all the replacements for such low-level native
- * hardware operations: these special Guest versions call the Host.
- *
- * So how does the kernel know it's a Guest?  We'll see that later, but let's
- * just say that we end up here where we replace the native functions various
- * "paravirt" structures with our Guest versions, then boot like normal.
-:*/
-
-/*
- * Copyright (C) 2006, Rusty Russell <rusty@rustcorp.com.au> IBM Corporation.
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License as published by
- * the Free Software Foundation; either version 2 of the License, or
- * (at your option) any later version.
- *
- * This program is distributed in the hope that it will be useful, but
- * WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE, GOOD TITLE or
- * NON INFRINGEMENT.  See the GNU General Public License for more
- * details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, write to the Free Software
- * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
- */
-#include <linux/kernel.h>
-#include <linux/start_kernel.h>
-#include <linux/string.h>
-#include <linux/console.h>
-#include <linux/screen_info.h>
-#include <linux/irq.h>
-#include <linux/interrupt.h>
-#include <linux/clocksource.h>
-#include <linux/clockchips.h>
-#include <linux/lguest.h>
-#include <linux/lguest_launcher.h>
-#include <linux/virtio_console.h>
-#include <linux/pm.h>
-#include <linux/export.h>
-#include <linux/pci.h>
-#include <linux/virtio_pci.h>
-#include <asm/acpi.h>
-#include <asm/apic.h>
-#include <asm/lguest.h>
-#include <asm/paravirt.h>
-#include <asm/param.h>
-#include <asm/page.h>
-#include <asm/pgtable.h>
-#include <asm/desc.h>
-#include <asm/setup.h>
-#include <asm/e820/api.h>
-#include <asm/mce.h>
-#include <asm/io.h>
-#include <asm/fpu/api.h>
-#include <asm/stackprotector.h>
-#include <asm/reboot.h>		/* for struct machine_ops */
-#include <asm/kvm_para.h>
-#include <asm/pci_x86.h>
-#include <asm/pci-direct.h>
-
-/*G:010
- * Welcome to the Guest!
- *
- * The Guest in our tale is a simple creature: identical to the Host but
- * behaving in simplified but equivalent ways.  In particular, the Guest is the
- * same kernel as the Host (or at least, built from the same source code).
-:*/
-
-struct lguest_data lguest_data = {
-	.hcall_status = { [0 ... LHCALL_RING_SIZE-1] = 0xFF },
-	.noirq_iret = (u32)lguest_noirq_iret,
-	.kernel_address = PAGE_OFFSET,
-	.blocked_interrupts = { 1 }, /* Block timer interrupts */
-	.syscall_vec = IA32_SYSCALL_VECTOR,
-};
-
-/*G:037
- * async_hcall() is pretty simple: I'm quite proud of it really.  We have a
- * ring buffer of stored hypercalls which the Host will run though next time we
- * do a normal hypercall.  Each entry in the ring has 5 slots for the hypercall
- * arguments, and a "hcall_status" word which is 0 if the call is ready to go,
- * and 255 once the Host has finished with it.
- *
- * If we come around to a slot which hasn't been finished, then the table is
- * full and we just make the hypercall directly.  This has the nice side
- * effect of causing the Host to run all the stored calls in the ring buffer
- * which empties it for next time!
- */
-static void async_hcall(unsigned long call, unsigned long arg1,
-			unsigned long arg2, unsigned long arg3,
-			unsigned long arg4)
-{
-	/* Note: This code assumes we're uniprocessor. */
-	static unsigned int next_call;
-	unsigned long flags;
-
-	/*
-	 * Disable interrupts if not already disabled: we don't want an
-	 * interrupt handler making a hypercall while we're already doing
-	 * one!
-	 */
-	local_irq_save(flags);
-	if (lguest_data.hcall_status[next_call] != 0xFF) {
-		/* Table full, so do normal hcall which will flush table. */
-		hcall(call, arg1, arg2, arg3, arg4);
-	} else {
-		lguest_data.hcalls[next_call].arg0 = call;
-		lguest_data.hcalls[next_call].arg1 = arg1;
-		lguest_data.hcalls[next_call].arg2 = arg2;
-		lguest_data.hcalls[next_call].arg3 = arg3;
-		lguest_data.hcalls[next_call].arg4 = arg4;
-		/* Arguments must all be written before we mark it to go */
-		wmb();
-		lguest_data.hcall_status[next_call] = 0;
-		if (++next_call == LHCALL_RING_SIZE)
-			next_call = 0;
-	}
-	local_irq_restore(flags);
-}
-
-/*G:035
- * Notice the lazy_hcall() above, rather than hcall().  This is our first real
- * optimization trick!
- *
- * When lazy_mode is set, it means we're allowed to defer all hypercalls and do
- * them as a batch when lazy_mode is eventually turned off.  Because hypercalls
- * are reasonably expensive, batching them up makes sense.  For example, a
- * large munmap might update dozens of page table entries: that code calls
- * paravirt_enter_lazy_mmu(), does the dozen updates, then calls
- * lguest_leave_lazy_mode().
- *
- * So, when we're in lazy mode, we call async_hcall() to store the call for
- * future processing:
- */
-static void lazy_hcall1(unsigned long call, unsigned long arg1)
-{
-	if (paravirt_get_lazy_mode() == PARAVIRT_LAZY_NONE)
-		hcall(call, arg1, 0, 0, 0);
-	else
-		async_hcall(call, arg1, 0, 0, 0);
-}
-
-/* You can imagine what lazy_hcall2, 3 and 4 look like. :*/
-static void lazy_hcall2(unsigned long call,
-			unsigned long arg1,
-			unsigned long arg2)
-{
-	if (paravirt_get_lazy_mode() == PARAVIRT_LAZY_NONE)
-		hcall(call, arg1, arg2, 0, 0);
-	else
-		async_hcall(call, arg1, arg2, 0, 0);
-}
-
-static void lazy_hcall3(unsigned long call,
-			unsigned long arg1,
-			unsigned long arg2,
-			unsigned long arg3)
-{
-	if (paravirt_get_lazy_mode() == PARAVIRT_LAZY_NONE)
-		hcall(call, arg1, arg2, arg3, 0);
-	else
-		async_hcall(call, arg1, arg2, arg3, 0);
-}
-
-#ifdef CONFIG_X86_PAE
-static void lazy_hcall4(unsigned long call,
-			unsigned long arg1,
-			unsigned long arg2,
-			unsigned long arg3,
-			unsigned long arg4)
-{
-	if (paravirt_get_lazy_mode() == PARAVIRT_LAZY_NONE)
-		hcall(call, arg1, arg2, arg3, arg4);
-	else
-		async_hcall(call, arg1, arg2, arg3, arg4);
-}
-#endif
-
-/*G:036
- * When lazy mode is turned off, we issue the do-nothing hypercall to
- * flush any stored calls, and call the generic helper to reset the
- * per-cpu lazy mode variable.
- */
-static void lguest_leave_lazy_mmu_mode(void)
-{
-	hcall(LHCALL_FLUSH_ASYNC, 0, 0, 0, 0);
-	paravirt_leave_lazy_mmu();
-}
-
-/*
- * We also catch the end of context switch; we enter lazy mode for much of
- * that too, so again we need to flush here.
- *
- * (Technically, this is lazy CPU mode, and normally we're in lazy MMU
- * mode, but unlike Xen, lguest doesn't care about the difference).
- */
-static void lguest_end_context_switch(struct task_struct *next)
-{
-	hcall(LHCALL_FLUSH_ASYNC, 0, 0, 0, 0);
-	paravirt_end_context_switch(next);
-}
-
-/*G:032
- * After that diversion we return to our first native-instruction
- * replacements: four functions for interrupt control.
- *
- * The simplest way of implementing these would be to have "turn interrupts
- * off" and "turn interrupts on" hypercalls.  Unfortunately, this is too slow:
- * these are by far the most commonly called functions of those we override.
- *
- * So instead we keep an "irq_enabled" field inside our "struct lguest_data",
- * which the Guest can update with a single instruction.  The Host knows to
- * check there before it tries to deliver an interrupt.
- */
-
-/*
- * save_flags() is expected to return the processor state (ie. "flags").  The
- * flags word contains all kind of stuff, but in practice Linux only cares
- * about the interrupt flag.  Our "save_flags()" just returns that.
- */
-asmlinkage __visible unsigned long lguest_save_fl(void)
-{
-	return lguest_data.irq_enabled;
-}
-
-/* Interrupts go off... */
-asmlinkage __visible void lguest_irq_disable(void)
-{
-	lguest_data.irq_enabled = 0;
-}
-
-/*
- * Let's pause a moment.  Remember how I said these are called so often?
- * Jeremy Fitzhardinge optimized them so hard early in 2009 that he had to
- * break some rules.  In particular, these functions are assumed to save their
- * own registers if they need to: normal C functions assume they can trash the
- * eax register.  To use normal C functions, we use
- * PV_CALLEE_SAVE_REGS_THUNK(), which pushes %eax onto the stack, calls the
- * C function, then restores it.
- */
-PV_CALLEE_SAVE_REGS_THUNK(lguest_save_fl);
-PV_CALLEE_SAVE_REGS_THUNK(lguest_irq_disable);
-/*:*/
-
-/* These are in head_32.S */
-extern void lg_irq_enable(void);
-extern void lg_restore_fl(unsigned long flags);
-
-/*M:003
- * We could be more efficient in our checking of outstanding interrupts, rather
- * than using a branch.  One way would be to put the "irq_enabled" field in a
- * page by itself, and have the Host write-protect it when an interrupt comes
- * in when irqs are disabled.  There will then be a page fault as soon as
- * interrupts are re-enabled.
- *
- * A better method is to implement soft interrupt disable generally for x86:
- * instead of disabling interrupts, we set a flag.  If an interrupt does come
- * in, we then disable them for real.  This is uncommon, so we could simply use
- * a hypercall for interrupt control and not worry about efficiency.
-:*/
-
-/*G:034
- * The Interrupt Descriptor Table (IDT).
- *
- * The IDT tells the processor what to do when an interrupt comes in.  Each
- * entry in the table is a 64-bit descriptor: this holds the privilege level,
- * address of the handler, and... well, who cares?  The Guest just asks the
- * Host to make the change anyway, because the Host controls the real IDT.
- */
-static void lguest_write_idt_entry(gate_desc *dt,
-				   int entrynum, const gate_desc *g)
-{
-	/*
-	 * The gate_desc structure is 8 bytes long: we hand it to the Host in
-	 * two 32-bit chunks.  The whole 32-bit kernel used to hand descriptors
-	 * around like this; typesafety wasn't a big concern in Linux's early
-	 * years.
-	 */
-	u32 *desc = (u32 *)g;
-	/* Keep the local copy up to date. */
-	native_write_idt_entry(dt, entrynum, g);
-	/* Tell Host about this new entry. */
-	hcall(LHCALL_LOAD_IDT_ENTRY, entrynum, desc[0], desc[1], 0);
-}
-
-/*
- * Changing to a different IDT is very rare: we keep the IDT up-to-date every
- * time it is written, so we can simply loop through all entries and tell the
- * Host about them.
- */
-static void lguest_load_idt(const struct desc_ptr *desc)
-{
-	unsigned int i;
-	struct desc_struct *idt = (void *)desc->address;
-
-	for (i = 0; i < (desc->size+1)/8; i++)
-		hcall(LHCALL_LOAD_IDT_ENTRY, i, idt[i].a, idt[i].b, 0);
-}
-
-/*
- * The Global Descriptor Table.
- *
- * The Intel architecture defines another table, called the Global Descriptor
- * Table (GDT).  You tell the CPU where it is (and its size) using the "lgdt"
- * instruction, and then several other instructions refer to entries in the
- * table.  There are three entries which the Switcher needs, so the Host simply
- * controls the entire thing and the Guest asks it to make changes using the
- * LOAD_GDT hypercall.
- *
- * This is the exactly like the IDT code.
- */
-static void lguest_load_gdt(const struct desc_ptr *desc)
-{
-	unsigned int i;
-	struct desc_struct *gdt = (void *)desc->address;
-
-	for (i = 0; i < (desc->size+1)/8; i++)
-		hcall(LHCALL_LOAD_GDT_ENTRY, i, gdt[i].a, gdt[i].b, 0);
-}
-
-/*
- * For a single GDT entry which changes, we simply change our copy and
- * then tell the host about it.
- */
-static void lguest_write_gdt_entry(struct desc_struct *dt, int entrynum,
-				   const void *desc, int type)
-{
-	native_write_gdt_entry(dt, entrynum, desc, type);
-	/* Tell Host about this new entry. */
-	hcall(LHCALL_LOAD_GDT_ENTRY, entrynum,
-	      dt[entrynum].a, dt[entrynum].b, 0);
-}
-
-/*
- * There are three "thread local storage" GDT entries which change
- * on every context switch (these three entries are how glibc implements
- * __thread variables).  As an optimization, we have a hypercall
- * specifically for this case.
- *
- * Wouldn't it be nicer to have a general LOAD_GDT_ENTRIES hypercall
- * which took a range of entries?
- */
-static void lguest_load_tls(struct thread_struct *t, unsigned int cpu)
-{
-	/*
-	 * There's one problem which normal hardware doesn't have: the Host
-	 * can't handle us removing entries we're currently using.  So we clear
-	 * the GS register here: if it's needed it'll be reloaded anyway.
-	 */
-	lazy_load_gs(0);
-	lazy_hcall2(LHCALL_LOAD_TLS, __pa(&t->tls_array), cpu);
-}
-
-/*G:038
- * That's enough excitement for now, back to ploughing through each of the
- * different pv_ops structures (we're about 1/3 of the way through).
- *
- * This is the Local Descriptor Table, another weird Intel thingy.  Linux only
- * uses this for some strange applications like Wine.  We don't do anything
- * here, so they'll get an informative and friendly Segmentation Fault.
- */
-static void lguest_set_ldt(const void *addr, unsigned entries)
-{
-}
-
-/*
- * This loads a GDT entry into the "Task Register": that entry points to a
- * structure called the Task State Segment.  Some comments scattered though the
- * kernel code indicate that this used for task switching in ages past, along
- * with blood sacrifice and astrology.
- *
- * Now there's nothing interesting in here that we don't get told elsewhere.
- * But the native version uses the "ltr" instruction, which makes the Host
- * complain to the Guest about a Segmentation Fault and it'll oops.  So we
- * override the native version with a do-nothing version.
- */
-static void lguest_load_tr_desc(void)
-{
-}
-
-/*
- * The "cpuid" instruction is a way of querying both the CPU identity
- * (manufacturer, model, etc) and its features.  It was introduced before the
- * Pentium in 1993 and keeps getting extended by both Intel, AMD and others.
- * As you might imagine, after a decade and a half this treatment, it is now a
- * giant ball of hair.  Its entry in the current Intel manual runs to 28 pages.
- *
- * This instruction even it has its own Wikipedia entry.  The Wikipedia entry
- * has been translated into 6 languages.  I am not making this up!
- *
- * We could get funky here and identify ourselves as "GenuineLguest", but
- * instead we just use the real "cpuid" instruction.  Then I pretty much turned
- * off feature bits until the Guest booted.  (Don't say that: you'll damage
- * lguest sales!)  Shut up, inner voice!  (Hey, just pointing out that this is
- * hardly future proof.)  No one's listening!  They don't like you anyway,
- * parenthetic weirdo!
- *
- * Replacing the cpuid so we can turn features off is great for the kernel, but
- * anyone (including userspace) can just use the raw "cpuid" instruction and
- * the Host won't even notice since it isn't privileged.  So we try not to get
- * too worked up about it.
- */
-static void lguest_cpuid(unsigned int *ax, unsigned int *bx,
-			 unsigned int *cx, unsigned int *dx)
-{
-	int function = *ax;
-
-	native_cpuid(ax, bx, cx, dx);
-	switch (function) {
-	/*
-	 * CPUID 0 gives the highest legal CPUID number (and the ID string).
-	 * We futureproof our code a little by sticking to known CPUID values.
-	 */
-	case 0:
-		if (*ax > 5)
-			*ax = 5;
-		break;
-
-	/*
-	 * CPUID 1 is a basic feature request.
-	 *
-	 * CX: we only allow kernel to see SSE3, CMPXCHG16B and SSSE3
-	 * DX: SSE, SSE2, FXSR, MMX, CMOV, CMPXCHG8B, TSC, FPU and PAE.
-	 */
-	case 1:
-		*cx &= 0x00002201;
-		*dx &= 0x07808151;
-		/*
-		 * The Host can do a nice optimization if it knows that the
-		 * kernel mappings (addresses above 0xC0000000 or whatever
-		 * PAGE_OFFSET is set to) haven't changed.  But Linux calls
-		 * flush_tlb_user() for both user and kernel mappings unless
-		 * the Page Global Enable (PGE) feature bit is set.
-		 */
-		*dx |= 0x00002000;
-		/*
-		 * We also lie, and say we're family id 5.  6 or greater
-		 * leads to a rdmsr in early_init_intel which we can't handle.
-		 * Family ID is returned as bits 8-12 in ax.
-		 */
-		*ax &= 0xFFFFF0FF;
-		*ax |= 0x00000500;
-		break;
-
-	/*
-	 * This is used to detect if we're running under KVM.  We might be,
-	 * but that's a Host matter, not us.  So say we're not.
-	 */
-	case KVM_CPUID_SIGNATURE:
-		*bx = *cx = *dx = 0;
-		break;
-
-	/*
-	 * 0x80000000 returns the highest Extended Function, so we futureproof
-	 * like we do above by limiting it to known fields.
-	 */
-	case 0x80000000:
-		if (*ax > 0x80000008)
-			*ax = 0x80000008;
-		break;
-
-	/*
-	 * PAE systems can mark pages as non-executable.  Linux calls this the
-	 * NX bit.  Intel calls it XD (eXecute Disable), AMD EVP (Enhanced
-	 * Virus Protection).  We just switch it off here, since we don't
-	 * support it.
-	 */
-	case 0x80000001:
-		*dx &= ~(1 << 20);
-		break;
-	}
-}
-
-/*
- * Intel has four control registers, imaginatively named cr0, cr2, cr3 and cr4.
- * I assume there's a cr1, but it hasn't bothered us yet, so we'll not bother
- * it.  The Host needs to know when the Guest wants to change them, so we have
- * a whole series of functions like read_cr0() and write_cr0().
- *
- * We start with cr0.  cr0 allows you to turn on and off all kinds of basic
- * features, but the only cr0 bit that Linux ever used at runtime was the
- * horrifically-named Task Switched (TS) bit at bit 3 (ie. 8)
- *
- * What does the TS bit do?  Well, it causes the CPU to trap (interrupt 7) if
- * the floating point unit is used.  Which allows us to restore FPU state
- * lazily after a task switch if we wanted to, but wouldn't a name like
- * "FPUTRAP bit" be a little less cryptic?
- *
- * Fortunately, Linux keeps it simple and doesn't use TS, so we can ignore
- * cr0.
- */
-static void lguest_write_cr0(unsigned long val)
-{
-}
-
-static unsigned long lguest_read_cr0(void)
-{
-	return 0;
-}
-
-/*
- * cr2 is the virtual address of the last page fault, which the Guest only ever
- * reads.  The Host kindly writes this into our "struct lguest_data", so we
- * just read it out of there.
- */
-static unsigned long lguest_read_cr2(void)
-{
-	return lguest_data.cr2;
-}
-
-/* See lguest_set_pte() below. */
-static bool cr3_changed = false;
-static unsigned long current_cr3;
-
-/*
- * cr3 is the current toplevel pagetable page: the principle is the same as
- * cr0.  Keep a local copy, and tell the Host when it changes.
- */
-static void lguest_write_cr3(unsigned long cr3)
-{
-	lazy_hcall1(LHCALL_NEW_PGTABLE, cr3);
-	current_cr3 = cr3;
-
-	/* These two page tables are simple, linear, and used during boot */
-	if (cr3 != __pa_symbol(swapper_pg_dir) &&
-	    cr3 != __pa_symbol(initial_page_table))
-		cr3_changed = true;
-}
-
-static unsigned long lguest_read_cr3(void)
-{
-	return current_cr3;
-}
-
-/* cr4 is used to enable and disable PGE, but we don't care. */
-static unsigned long lguest_read_cr4(void)
-{
-	return 0;
-}
-
-static void lguest_write_cr4(unsigned long val)
-{
-}
-
-/*
- * Page Table Handling.
- *
- * Now would be a good time to take a rest and grab a coffee or similarly
- * relaxing stimulant.  The easy parts are behind us, and the trek gradually
- * winds uphill from here.
- *
- * Quick refresher: memory is divided into "pages" of 4096 bytes each.  The CPU
- * maps virtual addresses to physical addresses using "page tables".  We could
- * use one huge index of 1 million entries: each address is 4 bytes, so that's
- * 1024 pages just to hold the page tables.   But since most virtual addresses
- * are unused, we use a two level index which saves space.  The cr3 register
- * contains the physical address of the top level "page directory" page, which
- * contains physical addresses of up to 1024 second-level pages.  Each of these
- * second level pages contains up to 1024 physical addresses of actual pages,
- * or Page Table Entries (PTEs).
- *
- * Here's a diagram, where arrows indicate physical addresses:
- *
- * cr3 ---> +---------+
- *	    |  	   --------->+---------+
- *	    |	      |	     | PADDR1  |
- *	  Mid-level   |	     | PADDR2  |
- *	  (PMD) page  |	     | 	       |
- *	    |	      |	   Lower-level |
- *	    |	      |	   (PTE) page  |
- *	    |	      |	     |	       |
- *	      ....    	     	 ....
- *
- * So to convert a virtual address to a physical address, we look up the top
- * level, which points us to the second level, which gives us the physical
- * address of that page.  If the top level entry was not present, or the second
- * level entry was not present, then the virtual address is invalid (we
- * say "the page was not mapped").
- *
- * Put another way, a 32-bit virtual address is divided up like so:
- *
- *  1 1 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
- * |<---- 10 bits ---->|<---- 10 bits ---->|<------ 12 bits ------>|
- *    Index into top     Index into second      Offset within page
- *  page directory page    pagetable page
- *
- * Now, unfortunately, this isn't the whole story: Intel added Physical Address
- * Extension (PAE) to allow 32 bit systems to use 64GB of memory (ie. 36 bits).
- * These are held in 64-bit page table entries, so we can now only fit 512
- * entries in a page, and the neat three-level tree breaks down.
- *
- * The result is a four level page table:
- *
- * cr3 --> [ 4 Upper  ]
- *	   [   Level  ]
- *	   [  Entries ]
- *	   [(PUD Page)]---> +---------+
- *	 		    |  	   --------->+---------+
- *	 		    |	      |	     | PADDR1  |
- *	 		  Mid-level   |	     | PADDR2  |
- *	 		  (PMD) page  |	     | 	       |
- *	 		    |	      |	   Lower-level |
- *	 		    |	      |	   (PTE) page  |
- *	 		    |	      |	     |	       |
- *	 		      ....    	     	 ....
- *
- *
- * And the virtual address is decoded as:
- *
- *         1 1 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
- *      |<-2->|<--- 9 bits ---->|<---- 9 bits --->|<------ 12 bits ------>|
- * Index into    Index into mid    Index into lower    Offset within page
- * top entries   directory page     pagetable page
- *
- * It's too hard to switch between these two formats at runtime, so Linux only
- * supports one or the other depending on whether CONFIG_X86_PAE is set.  Many
- * distributions turn it on, and not just for people with silly amounts of
- * memory: the larger PTE entries allow room for the NX bit, which lets the
- * kernel disable execution of pages and increase security.
- *
- * This was a problem for lguest, which couldn't run on these distributions;
- * then Matias Zabaljauregui figured it all out and implemented it, and only a
- * handful of puppies were crushed in the process!
- *
- * Back to our point: the kernel spends a lot of time changing both the
- * top-level page directory and lower-level pagetable pages.  The Guest doesn't
- * know physical addresses, so while it maintains these page tables exactly
- * like normal, it also needs to keep the Host informed whenever it makes a
- * change: the Host will create the real page tables based on the Guests'.
- */
-
-/*
- * The Guest calls this after it has set a second-level entry (pte), ie. to map
- * a page into a process' address space.  We tell the Host the toplevel and
- * address this corresponds to.  The Guest uses one pagetable per process, so
- * we need to tell the Host which one we're changing (mm->pgd).
- */
-static void lguest_pte_update(struct mm_struct *mm, unsigned long addr,
-			       pte_t *ptep)
-{
-#ifdef CONFIG_X86_PAE
-	/* PAE needs to hand a 64 bit page table entry, so it uses two args. */
-	lazy_hcall4(LHCALL_SET_PTE, __pa(mm->pgd), addr,
-		    ptep->pte_low, ptep->pte_high);
-#else
-	lazy_hcall3(LHCALL_SET_PTE, __pa(mm->pgd), addr, ptep->pte_low);
-#endif
-}
-
-/* This is the "set and update" combo-meal-deal version. */
-static void lguest_set_pte_at(struct mm_struct *mm, unsigned long addr,
-			      pte_t *ptep, pte_t pteval)
-{
-	native_set_pte(ptep, pteval);
-	lguest_pte_update(mm, addr, ptep);
-}
-
-/*
- * The Guest calls lguest_set_pud to set a top-level entry and lguest_set_pmd
- * to set a middle-level entry when PAE is activated.
- *
- * Again, we set the entry then tell the Host which page we changed,
- * and the index of the entry we changed.
- */
-#ifdef CONFIG_X86_PAE
-static void lguest_set_pud(pud_t *pudp, pud_t pudval)
-{
-	native_set_pud(pudp, pudval);
-
-	/* 32 bytes aligned pdpt address and the index. */
-	lazy_hcall2(LHCALL_SET_PGD, __pa(pudp) & 0xFFFFFFE0,
-		   (__pa(pudp) & 0x1F) / sizeof(pud_t));
-}
-
-static void lguest_set_pmd(pmd_t *pmdp, pmd_t pmdval)
-{
-	native_set_pmd(pmdp, pmdval);
-	lazy_hcall2(LHCALL_SET_PMD, __pa(pmdp) & PAGE_MASK,
-		   (__pa(pmdp) & (PAGE_SIZE - 1)) / sizeof(pmd_t));
-}
-#else
-
-/* The Guest calls lguest_set_pmd to set a top-level entry when !PAE. */
-static void lguest_set_pmd(pmd_t *pmdp, pmd_t pmdval)
-{
-	native_set_pmd(pmdp, pmdval);
-	lazy_hcall2(LHCALL_SET_PGD, __pa(pmdp) & PAGE_MASK,
-		   (__pa(pmdp) & (PAGE_SIZE - 1)) / sizeof(pmd_t));
-}
-#endif
-
-/*
- * There are a couple of legacy places where the kernel sets a PTE, but we
- * don't know the top level any more.  This is useless for us, since we don't
- * know which pagetable is changing or what address, so we just tell the Host
- * to forget all of them.  Fortunately, this is very rare.
- *
- * ... except in early boot when the kernel sets up the initial pagetables,
- * which makes booting astonishingly slow: 48 seconds!  So we don't even tell
- * the Host anything changed until we've done the first real page table switch,
- * which brings boot back to 4.3 seconds.
- */
-static void lguest_set_pte(pte_t *ptep, pte_t pteval)
-{
-	native_set_pte(ptep, pteval);
-	if (cr3_changed)
-		lazy_hcall1(LHCALL_FLUSH_TLB, 1);
-}
-
-#ifdef CONFIG_X86_PAE
-/*
- * With 64-bit PTE values, we need to be careful setting them: if we set 32
- * bits at a time, the hardware could see a weird half-set entry.  These
- * versions ensure we update all 64 bits at once.
- */
-static void lguest_set_pte_atomic(pte_t *ptep, pte_t pte)
-{
-	native_set_pte_atomic(ptep, pte);
-	if (cr3_changed)
-		lazy_hcall1(LHCALL_FLUSH_TLB, 1);
-}
-
-static void lguest_pte_clear(struct mm_struct *mm, unsigned long addr,
-			     pte_t *ptep)
-{
-	native_pte_clear(mm, addr, ptep);
-	lguest_pte_update(mm, addr, ptep);
-}
-
-static void lguest_pmd_clear(pmd_t *pmdp)
-{
-	lguest_set_pmd(pmdp, __pmd(0));
-}
-#endif
-
-/*
- * Unfortunately for Lguest, the pv_mmu_ops for page tables were based on
- * native page table operations.  On native hardware you can set a new page
- * table entry whenever you want, but if you want to remove one you have to do
- * a TLB flush (a TLB is a little cache of page table entries kept by the CPU).
- *
- * So the lguest_set_pte_at() and lguest_set_pmd() functions above are only
- * called when a valid entry is written, not when it's removed (ie. marked not
- * present).  Instead, this is where we come when the Guest wants to remove a
- * page table entry: we tell the Host to set that entry to 0 (ie. the present
- * bit is zero).
- */
-static void lguest_flush_tlb_single(unsigned long addr)
-{
-	/* Simply set it to zero: if it was not, it will fault back in. */
-	lazy_hcall3(LHCALL_SET_PTE, current_cr3, addr, 0);
-}
-
-/*
- * This is what happens after the Guest has removed a large number of entries.
- * This tells the Host that any of the page table entries for userspace might
- * have changed, ie. virtual addresses below PAGE_OFFSET.
- */
-static void lguest_flush_tlb_user(void)
-{
-	lazy_hcall1(LHCALL_FLUSH_TLB, 0);
-}
-
-/*
- * This is called when the kernel page tables have changed.  That's not very
- * common (unless the Guest is using highmem, which makes the Guest extremely
- * slow), so it's worth separating this from the user flushing above.
- */
-static void lguest_flush_tlb_kernel(void)
-{
-	lazy_hcall1(LHCALL_FLUSH_TLB, 1);
-}
-
-/*
- * The Unadvanced Programmable Interrupt Controller.
- *
- * This is an attempt to implement the simplest possible interrupt controller.
- * I spent some time looking though routines like set_irq_chip_and_handler,
- * set_irq_chip_and_handler_name, set_irq_chip_data and set_phasers_to_stun and
- * I *think* this is as simple as it gets.
- *
- * We can tell the Host what interrupts we want blocked ready for using the
- * lguest_data.interrupts bitmap, so disabling (aka "masking") them is as
- * simple as setting a bit.  We don't actually "ack" interrupts as such, we
- * just mask and unmask them.  I wonder if we should be cleverer?
- */
-static void disable_lguest_irq(struct irq_data *data)
-{
-	set_bit(data->irq, lguest_data.blocked_interrupts);
-}
-
-static void enable_lguest_irq(struct irq_data *data)
-{
-	clear_bit(data->irq, lguest_data.blocked_interrupts);
-}
-
-/* This structure describes the lguest IRQ controller. */
-static struct irq_chip lguest_irq_controller = {
-	.name		= "lguest",
-	.irq_mask	= disable_lguest_irq,
-	.irq_mask_ack	= disable_lguest_irq,
-	.irq_unmask	= enable_lguest_irq,
-};
-
-/*
- * Interrupt descriptors are allocated as-needed, but low-numbered ones are
- * reserved by the generic x86 code.  So we ignore irq_alloc_desc_at if it
- * tells us the irq is already used: other errors (ie. ENOMEM) we take
- * seriously.
- */
-static int lguest_setup_irq(unsigned int irq)
-{
-	struct irq_desc *desc;
-	int err;
-
-	/* Returns -ve error or vector number. */
-	err = irq_alloc_desc_at(irq, 0);
-	if (err < 0 && err != -EEXIST)
-		return err;
-
-	/*
-	 * Tell the Linux infrastructure that the interrupt is
-	 * controlled by our level-based lguest interrupt controller.
-	 */
-	irq_set_chip_and_handler_name(irq, &lguest_irq_controller,
-				      handle_level_irq, "level");
-
-	/* Some systems map "vectors" to interrupts weirdly.  Not us! */
-	desc = irq_to_desc(irq);
-	__this_cpu_write(vector_irq[FIRST_EXTERNAL_VECTOR + irq], desc);
-	return 0;
-}
-
-static int lguest_enable_irq(struct pci_dev *dev)
-{
-	int err;
-	u8 line = 0;
-
-	/* We literally use the PCI interrupt line as the irq number. */
-	pci_read_config_byte(dev, PCI_INTERRUPT_LINE, &line);
-	err = lguest_setup_irq(line);
-	if (!err)
-		dev->irq = line;
-	return err;
-}
-
-/* We don't do hotplug PCI, so this shouldn't be called. */
-static void lguest_disable_irq(struct pci_dev *dev)
-{
-	WARN_ON(1);
-}
-
-/*
- * This sets up the Interrupt Descriptor Table (IDT) entry for each hardware
- * interrupt (except 128, which is used for system calls).
- */
-static void __init lguest_init_IRQ(void)
-{
-	unsigned int i;
-
-	for (i = FIRST_EXTERNAL_VECTOR; i < FIRST_SYSTEM_VECTOR; i++) {
-		if (i != IA32_SYSCALL_VECTOR)
-			set_intr_gate(i, irq_entries_start +
-					8 * (i - FIRST_EXTERNAL_VECTOR));
-	}
-
-	/*
-	 * This call is required to set up for 4k stacks, where we have
-	 * separate stacks for hard and soft interrupts.
-	 */
-	irq_ctx_init(smp_processor_id());
-}
-
-/*
- * Time.
- *
- * It would be far better for everyone if the Guest had its own clock, but
- * until then the Host gives us the time on every interrupt.
- */
-static void lguest_get_wallclock(struct timespec *now)
-{
-	*now = lguest_data.time;
-}
-
-/*
- * The TSC is an Intel thing called the Time Stamp Counter.  The Host tells us
- * what speed it runs at, or 0 if it's unusable as a reliable clock source.
- * This matches what we want here: if we return 0 from this function, the x86
- * TSC clock will give up and not register itself.
- */
-static unsigned long lguest_tsc_khz(void)
-{
-	return lguest_data.tsc_khz;
-}
-
-/*
- * If we can't use the TSC, the kernel falls back to our lower-priority
- * "lguest_clock", where we read the time value given to us by the Host.
- */
-static u64 lguest_clock_read(struct clocksource *cs)
-{
-	unsigned long sec, nsec;
-
-	/*
-	 * Since the time is in two parts (seconds and nanoseconds), we risk
-	 * reading it just as it's changing from 99 & 0.999999999 to 100 and 0,
-	 * and getting 99 and 0.  As Linux tends to come apart under the stress
-	 * of time travel, we must be careful:
-	 */
-	do {
-		/* First we read the seconds part. */
-		sec = lguest_data.time.tv_sec;
-		/*
-		 * This read memory barrier tells the compiler and the CPU that
-		 * this can't be reordered: we have to complete the above
-		 * before going on.
-		 */
-		rmb();
-		/* Now we read the nanoseconds part. */
-		nsec = lguest_data.time.tv_nsec;
-		/* Make sure we've done that. */
-		rmb();
-		/* Now if the seconds part has changed, try again. */
-	} while (unlikely(lguest_data.time.tv_sec != sec));
-
-	/* Our lguest clock is in real nanoseconds. */
-	return sec*1000000000ULL + nsec;
-}
-
-/* This is the fallback clocksource: lower priority than the TSC clocksource. */
-static struct clocksource lguest_clock = {
-	.name		= "lguest",
-	.rating		= 200,
-	.read		= lguest_clock_read,
-	.mask		= CLOCKSOURCE_MASK(64),
-	.flags		= CLOCK_SOURCE_IS_CONTINUOUS,
-};
-
-/*
- * We also need a "struct clock_event_device": Linux asks us to set it to go
- * off some time in the future.  Actually, James Morris figured all this out, I
- * just applied the patch.
- */
-static int lguest_clockevent_set_next_event(unsigned long delta,
-                                           struct clock_event_device *evt)
-{
-	/* FIXME: I don't think this can ever happen, but James tells me he had
-	 * to put this code in.  Maybe we should remove it now.  Anyone? */
-	if (delta < LG_CLOCK_MIN_DELTA) {
-		if (printk_ratelimit())
-			printk(KERN_DEBUG "%s: small delta %lu ns\n",
-			       __func__, delta);
-		return -ETIME;
-	}
-
-	/* Please wake us this far in the future. */
-	hcall(LHCALL_SET_CLOCKEVENT, delta, 0, 0, 0);
-	return 0;
-}
-
-static int lguest_clockevent_shutdown(struct clock_event_device *evt)
-{
-	/* A 0 argument shuts the clock down. */
-	hcall(LHCALL_SET_CLOCKEVENT, 0, 0, 0, 0);
-	return 0;
-}
-
-/* This describes our primitive timer chip. */
-static struct clock_event_device lguest_clockevent = {
-	.name                   = "lguest",
-	.features               = CLOCK_EVT_FEAT_ONESHOT,
-	.set_next_event         = lguest_clockevent_set_next_event,
-	.set_state_shutdown	= lguest_clockevent_shutdown,
-	.rating                 = INT_MAX,
-	.mult                   = 1,
-	.shift                  = 0,
-	.min_delta_ns           = LG_CLOCK_MIN_DELTA,
-	.min_delta_ticks        = LG_CLOCK_MIN_DELTA,
-	.max_delta_ns           = LG_CLOCK_MAX_DELTA,
-	.max_delta_ticks        = LG_CLOCK_MAX_DELTA,
-};
-
-/*
- * This is the Guest timer interrupt handler (hardware interrupt 0).  We just
- * call the clockevent infrastructure and it does whatever needs doing.
- */
-static void lguest_time_irq(struct irq_desc *desc)
-{
-	unsigned long flags;
-
-	/* Don't interrupt us while this is running. */
-	local_irq_save(flags);
-	lguest_clockevent.event_handler(&lguest_clockevent);
-	local_irq_restore(flags);
-}
-
-/*
- * At some point in the boot process, we get asked to set up our timing
- * infrastructure.  The kernel doesn't expect timer interrupts before this, but
- * we cleverly initialized the "blocked_interrupts" field of "struct
- * lguest_data" so that timer interrupts were blocked until now.
- */
-static void lguest_time_init(void)
-{
-	/* Set up the timer interrupt (0) to go to our simple timer routine */
-	if (lguest_setup_irq(0) != 0)
-		panic("Could not set up timer irq");
-	irq_set_handler(0, lguest_time_irq);
-
-	clocksource_register_hz(&lguest_clock, NSEC_PER_SEC);
-
-	/* We can't set cpumask in the initializer: damn C limitations!  Set it
-	 * here and register our timer device. */
-	lguest_clockevent.cpumask = cpumask_of(0);
-	clockevents_register_device(&lguest_clockevent);
-
-	/* Finally, we unblock the timer interrupt. */
-	clear_bit(0, lguest_data.blocked_interrupts);
-}
-
-/*
- * Miscellaneous bits and pieces.
- *
- * Here is an oddball collection of functions which the Guest needs for things
- * to work.  They're pretty simple.
- */
-
-/*
- * The Guest needs to tell the Host what stack it expects traps to use.  For
- * native hardware, this is part of the Task State Segment mentioned above in
- * lguest_load_tr_desc(), but to help hypervisors there's this special call.
- *
- * We tell the Host the segment we want to use (__KERNEL_DS is the kernel data
- * segment), the privilege level (we're privilege level 1, the Host is 0 and
- * will not tolerate us trying to use that), the stack pointer, and the number
- * of pages in the stack.
- */
-static void lguest_load_sp0(struct tss_struct *tss,
-			    struct thread_struct *thread)
-{
-	lazy_hcall3(LHCALL_SET_STACK, __KERNEL_DS | 0x1, thread->sp0,
-		   THREAD_SIZE / PAGE_SIZE);
-	tss->x86_tss.sp0 = thread->sp0;
-}
-
-/* Let's just say, I wouldn't do debugging under a Guest. */
-static unsigned long lguest_get_debugreg(int regno)
-{
-	/* FIXME: Implement */
-	return 0;
-}
-
-static void lguest_set_debugreg(int regno, unsigned long value)
-{
-	/* FIXME: Implement */
-}
-
-/*
- * There are times when the kernel wants to make sure that no memory writes are
- * caught in the cache (that they've all reached real hardware devices).  This
- * doesn't matter for the Guest which has virtual hardware.
- *
- * On the Pentium 4 and above, cpuid() indicates that the Cache Line Flush
- * (clflush) instruction is available and the kernel uses that.  Otherwise, it
- * uses the older "Write Back and Invalidate Cache" (wbinvd) instruction.
- * Unlike clflush, wbinvd can only be run at privilege level 0.  So we can
- * ignore clflush, but replace wbinvd.
- */
-static void lguest_wbinvd(void)
-{
-}
-
-/*
- * If the Guest expects to have an Advanced Programmable Interrupt Controller,
- * we play dumb by ignoring writes and returning 0 for reads.  So it's no
- * longer Programmable nor Controlling anything, and I don't think 8 lines of
- * code qualifies for Advanced.  It will also never interrupt anything.  It
- * does, however, allow us to get through the Linux boot code.
- */
-#ifdef CONFIG_X86_LOCAL_APIC
-static void lguest_apic_write(u32 reg, u32 v)
-{
-}
-
-static u32 lguest_apic_read(u32 reg)
-{
-	return 0;
-}
-
-static u64 lguest_apic_icr_read(void)
-{
-	return 0;
-}
-
-static void lguest_apic_icr_write(u32 low, u32 id)
-{
-	/* Warn to see if there's any stray references */
-	WARN_ON(1);
-}
-
-static void lguest_apic_wait_icr_idle(void)
-{
-	return;
-}
-
-static u32 lguest_apic_safe_wait_icr_idle(void)
-{
-	return 0;
-}
-
-static void set_lguest_basic_apic_ops(void)
-{
-	apic->read = lguest_apic_read;
-	apic->write = lguest_apic_write;
-	apic->icr_read = lguest_apic_icr_read;
-	apic->icr_write = lguest_apic_icr_write;
-	apic->wait_icr_idle = lguest_apic_wait_icr_idle;
-	apic->safe_wait_icr_idle = lguest_apic_safe_wait_icr_idle;
-};
-#endif
-
-/* STOP!  Until an interrupt comes in. */
-static void lguest_safe_halt(void)
-{
-	hcall(LHCALL_HALT, 0, 0, 0, 0);
-}
-
-/*
- * The SHUTDOWN hypercall takes a string to describe what's happening, and
- * an argument which says whether this to restart (reboot) the Guest or not.
- *
- * Note that the Host always prefers that the Guest speak in physical addresses
- * rather than virtual addresses, so we use __pa() here.
- */
-static void lguest_power_off(void)
-{
-	hcall(LHCALL_SHUTDOWN, __pa("Power down"),
-	      LGUEST_SHUTDOWN_POWEROFF, 0, 0);
-}
-
-/*
- * Panicing.
- *
- * Don't.  But if you did, this is what happens.
- */
-static int lguest_panic(struct notifier_block *nb, unsigned long l, void *p)
-{
-	hcall(LHCALL_SHUTDOWN, __pa(p), LGUEST_SHUTDOWN_POWEROFF, 0, 0);
-	/* The hcall won't return, but to keep gcc happy, we're "done". */
-	return NOTIFY_DONE;
-}
-
-static struct notifier_block paniced = {
-	.notifier_call = lguest_panic
-};
-
-/* Setting up memory is fairly easy. */
-static __init char *lguest_memory_setup(void)
-{
-	/*
-	 * The Linux bootloader header contains an "e820" memory map: the
-	 * Launcher populated the first entry with our memory limit.
-	 */
-	e820__range_add(boot_params.e820_table[0].addr,
-			  boot_params.e820_table[0].size,
-			  boot_params.e820_table[0].type);
-
-	/* This string is for the boot messages. */
-	return "LGUEST";
-}
-
-/* Offset within PCI config space of BAR access capability. */
-static int console_cfg_offset = 0;
-static int console_access_cap;
-
-/* Set up so that we access off in bar0 (on bus 0, device 1, function 0) */
-static void set_cfg_window(u32 cfg_offset, u32 off)
-{
-	write_pci_config_byte(0, 1, 0,
-			      cfg_offset + offsetof(struct virtio_pci_cap, bar),
-			      0);
-	write_pci_config(0, 1, 0,
-			 cfg_offset + offsetof(struct virtio_pci_cap, length),
-			 4);
-	write_pci_config(0, 1, 0,
-			 cfg_offset + offsetof(struct virtio_pci_cap, offset),
-			 off);
-}
-
-static void write_bar_via_cfg(u32 cfg_offset, u32 off, u32 val)
-{
-	/*
-	 * We could set this up once, then leave it; nothing else in the *
-	 * kernel should touch these registers.  But if it went wrong, that
-	 * would be a horrible bug to find.
-	 */
-	set_cfg_window(cfg_offset, off);
-	write_pci_config(0, 1, 0,
-			 cfg_offset + sizeof(struct virtio_pci_cap), val);
-}
-
-static void probe_pci_console(void)
-{
-	u8 cap, common_cap = 0, device_cap = 0;
-	u32 device_len;
-
-	/* Avoid recursive printk into here. */
-	console_cfg_offset = -1;
-
-	if (!early_pci_allowed()) {
-		printk(KERN_ERR "lguest: early PCI access not allowed!\n");
-		return;
-	}
-
-	/* We expect a console PCI device at BUS0, slot 1. */
-	if (read_pci_config(0, 1, 0, 0) != 0x10431AF4) {
-		printk(KERN_ERR "lguest: PCI device is %#x!\n",
-		       read_pci_config(0, 1, 0, 0));
-		return;
-	}
-
-	/* Find the capabilities we need (must be in bar0) */
-	cap = read_pci_config_byte(0, 1, 0, PCI_CAPABILITY_LIST);
-	while (cap) {
-		u8 vndr = read_pci_config_byte(0, 1, 0, cap);
-		if (vndr == PCI_CAP_ID_VNDR) {
-			u8 type, bar;
-
-			type = read_pci_config_byte(0, 1, 0,
-			    cap + offsetof(struct virtio_pci_cap, cfg_type));
-			bar = read_pci_config_byte(0, 1, 0,
-			    cap + offsetof(struct virtio_pci_cap, bar));
-
-			switch (type) {
-			case VIRTIO_PCI_CAP_DEVICE_CFG:
-				if (bar == 0)
-					device_cap = cap;
-				break;
-			case VIRTIO_PCI_CAP_PCI_CFG:
-				console_access_cap = cap;
-				break;
-			}
-		}
-		cap = read_pci_config_byte(0, 1, 0, cap + PCI_CAP_LIST_NEXT);
-	}
-	if (!device_cap || !console_access_cap) {
-		printk(KERN_ERR "lguest: No caps (%u/%u/%u) in console!\n",
-		       common_cap, device_cap, console_access_cap);
-		return;
-	}
-
-	/*
-	 * Note that we can't check features, until we've set the DRIVER
-	 * status bit.  We don't want to do that until we have a real driver,
-	 * so we just check that the device-specific config has room for
-	 * emerg_wr.  If it doesn't support VIRTIO_CONSOLE_F_EMERG_WRITE
-	 * it should ignore the access.
-	 */
-	device_len = read_pci_config(0, 1, 0,
-			device_cap + offsetof(struct virtio_pci_cap, length));
-	if (device_len < (offsetof(struct virtio_console_config, emerg_wr)
-			  + sizeof(u32))) {
-		printk(KERN_ERR "lguest: console missing emerg_wr field\n");
-		return;
-	}
-
-	console_cfg_offset = read_pci_config(0, 1, 0,
-			device_cap + offsetof(struct virtio_pci_cap, offset));
-	printk(KERN_INFO "lguest: Console via virtio-pci emerg_wr\n");
-}
-
-/*
- * We will eventually use the virtio console device to produce console output,
- * but before that is set up we use the virtio PCI console's backdoor mmio
- * access and the "emergency" write facility (which is legal even before the
- * device is configured).
- */
-static __init int early_put_chars(u32 vtermno, const char *buf, int count)
-{
-	/* If we couldn't find PCI console, forget it. */
-	if (console_cfg_offset < 0)
-		return count;
-
-	if (unlikely(!console_cfg_offset)) {
-		probe_pci_console();
-		if (console_cfg_offset < 0)
-			return count;
-	}
-
-	write_bar_via_cfg(console_access_cap,
-			  console_cfg_offset
-			  + offsetof(struct virtio_console_config, emerg_wr),
-			  buf[0]);
-	return 1;
-}
-
-/*
- * Rebooting also tells the Host we're finished, but the RESTART flag tells the
- * Launcher to reboot us.
- */
-static void lguest_restart(char *reason)
-{
-	hcall(LHCALL_SHUTDOWN, __pa(reason), LGUEST_SHUTDOWN_RESTART, 0, 0);
-}
-
-/*G:050
- * Patching (Powerfully Placating Performance Pedants)
- *
- * We have already seen that pv_ops structures let us replace simple native
- * instructions with calls to the appropriate back end all throughout the
- * kernel.  This allows the same kernel to run as a Guest and as a native
- * kernel, but it's slow because of all the indirect branches.
- *
- * Remember that David Wheeler quote about "Any problem in computer science can
- * be solved with another layer of indirection"?  The rest of that quote is
- * "... But that usually will create another problem."  This is the first of
- * those problems.
- *
- * Our current solution is to allow the paravirt back end to optionally patch
- * over the indirect calls to replace them with something more efficient.  We
- * patch two of the simplest of the most commonly called functions: disable
- * interrupts and save interrupts.  We usually have 6 or 10 bytes to patch
- * into: the Guest versions of these operations are small enough that we can
- * fit comfortably.
- *
- * First we need assembly templates of each of the patchable Guest operations,
- * and these are in head_32.S.
- */
-
-/*G:060 We construct a table from the assembler templates: */
-static const struct lguest_insns
-{
-	const char *start, *end;
-} lguest_insns[] = {
-	[PARAVIRT_PATCH(pv_irq_ops.irq_disable)] = { lgstart_cli, lgend_cli },
-	[PARAVIRT_PATCH(pv_irq_ops.save_fl)] = { lgstart_pushf, lgend_pushf },
-};
-
-/*
- * Now our patch routine is fairly simple (based on the native one in
- * paravirt.c).  If we have a replacement, we copy it in and return how much of
- * the available space we used.
- */
-static unsigned lguest_patch(u8 type, u16 clobber, void *ibuf,
-			     unsigned long addr, unsigned len)
-{
-	unsigned int insn_len;
-
-	/* Don't do anything special if we don't have a replacement */
-	if (type >= ARRAY_SIZE(lguest_insns) || !lguest_insns[type].start)
-		return paravirt_patch_default(type, clobber, ibuf, addr, len);
-
-	insn_len = lguest_insns[type].end - lguest_insns[type].start;
-
-	/* Similarly if it can't fit (doesn't happen, but let's be thorough). */
-	if (len < insn_len)
-		return paravirt_patch_default(type, clobber, ibuf, addr, len);
-
-	/* Copy in our instructions. */
-	memcpy(ibuf, lguest_insns[type].start, insn_len);
-	return insn_len;
-}
-
-/*G:029
- * Once we get to lguest_init(), we know we're a Guest.  The various
- * pv_ops structures in the kernel provide points for (almost) every routine we
- * have to override to avoid privileged instructions.
- */
-__init void lguest_init(void)
-{
-	/* We're under lguest. */
-	pv_info.name = "lguest";
-	/* We're running at privilege level 1, not 0 as normal. */
-	pv_info.kernel_rpl = 1;
-	/* Everyone except Xen runs with this set. */
-	pv_info.shared_kernel_pmd = 1;
-
-	/*
-	 * We set up all the lguest overrides for sensitive operations.  These
-	 * are detailed with the operations themselves.
-	 */
-
-	/* Interrupt-related operations */
-	pv_irq_ops.save_fl = PV_CALLEE_SAVE(lguest_save_fl);
-	pv_irq_ops.restore_fl = __PV_IS_CALLEE_SAVE(lg_restore_fl);
-	pv_irq_ops.irq_disable = PV_CALLEE_SAVE(lguest_irq_disable);
-	pv_irq_ops.irq_enable = __PV_IS_CALLEE_SAVE(lg_irq_enable);
-	pv_irq_ops.safe_halt = lguest_safe_halt;
-
-	/* Setup operations */
-	pv_init_ops.patch = lguest_patch;
-
-	/* Intercepts of various CPU instructions */
-	pv_cpu_ops.load_gdt = lguest_load_gdt;
-	pv_cpu_ops.cpuid = lguest_cpuid;
-	pv_cpu_ops.load_idt = lguest_load_idt;
-	pv_cpu_ops.iret = lguest_iret;
-	pv_cpu_ops.load_sp0 = lguest_load_sp0;
-	pv_cpu_ops.load_tr_desc = lguest_load_tr_desc;
-	pv_cpu_ops.set_ldt = lguest_set_ldt;
-	pv_cpu_ops.load_tls = lguest_load_tls;
-	pv_cpu_ops.get_debugreg = lguest_get_debugreg;
-	pv_cpu_ops.set_debugreg = lguest_set_debugreg;
-	pv_cpu_ops.read_cr0 = lguest_read_cr0;
-	pv_cpu_ops.write_cr0 = lguest_write_cr0;
-	pv_cpu_ops.read_cr4 = lguest_read_cr4;
-	pv_cpu_ops.write_cr4 = lguest_write_cr4;
-	pv_cpu_ops.write_gdt_entry = lguest_write_gdt_entry;
-	pv_cpu_ops.write_idt_entry = lguest_write_idt_entry;
-	pv_cpu_ops.wbinvd = lguest_wbinvd;
-	pv_cpu_ops.start_context_switch = paravirt_start_context_switch;
-	pv_cpu_ops.end_context_switch = lguest_end_context_switch;
-
-	/* Pagetable management */
-	pv_mmu_ops.write_cr3 = lguest_write_cr3;
-	pv_mmu_ops.flush_tlb_user = lguest_flush_tlb_user;
-	pv_mmu_ops.flush_tlb_single = lguest_flush_tlb_single;
-	pv_mmu_ops.flush_tlb_kernel = lguest_flush_tlb_kernel;
-	pv_mmu_ops.set_pte = lguest_set_pte;
-	pv_mmu_ops.set_pte_at = lguest_set_pte_at;
-	pv_mmu_ops.set_pmd = lguest_set_pmd;
-#ifdef CONFIG_X86_PAE
-	pv_mmu_ops.set_pte_atomic = lguest_set_pte_atomic;
-	pv_mmu_ops.pte_clear = lguest_pte_clear;
-	pv_mmu_ops.pmd_clear = lguest_pmd_clear;
-	pv_mmu_ops.set_pud = lguest_set_pud;
-#endif
-	pv_mmu_ops.read_cr2 = lguest_read_cr2;
-	pv_mmu_ops.read_cr3 = lguest_read_cr3;
-	pv_mmu_ops.lazy_mode.enter = paravirt_enter_lazy_mmu;
-	pv_mmu_ops.lazy_mode.leave = lguest_leave_lazy_mmu_mode;
-	pv_mmu_ops.lazy_mode.flush = paravirt_flush_lazy_mmu;
-	pv_mmu_ops.pte_update = lguest_pte_update;
-
-#ifdef CONFIG_X86_LOCAL_APIC
-	/* APIC read/write intercepts */
-	set_lguest_basic_apic_ops();
-#endif
-
-	x86_init.resources.memory_setup = lguest_memory_setup;
-	x86_init.irqs.intr_init = lguest_init_IRQ;
-	x86_init.timers.timer_init = lguest_time_init;
-	x86_platform.calibrate_tsc = lguest_tsc_khz;
-	x86_platform.get_wallclock =  lguest_get_wallclock;
-
-	/*
-	 * Now is a good time to look at the implementations of these functions
-	 * before returning to the rest of lguest_init().
-	 */
-
-	/*G:070
-	 * Now we've seen all the paravirt_ops, we return to
-	 * lguest_init() where the rest of the fairly chaotic boot setup
-	 * occurs.
-	 */
-
-	/*
-	 * The stack protector is a weird thing where gcc places a canary
-	 * value on the stack and then checks it on return.  This file is
-	 * compiled with -fno-stack-protector it, so we got this far without
-	 * problems.  The value of the canary is kept at offset 20 from the
-	 * %gs register, so we need to set that up before calling C functions
-	 * in other files.
-	 */
-	setup_stack_canary_segment(0);
-
-	/*
-	 * We could just call load_stack_canary_segment(), but we might as well
-	 * call switch_to_new_gdt() which loads the whole table and sets up the
-	 * per-cpu segment descriptor register %fs as well.
-	 */
-	switch_to_new_gdt(0);
-
-	/*
-	 * The Host<->Guest Switcher lives at the top of our address space, and
-	 * the Host told us how big it is when we made LGUEST_INIT hypercall:
-	 * it put the answer in lguest_data.reserve_mem
-	 */
-	reserve_top_address(lguest_data.reserve_mem);
-
-	/* Hook in our special panic hypercall code. */
-	atomic_notifier_chain_register(&panic_notifier_list, &paniced);
-
-	/*
-	 * This is messy CPU setup stuff which the native boot code does before
-	 * start_kernel, so we have to do, too:
-	 */
-	cpu_detect(&new_cpu_data);
-	/* head.S usually sets up the first capability word, so do it here. */
-	new_cpu_data.x86_capability[CPUID_1_EDX] = cpuid_edx(1);
-
-	/* Math is always hard! */
-	set_cpu_cap(&new_cpu_data, X86_FEATURE_FPU);
-
-	/* We don't have features.  We have puppies!  Puppies! */
-#ifdef CONFIG_X86_MCE
-	mca_cfg.disabled = true;
-#endif
-#ifdef CONFIG_ACPI
-	acpi_disabled = 1;
-#endif
-
-	/*
-	 * We set the preferred console to "hvc".  This is the "hypervisor
-	 * virtual console" driver written by the PowerPC people, which we also
-	 * adapted for lguest's use.
-	 */
-	add_preferred_console("hvc", 0, NULL);
-
-	/* Register our very early console. */
-	virtio_cons_early_init(early_put_chars);
-
-	/* Don't let ACPI try to control our PCI interrupts. */
-	disable_acpi();
-
-	/* We control them ourselves, by overriding these two hooks. */
-	pcibios_enable_irq = lguest_enable_irq;
-	pcibios_disable_irq = lguest_disable_irq;
-
-	/*
-	 * Last of all, we set the power management poweroff hook to point to
-	 * the Guest routine to power off, and the reboot hook to our restart
-	 * routine.
-	 */
-	pm_power_off = lguest_power_off;
-	machine_ops.restart = lguest_restart;
-
-	/*
-	 * Now we're set up, call i386_start_kernel() in head32.c and we proceed
-	 * to boot as normal.  It never returns.
-	 */
-	i386_start_kernel();
-}
-/*
- * This marks the end of stage II of our journey, The Guest.
- *
- * It is now time for us to explore the layer of virtual drivers and complete
- * our understanding of the Guest in "make Drivers".
- */
diff --git a/arch/x86/lguest/head_32.S b/arch/x86/lguest/head_32.S
deleted file mode 100644
index d5ae63f5ec5d..000000000000
--- a/arch/x86/lguest/head_32.S
+++ /dev/null
@@ -1,192 +0,0 @@
-#include <linux/linkage.h>
-#include <linux/lguest.h>
-#include <asm/lguest_hcall.h>
-#include <asm/asm-offsets.h>
-#include <asm/thread_info.h>
-#include <asm/processor-flags.h>
-
-/*G:020
-
- * Our story starts with the bzImage: booting starts at startup_32 in
- * arch/x86/boot/compressed/head_32.S.  This merely uncompresses the real
- * kernel in place and then jumps into it: startup_32 in
- * arch/x86/kernel/head_32.S.  Both routines expects a boot header in the %esi
- * register, which is created by the bootloader (the Launcher in our case).
- *
- * The startup_32 function does very little: it clears the uninitialized global
- * C variables which we expect to be zero (ie. BSS) and then copies the boot
- * header and kernel command line somewhere safe, and populates some initial
- * page tables.  Finally it checks the 'hardware_subarch' field.  This was
- * introduced in 2.6.24 for lguest and Xen: if it's set to '1' (lguest's
- * assigned number), then it calls us here.
- *
- * WARNING: be very careful here!  We're running at addresses equal to physical
- * addresses (around 0), not above PAGE_OFFSET as most code expects
- * (eg. 0xC0000000).  Jumps are relative, so they're OK, but we can't touch any
- * data without remembering to subtract __PAGE_OFFSET!
- *
- * The .section line puts this code in .init.text so it will be discarded after
- * boot.
- */
-.section .init.text, "ax", @progbits
-ENTRY(lguest_entry)
-	/*
-	 * We make the "initialization" hypercall now to tell the Host where
-	 * our lguest_data struct is.
-	 */
-	movl $LHCALL_LGUEST_INIT, %eax
-	movl $lguest_data - __PAGE_OFFSET, %ebx
-	int $LGUEST_TRAP_ENTRY
-
-	/* Now turn our pagetables on; setup by arch/x86/kernel/head_32.S. */
-	movl $LHCALL_NEW_PGTABLE, %eax
-	movl $(initial_page_table - __PAGE_OFFSET), %ebx
-	int $LGUEST_TRAP_ENTRY
-
-	/* Set up the initial stack so we can run C code. */
-	movl $(init_thread_union+THREAD_SIZE),%esp
-
-	/* Jumps are relative: we're running __PAGE_OFFSET too low. */
-	jmp lguest_init+__PAGE_OFFSET
-
-/*G:055
- * We create a macro which puts the assembler code between lgstart_ and lgend_
- * markers.  These templates are put in the .text section: they can't be
- * discarded after boot as we may need to patch modules, too.
- */
-.text
-#define LGUEST_PATCH(name, insns...)			\
-	lgstart_##name:	insns; lgend_##name:;		\
-	.globl lgstart_##name; .globl lgend_##name
-
-LGUEST_PATCH(cli, movl $0, lguest_data+LGUEST_DATA_irq_enabled)
-LGUEST_PATCH(pushf, movl lguest_data+LGUEST_DATA_irq_enabled, %eax)
-
-/*G:033
- * But using those wrappers is inefficient (we'll see why that doesn't matter
- * for save_fl and irq_disable later).  If we write our routines carefully in
- * assembler, we can avoid clobbering any registers and avoid jumping through
- * the wrapper functions.
- *
- * I skipped over our first piece of assembler, but this one is worth studying
- * in a bit more detail so I'll describe in easy stages.  First, the routine to
- * enable interrupts:
- */
-ENTRY(lg_irq_enable)
-	/*
-	 * The reverse of irq_disable, this sets lguest_data.irq_enabled to
-	 * X86_EFLAGS_IF (ie. "Interrupts enabled").
-	 */
-	movl $X86_EFLAGS_IF, lguest_data+LGUEST_DATA_irq_enabled
-	/*
-	 * But now we need to check if the Host wants to know: there might have
-	 * been interrupts waiting to be delivered, in which case it will have
-	 * set lguest_data.irq_pending to X86_EFLAGS_IF.  If it's not zero, we
-	 * jump to send_interrupts, otherwise we're done.
-	 */
-	cmpl $0, lguest_data+LGUEST_DATA_irq_pending
-	jnz send_interrupts
-	/*
-	 * One cool thing about x86 is that you can do many things without using
-	 * a register.  In this case, the normal path hasn't needed to save or
-	 * restore any registers at all!
-	 */
-	ret
-send_interrupts:
-	/*
-	 * OK, now we need a register: eax is used for the hypercall number,
-	 * which is LHCALL_SEND_INTERRUPTS.
-	 *
-	 * We used not to bother with this pending detection at all, which was
-	 * much simpler.  Sooner or later the Host would realize it had to
-	 * send us an interrupt.  But that turns out to make performance 7
-	 * times worse on a simple tcp benchmark.  So now we do this the hard
-	 * way.
-	 */
-	pushl %eax
-	movl $LHCALL_SEND_INTERRUPTS, %eax
-	/* This is the actual hypercall trap. */
-	int  $LGUEST_TRAP_ENTRY
-	/* Put eax back the way we found it. */
-	popl %eax
-	ret
-
-/*
- * Finally, the "popf" or "restore flags" routine.  The %eax register holds the
- * flags (in practice, either X86_EFLAGS_IF or 0): if it's X86_EFLAGS_IF we're
- * enabling interrupts again, if it's 0 we're leaving them off.
- */
-ENTRY(lg_restore_fl)
-	/* This is just "lguest_data.irq_enabled = flags;" */
-	movl %eax, lguest_data+LGUEST_DATA_irq_enabled
-	/*
-	 * Now, if the %eax value has enabled interrupts and
-	 * lguest_data.irq_pending is set, we want to tell the Host so it can
-	 * deliver any outstanding interrupts.  Fortunately, both values will
-	 * be X86_EFLAGS_IF (ie. 512) in that case, and the "testl"
-	 * instruction will AND them together for us.  If both are set, we
-	 * jump to send_interrupts.
-	 */
-	testl lguest_data+LGUEST_DATA_irq_pending, %eax
-	jnz send_interrupts
-	/* Again, the normal path has used no extra registers.  Clever, huh? */
-	ret
-/*:*/
-
-/* These demark the EIP where host should never deliver interrupts. */
-.global lguest_noirq_iret
-
-/*M:004
- * When the Host reflects a trap or injects an interrupt into the Guest, it
- * sets the eflags interrupt bit on the stack based on lguest_data.irq_enabled,
- * so the Guest iret logic does the right thing when restoring it.  However,
- * when the Host sets the Guest up for direct traps, such as system calls, the
- * processor is the one to push eflags onto the stack, and the interrupt bit
- * will be 1 (in reality, interrupts are always enabled in the Guest).
- *
- * This turns out to be harmless: the only trap which should happen under Linux
- * with interrupts disabled is Page Fault (due to our lazy mapping of vmalloc
- * regions), which has to be reflected through the Host anyway.  If another
- * trap *does* go off when interrupts are disabled, the Guest will panic, and
- * we'll never get to this iret!
-:*/
-
-/*G:045
- * There is one final paravirt_op that the Guest implements, and glancing at it
- * you can see why I left it to last.  It's *cool*!  It's in *assembler*!
- *
- * The "iret" instruction is used to return from an interrupt or trap.  The
- * stack looks like this:
- *   old address
- *   old code segment & privilege level
- *   old processor flags ("eflags")
- *
- * The "iret" instruction pops those values off the stack and restores them all
- * at once.  The only problem is that eflags includes the Interrupt Flag which
- * the Guest can't change: the CPU will simply ignore it when we do an "iret".
- * So we have to copy eflags from the stack to lguest_data.irq_enabled before
- * we do the "iret".
- *
- * There are two problems with this: firstly, we can't clobber any registers
- * and secondly, the whole thing needs to be atomic.  The first problem
- * is solved by using "push memory"/"pop memory" instruction pair for copying.
- *
- * The second is harder: copying eflags to lguest_data.irq_enabled will turn
- * interrupts on before we're finished, so we could be interrupted before we
- * return to userspace or wherever.  Our solution to this is to tell the
- * Host that it is *never* to interrupt us there, even if interrupts seem to be
- * enabled. (It's not necessary to protect pop instruction, since
- * data gets updated only after it completes, so we only need to protect
- * one instruction, iret).
- */
-ENTRY(lguest_iret)
-	pushl	2*4(%esp)
-	/*
-	 * Note the %ss: segment prefix here.  Normal data accesses use the
-	 * "ds" segment, but that will have already been restored for whatever
-	 * we're returning to (such as userspace): we can't trust it.  The %ss:
-	 * prefix makes sure we use the stack segment, which is still valid.
-	 */
-	popl	%ss:lguest_data+LGUEST_DATA_irq_enabled
-lguest_noirq_iret:
-	iret
diff --git a/drivers/Makefile b/drivers/Makefile
index dfdcda00bfe3..d90fdc413648 100644
--- a/drivers/Makefile
+++ b/drivers/Makefile
@@ -125,7 +125,6 @@ obj-$(CONFIG_ACCESSIBILITY)	+= accessibility/
 obj-$(CONFIG_ISDN)		+= isdn/
 obj-$(CONFIG_EDAC)		+= edac/
 obj-$(CONFIG_EISA)		+= eisa/
-obj-y				+= lguest/
 obj-$(CONFIG_CPU_FREQ)		+= cpufreq/
 obj-$(CONFIG_CPU_IDLE)		+= cpuidle/
 obj-y				+= mmc/
diff --git a/drivers/block/Kconfig b/drivers/block/Kconfig
index 8ddc98279c8f..80aaf3420e12 100644
--- a/drivers/block/Kconfig
+++ b/drivers/block/Kconfig
@@ -470,7 +470,7 @@ config VIRTIO_BLK
 	depends on VIRTIO
 	---help---
 	  This is the virtual block driver for virtio.  It can be used with
-          lguest or QEMU based VMMs (like KVM or Xen).  Say Y or M.
+          QEMU based VMMs (like KVM or Xen).  Say Y or M.
 
 config VIRTIO_BLK_SCSI
 	bool "SCSI passthrough request for the Virtio block driver"
diff --git a/drivers/char/Kconfig b/drivers/char/Kconfig
index ccd239ab879f..623714344600 100644
--- a/drivers/char/Kconfig
+++ b/drivers/char/Kconfig
@@ -161,7 +161,7 @@ config VIRTIO_CONSOLE
 	depends on VIRTIO && TTY
 	select HVC_DRIVER
 	help
-	  Virtio console for use with lguest and other hypervisors.
+	  Virtio console for use with hypervisors.
 
 	  Also serves as a general-purpose serial device for data
 	  transfer between the guest and host.  Character devices at
diff --git a/drivers/char/virtio_console.c b/drivers/char/virtio_console.c
index ad843eb02ae7..4d229dde6522 100644
--- a/drivers/char/virtio_console.c
+++ b/drivers/char/virtio_console.c
@@ -1130,7 +1130,7 @@ static const struct file_operations port_fops = {
  * We turn the characters into a scatter-gather list, add it to the
  * output queue and then kick the Host.  Then we sit here waiting for
  * it to finish: inefficient in theory, but in practice
- * implementations will do it immediately (lguest's Launcher does).
+ * implementations will do it immediately.
  */
 static int put_chars(u32 vtermno, const char *buf, int count)
 {
diff --git a/drivers/lguest/Kconfig b/drivers/lguest/Kconfig
deleted file mode 100644
index 169172d2ba05..000000000000
--- a/drivers/lguest/Kconfig
+++ /dev/null
@@ -1,13 +0,0 @@
-config LGUEST
-	tristate "Linux hypervisor example code"
-	depends on X86_32 && EVENTFD && TTY && PCI_DIRECT
-	select HVC_DRIVER
-	---help---
-	  This is a very simple module which allows you to run
-	  multiple instances of the same Linux kernel, using the
-	  "lguest" command found in the tools/lguest directory.
-
-	  Note that "lguest" is pronounced to rhyme with "fell quest",
-	  not "rustyvisor". See tools/lguest/lguest.txt.
-
-	  If unsure, say N.  If curious, say M.  If masochistic, say Y.
diff --git a/drivers/lguest/Makefile b/drivers/lguest/Makefile
deleted file mode 100644
index 16f52ee73994..000000000000
--- a/drivers/lguest/Makefile
+++ /dev/null
@@ -1,26 +0,0 @@
-# Host requires the other files, which can be a module.
-obj-$(CONFIG_LGUEST)	+= lg.o
-lg-y = core.o hypercalls.o page_tables.o interrupts_and_traps.o \
-	segments.o lguest_user.o
-
-lg-$(CONFIG_X86_32) += x86/switcher_32.o x86/core.o
-
-Preparation Preparation!: PREFIX=P
-Guest: PREFIX=G
-Drivers: PREFIX=D
-Launcher: PREFIX=L
-Host: PREFIX=H
-Switcher: PREFIX=S
-Mastery: PREFIX=M
-Beer:
-	@for f in Preparation Guest Drivers Launcher Host Switcher Mastery; do echo "{==- $$f -==}"; make -s $$f; done; echo "{==-==}"
-Preparation Preparation! Guest Drivers Launcher Host Switcher Mastery:
-	@sh ../../tools/lguest/extract $(PREFIX) `find ../../* -name '*.[chS]' -wholename '*lguest*'`
-Puppy:
-	@clear
-	@printf "      __  \n (___()'\`;\n /,    /\`\n \\\\\\\"--\\\\\\   \n"
-	@sleep 2; clear; printf "\n\n   Sit!\n\n"; sleep 1; clear
-	@printf "    __    \n   ()'\`;  \n   /\\|\` \n  /  |  \n(/_)_|_   \n"
-	@sleep 2; clear; printf "\n\n  Stand!\n\n"; sleep 1; clear
-	@printf "    __    \n   ()'\`;  \n   /\\|\` \n  /._.= \n /| /     \n(_\_)_    \n"
-	@sleep 2; clear; printf "\n\n  Good puppy!\n\n"; sleep 1; clear
diff --git a/drivers/lguest/README b/drivers/lguest/README
deleted file mode 100644
index b7db39a64c66..000000000000
--- a/drivers/lguest/README
+++ /dev/null
@@ -1,47 +0,0 @@
-Welcome, friend reader, to lguest.
-
-Lguest is an adventure, with you, the reader, as Hero.  I can't think of many
-5000-line projects which offer both such capability and glimpses of future
-potential; it is an exciting time to be delving into the source!
-
-But be warned; this is an arduous journey of several hours or more!  And as we
-know, all true Heroes are driven by a Noble Goal.  Thus I offer a Beer (or
-equivalent) to anyone I meet who has completed this documentation.
-
-So get comfortable and keep your wits about you (both quick and humorous).
-Along your way to the Noble Goal, you will also gain masterly insight into
-lguest, and hypervisors and x86 virtualization in general.
-
-Our Quest is in seven parts: (best read with C highlighting turned on)
-
-I) Preparation
-	- In which our potential hero is flown quickly over the landscape for a
-	  taste of its scope.  Suitable for the armchair coders and other such
-	  persons of faint constitution.
-
-II) Guest
-	- Where we encounter the first tantalising wisps of code, and come to
-	  understand the details of the life of a Guest kernel.
-
-III) Drivers
-	- Whereby the Guest finds its voice and become useful, and our
-	  understanding of the Guest is completed.
-
-IV) Launcher
-	- Where we trace back to the creation of the Guest, and thus begin our
-	  understanding of the Host.
-
-V) Host
-	- Where we master the Host code, through a long and tortuous journey.
-	  Indeed, it is here that our hero is tested in the Bit of Despair.
-
-VI) Switcher
-	- Where our understanding of the intertwined nature of Guests and Hosts
-	  is completed.
-
-VII) Mastery
-	- Where our fully fledged hero grapples with the Great Question:
-	  "What next?"
-
-make Preparation!
-Rusty Russell.
diff --git a/drivers/lguest/core.c b/drivers/lguest/core.c
deleted file mode 100644
index 395ed1961dbf..000000000000
--- a/drivers/lguest/core.c
+++ /dev/null
@@ -1,398 +0,0 @@
-/*P:400
- * This contains run_guest() which actually calls into the Host<->Guest
- * Switcher and analyzes the return, such as determining if the Guest wants the
- * Host to do something.  This file also contains useful helper routines.
-:*/
-#include <linux/module.h>
-#include <linux/stringify.h>
-#include <linux/stddef.h>
-#include <linux/io.h>
-#include <linux/mm.h>
-#include <linux/sched/signal.h>
-#include <linux/vmalloc.h>
-#include <linux/cpu.h>
-#include <linux/freezer.h>
-#include <linux/highmem.h>
-#include <linux/slab.h>
-#include <asm/paravirt.h>
-#include <asm/pgtable.h>
-#include <linux/uaccess.h>
-#include <asm/poll.h>
-#include <asm/asm-offsets.h>
-#include "lg.h"
-
-unsigned long switcher_addr;
-struct page **lg_switcher_pages;
-static struct vm_struct *switcher_text_vma;
-static struct vm_struct *switcher_stacks_vma;
-
-/* This One Big lock protects all inter-guest data structures. */
-DEFINE_MUTEX(lguest_lock);
-
-/*H:010
- * We need to set up the Switcher at a high virtual address.  Remember the
- * Switcher is a few hundred bytes of assembler code which actually changes the
- * CPU to run the Guest, and then changes back to the Host when a trap or
- * interrupt happens.
- *
- * The Switcher code must be at the same virtual address in the Guest as the
- * Host since it will be running as the switchover occurs.
- *
- * Trying to map memory at a particular address is an unusual thing to do, so
- * it's not a simple one-liner.
- */
-static __init int map_switcher(void)
-{
-	int i, err;
-
-	/*
-	 * Map the Switcher in to high memory.
-	 *
-	 * It turns out that if we choose the address 0xFFC00000 (4MB under the
-	 * top virtual address), it makes setting up the page tables really
-	 * easy.
-	 */
-
-	/* We assume Switcher text fits into a single page. */
-	if (end_switcher_text - start_switcher_text > PAGE_SIZE) {
-		printk(KERN_ERR "lguest: switcher text too large (%zu)\n",
-		       end_switcher_text - start_switcher_text);
-		return -EINVAL;
-	}
-
-	/*
-	 * We allocate an array of struct page pointers.  map_vm_area() wants
-	 * this, rather than just an array of pages.
-	 */
-	lg_switcher_pages = kmalloc(sizeof(lg_switcher_pages[0])
-				    * TOTAL_SWITCHER_PAGES,
-				    GFP_KERNEL);
-	if (!lg_switcher_pages) {
-		err = -ENOMEM;
-		goto out;
-	}
-
-	/*
-	 * Now we actually allocate the pages.  The Guest will see these pages,
-	 * so we make sure they're zeroed.
-	 */
-	for (i = 0; i < TOTAL_SWITCHER_PAGES; i++) {
-		lg_switcher_pages[i] = alloc_page(GFP_KERNEL|__GFP_ZERO);
-		if (!lg_switcher_pages[i]) {
-			err = -ENOMEM;
-			goto free_some_pages;
-		}
-	}
-
-	/*
-	 * Copy in the compiled-in Switcher code (from x86/switcher_32.S).
-	 * It goes in the first page, which we map in momentarily.
-	 */
-	memcpy(kmap(lg_switcher_pages[0]), start_switcher_text,
-	       end_switcher_text - start_switcher_text);
-	kunmap(lg_switcher_pages[0]);
-
-	/*
-	 * We place the Switcher underneath the fixmap area, which is the
-	 * highest virtual address we can get.  This is important, since we
-	 * tell the Guest it can't access this memory, so we want its ceiling
-	 * as high as possible.
-	 */
-	switcher_addr = FIXADDR_START - TOTAL_SWITCHER_PAGES*PAGE_SIZE;
-
-	/*
-	 * Now we reserve the "virtual memory area"s we want.  We might
-	 * not get them in theory, but in practice it's worked so far.
-	 *
-	 * We want the switcher text to be read-only and executable, and
-	 * the stacks to be read-write and non-executable.
-	 */
-	switcher_text_vma = __get_vm_area(PAGE_SIZE, VM_ALLOC|VM_NO_GUARD,
-					  switcher_addr,
-					  switcher_addr + PAGE_SIZE);
-
-	if (!switcher_text_vma) {
-		err = -ENOMEM;
-		printk("lguest: could not map switcher pages high\n");
-		goto free_pages;
-	}
-
-	switcher_stacks_vma = __get_vm_area(SWITCHER_STACK_PAGES * PAGE_SIZE,
-					    VM_ALLOC|VM_NO_GUARD,
-					    switcher_addr + PAGE_SIZE,
-					    switcher_addr + TOTAL_SWITCHER_PAGES * PAGE_SIZE);
-	if (!switcher_stacks_vma) {
-		err = -ENOMEM;
-		printk("lguest: could not map switcher pages high\n");
-		goto free_text_vma;
-	}
-
-	/*
-	 * This code actually sets up the pages we've allocated to appear at
-	 * switcher_addr.  map_vm_area() takes the vma we allocated above, the
-	 * kind of pages we're mapping (kernel text pages and kernel writable
-	 * pages respectively), and a pointer to our array of struct pages.
-	 */
-	err = map_vm_area(switcher_text_vma, PAGE_KERNEL_RX, lg_switcher_pages);
-	if (err) {
-		printk("lguest: text map_vm_area failed: %i\n", err);
-		goto free_vmas;
-	}
-
-	err = map_vm_area(switcher_stacks_vma, PAGE_KERNEL,
-			  lg_switcher_pages + SWITCHER_TEXT_PAGES);
-	if (err) {
-		printk("lguest: stacks map_vm_area failed: %i\n", err);
-		goto free_vmas;
-	}
-
-	/*
-	 * Now the Switcher is mapped at the right address, we can't fail!
-	 */
-	printk(KERN_INFO "lguest: mapped switcher at %p\n",
-	       switcher_text_vma->addr);
-	/* And we succeeded... */
-	return 0;
-
-free_vmas:
-	/* Undoes map_vm_area and __get_vm_area */
-	vunmap(switcher_stacks_vma->addr);
-free_text_vma:
-	vunmap(switcher_text_vma->addr);
-free_pages:
-	i = TOTAL_SWITCHER_PAGES;
-free_some_pages:
-	for (--i; i >= 0; i--)
-		__free_pages(lg_switcher_pages[i], 0);
-	kfree(lg_switcher_pages);
-out:
-	return err;
-}
-/*:*/
-
-/* Cleaning up the mapping when the module is unloaded is almost... too easy. */
-static void unmap_switcher(void)
-{
-	unsigned int i;
-
-	/* vunmap() undoes *both* map_vm_area() and __get_vm_area(). */
-	vunmap(switcher_text_vma->addr);
-	vunmap(switcher_stacks_vma->addr);
-	/* Now we just need to free the pages we copied the switcher into */
-	for (i = 0; i < TOTAL_SWITCHER_PAGES; i++)
-		__free_pages(lg_switcher_pages[i], 0);
-	kfree(lg_switcher_pages);
-}
-
-/*H:032
- * Dealing With Guest Memory.
- *
- * Before we go too much further into the Host, we need to grok the routines
- * we use to deal with Guest memory.
- *
- * When the Guest gives us (what it thinks is) a physical address, we can use
- * the normal copy_from_user() & copy_to_user() on the corresponding place in
- * the memory region allocated by the Launcher.
- *
- * But we can't trust the Guest: it might be trying to access the Launcher
- * code.  We have to check that the range is below the pfn_limit the Launcher
- * gave us.  We have to make sure that addr + len doesn't give us a false
- * positive by overflowing, too.
- */
-bool lguest_address_ok(const struct lguest *lg,
-		       unsigned long addr, unsigned long len)
-{
-	return addr+len <= lg->pfn_limit * PAGE_SIZE && (addr+len >= addr);
-}
-
-/*
- * This routine copies memory from the Guest.  Here we can see how useful the
- * kill_lguest() routine we met in the Launcher can be: we return a random
- * value (all zeroes) instead of needing to return an error.
- */
-void __lgread(struct lg_cpu *cpu, void *b, unsigned long addr, unsigned bytes)
-{
-	if (!lguest_address_ok(cpu->lg, addr, bytes)
-	    || copy_from_user(b, cpu->lg->mem_base + addr, bytes) != 0) {
-		/* copy_from_user should do this, but as we rely on it... */
-		memset(b, 0, bytes);
-		kill_guest(cpu, "bad read address %#lx len %u", addr, bytes);
-	}
-}
-
-/* This is the write (copy into Guest) version. */
-void __lgwrite(struct lg_cpu *cpu, unsigned long addr, const void *b,
-	       unsigned bytes)
-{
-	if (!lguest_address_ok(cpu->lg, addr, bytes)
-	    || copy_to_user(cpu->lg->mem_base + addr, b, bytes) != 0)
-		kill_guest(cpu, "bad write address %#lx len %u", addr, bytes);
-}
-/*:*/
-
-/*H:030
- * Let's jump straight to the the main loop which runs the Guest.
- * Remember, this is called by the Launcher reading /dev/lguest, and we keep
- * going around and around until something interesting happens.
- */
-int run_guest(struct lg_cpu *cpu, unsigned long __user *user)
-{
-	/* If the launcher asked for a register with LHREQ_GETREG */
-	if (cpu->reg_read) {
-		if (put_user(*cpu->reg_read, user))
-			return -EFAULT;
-		cpu->reg_read = NULL;
-		return sizeof(*cpu->reg_read);
-	}
-
-	/* We stop running once the Guest is dead. */
-	while (!cpu->lg->dead) {
-		unsigned int irq;
-		bool more;
-
-		/* First we run any hypercalls the Guest wants done. */
-		if (cpu->hcall)
-			do_hypercalls(cpu);
-
-		/* Do we have to tell the Launcher about a trap? */
-		if (cpu->pending.trap) {
-			if (copy_to_user(user, &cpu->pending,
-					 sizeof(cpu->pending)))
-				return -EFAULT;
-			return sizeof(cpu->pending);
-		}
-
-		/*
-		 * All long-lived kernel loops need to check with this horrible
-		 * thing called the freezer.  If the Host is trying to suspend,
-		 * it stops us.
-		 */
-		try_to_freeze();
-
-		/* Check for signals */
-		if (signal_pending(current))
-			return -ERESTARTSYS;
-
-		/*
-		 * Check if there are any interrupts which can be delivered now:
-		 * if so, this sets up the hander to be executed when we next
-		 * run the Guest.
-		 */
-		irq = interrupt_pending(cpu, &more);
-		if (irq < LGUEST_IRQS)
-			try_deliver_interrupt(cpu, irq, more);
-
-		/*
-		 * Just make absolutely sure the Guest is still alive.  One of
-		 * those hypercalls could have been fatal, for example.
-		 */
-		if (cpu->lg->dead)
-			break;
-
-		/*
-		 * If the Guest asked to be stopped, we sleep.  The Guest's
-		 * clock timer will wake us.
-		 */
-		if (cpu->halted) {
-			set_current_state(TASK_INTERRUPTIBLE);
-			/*
-			 * Just before we sleep, make sure no interrupt snuck in
-			 * which we should be doing.
-			 */
-			if (interrupt_pending(cpu, &more) < LGUEST_IRQS)
-				set_current_state(TASK_RUNNING);
-			else
-				schedule();
-			continue;
-		}
-
-		/*
-		 * OK, now we're ready to jump into the Guest.  First we put up
-		 * the "Do Not Disturb" sign:
-		 */
-		local_irq_disable();
-
-		/* Actually run the Guest until something happens. */
-		lguest_arch_run_guest(cpu);
-
-		/* Now we're ready to be interrupted or moved to other CPUs */
-		local_irq_enable();
-
-		/* Now we deal with whatever happened to the Guest. */
-		lguest_arch_handle_trap(cpu);
-	}
-
-	/* Special case: Guest is 'dead' but wants a reboot. */
-	if (cpu->lg->dead == ERR_PTR(-ERESTART))
-		return -ERESTART;
-
-	/* The Guest is dead => "No such file or directory" */
-	return -ENOENT;
-}
-
-/*H:000
- * Welcome to the Host!
- *
- * By this point your brain has been tickled by the Guest code and numbed by
- * the Launcher code; prepare for it to be stretched by the Host code.  This is
- * the heart.  Let's begin at the initialization routine for the Host's lg
- * module.
- */
-static int __init init(void)
-{
-	int err;
-
-	/* Lguest can't run under Xen, VMI or itself.  It does Tricky Stuff. */
-	if (get_kernel_rpl() != 0) {
-		printk("lguest is afraid of being a guest\n");
-		return -EPERM;
-	}
-
-	/* First we put the Switcher up in very high virtual memory. */
-	err = map_switcher();
-	if (err)
-		goto out;
-
-	/* We might need to reserve an interrupt vector. */
-	err = init_interrupts();
-	if (err)
-		goto unmap;
-
-	/* /dev/lguest needs to be registered. */
-	err = lguest_device_init();
-	if (err)
-		goto free_interrupts;
-
-	/* Finally we do some architecture-specific setup. */
-	lguest_arch_host_init();
-
-	/* All good! */
-	return 0;
-
-free_interrupts:
-	free_interrupts();
-unmap:
-	unmap_switcher();
-out:
-	return err;
-}
-
-/* Cleaning up is just the same code, backwards.  With a little French. */
-static void __exit fini(void)
-{
-	lguest_device_remove();
-	free_interrupts();
-	unmap_switcher();
-
-	lguest_arch_host_fini();
-}
-/*:*/
-
-/*
- * The Host side of lguest can be a module.  This is a nice way for people to
- * play with it.
- */
-module_init(init);
-module_exit(fini);
-MODULE_LICENSE("GPL");
-MODULE_AUTHOR("Rusty Russell <rusty@rustcorp.com.au>");
diff --git a/drivers/lguest/hypercalls.c b/drivers/lguest/hypercalls.c
deleted file mode 100644
index 601f81c04873..000000000000
--- a/drivers/lguest/hypercalls.c
+++ /dev/null
@@ -1,304 +0,0 @@
-/*P:500
- * Just as userspace programs request kernel operations through a system
- * call, the Guest requests Host operations through a "hypercall".  You might
- * notice this nomenclature doesn't really follow any logic, but the name has
- * been around for long enough that we're stuck with it.  As you'd expect, this
- * code is basically a one big switch statement.
-:*/
-
-/*  Copyright (C) 2006 Rusty Russell IBM Corporation
-
-    This program is free software; you can redistribute it and/or modify
-    it under the terms of the GNU General Public License as published by
-    the Free Software Foundation; either version 2 of the License, or
-    (at your option) any later version.
-
-    This program is distributed in the hope that it will be useful,
-    but WITHOUT ANY WARRANTY; without even the implied warranty of
-    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
-    GNU General Public License for more details.
-
-    You should have received a copy of the GNU General Public License
-    along with this program; if not, write to the Free Software
-    Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA  02110-1301 USA
-*/
-#include <linux/uaccess.h>
-#include <linux/syscalls.h>
-#include <linux/mm.h>
-#include <linux/ktime.h>
-#include <asm/page.h>
-#include <asm/pgtable.h>
-#include "lg.h"
-
-/*H:120
- * This is the core hypercall routine: where the Guest gets what it wants.
- * Or gets killed.  Or, in the case of LHCALL_SHUTDOWN, both.
- */
-static void do_hcall(struct lg_cpu *cpu, struct hcall_args *args)
-{
-	switch (args->arg0) {
-	case LHCALL_FLUSH_ASYNC:
-		/*
-		 * This call does nothing, except by breaking out of the Guest
-		 * it makes us process all the asynchronous hypercalls.
-		 */
-		break;
-	case LHCALL_SEND_INTERRUPTS:
-		/*
-		 * This call does nothing too, but by breaking out of the Guest
-		 * it makes us process any pending interrupts.
-		 */
-		break;
-	case LHCALL_LGUEST_INIT:
-		/*
-		 * You can't get here unless you're already initialized.  Don't
-		 * do that.
-		 */
-		kill_guest(cpu, "already have lguest_data");
-		break;
-	case LHCALL_SHUTDOWN: {
-		char msg[128];
-		/*
-		 * Shutdown is such a trivial hypercall that we do it in five
-		 * lines right here.
-		 *
-		 * If the lgread fails, it will call kill_guest() itself; the
-		 * kill_guest() with the message will be ignored.
-		 */
-		__lgread(cpu, msg, args->arg1, sizeof(msg));
-		msg[sizeof(msg)-1] = '\0';
-		kill_guest(cpu, "CRASH: %s", msg);
-		if (args->arg2 == LGUEST_SHUTDOWN_RESTART)
-			cpu->lg->dead = ERR_PTR(-ERESTART);
-		break;
-	}
-	case LHCALL_FLUSH_TLB:
-		/* FLUSH_TLB comes in two flavors, depending on the argument: */
-		if (args->arg1)
-			guest_pagetable_clear_all(cpu);
-		else
-			guest_pagetable_flush_user(cpu);
-		break;
-
-	/*
-	 * All these calls simply pass the arguments through to the right
-	 * routines.
-	 */
-	case LHCALL_NEW_PGTABLE:
-		guest_new_pagetable(cpu, args->arg1);
-		break;
-	case LHCALL_SET_STACK:
-		guest_set_stack(cpu, args->arg1, args->arg2, args->arg3);
-		break;
-	case LHCALL_SET_PTE:
-#ifdef CONFIG_X86_PAE
-		guest_set_pte(cpu, args->arg1, args->arg2,
-				__pte(args->arg3 | (u64)args->arg4 << 32));
-#else
-		guest_set_pte(cpu, args->arg1, args->arg2, __pte(args->arg3));
-#endif
-		break;
-	case LHCALL_SET_PGD:
-		guest_set_pgd(cpu->lg, args->arg1, args->arg2);
-		break;
-#ifdef CONFIG_X86_PAE
-	case LHCALL_SET_PMD:
-		guest_set_pmd(cpu->lg, args->arg1, args->arg2);
-		break;
-#endif
-	case LHCALL_SET_CLOCKEVENT:
-		guest_set_clockevent(cpu, args->arg1);
-		break;
-	case LHCALL_HALT:
-		/* Similarly, this sets the halted flag for run_guest(). */
-		cpu->halted = 1;
-		break;
-	default:
-		/* It should be an architecture-specific hypercall. */
-		if (lguest_arch_do_hcall(cpu, args))
-			kill_guest(cpu, "Bad hypercall %li\n", args->arg0);
-	}
-}
-
-/*H:124
- * Asynchronous hypercalls are easy: we just look in the array in the
- * Guest's "struct lguest_data" to see if any new ones are marked "ready".
- *
- * We are careful to do these in order: obviously we respect the order the
- * Guest put them in the ring, but we also promise the Guest that they will
- * happen before any normal hypercall (which is why we check this before
- * checking for a normal hcall).
- */
-static void do_async_hcalls(struct lg_cpu *cpu)
-{
-	unsigned int i;
-	u8 st[LHCALL_RING_SIZE];
-
-	/* For simplicity, we copy the entire call status array in at once. */
-	if (copy_from_user(&st, &cpu->lg->lguest_data->hcall_status, sizeof(st)))
-		return;
-
-	/* We process "struct lguest_data"s hcalls[] ring once. */
-	for (i = 0; i < ARRAY_SIZE(st); i++) {
-		struct hcall_args args;
-		/*
-		 * We remember where we were up to from last time.  This makes
-		 * sure that the hypercalls are done in the order the Guest
-		 * places them in the ring.
-		 */
-		unsigned int n = cpu->next_hcall;
-
-		/* 0xFF means there's no call here (yet). */
-		if (st[n] == 0xFF)
-			break;
-
-		/*
-		 * OK, we have hypercall.  Increment the "next_hcall" cursor,
-		 * and wrap back to 0 if we reach the end.
-		 */
-		if (++cpu->next_hcall == LHCALL_RING_SIZE)
-			cpu->next_hcall = 0;
-
-		/*
-		 * Copy the hypercall arguments into a local copy of the
-		 * hcall_args struct.
-		 */
-		if (copy_from_user(&args, &cpu->lg->lguest_data->hcalls[n],
-				   sizeof(struct hcall_args))) {
-			kill_guest(cpu, "Fetching async hypercalls");
-			break;
-		}
-
-		/* Do the hypercall, same as a normal one. */
-		do_hcall(cpu, &args);
-
-		/* Mark the hypercall done. */
-		if (put_user(0xFF, &cpu->lg->lguest_data->hcall_status[n])) {
-			kill_guest(cpu, "Writing result for async hypercall");
-			break;
-		}
-
-		/*
-		 * Stop doing hypercalls if they want to notify the Launcher:
-		 * it needs to service this first.
-		 */
-		if (cpu->pending.trap)
-			break;
-	}
-}
-
-/*
- * Last of all, we look at what happens first of all.  The very first time the
- * Guest makes a hypercall, we end up here to set things up:
- */
-static void initialize(struct lg_cpu *cpu)
-{
-	/*
-	 * You can't do anything until you're initialized.  The Guest knows the
-	 * rules, so we're unforgiving here.
-	 */
-	if (cpu->hcall->arg0 != LHCALL_LGUEST_INIT) {
-		kill_guest(cpu, "hypercall %li before INIT", cpu->hcall->arg0);
-		return;
-	}
-
-	if (lguest_arch_init_hypercalls(cpu))
-		kill_guest(cpu, "bad guest page %p", cpu->lg->lguest_data);
-
-	/*
-	 * The Guest tells us where we're not to deliver interrupts by putting
-	 * the instruction address into "struct lguest_data".
-	 */
-	if (get_user(cpu->lg->noirq_iret, &cpu->lg->lguest_data->noirq_iret))
-		kill_guest(cpu, "bad guest page %p", cpu->lg->lguest_data);
-
-	/*
-	 * We write the current time into the Guest's data page once so it can
-	 * set its clock.
-	 */
-	write_timestamp(cpu);
-
-	/* page_tables.c will also do some setup. */
-	page_table_guest_data_init(cpu);
-
-	/*
-	 * This is the one case where the above accesses might have been the
-	 * first write to a Guest page.  This may have caused a copy-on-write
-	 * fault, but the old page might be (read-only) in the Guest
-	 * pagetable.
-	 */
-	guest_pagetable_clear_all(cpu);
-}
-/*:*/
-
-/*M:013
- * If a Guest reads from a page (so creates a mapping) that it has never
- * written to, and then the Launcher writes to it (ie. the output of a virtual
- * device), the Guest will still see the old page.  In practice, this never
- * happens: why would the Guest read a page which it has never written to?  But
- * a similar scenario might one day bite us, so it's worth mentioning.
- *
- * Note that if we used a shared anonymous mapping in the Launcher instead of
- * mapping /dev/zero private, we wouldn't worry about cop-on-write.  And we
- * need that to switch the Launcher to processes (away from threads) anyway.
-:*/
-
-/*H:100
- * Hypercalls
- *
- * Remember from the Guest, hypercalls come in two flavors: normal and
- * asynchronous.  This file handles both of types.
- */
-void do_hypercalls(struct lg_cpu *cpu)
-{
-	/* Not initialized yet?  This hypercall must do it. */
-	if (unlikely(!cpu->lg->lguest_data)) {
-		/* Set up the "struct lguest_data" */
-		initialize(cpu);
-		/* Hcall is done. */
-		cpu->hcall = NULL;
-		return;
-	}
-
-	/*
-	 * The Guest has initialized.
-	 *
-	 * Look in the hypercall ring for the async hypercalls:
-	 */
-	do_async_hcalls(cpu);
-
-	/*
-	 * If we stopped reading the hypercall ring because the Guest did a
-	 * NOTIFY to the Launcher, we want to return now.  Otherwise we do
-	 * the hypercall.
-	 */
-	if (!cpu->pending.trap) {
-		do_hcall(cpu, cpu->hcall);
-		/*
-		 * Tricky point: we reset the hcall pointer to mark the
-		 * hypercall as "done".  We use the hcall pointer rather than
-		 * the trap number to indicate a hypercall is pending.
-		 * Normally it doesn't matter: the Guest will run again and
-		 * update the trap number before we come back here.
-		 *
-		 * However, if we are signalled or the Guest sends I/O to the
-		 * Launcher, the run_guest() loop will exit without running the
-		 * Guest.  When it comes back it would try to re-run the
-		 * hypercall.  Finding that bug sucked.
-		 */
-		cpu->hcall = NULL;
-	}
-}
-
-/*
- * This routine supplies the Guest with time: it's used for wallclock time at
- * initial boot and as a rough time source if the TSC isn't available.
- */
-void write_timestamp(struct lg_cpu *cpu)
-{
-	struct timespec now;
-	ktime_get_real_ts(&now);
-	if (copy_to_user(&cpu->lg->lguest_data->time,
-			 &now, sizeof(struct timespec)))
-		kill_guest(cpu, "Writing timestamp");
-}
diff --git a/drivers/lguest/interrupts_and_traps.c b/drivers/lguest/interrupts_and_traps.c
deleted file mode 100644
index 67392b6ab845..000000000000
--- a/drivers/lguest/interrupts_and_traps.c
+++ /dev/null
@@ -1,706 +0,0 @@
-/*P:800
- * Interrupts (traps) are complicated enough to earn their own file.
- * There are three classes of interrupts:
- *
- * 1) Real hardware interrupts which occur while we're running the Guest,
- * 2) Interrupts for virtual devices attached to the Guest, and
- * 3) Traps and faults from the Guest.
- *
- * Real hardware interrupts must be delivered to the Host, not the Guest.
- * Virtual interrupts must be delivered to the Guest, but we make them look
- * just like real hardware would deliver them.  Traps from the Guest can be set
- * up to go directly back into the Guest, but sometimes the Host wants to see
- * them first, so we also have a way of "reflecting" them into the Guest as if
- * they had been delivered to it directly.
-:*/
-#include <linux/uaccess.h>
-#include <linux/interrupt.h>
-#include <linux/module.h>
-#include <linux/sched.h>
-#include "lg.h"
-
-/* Allow Guests to use a non-128 (ie. non-Linux) syscall trap. */
-static unsigned int syscall_vector = IA32_SYSCALL_VECTOR;
-module_param(syscall_vector, uint, 0444);
-
-/* The address of the interrupt handler is split into two bits: */
-static unsigned long idt_address(u32 lo, u32 hi)
-{
-	return (lo & 0x0000FFFF) | (hi & 0xFFFF0000);
-}
-
-/*
- * The "type" of the interrupt handler is a 4 bit field: we only support a
- * couple of types.
- */
-static int idt_type(u32 lo, u32 hi)
-{
-	return (hi >> 8) & 0xF;
-}
-
-/* An IDT entry can't be used unless the "present" bit is set. */
-static bool idt_present(u32 lo, u32 hi)
-{
-	return (hi & 0x8000);
-}
-
-/*
- * We need a helper to "push" a value onto the Guest's stack, since that's a
- * big part of what delivering an interrupt does.
- */
-static void push_guest_stack(struct lg_cpu *cpu, unsigned long *gstack, u32 val)
-{
-	/* Stack grows upwards: move stack then write value. */
-	*gstack -= 4;
-	lgwrite(cpu, *gstack, u32, val);
-}
-
-/*H:210
- * The push_guest_interrupt_stack() routine saves Guest state on the stack for
- * an interrupt or trap.  The mechanics of delivering traps and interrupts to
- * the Guest are the same, except some traps have an "error code" which gets
- * pushed onto the stack as well: the caller tells us if this is one.
- *
- * We set up the stack just like the CPU does for a real interrupt, so it's
- * identical for the Guest (and the standard "iret" instruction will undo
- * it).
- */
-static void push_guest_interrupt_stack(struct lg_cpu *cpu, bool has_err)
-{
-	unsigned long gstack, origstack;
-	u32 eflags, ss, irq_enable;
-	unsigned long virtstack;
-
-	/*
-	 * There are two cases for interrupts: one where the Guest is already
-	 * in the kernel, and a more complex one where the Guest is in
-	 * userspace.  We check the privilege level to find out.
-	 */
-	if ((cpu->regs->ss&0x3) != GUEST_PL) {
-		/*
-		 * The Guest told us their kernel stack with the SET_STACK
-		 * hypercall: both the virtual address and the segment.
-		 */
-		virtstack = cpu->esp1;
-		ss = cpu->ss1;
-
-		origstack = gstack = guest_pa(cpu, virtstack);
-		/*
-		 * We push the old stack segment and pointer onto the new
-		 * stack: when the Guest does an "iret" back from the interrupt
-		 * handler the CPU will notice they're dropping privilege
-		 * levels and expect these here.
-		 */
-		push_guest_stack(cpu, &gstack, cpu->regs->ss);
-		push_guest_stack(cpu, &gstack, cpu->regs->esp);
-	} else {
-		/* We're staying on the same Guest (kernel) stack. */
-		virtstack = cpu->regs->esp;
-		ss = cpu->regs->ss;
-
-		origstack = gstack = guest_pa(cpu, virtstack);
-	}
-
-	/*
-	 * Remember that we never let the Guest actually disable interrupts, so
-	 * the "Interrupt Flag" bit is always set.  We copy that bit from the
-	 * Guest's "irq_enabled" field into the eflags word: we saw the Guest
-	 * copy it back in "lguest_iret".
-	 */
-	eflags = cpu->regs->eflags;
-	if (get_user(irq_enable, &cpu->lg->lguest_data->irq_enabled) == 0
-	    && !(irq_enable & X86_EFLAGS_IF))
-		eflags &= ~X86_EFLAGS_IF;
-
-	/*
-	 * An interrupt is expected to push three things on the stack: the old
-	 * "eflags" word, the old code segment, and the old instruction
-	 * pointer.
-	 */
-	push_guest_stack(cpu, &gstack, eflags);
-	push_guest_stack(cpu, &gstack, cpu->regs->cs);
-	push_guest_stack(cpu, &gstack, cpu->regs->eip);
-
-	/* For the six traps which supply an error code, we push that, too. */
-	if (has_err)
-		push_guest_stack(cpu, &gstack, cpu->regs->errcode);
-
-	/* Adjust the stack pointer and stack segment. */
-	cpu->regs->ss = ss;
-	cpu->regs->esp = virtstack + (gstack - origstack);
-}
-
-/*
- * This actually makes the Guest start executing the given interrupt/trap
- * handler.
- *
- * "lo" and "hi" are the two parts of the Interrupt Descriptor Table for this
- * interrupt or trap.  It's split into two parts for traditional reasons: gcc
- * on i386 used to be frightened by 64 bit numbers.
- */
-static void guest_run_interrupt(struct lg_cpu *cpu, u32 lo, u32 hi)
-{
-	/* If we're already in the kernel, we don't change stacks. */
-	if ((cpu->regs->ss&0x3) != GUEST_PL)
-		cpu->regs->ss = cpu->esp1;
-
-	/*
-	 * Set the code segment and the address to execute.
-	 */
-	cpu->regs->cs = (__KERNEL_CS|GUEST_PL);
-	cpu->regs->eip = idt_address(lo, hi);
-
-	/*
-	 * Trapping always clears these flags:
-	 * TF: Trap flag
-	 * VM: Virtual 8086 mode
-	 * RF: Resume
-	 * NT: Nested task.
-	 */
-	cpu->regs->eflags &=
-		~(X86_EFLAGS_TF|X86_EFLAGS_VM|X86_EFLAGS_RF|X86_EFLAGS_NT);
-
-	/*
-	 * There are two kinds of interrupt handlers: 0xE is an "interrupt
-	 * gate" which expects interrupts to be disabled on entry.
-	 */
-	if (idt_type(lo, hi) == 0xE)
-		if (put_user(0, &cpu->lg->lguest_data->irq_enabled))
-			kill_guest(cpu, "Disabling interrupts");
-}
-
-/* This restores the eflags word which was pushed on the stack by a trap */
-static void restore_eflags(struct lg_cpu *cpu)
-{
-	/* This is the physical address of the stack. */
-	unsigned long stack_pa = guest_pa(cpu, cpu->regs->esp);
-
-	/*
-	 * Stack looks like this:
-	 * Address	Contents
-	 * esp		EIP
-	 * esp + 4	CS
-	 * esp + 8	EFLAGS
-	 */
-	cpu->regs->eflags = lgread(cpu, stack_pa + 8, u32);
-	cpu->regs->eflags &=
-		~(X86_EFLAGS_TF|X86_EFLAGS_VM|X86_EFLAGS_RF|X86_EFLAGS_NT);
-}
-
-/*H:205
- * Virtual Interrupts.
- *
- * interrupt_pending() returns the first pending interrupt which isn't blocked
- * by the Guest.  It is called before every entry to the Guest, and just before
- * we go to sleep when the Guest has halted itself.
- */
-unsigned int interrupt_pending(struct lg_cpu *cpu, bool *more)
-{
-	unsigned int irq;
-	DECLARE_BITMAP(blk, LGUEST_IRQS);
-
-	/* If the Guest hasn't even initialized yet, we can do nothing. */
-	if (!cpu->lg->lguest_data)
-		return LGUEST_IRQS;
-
-	/*
-	 * Take our "irqs_pending" array and remove any interrupts the Guest
-	 * wants blocked: the result ends up in "blk".
-	 */
-	if (copy_from_user(&blk, cpu->lg->lguest_data->blocked_interrupts,
-			   sizeof(blk)))
-		return LGUEST_IRQS;
-	bitmap_andnot(blk, cpu->irqs_pending, blk, LGUEST_IRQS);
-
-	/* Find the first interrupt. */
-	irq = find_first_bit(blk, LGUEST_IRQS);
-	*more = find_next_bit(blk, LGUEST_IRQS, irq+1);
-
-	return irq;
-}
-
-/*
- * This actually diverts the Guest to running an interrupt handler, once an
- * interrupt has been identified by interrupt_pending().
- */
-void try_deliver_interrupt(struct lg_cpu *cpu, unsigned int irq, bool more)
-{
-	struct desc_struct *idt;
-
-	BUG_ON(irq >= LGUEST_IRQS);
-
-	/* If they're halted, interrupts restart them. */
-	if (cpu->halted) {
-		/* Re-enable interrupts. */
-		if (put_user(X86_EFLAGS_IF, &cpu->lg->lguest_data->irq_enabled))
-			kill_guest(cpu, "Re-enabling interrupts");
-		cpu->halted = 0;
-	} else {
-		/* Otherwise we check if they have interrupts disabled. */
-		u32 irq_enabled;
-		if (get_user(irq_enabled, &cpu->lg->lguest_data->irq_enabled))
-			irq_enabled = 0;
-		if (!irq_enabled) {
-			/* Make sure they know an IRQ is pending. */
-			put_user(X86_EFLAGS_IF,
-				 &cpu->lg->lguest_data->irq_pending);
-			return;
-		}
-	}
-
-	/*
-	 * Look at the IDT entry the Guest gave us for this interrupt.  The
-	 * first 32 (FIRST_EXTERNAL_VECTOR) entries are for traps, so we skip
-	 * over them.
-	 */
-	idt = &cpu->arch.idt[FIRST_EXTERNAL_VECTOR+irq];
-	/* If they don't have a handler (yet?), we just ignore it */
-	if (idt_present(idt->a, idt->b)) {
-		/* OK, mark it no longer pending and deliver it. */
-		clear_bit(irq, cpu->irqs_pending);
-
-		/*
-		 * They may be about to iret, where they asked us never to
-		 * deliver interrupts.  In this case, we can emulate that iret
-		 * then immediately deliver the interrupt.  This is basically
-		 * a noop: the iret would pop the interrupt frame and restore
-		 * eflags, and then we'd set it up again.  So just restore the
-		 * eflags word and jump straight to the handler in this case.
-		 *
-		 * Denys Vlasenko points out that this isn't quite right: if
-		 * the iret was returning to userspace, then that interrupt
-		 * would reset the stack pointer (which the Guest told us
-		 * about via LHCALL_SET_STACK).  But unless the Guest is being
-		 * *really* weird, that will be the same as the current stack
-		 * anyway.
-		 */
-		if (cpu->regs->eip == cpu->lg->noirq_iret) {
-			restore_eflags(cpu);
-		} else {
-			/*
-			 * set_guest_interrupt() takes a flag to say whether
-			 * this interrupt pushes an error code onto the stack
-			 * as well: virtual interrupts never do.
-			 */
-			push_guest_interrupt_stack(cpu, false);
-		}
-		/* Actually make Guest cpu jump to handler. */
-		guest_run_interrupt(cpu, idt->a, idt->b);
-	}
-
-	/*
-	 * Every time we deliver an interrupt, we update the timestamp in the
-	 * Guest's lguest_data struct.  It would be better for the Guest if we
-	 * did this more often, but it can actually be quite slow: doing it
-	 * here is a compromise which means at least it gets updated every
-	 * timer interrupt.
-	 */
-	write_timestamp(cpu);
-
-	/*
-	 * If there are no other interrupts we want to deliver, clear
-	 * the pending flag.
-	 */
-	if (!more)
-		put_user(0, &cpu->lg->lguest_data->irq_pending);
-}
-
-/* And this is the routine when we want to set an interrupt for the Guest. */
-void set_interrupt(struct lg_cpu *cpu, unsigned int irq)
-{
-	/*
-	 * Next time the Guest runs, the core code will see if it can deliver
-	 * this interrupt.
-	 */
-	set_bit(irq, cpu->irqs_pending);
-
-	/*
-	 * Make sure it sees it; it might be asleep (eg. halted), or running
-	 * the Guest right now, in which case kick_process() will knock it out.
-	 */
-	if (!wake_up_process(cpu->tsk))
-		kick_process(cpu->tsk);
-}
-/*:*/
-
-/*
- * Linux uses trap 128 for system calls.  Plan9 uses 64, and Ron Minnich sent
- * me a patch, so we support that too.  It'd be a big step for lguest if half
- * the Plan 9 user base were to start using it.
- *
- * Actually now I think of it, it's possible that Ron *is* half the Plan 9
- * userbase.  Oh well.
- */
-bool could_be_syscall(unsigned int num)
-{
-	/* Normal Linux IA32_SYSCALL_VECTOR or reserved vector? */
-	return num == IA32_SYSCALL_VECTOR || num == syscall_vector;
-}
-
-/* The syscall vector it wants must be unused by Host. */
-bool check_syscall_vector(struct lguest *lg)
-{
-	u32 vector;
-
-	if (get_user(vector, &lg->lguest_data->syscall_vec))
-		return false;
-
-	return could_be_syscall(vector);
-}
-
-int init_interrupts(void)
-{
-	/* If they want some strange system call vector, reserve it now */
-	if (syscall_vector != IA32_SYSCALL_VECTOR) {
-		if (test_bit(syscall_vector, used_vectors) ||
-		    vector_used_by_percpu_irq(syscall_vector)) {
-			printk(KERN_ERR "lg: couldn't reserve syscall %u\n",
-				 syscall_vector);
-			return -EBUSY;
-		}
-		set_bit(syscall_vector, used_vectors);
-	}
-
-	return 0;
-}
-
-void free_interrupts(void)
-{
-	if (syscall_vector != IA32_SYSCALL_VECTOR)
-		clear_bit(syscall_vector, used_vectors);
-}
-
-/*H:220
- * Now we've got the routines to deliver interrupts, delivering traps like
- * page fault is easy.  The only trick is that Intel decided that some traps
- * should have error codes:
- */
-static bool has_err(unsigned int trap)
-{
-	return (trap == 8 || (trap >= 10 && trap <= 14) || trap == 17);
-}
-
-/* deliver_trap() returns true if it could deliver the trap. */
-bool deliver_trap(struct lg_cpu *cpu, unsigned int num)
-{
-	/*
-	 * Trap numbers are always 8 bit, but we set an impossible trap number
-	 * for traps inside the Switcher, so check that here.
-	 */
-	if (num >= ARRAY_SIZE(cpu->arch.idt))
-		return false;
-
-	/*
-	 * Early on the Guest hasn't set the IDT entries (or maybe it put a
-	 * bogus one in): if we fail here, the Guest will be killed.
-	 */
-	if (!idt_present(cpu->arch.idt[num].a, cpu->arch.idt[num].b))
-		return false;
-	push_guest_interrupt_stack(cpu, has_err(num));
-	guest_run_interrupt(cpu, cpu->arch.idt[num].a,
-			    cpu->arch.idt[num].b);
-	return true;
-}
-
-/*H:250
- * Here's the hard part: returning to the Host every time a trap happens
- * and then calling deliver_trap() and re-entering the Guest is slow.
- * Particularly because Guest userspace system calls are traps (usually trap
- * 128).
- *
- * So we'd like to set up the IDT to tell the CPU to deliver traps directly
- * into the Guest.  This is possible, but the complexities cause the size of
- * this file to double!  However, 150 lines of code is worth writing for taking
- * system calls down from 1750ns to 270ns.  Plus, if lguest didn't do it, all
- * the other hypervisors would beat it up at lunchtime.
- *
- * This routine indicates if a particular trap number could be delivered
- * directly.
- *
- * Unfortunately, Linux 4.6 started using an interrupt gate instead of a
- * trap gate for syscalls, so this trick is ineffective.  See Mastery for
- * how we could do this anyway...
- */
-static bool direct_trap(unsigned int num)
-{
-	/*
-	 * Hardware interrupts don't go to the Guest at all (except system
-	 * call).
-	 */
-	if (num >= FIRST_EXTERNAL_VECTOR && !could_be_syscall(num))
-		return false;
-
-	/*
-	 * The Host needs to see page faults (for shadow paging and to save the
-	 * fault address), general protection faults (in/out emulation) and
-	 * device not available (TS handling) and of course, the hypercall trap.
-	 */
-	return num != 14 && num != 13 && num != 7 && num != LGUEST_TRAP_ENTRY;
-}
-/*:*/
-
-/*M:005
- * The Guest has the ability to turn its interrupt gates into trap gates,
- * if it is careful.  The Host will let trap gates can go directly to the
- * Guest, but the Guest needs the interrupts atomically disabled for an
- * interrupt gate.  The Host could provide a mechanism to register more
- * "no-interrupt" regions, and the Guest could point the trap gate at
- * instructions within that region, where it can safely disable interrupts.
- */
-
-/*M:006
- * The Guests do not use the sysenter (fast system call) instruction,
- * because it's hardcoded to enter privilege level 0 and so can't go direct.
- * It's about twice as fast as the older "int 0x80" system call, so it might
- * still be worthwhile to handle it in the Switcher and lcall down to the
- * Guest.  The sysenter semantics are hairy tho: search for that keyword in
- * entry.S
-:*/
-
-/*H:260
- * When we make traps go directly into the Guest, we need to make sure
- * the kernel stack is valid (ie. mapped in the page tables).  Otherwise, the
- * CPU trying to deliver the trap will fault while trying to push the interrupt
- * words on the stack: this is called a double fault, and it forces us to kill
- * the Guest.
- *
- * Which is deeply unfair, because (literally!) it wasn't the Guests' fault.
- */
-void pin_stack_pages(struct lg_cpu *cpu)
-{
-	unsigned int i;
-
-	/*
-	 * Depending on the CONFIG_4KSTACKS option, the Guest can have one or
-	 * two pages of stack space.
-	 */
-	for (i = 0; i < cpu->lg->stack_pages; i++)
-		/*
-		 * The stack grows *upwards*, so the address we're given is the
-		 * start of the page after the kernel stack.  Subtract one to
-		 * get back onto the first stack page, and keep subtracting to
-		 * get to the rest of the stack pages.
-		 */
-		pin_page(cpu, cpu->esp1 - 1 - i * PAGE_SIZE);
-}
-
-/*
- * Direct traps also mean that we need to know whenever the Guest wants to use
- * a different kernel stack, so we can change the guest TSS to use that
- * stack.  The TSS entries expect a virtual address, so unlike most addresses
- * the Guest gives us, the "esp" (stack pointer) value here is virtual, not
- * physical.
- *
- * In Linux each process has its own kernel stack, so this happens a lot: we
- * change stacks on each context switch.
- */
-void guest_set_stack(struct lg_cpu *cpu, u32 seg, u32 esp, unsigned int pages)
-{
-	/*
-	 * You're not allowed a stack segment with privilege level 0: bad Guest!
-	 */
-	if ((seg & 0x3) != GUEST_PL)
-		kill_guest(cpu, "bad stack segment %i", seg);
-	/* We only expect one or two stack pages. */
-	if (pages > 2)
-		kill_guest(cpu, "bad stack pages %u", pages);
-	/* Save where the stack is, and how many pages */
-	cpu->ss1 = seg;
-	cpu->esp1 = esp;
-	cpu->lg->stack_pages = pages;
-	/* Make sure the new stack pages are mapped */
-	pin_stack_pages(cpu);
-}
-
-/*
- * All this reference to mapping stacks leads us neatly into the other complex
- * part of the Host: page table handling.
- */
-
-/*H:235
- * This is the routine which actually checks the Guest's IDT entry and
- * transfers it into the entry in "struct lguest":
- */
-static void set_trap(struct lg_cpu *cpu, struct desc_struct *trap,
-		     unsigned int num, u32 lo, u32 hi)
-{
-	u8 type = idt_type(lo, hi);
-
-	/* We zero-out a not-present entry */
-	if (!idt_present(lo, hi)) {
-		trap->a = trap->b = 0;
-		return;
-	}
-
-	/* We only support interrupt and trap gates. */
-	if (type != 0xE && type != 0xF)
-		kill_guest(cpu, "bad IDT type %i", type);
-
-	/*
-	 * We only copy the handler address, present bit, privilege level and
-	 * type.  The privilege level controls where the trap can be triggered
-	 * manually with an "int" instruction.  This is usually GUEST_PL,
-	 * except for system calls which userspace can use.
-	 */
-	trap->a = ((__KERNEL_CS|GUEST_PL)<<16) | (lo&0x0000FFFF);
-	trap->b = (hi&0xFFFFEF00);
-}
-
-/*H:230
- * While we're here, dealing with delivering traps and interrupts to the
- * Guest, we might as well complete the picture: how the Guest tells us where
- * it wants them to go.  This would be simple, except making traps fast
- * requires some tricks.
- *
- * We saw the Guest setting Interrupt Descriptor Table (IDT) entries with the
- * LHCALL_LOAD_IDT_ENTRY hypercall before: that comes here.
- */
-void load_guest_idt_entry(struct lg_cpu *cpu, unsigned int num, u32 lo, u32 hi)
-{
-	/*
-	 * Guest never handles: NMI, doublefault, spurious interrupt or
-	 * hypercall.  We ignore when it tries to set them.
-	 */
-	if (num == 2 || num == 8 || num == 15 || num == LGUEST_TRAP_ENTRY)
-		return;
-
-	/*
-	 * Mark the IDT as changed: next time the Guest runs we'll know we have
-	 * to copy this again.
-	 */
-	cpu->changed |= CHANGED_IDT;
-
-	/* Check that the Guest doesn't try to step outside the bounds. */
-	if (num >= ARRAY_SIZE(cpu->arch.idt))
-		kill_guest(cpu, "Setting idt entry %u", num);
-	else
-		set_trap(cpu, &cpu->arch.idt[num], num, lo, hi);
-}
-
-/*
- * The default entry for each interrupt points into the Switcher routines which
- * simply return to the Host.  The run_guest() loop will then call
- * deliver_trap() to bounce it back into the Guest.
- */
-static void default_idt_entry(struct desc_struct *idt,
-			      int trap,
-			      const unsigned long handler,
-			      const struct desc_struct *base)
-{
-	/* A present interrupt gate. */
-	u32 flags = 0x8e00;
-
-	/*
-	 * Set the privilege level on the entry for the hypercall: this allows
-	 * the Guest to use the "int" instruction to trigger it.
-	 */
-	if (trap == LGUEST_TRAP_ENTRY)
-		flags |= (GUEST_PL << 13);
-	else if (base)
-		/*
-		 * Copy privilege level from what Guest asked for.  This allows
-		 * debug (int 3) traps from Guest userspace, for example.
-		 */
-		flags |= (base->b & 0x6000);
-
-	/* Now pack it into the IDT entry in its weird format. */
-	idt->a = (LGUEST_CS<<16) | (handler&0x0000FFFF);
-	idt->b = (handler&0xFFFF0000) | flags;
-}
-
-/* When the Guest first starts, we put default entries into the IDT. */
-void setup_default_idt_entries(struct lguest_ro_state *state,
-			       const unsigned long *def)
-{
-	unsigned int i;
-
-	for (i = 0; i < ARRAY_SIZE(state->guest_idt); i++)
-		default_idt_entry(&state->guest_idt[i], i, def[i], NULL);
-}
-
-/*H:240
- * We don't use the IDT entries in the "struct lguest" directly, instead
- * we copy them into the IDT which we've set up for Guests on this CPU, just
- * before we run the Guest.  This routine does that copy.
- */
-void copy_traps(const struct lg_cpu *cpu, struct desc_struct *idt,
-		const unsigned long *def)
-{
-	unsigned int i;
-
-	/*
-	 * We can simply copy the direct traps, otherwise we use the default
-	 * ones in the Switcher: they will return to the Host.
-	 */
-	for (i = 0; i < ARRAY_SIZE(cpu->arch.idt); i++) {
-		const struct desc_struct *gidt = &cpu->arch.idt[i];
-
-		/* If no Guest can ever override this trap, leave it alone. */
-		if (!direct_trap(i))
-			continue;
-
-		/*
-		 * Only trap gates (type 15) can go direct to the Guest.
-		 * Interrupt gates (type 14) disable interrupts as they are
-		 * entered, which we never let the Guest do.  Not present
-		 * entries (type 0x0) also can't go direct, of course.
-		 *
-		 * If it can't go direct, we still need to copy the priv. level:
-		 * they might want to give userspace access to a software
-		 * interrupt.
-		 */
-		if (idt_type(gidt->a, gidt->b) == 0xF)
-			idt[i] = *gidt;
-		else
-			default_idt_entry(&idt[i], i, def[i], gidt);
-	}
-}
-
-/*H:200
- * The Guest Clock.
- *
- * There are two sources of virtual interrupts.  We saw one in lguest_user.c:
- * the Launcher sending interrupts for virtual devices.  The other is the Guest
- * timer interrupt.
- *
- * The Guest uses the LHCALL_SET_CLOCKEVENT hypercall to tell us how long to
- * the next timer interrupt (in nanoseconds).  We use the high-resolution timer
- * infrastructure to set a callback at that time.
- *
- * 0 means "turn off the clock".
- */
-void guest_set_clockevent(struct lg_cpu *cpu, unsigned long delta)
-{
-	ktime_t expires;
-
-	if (unlikely(delta == 0)) {
-		/* Clock event device is shutting down. */
-		hrtimer_cancel(&cpu->hrt);
-		return;
-	}
-
-	/*
-	 * We use wallclock time here, so the Guest might not be running for
-	 * all the time between now and the timer interrupt it asked for.  This
-	 * is almost always the right thing to do.
-	 */
-	expires = ktime_add_ns(ktime_get_real(), delta);
-	hrtimer_start(&cpu->hrt, expires, HRTIMER_MODE_ABS);
-}
-
-/* This is the function called when the Guest's timer expires. */
-static enum hrtimer_restart clockdev_fn(struct hrtimer *timer)
-{
-	struct lg_cpu *cpu = container_of(timer, struct lg_cpu, hrt);
-
-	/* Remember the first interrupt is the timer interrupt. */
-	set_interrupt(cpu, 0);
-	return HRTIMER_NORESTART;
-}
-
-/* This sets up the timer for this Guest. */
-void init_clockdev(struct lg_cpu *cpu)
-{
-	hrtimer_init(&cpu->hrt, CLOCK_REALTIME, HRTIMER_MODE_ABS);
-	cpu->hrt.function = clockdev_fn;
-}
diff --git a/drivers/lguest/lg.h b/drivers/lguest/lg.h
deleted file mode 100644
index 2356a2318034..000000000000
--- a/drivers/lguest/lg.h
+++ /dev/null
@@ -1,258 +0,0 @@
-#ifndef _LGUEST_H
-#define _LGUEST_H
-
-#ifndef __ASSEMBLY__
-#include <linux/types.h>
-#include <linux/init.h>
-#include <linux/stringify.h>
-#include <linux/lguest.h>
-#include <linux/lguest_launcher.h>
-#include <linux/wait.h>
-#include <linux/hrtimer.h>
-#include <linux/err.h>
-#include <linux/slab.h>
-
-#include <asm/lguest.h>
-
-struct pgdir {
-	unsigned long gpgdir;
-	bool switcher_mapped;
-	int last_host_cpu;
-	pgd_t *pgdir;
-};
-
-/* We have two pages shared with guests, per cpu.  */
-struct lguest_pages {
-	/* This is the stack page mapped rw in guest */
-	char spare[PAGE_SIZE - sizeof(struct lguest_regs)];
-	struct lguest_regs regs;
-
-	/* This is the host state & guest descriptor page, ro in guest */
-	struct lguest_ro_state state;
-} __attribute__((aligned(PAGE_SIZE)));
-
-#define CHANGED_IDT		1
-#define CHANGED_GDT		2
-#define CHANGED_GDT_TLS		4 /* Actually a subset of CHANGED_GDT */
-#define CHANGED_ALL	        3
-
-struct lg_cpu {
-	unsigned int id;
-	struct lguest *lg;
-	struct task_struct *tsk;
-	struct mm_struct *mm; 	/* == tsk->mm, but that becomes NULL on exit */
-
-	u32 cr2;
-	u32 esp1;
-	u16 ss1;
-
-	/* Bitmap of what has changed: see CHANGED_* above. */
-	int changed;
-
-	/* Pending operation. */
-	struct lguest_pending pending;
-
-	unsigned long *reg_read; /* register from LHREQ_GETREG */
-
-	/* At end of a page shared mapped over lguest_pages in guest. */
-	unsigned long regs_page;
-	struct lguest_regs *regs;
-
-	struct lguest_pages *last_pages;
-
-	/* Initialization mode: linear map everything. */
-	bool linear_pages;
-	int cpu_pgd; /* Which pgd this cpu is currently using */
-
-	/* If a hypercall was asked for, this points to the arguments. */
-	struct hcall_args *hcall;
-	u32 next_hcall;
-
-	/* Virtual clock device */
-	struct hrtimer hrt;
-
-	/* Did the Guest tell us to halt? */
-	int halted;
-
-	/* Pending virtual interrupts */
-	DECLARE_BITMAP(irqs_pending, LGUEST_IRQS);
-
-	struct lg_cpu_arch arch;
-};
-
-/* The private info the thread maintains about the guest. */
-struct lguest {
-	struct lguest_data __user *lguest_data;
-	struct lg_cpu cpus[NR_CPUS];
-	unsigned int nr_cpus;
-
-	/* Valid guest memory pages must be < this. */
-	u32 pfn_limit;
-
-	/* Device memory is >= pfn_limit and < device_limit. */
-	u32 device_limit;
-
-	/*
-	 * This provides the offset to the base of guest-physical memory in the
-	 * Launcher.
-	 */
-	void __user *mem_base;
-	unsigned long kernel_address;
-
-	struct pgdir pgdirs[4];
-
-	unsigned long noirq_iret;
-
-	unsigned int stack_pages;
-	u32 tsc_khz;
-
-	/* Dead? */
-	const char *dead;
-};
-
-extern struct mutex lguest_lock;
-
-/* core.c: */
-bool lguest_address_ok(const struct lguest *lg,
-		       unsigned long addr, unsigned long len);
-void __lgread(struct lg_cpu *, void *, unsigned long, unsigned);
-void __lgwrite(struct lg_cpu *, unsigned long, const void *, unsigned);
-extern struct page **lg_switcher_pages;
-
-/*H:035
- * Using memory-copy operations like that is usually inconvient, so we
- * have the following helper macros which read and write a specific type (often
- * an unsigned long).
- *
- * This reads into a variable of the given type then returns that.
- */
-#define lgread(cpu, addr, type)						\
-	({ type _v; __lgread((cpu), &_v, (addr), sizeof(_v)); _v; })
-
-/* This checks that the variable is of the given type, then writes it out. */
-#define lgwrite(cpu, addr, type, val)				\
-	do {							\
-		typecheck(type, val);				\
-		__lgwrite((cpu), (addr), &(val), sizeof(val));	\
-	} while(0)
-/* (end of memory access helper routines) :*/
-
-int run_guest(struct lg_cpu *cpu, unsigned long __user *user);
-
-/*
- * Helper macros to obtain the first 12 or the last 20 bits, this is only the
- * first step in the migration to the kernel types.  pte_pfn is already defined
- * in the kernel.
- */
-#define pgd_flags(x)	(pgd_val(x) & ~PAGE_MASK)
-#define pgd_pfn(x)	(pgd_val(x) >> PAGE_SHIFT)
-#define pmd_flags(x)    (pmd_val(x) & ~PAGE_MASK)
-#define pmd_pfn(x)	(pmd_val(x) >> PAGE_SHIFT)
-
-/* interrupts_and_traps.c: */
-unsigned int interrupt_pending(struct lg_cpu *cpu, bool *more);
-void try_deliver_interrupt(struct lg_cpu *cpu, unsigned int irq, bool more);
-void set_interrupt(struct lg_cpu *cpu, unsigned int irq);
-bool deliver_trap(struct lg_cpu *cpu, unsigned int num);
-void load_guest_idt_entry(struct lg_cpu *cpu, unsigned int i,
-			  u32 low, u32 hi);
-void guest_set_stack(struct lg_cpu *cpu, u32 seg, u32 esp, unsigned int pages);
-void pin_stack_pages(struct lg_cpu *cpu);
-void setup_default_idt_entries(struct lguest_ro_state *state,
-			       const unsigned long *def);
-void copy_traps(const struct lg_cpu *cpu, struct desc_struct *idt,
-		const unsigned long *def);
-void guest_set_clockevent(struct lg_cpu *cpu, unsigned long delta);
-bool send_notify_to_eventfd(struct lg_cpu *cpu);
-void init_clockdev(struct lg_cpu *cpu);
-bool check_syscall_vector(struct lguest *lg);
-bool could_be_syscall(unsigned int num);
-int init_interrupts(void);
-void free_interrupts(void);
-
-/* segments.c: */
-void setup_default_gdt_entries(struct lguest_ro_state *state);
-void setup_guest_gdt(struct lg_cpu *cpu);
-void load_guest_gdt_entry(struct lg_cpu *cpu, unsigned int i,
-			  u32 low, u32 hi);
-void guest_load_tls(struct lg_cpu *cpu, unsigned long tls_array);
-void copy_gdt(const struct lg_cpu *cpu, struct desc_struct *gdt);
-void copy_gdt_tls(const struct lg_cpu *cpu, struct desc_struct *gdt);
-
-/* page_tables.c: */
-int init_guest_pagetable(struct lguest *lg);
-void free_guest_pagetable(struct lguest *lg);
-void guest_new_pagetable(struct lg_cpu *cpu, unsigned long pgtable);
-void guest_set_pgd(struct lguest *lg, unsigned long gpgdir, u32 i);
-#ifdef CONFIG_X86_PAE
-void guest_set_pmd(struct lguest *lg, unsigned long gpgdir, u32 i);
-#endif
-void guest_pagetable_clear_all(struct lg_cpu *cpu);
-void guest_pagetable_flush_user(struct lg_cpu *cpu);
-void guest_set_pte(struct lg_cpu *cpu, unsigned long gpgdir,
-		   unsigned long vaddr, pte_t val);
-void map_switcher_in_guest(struct lg_cpu *cpu, struct lguest_pages *pages);
-bool demand_page(struct lg_cpu *cpu, unsigned long cr2, int errcode,
-		 unsigned long *iomem);
-void pin_page(struct lg_cpu *cpu, unsigned long vaddr);
-bool __guest_pa(struct lg_cpu *cpu, unsigned long vaddr, unsigned long *paddr);
-unsigned long guest_pa(struct lg_cpu *cpu, unsigned long vaddr);
-void page_table_guest_data_init(struct lg_cpu *cpu);
-
-/* <arch>/core.c: */
-void lguest_arch_host_init(void);
-void lguest_arch_host_fini(void);
-void lguest_arch_run_guest(struct lg_cpu *cpu);
-void lguest_arch_handle_trap(struct lg_cpu *cpu);
-int lguest_arch_init_hypercalls(struct lg_cpu *cpu);
-int lguest_arch_do_hcall(struct lg_cpu *cpu, struct hcall_args *args);
-void lguest_arch_setup_regs(struct lg_cpu *cpu, unsigned long start);
-unsigned long *lguest_arch_regptr(struct lg_cpu *cpu, size_t reg_off, bool any);
-
-/* <arch>/switcher.S: */
-extern char start_switcher_text[], end_switcher_text[], switch_to_guest[];
-
-/* lguest_user.c: */
-int lguest_device_init(void);
-void lguest_device_remove(void);
-
-/* hypercalls.c: */
-void do_hypercalls(struct lg_cpu *cpu);
-void write_timestamp(struct lg_cpu *cpu);
-
-/*L:035
- * Let's step aside for the moment, to study one important routine that's used
- * widely in the Host code.
- *
- * There are many cases where the Guest can do something invalid, like pass crap
- * to a hypercall.  Since only the Guest kernel can make hypercalls, it's quite
- * acceptable to simply terminate the Guest and give the Launcher a nicely
- * formatted reason.  It's also simpler for the Guest itself, which doesn't
- * need to check most hypercalls for "success"; if you're still running, it
- * succeeded.
- *
- * Once this is called, the Guest will never run again, so most Host code can
- * call this then continue as if nothing had happened.  This means many
- * functions don't have to explicitly return an error code, which keeps the
- * code simple.
- *
- * It also means that this can be called more than once: only the first one is
- * remembered.  The only trick is that we still need to kill the Guest even if
- * we can't allocate memory to store the reason.  Linux has a neat way of
- * packing error codes into invalid pointers, so we use that here.
- *
- * Like any macro which uses an "if", it is safely wrapped in a run-once "do {
- * } while(0)".
- */
-#define kill_guest(cpu, fmt...)					\
-do {								\
-	if (!(cpu)->lg->dead) {					\
-		(cpu)->lg->dead = kasprintf(GFP_ATOMIC, fmt);	\
-		if (!(cpu)->lg->dead)				\
-			(cpu)->lg->dead = ERR_PTR(-ENOMEM);	\
-	}							\
-} while(0)
-/* (End of aside) :*/
-
-#endif	/* __ASSEMBLY__ */
-#endif	/* _LGUEST_H */
diff --git a/drivers/lguest/lguest_user.c b/drivers/lguest/lguest_user.c
deleted file mode 100644
index 1a6787bc9386..000000000000
--- a/drivers/lguest/lguest_user.c
+++ /dev/null
@@ -1,446 +0,0 @@
-/*P:200 This contains all the /dev/lguest code, whereby the userspace
- * launcher controls and communicates with the Guest.  For example,
- * the first write will tell us the Guest's memory layout and entry
- * point.  A read will run the Guest until something happens, such as
- * a signal or the Guest accessing a device.
-:*/
-#include <linux/uaccess.h>
-#include <linux/miscdevice.h>
-#include <linux/fs.h>
-#include <linux/sched.h>
-#include <linux/sched/mm.h>
-#include <linux/file.h>
-#include <linux/slab.h>
-#include <linux/export.h>
-#include "lg.h"
-
-/*L:052
-  The Launcher can get the registers, and also set some of them.
-*/
-static int getreg_setup(struct lg_cpu *cpu, const unsigned long __user *input)
-{
-	unsigned long which;
-
-	/* We re-use the ptrace structure to specify which register to read. */
-	if (get_user(which, input) != 0)
-		return -EFAULT;
-
-	/*
-	 * We set up the cpu register pointer, and their next read will
-	 * actually get the value (instead of running the guest).
-	 *
-	 * The last argument 'true' says we can access any register.
-	 */
-	cpu->reg_read = lguest_arch_regptr(cpu, which, true);
-	if (!cpu->reg_read)
-		return -ENOENT;
-
-	/* And because this is a write() call, we return the length used. */
-	return sizeof(unsigned long) * 2;
-}
-
-static int setreg(struct lg_cpu *cpu, const unsigned long __user *input)
-{
-	unsigned long which, value, *reg;
-
-	/* We re-use the ptrace structure to specify which register to read. */
-	if (get_user(which, input) != 0)
-		return -EFAULT;
-	input++;
-	if (get_user(value, input) != 0)
-		return -EFAULT;
-
-	/* The last argument 'false' means we can't access all registers. */
-	reg = lguest_arch_regptr(cpu, which, false);
-	if (!reg)
-		return -ENOENT;
-
-	*reg = value;
-
-	/* And because this is a write() call, we return the length used. */
-	return sizeof(unsigned long) * 3;
-}
-
-/*L:050
- * Sending an interrupt is done by writing LHREQ_IRQ and an interrupt
- * number to /dev/lguest.
- */
-static int user_send_irq(struct lg_cpu *cpu, const unsigned long __user *input)
-{
-	unsigned long irq;
-
-	if (get_user(irq, input) != 0)
-		return -EFAULT;
-	if (irq >= LGUEST_IRQS)
-		return -EINVAL;
-
-	/*
-	 * Next time the Guest runs, the core code will see if it can deliver
-	 * this interrupt.
-	 */
-	set_interrupt(cpu, irq);
-	return 0;
-}
-
-/*L:053
- * Deliver a trap: this is used by the Launcher if it can't emulate
- * an instruction.
- */
-static int trap(struct lg_cpu *cpu, const unsigned long __user *input)
-{
-	unsigned long trapnum;
-
-	if (get_user(trapnum, input) != 0)
-		return -EFAULT;
-
-	if (!deliver_trap(cpu, trapnum))
-		return -EINVAL;
-
-	return 0;
-}
-
-/*L:040
- * Once our Guest is initialized, the Launcher makes it run by reading
- * from /dev/lguest.
- */
-static ssize_t read(struct file *file, char __user *user, size_t size,loff_t*o)
-{
-	struct lguest *lg = file->private_data;
-	struct lg_cpu *cpu;
-	unsigned int cpu_id = *o;
-
-	/* You must write LHREQ_INITIALIZE first! */
-	if (!lg)
-		return -EINVAL;
-
-	/* Watch out for arbitrary vcpu indexes! */
-	if (cpu_id >= lg->nr_cpus)
-		return -EINVAL;
-
-	cpu = &lg->cpus[cpu_id];
-
-	/* If you're not the task which owns the Guest, go away. */
-	if (current != cpu->tsk)
-		return -EPERM;
-
-	/* If the Guest is already dead, we indicate why */
-	if (lg->dead) {
-		size_t len;
-
-		/* lg->dead either contains an error code, or a string. */
-		if (IS_ERR(lg->dead))
-			return PTR_ERR(lg->dead);
-
-		/* We can only return as much as the buffer they read with. */
-		len = min(size, strlen(lg->dead)+1);
-		if (copy_to_user(user, lg->dead, len) != 0)
-			return -EFAULT;
-		return len;
-	}
-
-	/*
-	 * If we returned from read() last time because the Guest sent I/O,
-	 * clear the flag.
-	 */
-	if (cpu->pending.trap)
-		cpu->pending.trap = 0;
-
-	/* Run the Guest until something interesting happens. */
-	return run_guest(cpu, (unsigned long __user *)user);
-}
-
-/*L:025
- * This actually initializes a CPU.  For the moment, a Guest is only
- * uniprocessor, so "id" is always 0.
- */
-static int lg_cpu_start(struct lg_cpu *cpu, unsigned id, unsigned long start_ip)
-{
-	/* We have a limited number of CPUs in the lguest struct. */
-	if (id >= ARRAY_SIZE(cpu->lg->cpus))
-		return -EINVAL;
-
-	/* Set up this CPU's id, and pointer back to the lguest struct. */
-	cpu->id = id;
-	cpu->lg = container_of(cpu, struct lguest, cpus[id]);
-	cpu->lg->nr_cpus++;
-
-	/* Each CPU has a timer it can set. */
-	init_clockdev(cpu);
-
-	/*
-	 * We need a complete page for the Guest registers: they are accessible
-	 * to the Guest and we can only grant it access to whole pages.
-	 */
-	cpu->regs_page = get_zeroed_page(GFP_KERNEL);
-	if (!cpu->regs_page)
-		return -ENOMEM;
-
-	/* We actually put the registers at the end of the page. */
-	cpu->regs = (void *)cpu->regs_page + PAGE_SIZE - sizeof(*cpu->regs);
-
-	/*
-	 * Now we initialize the Guest's registers, handing it the start
-	 * address.
-	 */
-	lguest_arch_setup_regs(cpu, start_ip);
-
-	/*
-	 * We keep a pointer to the Launcher task (ie. current task) for when
-	 * other Guests want to wake this one (eg. console input).
-	 */
-	cpu->tsk = current;
-
-	/*
-	 * We need to keep a pointer to the Launcher's memory map, because if
-	 * the Launcher dies we need to clean it up.  If we don't keep a
-	 * reference, it is destroyed before close() is called.
-	 */
-	cpu->mm = get_task_mm(cpu->tsk);
-
-	/*
-	 * We remember which CPU's pages this Guest used last, for optimization
-	 * when the same Guest runs on the same CPU twice.
-	 */
-	cpu->last_pages = NULL;
-
-	/* No error == success. */
-	return 0;
-}
-
-/*L:020
- * The initialization write supplies 3 pointer sized (32 or 64 bit) values (in
- * addition to the LHREQ_INITIALIZE value).  These are:
- *
- * base: The start of the Guest-physical memory inside the Launcher memory.
- *
- * pfnlimit: The highest (Guest-physical) page number the Guest should be
- * allowed to access.  The Guest memory lives inside the Launcher, so it sets
- * this to ensure the Guest can only reach its own memory.
- *
- * start: The first instruction to execute ("eip" in x86-speak).
- */
-static int initialize(struct file *file, const unsigned long __user *input)
-{
-	/* "struct lguest" contains all we (the Host) know about a Guest. */
-	struct lguest *lg;
-	int err;
-	unsigned long args[4];
-
-	/*
-	 * We grab the Big Lguest lock, which protects against multiple
-	 * simultaneous initializations.
-	 */
-	mutex_lock(&lguest_lock);
-	/* You can't initialize twice!  Close the device and start again... */
-	if (file->private_data) {
-		err = -EBUSY;
-		goto unlock;
-	}
-
-	if (copy_from_user(args, input, sizeof(args)) != 0) {
-		err = -EFAULT;
-		goto unlock;
-	}
-
-	lg = kzalloc(sizeof(*lg), GFP_KERNEL);
-	if (!lg) {
-		err = -ENOMEM;
-		goto unlock;
-	}
-
-	/* Populate the easy fields of our "struct lguest" */
-	lg->mem_base = (void __user *)args[0];
-	lg->pfn_limit = args[1];
-	lg->device_limit = args[3];
-
-	/* This is the first cpu (cpu 0) and it will start booting at args[2] */
-	err = lg_cpu_start(&lg->cpus[0], 0, args[2]);
-	if (err)
-		goto free_lg;
-
-	/*
-	 * Initialize the Guest's shadow page tables.  This allocates
-	 * memory, so can fail.
-	 */
-	err = init_guest_pagetable(lg);
-	if (err)
-		goto free_regs;
-
-	/* We keep our "struct lguest" in the file's private_data. */
-	file->private_data = lg;
-
-	mutex_unlock(&lguest_lock);
-
-	/* And because this is a write() call, we return the length used. */
-	return sizeof(args);
-
-free_regs:
-	/* FIXME: This should be in free_vcpu */
-	free_page(lg->cpus[0].regs_page);
-free_lg:
-	kfree(lg);
-unlock:
-	mutex_unlock(&lguest_lock);
-	return err;
-}
-
-/*L:010
- * The first operation the Launcher does must be a write.  All writes
- * start with an unsigned long number: for the first write this must be
- * LHREQ_INITIALIZE to set up the Guest.  After that the Launcher can use
- * writes of other values to send interrupts or set up receipt of notifications.
- *
- * Note that we overload the "offset" in the /dev/lguest file to indicate what
- * CPU number we're dealing with.  Currently this is always 0 since we only
- * support uniprocessor Guests, but you can see the beginnings of SMP support
- * here.
- */
-static ssize_t write(struct file *file, const char __user *in,
-		     size_t size, loff_t *off)
-{
-	/*
-	 * Once the Guest is initialized, we hold the "struct lguest" in the
-	 * file private data.
-	 */
-	struct lguest *lg = file->private_data;
-	const unsigned long __user *input = (const unsigned long __user *)in;
-	unsigned long req;
-	struct lg_cpu *uninitialized_var(cpu);
-	unsigned int cpu_id = *off;
-
-	/* The first value tells us what this request is. */
-	if (get_user(req, input) != 0)
-		return -EFAULT;
-	input++;
-
-	/* If you haven't initialized, you must do that first. */
-	if (req != LHREQ_INITIALIZE) {
-		if (!lg || (cpu_id >= lg->nr_cpus))
-			return -EINVAL;
-		cpu = &lg->cpus[cpu_id];
-
-		/* Once the Guest is dead, you can only read() why it died. */
-		if (lg->dead)
-			return -ENOENT;
-	}
-
-	switch (req) {
-	case LHREQ_INITIALIZE:
-		return initialize(file, input);
-	case LHREQ_IRQ:
-		return user_send_irq(cpu, input);
-	case LHREQ_GETREG:
-		return getreg_setup(cpu, input);
-	case LHREQ_SETREG:
-		return setreg(cpu, input);
-	case LHREQ_TRAP:
-		return trap(cpu, input);
-	default:
-		return -EINVAL;
-	}
-}
-
-static int open(struct inode *inode, struct file *file)
-{
-	file->private_data = NULL;
-
-	return 0;
-}
-
-/*L:060
- * The final piece of interface code is the close() routine.  It reverses
- * everything done in initialize().  This is usually called because the
- * Launcher exited.
- *
- * Note that the close routine returns 0 or a negative error number: it can't
- * really fail, but it can whine.  I blame Sun for this wart, and K&R C for
- * letting them do it.
-:*/
-static int close(struct inode *inode, struct file *file)
-{
-	struct lguest *lg = file->private_data;
-	unsigned int i;
-
-	/* If we never successfully initialized, there's nothing to clean up */
-	if (!lg)
-		return 0;
-
-	/*
-	 * We need the big lock, to protect from inter-guest I/O and other
-	 * Launchers initializing guests.
-	 */
-	mutex_lock(&lguest_lock);
-
-	/* Free up the shadow page tables for the Guest. */
-	free_guest_pagetable(lg);
-
-	for (i = 0; i < lg->nr_cpus; i++) {
-		/* Cancels the hrtimer set via LHCALL_SET_CLOCKEVENT. */
-		hrtimer_cancel(&lg->cpus[i].hrt);
-		/* We can free up the register page we allocated. */
-		free_page(lg->cpus[i].regs_page);
-		/*
-		 * Now all the memory cleanups are done, it's safe to release
-		 * the Launcher's memory management structure.
-		 */
-		mmput(lg->cpus[i].mm);
-	}
-
-	/*
-	 * If lg->dead doesn't contain an error code it will be NULL or a
-	 * kmalloc()ed string, either of which is ok to hand to kfree().
-	 */
-	if (!IS_ERR(lg->dead))
-		kfree(lg->dead);
-	/* Free the memory allocated to the lguest_struct */
-	kfree(lg);
-	/* Release lock and exit. */
-	mutex_unlock(&lguest_lock);
-
-	return 0;
-}
-
-/*L:000
- * Welcome to our journey through the Launcher!
- *
- * The Launcher is the Host userspace program which sets up, runs and services
- * the Guest.  In fact, many comments in the Drivers which refer to "the Host"
- * doing things are inaccurate: the Launcher does all the device handling for
- * the Guest, but the Guest can't know that.
- *
- * Just to confuse you: to the Host kernel, the Launcher *is* the Guest and we
- * shall see more of that later.
- *
- * We begin our understanding with the Host kernel interface which the Launcher
- * uses: reading and writing a character device called /dev/lguest.  All the
- * work happens in the read(), write() and close() routines:
- */
-static const struct file_operations lguest_fops = {
-	.owner	 = THIS_MODULE,
-	.open	 = open,
-	.release = close,
-	.write	 = write,
-	.read	 = read,
-	.llseek  = default_llseek,
-};
-/*:*/
-
-/*
- * This is a textbook example of a "misc" character device.  Populate a "struct
- * miscdevice" and register it with misc_register().
- */
-static struct miscdevice lguest_dev = {
-	.minor	= MISC_DYNAMIC_MINOR,
-	.name	= "lguest",
-	.fops	= &lguest_fops,
-};
-
-int __init lguest_device_init(void)
-{
-	return misc_register(&lguest_dev);
-}
-
-void __exit lguest_device_remove(void)
-{
-	misc_deregister(&lguest_dev);
-}
diff --git a/drivers/lguest/page_tables.c b/drivers/lguest/page_tables.c
deleted file mode 100644
index 0bc127e9f16a..000000000000
--- a/drivers/lguest/page_tables.c
+++ /dev/null
@@ -1,1239 +0,0 @@
-/*P:700
- * The pagetable code, on the other hand, still shows the scars of
- * previous encounters.  It's functional, and as neat as it can be in the
- * circumstances, but be wary, for these things are subtle and break easily.
- * The Guest provides a virtual to physical mapping, but we can neither trust
- * it nor use it: we verify and convert it here then point the CPU to the
- * converted Guest pages when running the Guest.
-:*/
-
-/* Copyright (C) Rusty Russell IBM Corporation 2013.
- * GPL v2 and any later version */
-#include <linux/mm.h>
-#include <linux/gfp.h>
-#include <linux/types.h>
-#include <linux/spinlock.h>
-#include <linux/random.h>
-#include <linux/percpu.h>
-#include <asm/tlbflush.h>
-#include <linux/uaccess.h>
-#include "lg.h"
-
-/*M:008
- * We hold reference to pages, which prevents them from being swapped.
- * It'd be nice to have a callback in the "struct mm_struct" when Linux wants
- * to swap out.  If we had this, and a shrinker callback to trim PTE pages, we
- * could probably consider launching Guests as non-root.
-:*/
-
-/*H:300
- * The Page Table Code
- *
- * We use two-level page tables for the Guest, or three-level with PAE.  If
- * you're not entirely comfortable with virtual addresses, physical addresses
- * and page tables then I recommend you review arch/x86/lguest/boot.c's "Page
- * Table Handling" (with diagrams!).
- *
- * The Guest keeps page tables, but we maintain the actual ones here: these are
- * called "shadow" page tables.  Which is a very Guest-centric name: these are
- * the real page tables the CPU uses, although we keep them up to date to
- * reflect the Guest's.  (See what I mean about weird naming?  Since when do
- * shadows reflect anything?)
- *
- * Anyway, this is the most complicated part of the Host code.  There are seven
- * parts to this:
- *  (i) Looking up a page table entry when the Guest faults,
- *  (ii) Making sure the Guest stack is mapped,
- *  (iii) Setting up a page table entry when the Guest tells us one has changed,
- *  (iv) Switching page tables,
- *  (v) Flushing (throwing away) page tables,
- *  (vi) Mapping the Switcher when the Guest is about to run,
- *  (vii) Setting up the page tables initially.
-:*/
-
-/*
- * The Switcher uses the complete top PTE page.  That's 1024 PTE entries (4MB)
- * or 512 PTE entries with PAE (2MB).
- */
-#define SWITCHER_PGD_INDEX (PTRS_PER_PGD - 1)
-
-/*
- * For PAE we need the PMD index as well. We use the last 2MB, so we
- * will need the last pmd entry of the last pmd page.
- */
-#ifdef CONFIG_X86_PAE
-#define CHECK_GPGD_MASK		_PAGE_PRESENT
-#else
-#define CHECK_GPGD_MASK		_PAGE_TABLE
-#endif
-
-/*H:320
- * The page table code is curly enough to need helper functions to keep it
- * clear and clean.  The kernel itself provides many of them; one advantage
- * of insisting that the Guest and Host use the same CONFIG_X86_PAE setting.
- *
- * There are two functions which return pointers to the shadow (aka "real")
- * page tables.
- *
- * spgd_addr() takes the virtual address and returns a pointer to the top-level
- * page directory entry (PGD) for that address.  Since we keep track of several
- * page tables, the "i" argument tells us which one we're interested in (it's
- * usually the current one).
- */
-static pgd_t *spgd_addr(struct lg_cpu *cpu, u32 i, unsigned long vaddr)
-{
-	unsigned int index = pgd_index(vaddr);
-
-	/* Return a pointer index'th pgd entry for the i'th page table. */
-	return &cpu->lg->pgdirs[i].pgdir[index];
-}
-
-#ifdef CONFIG_X86_PAE
-/*
- * This routine then takes the PGD entry given above, which contains the
- * address of the PMD page.  It then returns a pointer to the PMD entry for the
- * given address.
- */
-static pmd_t *spmd_addr(struct lg_cpu *cpu, pgd_t spgd, unsigned long vaddr)
-{
-	unsigned int index = pmd_index(vaddr);
-	pmd_t *page;
-
-	/* You should never call this if the PGD entry wasn't valid */
-	BUG_ON(!(pgd_flags(spgd) & _PAGE_PRESENT));
-	page = __va(pgd_pfn(spgd) << PAGE_SHIFT);
-
-	return &page[index];
-}
-#endif
-
-/*
- * This routine then takes the page directory entry returned above, which
- * contains the address of the page table entry (PTE) page.  It then returns a
- * pointer to the PTE entry for the given address.
- */
-static pte_t *spte_addr(struct lg_cpu *cpu, pgd_t spgd, unsigned long vaddr)
-{
-#ifdef CONFIG_X86_PAE
-	pmd_t *pmd = spmd_addr(cpu, spgd, vaddr);
-	pte_t *page = __va(pmd_pfn(*pmd) << PAGE_SHIFT);
-
-	/* You should never call this if the PMD entry wasn't valid */
-	BUG_ON(!(pmd_flags(*pmd) & _PAGE_PRESENT));
-#else
-	pte_t *page = __va(pgd_pfn(spgd) << PAGE_SHIFT);
-	/* You should never call this if the PGD entry wasn't valid */
-	BUG_ON(!(pgd_flags(spgd) & _PAGE_PRESENT));
-#endif
-
-	return &page[pte_index(vaddr)];
-}
-
-/*
- * These functions are just like the above, except they access the Guest
- * page tables.  Hence they return a Guest address.
- */
-static unsigned long gpgd_addr(struct lg_cpu *cpu, unsigned long vaddr)
-{
-	unsigned int index = vaddr >> (PGDIR_SHIFT);
-	return cpu->lg->pgdirs[cpu->cpu_pgd].gpgdir + index * sizeof(pgd_t);
-}
-
-#ifdef CONFIG_X86_PAE
-/* Follow the PGD to the PMD. */
-static unsigned long gpmd_addr(pgd_t gpgd, unsigned long vaddr)
-{
-	unsigned long gpage = pgd_pfn(gpgd) << PAGE_SHIFT;
-	BUG_ON(!(pgd_flags(gpgd) & _PAGE_PRESENT));
-	return gpage + pmd_index(vaddr) * sizeof(pmd_t);
-}
-
-/* Follow the PMD to the PTE. */
-static unsigned long gpte_addr(struct lg_cpu *cpu,
-			       pmd_t gpmd, unsigned long vaddr)
-{
-	unsigned long gpage = pmd_pfn(gpmd) << PAGE_SHIFT;
-
-	BUG_ON(!(pmd_flags(gpmd) & _PAGE_PRESENT));
-	return gpage + pte_index(vaddr) * sizeof(pte_t);
-}
-#else
-/* Follow the PGD to the PTE (no mid-level for !PAE). */
-static unsigned long gpte_addr(struct lg_cpu *cpu,
-				pgd_t gpgd, unsigned long vaddr)
-{
-	unsigned long gpage = pgd_pfn(gpgd) << PAGE_SHIFT;
-
-	BUG_ON(!(pgd_flags(gpgd) & _PAGE_PRESENT));
-	return gpage + pte_index(vaddr) * sizeof(pte_t);
-}
-#endif
-/*:*/
-
-/*M:007
- * get_pfn is slow: we could probably try to grab batches of pages here as
- * an optimization (ie. pre-faulting).
-:*/
-
-/*H:350
- * This routine takes a page number given by the Guest and converts it to
- * an actual, physical page number.  It can fail for several reasons: the
- * virtual address might not be mapped by the Launcher, the write flag is set
- * and the page is read-only, or the write flag was set and the page was
- * shared so had to be copied, but we ran out of memory.
- *
- * This holds a reference to the page, so release_pte() is careful to put that
- * back.
- */
-static unsigned long get_pfn(unsigned long virtpfn, int write)
-{
-	struct page *page;
-
-	/* gup me one page at this address please! */
-	if (get_user_pages_fast(virtpfn << PAGE_SHIFT, 1, write, &page) == 1)
-		return page_to_pfn(page);
-
-	/* This value indicates failure. */
-	return -1UL;
-}
-
-/*H:340
- * Converting a Guest page table entry to a shadow (ie. real) page table
- * entry can be a little tricky.  The flags are (almost) the same, but the
- * Guest PTE contains a virtual page number: the CPU needs the real page
- * number.
- */
-static pte_t gpte_to_spte(struct lg_cpu *cpu, pte_t gpte, int write)
-{
-	unsigned long pfn, base, flags;
-
-	/*
-	 * The Guest sets the global flag, because it thinks that it is using
-	 * PGE.  We only told it to use PGE so it would tell us whether it was
-	 * flushing a kernel mapping or a userspace mapping.  We don't actually
-	 * use the global bit, so throw it away.
-	 */
-	flags = (pte_flags(gpte) & ~_PAGE_GLOBAL);
-
-	/* The Guest's pages are offset inside the Launcher. */
-	base = (unsigned long)cpu->lg->mem_base / PAGE_SIZE;
-
-	/*
-	 * We need a temporary "unsigned long" variable to hold the answer from
-	 * get_pfn(), because it returns 0xFFFFFFFF on failure, which wouldn't
-	 * fit in spte.pfn.  get_pfn() finds the real physical number of the
-	 * page, given the virtual number.
-	 */
-	pfn = get_pfn(base + pte_pfn(gpte), write);
-	if (pfn == -1UL) {
-		kill_guest(cpu, "failed to get page %lu", pte_pfn(gpte));
-		/*
-		 * When we destroy the Guest, we'll go through the shadow page
-		 * tables and release_pte() them.  Make sure we don't think
-		 * this one is valid!
-		 */
-		flags = 0;
-	}
-	/* Now we assemble our shadow PTE from the page number and flags. */
-	return pfn_pte(pfn, __pgprot(flags));
-}
-
-/*H:460 And to complete the chain, release_pte() looks like this: */
-static void release_pte(pte_t pte)
-{
-	/*
-	 * Remember that get_user_pages_fast() took a reference to the page, in
-	 * get_pfn()?  We have to put it back now.
-	 */
-	if (pte_flags(pte) & _PAGE_PRESENT)
-		put_page(pte_page(pte));
-}
-/*:*/
-
-static bool gpte_in_iomem(struct lg_cpu *cpu, pte_t gpte)
-{
-	/* We don't handle large pages. */
-	if (pte_flags(gpte) & _PAGE_PSE)
-		return false;
-
-	return (pte_pfn(gpte) >= cpu->lg->pfn_limit
-		&& pte_pfn(gpte) < cpu->lg->device_limit);
-}
-
-static bool check_gpte(struct lg_cpu *cpu, pte_t gpte)
-{
-	if ((pte_flags(gpte) & _PAGE_PSE) ||
-	    pte_pfn(gpte) >= cpu->lg->pfn_limit) {
-		kill_guest(cpu, "bad page table entry");
-		return false;
-	}
-	return true;
-}
-
-static bool check_gpgd(struct lg_cpu *cpu, pgd_t gpgd)
-{
-	if ((pgd_flags(gpgd) & ~CHECK_GPGD_MASK) ||
-	    (pgd_pfn(gpgd) >= cpu->lg->pfn_limit)) {
-		kill_guest(cpu, "bad page directory entry");
-		return false;
-	}
-	return true;
-}
-
-#ifdef CONFIG_X86_PAE
-static bool check_gpmd(struct lg_cpu *cpu, pmd_t gpmd)
-{
-	if ((pmd_flags(gpmd) & ~_PAGE_TABLE) ||
-	    (pmd_pfn(gpmd) >= cpu->lg->pfn_limit)) {
-		kill_guest(cpu, "bad page middle directory entry");
-		return false;
-	}
-	return true;
-}
-#endif
-
-/*H:331
- * This is the core routine to walk the shadow page tables and find the page
- * table entry for a specific address.
- *
- * If allocate is set, then we allocate any missing levels, setting the flags
- * on the new page directory and mid-level directories using the arguments
- * (which are copied from the Guest's page table entries).
- */
-static pte_t *find_spte(struct lg_cpu *cpu, unsigned long vaddr, bool allocate,
-			int pgd_flags, int pmd_flags)
-{
-	pgd_t *spgd;
-	/* Mid level for PAE. */
-#ifdef CONFIG_X86_PAE
-	pmd_t *spmd;
-#endif
-
-	/* Get top level entry. */
-	spgd = spgd_addr(cpu, cpu->cpu_pgd, vaddr);
-	if (!(pgd_flags(*spgd) & _PAGE_PRESENT)) {
-		/* No shadow entry: allocate a new shadow PTE page. */
-		unsigned long ptepage;
-
-		/* If they didn't want us to allocate anything, stop. */
-		if (!allocate)
-			return NULL;
-
-		ptepage = get_zeroed_page(GFP_KERNEL);
-		/*
-		 * This is not really the Guest's fault, but killing it is
-		 * simple for this corner case.
-		 */
-		if (!ptepage) {
-			kill_guest(cpu, "out of memory allocating pte page");
-			return NULL;
-		}
-		/*
-		 * And we copy the flags to the shadow PGD entry.  The page
-		 * number in the shadow PGD is the page we just allocated.
-		 */
-		set_pgd(spgd, __pgd(__pa(ptepage) | pgd_flags));
-	}
-
-	/*
-	 * Intel's Physical Address Extension actually uses three levels of
-	 * page tables, so we need to look in the mid-level.
-	 */
-#ifdef CONFIG_X86_PAE
-	/* Now look at the mid-level shadow entry. */
-	spmd = spmd_addr(cpu, *spgd, vaddr);
-
-	if (!(pmd_flags(*spmd) & _PAGE_PRESENT)) {
-		/* No shadow entry: allocate a new shadow PTE page. */
-		unsigned long ptepage;
-
-		/* If they didn't want us to allocate anything, stop. */
-		if (!allocate)
-			return NULL;
-
-		ptepage = get_zeroed_page(GFP_KERNEL);
-
-		/*
-		 * This is not really the Guest's fault, but killing it is
-		 * simple for this corner case.
-		 */
-		if (!ptepage) {
-			kill_guest(cpu, "out of memory allocating pmd page");
-			return NULL;
-		}
-
-		/*
-		 * And we copy the flags to the shadow PMD entry.  The page
-		 * number in the shadow PMD is the page we just allocated.
-		 */
-		set_pmd(spmd, __pmd(__pa(ptepage) | pmd_flags));
-	}
-#endif
-
-	/* Get the pointer to the shadow PTE entry we're going to set. */
-	return spte_addr(cpu, *spgd, vaddr);
-}
-
-/*H:330
- * (i) Looking up a page table entry when the Guest faults.
- *
- * We saw this call in run_guest(): when we see a page fault in the Guest, we
- * come here.  That's because we only set up the shadow page tables lazily as
- * they're needed, so we get page faults all the time and quietly fix them up
- * and return to the Guest without it knowing.
- *
- * If we fixed up the fault (ie. we mapped the address), this routine returns
- * true.  Otherwise, it was a real fault and we need to tell the Guest.
- *
- * There's a corner case: they're trying to access memory between
- * pfn_limit and device_limit, which is I/O memory.  In this case, we
- * return false and set @iomem to the physical address, so the the
- * Launcher can handle the instruction manually.
- */
-bool demand_page(struct lg_cpu *cpu, unsigned long vaddr, int errcode,
-		 unsigned long *iomem)
-{
-	unsigned long gpte_ptr;
-	pte_t gpte;
-	pte_t *spte;
-	pmd_t gpmd;
-	pgd_t gpgd;
-
-	*iomem = 0;
-
-	/* We never demand page the Switcher, so trying is a mistake. */
-	if (vaddr >= switcher_addr)
-		return false;
-
-	/* First step: get the top-level Guest page table entry. */
-	if (unlikely(cpu->linear_pages)) {
-		/* Faking up a linear mapping. */
-		gpgd = __pgd(CHECK_GPGD_MASK);
-	} else {
-		gpgd = lgread(cpu, gpgd_addr(cpu, vaddr), pgd_t);
-		/* Toplevel not present?  We can't map it in. */
-		if (!(pgd_flags(gpgd) & _PAGE_PRESENT))
-			return false;
-
-		/* 
-		 * This kills the Guest if it has weird flags or tries to
-		 * refer to a "physical" address outside the bounds.
-		 */
-		if (!check_gpgd(cpu, gpgd))
-			return false;
-	}
-
-	/* This "mid-level" entry is only used for non-linear, PAE mode. */
-	gpmd = __pmd(_PAGE_TABLE);
-
-#ifdef CONFIG_X86_PAE
-	if (likely(!cpu->linear_pages)) {
-		gpmd = lgread(cpu, gpmd_addr(gpgd, vaddr), pmd_t);
-		/* Middle level not present?  We can't map it in. */
-		if (!(pmd_flags(gpmd) & _PAGE_PRESENT))
-			return false;
-
-		/* 
-		 * This kills the Guest if it has weird flags or tries to
-		 * refer to a "physical" address outside the bounds.
-		 */
-		if (!check_gpmd(cpu, gpmd))
-			return false;
-	}
-
-	/*
-	 * OK, now we look at the lower level in the Guest page table: keep its
-	 * address, because we might update it later.
-	 */
-	gpte_ptr = gpte_addr(cpu, gpmd, vaddr);
-#else
-	/*
-	 * OK, now we look at the lower level in the Guest page table: keep its
-	 * address, because we might update it later.
-	 */
-	gpte_ptr = gpte_addr(cpu, gpgd, vaddr);
-#endif
-
-	if (unlikely(cpu->linear_pages)) {
-		/* Linear?  Make up a PTE which points to same page. */
-		gpte = __pte((vaddr & PAGE_MASK) | _PAGE_RW | _PAGE_PRESENT);
-	} else {
-		/* Read the actual PTE value. */
-		gpte = lgread(cpu, gpte_ptr, pte_t);
-	}
-
-	/* If this page isn't in the Guest page tables, we can't page it in. */
-	if (!(pte_flags(gpte) & _PAGE_PRESENT))
-		return false;
-
-	/*
-	 * Check they're not trying to write to a page the Guest wants
-	 * read-only (bit 2 of errcode == write).
-	 */
-	if ((errcode & 2) && !(pte_flags(gpte) & _PAGE_RW))
-		return false;
-
-	/* User access to a kernel-only page? (bit 3 == user access) */
-	if ((errcode & 4) && !(pte_flags(gpte) & _PAGE_USER))
-		return false;
-
-	/* If they're accessing io memory, we expect a fault. */
-	if (gpte_in_iomem(cpu, gpte)) {
-		*iomem = (pte_pfn(gpte) << PAGE_SHIFT) | (vaddr & ~PAGE_MASK);
-		return false;
-	}
-
-	/*
-	 * Check that the Guest PTE flags are OK, and the page number is below
-	 * the pfn_limit (ie. not mapping the Launcher binary).
-	 */
-	if (!check_gpte(cpu, gpte))
-		return false;
-
-	/* Add the _PAGE_ACCESSED and (for a write) _PAGE_DIRTY flag */
-	gpte = pte_mkyoung(gpte);
-	if (errcode & 2)
-		gpte = pte_mkdirty(gpte);
-
-	/* Get the pointer to the shadow PTE entry we're going to set. */
-	spte = find_spte(cpu, vaddr, true, pgd_flags(gpgd), pmd_flags(gpmd));
-	if (!spte)
-		return false;
-
-	/*
-	 * If there was a valid shadow PTE entry here before, we release it.
-	 * This can happen with a write to a previously read-only entry.
-	 */
-	release_pte(*spte);
-
-	/*
-	 * If this is a write, we insist that the Guest page is writable (the
-	 * final arg to gpte_to_spte()).
-	 */
-	if (pte_dirty(gpte))
-		*spte = gpte_to_spte(cpu, gpte, 1);
-	else
-		/*
-		 * If this is a read, don't set the "writable" bit in the page
-		 * table entry, even if the Guest says it's writable.  That way
-		 * we will come back here when a write does actually occur, so
-		 * we can update the Guest's _PAGE_DIRTY flag.
-		 */
-		set_pte(spte, gpte_to_spte(cpu, pte_wrprotect(gpte), 0));
-
-	/*
-	 * Finally, we write the Guest PTE entry back: we've set the
-	 * _PAGE_ACCESSED and maybe the _PAGE_DIRTY flags.
-	 */
-	if (likely(!cpu->linear_pages))
-		lgwrite(cpu, gpte_ptr, pte_t, gpte);
-
-	/*
-	 * The fault is fixed, the page table is populated, the mapping
-	 * manipulated, the result returned and the code complete.  A small
-	 * delay and a trace of alliteration are the only indications the Guest
-	 * has that a page fault occurred at all.
-	 */
-	return true;
-}
-
-/*H:360
- * (ii) Making sure the Guest stack is mapped.
- *
- * Remember that direct traps into the Guest need a mapped Guest kernel stack.
- * pin_stack_pages() calls us here: we could simply call demand_page(), but as
- * we've seen that logic is quite long, and usually the stack pages are already
- * mapped, so it's overkill.
- *
- * This is a quick version which answers the question: is this virtual address
- * mapped by the shadow page tables, and is it writable?
- */
-static bool page_writable(struct lg_cpu *cpu, unsigned long vaddr)
-{
-	pte_t *spte;
-	unsigned long flags;
-
-	/* You can't put your stack in the Switcher! */
-	if (vaddr >= switcher_addr)
-		return false;
-
-	/* If there's no shadow PTE, it's not writable. */
-	spte = find_spte(cpu, vaddr, false, 0, 0);
-	if (!spte)
-		return false;
-
-	/*
-	 * Check the flags on the pte entry itself: it must be present and
-	 * writable.
-	 */
-	flags = pte_flags(*spte);
-	return (flags & (_PAGE_PRESENT|_PAGE_RW)) == (_PAGE_PRESENT|_PAGE_RW);
-}
-
-/*
- * So, when pin_stack_pages() asks us to pin a page, we check if it's already
- * in the page tables, and if not, we call demand_page() with error code 2
- * (meaning "write").
- */
-void pin_page(struct lg_cpu *cpu, unsigned long vaddr)
-{
-	unsigned long iomem;
-
-	if (!page_writable(cpu, vaddr) && !demand_page(cpu, vaddr, 2, &iomem))
-		kill_guest(cpu, "bad stack page %#lx", vaddr);
-}
-/*:*/
-
-#ifdef CONFIG_X86_PAE
-static void release_pmd(pmd_t *spmd)
-{
-	/* If the entry's not present, there's nothing to release. */
-	if (pmd_flags(*spmd) & _PAGE_PRESENT) {
-		unsigned int i;
-		pte_t *ptepage = __va(pmd_pfn(*spmd) << PAGE_SHIFT);
-		/* For each entry in the page, we might need to release it. */
-		for (i = 0; i < PTRS_PER_PTE; i++)
-			release_pte(ptepage[i]);
-		/* Now we can free the page of PTEs */
-		free_page((long)ptepage);
-		/* And zero out the PMD entry so we never release it twice. */
-		set_pmd(spmd, __pmd(0));
-	}
-}
-
-static void release_pgd(pgd_t *spgd)
-{
-	/* If the entry's not present, there's nothing to release. */
-	if (pgd_flags(*spgd) & _PAGE_PRESENT) {
-		unsigned int i;
-		pmd_t *pmdpage = __va(pgd_pfn(*spgd) << PAGE_SHIFT);
-
-		for (i = 0; i < PTRS_PER_PMD; i++)
-			release_pmd(&pmdpage[i]);
-
-		/* Now we can free the page of PMDs */
-		free_page((long)pmdpage);
-		/* And zero out the PGD entry so we never release it twice. */
-		set_pgd(spgd, __pgd(0));
-	}
-}
-
-#else /* !CONFIG_X86_PAE */
-/*H:450
- * If we chase down the release_pgd() code, the non-PAE version looks like
- * this.  The PAE version is almost identical, but instead of calling
- * release_pte it calls release_pmd(), which looks much like this.
- */
-static void release_pgd(pgd_t *spgd)
-{
-	/* If the entry's not present, there's nothing to release. */
-	if (pgd_flags(*spgd) & _PAGE_PRESENT) {
-		unsigned int i;
-		/*
-		 * Converting the pfn to find the actual PTE page is easy: turn
-		 * the page number into a physical address, then convert to a
-		 * virtual address (easy for kernel pages like this one).
-		 */
-		pte_t *ptepage = __va(pgd_pfn(*spgd) << PAGE_SHIFT);
-		/* For each entry in the page, we might need to release it. */
-		for (i = 0; i < PTRS_PER_PTE; i++)
-			release_pte(ptepage[i]);
-		/* Now we can free the page of PTEs */
-		free_page((long)ptepage);
-		/* And zero out the PGD entry so we never release it twice. */
-		*spgd = __pgd(0);
-	}
-}
-#endif
-
-/*H:445
- * We saw flush_user_mappings() twice: once from the flush_user_mappings()
- * hypercall and once in new_pgdir() when we re-used a top-level pgdir page.
- * It simply releases every PTE page from 0 up to the Guest's kernel address.
- */
-static void flush_user_mappings(struct lguest *lg, int idx)
-{
-	unsigned int i;
-	/* Release every pgd entry up to the kernel's address. */
-	for (i = 0; i < pgd_index(lg->kernel_address); i++)
-		release_pgd(lg->pgdirs[idx].pgdir + i);
-}
-
-/*H:440
- * (v) Flushing (throwing away) page tables,
- *
- * The Guest has a hypercall to throw away the page tables: it's used when a
- * large number of mappings have been changed.
- */
-void guest_pagetable_flush_user(struct lg_cpu *cpu)
-{
-	/* Drop the userspace part of the current page table. */
-	flush_user_mappings(cpu->lg, cpu->cpu_pgd);
-}
-/*:*/
-
-/* We walk down the guest page tables to get a guest-physical address */
-bool __guest_pa(struct lg_cpu *cpu, unsigned long vaddr, unsigned long *paddr)
-{
-	pgd_t gpgd;
-	pte_t gpte;
-#ifdef CONFIG_X86_PAE
-	pmd_t gpmd;
-#endif
-
-	/* Still not set up?  Just map 1:1. */
-	if (unlikely(cpu->linear_pages)) {
-		*paddr = vaddr;
-		return true;
-	}
-
-	/* First step: get the top-level Guest page table entry. */
-	gpgd = lgread(cpu, gpgd_addr(cpu, vaddr), pgd_t);
-	/* Toplevel not present?  We can't map it in. */
-	if (!(pgd_flags(gpgd) & _PAGE_PRESENT))
-		goto fail;
-
-#ifdef CONFIG_X86_PAE
-	gpmd = lgread(cpu, gpmd_addr(gpgd, vaddr), pmd_t);
-	if (!(pmd_flags(gpmd) & _PAGE_PRESENT))
-		goto fail;
-	gpte = lgread(cpu, gpte_addr(cpu, gpmd, vaddr), pte_t);
-#else
-	gpte = lgread(cpu, gpte_addr(cpu, gpgd, vaddr), pte_t);
-#endif
-	if (!(pte_flags(gpte) & _PAGE_PRESENT))
-		goto fail;
-
-	*paddr = pte_pfn(gpte) * PAGE_SIZE | (vaddr & ~PAGE_MASK);
-	return true;
-
-fail:
-	*paddr = -1UL;
-	return false;
-}
-
-/*
- * This is the version we normally use: kills the Guest if it uses a
- * bad address
- */
-unsigned long guest_pa(struct lg_cpu *cpu, unsigned long vaddr)
-{
-	unsigned long paddr;
-
-	if (!__guest_pa(cpu, vaddr, &paddr))
-		kill_guest(cpu, "Bad address %#lx", vaddr);
-	return paddr;
-}
-
-/*
- * We keep several page tables.  This is a simple routine to find the page
- * table (if any) corresponding to this top-level address the Guest has given
- * us.
- */
-static unsigned int find_pgdir(struct lguest *lg, unsigned long pgtable)
-{
-	unsigned int i;
-	for (i = 0; i < ARRAY_SIZE(lg->pgdirs); i++)
-		if (lg->pgdirs[i].pgdir && lg->pgdirs[i].gpgdir == pgtable)
-			break;
-	return i;
-}
-
-/*H:435
- * And this is us, creating the new page directory.  If we really do
- * allocate a new one (and so the kernel parts are not there), we set
- * blank_pgdir.
- */
-static unsigned int new_pgdir(struct lg_cpu *cpu,
-			      unsigned long gpgdir,
-			      int *blank_pgdir)
-{
-	unsigned int next;
-
-	/*
-	 * We pick one entry at random to throw out.  Choosing the Least
-	 * Recently Used might be better, but this is easy.
-	 */
-	next = prandom_u32() % ARRAY_SIZE(cpu->lg->pgdirs);
-	/* If it's never been allocated at all before, try now. */
-	if (!cpu->lg->pgdirs[next].pgdir) {
-		cpu->lg->pgdirs[next].pgdir =
-					(pgd_t *)get_zeroed_page(GFP_KERNEL);
-		/* If the allocation fails, just keep using the one we have */
-		if (!cpu->lg->pgdirs[next].pgdir)
-			next = cpu->cpu_pgd;
-		else {
-			/*
-			 * This is a blank page, so there are no kernel
-			 * mappings: caller must map the stack!
-			 */
-			*blank_pgdir = 1;
-		}
-	}
-	/* Record which Guest toplevel this shadows. */
-	cpu->lg->pgdirs[next].gpgdir = gpgdir;
-	/* Release all the non-kernel mappings. */
-	flush_user_mappings(cpu->lg, next);
-
-	/* This hasn't run on any CPU at all. */
-	cpu->lg->pgdirs[next].last_host_cpu = -1;
-
-	return next;
-}
-
-/*H:501
- * We do need the Switcher code mapped at all times, so we allocate that
- * part of the Guest page table here.  We map the Switcher code immediately,
- * but defer mapping of the guest register page and IDT/LDT etc page until
- * just before we run the guest in map_switcher_in_guest().
- *
- * We *could* do this setup in map_switcher_in_guest(), but at that point
- * we've interrupts disabled, and allocating pages like that is fraught: we
- * can't sleep if we need to free up some memory.
- */
-static bool allocate_switcher_mapping(struct lg_cpu *cpu)
-{
-	int i;
-
-	for (i = 0; i < TOTAL_SWITCHER_PAGES; i++) {
-		pte_t *pte = find_spte(cpu, switcher_addr + i * PAGE_SIZE, true,
-				       CHECK_GPGD_MASK, _PAGE_TABLE);
-		if (!pte)
-			return false;
-
-		/*
-		 * Map the switcher page if not already there.  It might
-		 * already be there because we call allocate_switcher_mapping()
-		 * in guest_set_pgd() just in case it did discard our Switcher
-		 * mapping, but it probably didn't.
-		 */
-		if (i == 0 && !(pte_flags(*pte) & _PAGE_PRESENT)) {
-			/* Get a reference to the Switcher page. */
-			get_page(lg_switcher_pages[0]);
-			/* Create a read-only, exectuable, kernel-style PTE */
-			set_pte(pte,
-				mk_pte(lg_switcher_pages[0], PAGE_KERNEL_RX));
-		}
-	}
-	cpu->lg->pgdirs[cpu->cpu_pgd].switcher_mapped = true;
-	return true;
-}
-
-/*H:470
- * Finally, a routine which throws away everything: all PGD entries in all
- * the shadow page tables, including the Guest's kernel mappings.  This is used
- * when we destroy the Guest.
- */
-static void release_all_pagetables(struct lguest *lg)
-{
-	unsigned int i, j;
-
-	/* Every shadow pagetable this Guest has */
-	for (i = 0; i < ARRAY_SIZE(lg->pgdirs); i++) {
-		if (!lg->pgdirs[i].pgdir)
-			continue;
-
-		/* Every PGD entry. */
-		for (j = 0; j < PTRS_PER_PGD; j++)
-			release_pgd(lg->pgdirs[i].pgdir + j);
-		lg->pgdirs[i].switcher_mapped = false;
-		lg->pgdirs[i].last_host_cpu = -1;
-	}
-}
-
-/*
- * We also throw away everything when a Guest tells us it's changed a kernel
- * mapping.  Since kernel mappings are in every page table, it's easiest to
- * throw them all away.  This traps the Guest in amber for a while as
- * everything faults back in, but it's rare.
- */
-void guest_pagetable_clear_all(struct lg_cpu *cpu)
-{
-	release_all_pagetables(cpu->lg);
-	/* We need the Guest kernel stack mapped again. */
-	pin_stack_pages(cpu);
-	/* And we need Switcher allocated. */
-	if (!allocate_switcher_mapping(cpu))
-		kill_guest(cpu, "Cannot populate switcher mapping");
-}
-
-/*H:430
- * (iv) Switching page tables
- *
- * Now we've seen all the page table setting and manipulation, let's see
- * what happens when the Guest changes page tables (ie. changes the top-level
- * pgdir).  This occurs on almost every context switch.
- */
-void guest_new_pagetable(struct lg_cpu *cpu, unsigned long pgtable)
-{
-	int newpgdir, repin = 0;
-
-	/*
-	 * The very first time they call this, we're actually running without
-	 * any page tables; we've been making it up.  Throw them away now.
-	 */
-	if (unlikely(cpu->linear_pages)) {
-		release_all_pagetables(cpu->lg);
-		cpu->linear_pages = false;
-		/* Force allocation of a new pgdir. */
-		newpgdir = ARRAY_SIZE(cpu->lg->pgdirs);
-	} else {
-		/* Look to see if we have this one already. */
-		newpgdir = find_pgdir(cpu->lg, pgtable);
-	}
-
-	/*
-	 * If not, we allocate or mug an existing one: if it's a fresh one,
-	 * repin gets set to 1.
-	 */
-	if (newpgdir == ARRAY_SIZE(cpu->lg->pgdirs))
-		newpgdir = new_pgdir(cpu, pgtable, &repin);
-	/* Change the current pgd index to the new one. */
-	cpu->cpu_pgd = newpgdir;
-	/*
-	 * If it was completely blank, we map in the Guest kernel stack and
-	 * the Switcher.
-	 */
-	if (repin)
-		pin_stack_pages(cpu);
-
-	if (!cpu->lg->pgdirs[cpu->cpu_pgd].switcher_mapped) {
-		if (!allocate_switcher_mapping(cpu))
-			kill_guest(cpu, "Cannot populate switcher mapping");
-	}
-}
-/*:*/
-
-/*M:009
- * Since we throw away all mappings when a kernel mapping changes, our
- * performance sucks for guests using highmem.  In fact, a guest with
- * PAGE_OFFSET 0xc0000000 (the default) and more than about 700MB of RAM is
- * usually slower than a Guest with less memory.
- *
- * This, of course, cannot be fixed.  It would take some kind of... well, I
- * don't know, but the term "puissant code-fu" comes to mind.
-:*/
-
-/*H:420
- * This is the routine which actually sets the page table entry for then
- * "idx"'th shadow page table.
- *
- * Normally, we can just throw out the old entry and replace it with 0: if they
- * use it demand_page() will put the new entry in.  We need to do this anyway:
- * The Guest expects _PAGE_ACCESSED to be set on its PTE the first time a page
- * is read from, and _PAGE_DIRTY when it's written to.
- *
- * But Avi Kivity pointed out that most Operating Systems (Linux included) set
- * these bits on PTEs immediately anyway.  This is done to save the CPU from
- * having to update them, but it helps us the same way: if they set
- * _PAGE_ACCESSED then we can put a read-only PTE entry in immediately, and if
- * they set _PAGE_DIRTY then we can put a writable PTE entry in immediately.
- */
-static void __guest_set_pte(struct lg_cpu *cpu, int idx,
-		       unsigned long vaddr, pte_t gpte)
-{
-	/* Look up the matching shadow page directory entry. */
-	pgd_t *spgd = spgd_addr(cpu, idx, vaddr);
-#ifdef CONFIG_X86_PAE
-	pmd_t *spmd;
-#endif
-
-	/* If the top level isn't present, there's no entry to update. */
-	if (pgd_flags(*spgd) & _PAGE_PRESENT) {
-#ifdef CONFIG_X86_PAE
-		spmd = spmd_addr(cpu, *spgd, vaddr);
-		if (pmd_flags(*spmd) & _PAGE_PRESENT) {
-#endif
-			/* Otherwise, start by releasing the existing entry. */
-			pte_t *spte = spte_addr(cpu, *spgd, vaddr);
-			release_pte(*spte);
-
-			/*
-			 * If they're setting this entry as dirty or accessed,
-			 * we might as well put that entry they've given us in
-			 * now.  This shaves 10% off a copy-on-write
-			 * micro-benchmark.
-			 */
-			if ((pte_flags(gpte) & (_PAGE_DIRTY | _PAGE_ACCESSED))
-			    && !gpte_in_iomem(cpu, gpte)) {
-				if (!check_gpte(cpu, gpte))
-					return;
-				set_pte(spte,
-					gpte_to_spte(cpu, gpte,
-						pte_flags(gpte) & _PAGE_DIRTY));
-			} else {
-				/*
-				 * Otherwise kill it and we can demand_page()
-				 * it in later.
-				 */
-				set_pte(spte, __pte(0));
-			}
-#ifdef CONFIG_X86_PAE
-		}
-#endif
-	}
-}
-
-/*H:410
- * Updating a PTE entry is a little trickier.
- *
- * We keep track of several different page tables (the Guest uses one for each
- * process, so it makes sense to cache at least a few).  Each of these have
- * identical kernel parts: ie. every mapping above PAGE_OFFSET is the same for
- * all processes.  So when the page table above that address changes, we update
- * all the page tables, not just the current one.  This is rare.
- *
- * The benefit is that when we have to track a new page table, we can keep all
- * the kernel mappings.  This speeds up context switch immensely.
- */
-void guest_set_pte(struct lg_cpu *cpu,
-		   unsigned long gpgdir, unsigned long vaddr, pte_t gpte)
-{
-	/* We don't let you remap the Switcher; we need it to get back! */
-	if (vaddr >= switcher_addr) {
-		kill_guest(cpu, "attempt to set pte into Switcher pages");
-		return;
-	}
-
-	/*
-	 * Kernel mappings must be changed on all top levels.  Slow, but doesn't
-	 * happen often.
-	 */
-	if (vaddr >= cpu->lg->kernel_address) {
-		unsigned int i;
-		for (i = 0; i < ARRAY_SIZE(cpu->lg->pgdirs); i++)
-			if (cpu->lg->pgdirs[i].pgdir)
-				__guest_set_pte(cpu, i, vaddr, gpte);
-	} else {
-		/* Is this page table one we have a shadow for? */
-		int pgdir = find_pgdir(cpu->lg, gpgdir);
-		if (pgdir != ARRAY_SIZE(cpu->lg->pgdirs))
-			/* If so, do the update. */
-			__guest_set_pte(cpu, pgdir, vaddr, gpte);
-	}
-}
-
-/*H:400
- * (iii) Setting up a page table entry when the Guest tells us one has changed.
- *
- * Just like we did in interrupts_and_traps.c, it makes sense for us to deal
- * with the other side of page tables while we're here: what happens when the
- * Guest asks for a page table to be updated?
- *
- * We already saw that demand_page() will fill in the shadow page tables when
- * needed, so we can simply remove shadow page table entries whenever the Guest
- * tells us they've changed.  When the Guest tries to use the new entry it will
- * fault and demand_page() will fix it up.
- *
- * So with that in mind here's our code to update a (top-level) PGD entry:
- */
-void guest_set_pgd(struct lguest *lg, unsigned long gpgdir, u32 idx)
-{
-	int pgdir;
-
-	if (idx > PTRS_PER_PGD) {
-		kill_guest(&lg->cpus[0], "Attempt to set pgd %u/%u",
-			   idx, PTRS_PER_PGD);
-		return;
-	}
-
-	/* If they're talking about a page table we have a shadow for... */
-	pgdir = find_pgdir(lg, gpgdir);
-	if (pgdir < ARRAY_SIZE(lg->pgdirs)) {
-		/* ... throw it away. */
-		release_pgd(lg->pgdirs[pgdir].pgdir + idx);
-		/* That might have been the Switcher mapping, remap it. */
-		if (!allocate_switcher_mapping(&lg->cpus[0])) {
-			kill_guest(&lg->cpus[0],
-				   "Cannot populate switcher mapping");
-		}
-		lg->pgdirs[pgdir].last_host_cpu = -1;
-	}
-}
-
-#ifdef CONFIG_X86_PAE
-/* For setting a mid-level, we just throw everything away.  It's easy. */
-void guest_set_pmd(struct lguest *lg, unsigned long pmdp, u32 idx)
-{
-	guest_pagetable_clear_all(&lg->cpus[0]);
-}
-#endif
-
-/*H:500
- * (vii) Setting up the page tables initially.
- *
- * When a Guest is first created, set initialize a shadow page table which
- * we will populate on future faults.  The Guest doesn't have any actual
- * pagetables yet, so we set linear_pages to tell demand_page() to fake it
- * for the moment.
- *
- * We do need the Switcher to be mapped at all times, so we allocate that
- * part of the Guest page table here.
- */
-int init_guest_pagetable(struct lguest *lg)
-{
-	struct lg_cpu *cpu = &lg->cpus[0];
-	int allocated = 0;
-
-	/* lg (and lg->cpus[]) starts zeroed: this allocates a new pgdir */
-	cpu->cpu_pgd = new_pgdir(cpu, 0, &allocated);
-	if (!allocated)
-		return -ENOMEM;
-
-	/* We start with a linear mapping until the initialize. */
-	cpu->linear_pages = true;
-
-	/* Allocate the page tables for the Switcher. */
-	if (!allocate_switcher_mapping(cpu)) {
-		release_all_pagetables(lg);
-		return -ENOMEM;
-	}
-
-	return 0;
-}
-
-/*H:508 When the Guest calls LHCALL_LGUEST_INIT we do more setup. */
-void page_table_guest_data_init(struct lg_cpu *cpu)
-{
-	/*
-	 * We tell the Guest that it can't use the virtual addresses
-	 * used by the Switcher.  This trick is equivalent to 4GB -
-	 * switcher_addr.
-	 */
-	u32 top = ~switcher_addr + 1;
-
-	/* We get the kernel address: above this is all kernel memory. */
-	if (get_user(cpu->lg->kernel_address,
-		     &cpu->lg->lguest_data->kernel_address)
-		/*
-		 * We tell the Guest that it can't use the top virtual
-		 * addresses (used by the Switcher).
-		 */
-	    || put_user(top, &cpu->lg->lguest_data->reserve_mem)) {
-		kill_guest(cpu, "bad guest page %p", cpu->lg->lguest_data);
-		return;
-	}
-
-	/*
-	 * In flush_user_mappings() we loop from 0 to
-	 * "pgd_index(lg->kernel_address)".  This assumes it won't hit the
-	 * Switcher mappings, so check that now.
-	 */
-	if (cpu->lg->kernel_address >= switcher_addr)
-		kill_guest(cpu, "bad kernel address %#lx",
-				 cpu->lg->kernel_address);
-}
-
-/* When a Guest dies, our cleanup is fairly simple. */
-void free_guest_pagetable(struct lguest *lg)
-{
-	unsigned int i;
-
-	/* Throw away all page table pages. */
-	release_all_pagetables(lg);
-	/* Now free the top levels: free_page() can handle 0 just fine. */
-	for (i = 0; i < ARRAY_SIZE(lg->pgdirs); i++)
-		free_page((long)lg->pgdirs[i].pgdir);
-}
-
-/*H:481
- * This clears the Switcher mappings for cpu #i.
- */
-static void remove_switcher_percpu_map(struct lg_cpu *cpu, unsigned int i)
-{
-	unsigned long base = switcher_addr + PAGE_SIZE + i * PAGE_SIZE*2;
-	pte_t *pte;
-
-	/* Clear the mappings for both pages. */
-	pte = find_spte(cpu, base, false, 0, 0);
-	release_pte(*pte);
-	set_pte(pte, __pte(0));
-
-	pte = find_spte(cpu, base + PAGE_SIZE, false, 0, 0);
-	release_pte(*pte);
-	set_pte(pte, __pte(0));
-}
-
-/*H:480
- * (vi) Mapping the Switcher when the Guest is about to run.
- *
- * The Switcher and the two pages for this CPU need to be visible in the Guest
- * (and not the pages for other CPUs).
- *
- * The pages for the pagetables have all been allocated before: we just need
- * to make sure the actual PTEs are up-to-date for the CPU we're about to run
- * on.
- */
-void map_switcher_in_guest(struct lg_cpu *cpu, struct lguest_pages *pages)
-{
-	unsigned long base;
-	struct page *percpu_switcher_page, *regs_page;
-	pte_t *pte;
-	struct pgdir *pgdir = &cpu->lg->pgdirs[cpu->cpu_pgd];
-
-	/* Switcher page should always be mapped by now! */
-	BUG_ON(!pgdir->switcher_mapped);
-
-	/* 
-	 * Remember that we have two pages for each Host CPU, so we can run a
-	 * Guest on each CPU without them interfering.  We need to make sure
-	 * those pages are mapped correctly in the Guest, but since we usually
-	 * run on the same CPU, we cache that, and only update the mappings
-	 * when we move.
-	 */
-	if (pgdir->last_host_cpu == raw_smp_processor_id())
-		return;
-
-	/* -1 means unknown so we remove everything. */
-	if (pgdir->last_host_cpu == -1) {
-		unsigned int i;
-		for_each_possible_cpu(i)
-			remove_switcher_percpu_map(cpu, i);
-	} else {
-		/* We know exactly what CPU mapping to remove. */
-		remove_switcher_percpu_map(cpu, pgdir->last_host_cpu);
-	}
-
-	/*
-	 * When we're running the Guest, we want the Guest's "regs" page to
-	 * appear where the first Switcher page for this CPU is.  This is an
-	 * optimization: when the Switcher saves the Guest registers, it saves
-	 * them into the first page of this CPU's "struct lguest_pages": if we
-	 * make sure the Guest's register page is already mapped there, we
-	 * don't have to copy them out again.
-	 */
-	/* Find the shadow PTE for this regs page. */
-	base = switcher_addr + PAGE_SIZE
-		+ raw_smp_processor_id() * sizeof(struct lguest_pages);
-	pte = find_spte(cpu, base, false, 0, 0);
-	regs_page = pfn_to_page(__pa(cpu->regs_page) >> PAGE_SHIFT);
-	get_page(regs_page);
-	set_pte(pte, mk_pte(regs_page, __pgprot(__PAGE_KERNEL & ~_PAGE_GLOBAL)));
-
-	/*
-	 * We map the second page of the struct lguest_pages read-only in
-	 * the Guest: the IDT, GDT and other things it's not supposed to
-	 * change.
-	 */
-	pte = find_spte(cpu, base + PAGE_SIZE, false, 0, 0);
-	percpu_switcher_page
-		= lg_switcher_pages[1 + raw_smp_processor_id()*2 + 1];
-	get_page(percpu_switcher_page);
-	set_pte(pte, mk_pte(percpu_switcher_page,
-			    __pgprot(__PAGE_KERNEL_RO & ~_PAGE_GLOBAL)));
-
-	pgdir->last_host_cpu = raw_smp_processor_id();
-}
-
-/*H:490
- * We've made it through the page table code.  Perhaps our tired brains are
- * still processing the details, or perhaps we're simply glad it's over.
- *
- * If nothing else, note that all this complexity in juggling shadow page tables
- * in sync with the Guest's page tables is for one reason: for most Guests this
- * page table dance determines how bad performance will be.  This is why Xen
- * uses exotic direct Guest pagetable manipulation, and why both Intel and AMD
- * have implemented shadow page table support directly into hardware.
- *
- * There is just one file remaining in the Host.
- */
diff --git a/drivers/lguest/segments.c b/drivers/lguest/segments.c
deleted file mode 100644
index c4fb424dfddb..000000000000
--- a/drivers/lguest/segments.c
+++ /dev/null
@@ -1,228 +0,0 @@
-/*P:600
- * The x86 architecture has segments, which involve a table of descriptors
- * which can be used to do funky things with virtual address interpretation.
- * We originally used to use segments so the Guest couldn't alter the
- * Guest<->Host Switcher, and then we had to trim Guest segments, and restore
- * for userspace per-thread segments, but trim again for on userspace->kernel
- * transitions...  This nightmarish creation was contained within this file,
- * where we knew not to tread without heavy armament and a change of underwear.
- *
- * In these modern times, the segment handling code consists of simple sanity
- * checks, and the worst you'll experience reading this code is butterfly-rash
- * from frolicking through its parklike serenity.
-:*/
-#include "lg.h"
-
-/*H:600
- * Segments & The Global Descriptor Table
- *
- * (That title sounds like a bad Nerdcore group.  Not to suggest that there are
- * any good Nerdcore groups, but in high school a friend of mine had a band
- * called Joe Fish and the Chips, so there are definitely worse band names).
- *
- * To refresh: the GDT is a table of 8-byte values describing segments.  Once
- * set up, these segments can be loaded into one of the 6 "segment registers".
- *
- * GDT entries are passed around as "struct desc_struct"s, which like IDT
- * entries are split into two 32-bit members, "a" and "b".  One day, someone
- * will clean that up, and be declared a Hero.  (No pressure, I'm just saying).
- *
- * Anyway, the GDT entry contains a base (the start address of the segment), a
- * limit (the size of the segment - 1), and some flags.  Sounds simple, and it
- * would be, except those zany Intel engineers decided that it was too boring
- * to put the base at one end, the limit at the other, and the flags in
- * between.  They decided to shotgun the bits at random throughout the 8 bytes,
- * like so:
- *
- * 0               16                     40       48  52  56     63
- * [ limit part 1 ][     base part 1     ][ flags ][li][fl][base ]
- *                                                  mit ags part 2
- *                                                part 2
- *
- * As a result, this file contains a certain amount of magic numeracy.  Let's
- * begin.
- */
-
-/*
- * There are several entries we don't let the Guest set.  The TSS entry is the
- * "Task State Segment" which controls all kinds of delicate things.  The
- * LGUEST_CS and LGUEST_DS entries are reserved for the Switcher, and the
- * the Guest can't be trusted to deal with double faults.
- */
-static bool ignored_gdt(unsigned int num)
-{
-	return (num == GDT_ENTRY_TSS
-		|| num == GDT_ENTRY_LGUEST_CS
-		|| num == GDT_ENTRY_LGUEST_DS
-		|| num == GDT_ENTRY_DOUBLEFAULT_TSS);
-}
-
-/*H:630
- * Once the Guest gave us new GDT entries, we fix them up a little.  We
- * don't care if they're invalid: the worst that can happen is a General
- * Protection Fault in the Switcher when it restores a Guest segment register
- * which tries to use that entry.  Then we kill the Guest for causing such a
- * mess: the message will be "unhandled trap 256".
- */
-static void fixup_gdt_table(struct lg_cpu *cpu, unsigned start, unsigned end)
-{
-	unsigned int i;
-
-	for (i = start; i < end; i++) {
-		/*
-		 * We never copy these ones to real GDT, so we don't care what
-		 * they say
-		 */
-		if (ignored_gdt(i))
-			continue;
-
-		/*
-		 * Segment descriptors contain a privilege level: the Guest is
-		 * sometimes careless and leaves this as 0, even though it's
-		 * running at privilege level 1.  If so, we fix it here.
-		 */
-		if (cpu->arch.gdt[i].dpl == 0)
-			cpu->arch.gdt[i].dpl |= GUEST_PL;
-
-		/*
-		 * Each descriptor has an "accessed" bit.  If we don't set it
-		 * now, the CPU will try to set it when the Guest first loads
-		 * that entry into a segment register.  But the GDT isn't
-		 * writable by the Guest, so bad things can happen.
-		 */
-		cpu->arch.gdt[i].type |= 0x1;
-	}
-}
-
-/*H:610
- * Like the IDT, we never simply use the GDT the Guest gives us.  We keep
- * a GDT for each CPU, and copy across the Guest's entries each time we want to
- * run the Guest on that CPU.
- *
- * This routine is called at boot or modprobe time for each CPU to set up the
- * constant GDT entries: the ones which are the same no matter what Guest we're
- * running.
- */
-void setup_default_gdt_entries(struct lguest_ro_state *state)
-{
-	struct desc_struct *gdt = state->guest_gdt;
-	unsigned long tss = (unsigned long)&state->guest_tss;
-
-	/* The Switcher segments are full 0-4G segments, privilege level 0 */
-	gdt[GDT_ENTRY_LGUEST_CS] = FULL_EXEC_SEGMENT;
-	gdt[GDT_ENTRY_LGUEST_DS] = FULL_SEGMENT;
-
-	/*
-	 * The TSS segment refers to the TSS entry for this particular CPU.
-	 */
-	gdt[GDT_ENTRY_TSS].a = 0;
-	gdt[GDT_ENTRY_TSS].b = 0;
-
-	gdt[GDT_ENTRY_TSS].limit0 = 0x67;
-	gdt[GDT_ENTRY_TSS].base0  = tss & 0xFFFF;
-	gdt[GDT_ENTRY_TSS].base1  = (tss >> 16) & 0xFF;
-	gdt[GDT_ENTRY_TSS].base2  = tss >> 24;
-	gdt[GDT_ENTRY_TSS].type   = 0x9; /* 32-bit TSS (available) */
-	gdt[GDT_ENTRY_TSS].p      = 0x1; /* Entry is present */
-	gdt[GDT_ENTRY_TSS].dpl    = 0x0; /* Privilege level 0 */
-	gdt[GDT_ENTRY_TSS].s      = 0x0; /* system segment */
-
-}
-
-/*
- * This routine sets up the initial Guest GDT for booting.  All entries start
- * as 0 (unusable).
- */
-void setup_guest_gdt(struct lg_cpu *cpu)
-{
-	/*
-	 * Start with full 0-4G segments...except the Guest is allowed to use
-	 * them, so set the privilege level appropriately in the flags.
-	 */
-	cpu->arch.gdt[GDT_ENTRY_KERNEL_CS] = FULL_EXEC_SEGMENT;
-	cpu->arch.gdt[GDT_ENTRY_KERNEL_DS] = FULL_SEGMENT;
-	cpu->arch.gdt[GDT_ENTRY_KERNEL_CS].dpl |= GUEST_PL;
-	cpu->arch.gdt[GDT_ENTRY_KERNEL_DS].dpl |= GUEST_PL;
-}
-
-/*H:650
- * An optimization of copy_gdt(), for just the three "thead-local storage"
- * entries.
- */
-void copy_gdt_tls(const struct lg_cpu *cpu, struct desc_struct *gdt)
-{
-	unsigned int i;
-
-	for (i = GDT_ENTRY_TLS_MIN; i <= GDT_ENTRY_TLS_MAX; i++)
-		gdt[i] = cpu->arch.gdt[i];
-}
-
-/*H:640
- * When the Guest is run on a different CPU, or the GDT entries have changed,
- * copy_gdt() is called to copy the Guest's GDT entries across to this CPU's
- * GDT.
- */
-void copy_gdt(const struct lg_cpu *cpu, struct desc_struct *gdt)
-{
-	unsigned int i;
-
-	/*
-	 * The default entries from setup_default_gdt_entries() are not
-	 * replaced.  See ignored_gdt() above.
-	 */
-	for (i = 0; i < GDT_ENTRIES; i++)
-		if (!ignored_gdt(i))
-			gdt[i] = cpu->arch.gdt[i];
-}
-
-/*H:620
- * This is where the Guest asks us to load a new GDT entry
- * (LHCALL_LOAD_GDT_ENTRY).  We tweak the entry and copy it in.
- */
-void load_guest_gdt_entry(struct lg_cpu *cpu, u32 num, u32 lo, u32 hi)
-{
-	/*
-	 * We assume the Guest has the same number of GDT entries as the
-	 * Host, otherwise we'd have to dynamically allocate the Guest GDT.
-	 */
-	if (num >= ARRAY_SIZE(cpu->arch.gdt)) {
-		kill_guest(cpu, "too many gdt entries %i", num);
-		return;
-	}
-
-	/* Set it up, then fix it. */
-	cpu->arch.gdt[num].a = lo;
-	cpu->arch.gdt[num].b = hi;
-	fixup_gdt_table(cpu, num, num+1);
-	/*
-	 * Mark that the GDT changed so the core knows it has to copy it again,
-	 * even if the Guest is run on the same CPU.
-	 */
-	cpu->changed |= CHANGED_GDT;
-}
-
-/*
- * This is the fast-track version for just changing the three TLS entries.
- * Remember that this happens on every context switch, so it's worth
- * optimizing.  But wouldn't it be neater to have a single hypercall to cover
- * both cases?
- */
-void guest_load_tls(struct lg_cpu *cpu, unsigned long gtls)
-{
-	struct desc_struct *tls = &cpu->arch.gdt[GDT_ENTRY_TLS_MIN];
-
-	__lgread(cpu, tls, gtls, sizeof(*tls)*GDT_ENTRY_TLS_ENTRIES);
-	fixup_gdt_table(cpu, GDT_ENTRY_TLS_MIN, GDT_ENTRY_TLS_MAX+1);
-	/* Note that just the TLS entries have changed. */
-	cpu->changed |= CHANGED_GDT_TLS;
-}
-
-/*H:660
- * With this, we have finished the Host.
- *
- * Five of the seven parts of our task are complete.  You have made it through
- * the Bit of Despair (I think that's somewhere in the page table code,
- * myself).
- *
- * Next, we examine "make Switcher".  It's short, but intense.
- */
diff --git a/drivers/lguest/x86/core.c b/drivers/lguest/x86/core.c
deleted file mode 100644
index b4f79b923aea..000000000000
--- a/drivers/lguest/x86/core.c
+++ /dev/null
@@ -1,724 +0,0 @@
-/*
- * Copyright (C) 2006, Rusty Russell <rusty@rustcorp.com.au> IBM Corporation.
- * Copyright (C) 2007, Jes Sorensen <jes@sgi.com> SGI.
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License as published by
- * the Free Software Foundation; either version 2 of the License, or
- * (at your option) any later version.
- *
- * This program is distributed in the hope that it will be useful, but
- * WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE, GOOD TITLE or
- * NON INFRINGEMENT.  See the GNU General Public License for more
- * details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, write to the Free Software
- * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
- */
-/*P:450
- * This file contains the x86-specific lguest code.  It used to be all
- * mixed in with drivers/lguest/core.c but several foolhardy code slashers
- * wrestled most of the dependencies out to here in preparation for porting
- * lguest to other architectures (see what I mean by foolhardy?).
- *
- * This also contains a couple of non-obvious setup and teardown pieces which
- * were implemented after days of debugging pain.
-:*/
-#include <linux/kernel.h>
-#include <linux/start_kernel.h>
-#include <linux/string.h>
-#include <linux/console.h>
-#include <linux/screen_info.h>
-#include <linux/irq.h>
-#include <linux/interrupt.h>
-#include <linux/clocksource.h>
-#include <linux/clockchips.h>
-#include <linux/cpu.h>
-#include <linux/lguest.h>
-#include <linux/lguest_launcher.h>
-#include <asm/paravirt.h>
-#include <asm/param.h>
-#include <asm/page.h>
-#include <asm/pgtable.h>
-#include <asm/desc.h>
-#include <asm/setup.h>
-#include <asm/lguest.h>
-#include <linux/uaccess.h>
-#include <asm/fpu/internal.h>
-#include <asm/tlbflush.h>
-#include "../lg.h"
-
-static int cpu_had_pge;
-
-static struct {
-	unsigned long offset;
-	unsigned short segment;
-} lguest_entry;
-
-/* Offset from where switcher.S was compiled to where we've copied it */
-static unsigned long switcher_offset(void)
-{
-	return switcher_addr - (unsigned long)start_switcher_text;
-}
-
-/* This cpu's struct lguest_pages (after the Switcher text page) */
-static struct lguest_pages *lguest_pages(unsigned int cpu)
-{
-	return &(((struct lguest_pages *)(switcher_addr + PAGE_SIZE))[cpu]);
-}
-
-static DEFINE_PER_CPU(struct lg_cpu *, lg_last_cpu);
-
-/*S:010
- * We approach the Switcher.
- *
- * Remember that each CPU has two pages which are visible to the Guest when it
- * runs on that CPU.  This has to contain the state for that Guest: we copy the
- * state in just before we run the Guest.
- *
- * Each Guest has "changed" flags which indicate what has changed in the Guest
- * since it last ran.  We saw this set in interrupts_and_traps.c and
- * segments.c.
- */
-static void copy_in_guest_info(struct lg_cpu *cpu, struct lguest_pages *pages)
-{
-	/*
-	 * Copying all this data can be quite expensive.  We usually run the
-	 * same Guest we ran last time (and that Guest hasn't run anywhere else
-	 * meanwhile).  If that's not the case, we pretend everything in the
-	 * Guest has changed.
-	 */
-	if (__this_cpu_read(lg_last_cpu) != cpu || cpu->last_pages != pages) {
-		__this_cpu_write(lg_last_cpu, cpu);
-		cpu->last_pages = pages;
-		cpu->changed = CHANGED_ALL;
-	}
-
-	/*
-	 * These copies are pretty cheap, so we do them unconditionally: */
-	/* Save the current Host top-level page directory.
-	 */
-	pages->state.host_cr3 = __pa(current->mm->pgd);
-	/*
-	 * Set up the Guest's page tables to see this CPU's pages (and no
-	 * other CPU's pages).
-	 */
-	map_switcher_in_guest(cpu, pages);
-	/*
-	 * Set up the two "TSS" members which tell the CPU what stack to use
-	 * for traps which do directly into the Guest (ie. traps at privilege
-	 * level 1).
-	 */
-	pages->state.guest_tss.sp1 = cpu->esp1;
-	pages->state.guest_tss.ss1 = cpu->ss1;
-
-	/* Copy direct-to-Guest trap entries. */
-	if (cpu->changed & CHANGED_IDT)
-		copy_traps(cpu, pages->state.guest_idt, default_idt_entries);
-
-	/* Copy all GDT entries which the Guest can change. */
-	if (cpu->changed & CHANGED_GDT)
-		copy_gdt(cpu, pages->state.guest_gdt);
-	/* If only the TLS entries have changed, copy them. */
-	else if (cpu->changed & CHANGED_GDT_TLS)
-		copy_gdt_tls(cpu, pages->state.guest_gdt);
-
-	/* Mark the Guest as unchanged for next time. */
-	cpu->changed = 0;
-}
-
-/* Finally: the code to actually call into the Switcher to run the Guest. */
-static void run_guest_once(struct lg_cpu *cpu, struct lguest_pages *pages)
-{
-	/* This is a dummy value we need for GCC's sake. */
-	unsigned int clobber;
-
-	/*
-	 * Copy the guest-specific information into this CPU's "struct
-	 * lguest_pages".
-	 */
-	copy_in_guest_info(cpu, pages);
-
-	/*
-	 * Set the trap number to 256 (impossible value).  If we fault while
-	 * switching to the Guest (bad segment registers or bug), this will
-	 * cause us to abort the Guest.
-	 */
-	cpu->regs->trapnum = 256;
-
-	/*
-	 * Now: we push the "eflags" register on the stack, then do an "lcall".
-	 * This is how we change from using the kernel code segment to using
-	 * the dedicated lguest code segment, as well as jumping into the
-	 * Switcher.
-	 *
-	 * The lcall also pushes the old code segment (KERNEL_CS) onto the
-	 * stack, then the address of this call.  This stack layout happens to
-	 * exactly match the stack layout created by an interrupt...
-	 */
-	asm volatile("pushf; lcall *%4"
-		     /*
-		      * This is how we tell GCC that %eax ("a") and %ebx ("b")
-		      * are changed by this routine.  The "=" means output.
-		      */
-		     : "=a"(clobber), "=b"(clobber)
-		     /*
-		      * %eax contains the pages pointer.  ("0" refers to the
-		      * 0-th argument above, ie "a").  %ebx contains the
-		      * physical address of the Guest's top-level page
-		      * directory.
-		      */
-		     : "0"(pages), 
-		       "1"(__pa(cpu->lg->pgdirs[cpu->cpu_pgd].pgdir)),
-		       "m"(lguest_entry)
-		     /*
-		      * We tell gcc that all these registers could change,
-		      * which means we don't have to save and restore them in
-		      * the Switcher.
-		      */
-		     : "memory", "%edx", "%ecx", "%edi", "%esi");
-}
-/*:*/
-
-unsigned long *lguest_arch_regptr(struct lg_cpu *cpu, size_t reg_off, bool any)
-{
-	switch (reg_off) {
-	case offsetof(struct pt_regs, bx):
-		return &cpu->regs->ebx;
-	case offsetof(struct pt_regs, cx):
-		return &cpu->regs->ecx;
-	case offsetof(struct pt_regs, dx):
-		return &cpu->regs->edx;
-	case offsetof(struct pt_regs, si):
-		return &cpu->regs->esi;
-	case offsetof(struct pt_regs, di):
-		return &cpu->regs->edi;
-	case offsetof(struct pt_regs, bp):
-		return &cpu->regs->ebp;
-	case offsetof(struct pt_regs, ax):
-		return &cpu->regs->eax;
-	case offsetof(struct pt_regs, ip):
-		return &cpu->regs->eip;
-	case offsetof(struct pt_regs, sp):
-		return &cpu->regs->esp;
-	}
-
-	/* Launcher can read these, but we don't allow any setting. */
-	if (any) {
-		switch (reg_off) {
-		case offsetof(struct pt_regs, ds):
-			return &cpu->regs->ds;
-		case offsetof(struct pt_regs, es):
-			return &cpu->regs->es;
-		case offsetof(struct pt_regs, fs):
-			return &cpu->regs->fs;
-		case offsetof(struct pt_regs, gs):
-			return &cpu->regs->gs;
-		case offsetof(struct pt_regs, cs):
-			return &cpu->regs->cs;
-		case offsetof(struct pt_regs, flags):
-			return &cpu->regs->eflags;
-		case offsetof(struct pt_regs, ss):
-			return &cpu->regs->ss;
-		}
-	}
-
-	return NULL;
-}
-
-/*M:002
- * There are hooks in the scheduler which we can register to tell when we
- * get kicked off the CPU (preempt_notifier_register()).  This would allow us
- * to lazily disable SYSENTER which would regain some performance, and should
- * also simplify copy_in_guest_info().  Note that we'd still need to restore
- * things when we exit to Launcher userspace, but that's fairly easy.
- *
- * We could also try using these hooks for PGE, but that might be too expensive.
- *
- * The hooks were designed for KVM, but we can also put them to good use.
-:*/
-
-/*H:040
- * This is the i386-specific code to setup and run the Guest.  Interrupts
- * are disabled: we own the CPU.
- */
-void lguest_arch_run_guest(struct lg_cpu *cpu)
-{
-	/*
-	 * SYSENTER is an optimized way of doing system calls.  We can't allow
-	 * it because it always jumps to privilege level 0.  A normal Guest
-	 * won't try it because we don't advertise it in CPUID, but a malicious
-	 * Guest (or malicious Guest userspace program) could, so we tell the
-	 * CPU to disable it before running the Guest.
-	 */
-	if (boot_cpu_has(X86_FEATURE_SEP))
-		wrmsr(MSR_IA32_SYSENTER_CS, 0, 0);
-
-	/*
-	 * Now we actually run the Guest.  It will return when something
-	 * interesting happens, and we can examine its registers to see what it
-	 * was doing.
-	 */
-	run_guest_once(cpu, lguest_pages(raw_smp_processor_id()));
-
-	/*
-	 * Note that the "regs" structure contains two extra entries which are
-	 * not really registers: a trap number which says what interrupt or
-	 * trap made the switcher code come back, and an error code which some
-	 * traps set.
-	 */
-
-	 /* Restore SYSENTER if it's supposed to be on. */
-	 if (boot_cpu_has(X86_FEATURE_SEP))
-		wrmsr(MSR_IA32_SYSENTER_CS, __KERNEL_CS, 0);
-
-	/*
-	 * If the Guest page faulted, then the cr2 register will tell us the
-	 * bad virtual address.  We have to grab this now, because once we
-	 * re-enable interrupts an interrupt could fault and thus overwrite
-	 * cr2, or we could even move off to a different CPU.
-	 */
-	if (cpu->regs->trapnum == 14)
-		cpu->arch.last_pagefault = read_cr2();
-	/*
-	 * Similarly, if we took a trap because the Guest used the FPU,
-	 * we have to restore the FPU it expects to see.
-	 * fpu__restore() may sleep and we may even move off to
-	 * a different CPU. So all the critical stuff should be done
-	 * before this.
-	 */
-	else if (cpu->regs->trapnum == 7 && !fpregs_active())
-		fpu__restore(&current->thread.fpu);
-}
-
-/*H:130
- * Now we've examined the hypercall code; our Guest can make requests.
- * Our Guest is usually so well behaved; it never tries to do things it isn't
- * allowed to, and uses hypercalls instead.  Unfortunately, Linux's paravirtual
- * infrastructure isn't quite complete, because it doesn't contain replacements
- * for the Intel I/O instructions.  As a result, the Guest sometimes fumbles
- * across one during the boot process as it probes for various things which are
- * usually attached to a PC.
- *
- * When the Guest uses one of these instructions, we get a trap (General
- * Protection Fault) and come here.  We queue this to be sent out to the
- * Launcher to handle.
- */
-
-/*
- * The eip contains the *virtual* address of the Guest's instruction:
- * we copy the instruction here so the Launcher doesn't have to walk
- * the page tables to decode it.  We handle the case (eg. in a kernel
- * module) where the instruction is over two pages, and the pages are
- * virtually but not physically contiguous.
- *
- * The longest possible x86 instruction is 15 bytes, but we don't handle
- * anything that strange.
- */
-static void copy_from_guest(struct lg_cpu *cpu,
-			    void *dst, unsigned long vaddr, size_t len)
-{
-	size_t to_page_end = PAGE_SIZE - (vaddr % PAGE_SIZE);
-	unsigned long paddr;
-
-	BUG_ON(len > PAGE_SIZE);
-
-	/* If it goes over a page, copy in two parts. */
-	if (len > to_page_end) {
-		/* But make sure the next page is mapped! */
-		if (__guest_pa(cpu, vaddr + to_page_end, &paddr))
-			copy_from_guest(cpu, dst + to_page_end,
-					vaddr + to_page_end,
-					len - to_page_end);
-		else
-			/* Otherwise fill with zeroes. */
-			memset(dst + to_page_end, 0, len - to_page_end);
-		len = to_page_end;
-	}
-
-	/* This will kill the guest if it isn't mapped, but that
-	 * shouldn't happen. */
-	__lgread(cpu, dst, guest_pa(cpu, vaddr), len);
-}
-
-
-static void setup_emulate_insn(struct lg_cpu *cpu)
-{
-	cpu->pending.trap = 13;
-	copy_from_guest(cpu, cpu->pending.insn, cpu->regs->eip,
-			sizeof(cpu->pending.insn));
-}
-
-static void setup_iomem_insn(struct lg_cpu *cpu, unsigned long iomem_addr)
-{
-	cpu->pending.trap = 14;
-	cpu->pending.addr = iomem_addr;
-	copy_from_guest(cpu, cpu->pending.insn, cpu->regs->eip,
-			sizeof(cpu->pending.insn));
-}
-
-/*H:050 Once we've re-enabled interrupts, we look at why the Guest exited. */
-void lguest_arch_handle_trap(struct lg_cpu *cpu)
-{
-	unsigned long iomem_addr;
-
-	switch (cpu->regs->trapnum) {
-	case 13: /* We've intercepted a General Protection Fault. */
-		/* Hand to Launcher to emulate those pesky IN and OUT insns */
-		if (cpu->regs->errcode == 0) {
-			setup_emulate_insn(cpu);
-			return;
-		}
-		break;
-	case 14: /* We've intercepted a Page Fault. */
-		/*
-		 * The Guest accessed a virtual address that wasn't mapped.
-		 * This happens a lot: we don't actually set up most of the page
-		 * tables for the Guest at all when we start: as it runs it asks
-		 * for more and more, and we set them up as required. In this
-		 * case, we don't even tell the Guest that the fault happened.
-		 *
-		 * The errcode tells whether this was a read or a write, and
-		 * whether kernel or userspace code.
-		 */
-		if (demand_page(cpu, cpu->arch.last_pagefault,
-				cpu->regs->errcode, &iomem_addr))
-			return;
-
-		/* Was this an access to memory mapped IO? */
-		if (iomem_addr) {
-			/* Tell Launcher, let it handle it. */
-			setup_iomem_insn(cpu, iomem_addr);
-			return;
-		}
-
-		/*
-		 * OK, it's really not there (or not OK): the Guest needs to
-		 * know.  We write out the cr2 value so it knows where the
-		 * fault occurred.
-		 *
-		 * Note that if the Guest were really messed up, this could
-		 * happen before it's done the LHCALL_LGUEST_INIT hypercall, so
-		 * lg->lguest_data could be NULL
-		 */
-		if (cpu->lg->lguest_data &&
-		    put_user(cpu->arch.last_pagefault,
-			     &cpu->lg->lguest_data->cr2))
-			kill_guest(cpu, "Writing cr2");
-		break;
-	case 7: /* We've intercepted a Device Not Available fault. */
-		/* No special handling is needed here. */
-		break;
-	case 32 ... 255:
-		/* This might be a syscall. */
-		if (could_be_syscall(cpu->regs->trapnum))
-			break;
-
-		/*
-		 * Other values mean a real interrupt occurred, in which case
-		 * the Host handler has already been run. We just do a
-		 * friendly check if another process should now be run, then
-		 * return to run the Guest again.
-		 */
-		cond_resched();
-		return;
-	case LGUEST_TRAP_ENTRY:
-		/*
-		 * Our 'struct hcall_args' maps directly over our regs: we set
-		 * up the pointer now to indicate a hypercall is pending.
-		 */
-		cpu->hcall = (struct hcall_args *)cpu->regs;
-		return;
-	}
-
-	/* We didn't handle the trap, so it needs to go to the Guest. */
-	if (!deliver_trap(cpu, cpu->regs->trapnum))
-		/*
-		 * If the Guest doesn't have a handler (either it hasn't
-		 * registered any yet, or it's one of the faults we don't let
-		 * it handle), it dies with this cryptic error message.
-		 */
-		kill_guest(cpu, "unhandled trap %li at %#lx (%#lx)",
-			   cpu->regs->trapnum, cpu->regs->eip,
-			   cpu->regs->trapnum == 14 ? cpu->arch.last_pagefault
-			   : cpu->regs->errcode);
-}
-
-/*
- * Now we can look at each of the routines this calls, in increasing order of
- * complexity: do_hypercalls(), emulate_insn(), maybe_do_interrupt(),
- * deliver_trap() and demand_page().  After all those, we'll be ready to
- * examine the Switcher, and our philosophical understanding of the Host/Guest
- * duality will be complete.
-:*/
-static void adjust_pge(void *on)
-{
-	if (on)
-		cr4_set_bits(X86_CR4_PGE);
-	else
-		cr4_clear_bits(X86_CR4_PGE);
-}
-
-/*H:020
- * Now the Switcher is mapped and every thing else is ready, we need to do
- * some more i386-specific initialization.
- */
-void __init lguest_arch_host_init(void)
-{
-	int i;
-
-	/*
-	 * Most of the x86/switcher_32.S doesn't care that it's been moved; on
-	 * Intel, jumps are relative, and it doesn't access any references to
-	 * external code or data.
-	 *
-	 * The only exception is the interrupt handlers in switcher.S: their
-	 * addresses are placed in a table (default_idt_entries), so we need to
-	 * update the table with the new addresses.  switcher_offset() is a
-	 * convenience function which returns the distance between the
-	 * compiled-in switcher code and the high-mapped copy we just made.
-	 */
-	for (i = 0; i < IDT_ENTRIES; i++)
-		default_idt_entries[i] += switcher_offset();
-
-	/*
-	 * Set up the Switcher's per-cpu areas.
-	 *
-	 * Each CPU gets two pages of its own within the high-mapped region
-	 * (aka. "struct lguest_pages").  Much of this can be initialized now,
-	 * but some depends on what Guest we are running (which is set up in
-	 * copy_in_guest_info()).
-	 */
-	for_each_possible_cpu(i) {
-		/* lguest_pages() returns this CPU's two pages. */
-		struct lguest_pages *pages = lguest_pages(i);
-		/* This is a convenience pointer to make the code neater. */
-		struct lguest_ro_state *state = &pages->state;
-
-		/*
-		 * The Global Descriptor Table: the Host has a different one
-		 * for each CPU.  We keep a descriptor for the GDT which says
-		 * where it is and how big it is (the size is actually the last
-		 * byte, not the size, hence the "-1").
-		 */
-		state->host_gdt_desc.size = GDT_SIZE-1;
-		state->host_gdt_desc.address = (long)get_cpu_gdt_rw(i);
-
-		/*
-		 * All CPUs on the Host use the same Interrupt Descriptor
-		 * Table, so we just use store_idt(), which gets this CPU's IDT
-		 * descriptor.
-		 */
-		store_idt(&state->host_idt_desc);
-
-		/*
-		 * The descriptors for the Guest's GDT and IDT can be filled
-		 * out now, too.  We copy the GDT & IDT into ->guest_gdt and
-		 * ->guest_idt before actually running the Guest.
-		 */
-		state->guest_idt_desc.size = sizeof(state->guest_idt)-1;
-		state->guest_idt_desc.address = (long)&state->guest_idt;
-		state->guest_gdt_desc.size = sizeof(state->guest_gdt)-1;
-		state->guest_gdt_desc.address = (long)&state->guest_gdt;
-
-		/*
-		 * We know where we want the stack to be when the Guest enters
-		 * the Switcher: in pages->regs.  The stack grows upwards, so
-		 * we start it at the end of that structure.
-		 */
-		state->guest_tss.sp0 = (long)(&pages->regs + 1);
-		/*
-		 * And this is the GDT entry to use for the stack: we keep a
-		 * couple of special LGUEST entries.
-		 */
-		state->guest_tss.ss0 = LGUEST_DS;
-
-		/*
-		 * x86 can have a finegrained bitmap which indicates what I/O
-		 * ports the process can use.  We set it to the end of our
-		 * structure, meaning "none".
-		 */
-		state->guest_tss.io_bitmap_base = sizeof(state->guest_tss);
-
-		/*
-		 * Some GDT entries are the same across all Guests, so we can
-		 * set them up now.
-		 */
-		setup_default_gdt_entries(state);
-		/* Most IDT entries are the same for all Guests, too.*/
-		setup_default_idt_entries(state, default_idt_entries);
-
-		/*
-		 * The Host needs to be able to use the LGUEST segments on this
-		 * CPU, too, so put them in the Host GDT.
-		 */
-		get_cpu_gdt_rw(i)[GDT_ENTRY_LGUEST_CS] = FULL_EXEC_SEGMENT;
-		get_cpu_gdt_rw(i)[GDT_ENTRY_LGUEST_DS] = FULL_SEGMENT;
-	}
-
-	/*
-	 * In the Switcher, we want the %cs segment register to use the
-	 * LGUEST_CS GDT entry: we've put that in the Host and Guest GDTs, so
-	 * it will be undisturbed when we switch.  To change %cs and jump we
-	 * need this structure to feed to Intel's "lcall" instruction.
-	 */
-	lguest_entry.offset = (long)switch_to_guest + switcher_offset();
-	lguest_entry.segment = LGUEST_CS;
-
-	/*
-	 * Finally, we need to turn off "Page Global Enable".  PGE is an
-	 * optimization where page table entries are specially marked to show
-	 * they never change.  The Host kernel marks all the kernel pages this
-	 * way because it's always present, even when userspace is running.
-	 *
-	 * Lguest breaks this: unbeknownst to the rest of the Host kernel, we
-	 * switch to the Guest kernel.  If you don't disable this on all CPUs,
-	 * you'll get really weird bugs that you'll chase for two days.
-	 *
-	 * I used to turn PGE off every time we switched to the Guest and back
-	 * on when we return, but that slowed the Switcher down noticibly.
-	 */
-
-	/*
-	 * We don't need the complexity of CPUs coming and going while we're
-	 * doing this.
-	 */
-	get_online_cpus();
-	if (boot_cpu_has(X86_FEATURE_PGE)) { /* We have a broader idea of "global". */
-		/* Remember that this was originally set (for cleanup). */
-		cpu_had_pge = 1;
-		/*
-		 * adjust_pge is a helper function which sets or unsets the PGE
-		 * bit on its CPU, depending on the argument (0 == unset).
-		 */
-		on_each_cpu(adjust_pge, (void *)0, 1);
-		/* Turn off the feature in the global feature set. */
-		clear_cpu_cap(&boot_cpu_data, X86_FEATURE_PGE);
-	}
-	put_online_cpus();
-}
-/*:*/
-
-void __exit lguest_arch_host_fini(void)
-{
-	/* If we had PGE before we started, turn it back on now. */
-	get_online_cpus();
-	if (cpu_had_pge) {
-		set_cpu_cap(&boot_cpu_data, X86_FEATURE_PGE);
-		/* adjust_pge's argument "1" means set PGE. */
-		on_each_cpu(adjust_pge, (void *)1, 1);
-	}
-	put_online_cpus();
-}
-
-
-/*H:122 The i386-specific hypercalls simply farm out to the right functions. */
-int lguest_arch_do_hcall(struct lg_cpu *cpu, struct hcall_args *args)
-{
-	switch (args->arg0) {
-	case LHCALL_LOAD_GDT_ENTRY:
-		load_guest_gdt_entry(cpu, args->arg1, args->arg2, args->arg3);
-		break;
-	case LHCALL_LOAD_IDT_ENTRY:
-		load_guest_idt_entry(cpu, args->arg1, args->arg2, args->arg3);
-		break;
-	case LHCALL_LOAD_TLS:
-		guest_load_tls(cpu, args->arg1);
-		break;
-	default:
-		/* Bad Guest.  Bad! */
-		return -EIO;
-	}
-	return 0;
-}
-
-/*H:126 i386-specific hypercall initialization: */
-int lguest_arch_init_hypercalls(struct lg_cpu *cpu)
-{
-	u32 tsc_speed;
-
-	/*
-	 * The pointer to the Guest's "struct lguest_data" is the only argument.
-	 * We check that address now.
-	 */
-	if (!lguest_address_ok(cpu->lg, cpu->hcall->arg1,
-			       sizeof(*cpu->lg->lguest_data)))
-		return -EFAULT;
-
-	/*
-	 * Having checked it, we simply set lg->lguest_data to point straight
-	 * into the Launcher's memory at the right place and then use
-	 * copy_to_user/from_user from now on, instead of lgread/write.  I put
-	 * this in to show that I'm not immune to writing stupid
-	 * optimizations.
-	 */
-	cpu->lg->lguest_data = cpu->lg->mem_base + cpu->hcall->arg1;
-
-	/*
-	 * We insist that the Time Stamp Counter exist and doesn't change with
-	 * cpu frequency.  Some devious chip manufacturers decided that TSC
-	 * changes could be handled in software.  I decided that time going
-	 * backwards might be good for benchmarks, but it's bad for users.
-	 *
-	 * We also insist that the TSC be stable: the kernel detects unreliable
-	 * TSCs for its own purposes, and we use that here.
-	 */
-	if (boot_cpu_has(X86_FEATURE_CONSTANT_TSC) && !check_tsc_unstable())
-		tsc_speed = tsc_khz;
-	else
-		tsc_speed = 0;
-	if (put_user(tsc_speed, &cpu->lg->lguest_data->tsc_khz))
-		return -EFAULT;
-
-	/* The interrupt code might not like the system call vector. */
-	if (!check_syscall_vector(cpu->lg))
-		kill_guest(cpu, "bad syscall vector");
-
-	return 0;
-}
-/*:*/
-
-/*L:030
- * Most of the Guest's registers are left alone: we used get_zeroed_page() to
- * allocate the structure, so they will be 0.
- */
-void lguest_arch_setup_regs(struct lg_cpu *cpu, unsigned long start)
-{
-	struct lguest_regs *regs = cpu->regs;
-
-	/*
-	 * There are four "segment" registers which the Guest needs to boot:
-	 * The "code segment" register (cs) refers to the kernel code segment
-	 * __KERNEL_CS, and the "data", "extra" and "stack" segment registers
-	 * refer to the kernel data segment __KERNEL_DS.
-	 *
-	 * The privilege level is packed into the lower bits.  The Guest runs
-	 * at privilege level 1 (GUEST_PL).
-	 */
-	regs->ds = regs->es = regs->ss = __KERNEL_DS|GUEST_PL;
-	regs->cs = __KERNEL_CS|GUEST_PL;
-
-	/*
-	 * The "eflags" register contains miscellaneous flags.  Bit 1 (0x002)
-	 * is supposed to always be "1".  Bit 9 (0x200) controls whether
-	 * interrupts are enabled.  We always leave interrupts enabled while
-	 * running the Guest.
-	 */
-	regs->eflags = X86_EFLAGS_IF | X86_EFLAGS_FIXED;
-
-	/*
-	 * The "Extended Instruction Pointer" register says where the Guest is
-	 * running.
-	 */
-	regs->eip = start;
-
-	/*
-	 * %esi points to our boot information, at physical address 0, so don't
-	 * touch it.
-	 */
-
-	/* There are a couple of GDT entries the Guest expects at boot. */
-	setup_guest_gdt(cpu);
-}
diff --git a/drivers/lguest/x86/switcher_32.S b/drivers/lguest/x86/switcher_32.S
deleted file mode 100644
index 40634b0db9f7..000000000000
--- a/drivers/lguest/x86/switcher_32.S
+++ /dev/null
@@ -1,388 +0,0 @@
-/*P:900
- * This is the Switcher: code which sits at 0xFFC00000 (or 0xFFE00000) astride
- * both the Host and Guest to do the low-level Guest<->Host switch.  It is as
- * simple as it can be made, but it's naturally very specific to x86.
- *
- * You have now completed Preparation.  If this has whet your appetite; if you
- * are feeling invigorated and refreshed then the next, more challenging stage
- * can be found in "make Guest".
- :*/
-
-/*M:012
- * Lguest is meant to be simple: my rule of thumb is that 1% more LOC must
- * gain at least 1% more performance.  Since neither LOC nor performance can be
- * measured beforehand, it generally means implementing a feature then deciding
- * if it's worth it.  And once it's implemented, who can say no?
- *
- * This is why I haven't implemented this idea myself.  I want to, but I
- * haven't.  You could, though.
- *
- * The main place where lguest performance sucks is Guest page faulting.  When
- * a Guest userspace process hits an unmapped page we switch back to the Host,
- * walk the page tables, find it's not mapped, switch back to the Guest page
- * fault handler, which calls a hypercall to set the page table entry, then
- * finally returns to userspace.  That's two round-trips.
- *
- * If we had a small walker in the Switcher, we could quickly check the Guest
- * page table and if the page isn't mapped, immediately reflect the fault back
- * into the Guest.  This means the Switcher would have to know the top of the
- * Guest page table and the page fault handler address.
- *
- * For simplicity, the Guest should only handle the case where the privilege
- * level of the fault is 3 and probably only not present or write faults.  It
- * should also detect recursive faults, and hand the original fault to the
- * Host (which is actually really easy).
- *
- * Two questions remain.  Would the performance gain outweigh the complexity?
- * And who would write the verse documenting it?
-:*/
-
-/*M:011
- * Lguest64 handles NMI.  This gave me NMI envy (until I looked at their
- * code).  It's worth doing though, since it would let us use oprofile in the
- * Host when a Guest is running.
-:*/
-
-/*S:100
- * Welcome to the Switcher itself!
- *
- * This file contains the low-level code which changes the CPU to run the Guest
- * code, and returns to the Host when something happens.  Understand this, and
- * you understand the heart of our journey.
- *
- * Because this is in assembler rather than C, our tale switches from prose to
- * verse.  First I tried limericks:
- *
- *	There once was an eax reg,
- *	To which our pointer was fed,
- *	It needed an add,
- *	Which asm-offsets.h had
- *	But this limerick is hurting my head.
- *
- * Next I tried haikus, but fitting the required reference to the seasons in
- * every stanza was quickly becoming tiresome:
- *
- *	The %eax reg
- *	Holds "struct lguest_pages" now:
- *	Cherry blossoms fall.
- *
- * Then I started with Heroic Verse, but the rhyming requirement leeched away
- * the content density and led to some uniquely awful oblique rhymes:
- *
- *	These constants are coming from struct offsets
- *	For use within the asm switcher text.
- *
- * Finally, I settled for something between heroic hexameter, and normal prose
- * with inappropriate linebreaks.  Anyway, it aint no Shakespeare.
- */
-
-// Not all kernel headers work from assembler
-// But these ones are needed: the ENTRY() define
-// And constants extracted from struct offsets
-// To avoid magic numbers and breakage:
-// Should they change the compiler can't save us
-// Down here in the depths of assembler code.
-#include <linux/linkage.h>
-#include <asm/asm-offsets.h>
-#include <asm/page.h>
-#include <asm/segment.h>
-#include <asm/lguest.h>
-
-// We mark the start of the code to copy
-// It's placed in .text tho it's never run here
-// You'll see the trick macro at the end
-// Which interleaves data and text to effect.
-.text
-ENTRY(start_switcher_text)
-
-// When we reach switch_to_guest we have just left
-// The safe and comforting shores of C code
-// %eax has the "struct lguest_pages" to use
-// Where we save state and still see it from the Guest
-// And %ebx holds the Guest shadow pagetable:
-// Once set we have truly left Host behind.
-ENTRY(switch_to_guest)
-	// We told gcc all its regs could fade,
-	// Clobbered by our journey into the Guest
-	// We could have saved them, if we tried
-	// But time is our master and cycles count.
-
-	// Segment registers must be saved for the Host
-	// We push them on the Host stack for later
-	pushl	%es
-	pushl	%ds
-	pushl	%gs
-	pushl	%fs
-	// But the compiler is fickle, and heeds
-	// No warning of %ebp clobbers
-	// When frame pointers are used.  That register
-	// Must be saved and restored or chaos strikes.
-	pushl	%ebp
-	// The Host's stack is done, now save it away
-	// In our "struct lguest_pages" at offset
-	// Distilled into asm-offsets.h
-	movl	%esp, LGUEST_PAGES_host_sp(%eax)
-
-	// All saved and there's now five steps before us:
-	// Stack, GDT, IDT, TSS
-	// Then last of all the page tables are flipped.
-
-	// Yet beware that our stack pointer must be
-	// Always valid lest an NMI hits
-	// %edx does the duty here as we juggle
-	// %eax is lguest_pages: our stack lies within.
-	movl	%eax, %edx
-	addl	$LGUEST_PAGES_regs, %edx
-	movl	%edx, %esp
-
-	// The Guest's GDT we so carefully
-	// Placed in the "struct lguest_pages" before
-	lgdt	LGUEST_PAGES_guest_gdt_desc(%eax)
-
-	// The Guest's IDT we did partially
-	// Copy to "struct lguest_pages" as well.
-	lidt	LGUEST_PAGES_guest_idt_desc(%eax)
-
-	// The TSS entry which controls traps
-	// Must be loaded up with "ltr" now:
-	// The GDT entry that TSS uses 
-	// Changes type when we load it: damn Intel!
-	// For after we switch over our page tables
-	// That entry will be read-only: we'd crash.
-	movl	$(GDT_ENTRY_TSS*8), %edx
-	ltr	%dx
-
-	// Look back now, before we take this last step!
-	// The Host's TSS entry was also marked used;
-	// Let's clear it again for our return.
-	// The GDT descriptor of the Host
-	// Points to the table after two "size" bytes
-	movl	(LGUEST_PAGES_host_gdt_desc+2)(%eax), %edx
-	// Clear "used" from type field (byte 5, bit 2)
-	andb	$0xFD, (GDT_ENTRY_TSS*8 + 5)(%edx)
-
-	// Once our page table's switched, the Guest is live!
-	// The Host fades as we run this final step.
-	// Our "struct lguest_pages" is now read-only.
-	movl	%ebx, %cr3
-
-	// The page table change did one tricky thing:
-	// The Guest's register page has been mapped
-	// Writable under our %esp (stack) --
-	// We can simply pop off all Guest regs.
-	popl	%eax
-	popl	%ebx
-	popl	%ecx
-	popl	%edx
-	popl	%esi
-	popl	%edi
-	popl	%ebp
-	popl	%gs
-	popl	%fs
-	popl	%ds
-	popl	%es
-
-	// Near the base of the stack lurk two strange fields
-	// Which we fill as we exit the Guest
-	// These are the trap number and its error
-	// We can simply step past them on our way.
-	addl	$8, %esp
-
-	// The last five stack slots hold return address
-	// And everything needed to switch privilege
-	// From Switcher's level 0 to Guest's 1,
-	// And the stack where the Guest had last left it.
-	// Interrupts are turned back on: we are Guest.
-	iret
-
-// We tread two paths to switch back to the Host
-// Yet both must save Guest state and restore Host
-// So we put the routine in a macro.
-#define SWITCH_TO_HOST							\
-	/* We save the Guest state: all registers first			\
-	 * Laid out just as "struct lguest_regs" defines */		\
-	pushl	%es;							\
-	pushl	%ds;							\
-	pushl	%fs;							\
-	pushl	%gs;							\
-	pushl	%ebp;							\
-	pushl	%edi;							\
-	pushl	%esi;							\
-	pushl	%edx;							\
-	pushl	%ecx;							\
-	pushl	%ebx;							\
-	pushl	%eax;							\
-	/* Our stack and our code are using segments			\
-	 * Set in the TSS and IDT					\
-	 * Yet if we were to touch data we'd use			\
-	 * Whatever data segment the Guest had.				\
-	 * Load the lguest ds segment for now. */			\
-	movl	$(LGUEST_DS), %eax;					\
-	movl	%eax, %ds;						\
-	/* So where are we?  Which CPU, which struct?			\
-	 * The stack is our clue: our TSS starts			\
-	 * It at the end of "struct lguest_pages".			\
-	 * Or we may have stumbled while restoring			\
-	 * Our Guest segment regs while in switch_to_guest,		\
-	 * The fault pushed atop that part-unwound stack.		\
-	 * If we round the stack down to the page start			\
-	 * We're at the start of "struct lguest_pages". */		\
-	movl	%esp, %eax;						\
-	andl	$(~(1 << PAGE_SHIFT - 1)), %eax;			\
-	/* Save our trap number: the switch will obscure it		\
-	 * (In the Host the Guest regs are not mapped here)		\
-	 * %ebx holds it safe for deliver_to_host */			\
-	movl	LGUEST_PAGES_regs_trapnum(%eax), %ebx;			\
-	/* The Host GDT, IDT and stack!					\
-	 * All these lie safely hidden from the Guest:			\
-	 * We must return to the Host page tables			\
-	 * (Hence that was saved in struct lguest_pages) */		\
-	movl	LGUEST_PAGES_host_cr3(%eax), %edx;			\
-	movl	%edx, %cr3;						\
-	/* As before, when we looked back at the Host			\
-	 * As we left and marked TSS unused				\
-	 * So must we now for the Guest left behind. */			\
-	andb	$0xFD, (LGUEST_PAGES_guest_gdt+GDT_ENTRY_TSS*8+5)(%eax); \
-	/* Switch to Host's GDT, IDT. */				\
-	lgdt	LGUEST_PAGES_host_gdt_desc(%eax);			\
-	lidt	LGUEST_PAGES_host_idt_desc(%eax);			\
-	/* Restore the Host's stack where its saved regs lie */		\
-	movl	LGUEST_PAGES_host_sp(%eax), %esp;			\
-	/* Last the TSS: our Host is returned */			\
-	movl	$(GDT_ENTRY_TSS*8), %edx;				\
-	ltr	%dx;							\
-	/* Restore now the regs saved right at the first. */		\
-	popl	%ebp;							\
-	popl	%fs;							\
-	popl	%gs;							\
-	popl	%ds;							\
-	popl	%es
-
-// The first path is trod when the Guest has trapped:
-// (Which trap it was has been pushed on the stack).
-// We need only switch back, and the Host will decode
-// Why we came home, and what needs to be done.
-return_to_host:
-	SWITCH_TO_HOST
-	iret
-
-// We are lead to the second path like so:
-// An interrupt, with some cause external
-// Has ajerked us rudely from the Guest's code
-// Again we must return home to the Host
-deliver_to_host:
-	SWITCH_TO_HOST
-	// But now we must go home via that place
-	// Where that interrupt was supposed to go
-	// Had we not been ensconced, running the Guest.
-	// Here we see the trickness of run_guest_once():
-	// The Host stack is formed like an interrupt
-	// With EIP, CS and EFLAGS layered.
-	// Interrupt handlers end with "iret"
-	// And that will take us home at long long last.
-
-	// But first we must find the handler to call!
-	// The IDT descriptor for the Host
-	// Has two bytes for size, and four for address:
-	// %edx will hold it for us for now.
-	movl	(LGUEST_PAGES_host_idt_desc+2)(%eax), %edx
-	// We now know the table address we need,
-	// And saved the trap's number inside %ebx.
-	// Yet the pointer to the handler is smeared
-	// Across the bits of the table entry.
-	// What oracle can tell us how to extract
-	// From such a convoluted encoding?
-	// I consulted gcc, and it gave
-	// These instructions, which I gladly credit:
-	leal	(%edx,%ebx,8), %eax
-	movzwl	(%eax),%edx
-	movl	4(%eax), %eax
-	xorw	%ax, %ax
-	orl	%eax, %edx
-	// Now the address of the handler's in %edx
-	// We call it now: its "iret" drops us home.
-	jmp	*%edx
-
-// Every interrupt can come to us here
-// But we must truly tell each apart.
-// They number two hundred and fifty six
-// And each must land in a different spot,
-// Push its number on stack, and join the stream.
-
-// And worse, a mere six of the traps stand apart
-// And push on their stack an addition:
-// An error number, thirty two bits long
-// So we punish the other two fifty
-// And make them push a zero so they match.
-
-// Yet two fifty six entries is long
-// And all will look most the same as the last
-// So we create a macro which can make
-// As many entries as we need to fill.
-
-// Note the change to .data then .text:
-// We plant the address of each entry
-// Into a (data) table for the Host
-// To know where each Guest interrupt should go.
-.macro IRQ_STUB N TARGET
-	.data; .long 1f; .text; 1:
- // Trap eight, ten through fourteen and seventeen
- // Supply an error number.  Else zero.
- .if (\N <> 8) && (\N < 10 || \N > 14) && (\N <> 17)
-	pushl	$0
- .endif
-	pushl	$\N
-	jmp	\TARGET
-	ALIGN
-.endm
-
-// This macro creates numerous entries
-// Using GAS macros which out-power C's.
-.macro IRQ_STUBS FIRST LAST TARGET
- irq=\FIRST
- .rept \LAST-\FIRST+1
-	IRQ_STUB irq \TARGET
-  irq=irq+1
- .endr
-.endm
-
-// Here's the marker for our pointer table
-// Laid in the data section just before
-// Each macro places the address of code
-// Forming an array: each one points to text
-// Which handles interrupt in its turn.
-.data
-.global default_idt_entries
-default_idt_entries:
-.text
-	// The first two traps go straight back to the Host
-	IRQ_STUBS 0 1 return_to_host
-	// We'll say nothing, yet, about NMI
-	IRQ_STUB 2 handle_nmi
-	// Other traps also return to the Host
-	IRQ_STUBS 3 31 return_to_host
-	// All interrupts go via their handlers
-	IRQ_STUBS 32 127 deliver_to_host
-	// 'Cept system calls coming from userspace
-	// Are to go to the Guest, never the Host.
-	IRQ_STUB 128 return_to_host
-	IRQ_STUBS 129 255 deliver_to_host
-
-// The NMI, what a fabulous beast
-// Which swoops in and stops us no matter that
-// We're suspended between heaven and hell,
-// (Or more likely between the Host and Guest)
-// When in it comes!  We are dazed and confused
-// So we do the simplest thing which one can.
-// Though we've pushed the trap number and zero
-// We discard them, return, and hope we live.
-handle_nmi:
-	addl	$8, %esp
-	iret
-
-// We are done; all that's left is Mastery
-// And "make Mastery" is a journey long
-// Designed to make your fingers itch to code.
-
-// Here ends the text, the file and poem.
-ENTRY(end_switcher_text)
diff --git a/drivers/net/Kconfig b/drivers/net/Kconfig
index 83a1616903f8..aba0d652095b 100644
--- a/drivers/net/Kconfig
+++ b/drivers/net/Kconfig
@@ -333,7 +333,7 @@ config VIRTIO_NET
 	depends on VIRTIO
 	---help---
 	  This is the virtual network driver for virtio.  It can be used with
-	  lguest or QEMU based VMMs (like KVM or Xen).  Say Y or M.
+	  QEMU based VMMs (like KVM or Xen).  Say Y or M.
 
 config NLMON
 	tristate "Virtual netlink monitoring device"
diff --git a/drivers/tty/hvc/Kconfig b/drivers/tty/hvc/Kconfig
index b8d5ea0ae26b..fec457edad14 100644
--- a/drivers/tty/hvc/Kconfig
+++ b/drivers/tty/hvc/Kconfig
@@ -4,7 +4,7 @@ config HVC_DRIVER
 	bool
 	help
 	  Generic "hypervisor virtual console" infrastructure for various
-	  hypervisors (pSeries, iSeries, Xen, lguest).
+	  hypervisors (pSeries, iSeries, Xen).
 	  It will automatically be selected if one of the back-end console drivers
 	  is selected.
 
diff --git a/drivers/virtio/Kconfig b/drivers/virtio/Kconfig
index 623f72334fa5..cff773f15b7e 100644
--- a/drivers/virtio/Kconfig
+++ b/drivers/virtio/Kconfig
@@ -2,8 +2,8 @@ config VIRTIO
 	tristate
 	---help---
 	  This option is selected by any driver which implements the virtio
-	  bus, such as CONFIG_VIRTIO_PCI, CONFIG_VIRTIO_MMIO, CONFIG_LGUEST,
-	  CONFIG_RPMSG or CONFIG_S390_GUEST.
+	  bus, such as CONFIG_VIRTIO_PCI, CONFIG_VIRTIO_MMIO, CONFIG_RPMSG
+	  or CONFIG_S390_GUEST.
 
 menu "Virtio drivers"
 
diff --git a/include/linux/lguest.h b/include/linux/lguest.h
deleted file mode 100644
index 6db19f35f7c5..000000000000
--- a/include/linux/lguest.h
+++ /dev/null
@@ -1,73 +0,0 @@
-/*
- * Things the lguest guest needs to know.  Note: like all lguest interfaces,
- * this is subject to wild and random change between versions.
- */
-#ifndef _LINUX_LGUEST_H
-#define _LINUX_LGUEST_H
-
-#ifndef __ASSEMBLY__
-#include <linux/time.h>
-#include <asm/irq.h>
-#include <asm/lguest_hcall.h>
-
-#define LG_CLOCK_MIN_DELTA	100UL
-#define LG_CLOCK_MAX_DELTA	ULONG_MAX
-
-/*G:031
- * The second method of communicating with the Host is to via "struct
- * lguest_data".  Once the Guest's initialization hypercall tells the Host where
- * this is, the Guest and Host both publish information in it.
-:*/
-struct lguest_data {
-	/*
-	 * 512 == enabled (same as eflags in normal hardware).  The Guest
-	 * changes interrupts so often that a hypercall is too slow.
-	 */
-	unsigned int irq_enabled;
-	/* Fine-grained interrupt disabling by the Guest */
-	DECLARE_BITMAP(blocked_interrupts, LGUEST_IRQS);
-
-	/*
-	 * The Host writes the virtual address of the last page fault here,
-	 * which saves the Guest a hypercall.  CR2 is the native register where
-	 * this address would normally be found.
-	 */
-	unsigned long cr2;
-
-	/* Wallclock time set by the Host. */
-	struct timespec time;
-
-	/*
-	 * Interrupt pending set by the Host.  The Guest should do a hypercall
-	 * if it re-enables interrupts and sees this set (to X86_EFLAGS_IF).
-	 */
-	int irq_pending;
-
-	/*
-	 * Async hypercall ring.  Instead of directly making hypercalls, we can
-	 * place them in here for processing the next time the Host wants.
-	 * This batching can be quite efficient.
-	 */
-
-	/* 0xFF == done (set by Host), 0 == pending (set by Guest). */
-	u8 hcall_status[LHCALL_RING_SIZE];
-	/* The actual registers for the hypercalls. */
-	struct hcall_args hcalls[LHCALL_RING_SIZE];
-
-/* Fields initialized by the Host at boot: */
-	/* Memory not to try to access */
-	unsigned long reserve_mem;
-	/* KHz for the TSC clock. */
-	u32 tsc_khz;
-
-/* Fields initialized by the Guest at boot: */
-	/* Instruction to suppress interrupts even if enabled */
-	unsigned long noirq_iret;
-	/* Address above which page tables are all identical. */
-	unsigned long kernel_address;
-	/* The vector to try to use for system calls (0x40 or 0x80). */
-	unsigned int syscall_vec;
-};
-extern struct lguest_data lguest_data;
-#endif /* __ASSEMBLY__ */
-#endif	/* _LINUX_LGUEST_H */
diff --git a/include/linux/lguest_launcher.h b/include/linux/lguest_launcher.h
deleted file mode 100644
index acd5b12565cc..000000000000
--- a/include/linux/lguest_launcher.h
+++ /dev/null
@@ -1,44 +0,0 @@
-#ifndef _LINUX_LGUEST_LAUNCHER
-#define _LINUX_LGUEST_LAUNCHER
-/* Everything the "lguest" userspace program needs to know. */
-#include <linux/types.h>
-
-/*D:010
- * Drivers
- *
- * The Guest needs devices to do anything useful.  Since we don't let it touch
- * real devices (think of the damage it could do!) we provide virtual devices.
- * We emulate a PCI bus with virtio devices on it; we used to have our own
- * lguest bus which was far simpler, but this tests the virtio 1.0 standard.
- *
- * Virtio devices are also used by kvm, so we can simply reuse their optimized
- * device drivers.  And one day when everyone uses virtio, my plan will be
- * complete.  Bwahahahah!
- */
-
-/* Write command first word is a request. */
-enum lguest_req
-{
-	LHREQ_INITIALIZE, /* + base, pfnlimit, start */
-	LHREQ_GETDMA, /* No longer used */
-	LHREQ_IRQ, /* + irq */
-	LHREQ_BREAK, /* No longer used */
-	LHREQ_EVENTFD, /* No longer used. */
-	LHREQ_GETREG, /* + offset within struct pt_regs (then read value). */
-	LHREQ_SETREG, /* + offset within struct pt_regs, value. */
-	LHREQ_TRAP, /* + trap number to deliver to guest. */
-};
-
-/*
- * This is what read() of the lguest fd populates.  trap ==
- * LGUEST_TRAP_ENTRY for an LHCALL_NOTIFY (addr is the
- * argument), 14 for a page fault in the MMIO region (addr is
- * the trap address, insn is the instruction), or 13 for a GPF
- * (insn is the instruction).
- */
-struct lguest_pending {
-	__u8 trap;
-	__u8 insn[7];
-	__u32 addr;
-};
-#endif /* _LINUX_LGUEST_LAUNCHER */
diff --git a/include/uapi/linux/virtio_ring.h b/include/uapi/linux/virtio_ring.h
index c07295969b7e..6d5d5faa989b 100644
--- a/include/uapi/linux/virtio_ring.h
+++ b/include/uapi/linux/virtio_ring.h
@@ -1,7 +1,7 @@
 #ifndef _UAPI_LINUX_VIRTIO_RING_H
 #define _UAPI_LINUX_VIRTIO_RING_H
-/* An interface for efficient virtio implementation, currently for use by KVM
- * and lguest, but hopefully others soon.  Do NOT change this since it will
+/* An interface for efficient virtio implementation, currently for use by KVM,
+ * but hopefully others soon.  Do NOT change this since it will
  * break existing servers and clients.
  *
  * This header is BSD licensed so anyone can use the definitions to implement
diff --git a/tools/Makefile b/tools/Makefile
index 221e1ce78b06..a19b176b914b 100644
--- a/tools/Makefile
+++ b/tools/Makefile
@@ -18,7 +18,6 @@ help:
 	@echo '  iio                    - IIO tools'
 	@echo '  kvm_stat               - top-like utility for displaying kvm statistics'
 	@echo '  leds                   - LEDs  tools'
-	@echo '  lguest                 - a minimal 32-bit x86 hypervisor'
 	@echo '  liblockdep             - user-space wrapper for kernel locking-validator'
 	@echo '  net                    - misc networking tools'
 	@echo '  perf                   - Linux performance measurement and analysis tool'
@@ -90,7 +89,7 @@ freefall: FORCE
 kvm_stat: FORCE
 	$(call descend,kvm/$@)
 
-all: acpi cgroup cpupower gpio hv firewire lguest liblockdep \
+all: acpi cgroup cpupower gpio hv firewire liblockdep \
 		perf selftests turbostat usb \
 		virtio vm net x86_energy_perf_policy \
 		tmon freefall objtool kvm_stat
@@ -101,7 +100,7 @@ acpi_install:
 cpupower_install:
 	$(call descend,power/$(@:_install=),install)
 
-cgroup_install firewire_install gpio_install hv_install lguest_install perf_install usb_install virtio_install vm_install net_install objtool_install:
+cgroup_install firewire_install gpio_install hv_install perf_install usb_install virtio_install vm_install net_install objtool_install:
 	$(call descend,$(@:_install=),install)
 
 liblockdep_install:
@@ -123,7 +122,7 @@ kvm_stat_install:
 	$(call descend,kvm/$(@:_install=),install)
 
 install: acpi_install cgroup_install cpupower_install gpio_install \
-		hv_install firewire_install lguest_install liblockdep_install \
+		hv_install firewire_install liblockdep_install \
 		perf_install selftests_install turbostat_install usb_install \
 		virtio_install vm_install net_install x86_energy_perf_policy_install \
 		tmon_install freefall_install objtool_install kvm_stat_install
@@ -134,7 +133,7 @@ acpi_clean:
 cpupower_clean:
 	$(call descend,power/cpupower,clean)
 
-cgroup_clean hv_clean firewire_clean lguest_clean spi_clean usb_clean virtio_clean vm_clean net_clean iio_clean gpio_clean objtool_clean leds_clean:
+cgroup_clean hv_clean firewire_clean spi_clean usb_clean virtio_clean vm_clean net_clean iio_clean gpio_clean objtool_clean leds_clean:
 	$(call descend,$(@:_clean=),clean)
 
 liblockdep_clean:
@@ -168,7 +167,7 @@ freefall_clean:
 build_clean:
 	$(call descend,build,clean)
 
-clean: acpi_clean cgroup_clean cpupower_clean hv_clean firewire_clean lguest_clean \
+clean: acpi_clean cgroup_clean cpupower_clean hv_clean firewire_clean \
 		perf_clean selftests_clean turbostat_clean spi_clean usb_clean virtio_clean \
 		vm_clean net_clean iio_clean x86_energy_perf_policy_clean tmon_clean \
 		freefall_clean build_clean libbpf_clean libsubcmd_clean liblockdep_clean \
diff --git a/tools/lguest/.gitignore b/tools/lguest/.gitignore
deleted file mode 100644
index 8d9a8383a52e..000000000000
--- a/tools/lguest/.gitignore
+++ /dev/null
@@ -1,2 +0,0 @@
-lguest
-include
diff --git a/tools/lguest/Makefile b/tools/lguest/Makefile
deleted file mode 100644
index d04599a79802..000000000000
--- a/tools/lguest/Makefile
+++ /dev/null
@@ -1,14 +0,0 @@
-# This creates the demonstration utility "lguest" which runs a Linux guest.
-CFLAGS:=-m32 -Wall -Wmissing-declarations -Wmissing-prototypes -O3 -U_FORTIFY_SOURCE -Iinclude
-
-all: lguest
-
-include/linux/virtio_types.h: ../../include/uapi/linux/virtio_types.h
-	mkdir -p include/linux 2>&1 || true
-	ln -sf ../../../../include/uapi/linux/virtio_types.h $@
-
-lguest: include/linux/virtio_types.h
-
-clean:
-	rm -f lguest
-	rm -rf include
diff --git a/tools/lguest/extract b/tools/lguest/extract
deleted file mode 100644
index 7730bb6e4b94..000000000000
--- a/tools/lguest/extract
+++ /dev/null
@@ -1,58 +0,0 @@
-#! /bin/sh
-
-set -e
-
-PREFIX=$1
-shift
-
-trap 'rm -r $TMPDIR' 0
-TMPDIR=`mktemp -d`
-
-exec 3>/dev/null
-for f; do
-    while IFS="
-" read -r LINE; do
-	case "$LINE" in
-	    *$PREFIX:[0-9]*:\**)
-		NUM=`echo "$LINE" | sed "s/.*$PREFIX:\([0-9]*\).*/\1/"`
-		if [ -f $TMPDIR/$NUM ]; then
-		    echo "$TMPDIR/$NUM already exits prior to $f"
-		    exit 1
-		fi
-		exec 3>>$TMPDIR/$NUM
-		echo $f | sed 's,\.\./,,g' > $TMPDIR/.$NUM
-		/bin/echo "$LINE" | sed -e "s/$PREFIX:[0-9]*//" -e "s/:\*/*/" >&3
-		;;
-	    *$PREFIX:[0-9]*)
-		NUM=`echo "$LINE" | sed "s/.*$PREFIX:\([0-9]*\).*/\1/"`
-		if [ -f $TMPDIR/$NUM ]; then
-		    echo "$TMPDIR/$NUM already exits prior to $f"
-		    exit 1
-		fi
-		exec 3>>$TMPDIR/$NUM
-		echo $f | sed 's,\.\./,,g' > $TMPDIR/.$NUM
-		/bin/echo "$LINE" | sed "s/$PREFIX:[0-9]*//" >&3
-		;;
-	    *:\**)
-		/bin/echo "$LINE" | sed -e "s/:\*/*/" -e "s,/\*\*/,," >&3
-		echo >&3
-		exec 3>/dev/null
-		;;
-	    *)
-		/bin/echo "$LINE" >&3
-		;;
-	esac
-    done < $f
-    echo >&3
-    exec 3>/dev/null
-done
-
-LASTFILE=""
-for f in $TMPDIR/*; do
-    if [ "$LASTFILE" != $(cat $TMPDIR/.$(basename $f) ) ]; then
-	LASTFILE=$(cat $TMPDIR/.$(basename $f) )
-	echo "[ $LASTFILE ]"
-    fi
-    cat $f
-done
-
diff --git a/tools/lguest/lguest.c b/tools/lguest/lguest.c
deleted file mode 100644
index 897cd6f3f687..000000000000
--- a/tools/lguest/lguest.c
+++ /dev/null
@@ -1,3420 +0,0 @@
-/*P:100
- * This is the Launcher code, a simple program which lays out the "physical"
- * memory for the new Guest by mapping the kernel image and the virtual
- * devices, then opens /dev/lguest to tell the kernel about the Guest and
- * control it.
-:*/
-#define _LARGEFILE64_SOURCE
-#define _GNU_SOURCE
-#include <stdio.h>
-#include <string.h>
-#include <unistd.h>
-#include <err.h>
-#include <stdint.h>
-#include <stdlib.h>
-#include <elf.h>
-#include <sys/mman.h>
-#include <sys/param.h>
-#include <sys/types.h>
-#include <sys/stat.h>
-#include <sys/wait.h>
-#include <sys/eventfd.h>
-#include <fcntl.h>
-#include <stdbool.h>
-#include <errno.h>
-#include <ctype.h>
-#include <sys/socket.h>
-#include <sys/ioctl.h>
-#include <sys/time.h>
-#include <time.h>
-#include <netinet/in.h>
-#include <net/if.h>
-#include <linux/sockios.h>
-#include <linux/if_tun.h>
-#include <sys/uio.h>
-#include <termios.h>
-#include <getopt.h>
-#include <assert.h>
-#include <sched.h>
-#include <limits.h>
-#include <stddef.h>
-#include <signal.h>
-#include <pwd.h>
-#include <grp.h>
-#include <sys/user.h>
-#include <linux/pci_regs.h>
-
-#ifndef VIRTIO_F_ANY_LAYOUT
-#define VIRTIO_F_ANY_LAYOUT		27
-#endif
-
-/*L:110
- * We can ignore the 43 include files we need for this program, but I do want
- * to draw attention to the use of kernel-style types.
- *
- * As Linus said, "C is a Spartan language, and so should your naming be."  I
- * like these abbreviations, so we define them here.  Note that u64 is always
- * unsigned long long, which works on all Linux systems: this means that we can
- * use %llu in printf for any u64.
- */
-typedef unsigned long long u64;
-typedef uint32_t u32;
-typedef uint16_t u16;
-typedef uint8_t u8;
-/*:*/
-
-#define VIRTIO_CONFIG_NO_LEGACY
-#define VIRTIO_PCI_NO_LEGACY
-#define VIRTIO_BLK_NO_LEGACY
-#define VIRTIO_NET_NO_LEGACY
-
-/* Use in-kernel ones, which defines VIRTIO_F_VERSION_1 */
-#include "../../include/uapi/linux/virtio_config.h"
-#include "../../include/uapi/linux/virtio_net.h"
-#include "../../include/uapi/linux/virtio_blk.h"
-#include "../../include/uapi/linux/virtio_console.h"
-#include "../../include/uapi/linux/virtio_rng.h"
-#include <linux/virtio_ring.h>
-#include "../../include/uapi/linux/virtio_pci.h"
-#include <asm/bootparam.h>
-#include "../../include/linux/lguest_launcher.h"
-
-#define BRIDGE_PFX "bridge:"
-#ifndef SIOCBRADDIF
-#define SIOCBRADDIF	0x89a2		/* add interface to bridge      */
-#endif
-/* We can have up to 256 pages for devices. */
-#define DEVICE_PAGES 256
-/* This will occupy 3 pages: it must be a power of 2. */
-#define VIRTQUEUE_NUM 256
-
-/*L:120
- * verbose is both a global flag and a macro.  The C preprocessor allows
- * this, and although I wouldn't recommend it, it works quite nicely here.
- */
-static bool verbose;
-#define verbose(args...) \
-	do { if (verbose) printf(args); } while(0)
-/*:*/
-
-/* The pointer to the start of guest memory. */
-static void *guest_base;
-/* The maximum guest physical address allowed, and maximum possible. */
-static unsigned long guest_limit, guest_max, guest_mmio;
-/* The /dev/lguest file descriptor. */
-static int lguest_fd;
-
-/* a per-cpu variable indicating whose vcpu is currently running */
-static unsigned int __thread cpu_id;
-
-/* 5 bit device number in the PCI_CONFIG_ADDR => 32 only */
-#define MAX_PCI_DEVICES 32
-
-/* This is our list of devices. */
-struct device_list {
-	/* Counter to assign interrupt numbers. */
-	unsigned int next_irq;
-
-	/* Counter to print out convenient device numbers. */
-	unsigned int device_num;
-
-	/* PCI devices. */
-	struct device *pci[MAX_PCI_DEVICES];
-};
-
-/* The list of Guest devices, based on command line arguments. */
-static struct device_list devices;
-
-/*
- * Just like struct virtio_pci_cfg_cap in uapi/linux/virtio_pci.h,
- * but uses a u32 explicitly for the data.
- */
-struct virtio_pci_cfg_cap_u32 {
-	struct virtio_pci_cap cap;
-	u32 pci_cfg_data; /* Data for BAR access. */
-};
-
-struct virtio_pci_mmio {
-	struct virtio_pci_common_cfg cfg;
-	u16 notify;
-	u8 isr;
-	u8 padding;
-	/* Device-specific configuration follows this. */
-};
-
-/* This is the layout (little-endian) of the PCI config space. */
-struct pci_config {
-	u16 vendor_id, device_id;
-	u16 command, status;
-	u8 revid, prog_if, subclass, class;
-	u8 cacheline_size, lat_timer, header_type, bist;
-	u32 bar[6];
-	u32 cardbus_cis_ptr;
-	u16 subsystem_vendor_id, subsystem_device_id;
-	u32 expansion_rom_addr;
-	u8 capabilities, reserved1[3];
-	u32 reserved2;
-	u8 irq_line, irq_pin, min_grant, max_latency;
-
-	/* Now, this is the linked capability list. */
-	struct virtio_pci_cap common;
-	struct virtio_pci_notify_cap notify;
-	struct virtio_pci_cap isr;
-	struct virtio_pci_cap device;
-	struct virtio_pci_cfg_cap_u32 cfg_access;
-};
-
-/* The device structure describes a single device. */
-struct device {
-	/* The name of this device, for --verbose. */
-	const char *name;
-
-	/* Any queues attached to this device */
-	struct virtqueue *vq;
-
-	/* Is it operational */
-	bool running;
-
-	/* Has it written FEATURES_OK but not re-checked it? */
-	bool wrote_features_ok;
-
-	/* PCI configuration */
-	union {
-		struct pci_config config;
-		u32 config_words[sizeof(struct pci_config) / sizeof(u32)];
-	};
-
-	/* Features we offer, and those accepted. */
-	u64 features, features_accepted;
-
-	/* Device-specific config hangs off the end of this. */
-	struct virtio_pci_mmio *mmio;
-
-	/* PCI MMIO resources (all in BAR0) */
-	size_t mmio_size;
-	u32 mmio_addr;
-
-	/* Device-specific data. */
-	void *priv;
-};
-
-/* The virtqueue structure describes a queue attached to a device. */
-struct virtqueue {
-	struct virtqueue *next;
-
-	/* Which device owns me. */
-	struct device *dev;
-
-	/* Name for printing errors. */
-	const char *name;
-
-	/* The actual ring of buffers. */
-	struct vring vring;
-
-	/* The information about this virtqueue (we only use queue_size on) */
-	struct virtio_pci_common_cfg pci_config;
-
-	/* Last available index we saw. */
-	u16 last_avail_idx;
-
-	/* How many are used since we sent last irq? */
-	unsigned int pending_used;
-
-	/* Eventfd where Guest notifications arrive. */
-	int eventfd;
-
-	/* Function for the thread which is servicing this virtqueue. */
-	void (*service)(struct virtqueue *vq);
-	pid_t thread;
-};
-
-/* Remember the arguments to the program so we can "reboot" */
-static char **main_args;
-
-/* The original tty settings to restore on exit. */
-static struct termios orig_term;
-
-/*
- * We have to be careful with barriers: our devices are all run in separate
- * threads and so we need to make sure that changes visible to the Guest happen
- * in precise order.
- */
-#define wmb() __asm__ __volatile__("" : : : "memory")
-#define rmb() __asm__ __volatile__("lock; addl $0,0(%%esp)" : : : "memory")
-#define mb() __asm__ __volatile__("lock; addl $0,0(%%esp)" : : : "memory")
-
-/* Wrapper for the last available index.  Makes it easier to change. */
-#define lg_last_avail(vq)	((vq)->last_avail_idx)
-
-/*
- * The virtio configuration space is defined to be little-endian.  x86 is
- * little-endian too, but it's nice to be explicit so we have these helpers.
- */
-#define cpu_to_le16(v16) (v16)
-#define cpu_to_le32(v32) (v32)
-#define cpu_to_le64(v64) (v64)
-#define le16_to_cpu(v16) (v16)
-#define le32_to_cpu(v32) (v32)
-#define le64_to_cpu(v64) (v64)
-
-/*
- * A real device would ignore weird/non-compliant driver behaviour.  We
- * stop and flag it, to help debugging Linux problems.
- */
-#define bad_driver(d, fmt, ...) \
-	errx(1, "%s: bad driver: " fmt, (d)->name, ## __VA_ARGS__)
-#define bad_driver_vq(vq, fmt, ...)			       \
-	errx(1, "%s vq %s: bad driver: " fmt, (vq)->dev->name, \
-	     vq->name, ## __VA_ARGS__)
-
-/* Is this iovec empty? */
-static bool iov_empty(const struct iovec iov[], unsigned int num_iov)
-{
-	unsigned int i;
-
-	for (i = 0; i < num_iov; i++)
-		if (iov[i].iov_len)
-			return false;
-	return true;
-}
-
-/* Take len bytes from the front of this iovec. */
-static void iov_consume(struct device *d,
-			struct iovec iov[], unsigned num_iov,
-			void *dest, unsigned len)
-{
-	unsigned int i;
-
-	for (i = 0; i < num_iov; i++) {
-		unsigned int used;
-
-		used = iov[i].iov_len < len ? iov[i].iov_len : len;
-		if (dest) {
-			memcpy(dest, iov[i].iov_base, used);
-			dest += used;
-		}
-		iov[i].iov_base += used;
-		iov[i].iov_len -= used;
-		len -= used;
-	}
-	if (len != 0)
-		bad_driver(d, "iovec too short!");
-}
-
-/*L:100
- * The Launcher code itself takes us out into userspace, that scary place where
- * pointers run wild and free!  Unfortunately, like most userspace programs,
- * it's quite boring (which is why everyone likes to hack on the kernel!).
- * Perhaps if you make up an Lguest Drinking Game at this point, it will get
- * you through this section.  Or, maybe not.
- *
- * The Launcher sets up a big chunk of memory to be the Guest's "physical"
- * memory and stores it in "guest_base".  In other words, Guest physical ==
- * Launcher virtual with an offset.
- *
- * This can be tough to get your head around, but usually it just means that we
- * use these trivial conversion functions when the Guest gives us its
- * "physical" addresses:
- */
-static void *from_guest_phys(unsigned long addr)
-{
-	return guest_base + addr;
-}
-
-static unsigned long to_guest_phys(const void *addr)
-{
-	return (addr - guest_base);
-}
-
-/*L:130
- * Loading the Kernel.
- *
- * We start with couple of simple helper routines.  open_or_die() avoids
- * error-checking code cluttering the callers:
- */
-static int open_or_die(const char *name, int flags)
-{
-	int fd = open(name, flags);
-	if (fd < 0)
-		err(1, "Failed to open %s", name);
-	return fd;
-}
-
-/* map_zeroed_pages() takes a number of pages. */
-static void *map_zeroed_pages(unsigned int num)
-{
-	int fd = open_or_die("/dev/zero", O_RDONLY);
-	void *addr;
-
-	/*
-	 * We use a private mapping (ie. if we write to the page, it will be
-	 * copied). We allocate an extra two pages PROT_NONE to act as guard
-	 * pages against read/write attempts that exceed allocated space.
-	 */
-	addr = mmap(NULL, getpagesize() * (num+2),
-		    PROT_NONE, MAP_PRIVATE, fd, 0);
-
-	if (addr == MAP_FAILED)
-		err(1, "Mmapping %u pages of /dev/zero", num);
-
-	if (mprotect(addr + getpagesize(), getpagesize() * num,
-		     PROT_READ|PROT_WRITE) == -1)
-		err(1, "mprotect rw %u pages failed", num);
-
-	/*
-	 * One neat mmap feature is that you can close the fd, and it
-	 * stays mapped.
-	 */
-	close(fd);
-
-	/* Return address after PROT_NONE page */
-	return addr + getpagesize();
-}
-
-/* Get some bytes which won't be mapped into the guest. */
-static unsigned long get_mmio_region(size_t size)
-{
-	unsigned long addr = guest_mmio;
-	size_t i;
-
-	if (!size)
-		return addr;
-
-	/* Size has to be a power of 2 (and multiple of 16) */
-	for (i = 1; i < size; i <<= 1);
-
-	guest_mmio += i;
-
-	return addr;
-}
-
-/*
- * This routine is used to load the kernel or initrd.  It tries mmap, but if
- * that fails (Plan 9's kernel file isn't nicely aligned on page boundaries),
- * it falls back to reading the memory in.
- */
-static void map_at(int fd, void *addr, unsigned long offset, unsigned long len)
-{
-	ssize_t r;
-
-	/*
-	 * We map writable even though for some segments are marked read-only.
-	 * The kernel really wants to be writable: it patches its own
-	 * instructions.
-	 *
-	 * MAP_PRIVATE means that the page won't be copied until a write is
-	 * done to it.  This allows us to share untouched memory between
-	 * Guests.
-	 */
-	if (mmap(addr, len, PROT_READ|PROT_WRITE,
-		 MAP_FIXED|MAP_PRIVATE, fd, offset) != MAP_FAILED)
-		return;
-
-	/* pread does a seek and a read in one shot: saves a few lines. */
-	r = pread(fd, addr, len, offset);
-	if (r != len)
-		err(1, "Reading offset %lu len %lu gave %zi", offset, len, r);
-}
-
-/*
- * This routine takes an open vmlinux image, which is in ELF, and maps it into
- * the Guest memory.  ELF = Embedded Linking Format, which is the format used
- * by all modern binaries on Linux including the kernel.
- *
- * The ELF headers give *two* addresses: a physical address, and a virtual
- * address.  We use the physical address; the Guest will map itself to the
- * virtual address.
- *
- * We return the starting address.
- */
-static unsigned long map_elf(int elf_fd, const Elf32_Ehdr *ehdr)
-{
-	Elf32_Phdr phdr[ehdr->e_phnum];
-	unsigned int i;
-
-	/*
-	 * Sanity checks on the main ELF header: an x86 executable with a
-	 * reasonable number of correctly-sized program headers.
-	 */
-	if (ehdr->e_type != ET_EXEC
-	    || ehdr->e_machine != EM_386
-	    || ehdr->e_phentsize != sizeof(Elf32_Phdr)
-	    || ehdr->e_phnum < 1 || ehdr->e_phnum > 65536U/sizeof(Elf32_Phdr))
-		errx(1, "Malformed elf header");
-
-	/*
-	 * An ELF executable contains an ELF header and a number of "program"
-	 * headers which indicate which parts ("segments") of the program to
-	 * load where.
-	 */
-
-	/* We read in all the program headers at once: */
-	if (lseek(elf_fd, ehdr->e_phoff, SEEK_SET) < 0)
-		err(1, "Seeking to program headers");
-	if (read(elf_fd, phdr, sizeof(phdr)) != sizeof(phdr))
-		err(1, "Reading program headers");
-
-	/*
-	 * Try all the headers: there are usually only three.  A read-only one,
-	 * a read-write one, and a "note" section which we don't load.
-	 */
-	for (i = 0; i < ehdr->e_phnum; i++) {
-		/* If this isn't a loadable segment, we ignore it */
-		if (phdr[i].p_type != PT_LOAD)
-			continue;
-
-		verbose("Section %i: size %i addr %p\n",
-			i, phdr[i].p_memsz, (void *)phdr[i].p_paddr);
-
-		/* We map this section of the file at its physical address. */
-		map_at(elf_fd, from_guest_phys(phdr[i].p_paddr),
-		       phdr[i].p_offset, phdr[i].p_filesz);
-	}
-
-	/* The entry point is given in the ELF header. */
-	return ehdr->e_entry;
-}
-
-/*L:150
- * A bzImage, unlike an ELF file, is not meant to be loaded.  You're supposed
- * to jump into it and it will unpack itself.  We used to have to perform some
- * hairy magic because the unpacking code scared me.
- *
- * Fortunately, Jeremy Fitzhardinge convinced me it wasn't that hard and wrote
- * a small patch to jump over the tricky bits in the Guest, so now we just read
- * the funky header so we know where in the file to load, and away we go!
- */
-static unsigned long load_bzimage(int fd)
-{
-	struct boot_params boot;
-	int r;
-	/* Modern bzImages get loaded at 1M. */
-	void *p = from_guest_phys(0x100000);
-
-	/*
-	 * Go back to the start of the file and read the header.  It should be
-	 * a Linux boot header (see Documentation/x86/boot.txt)
-	 */
-	lseek(fd, 0, SEEK_SET);
-	read(fd, &boot, sizeof(boot));
-
-	/* Inside the setup_hdr, we expect the magic "HdrS" */
-	if (memcmp(&boot.hdr.header, "HdrS", 4) != 0)
-		errx(1, "This doesn't look like a bzImage to me");
-
-	/* Skip over the extra sectors of the header. */
-	lseek(fd, (boot.hdr.setup_sects+1) * 512, SEEK_SET);
-
-	/* Now read everything into memory. in nice big chunks. */
-	while ((r = read(fd, p, 65536)) > 0)
-		p += r;
-
-	/* Finally, code32_start tells us where to enter the kernel. */
-	return boot.hdr.code32_start;
-}
-
-/*L:140
- * Loading the kernel is easy when it's a "vmlinux", but most kernels
- * come wrapped up in the self-decompressing "bzImage" format.  With a little
- * work, we can load those, too.
- */
-static unsigned long load_kernel(int fd)
-{
-	Elf32_Ehdr hdr;
-
-	/* Read in the first few bytes. */
-	if (read(fd, &hdr, sizeof(hdr)) != sizeof(hdr))
-		err(1, "Reading kernel");
-
-	/* If it's an ELF file, it starts with "\177ELF" */
-	if (memcmp(hdr.e_ident, ELFMAG, SELFMAG) == 0)
-		return map_elf(fd, &hdr);
-
-	/* Otherwise we assume it's a bzImage, and try to load it. */
-	return load_bzimage(fd);
-}
-
-/*
- * This is a trivial little helper to align pages.  Andi Kleen hated it because
- * it calls getpagesize() twice: "it's dumb code."
- *
- * Kernel guys get really het up about optimization, even when it's not
- * necessary.  I leave this code as a reaction against that.
- */
-static inline unsigned long page_align(unsigned long addr)
-{
-	/* Add upwards and truncate downwards. */
-	return ((addr + getpagesize()-1) & ~(getpagesize()-1));
-}
-
-/*L:180
- * An "initial ram disk" is a disk image loaded into memory along with the
- * kernel which the kernel can use to boot from without needing any drivers.
- * Most distributions now use this as standard: the initrd contains the code to
- * load the appropriate driver modules for the current machine.
- *
- * Importantly, James Morris works for RedHat, and Fedora uses initrds for its
- * kernels.  He sent me this (and tells me when I break it).
- */
-static unsigned long load_initrd(const char *name, unsigned long mem)
-{
-	int ifd;
-	struct stat st;
-	unsigned long len;
-
-	ifd = open_or_die(name, O_RDONLY);
-	/* fstat() is needed to get the file size. */
-	if (fstat(ifd, &st) < 0)
-		err(1, "fstat() on initrd '%s'", name);
-
-	/*
-	 * We map the initrd at the top of memory, but mmap wants it to be
-	 * page-aligned, so we round the size up for that.
-	 */
-	len = page_align(st.st_size);
-	map_at(ifd, from_guest_phys(mem - len), 0, st.st_size);
-	/*
-	 * Once a file is mapped, you can close the file descriptor.  It's a
-	 * little odd, but quite useful.
-	 */
-	close(ifd);
-	verbose("mapped initrd %s size=%lu @ %p\n", name, len, (void*)mem-len);
-
-	/* We return the initrd size. */
-	return len;
-}
-/*:*/
-
-/*
- * Simple routine to roll all the commandline arguments together with spaces
- * between them.
- */
-static void concat(char *dst, char *args[])
-{
-	unsigned int i, len = 0;
-
-	for (i = 0; args[i]; i++) {
-		if (i) {
-			strcat(dst+len, " ");
-			len++;
-		}
-		strcpy(dst+len, args[i]);
-		len += strlen(args[i]);
-	}
-	/* In case it's empty. */
-	dst[len] = '\0';
-}
-
-/*L:185
- * This is where we actually tell the kernel to initialize the Guest.  We
- * saw the arguments it expects when we looked at initialize() in lguest_user.c:
- * the base of Guest "physical" memory, the top physical page to allow and the
- * entry point for the Guest.
- */
-static void tell_kernel(unsigned long start)
-{
-	unsigned long args[] = { LHREQ_INITIALIZE,
-				 (unsigned long)guest_base,
-				 guest_limit / getpagesize(), start,
-				 (guest_mmio+getpagesize()-1) / getpagesize() };
-	verbose("Guest: %p - %p (%#lx, MMIO %#lx)\n",
-		guest_base, guest_base + guest_limit,
-		guest_limit, guest_mmio);
-	lguest_fd = open_or_die("/dev/lguest", O_RDWR);
-	if (write(lguest_fd, args, sizeof(args)) < 0)
-		err(1, "Writing to /dev/lguest");
-}
-/*:*/
-
-/*L:200
- * Device Handling.
- *
- * When the Guest gives us a buffer, it sends an array of addresses and sizes.
- * We need to make sure it's not trying to reach into the Launcher itself, so
- * we have a convenient routine which checks it and exits with an error message
- * if something funny is going on:
- */
-static void *_check_pointer(struct device *d,
-			    unsigned long addr, unsigned int size,
-			    unsigned int line)
-{
-	/*
-	 * Check if the requested address and size exceeds the allocated memory,
-	 * or addr + size wraps around.
-	 */
-	if ((addr + size) > guest_limit || (addr + size) < addr)
-		bad_driver(d, "%s:%i: Invalid address %#lx",
-			   __FILE__, line, addr);
-	/*
-	 * We return a pointer for the caller's convenience, now we know it's
-	 * safe to use.
-	 */
-	return from_guest_phys(addr);
-}
-/* A macro which transparently hands the line number to the real function. */
-#define check_pointer(d,addr,size) _check_pointer(d, addr, size, __LINE__)
-
-/*
- * Each buffer in the virtqueues is actually a chain of descriptors.  This
- * function returns the next descriptor in the chain, or vq->vring.num if we're
- * at the end.
- */
-static unsigned next_desc(struct device *d, struct vring_desc *desc,
-			  unsigned int i, unsigned int max)
-{
-	unsigned int next;
-
-	/* If this descriptor says it doesn't chain, we're done. */
-	if (!(desc[i].flags & VRING_DESC_F_NEXT))
-		return max;
-
-	/* Check they're not leading us off end of descriptors. */
-	next = desc[i].next;
-	/* Make sure compiler knows to grab that: we don't want it changing! */
-	wmb();
-
-	if (next >= max)
-		bad_driver(d, "Desc next is %u", next);
-
-	return next;
-}
-
-/*
- * This actually sends the interrupt for this virtqueue, if we've used a
- * buffer.
- */
-static void trigger_irq(struct virtqueue *vq)
-{
-	unsigned long buf[] = { LHREQ_IRQ, vq->dev->config.irq_line };
-
-	/* Don't inform them if nothing used. */
-	if (!vq->pending_used)
-		return;
-	vq->pending_used = 0;
-
-	/*
-	 * 2.4.7.1:
-	 *
-	 *  If the VIRTIO_F_EVENT_IDX feature bit is not negotiated:
-	 *    The driver MUST set flags to 0 or 1. 
-	 */
-	if (vq->vring.avail->flags > 1)
-		bad_driver_vq(vq, "avail->flags = %u\n", vq->vring.avail->flags);
-
-	/*
-	 * 2.4.7.2:
-	 *
-	 *  If the VIRTIO_F_EVENT_IDX feature bit is not negotiated:
-	 *
-	 *     - The device MUST ignore the used_event value.
-	 *     - After the device writes a descriptor index into the used ring:
-	 *         - If flags is 1, the device SHOULD NOT send an interrupt.
-	 *         - If flags is 0, the device MUST send an interrupt.
-	 */
-	if (vq->vring.avail->flags & VRING_AVAIL_F_NO_INTERRUPT) {
-		return;
-	}
-
-	/*
-	 * 4.1.4.5.1:
-	 *
-	 *  If MSI-X capability is disabled, the device MUST set the Queue
-	 *  Interrupt bit in ISR status before sending a virtqueue notification
-	 *  to the driver.
-	 */
-	vq->dev->mmio->isr = 0x1;
-
-	/* Send the Guest an interrupt tell them we used something up. */
-	if (write(lguest_fd, buf, sizeof(buf)) != 0)
-		err(1, "Triggering irq %i", vq->dev->config.irq_line);
-}
-
-/*
- * This looks in the virtqueue for the first available buffer, and converts
- * it to an iovec for convenient access.  Since descriptors consist of some
- * number of output then some number of input descriptors, it's actually two
- * iovecs, but we pack them into one and note how many of each there were.
- *
- * This function waits if necessary, and returns the descriptor number found.
- */
-static unsigned wait_for_vq_desc(struct virtqueue *vq,
-				 struct iovec iov[],
-				 unsigned int *out_num, unsigned int *in_num)
-{
-	unsigned int i, head, max;
-	struct vring_desc *desc;
-	u16 last_avail = lg_last_avail(vq);
-
-	/*
-	 * 2.4.7.1:
-	 *
-	 *   The driver MUST handle spurious interrupts from the device.
-	 *
-	 * That's why this is a while loop.
-	 */
-
-	/* There's nothing available? */
-	while (last_avail == vq->vring.avail->idx) {
-		u64 event;
-
-		/*
-		 * Since we're about to sleep, now is a good time to tell the
-		 * Guest about what we've used up to now.
-		 */
-		trigger_irq(vq);
-
-		/* OK, now we need to know about added descriptors. */
-		vq->vring.used->flags &= ~VRING_USED_F_NO_NOTIFY;
-
-		/*
-		 * They could have slipped one in as we were doing that: make
-		 * sure it's written, then check again.
-		 */
-		mb();
-		if (last_avail != vq->vring.avail->idx) {
-			vq->vring.used->flags |= VRING_USED_F_NO_NOTIFY;
-			break;
-		}
-
-		/* Nothing new?  Wait for eventfd to tell us they refilled. */
-		if (read(vq->eventfd, &event, sizeof(event)) != sizeof(event))
-			errx(1, "Event read failed?");
-
-		/* We don't need to be notified again. */
-		vq->vring.used->flags |= VRING_USED_F_NO_NOTIFY;
-	}
-
-	/* Check it isn't doing very strange things with descriptor numbers. */
-	if ((u16)(vq->vring.avail->idx - last_avail) > vq->vring.num)
-		bad_driver_vq(vq, "Guest moved used index from %u to %u",
-			      last_avail, vq->vring.avail->idx);
-
-	/* 
-	 * Make sure we read the descriptor number *after* we read the ring
-	 * update; don't let the cpu or compiler change the order.
-	 */
-	rmb();
-
-	/*
-	 * Grab the next descriptor number they're advertising, and increment
-	 * the index we've seen.
-	 */
-	head = vq->vring.avail->ring[last_avail % vq->vring.num];
-	lg_last_avail(vq)++;
-
-	/* If their number is silly, that's a fatal mistake. */
-	if (head >= vq->vring.num)
-		bad_driver_vq(vq, "Guest says index %u is available", head);
-
-	/* When we start there are none of either input nor output. */
-	*out_num = *in_num = 0;
-
-	max = vq->vring.num;
-	desc = vq->vring.desc;
-	i = head;
-
-	/*
-	 * We have to read the descriptor after we read the descriptor number,
-	 * but there's a data dependency there so the CPU shouldn't reorder
-	 * that: no rmb() required.
-	 */
-
-	do {
-		/*
-		 * If this is an indirect entry, then this buffer contains a
-		 * descriptor table which we handle as if it's any normal
-		 * descriptor chain.
-		 */
-		if (desc[i].flags & VRING_DESC_F_INDIRECT) {
-			/* 2.4.5.3.1:
-			 *
-			 *  The driver MUST NOT set the VIRTQ_DESC_F_INDIRECT
-			 *  flag unless the VIRTIO_F_INDIRECT_DESC feature was
-			 *  negotiated.
-			 */
-			if (!(vq->dev->features_accepted &
-			      (1<<VIRTIO_RING_F_INDIRECT_DESC)))
-				bad_driver_vq(vq, "vq indirect not negotiated");
-
-			/*
-			 * 2.4.5.3.1:
-			 *
-			 *   The driver MUST NOT set the VIRTQ_DESC_F_INDIRECT
-			 *   flag within an indirect descriptor (ie. only one
-			 *   table per descriptor).
-			 */
-			if (desc != vq->vring.desc)
-				bad_driver_vq(vq, "Indirect within indirect");
-
-			/*
-			 * Proposed update VIRTIO-134 spells this out:
-			 *
-			 *   A driver MUST NOT set both VIRTQ_DESC_F_INDIRECT
-			 *   and VIRTQ_DESC_F_NEXT in flags.
-			 */
-			if (desc[i].flags & VRING_DESC_F_NEXT)
-				bad_driver_vq(vq, "indirect and next together");
-
-			if (desc[i].len % sizeof(struct vring_desc))
-				bad_driver_vq(vq,
-					      "Invalid size for indirect table");
-			/*
-			 * 2.4.5.3.2:
-			 *
-			 *  The device MUST ignore the write-only flag
-			 *  (flags&VIRTQ_DESC_F_WRITE) in the descriptor that
-			 *  refers to an indirect table.
-			 *
-			 * We ignore it here: :)
-			 */
-
-			max = desc[i].len / sizeof(struct vring_desc);
-			desc = check_pointer(vq->dev, desc[i].addr, desc[i].len);
-			i = 0;
-
-			/* 2.4.5.3.1:
-			 *
-			 *  A driver MUST NOT create a descriptor chain longer
-			 *  than the Queue Size of the device.
-			 */
-			if (max > vq->pci_config.queue_size)
-				bad_driver_vq(vq,
-					      "indirect has too many entries");
-		}
-
-		/* Grab the first descriptor, and check it's OK. */
-		iov[*out_num + *in_num].iov_len = desc[i].len;
-		iov[*out_num + *in_num].iov_base
-			= check_pointer(vq->dev, desc[i].addr, desc[i].len);
-		/* If this is an input descriptor, increment that count. */
-		if (desc[i].flags & VRING_DESC_F_WRITE)
-			(*in_num)++;
-		else {
-			/*
-			 * If it's an output descriptor, they're all supposed
-			 * to come before any input descriptors.
-			 */
-			if (*in_num)
-				bad_driver_vq(vq,
-					      "Descriptor has out after in");
-			(*out_num)++;
-		}
-
-		/* If we've got too many, that implies a descriptor loop. */
-		if (*out_num + *in_num > max)
-			bad_driver_vq(vq, "Looped descriptor");
-	} while ((i = next_desc(vq->dev, desc, i, max)) != max);
-
-	return head;
-}
-
-/*
- * After we've used one of their buffers, we tell the Guest about it.  Sometime
- * later we'll want to send them an interrupt using trigger_irq(); note that
- * wait_for_vq_desc() does that for us if it has to wait.
- */
-static void add_used(struct virtqueue *vq, unsigned int head, int len)
-{
-	struct vring_used_elem *used;
-
-	/*
-	 * The virtqueue contains a ring of used buffers.  Get a pointer to the
-	 * next entry in that used ring.
-	 */
-	used = &vq->vring.used->ring[vq->vring.used->idx % vq->vring.num];
-	used->id = head;
-	used->len = len;
-	/* Make sure buffer is written before we update index. */
-	wmb();
-	vq->vring.used->idx++;
-	vq->pending_used++;
-}
-
-/* And here's the combo meal deal.  Supersize me! */
-static void add_used_and_trigger(struct virtqueue *vq, unsigned head, int len)
-{
-	add_used(vq, head, len);
-	trigger_irq(vq);
-}
-
-/*
- * The Console
- *
- * We associate some data with the console for our exit hack.
- */
-struct console_abort {
-	/* How many times have they hit ^C? */
-	int count;
-	/* When did they start? */
-	struct timeval start;
-};
-
-/* This is the routine which handles console input (ie. stdin). */
-static void console_input(struct virtqueue *vq)
-{
-	int len;
-	unsigned int head, in_num, out_num;
-	struct console_abort *abort = vq->dev->priv;
-	struct iovec iov[vq->vring.num];
-
-	/* Make sure there's a descriptor available. */
-	head = wait_for_vq_desc(vq, iov, &out_num, &in_num);
-	if (out_num)
-		bad_driver_vq(vq, "Output buffers in console in queue?");
-
-	/* Read into it.  This is where we usually wait. */
-	len = readv(STDIN_FILENO, iov, in_num);
-	if (len <= 0) {
-		/* Ran out of input? */
-		warnx("Failed to get console input, ignoring console.");
-		/*
-		 * For simplicity, dying threads kill the whole Launcher.  So
-		 * just nap here.
-		 */
-		for (;;)
-			pause();
-	}
-
-	/* Tell the Guest we used a buffer. */
-	add_used_and_trigger(vq, head, len);
-
-	/*
-	 * Three ^C within one second?  Exit.
-	 *
-	 * This is such a hack, but works surprisingly well.  Each ^C has to
-	 * be in a buffer by itself, so they can't be too fast.  But we check
-	 * that we get three within about a second, so they can't be too
-	 * slow.
-	 */
-	if (len != 1 || ((char *)iov[0].iov_base)[0] != 3) {
-		abort->count = 0;
-		return;
-	}
-
-	abort->count++;
-	if (abort->count == 1)
-		gettimeofday(&abort->start, NULL);
-	else if (abort->count == 3) {
-		struct timeval now;
-		gettimeofday(&now, NULL);
-		/* Kill all Launcher processes with SIGINT, like normal ^C */
-		if (now.tv_sec <= abort->start.tv_sec+1)
-			kill(0, SIGINT);
-		abort->count = 0;
-	}
-}
-
-/* This is the routine which handles console output (ie. stdout). */
-static void console_output(struct virtqueue *vq)
-{
-	unsigned int head, out, in;
-	struct iovec iov[vq->vring.num];
-
-	/* We usually wait in here, for the Guest to give us something. */
-	head = wait_for_vq_desc(vq, iov, &out, &in);
-	if (in)
-		bad_driver_vq(vq, "Input buffers in console output queue?");
-
-	/* writev can return a partial write, so we loop here. */
-	while (!iov_empty(iov, out)) {
-		int len = writev(STDOUT_FILENO, iov, out);
-		if (len <= 0) {
-			warn("Write to stdout gave %i (%d)", len, errno);
-			break;
-		}
-		iov_consume(vq->dev, iov, out, NULL, len);
-	}
-
-	/*
-	 * We're finished with that buffer: if we're going to sleep,
-	 * wait_for_vq_desc() will prod the Guest with an interrupt.
-	 */
-	add_used(vq, head, 0);
-}
-
-/*
- * The Network
- *
- * Handling output for network is also simple: we get all the output buffers
- * and write them to /dev/net/tun.
- */
-struct net_info {
-	int tunfd;
-};
-
-static void net_output(struct virtqueue *vq)
-{
-	struct net_info *net_info = vq->dev->priv;
-	unsigned int head, out, in;
-	struct iovec iov[vq->vring.num];
-
-	/* We usually wait in here for the Guest to give us a packet. */
-	head = wait_for_vq_desc(vq, iov, &out, &in);
-	if (in)
-		bad_driver_vq(vq, "Input buffers in net output queue?");
-	/*
-	 * Send the whole thing through to /dev/net/tun.  It expects the exact
-	 * same format: what a coincidence!
-	 */
-	if (writev(net_info->tunfd, iov, out) < 0)
-		warnx("Write to tun failed (%d)?", errno);
-
-	/*
-	 * Done with that one; wait_for_vq_desc() will send the interrupt if
-	 * all packets are processed.
-	 */
-	add_used(vq, head, 0);
-}
-
-/*
- * Handling network input is a bit trickier, because I've tried to optimize it.
- *
- * First we have a helper routine which tells is if from this file descriptor
- * (ie. the /dev/net/tun device) will block:
- */
-static bool will_block(int fd)
-{
-	fd_set fdset;
-	struct timeval zero = { 0, 0 };
-	FD_ZERO(&fdset);
-	FD_SET(fd, &fdset);
-	return select(fd+1, &fdset, NULL, NULL, &zero) != 1;
-}
-
-/*
- * This handles packets coming in from the tun device to our Guest.  Like all
- * service routines, it gets called again as soon as it returns, so you don't
- * see a while(1) loop here.
- */
-static void net_input(struct virtqueue *vq)
-{
-	int len;
-	unsigned int head, out, in;
-	struct iovec iov[vq->vring.num];
-	struct net_info *net_info = vq->dev->priv;
-
-	/*
-	 * Get a descriptor to write an incoming packet into.  This will also
-	 * send an interrupt if they're out of descriptors.
-	 */
-	head = wait_for_vq_desc(vq, iov, &out, &in);
-	if (out)
-		bad_driver_vq(vq, "Output buffers in net input queue?");
-
-	/*
-	 * If it looks like we'll block reading from the tun device, send them
-	 * an interrupt.
-	 */
-	if (vq->pending_used && will_block(net_info->tunfd))
-		trigger_irq(vq);
-
-	/*
-	 * Read in the packet.  This is where we normally wait (when there's no
-	 * incoming network traffic).
-	 */
-	len = readv(net_info->tunfd, iov, in);
-	if (len <= 0)
-		warn("Failed to read from tun (%d).", errno);
-
-	/*
-	 * Mark that packet buffer as used, but don't interrupt here.  We want
-	 * to wait until we've done as much work as we can.
-	 */
-	add_used(vq, head, len);
-}
-/*:*/
-
-/* This is the helper to create threads: run the service routine in a loop. */
-static int do_thread(void *_vq)
-{
-	struct virtqueue *vq = _vq;
-
-	for (;;)
-		vq->service(vq);
-	return 0;
-}
-
-/*
- * When a child dies, we kill our entire process group with SIGTERM.  This
- * also has the side effect that the shell restores the console for us!
- */
-static void kill_launcher(int signal)
-{
-	kill(0, SIGTERM);
-}
-
-static void reset_vq_pci_config(struct virtqueue *vq)
-{
-	vq->pci_config.queue_size = VIRTQUEUE_NUM;
-	vq->pci_config.queue_enable = 0;
-}
-
-static void reset_device(struct device *dev)
-{
-	struct virtqueue *vq;
-
-	verbose("Resetting device %s\n", dev->name);
-
-	/* Clear any features they've acked. */
-	dev->features_accepted = 0;
-
-	/* We're going to be explicitly killing threads, so ignore them. */
-	signal(SIGCHLD, SIG_IGN);
-
-	/*
-	 * 4.1.4.3.1:
-	 *
-	 *   The device MUST present a 0 in queue_enable on reset. 
-	 *
-	 * This means we set it here, and reset the saved ones in every vq.
-	 */
-	dev->mmio->cfg.queue_enable = 0;
-
-	/* Get rid of the virtqueue threads */
-	for (vq = dev->vq; vq; vq = vq->next) {
-		vq->last_avail_idx = 0;
-		reset_vq_pci_config(vq);
-		if (vq->thread != (pid_t)-1) {
-			kill(vq->thread, SIGTERM);
-			waitpid(vq->thread, NULL, 0);
-			vq->thread = (pid_t)-1;
-		}
-	}
-	dev->running = false;
-	dev->wrote_features_ok = false;
-
-	/* Now we care if threads die. */
-	signal(SIGCHLD, (void *)kill_launcher);
-}
-
-static void cleanup_devices(void)
-{
-	unsigned int i;
-
-	for (i = 1; i < MAX_PCI_DEVICES; i++) {
-		struct device *d = devices.pci[i];
-		if (!d)
-			continue;
-		reset_device(d);
-	}
-
-	/* If we saved off the original terminal settings, restore them now. */
-	if (orig_term.c_lflag & (ISIG|ICANON|ECHO))
-		tcsetattr(STDIN_FILENO, TCSANOW, &orig_term);
-}
-
-/*L:217
- * We do PCI.  This is mainly done to let us test the kernel virtio PCI
- * code.
- */
-
-/* Linux expects a PCI host bridge: ours is a dummy, and first on the bus. */
-static struct device pci_host_bridge;
-
-static void init_pci_host_bridge(void)
-{
-	pci_host_bridge.name = "PCI Host Bridge";
-	pci_host_bridge.config.class = 0x06; /* bridge */
-	pci_host_bridge.config.subclass = 0; /* host bridge */
-	devices.pci[0] = &pci_host_bridge;
-}
-
-/* The IO ports used to read the PCI config space. */
-#define PCI_CONFIG_ADDR 0xCF8
-#define PCI_CONFIG_DATA 0xCFC
-
-/*
- * Not really portable, but does help readability: this is what the Guest
- * writes to the PCI_CONFIG_ADDR IO port.
- */
-union pci_config_addr {
-	struct {
-		unsigned mbz: 2;
-		unsigned offset: 6;
-		unsigned funcnum: 3;
-		unsigned devnum: 5;
-		unsigned busnum: 8;
-		unsigned reserved: 7;
-		unsigned enabled : 1;
-	} bits;
-	u32 val;
-};
-
-/*
- * We cache what they wrote to the address port, so we know what they're
- * talking about when they access the data port.
- */
-static union pci_config_addr pci_config_addr;
-
-static struct device *find_pci_device(unsigned int index)
-{
-	return devices.pci[index];
-}
-
-/* PCI can do 1, 2 and 4 byte reads; we handle that here. */
-static void ioread(u16 off, u32 v, u32 mask, u32 *val)
-{
-	assert(off < 4);
-	assert(mask == 0xFF || mask == 0xFFFF || mask == 0xFFFFFFFF);
-	*val = (v >> (off * 8)) & mask;
-}
-
-/* PCI can do 1, 2 and 4 byte writes; we handle that here. */
-static void iowrite(u16 off, u32 v, u32 mask, u32 *dst)
-{
-	assert(off < 4);
-	assert(mask == 0xFF || mask == 0xFFFF || mask == 0xFFFFFFFF);
-	*dst &= ~(mask << (off * 8));
-	*dst |= (v & mask) << (off * 8);
-}
-
-/*
- * Where PCI_CONFIG_DATA accesses depends on the previous write to
- * PCI_CONFIG_ADDR.
- */
-static struct device *dev_and_reg(u32 *reg)
-{
-	if (!pci_config_addr.bits.enabled)
-		return NULL;
-
-	if (pci_config_addr.bits.funcnum != 0)
-		return NULL;
-
-	if (pci_config_addr.bits.busnum != 0)
-		return NULL;
-
-	if (pci_config_addr.bits.offset * 4 >= sizeof(struct pci_config))
-		return NULL;
-
-	*reg = pci_config_addr.bits.offset;
-	return find_pci_device(pci_config_addr.bits.devnum);
-}
-
-/*
- * We can get invalid combinations of values while they're writing, so we
- * only fault if they try to write with some invalid bar/offset/length.
- */
-static bool valid_bar_access(struct device *d,
-			     struct virtio_pci_cfg_cap_u32 *cfg_access)
-{
-	/* We only have 1 bar (BAR0) */
-	if (cfg_access->cap.bar != 0)
-		return false;
-
-	/* Check it's within BAR0. */
-	if (cfg_access->cap.offset >= d->mmio_size
-	    || cfg_access->cap.offset + cfg_access->cap.length > d->mmio_size)
-		return false;
-
-	/* Check length is 1, 2 or 4. */
-	if (cfg_access->cap.length != 1
-	    && cfg_access->cap.length != 2
-	    && cfg_access->cap.length != 4)
-		return false;
-
-	/*
-	 * 4.1.4.7.2:
-	 *
-	 *  The driver MUST NOT write a cap.offset which is not a multiple of
-	 *  cap.length (ie. all accesses MUST be aligned).
-	 */
-	if (cfg_access->cap.offset % cfg_access->cap.length != 0)
-		return false;
-
-	/* Return pointer into word in BAR0. */
-	return true;
-}
-
-/* Is this accessing the PCI config address port?. */
-static bool is_pci_addr_port(u16 port)
-{
-	return port >= PCI_CONFIG_ADDR && port < PCI_CONFIG_ADDR + 4;
-}
-
-static bool pci_addr_iowrite(u16 port, u32 mask, u32 val)
-{
-	iowrite(port - PCI_CONFIG_ADDR, val, mask,
-		&pci_config_addr.val);
-	verbose("PCI%s: %#x/%x: bus %u dev %u func %u reg %u\n",
-		pci_config_addr.bits.enabled ? "" : " DISABLED",
-		val, mask,
-		pci_config_addr.bits.busnum,
-		pci_config_addr.bits.devnum,
-		pci_config_addr.bits.funcnum,
-		pci_config_addr.bits.offset);
-	return true;
-}
-
-static void pci_addr_ioread(u16 port, u32 mask, u32 *val)
-{
-	ioread(port - PCI_CONFIG_ADDR, pci_config_addr.val, mask, val);
-}
-
-/* Is this accessing the PCI config data port?. */
-static bool is_pci_data_port(u16 port)
-{
-	return port >= PCI_CONFIG_DATA && port < PCI_CONFIG_DATA + 4;
-}
-
-static void emulate_mmio_write(struct device *d, u32 off, u32 val, u32 mask);
-
-static bool pci_data_iowrite(u16 port, u32 mask, u32 val)
-{
-	u32 reg, portoff;
-	struct device *d = dev_and_reg(&reg);
-
-	/* Complain if they don't belong to a device. */
-	if (!d)
-		return false;
-
-	/* They can do 1 byte writes, etc. */
-	portoff = port - PCI_CONFIG_DATA;
-
-	/*
-	 * PCI uses a weird way to determine the BAR size: the OS
-	 * writes all 1's, and sees which ones stick.
-	 */
-	if (&d->config_words[reg] == &d->config.bar[0]) {
-		int i;
-
-		iowrite(portoff, val, mask, &d->config.bar[0]);
-		for (i = 0; (1 << i) < d->mmio_size; i++)
-			d->config.bar[0] &= ~(1 << i);
-		return true;
-	} else if ((&d->config_words[reg] > &d->config.bar[0]
-		    && &d->config_words[reg] <= &d->config.bar[6])
-		   || &d->config_words[reg] == &d->config.expansion_rom_addr) {
-		/* Allow writing to any other BAR, or expansion ROM */
-		iowrite(portoff, val, mask, &d->config_words[reg]);
-		return true;
-		/* We let them override latency timer and cacheline size */
-	} else if (&d->config_words[reg] == (void *)&d->config.cacheline_size) {
-		/* Only let them change the first two fields. */
-		if (mask == 0xFFFFFFFF)
-			mask = 0xFFFF;
-		iowrite(portoff, val, mask, &d->config_words[reg]);
-		return true;
-	} else if (&d->config_words[reg] == (void *)&d->config.command
-		   && mask == 0xFFFF) {
-		/* Ignore command writes. */
-		return true;
-	} else if (&d->config_words[reg]
-		   == (void *)&d->config.cfg_access.cap.bar
-		   || &d->config_words[reg]
-		   == &d->config.cfg_access.cap.length
-		   || &d->config_words[reg]
-		   == &d->config.cfg_access.cap.offset) {
-
-		/*
-		 * The VIRTIO_PCI_CAP_PCI_CFG capability
-		 * provides a backdoor to access the MMIO
-		 * regions without mapping them.  Weird, but
-		 * useful.
-		 */
-		iowrite(portoff, val, mask, &d->config_words[reg]);
-		return true;
-	} else if (&d->config_words[reg] == &d->config.cfg_access.pci_cfg_data) {
-		u32 write_mask;
-
-		/*
-		 * 4.1.4.7.1:
-		 *
-		 *  Upon detecting driver write access to pci_cfg_data, the
-		 *  device MUST execute a write access at offset cap.offset at
-		 *  BAR selected by cap.bar using the first cap.length bytes
-		 *  from pci_cfg_data.
-		 */
-
-		/* Must be bar 0 */
-		if (!valid_bar_access(d, &d->config.cfg_access))
-			return false;
-
-		iowrite(portoff, val, mask, &d->config.cfg_access.pci_cfg_data);
-
-		/*
-		 * Now emulate a write.  The mask we use is set by
-		 * len, *not* this write!
-		 */
-		write_mask = (1ULL<<(8*d->config.cfg_access.cap.length)) - 1;
-		verbose("Window writing %#x/%#x to bar %u, offset %u len %u\n",
-			d->config.cfg_access.pci_cfg_data, write_mask,
-			d->config.cfg_access.cap.bar,
-			d->config.cfg_access.cap.offset,
-			d->config.cfg_access.cap.length);
-
-		emulate_mmio_write(d, d->config.cfg_access.cap.offset,
-				   d->config.cfg_access.pci_cfg_data,
-				   write_mask);
-		return true;
-	}
-
-	/*
-	 * 4.1.4.1:
-	 *
-	 *  The driver MUST NOT write into any field of the capability
-	 *  structure, with the exception of those with cap_type
-	 *  VIRTIO_PCI_CAP_PCI_CFG...
-	 */
-	return false;
-}
-
-static u32 emulate_mmio_read(struct device *d, u32 off, u32 mask);
-
-static void pci_data_ioread(u16 port, u32 mask, u32 *val)
-{
-	u32 reg;
-	struct device *d = dev_and_reg(&reg);
-
-	if (!d)
-		return;
-
-	/* Read through the PCI MMIO access window is special */
-	if (&d->config_words[reg] == &d->config.cfg_access.pci_cfg_data) {
-		u32 read_mask;
-
-		/*
-		 * 4.1.4.7.1:
-		 *
-		 *  Upon detecting driver read access to pci_cfg_data, the
-		 *  device MUST execute a read access of length cap.length at
-		 *  offset cap.offset at BAR selected by cap.bar and store the
-		 *  first cap.length bytes in pci_cfg_data.
-		 */
-		/* Must be bar 0 */
-		if (!valid_bar_access(d, &d->config.cfg_access))
-			bad_driver(d,
-			     "Invalid cfg_access to bar%u, offset %u len %u",
-			     d->config.cfg_access.cap.bar,
-			     d->config.cfg_access.cap.offset,
-			     d->config.cfg_access.cap.length);
-
-		/*
-		 * Read into the window.  The mask we use is set by
-		 * len, *not* this read!
-		 */
-		read_mask = (1ULL<<(8*d->config.cfg_access.cap.length))-1;
-		d->config.cfg_access.pci_cfg_data
-			= emulate_mmio_read(d,
-					    d->config.cfg_access.cap.offset,
-					    read_mask);
-		verbose("Window read %#x/%#x from bar %u, offset %u len %u\n",
-			d->config.cfg_access.pci_cfg_data, read_mask,
-			d->config.cfg_access.cap.bar,
-			d->config.cfg_access.cap.offset,
-			d->config.cfg_access.cap.length);
-	}
-	ioread(port - PCI_CONFIG_DATA, d->config_words[reg], mask, val);
-}
-
-/*L:216
- * This is where we emulate a handful of Guest instructions.  It's ugly
- * and we used to do it in the kernel but it grew over time.
- */
-
-/*
- * We use the ptrace syscall's pt_regs struct to talk about registers
- * to lguest: these macros convert the names to the offsets.
- */
-#define getreg(name) getreg_off(offsetof(struct user_regs_struct, name))
-#define setreg(name, val) \
-	setreg_off(offsetof(struct user_regs_struct, name), (val))
-
-static u32 getreg_off(size_t offset)
-{
-	u32 r;
-	unsigned long args[] = { LHREQ_GETREG, offset };
-
-	if (pwrite(lguest_fd, args, sizeof(args), cpu_id) < 0)
-		err(1, "Getting register %u", offset);
-	if (pread(lguest_fd, &r, sizeof(r), cpu_id) != sizeof(r))
-		err(1, "Reading register %u", offset);
-
-	return r;
-}
-
-static void setreg_off(size_t offset, u32 val)
-{
-	unsigned long args[] = { LHREQ_SETREG, offset, val };
-
-	if (pwrite(lguest_fd, args, sizeof(args), cpu_id) < 0)
-		err(1, "Setting register %u", offset);
-}
-
-/* Get register by instruction encoding */
-static u32 getreg_num(unsigned regnum, u32 mask)
-{
-	/* 8 bit ops use regnums 4-7 for high parts of word */
-	if (mask == 0xFF && (regnum & 0x4))
-		return getreg_num(regnum & 0x3, 0xFFFF) >> 8;
-
-	switch (regnum) {
-	case 0: return getreg(eax) & mask;
-	case 1: return getreg(ecx) & mask;
-	case 2: return getreg(edx) & mask;
-	case 3: return getreg(ebx) & mask;
-	case 4: return getreg(esp) & mask;
-	case 5: return getreg(ebp) & mask;
-	case 6: return getreg(esi) & mask;
-	case 7: return getreg(edi) & mask;
-	}
-	abort();
-}
-
-/* Set register by instruction encoding */
-static void setreg_num(unsigned regnum, u32 val, u32 mask)
-{
-	/* Don't try to set bits out of range */
-	assert(~(val & ~mask));
-
-	/* 8 bit ops use regnums 4-7 for high parts of word */
-	if (mask == 0xFF && (regnum & 0x4)) {
-		/* Construct the 16 bits we want. */
-		val = (val << 8) | getreg_num(regnum & 0x3, 0xFF);
-		setreg_num(regnum & 0x3, val, 0xFFFF);
-		return;
-	}
-
-	switch (regnum) {
-	case 0: setreg(eax, val | (getreg(eax) & ~mask)); return;
-	case 1: setreg(ecx, val | (getreg(ecx) & ~mask)); return;
-	case 2: setreg(edx, val | (getreg(edx) & ~mask)); return;
-	case 3: setreg(ebx, val | (getreg(ebx) & ~mask)); return;
-	case 4: setreg(esp, val | (getreg(esp) & ~mask)); return;
-	case 5: setreg(ebp, val | (getreg(ebp) & ~mask)); return;
-	case 6: setreg(esi, val | (getreg(esi) & ~mask)); return;
-	case 7: setreg(edi, val | (getreg(edi) & ~mask)); return;
-	}
-	abort();
-}
-
-/* Get bytes of displacement appended to instruction, from r/m encoding */
-static u32 insn_displacement_len(u8 mod_reg_rm)
-{
-	/* Switch on the mod bits */
-	switch (mod_reg_rm >> 6) {
-	case 0:
-		/* If mod == 0, and r/m == 101, 16-bit displacement follows */
-		if ((mod_reg_rm & 0x7) == 0x5)
-			return 2;
-		/* Normally, mod == 0 means no literal displacement */
-		return 0;
-	case 1:
-		/* One byte displacement */
-		return 1;
-	case 2:
-		/* Four byte displacement */
-		return 4;
-	case 3:
-		/* Register mode */
-		return 0;
-	}
-	abort();
-}
-
-static void emulate_insn(const u8 insn[])
-{
-	unsigned long args[] = { LHREQ_TRAP, 13 };
-	unsigned int insnlen = 0, in = 0, small_operand = 0, byte_access;
-	unsigned int eax, port, mask;
-	/*
-	 * Default is to return all-ones on IO port reads, which traditionally
-	 * means "there's nothing there".
-	 */
-	u32 val = 0xFFFFFFFF;
-
-	/*
-	 * This must be the Guest kernel trying to do something, not userspace!
-	 * The bottom two bits of the CS segment register are the privilege
-	 * level.
-	 */
-	if ((getreg(xcs) & 3) != 0x1)
-		goto no_emulate;
-
-	/* Decoding x86 instructions is icky. */
-
-	/*
-	 * Around 2.6.33, the kernel started using an emulation for the
-	 * cmpxchg8b instruction in early boot on many configurations.  This
-	 * code isn't paravirtualized, and it tries to disable interrupts.
-	 * Ignore it, which will Mostly Work.
-	 */
-	if (insn[insnlen] == 0xfa) {
-		/* "cli", or Clear Interrupt Enable instruction.  Skip it. */
-		insnlen = 1;
-		goto skip_insn;
-	}
-
-	/*
-	 * 0x66 is an "operand prefix".  It means a 16, not 32 bit in/out.
-	 */
-	if (insn[insnlen] == 0x66) {
-		small_operand = 1;
-		/* The instruction is 1 byte so far, read the next byte. */
-		insnlen = 1;
-	}
-
-	/* If the lower bit isn't set, it's a single byte access */
-	byte_access = !(insn[insnlen] & 1);
-
-	/*
-	 * Now we can ignore the lower bit and decode the 4 opcodes
-	 * we need to emulate.
-	 */
-	switch (insn[insnlen] & 0xFE) {
-	case 0xE4: /* in     <next byte>,%al */
-		port = insn[insnlen+1];
-		insnlen += 2;
-		in = 1;
-		break;
-	case 0xEC: /* in     (%dx),%al */
-		port = getreg(edx) & 0xFFFF;
-		insnlen += 1;
-		in = 1;
-		break;
-	case 0xE6: /* out    %al,<next byte> */
-		port = insn[insnlen+1];
-		insnlen += 2;
-		break;
-	case 0xEE: /* out    %al,(%dx) */
-		port = getreg(edx) & 0xFFFF;
-		insnlen += 1;
-		break;
-	default:
-		/* OK, we don't know what this is, can't emulate. */
-		goto no_emulate;
-	}
-
-	/* Set a mask of the 1, 2 or 4 bytes, depending on size of IO */
-	if (byte_access)
-		mask = 0xFF;
-	else if (small_operand)
-		mask = 0xFFFF;
-	else
-		mask = 0xFFFFFFFF;
-
-	/*
-	 * If it was an "IN" instruction, they expect the result to be read
-	 * into %eax, so we change %eax.
-	 */
-	eax = getreg(eax);
-
-	if (in) {
-		/* This is the PS/2 keyboard status; 1 means ready for output */
-		if (port == 0x64)
-			val = 1;
-		else if (is_pci_addr_port(port))
-			pci_addr_ioread(port, mask, &val);
-		else if (is_pci_data_port(port))
-			pci_data_ioread(port, mask, &val);
-
-		/* Clear the bits we're about to read */
-		eax &= ~mask;
-		/* Copy bits in from val. */
-		eax |= val & mask;
-		/* Now update the register. */
-		setreg(eax, eax);
-	} else {
-		if (is_pci_addr_port(port)) {
-			if (!pci_addr_iowrite(port, mask, eax))
-				goto bad_io;
-		} else if (is_pci_data_port(port)) {
-			if (!pci_data_iowrite(port, mask, eax))
-				goto bad_io;
-		}
-		/* There are many other ports, eg. CMOS clock, serial
-		 * and parallel ports, so we ignore them all. */
-	}
-
-	verbose("IO %s of %x to %u: %#08x\n",
-		in ? "IN" : "OUT", mask, port, eax);
-skip_insn:
-	/* Finally, we've "done" the instruction, so move past it. */
-	setreg(eip, getreg(eip) + insnlen);
-	return;
-
-bad_io:
-	warnx("Attempt to %s port %u (%#x mask)",
-	      in ? "read from" : "write to", port, mask);
-
-no_emulate:
-	/* Inject trap into Guest. */
-	if (write(lguest_fd, args, sizeof(args)) < 0)
-		err(1, "Reinjecting trap 13 for fault at %#x", getreg(eip));
-}
-
-static struct device *find_mmio_region(unsigned long paddr, u32 *off)
-{
-	unsigned int i;
-
-	for (i = 1; i < MAX_PCI_DEVICES; i++) {
-		struct device *d = devices.pci[i];
-
-		if (!d)
-			continue;
-		if (paddr < d->mmio_addr)
-			continue;
-		if (paddr >= d->mmio_addr + d->mmio_size)
-			continue;
-		*off = paddr - d->mmio_addr;
-		return d;
-	}
-	return NULL;
-}
-
-/* FIXME: Use vq array. */
-static struct virtqueue *vq_by_num(struct device *d, u32 num)
-{
-	struct virtqueue *vq = d->vq;
-
-	while (num-- && vq)
-		vq = vq->next;
-
-	return vq;
-}
-
-static void save_vq_config(const struct virtio_pci_common_cfg *cfg,
-			   struct virtqueue *vq)
-{
-	vq->pci_config = *cfg;
-}
-
-static void restore_vq_config(struct virtio_pci_common_cfg *cfg,
-			      struct virtqueue *vq)
-{
-	/* Only restore the per-vq part */
-	size_t off = offsetof(struct virtio_pci_common_cfg, queue_size);
-
-	memcpy((void *)cfg + off, (void *)&vq->pci_config + off,
-	       sizeof(*cfg) - off);
-}
-
-/*
- * 4.1.4.3.2:
- *
- *  The driver MUST configure the other virtqueue fields before
- *  enabling the virtqueue with queue_enable.
- *
- * When they enable the virtqueue, we check that their setup is valid.
- */
-static void check_virtqueue(struct device *d, struct virtqueue *vq)
-{
-	/* Because lguest is 32 bit, all the descriptor high bits must be 0 */
-	if (vq->pci_config.queue_desc_hi
-	    || vq->pci_config.queue_avail_hi
-	    || vq->pci_config.queue_used_hi)
-		bad_driver_vq(vq, "invalid 64-bit queue address");
-
-	/*
-	 * 2.4.1:
-	 *
-	 *  The driver MUST ensure that the physical address of the first byte
-	 *  of each virtqueue part is a multiple of the specified alignment
-	 *  value in the above table.
-	 */
-	if (vq->pci_config.queue_desc_lo % 16
-	    || vq->pci_config.queue_avail_lo % 2
-	    || vq->pci_config.queue_used_lo % 4)
-		bad_driver_vq(vq, "invalid alignment in queue addresses");
-
-	/* Initialize the virtqueue and check they're all in range. */
-	vq->vring.num = vq->pci_config.queue_size;
-	vq->vring.desc = check_pointer(vq->dev,
-				       vq->pci_config.queue_desc_lo,
-				       sizeof(*vq->vring.desc) * vq->vring.num);
-	vq->vring.avail = check_pointer(vq->dev,
-					vq->pci_config.queue_avail_lo,
-					sizeof(*vq->vring.avail)
-					+ (sizeof(vq->vring.avail->ring[0])
-					   * vq->vring.num));
-	vq->vring.used = check_pointer(vq->dev,
-				       vq->pci_config.queue_used_lo,
-				       sizeof(*vq->vring.used)
-				       + (sizeof(vq->vring.used->ring[0])
-					  * vq->vring.num));
-
-	/*
-	 * 2.4.9.1:
-	 *
-	 *   The driver MUST initialize flags in the used ring to 0
-	 *   when allocating the used ring.
-	 */
-	if (vq->vring.used->flags != 0)
-		bad_driver_vq(vq, "invalid initial used.flags %#x",
-			      vq->vring.used->flags);
-}
-
-static void start_virtqueue(struct virtqueue *vq)
-{
-	/*
-	 * Create stack for thread.  Since the stack grows upwards, we point
-	 * the stack pointer to the end of this region.
-	 */
-	char *stack = malloc(32768);
-
-	/* Create a zero-initialized eventfd. */
-	vq->eventfd = eventfd(0, 0);
-	if (vq->eventfd < 0)
-		err(1, "Creating eventfd");
-
-	/*
-	 * CLONE_VM: because it has to access the Guest memory, and SIGCHLD so
-	 * we get a signal if it dies.
-	 */
-	vq->thread = clone(do_thread, stack + 32768, CLONE_VM | SIGCHLD, vq);
-	if (vq->thread == (pid_t)-1)
-		err(1, "Creating clone");
-}
-
-static void start_virtqueues(struct device *d)
-{
-	struct virtqueue *vq;
-
-	for (vq = d->vq; vq; vq = vq->next) {
-		if (vq->pci_config.queue_enable)
-			start_virtqueue(vq);
-	}
-}
-
-static void emulate_mmio_write(struct device *d, u32 off, u32 val, u32 mask)
-{
-	struct virtqueue *vq;
-
-	switch (off) {
-	case offsetof(struct virtio_pci_mmio, cfg.device_feature_select):
-		/*
-		 * 4.1.4.3.1:
-		 *
-		 * The device MUST present the feature bits it is offering in
-		 * device_feature, starting at bit device_feature_select ∗ 32
-		 * for any device_feature_select written by the driver
-		 */
-		if (val == 0)
-			d->mmio->cfg.device_feature = d->features;
-		else if (val == 1)
-			d->mmio->cfg.device_feature = (d->features >> 32);
-		else
-			d->mmio->cfg.device_feature = 0;
-		goto feature_write_through32;
-	case offsetof(struct virtio_pci_mmio, cfg.guest_feature_select):
-		if (val > 1)
-			bad_driver(d, "Unexpected driver select %u", val);
-		goto feature_write_through32;
-	case offsetof(struct virtio_pci_mmio, cfg.guest_feature):
-		if (d->mmio->cfg.guest_feature_select == 0) {
-			d->features_accepted &= ~((u64)0xFFFFFFFF);
-			d->features_accepted |= val;
-		} else {
-			assert(d->mmio->cfg.guest_feature_select == 1);
-			d->features_accepted &= 0xFFFFFFFF;
-			d->features_accepted |= ((u64)val) << 32;
-		}
-		/*
-		 * 2.2.1:
-		 *
-		 *   The driver MUST NOT accept a feature which the device did
-		 *   not offer
-		 */
-		if (d->features_accepted & ~d->features)
-			bad_driver(d, "over-accepted features %#llx of %#llx",
-				   d->features_accepted, d->features);
-		goto feature_write_through32;
-	case offsetof(struct virtio_pci_mmio, cfg.device_status): {
-		u8 prev;
-
-		verbose("%s: device status -> %#x\n", d->name, val);
-		/*
-		 * 4.1.4.3.1:
-		 * 
-		 *  The device MUST reset when 0 is written to device_status,
-		 *  and present a 0 in device_status once that is done.
-		 */
-		if (val == 0) {
-			reset_device(d);
-			goto write_through8;
-		}
-
-		/* 2.1.1: The driver MUST NOT clear a device status bit. */
-		if (d->mmio->cfg.device_status & ~val)
-			bad_driver(d, "unset of device status bit %#x -> %#x",
-				   d->mmio->cfg.device_status, val);
-
-		/*
-		 * 2.1.2:
-		 *
-		 *  The device MUST NOT consume buffers or notify the driver
-		 *  before DRIVER_OK.
-		 */
-		if (val & VIRTIO_CONFIG_S_DRIVER_OK
-		    && !(d->mmio->cfg.device_status & VIRTIO_CONFIG_S_DRIVER_OK))
-			start_virtqueues(d);
-
-		/*
-		 * 3.1.1:
-		 *
-		 *   The driver MUST follow this sequence to initialize a device:
-		 *   - Reset the device.
-		 *   - Set the ACKNOWLEDGE status bit: the guest OS has
-                 *     notice the device.
-		 *   - Set the DRIVER status bit: the guest OS knows how
-                 *     to drive the device.
-		 *   - Read device feature bits, and write the subset
-		 *     of feature bits understood by the OS and driver
-		 *     to the device. During this step the driver MAY
-		 *     read (but MUST NOT write) the device-specific
-		 *     configuration fields to check that it can
-		 *     support the device before accepting it.
-		 *   - Set the FEATURES_OK status bit.  The driver
-		 *     MUST not accept new feature bits after this
-		 *     step.
-		 *   - Re-read device status to ensure the FEATURES_OK
-		 *     bit is still set: otherwise, the device does
-		 *     not support our subset of features and the
-		 *     device is unusable.
-		 *   - Perform device-specific setup, including
-		 *     discovery of virtqueues for the device,
-		 *     optional per-bus setup, reading and possibly
-		 *     writing the device’s virtio configuration
-		 *     space, and population of virtqueues.
-		 *   - Set the DRIVER_OK status bit. At this point the
-                 *     device is “live”.
-		 */
-		prev = 0;
-		switch (val & ~d->mmio->cfg.device_status) {
-		case VIRTIO_CONFIG_S_DRIVER_OK:
-			prev |= VIRTIO_CONFIG_S_FEATURES_OK; /* fall thru */
-		case VIRTIO_CONFIG_S_FEATURES_OK:
-			prev |= VIRTIO_CONFIG_S_DRIVER; /* fall thru */
-		case VIRTIO_CONFIG_S_DRIVER:
-			prev |= VIRTIO_CONFIG_S_ACKNOWLEDGE; /* fall thru */
-		case VIRTIO_CONFIG_S_ACKNOWLEDGE:
-			break;
-		default:
-			bad_driver(d, "unknown device status bit %#x -> %#x",
-				   d->mmio->cfg.device_status, val);
-		}
-		if (d->mmio->cfg.device_status != prev)
-			bad_driver(d, "unexpected status transition %#x -> %#x",
-				   d->mmio->cfg.device_status, val);
-
-		/* If they just wrote FEATURES_OK, we make sure they read */
-		switch (val & ~d->mmio->cfg.device_status) {
-		case VIRTIO_CONFIG_S_FEATURES_OK:
-			d->wrote_features_ok = true;
-			break;
-		case VIRTIO_CONFIG_S_DRIVER_OK:
-			if (d->wrote_features_ok)
-				bad_driver(d, "did not re-read FEATURES_OK");
-			break;
-		}
-		goto write_through8;
-	}
-	case offsetof(struct virtio_pci_mmio, cfg.queue_select):
-		vq = vq_by_num(d, val);
-		/*
-		 * 4.1.4.3.1:
-		 *
-		 *  The device MUST present a 0 in queue_size if the virtqueue
-		 *  corresponding to the current queue_select is unavailable.
-		 */
-		if (!vq) {
-			d->mmio->cfg.queue_size = 0;
-			goto write_through16;
-		}
-		/* Save registers for old vq, if it was a valid vq */
-		if (d->mmio->cfg.queue_size)
-			save_vq_config(&d->mmio->cfg,
-				       vq_by_num(d, d->mmio->cfg.queue_select));
-		/* Restore the registers for the queue they asked for */
-		restore_vq_config(&d->mmio->cfg, vq);
-		goto write_through16;
-	case offsetof(struct virtio_pci_mmio, cfg.queue_size):
-		/*
-		 * 4.1.4.3.2:
-		 *
-		 *  The driver MUST NOT write a value which is not a power of 2
-		 *  to queue_size.
-		 */
-		if (val & (val-1))
-			bad_driver(d, "invalid queue size %u", val);
-		if (d->mmio->cfg.queue_enable)
-			bad_driver(d, "changing queue size on live device");
-		goto write_through16;
-	case offsetof(struct virtio_pci_mmio, cfg.queue_msix_vector):
-		bad_driver(d, "attempt to set MSIX vector to %u", val);
-	case offsetof(struct virtio_pci_mmio, cfg.queue_enable): {
-		struct virtqueue *vq = vq_by_num(d, d->mmio->cfg.queue_select);
-
-		/*
-		 * 4.1.4.3.2:
-		 *
-		 *  The driver MUST NOT write a 0 to queue_enable.
-		 */
-		if (val != 1)
-			bad_driver(d, "setting queue_enable to %u", val);
-
-		/*
-		 * 3.1.1:
-		 *
-		 *  7. Perform device-specific setup, including discovery of
-		 *     virtqueues for the device, optional per-bus setup,
-		 *     reading and possibly writing the device’s virtio
-		 *     configuration space, and population of virtqueues.
-		 *  8. Set the DRIVER_OK status bit.
-		 *
-		 * All our devices require all virtqueues to be enabled, so
-		 * they should have done that before setting DRIVER_OK.
-		 */
-		if (d->mmio->cfg.device_status & VIRTIO_CONFIG_S_DRIVER_OK)
-			bad_driver(d, "enabling vq after DRIVER_OK");
-
-		d->mmio->cfg.queue_enable = val;
-		save_vq_config(&d->mmio->cfg, vq);
-		check_virtqueue(d, vq);
-		goto write_through16;
-	}
-	case offsetof(struct virtio_pci_mmio, cfg.queue_notify_off):
-		bad_driver(d, "attempt to write to queue_notify_off");
-	case offsetof(struct virtio_pci_mmio, cfg.queue_desc_lo):
-	case offsetof(struct virtio_pci_mmio, cfg.queue_desc_hi):
-	case offsetof(struct virtio_pci_mmio, cfg.queue_avail_lo):
-	case offsetof(struct virtio_pci_mmio, cfg.queue_avail_hi):
-	case offsetof(struct virtio_pci_mmio, cfg.queue_used_lo):
-	case offsetof(struct virtio_pci_mmio, cfg.queue_used_hi):
-		/*
-		 * 4.1.4.3.2:
-		 *
-		 *  The driver MUST configure the other virtqueue fields before
-		 *  enabling the virtqueue with queue_enable.
-		 */
-		if (d->mmio->cfg.queue_enable)
-			bad_driver(d, "changing queue on live device");
-
-		/*
-		 * 3.1.1:
-		 *
-		 *  The driver MUST follow this sequence to initialize a device:
-		 *...
-		 *  5. Set the FEATURES_OK status bit. The driver MUST not
-		 *  accept new feature bits after this step.
-		 */
-		if (!(d->mmio->cfg.device_status & VIRTIO_CONFIG_S_FEATURES_OK))
-			bad_driver(d, "setting up vq before FEATURES_OK");
-
-		/*
-		 *  6. Re-read device status to ensure the FEATURES_OK bit is
-		 *     still set...
-		 */
-		if (d->wrote_features_ok)
-			bad_driver(d, "didn't re-read FEATURES_OK before setup");
-
-		goto write_through32;
-	case offsetof(struct virtio_pci_mmio, notify):
-		vq = vq_by_num(d, val);
-		if (!vq)
-			bad_driver(d, "Invalid vq notification on %u", val);
-		/* Notify the process handling this vq by adding 1 to eventfd */
-		write(vq->eventfd, "\1\0\0\0\0\0\0\0", 8);
-		goto write_through16;
-	case offsetof(struct virtio_pci_mmio, isr):
-		bad_driver(d, "Unexpected write to isr");
-	/* Weird corner case: write to emerg_wr of console */
-	case sizeof(struct virtio_pci_mmio)
-		+ offsetof(struct virtio_console_config, emerg_wr):
-		if (strcmp(d->name, "console") == 0) {
-			char c = val;
-			write(STDOUT_FILENO, &c, 1);
-			goto write_through32;
-		}
-		/* Fall through... */
-	default:
-		/*
-		 * 4.1.4.3.2:
-		 *
-		 *   The driver MUST NOT write to device_feature, num_queues,
-		 *   config_generation or queue_notify_off.
-		 */
-		bad_driver(d, "Unexpected write to offset %u", off);
-	}
-
-feature_write_through32:
-	/*
-	 * 3.1.1:
-	 *
-	 *   The driver MUST follow this sequence to initialize a device:
-	 *...
-	 *   - Set the DRIVER status bit: the guest OS knows how
-	 *     to drive the device.
-	 *   - Read device feature bits, and write the subset
-	 *     of feature bits understood by the OS and driver
-	 *     to the device.
-	 *...
-	 *   - Set the FEATURES_OK status bit. The driver MUST not
-	 *     accept new feature bits after this step.
-	 */
-	if (!(d->mmio->cfg.device_status & VIRTIO_CONFIG_S_DRIVER))
-		bad_driver(d, "feature write before VIRTIO_CONFIG_S_DRIVER");
-	if (d->mmio->cfg.device_status & VIRTIO_CONFIG_S_FEATURES_OK)
-		bad_driver(d, "feature write after VIRTIO_CONFIG_S_FEATURES_OK");
-
-	/*
-	 * 4.1.3.1:
-	 *
-	 *  The driver MUST access each field using the “natural” access
-	 *  method, i.e. 32-bit accesses for 32-bit fields, 16-bit accesses for
-	 *  16-bit fields and 8-bit accesses for 8-bit fields.
-	 */
-write_through32:
-	if (mask != 0xFFFFFFFF) {
-		bad_driver(d, "non-32-bit write to offset %u (%#x)",
-			   off, getreg(eip));
-		return;
-	}
-	memcpy((char *)d->mmio + off, &val, 4);
-	return;
-
-write_through16:
-	if (mask != 0xFFFF)
-		bad_driver(d, "non-16-bit write to offset %u (%#x)",
-			   off, getreg(eip));
-	memcpy((char *)d->mmio + off, &val, 2);
-	return;
-
-write_through8:
-	if (mask != 0xFF)
-		bad_driver(d, "non-8-bit write to offset %u (%#x)",
-			   off, getreg(eip));
-	memcpy((char *)d->mmio + off, &val, 1);
-	return;
-}
-
-static u32 emulate_mmio_read(struct device *d, u32 off, u32 mask)
-{
-	u8 isr;
-	u32 val = 0;
-
-	switch (off) {
-	case offsetof(struct virtio_pci_mmio, cfg.device_feature_select):
-	case offsetof(struct virtio_pci_mmio, cfg.device_feature):
-	case offsetof(struct virtio_pci_mmio, cfg.guest_feature_select):
-	case offsetof(struct virtio_pci_mmio, cfg.guest_feature):
-		/*
-		 * 3.1.1:
-		 *
-		 *   The driver MUST follow this sequence to initialize a device:
-		 *...
-		 *   - Set the DRIVER status bit: the guest OS knows how
-		 *     to drive the device.
-		 *   - Read device feature bits, and write the subset
-		 *     of feature bits understood by the OS and driver
-		 *     to the device.
-		 */
-		if (!(d->mmio->cfg.device_status & VIRTIO_CONFIG_S_DRIVER))
-			bad_driver(d,
-				   "feature read before VIRTIO_CONFIG_S_DRIVER");
-		goto read_through32;
-	case offsetof(struct virtio_pci_mmio, cfg.msix_config):
-		bad_driver(d, "read of msix_config");
-	case offsetof(struct virtio_pci_mmio, cfg.num_queues):
-		goto read_through16;
-	case offsetof(struct virtio_pci_mmio, cfg.device_status):
-		/* As they did read, any write of FEATURES_OK is now fine. */
-		d->wrote_features_ok = false;
-		goto read_through8;
-	case offsetof(struct virtio_pci_mmio, cfg.config_generation):
-		/*
-		 * 4.1.4.3.1:
-		 *
-		 *  The device MUST present a changed config_generation after
-		 *  the driver has read a device-specific configuration value
-		 *  which has changed since any part of the device-specific
-		 *  configuration was last read.
-		 *
-		 * This is simple: none of our devices change config, so this
-		 * is always 0.
-		 */
-		goto read_through8;
-	case offsetof(struct virtio_pci_mmio, notify):
-		/*
-		 * 3.1.1:
-		 *
-		 *   The driver MUST NOT notify the device before setting
-		 *   DRIVER_OK.
-		 */
-		if (!(d->mmio->cfg.device_status & VIRTIO_CONFIG_S_DRIVER_OK))
-			bad_driver(d, "notify before VIRTIO_CONFIG_S_DRIVER_OK");
-		goto read_through16;
-	case offsetof(struct virtio_pci_mmio, isr):
-		if (mask != 0xFF)
-			bad_driver(d, "non-8-bit read from offset %u (%#x)",
-				   off, getreg(eip));
-		isr = d->mmio->isr;
-		/*
-		 * 4.1.4.5.1:
-		 *
-		 *  The device MUST reset ISR status to 0 on driver read. 
-		 */
-		d->mmio->isr = 0;
-		return isr;
-	case offsetof(struct virtio_pci_mmio, padding):
-		bad_driver(d, "read from padding (%#x)", getreg(eip));
-	default:
-		/* Read from device config space, beware unaligned overflow */
-		if (off > d->mmio_size - 4)
-			bad_driver(d, "read past end (%#x)", getreg(eip));
-
-		/*
-		 * 3.1.1:
-		 *  The driver MUST follow this sequence to initialize a device:
-		 *...
-		 *  3. Set the DRIVER status bit: the guest OS knows how to
-		 *  drive the device.
-		 *  4. Read device feature bits, and write the subset of
-		 *  feature bits understood by the OS and driver to the
-		 *  device. During this step the driver MAY read (but MUST NOT
-		 *  write) the device-specific configuration fields to check
-		 *  that it can support the device before accepting it.
-		 */
-		if (!(d->mmio->cfg.device_status & VIRTIO_CONFIG_S_DRIVER))
-			bad_driver(d,
-				   "config read before VIRTIO_CONFIG_S_DRIVER");
-
-		if (mask == 0xFFFFFFFF)
-			goto read_through32;
-		else if (mask == 0xFFFF)
-			goto read_through16;
-		else
-			goto read_through8;
-	}
-
-	/*
-	 * 4.1.3.1:
-	 *
-	 *  The driver MUST access each field using the “natural” access
-	 *  method, i.e. 32-bit accesses for 32-bit fields, 16-bit accesses for
-	 *  16-bit fields and 8-bit accesses for 8-bit fields.
-	 */
-read_through32:
-	if (mask != 0xFFFFFFFF)
-		bad_driver(d, "non-32-bit read to offset %u (%#x)",
-			   off, getreg(eip));
-	memcpy(&val, (char *)d->mmio + off, 4);
-	return val;
-
-read_through16:
-	if (mask != 0xFFFF)
-		bad_driver(d, "non-16-bit read to offset %u (%#x)",
-			   off, getreg(eip));
-	memcpy(&val, (char *)d->mmio + off, 2);
-	return val;
-
-read_through8:
-	if (mask != 0xFF)
-		bad_driver(d, "non-8-bit read to offset %u (%#x)",
-			   off, getreg(eip));
-	memcpy(&val, (char *)d->mmio + off, 1);
-	return val;
-}
-
-static void emulate_mmio(unsigned long paddr, const u8 *insn)
-{
-	u32 val, off, mask = 0xFFFFFFFF, insnlen = 0;
-	struct device *d = find_mmio_region(paddr, &off);
-	unsigned long args[] = { LHREQ_TRAP, 14 };
-
-	if (!d) {
-		warnx("MMIO touching %#08lx (not a device)", paddr);
-		goto reinject;
-	}
-
-	/* Prefix makes it a 16 bit op */
-	if (insn[0] == 0x66) {
-		mask = 0xFFFF;
-		insnlen++;
-	}
-
-	/* iowrite */
-	if (insn[insnlen] == 0x89) {
-		/* Next byte is r/m byte: bits 3-5 are register. */
-		val = getreg_num((insn[insnlen+1] >> 3) & 0x7, mask);
-		emulate_mmio_write(d, off, val, mask);
-		insnlen += 2 + insn_displacement_len(insn[insnlen+1]);
-	} else if (insn[insnlen] == 0x8b) { /* ioread */
-		/* Next byte is r/m byte: bits 3-5 are register. */
-		val = emulate_mmio_read(d, off, mask);
-		setreg_num((insn[insnlen+1] >> 3) & 0x7, val, mask);
-		insnlen += 2 + insn_displacement_len(insn[insnlen+1]);
-	} else if (insn[0] == 0x88) { /* 8-bit iowrite */
-		mask = 0xff;
-		/* Next byte is r/m byte: bits 3-5 are register. */
-		val = getreg_num((insn[1] >> 3) & 0x7, mask);
-		emulate_mmio_write(d, off, val, mask);
-		insnlen = 2 + insn_displacement_len(insn[1]);
-	} else if (insn[0] == 0x8a) { /* 8-bit ioread */
-		mask = 0xff;
-		val = emulate_mmio_read(d, off, mask);
-		setreg_num((insn[1] >> 3) & 0x7, val, mask);
-		insnlen = 2 + insn_displacement_len(insn[1]);
-	} else {
-		warnx("Unknown MMIO instruction touching %#08lx:"
-		     " %02x %02x %02x %02x at %u",
-		     paddr, insn[0], insn[1], insn[2], insn[3], getreg(eip));
-	reinject:
-		/* Inject trap into Guest. */
-		if (write(lguest_fd, args, sizeof(args)) < 0)
-			err(1, "Reinjecting trap 14 for fault at %#x",
-			    getreg(eip));
-		return;
-	}
-
-	/* Finally, we've "done" the instruction, so move past it. */
-	setreg(eip, getreg(eip) + insnlen);
-}
-
-/*L:190
- * Device Setup
- *
- * All devices need a descriptor so the Guest knows it exists, and a "struct
- * device" so the Launcher can keep track of it.  We have common helper
- * routines to allocate and manage them.
- */
-static void add_pci_virtqueue(struct device *dev,
-			      void (*service)(struct virtqueue *),
-			      const char *name)
-{
-	struct virtqueue **i, *vq = malloc(sizeof(*vq));
-
-	/* Initialize the virtqueue */
-	vq->next = NULL;
-	vq->last_avail_idx = 0;
-	vq->dev = dev;
-	vq->name = name;
-
-	/*
-	 * This is the routine the service thread will run, and its Process ID
-	 * once it's running.
-	 */
-	vq->service = service;
-	vq->thread = (pid_t)-1;
-
-	/* Initialize the configuration. */
-	reset_vq_pci_config(vq);
-	vq->pci_config.queue_notify_off = 0;
-
-	/* Add one to the number of queues */
-	vq->dev->mmio->cfg.num_queues++;
-
-	/*
-	 * Add to tail of list, so dev->vq is first vq, dev->vq->next is
-	 * second.
-	 */
-	for (i = &dev->vq; *i; i = &(*i)->next);
-	*i = vq;
-}
-
-/* The Guest accesses the feature bits via the PCI common config MMIO region */
-static void add_pci_feature(struct device *dev, unsigned bit)
-{
-	dev->features |= (1ULL << bit);
-}
-
-/* For devices with no config. */
-static void no_device_config(struct device *dev)
-{
-	dev->mmio_addr = get_mmio_region(dev->mmio_size);
-
-	dev->config.bar[0] = dev->mmio_addr;
-	/* Bottom 4 bits must be zero */
-	assert(~(dev->config.bar[0] & 0xF));
-}
-
-/* This puts the device config into BAR0 */
-static void set_device_config(struct device *dev, const void *conf, size_t len)
-{
-	/* Set up BAR 0 */
-	dev->mmio_size += len;
-	dev->mmio = realloc(dev->mmio, dev->mmio_size);
-	memcpy(dev->mmio + 1, conf, len);
-
-	/*
-	 * 4.1.4.6:
-	 *
-	 *  The device MUST present at least one VIRTIO_PCI_CAP_DEVICE_CFG
-	 *  capability for any device type which has a device-specific
-	 *  configuration.
-	 */
-	/* Hook up device cfg */
-	dev->config.cfg_access.cap.cap_next
-		= offsetof(struct pci_config, device);
-
-	/*
-	 * 4.1.4.6.1:
-	 *
-	 *  The offset for the device-specific configuration MUST be 4-byte
-	 *  aligned.
-	 */
-	assert(dev->config.cfg_access.cap.cap_next % 4 == 0);
-
-	/* Fix up device cfg field length. */
-	dev->config.device.length = len;
-
-	/* The rest is the same as the no-config case */
-	no_device_config(dev);
-}
-
-static void init_cap(struct virtio_pci_cap *cap, size_t caplen, int type,
-		     size_t bar_offset, size_t bar_bytes, u8 next)
-{
-	cap->cap_vndr = PCI_CAP_ID_VNDR;
-	cap->cap_next = next;
-	cap->cap_len = caplen;
-	cap->cfg_type = type;
-	cap->bar = 0;
-	memset(cap->padding, 0, sizeof(cap->padding));
-	cap->offset = bar_offset;
-	cap->length = bar_bytes;
-}
-
-/*
- * This sets up the pci_config structure, as defined in the virtio 1.0
- * standard (and PCI standard).
- */
-static void init_pci_config(struct pci_config *pci, u16 type,
-			    u8 class, u8 subclass)
-{
-	size_t bar_offset, bar_len;
-
-	/*
-	 * 4.1.4.4.1:
-	 *
-	 *  The device MUST either present notify_off_multiplier as an even
-	 *  power of 2, or present notify_off_multiplier as 0.
-	 *
-	 * 2.1.2:
-	 *
-	 *   The device MUST initialize device status to 0 upon reset. 
-	 */
-	memset(pci, 0, sizeof(*pci));
-
-	/* 4.1.2.1: Devices MUST have the PCI Vendor ID 0x1AF4 */
-	pci->vendor_id = 0x1AF4;
-	/* 4.1.2.1: ... PCI Device ID calculated by adding 0x1040 ... */
-	pci->device_id = 0x1040 + type;
-
-	/*
-	 * PCI have specific codes for different types of devices.
-	 * Linux doesn't care, but it's a good clue for people looking
-	 * at the device.
-	 */
-	pci->class = class;
-	pci->subclass = subclass;
-
-	/*
-	 * 4.1.2.1:
-	 *
-	 *  Non-transitional devices SHOULD have a PCI Revision ID of 1 or
-	 *  higher
-	 */
-	pci->revid = 1;
-
-	/*
-	 * 4.1.2.1:
-	 *
-	 *  Non-transitional devices SHOULD have a PCI Subsystem Device ID of
-	 *  0x40 or higher.
-	 */
-	pci->subsystem_device_id = 0x40;
-
-	/* We use our dummy interrupt controller, and irq_line is the irq */
-	pci->irq_line = devices.next_irq++;
-	pci->irq_pin = 0;
-
-	/* Support for extended capabilities. */
-	pci->status = (1 << 4);
-
-	/* Link them in. */
-	/*
-	 * 4.1.4.3.1:
-	 *
-	 *  The device MUST present at least one common configuration
-	 *  capability.
-	 */
-	pci->capabilities = offsetof(struct pci_config, common);
-
-	/* 4.1.4.3.1 ... offset MUST be 4-byte aligned. */
-	assert(pci->capabilities % 4 == 0);
-
-	bar_offset = offsetof(struct virtio_pci_mmio, cfg);
-	bar_len = sizeof(((struct virtio_pci_mmio *)0)->cfg);
-	init_cap(&pci->common, sizeof(pci->common), VIRTIO_PCI_CAP_COMMON_CFG,
-		 bar_offset, bar_len,
-		 offsetof(struct pci_config, notify));
-
-	/*
-	 * 4.1.4.4.1:
-	 *
-	 *  The device MUST present at least one notification capability.
-	 */
-	bar_offset += bar_len;
-	bar_len = sizeof(((struct virtio_pci_mmio *)0)->notify);
-
-	/*
-	 * 4.1.4.4.1:
-	 *
-	 *  The cap.offset MUST be 2-byte aligned.
-	 */
-	assert(pci->common.cap_next % 2 == 0);
-
-	/* FIXME: Use a non-zero notify_off, for per-queue notification? */
-	/*
-	 * 4.1.4.4.1:
-	 *
-	 *  The value cap.length presented by the device MUST be at least 2 and
-	 *  MUST be large enough to support queue notification offsets for all
-	 *  supported queues in all possible configurations.
-	 */
-	assert(bar_len >= 2);
-
-	init_cap(&pci->notify.cap, sizeof(pci->notify),
-		 VIRTIO_PCI_CAP_NOTIFY_CFG,
-		 bar_offset, bar_len,
-		 offsetof(struct pci_config, isr));
-
-	bar_offset += bar_len;
-	bar_len = sizeof(((struct virtio_pci_mmio *)0)->isr);
-	/*
-	 * 4.1.4.5.1:
-	 *
-	 *  The device MUST present at least one VIRTIO_PCI_CAP_ISR_CFG
-	 *  capability.
-	 */
-	init_cap(&pci->isr, sizeof(pci->isr),
-		 VIRTIO_PCI_CAP_ISR_CFG,
-		 bar_offset, bar_len,
-		 offsetof(struct pci_config, cfg_access));
-
-	/*
-	 * 4.1.4.7.1:
-	 *
-	 * The device MUST present at least one VIRTIO_PCI_CAP_PCI_CFG
-	 * capability.
-	 */
-	/* This doesn't have any presence in the BAR */
-	init_cap(&pci->cfg_access.cap, sizeof(pci->cfg_access),
-		 VIRTIO_PCI_CAP_PCI_CFG,
-		 0, 0, 0);
-
-	bar_offset += bar_len + sizeof(((struct virtio_pci_mmio *)0)->padding);
-	assert(bar_offset == sizeof(struct virtio_pci_mmio));
-
-	/*
-	 * This gets sewn in and length set in set_device_config().
-	 * Some devices don't have a device configuration interface, so
-	 * we never expose this if we don't call set_device_config().
-	 */
-	init_cap(&pci->device, sizeof(pci->device), VIRTIO_PCI_CAP_DEVICE_CFG,
-		 bar_offset, 0, 0);
-}
-
-/*
- * This routine does all the creation and setup of a new device, but we don't
- * actually place the MMIO region until we know the size (if any) of the
- * device-specific config.  And we don't actually start the service threads
- * until later.
- *
- * See what I mean about userspace being boring?
- */
-static struct device *new_pci_device(const char *name, u16 type,
-				     u8 class, u8 subclass)
-{
-	struct device *dev = malloc(sizeof(*dev));
-
-	/* Now we populate the fields one at a time. */
-	dev->name = name;
-	dev->vq = NULL;
-	dev->running = false;
-	dev->wrote_features_ok = false;
-	dev->mmio_size = sizeof(struct virtio_pci_mmio);
-	dev->mmio = calloc(1, dev->mmio_size);
-	dev->features = (u64)1 << VIRTIO_F_VERSION_1;
-	dev->features_accepted = 0;
-
-	if (devices.device_num + 1 >= MAX_PCI_DEVICES)
-		errx(1, "Can only handle 31 PCI devices");
-
-	init_pci_config(&dev->config, type, class, subclass);
-	assert(!devices.pci[devices.device_num+1]);
-	devices.pci[++devices.device_num] = dev;
-
-	return dev;
-}
-
-/*
- * Our first setup routine is the console.  It's a fairly simple device, but
- * UNIX tty handling makes it uglier than it could be.
- */
-static void setup_console(void)
-{
-	struct device *dev;
-	struct virtio_console_config conf;
-
-	/* If we can save the initial standard input settings... */
-	if (tcgetattr(STDIN_FILENO, &orig_term) == 0) {
-		struct termios term = orig_term;
-		/*
-		 * Then we turn off echo, line buffering and ^C etc: We want a
-		 * raw input stream to the Guest.
-		 */
-		term.c_lflag &= ~(ISIG|ICANON|ECHO);
-		tcsetattr(STDIN_FILENO, TCSANOW, &term);
-	}
-
-	dev = new_pci_device("console", VIRTIO_ID_CONSOLE, 0x07, 0x00);
-
-	/* We store the console state in dev->priv, and initialize it. */
-	dev->priv = malloc(sizeof(struct console_abort));
-	((struct console_abort *)dev->priv)->count = 0;
-
-	/*
-	 * The console needs two virtqueues: the input then the output.  When
-	 * they put something the input queue, we make sure we're listening to
-	 * stdin.  When they put something in the output queue, we write it to
-	 * stdout.
-	 */
-	add_pci_virtqueue(dev, console_input, "input");
-	add_pci_virtqueue(dev, console_output, "output");
-
-	/* We need a configuration area for the emerg_wr early writes. */
-	add_pci_feature(dev, VIRTIO_CONSOLE_F_EMERG_WRITE);
-	set_device_config(dev, &conf, sizeof(conf));
-
-	verbose("device %u: console\n", devices.device_num);
-}
-/*:*/
-
-/*M:010
- * Inter-guest networking is an interesting area.  Simplest is to have a
- * --sharenet=<name> option which opens or creates a named pipe.  This can be
- * used to send packets to another guest in a 1:1 manner.
- *
- * More sophisticated is to use one of the tools developed for project like UML
- * to do networking.
- *
- * Faster is to do virtio bonding in kernel.  Doing this 1:1 would be
- * completely generic ("here's my vring, attach to your vring") and would work
- * for any traffic.  Of course, namespace and permissions issues need to be
- * dealt with.  A more sophisticated "multi-channel" virtio_net.c could hide
- * multiple inter-guest channels behind one interface, although it would
- * require some manner of hotplugging new virtio channels.
- *
- * Finally, we could use a virtio network switch in the kernel, ie. vhost.
-:*/
-
-static u32 str2ip(const char *ipaddr)
-{
-	unsigned int b[4];
-
-	if (sscanf(ipaddr, "%u.%u.%u.%u", &b[0], &b[1], &b[2], &b[3]) != 4)
-		errx(1, "Failed to parse IP address '%s'", ipaddr);
-	return (b[0] << 24) | (b[1] << 16) | (b[2] << 8) | b[3];
-}
-
-static void str2mac(const char *macaddr, unsigned char mac[6])
-{
-	unsigned int m[6];
-	if (sscanf(macaddr, "%02x:%02x:%02x:%02x:%02x:%02x",
-		   &m[0], &m[1], &m[2], &m[3], &m[4], &m[5]) != 6)
-		errx(1, "Failed to parse mac address '%s'", macaddr);
-	mac[0] = m[0];
-	mac[1] = m[1];
-	mac[2] = m[2];
-	mac[3] = m[3];
-	mac[4] = m[4];
-	mac[5] = m[5];
-}
-
-/*
- * This code is "adapted" from libbridge: it attaches the Host end of the
- * network device to the bridge device specified by the command line.
- *
- * This is yet another James Morris contribution (I'm an IP-level guy, so I
- * dislike bridging), and I just try not to break it.
- */
-static void add_to_bridge(int fd, const char *if_name, const char *br_name)
-{
-	int ifidx;
-	struct ifreq ifr;
-
-	if (!*br_name)
-		errx(1, "must specify bridge name");
-
-	ifidx = if_nametoindex(if_name);
-	if (!ifidx)
-		errx(1, "interface %s does not exist!", if_name);
-
-	strncpy(ifr.ifr_name, br_name, IFNAMSIZ);
-	ifr.ifr_name[IFNAMSIZ-1] = '\0';
-	ifr.ifr_ifindex = ifidx;
-	if (ioctl(fd, SIOCBRADDIF, &ifr) < 0)
-		err(1, "can't add %s to bridge %s", if_name, br_name);
-}
-
-/*
- * This sets up the Host end of the network device with an IP address, brings
- * it up so packets will flow, the copies the MAC address into the hwaddr
- * pointer.
- */
-static void configure_device(int fd, const char *tapif, u32 ipaddr)
-{
-	struct ifreq ifr;
-	struct sockaddr_in sin;
-
-	memset(&ifr, 0, sizeof(ifr));
-	strcpy(ifr.ifr_name, tapif);
-
-	/* Don't read these incantations.  Just cut & paste them like I did! */
-	sin.sin_family = AF_INET;
-	sin.sin_addr.s_addr = htonl(ipaddr);
-	memcpy(&ifr.ifr_addr, &sin, sizeof(sin));
-	if (ioctl(fd, SIOCSIFADDR, &ifr) != 0)
-		err(1, "Setting %s interface address", tapif);
-	ifr.ifr_flags = IFF_UP;
-	if (ioctl(fd, SIOCSIFFLAGS, &ifr) != 0)
-		err(1, "Bringing interface %s up", tapif);
-}
-
-static int get_tun_device(char tapif[IFNAMSIZ])
-{
-	struct ifreq ifr;
-	int vnet_hdr_sz;
-	int netfd;
-
-	/* Start with this zeroed.  Messy but sure. */
-	memset(&ifr, 0, sizeof(ifr));
-
-	/*
-	 * We open the /dev/net/tun device and tell it we want a tap device.  A
-	 * tap device is like a tun device, only somehow different.  To tell
-	 * the truth, I completely blundered my way through this code, but it
-	 * works now!
-	 */
-	netfd = open_or_die("/dev/net/tun", O_RDWR);
-	ifr.ifr_flags = IFF_TAP | IFF_NO_PI | IFF_VNET_HDR;
-	strcpy(ifr.ifr_name, "tap%d");
-	if (ioctl(netfd, TUNSETIFF, &ifr) != 0)
-		err(1, "configuring /dev/net/tun");
-
-	if (ioctl(netfd, TUNSETOFFLOAD,
-		  TUN_F_CSUM|TUN_F_TSO4|TUN_F_TSO6|TUN_F_TSO_ECN) != 0)
-		err(1, "Could not set features for tun device");
-
-	/*
-	 * We don't need checksums calculated for packets coming in this
-	 * device: trust us!
-	 */
-	ioctl(netfd, TUNSETNOCSUM, 1);
-
-	/*
-	 * In virtio before 1.0 (aka legacy virtio), we added a 16-bit
-	 * field at the end of the network header iff
-	 * VIRTIO_NET_F_MRG_RXBUF was negotiated.  For virtio 1.0,
-	 * that became the norm, but we need to tell the tun device
-	 * about our expanded header (which is called
-	 * virtio_net_hdr_mrg_rxbuf in the legacy system).
-	 */
-	vnet_hdr_sz = sizeof(struct virtio_net_hdr_v1);
-	if (ioctl(netfd, TUNSETVNETHDRSZ, &vnet_hdr_sz) != 0)
-		err(1, "Setting tun header size to %u", vnet_hdr_sz);
-
-	memcpy(tapif, ifr.ifr_name, IFNAMSIZ);
-	return netfd;
-}
-
-/*L:195
- * Our network is a Host<->Guest network.  This can either use bridging or
- * routing, but the principle is the same: it uses the "tun" device to inject
- * packets into the Host as if they came in from a normal network card.  We
- * just shunt packets between the Guest and the tun device.
- */
-static void setup_tun_net(char *arg)
-{
-	struct device *dev;
-	struct net_info *net_info = malloc(sizeof(*net_info));
-	int ipfd;
-	u32 ip = INADDR_ANY;
-	bool bridging = false;
-	char tapif[IFNAMSIZ], *p;
-	struct virtio_net_config conf;
-
-	net_info->tunfd = get_tun_device(tapif);
-
-	/* First we create a new network device. */
-	dev = new_pci_device("net", VIRTIO_ID_NET, 0x02, 0x00);
-	dev->priv = net_info;
-
-	/* Network devices need a recv and a send queue, just like console. */
-	add_pci_virtqueue(dev, net_input, "rx");
-	add_pci_virtqueue(dev, net_output, "tx");
-
-	/*
-	 * We need a socket to perform the magic network ioctls to bring up the
-	 * tap interface, connect to the bridge etc.  Any socket will do!
-	 */
-	ipfd = socket(PF_INET, SOCK_DGRAM, IPPROTO_IP);
-	if (ipfd < 0)
-		err(1, "opening IP socket");
-
-	/* If the command line was --tunnet=bridge:<name> do bridging. */
-	if (!strncmp(BRIDGE_PFX, arg, strlen(BRIDGE_PFX))) {
-		arg += strlen(BRIDGE_PFX);
-		bridging = true;
-	}
-
-	/* A mac address may follow the bridge name or IP address */
-	p = strchr(arg, ':');
-	if (p) {
-		str2mac(p+1, conf.mac);
-		add_pci_feature(dev, VIRTIO_NET_F_MAC);
-		*p = '\0';
-	}
-
-	/* arg is now either an IP address or a bridge name */
-	if (bridging)
-		add_to_bridge(ipfd, tapif, arg);
-	else
-		ip = str2ip(arg);
-
-	/* Set up the tun device. */
-	configure_device(ipfd, tapif, ip);
-
-	/* Expect Guest to handle everything except UFO */
-	add_pci_feature(dev, VIRTIO_NET_F_CSUM);
-	add_pci_feature(dev, VIRTIO_NET_F_GUEST_CSUM);
-	add_pci_feature(dev, VIRTIO_NET_F_GUEST_TSO4);
-	add_pci_feature(dev, VIRTIO_NET_F_GUEST_TSO6);
-	add_pci_feature(dev, VIRTIO_NET_F_GUEST_ECN);
-	add_pci_feature(dev, VIRTIO_NET_F_HOST_TSO4);
-	add_pci_feature(dev, VIRTIO_NET_F_HOST_TSO6);
-	add_pci_feature(dev, VIRTIO_NET_F_HOST_ECN);
-	/* We handle indirect ring entries */
-	add_pci_feature(dev, VIRTIO_RING_F_INDIRECT_DESC);
-	set_device_config(dev, &conf, sizeof(conf));
-
-	/* We don't need the socket any more; setup is done. */
-	close(ipfd);
-
-	if (bridging)
-		verbose("device %u: tun %s attached to bridge: %s\n",
-			devices.device_num, tapif, arg);
-	else
-		verbose("device %u: tun %s: %s\n",
-			devices.device_num, tapif, arg);
-}
-/*:*/
-
-/* This hangs off device->priv. */
-struct vblk_info {
-	/* The size of the file. */
-	off64_t len;
-
-	/* The file descriptor for the file. */
-	int fd;
-
-};
-
-/*L:210
- * The Disk
- *
- * The disk only has one virtqueue, so it only has one thread.  It is really
- * simple: the Guest asks for a block number and we read or write that position
- * in the file.
- *
- * Before we serviced each virtqueue in a separate thread, that was unacceptably
- * slow: the Guest waits until the read is finished before running anything
- * else, even if it could have been doing useful work.
- *
- * We could have used async I/O, except it's reputed to suck so hard that
- * characters actually go missing from your code when you try to use it.
- */
-static void blk_request(struct virtqueue *vq)
-{
-	struct vblk_info *vblk = vq->dev->priv;
-	unsigned int head, out_num, in_num, wlen;
-	int ret, i;
-	u8 *in;
-	struct virtio_blk_outhdr out;
-	struct iovec iov[vq->vring.num];
-	off64_t off;
-
-	/*
-	 * Get the next request, where we normally wait.  It triggers the
-	 * interrupt to acknowledge previously serviced requests (if any).
-	 */
-	head = wait_for_vq_desc(vq, iov, &out_num, &in_num);
-
-	/* Copy the output header from the front of the iov (adjusts iov) */
-	iov_consume(vq->dev, iov, out_num, &out, sizeof(out));
-
-	/* Find and trim end of iov input array, for our status byte. */
-	in = NULL;
-	for (i = out_num + in_num - 1; i >= out_num; i--) {
-		if (iov[i].iov_len > 0) {
-			in = iov[i].iov_base + iov[i].iov_len - 1;
-			iov[i].iov_len--;
-			break;
-		}
-	}
-	if (!in)
-		bad_driver_vq(vq, "Bad virtblk cmd with no room for status");
-
-	/*
-	 * For historical reasons, block operations are expressed in 512 byte
-	 * "sectors".
-	 */
-	off = out.sector * 512;
-
-	if (out.type & VIRTIO_BLK_T_OUT) {
-		/*
-		 * Write
-		 *
-		 * Move to the right location in the block file.  This can fail
-		 * if they try to write past end.
-		 */
-		if (lseek64(vblk->fd, off, SEEK_SET) != off)
-			err(1, "Bad seek to sector %llu", out.sector);
-
-		ret = writev(vblk->fd, iov, out_num);
-		verbose("WRITE to sector %llu: %i\n", out.sector, ret);
-
-		/*
-		 * Grr... Now we know how long the descriptor they sent was, we
-		 * make sure they didn't try to write over the end of the block
-		 * file (possibly extending it).
-		 */
-		if (ret > 0 && off + ret > vblk->len) {
-			/* Trim it back to the correct length */
-			ftruncate64(vblk->fd, vblk->len);
-			/* Die, bad Guest, die. */
-			bad_driver_vq(vq, "Write past end %llu+%u", off, ret);
-		}
-
-		wlen = sizeof(*in);
-		*in = (ret >= 0 ? VIRTIO_BLK_S_OK : VIRTIO_BLK_S_IOERR);
-	} else if (out.type & VIRTIO_BLK_T_FLUSH) {
-		/* Flush */
-		ret = fdatasync(vblk->fd);
-		verbose("FLUSH fdatasync: %i\n", ret);
-		wlen = sizeof(*in);
-		*in = (ret >= 0 ? VIRTIO_BLK_S_OK : VIRTIO_BLK_S_IOERR);
-	} else {
-		/*
-		 * Read
-		 *
-		 * Move to the right location in the block file.  This can fail
-		 * if they try to read past end.
-		 */
-		if (lseek64(vblk->fd, off, SEEK_SET) != off)
-			err(1, "Bad seek to sector %llu", out.sector);
-
-		ret = readv(vblk->fd, iov + out_num, in_num);
-		if (ret >= 0) {
-			wlen = sizeof(*in) + ret;
-			*in = VIRTIO_BLK_S_OK;
-		} else {
-			wlen = sizeof(*in);
-			*in = VIRTIO_BLK_S_IOERR;
-		}
-	}
-
-	/* Finished that request. */
-	add_used(vq, head, wlen);
-}
-
-/*L:198 This actually sets up a virtual block device. */
-static void setup_block_file(const char *filename)
-{
-	struct device *dev;
-	struct vblk_info *vblk;
-	struct virtio_blk_config conf;
-
-	/* Create the device. */
-	dev = new_pci_device("block", VIRTIO_ID_BLOCK, 0x01, 0x80);
-
-	/* The device has one virtqueue, where the Guest places requests. */
-	add_pci_virtqueue(dev, blk_request, "request");
-
-	/* Allocate the room for our own bookkeeping */
-	vblk = dev->priv = malloc(sizeof(*vblk));
-
-	/* First we open the file and store the length. */
-	vblk->fd = open_or_die(filename, O_RDWR|O_LARGEFILE);
-	vblk->len = lseek64(vblk->fd, 0, SEEK_END);
-
-	/* Tell Guest how many sectors this device has. */
-	conf.capacity = cpu_to_le64(vblk->len / 512);
-
-	/*
-	 * Tell Guest not to put in too many descriptors at once: two are used
-	 * for the in and out elements.
-	 */
-	add_pci_feature(dev, VIRTIO_BLK_F_SEG_MAX);
-	conf.seg_max = cpu_to_le32(VIRTQUEUE_NUM - 2);
-
-	set_device_config(dev, &conf, sizeof(struct virtio_blk_config));
-
-	verbose("device %u: virtblock %llu sectors\n",
-		devices.device_num, le64_to_cpu(conf.capacity));
-}
-
-/*L:211
- * Our random number generator device reads from /dev/urandom into the Guest's
- * input buffers.  The usual case is that the Guest doesn't want random numbers
- * and so has no buffers although /dev/urandom is still readable, whereas
- * console is the reverse.
- *
- * The same logic applies, however.
- */
-struct rng_info {
-	int rfd;
-};
-
-static void rng_input(struct virtqueue *vq)
-{
-	int len;
-	unsigned int head, in_num, out_num, totlen = 0;
-	struct rng_info *rng_info = vq->dev->priv;
-	struct iovec iov[vq->vring.num];
-
-	/* First we need a buffer from the Guests's virtqueue. */
-	head = wait_for_vq_desc(vq, iov, &out_num, &in_num);
-	if (out_num)
-		bad_driver_vq(vq, "Output buffers in rng?");
-
-	/*
-	 * Just like the console write, we loop to cover the whole iovec.
-	 * In this case, short reads actually happen quite a bit.
-	 */
-	while (!iov_empty(iov, in_num)) {
-		len = readv(rng_info->rfd, iov, in_num);
-		if (len <= 0)
-			err(1, "Read from /dev/urandom gave %i", len);
-		iov_consume(vq->dev, iov, in_num, NULL, len);
-		totlen += len;
-	}
-
-	/* Tell the Guest about the new input. */
-	add_used(vq, head, totlen);
-}
-
-/*L:199
- * This creates a "hardware" random number device for the Guest.
- */
-static void setup_rng(void)
-{
-	struct device *dev;
-	struct rng_info *rng_info = malloc(sizeof(*rng_info));
-
-	/* Our device's private info simply contains the /dev/urandom fd. */
-	rng_info->rfd = open_or_die("/dev/urandom", O_RDONLY);
-
-	/* Create the new device. */
-	dev = new_pci_device("rng", VIRTIO_ID_RNG, 0xff, 0);
-	dev->priv = rng_info;
-
-	/* The device has one virtqueue, where the Guest places inbufs. */
-	add_pci_virtqueue(dev, rng_input, "input");
-
-	/* We don't have any configuration space */
-	no_device_config(dev);
-
-	verbose("device %u: rng\n", devices.device_num);
-}
-/* That's the end of device setup. */
-
-/*L:230 Reboot is pretty easy: clean up and exec() the Launcher afresh. */
-static void __attribute__((noreturn)) restart_guest(void)
-{
-	unsigned int i;
-
-	/*
-	 * Since we don't track all open fds, we simply close everything beyond
-	 * stderr.
-	 */
-	for (i = 3; i < FD_SETSIZE; i++)
-		close(i);
-
-	/* Reset all the devices (kills all threads). */
-	cleanup_devices();
-
-	execv(main_args[0], main_args);
-	err(1, "Could not exec %s", main_args[0]);
-}
-
-/*L:220
- * Finally we reach the core of the Launcher which runs the Guest, serves
- * its input and output, and finally, lays it to rest.
- */
-static void __attribute__((noreturn)) run_guest(void)
-{
-	for (;;) {
-		struct lguest_pending notify;
-		int readval;
-
-		/* We read from the /dev/lguest device to run the Guest. */
-		readval = pread(lguest_fd, &notify, sizeof(notify), cpu_id);
-		if (readval == sizeof(notify)) {
-			if (notify.trap == 13) {
-				verbose("Emulating instruction at %#x\n",
-					getreg(eip));
-				emulate_insn(notify.insn);
-			} else if (notify.trap == 14) {
-				verbose("Emulating MMIO at %#x\n",
-					getreg(eip));
-				emulate_mmio(notify.addr, notify.insn);
-			} else
-				errx(1, "Unknown trap %i addr %#08x\n",
-				     notify.trap, notify.addr);
-		/* ENOENT means the Guest died.  Reading tells us why. */
-		} else if (errno == ENOENT) {
-			char reason[1024] = { 0 };
-			pread(lguest_fd, reason, sizeof(reason)-1, cpu_id);
-			errx(1, "%s", reason);
-		/* ERESTART means that we need to reboot the guest */
-		} else if (errno == ERESTART) {
-			restart_guest();
-		/* Anything else means a bug or incompatible change. */
-		} else
-			err(1, "Running guest failed");
-	}
-}
-/*L:240
- * This is the end of the Launcher.  The good news: we are over halfway
- * through!  The bad news: the most fiendish part of the code still lies ahead
- * of us.
- *
- * Are you ready?  Take a deep breath and join me in the core of the Host, in
- * "make Host".
-:*/
-
-static struct option opts[] = {
-	{ "verbose", 0, NULL, 'v' },
-	{ "tunnet", 1, NULL, 't' },
-	{ "block", 1, NULL, 'b' },
-	{ "rng", 0, NULL, 'r' },
-	{ "initrd", 1, NULL, 'i' },
-	{ "username", 1, NULL, 'u' },
-	{ "chroot", 1, NULL, 'c' },
-	{ NULL },
-};
-static void usage(void)
-{
-	errx(1, "Usage: lguest [--verbose] "
-	     "[--tunnet=(<ipaddr>:<macaddr>|bridge:<bridgename>:<macaddr>)\n"
-	     "|--block=<filename>|--initrd=<filename>]...\n"
-	     "<mem-in-mb> vmlinux [args...]");
-}
-
-/*L:105 The main routine is where the real work begins: */
-int main(int argc, char *argv[])
-{
-	/* Memory, code startpoint and size of the (optional) initrd. */
-	unsigned long mem = 0, start, initrd_size = 0;
-	/* Two temporaries. */
-	int i, c;
-	/* The boot information for the Guest. */
-	struct boot_params *boot;
-	/* If they specify an initrd file to load. */
-	const char *initrd_name = NULL;
-
-	/* Password structure for initgroups/setres[gu]id */
-	struct passwd *user_details = NULL;
-
-	/* Directory to chroot to */
-	char *chroot_path = NULL;
-
-	/* Save the args: we "reboot" by execing ourselves again. */
-	main_args = argv;
-
-	/*
-	 * First we initialize the device list.  We remember next interrupt
-	 * number to use for devices (1: remember that 0 is used by the timer).
-	 */
-	devices.next_irq = 1;
-
-	/* We're CPU 0.  In fact, that's the only CPU possible right now. */
-	cpu_id = 0;
-
-	/*
-	 * We need to know how much memory so we can set up the device
-	 * descriptor and memory pages for the devices as we parse the command
-	 * line.  So we quickly look through the arguments to find the amount
-	 * of memory now.
-	 */
-	for (i = 1; i < argc; i++) {
-		if (argv[i][0] != '-') {
-			mem = atoi(argv[i]) * 1024 * 1024;
-			/*
-			 * We start by mapping anonymous pages over all of
-			 * guest-physical memory range.  This fills it with 0,
-			 * and ensures that the Guest won't be killed when it
-			 * tries to access it.
-			 */
-			guest_base = map_zeroed_pages(mem / getpagesize()
-						      + DEVICE_PAGES);
-			guest_limit = mem;
-			guest_max = guest_mmio = mem + DEVICE_PAGES*getpagesize();
-			break;
-		}
-	}
-
-	/* If we exit via err(), this kills all the threads, restores tty. */
-	atexit(cleanup_devices);
-
-	/* We always have a console device, and it's always device 1. */
-	setup_console();
-
-	/* The options are fairly straight-forward */
-	while ((c = getopt_long(argc, argv, "v", opts, NULL)) != EOF) {
-		switch (c) {
-		case 'v':
-			verbose = true;
-			break;
-		case 't':
-			setup_tun_net(optarg);
-			break;
-		case 'b':
-			setup_block_file(optarg);
-			break;
-		case 'r':
-			setup_rng();
-			break;
-		case 'i':
-			initrd_name = optarg;
-			break;
-		case 'u':
-			user_details = getpwnam(optarg);
-			if (!user_details)
-				err(1, "getpwnam failed, incorrect username?");
-			break;
-		case 'c':
-			chroot_path = optarg;
-			break;
-		default:
-			warnx("Unknown argument %s", argv[optind]);
-			usage();
-		}
-	}
-	/*
-	 * After the other arguments we expect memory and kernel image name,
-	 * followed by command line arguments for the kernel.
-	 */
-	if (optind + 2 > argc)
-		usage();
-
-	verbose("Guest base is at %p\n", guest_base);
-
-	/* Initialize the (fake) PCI host bridge device. */
-	init_pci_host_bridge();
-
-	/* Now we load the kernel */
-	start = load_kernel(open_or_die(argv[optind+1], O_RDONLY));
-
-	/* Boot information is stashed at physical address 0 */
-	boot = from_guest_phys(0);
-
-	/* Map the initrd image if requested (at top of physical memory) */
-	if (initrd_name) {
-		initrd_size = load_initrd(initrd_name, mem);
-		/*
-		 * These are the location in the Linux boot header where the
-		 * start and size of the initrd are expected to be found.
-		 */
-		boot->hdr.ramdisk_image = mem - initrd_size;
-		boot->hdr.ramdisk_size = initrd_size;
-		/* The bootloader type 0xFF means "unknown"; that's OK. */
-		boot->hdr.type_of_loader = 0xFF;
-	}
-
-	/*
-	 * The Linux boot header contains an "E820" memory map: ours is a
-	 * simple, single region.
-	 */
-	boot->e820_entries = 1;
-	boot->e820_table[0] = ((struct e820_entry) { 0, mem, E820_TYPE_RAM });
-	/*
-	 * The boot header contains a command line pointer: we put the command
-	 * line after the boot header.
-	 */
-	boot->hdr.cmd_line_ptr = to_guest_phys(boot + 1);
-	/* We use a simple helper to copy the arguments separated by spaces. */
-	concat((char *)(boot + 1), argv+optind+2);
-
-	/* Set kernel alignment to 16M (CONFIG_PHYSICAL_ALIGN) */
-	boot->hdr.kernel_alignment = 0x1000000;
-
-	/* Boot protocol version: 2.07 supports the fields for lguest. */
-	boot->hdr.version = 0x207;
-
-	/* X86_SUBARCH_LGUEST tells the Guest it's an lguest. */
-	boot->hdr.hardware_subarch = X86_SUBARCH_LGUEST;
-
-	/* Tell the entry path not to try to reload segment registers. */
-	boot->hdr.loadflags |= KEEP_SEGMENTS;
-
-	/* We don't support tboot: */
-	boot->tboot_addr = 0;
-
-	/* Ensure this is 0 to prevent APM from loading: */
-	boot->apm_bios_info.version = 0;
-
-	/* We tell the kernel to initialize the Guest. */
-	tell_kernel(start);
-
-	/* Ensure that we terminate if a device-servicing child dies. */
-	signal(SIGCHLD, kill_launcher);
-
-	/* If requested, chroot to a directory */
-	if (chroot_path) {
-		if (chroot(chroot_path) != 0)
-			err(1, "chroot(\"%s\") failed", chroot_path);
-
-		if (chdir("/") != 0)
-			err(1, "chdir(\"/\") failed");
-
-		verbose("chroot done\n");
-	}
-
-	/* If requested, drop privileges */
-	if (user_details) {
-		uid_t u;
-		gid_t g;
-
-		u = user_details->pw_uid;
-		g = user_details->pw_gid;
-
-		if (initgroups(user_details->pw_name, g) != 0)
-			err(1, "initgroups failed");
-
-		if (setresgid(g, g, g) != 0)
-			err(1, "setresgid failed");
-
-		if (setresuid(u, u, u) != 0)
-			err(1, "setresuid failed");
-
-		verbose("Dropping privileges completed\n");
-	}
-
-	/* Finally, run the Guest.  This doesn't return. */
-	run_guest();
-}
-/*:*/
-
-/*M:999
- * Mastery is done: you now know everything I do.
- *
- * But surely you have seen code, features and bugs in your wanderings which
- * you now yearn to attack?  That is the real game, and I look forward to you
- * patching and forking lguest into the Your-Name-Here-visor.
- *
- * Farewell, and good coding!
- * Rusty Russell.
- */
diff --git a/tools/lguest/lguest.txt b/tools/lguest/lguest.txt
deleted file mode 100644
index 06e1f4649511..000000000000
--- a/tools/lguest/lguest.txt
+++ /dev/null
@@ -1,125 +0,0 @@
-      __
- (___()'`;  Rusty's Remarkably Unreliable Guide to Lguest
- /,    /`      - or, A Young Coder's Illustrated Hypervisor
- \\"--\\    http://lguest.ozlabs.org
-
-Lguest is designed to be a minimal 32-bit x86 hypervisor for the Linux kernel,
-for Linux developers and users to experiment with virtualization with the
-minimum of complexity.  Nonetheless, it should have sufficient features to
-make it useful for specific tasks, and, of course, you are encouraged to fork
-and enhance it (see drivers/lguest/README).
-
-Features:
-
-- Kernel module which runs in a normal kernel.
-- Simple I/O model for communication.
-- Simple program to create new guests.
-- Logo contains cute puppies: http://lguest.ozlabs.org
-
-Developer features:
-
-- Fun to hack on.
-- No ABI: being tied to a specific kernel anyway, you can change anything.
-- Many opportunities for improvement or feature implementation.
-
-Running Lguest:
-
-- The easiest way to run lguest is to use same kernel as guest and host.
-  You can configure them differently, but usually it's easiest not to.
-
-  You will need to configure your kernel with the following options:
-
-  "Processor type and features":
-     "Paravirtualized guest support" = Y
-        "Lguest guest support" = Y
-     "High Memory Support" = off/4GB
-     "Alignment value to which kernel should be aligned" = 0x100000
-        (CONFIG_PARAVIRT=y, CONFIG_LGUEST_GUEST=y, CONFIG_HIGHMEM64G=n and
-         CONFIG_PHYSICAL_ALIGN=0x100000)
-
-  "Device Drivers":
-     "Block devices"
-        "Virtio block driver" = M/Y
-     "Network device support"
-        "Universal TUN/TAP device driver support" = M/Y
-        "Virtio network driver" = M/Y
-           (CONFIG_VIRTIO_BLK=m, CONFIG_VIRTIO_NET=m and CONFIG_TUN=m)
-
-  "Virtualization"
-     "Linux hypervisor example code" = M/Y
-        (CONFIG_LGUEST=m)
-
-- A tool called "lguest" is available in this directory: type "make"
-  to build it.  If you didn't build your kernel in-tree, use "make
-  O=<builddir>".
-
-- Create or find a root disk image.  There are several useful ones
-  around, such as the xm-test tiny root image at
-	  http://xm-test.xensource.com/ramdisks/initrd-1.1-i386.img
-
-  For more serious work, I usually use a distribution ISO image and
-  install it under qemu, then make multiple copies:
-
-	  dd if=/dev/zero of=rootfile bs=1M count=2048
-	  qemu -cdrom image.iso -hda rootfile -net user -net nic -boot d
-
-  Make sure that you install a getty on /dev/hvc0 if you want to log in on the
-  console!
-
-- "modprobe lg" if you built it as a module.
-
-- Run an lguest as root:
-
-      tools/lguest/lguest 64 vmlinux --tunnet=192.168.19.1 \
-        --block=rootfile root=/dev/vda
-
-   Explanation:
-    64: the amount of memory to use, in MB.
-
-    vmlinux: the kernel image found in the top of your build directory.  You
-       can also use a standard bzImage.
-
-    --tunnet=192.168.19.1: configures a "tap" device for networking with this
-       IP address.
-
-    --block=rootfile: a file or block device which becomes /dev/vda
-       inside the guest.
-
-    root=/dev/vda: this (and anything else on the command line) are
-       kernel boot parameters.
-
-- Configuring networking.  I usually have the host masquerade, using
-  "iptables -t nat -A POSTROUTING -o eth0 -j MASQUERADE" and "echo 1 >
-  /proc/sys/net/ipv4/ip_forward".  In this example, I would configure
-  eth0 inside the guest at 192.168.19.2.
-
-  Another method is to bridge the tap device to an external interface
-  using --tunnet=bridge:<bridgename>, and perhaps run dhcp on the guest
-  to obtain an IP address.  The bridge needs to be configured first:
-  this option simply adds the tap interface to it.
-
-  A simple example on my system:
-
-    ifconfig eth0 0.0.0.0
-    brctl addbr lg0
-    ifconfig lg0 up
-    brctl addif lg0 eth0
-    dhclient lg0
-
-  Then use --tunnet=bridge:lg0 when launching the guest.
-
-  See:
-  
-    http://www.linuxfoundation.org/collaborate/workgroups/networking/bridge
-    
-  for general information on how to get bridging to work.
-
-- Random number generation. Using the --rng option will provide a
-  /dev/hwrng in the guest that will read from the host's /dev/random.
-  Use this option in conjunction with rng-tools (see ../hw_random.txt)
-  to provide entropy to the guest kernel's /dev/random.
-
-There is a helpful mailing list at http://ozlabs.org/mailman/listinfo/lguest
-
-Good luck!
-Rusty Russell rusty@rustcorp.com.au.
-- 
cgit v1.2.3-71-gd317


From ea5311c7e752dbec9bfbdd79992a8772b37f32fa Mon Sep 17 00:00:00 2001
From: Alex Williamson <alex.williamson@redhat.com>
Date: Thu, 10 Aug 2017 10:54:31 -0600
Subject: PCI: Fix PCIe capability sizes

PCI_CAP_EXP_ENDPOINT_SIZEOF_V1 defines the size of the PCIe capability
structure for v1 devices with link, but we also have a need in the vfio
code for sizing the capability for devices without link, such as Root
Complex Integrated Endpoints.  Create a separate define for this ending the
structure before the link fields.

Additionally, this reveals that PCI_CAP_EXP_ENDPOINT_SIZEOF_V2 is currently
incorrect, ending the capability length before the v2 link fields.  Rename
this to specify an RC Integrated Endpoint (no link) capability length and
move PCI_CAP_EXP_ENDPOINT_SIZEOF_V2 to include the link fields as we have
for the v1 version.

Signed-off-by: Alex Williamson <alex.williamson@redhat.com>
[bhelgaas: add "_" in "PCI_CAP_EXP_RC ENDPOINT_SIZEOF_V2 44"]
Signed-off-by: Bjorn Helgaas <bhelgaas@google.com>
Reviewed-by: Eric Auger <eric.auger@redhat.com>
---
 include/uapi/linux/pci_regs.h | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

(limited to 'include/uapi/linux')

diff --git a/include/uapi/linux/pci_regs.h b/include/uapi/linux/pci_regs.h
index c22d3ebaca20..e185d2d39ea6 100644
--- a/include/uapi/linux/pci_regs.h
+++ b/include/uapi/linux/pci_regs.h
@@ -513,6 +513,7 @@
 #define  PCI_EXP_DEVSTA_URD	0x0008	/* Unsupported Request Detected */
 #define  PCI_EXP_DEVSTA_AUXPD	0x0010	/* AUX Power Detected */
 #define  PCI_EXP_DEVSTA_TRPND	0x0020	/* Transactions Pending */
+#define PCI_CAP_EXP_RC_ENDPOINT_SIZEOF_V1	12	/* v1 endpoints without link end here */
 #define PCI_EXP_LNKCAP		12	/* Link Capabilities */
 #define  PCI_EXP_LNKCAP_SLS	0x0000000f /* Supported Link Speeds */
 #define  PCI_EXP_LNKCAP_SLS_2_5GB 0x00000001 /* LNKCAP2 SLS Vector bit 0 */
@@ -556,7 +557,7 @@
 #define  PCI_EXP_LNKSTA_DLLLA	0x2000	/* Data Link Layer Link Active */
 #define  PCI_EXP_LNKSTA_LBMS	0x4000	/* Link Bandwidth Management Status */
 #define  PCI_EXP_LNKSTA_LABS	0x8000	/* Link Autonomous Bandwidth Status */
-#define PCI_CAP_EXP_ENDPOINT_SIZEOF_V1	20	/* v1 endpoints end here */
+#define PCI_CAP_EXP_ENDPOINT_SIZEOF_V1	20	/* v1 endpoints with link end here */
 #define PCI_EXP_SLTCAP		20	/* Slot Capabilities */
 #define  PCI_EXP_SLTCAP_ABP	0x00000001 /* Attention Button Present */
 #define  PCI_EXP_SLTCAP_PCP	0x00000002 /* Power Controller Present */
@@ -639,7 +640,7 @@
 #define  PCI_EXP_DEVCTL2_OBFF_MSGB_EN	0x4000	/* Enable OBFF Message type B */
 #define  PCI_EXP_DEVCTL2_OBFF_WAKE_EN	0x6000	/* OBFF using WAKE# signaling */
 #define PCI_EXP_DEVSTA2		42	/* Device Status 2 */
-#define PCI_CAP_EXP_ENDPOINT_SIZEOF_V2	44	/* v2 endpoints end here */
+#define PCI_CAP_EXP_RC_ENDPOINT_SIZEOF_V2	44	/* v2 endpoints without link end here */
 #define PCI_EXP_LNKCAP2		44	/* Link Capabilities 2 */
 #define  PCI_EXP_LNKCAP2_SLS_2_5GB	0x00000002 /* Supported Speed 2.5GT/s */
 #define  PCI_EXP_LNKCAP2_SLS_5_0GB	0x00000004 /* Supported Speed 5.0GT/s */
@@ -647,6 +648,7 @@
 #define  PCI_EXP_LNKCAP2_CROSSLINK	0x00000100 /* Crosslink supported */
 #define PCI_EXP_LNKCTL2		48	/* Link Control 2 */
 #define PCI_EXP_LNKSTA2		50	/* Link Status 2 */
+#define PCI_CAP_EXP_ENDPOINT_SIZEOF_V2	52	/* v2 endpoints with link end here */
 #define PCI_EXP_SLTCAP2		52	/* Slot Capabilities 2 */
 #define PCI_EXP_SLTCTL2		56	/* Slot Control 2 */
 #define PCI_EXP_SLTSTA2		58	/* Slot Status 2 */
-- 
cgit v1.2.3-71-gd317


From f20c4ea49ec4708de97248927ac6138c2d14eba9 Mon Sep 17 00:00:00 2001
From: Dongdong Liu <liudongdong3@huawei.com>
Date: Sat, 19 Aug 2017 17:07:20 +0800
Subject: PCI/DPC: Add eDPC support

Add eDPC support. Get and print the RP PIO error information when the
trigger condition is RP PIO error.

For more information on eDPC, please see PCI Express Base Specification
Revision 3.1, section 6.2.10.3, or view the PCI-SIG eDPC ECN here:
https://pcisig.com/sites/default/files/specification_documents/ECN_Enhanced_DPC_2012-11-19_final.pdf

Signed-off-by: Dongdong Liu <liudongdong3@huawei.com>
Signed-off-by: Bjorn Helgaas <bhelgaas@google.com>
Reviewed-by: Keith Busch <keith.busch@intel.com>
---
 drivers/pci/pcie/pcie-dpc.c   | 162 ++++++++++++++++++++++++++++++++++++++++++
 include/uapi/linux/pci_regs.h |  10 +++
 2 files changed, 172 insertions(+)

(limited to 'include/uapi/linux')

diff --git a/drivers/pci/pcie/pcie-dpc.c b/drivers/pci/pcie/pcie-dpc.c
index c39f32e42b4d..8428d7455e49 100644
--- a/drivers/pci/pcie/pcie-dpc.c
+++ b/drivers/pci/pcie/pcie-dpc.c
@@ -16,11 +16,55 @@
 #include <linux/pcieport_if.h>
 #include "../pci.h"
 
+struct rp_pio_header_log_regs {
+	u32 dw0;
+	u32 dw1;
+	u32 dw2;
+	u32 dw3;
+};
+
+struct dpc_rp_pio_regs {
+	u32 status;
+	u32 mask;
+	u32 severity;
+	u32 syserror;
+	u32 exception;
+
+	struct rp_pio_header_log_regs header_log;
+	u32 impspec_log;
+	u32 tlp_prefix_log[4];
+	u32 log_size;
+	u16 first_error;
+};
+
 struct dpc_dev {
 	struct pcie_device	*dev;
 	struct work_struct	work;
 	int			cap_pos;
 	bool			rp;
+	u32			rp_pio_status;
+};
+
+static const char * const rp_pio_error_string[] = {
+	"Configuration Request received UR Completion",	 /* Bit Position 0  */
+	"Configuration Request received CA Completion",	 /* Bit Position 1  */
+	"Configuration Request Completion Timeout",	 /* Bit Position 2  */
+	NULL,
+	NULL,
+	NULL,
+	NULL,
+	NULL,
+	"I/O Request received UR Completion",		 /* Bit Position 8  */
+	"I/O Request received CA Completion",		 /* Bit Position 9  */
+	"I/O Request Completion Timeout",		 /* Bit Position 10 */
+	NULL,
+	NULL,
+	NULL,
+	NULL,
+	NULL,
+	"Memory Request received UR Completion",	 /* Bit Position 16 */
+	"Memory Request received CA Completion",	 /* Bit Position 17 */
+	"Memory Request Completion Timeout",		 /* Bit Position 18 */
 };
 
 static int dpc_wait_rp_inactive(struct dpc_dev *dpc)
@@ -79,10 +123,124 @@ static void interrupt_event_handler(struct work_struct *work)
 	dpc_wait_link_inactive(pdev);
 	if (dpc->rp && dpc_wait_rp_inactive(dpc))
 		return;
+	if (dpc->rp && dpc->rp_pio_status) {
+		pci_write_config_dword(pdev,
+				      dpc->cap_pos + PCI_EXP_DPC_RP_PIO_STATUS,
+				      dpc->rp_pio_status);
+		dpc->rp_pio_status = 0;
+	}
+
 	pci_write_config_word(pdev, dpc->cap_pos + PCI_EXP_DPC_STATUS,
 		PCI_EXP_DPC_STATUS_TRIGGER | PCI_EXP_DPC_STATUS_INTERRUPT);
 }
 
+static void dpc_rp_pio_print_tlp_header(struct device *dev,
+					struct rp_pio_header_log_regs *t)
+{
+	dev_err(dev, "TLP Header: %#010x %#010x %#010x %#010x\n",
+		t->dw0, t->dw1, t->dw2, t->dw3);
+}
+
+static void dpc_rp_pio_print_error(struct dpc_dev *dpc,
+				   struct dpc_rp_pio_regs *rp_pio)
+{
+	struct device *dev = &dpc->dev->device;
+	int i;
+	u32 status;
+
+	dev_err(dev, "rp_pio_status: %#010x, rp_pio_mask: %#010x\n",
+		rp_pio->status, rp_pio->mask);
+
+	dev_err(dev, "RP PIO severity=%#010x, syserror=%#010x, exception=%#010x\n",
+		rp_pio->severity, rp_pio->syserror, rp_pio->exception);
+
+	status = (rp_pio->status & ~rp_pio->mask);
+
+	for (i = 0; i < ARRAY_SIZE(rp_pio_error_string); i++) {
+		if (!(status & (1 << i)))
+			continue;
+
+		dev_err(dev, "[%2d] %s%s\n", i, rp_pio_error_string[i],
+			rp_pio->first_error == i ? " (First)" : "");
+	}
+
+	dpc_rp_pio_print_tlp_header(dev, &rp_pio->header_log);
+	if (rp_pio->log_size == 4)
+		return;
+	dev_err(dev, "RP PIO ImpSpec Log %#010x\n", rp_pio->impspec_log);
+
+	for (i = 0; i < rp_pio->log_size - 5; i++)
+		dev_err(dev, "TLP Prefix Header: dw%d, %#010x\n", i,
+			rp_pio->tlp_prefix_log[i]);
+}
+
+static void dpc_rp_pio_get_info(struct dpc_dev *dpc,
+				struct dpc_rp_pio_regs *rp_pio)
+{
+	struct pci_dev *pdev = dpc->dev->port;
+	struct device *dev = &dpc->dev->device;
+	int i;
+	u16 cap;
+	u16 status;
+
+	pci_read_config_dword(pdev, dpc->cap_pos + PCI_EXP_DPC_RP_PIO_STATUS,
+			      &rp_pio->status);
+	pci_read_config_dword(pdev, dpc->cap_pos + PCI_EXP_DPC_RP_PIO_MASK,
+			      &rp_pio->mask);
+
+	pci_read_config_dword(pdev, dpc->cap_pos + PCI_EXP_DPC_RP_PIO_SEVERITY,
+			      &rp_pio->severity);
+	pci_read_config_dword(pdev, dpc->cap_pos + PCI_EXP_DPC_RP_PIO_SYSERROR,
+			      &rp_pio->syserror);
+	pci_read_config_dword(pdev, dpc->cap_pos + PCI_EXP_DPC_RP_PIO_EXCEPTION,
+			      &rp_pio->exception);
+
+	/* Get First Error Pointer */
+	pci_read_config_word(pdev, dpc->cap_pos + PCI_EXP_DPC_STATUS, &status);
+	rp_pio->first_error = (status & 0x1f00) >> 8;
+
+	pci_read_config_word(pdev, dpc->cap_pos + PCI_EXP_DPC_CAP, &cap);
+	rp_pio->log_size = (cap & PCI_EXP_DPC_RP_PIO_LOG_SIZE) >> 8;
+	if (rp_pio->log_size < 4 || rp_pio->log_size > 9) {
+		dev_err(dev, "RP PIO log size %u is invalid\n",
+			rp_pio->log_size);
+		return;
+	}
+
+	pci_read_config_dword(pdev,
+			      dpc->cap_pos + PCI_EXP_DPC_RP_PIO_HEADER_LOG,
+			      &rp_pio->header_log.dw0);
+	pci_read_config_dword(pdev,
+			      dpc->cap_pos + PCI_EXP_DPC_RP_PIO_HEADER_LOG + 4,
+			      &rp_pio->header_log.dw1);
+	pci_read_config_dword(pdev,
+			      dpc->cap_pos + PCI_EXP_DPC_RP_PIO_HEADER_LOG + 8,
+			      &rp_pio->header_log.dw2);
+	pci_read_config_dword(pdev,
+			      dpc->cap_pos + PCI_EXP_DPC_RP_PIO_HEADER_LOG + 12,
+			      &rp_pio->header_log.dw3);
+	if (rp_pio->log_size == 4)
+		return;
+
+	pci_read_config_dword(pdev,
+			      dpc->cap_pos + PCI_EXP_DPC_RP_PIO_IMPSPEC_LOG,
+			      &rp_pio->impspec_log);
+	for (i = 0; i < rp_pio->log_size - 5; i++)
+		pci_read_config_dword(pdev,
+			dpc->cap_pos + PCI_EXP_DPC_RP_PIO_TLPPREFIX_LOG,
+			&rp_pio->tlp_prefix_log[i]);
+}
+
+static void dpc_process_rp_pio_error(struct dpc_dev *dpc)
+{
+	struct dpc_rp_pio_regs rp_pio_regs;
+
+	dpc_rp_pio_get_info(dpc, &rp_pio_regs);
+	dpc_rp_pio_print_error(dpc, &rp_pio_regs);
+
+	dpc->rp_pio_status = rp_pio_regs.status;
+}
+
 static irqreturn_t dpc_irq(int irq, void *context)
 {
 	struct dpc_dev *dpc = (struct dpc_dev *)context;
@@ -109,6 +267,10 @@ static irqreturn_t dpc_irq(int irq, void *context)
 			 (ext_reason == 0) ? "RP PIO error" :
 			 (ext_reason == 1) ? "software trigger" :
 					     "reserved error");
+		/* show RP PIO error detail information */
+		if (reason == 3 && ext_reason == 0)
+			dpc_process_rp_pio_error(dpc);
+
 		schedule_work(&dpc->work);
 	}
 	return IRQ_HANDLED;
diff --git a/include/uapi/linux/pci_regs.h b/include/uapi/linux/pci_regs.h
index c22d3ebaca20..1ce96275531c 100644
--- a/include/uapi/linux/pci_regs.h
+++ b/include/uapi/linux/pci_regs.h
@@ -967,6 +967,7 @@
 #define  PCI_EXP_DPC_CAP_RP_EXT		0x20	/* Root Port Extensions for DPC */
 #define  PCI_EXP_DPC_CAP_POISONED_TLP	0x40	/* Poisoned TLP Egress Blocking Supported */
 #define  PCI_EXP_DPC_CAP_SW_TRIGGER	0x80	/* Software Triggering Supported */
+#define  PCI_EXP_DPC_RP_PIO_LOG_SIZE	0xF00	/* RP PIO log size */
 #define  PCI_EXP_DPC_CAP_DL_ACTIVE	0x1000	/* ERR_COR signal on DL_Active supported */
 
 #define PCI_EXP_DPC_CTL			6	/* DPC control */
@@ -980,6 +981,15 @@
 
 #define PCI_EXP_DPC_SOURCE_ID		10	/* DPC Source Identifier */
 
+#define PCI_EXP_DPC_RP_PIO_STATUS	 0x0C	/* RP PIO Status */
+#define PCI_EXP_DPC_RP_PIO_MASK		 0x10	/* RP PIO MASK */
+#define PCI_EXP_DPC_RP_PIO_SEVERITY	 0x14	/* RP PIO Severity */
+#define PCI_EXP_DPC_RP_PIO_SYSERROR	 0x18	/* RP PIO SysError */
+#define PCI_EXP_DPC_RP_PIO_EXCEPTION	 0x1C	/* RP PIO Exception */
+#define PCI_EXP_DPC_RP_PIO_HEADER_LOG	 0x20	/* RP PIO Header Log */
+#define PCI_EXP_DPC_RP_PIO_IMPSPEC_LOG	 0x30	/* RP PIO ImpSpec Log */
+#define PCI_EXP_DPC_RP_PIO_TLPPREFIX_LOG 0x34	/* RP PIO TLP Prefix Log */
+
 /* Precision Time Measurement */
 #define PCI_PTM_CAP			0x04	    /* PTM Capability */
 #define  PCI_PTM_CAP_REQ		0x00000001  /* Requester capable */
-- 
cgit v1.2.3-71-gd317


From 1177009131bee310421f5c04c43d3777cbacbdc8 Mon Sep 17 00:00:00 2001
From: Arkadi Sharshevsky <arkadis@mellanox.com>
Date: Thu, 24 Aug 2017 08:39:59 +0200
Subject: devlink: Add Ethernet header for dpipe

This will be used by the IPv4 host table which will be introduced in the
following patches. This header is global and can be reused by many
drivers.

Signed-off-by: Arkadi Sharshevsky <arkadis@mellanox.com>
Signed-off-by: Jiri Pirko <jiri@mellanox.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/devlink.h        |  1 +
 include/uapi/linux/devlink.h |  8 ++++++++
 net/core/devlink.c           | 17 +++++++++++++++++
 3 files changed, 26 insertions(+)

(limited to 'include/uapi/linux')

diff --git a/include/net/devlink.h b/include/net/devlink.h
index ed7687bbf5d0..ddb8b5227580 100644
--- a/include/net/devlink.h
+++ b/include/net/devlink.h
@@ -328,6 +328,7 @@ int devlink_dpipe_action_put(struct sk_buff *skb,
 			     struct devlink_dpipe_action *action);
 int devlink_dpipe_match_put(struct sk_buff *skb,
 			    struct devlink_dpipe_match *match);
+extern struct devlink_dpipe_header devlink_dpipe_header_ethernet;
 
 #else
 
diff --git a/include/uapi/linux/devlink.h b/include/uapi/linux/devlink.h
index b0e807ac53bb..a855f8dcc8ee 100644
--- a/include/uapi/linux/devlink.h
+++ b/include/uapi/linux/devlink.h
@@ -226,4 +226,12 @@ enum devlink_dpipe_action_type {
 	DEVLINK_DPIPE_ACTION_TYPE_FIELD_MODIFY,
 };
 
+enum devlink_dpipe_field_ethernet_id {
+	DEVLINK_DPIPE_FIELD_ETHERNET_DST_MAC,
+};
+
+enum devlink_dpipe_header_id {
+	DEVLINK_DPIPE_HEADER_ETHERNET,
+};
+
 #endif /* _UAPI_LINUX_DEVLINK_H_ */
diff --git a/net/core/devlink.c b/net/core/devlink.c
index a0adfc31a3fe..4f6f3d601785 100644
--- a/net/core/devlink.c
+++ b/net/core/devlink.c
@@ -29,6 +29,23 @@
 #define CREATE_TRACE_POINTS
 #include <trace/events/devlink.h>
 
+static struct devlink_dpipe_field devlink_dpipe_fields_ethernet[] = {
+	{
+		.name = "destination_mac",
+		.id = DEVLINK_DPIPE_FIELD_ETHERNET_DST_MAC,
+		.bitwidth = 48,
+	},
+};
+
+struct devlink_dpipe_header devlink_dpipe_header_ethernet = {
+	.name = "ethernet",
+	.id = DEVLINK_DPIPE_HEADER_ETHERNET,
+	.fields = devlink_dpipe_fields_ethernet,
+	.fields_count = ARRAY_SIZE(devlink_dpipe_fields_ethernet),
+	.global = true,
+};
+EXPORT_SYMBOL(devlink_dpipe_header_ethernet);
+
 EXPORT_TRACEPOINT_SYMBOL_GPL(devlink_hwmsg);
 
 static LIST_HEAD(devlink_list);
-- 
cgit v1.2.3-71-gd317


From 3fb886ecea93605a8ea14e258ff3158b8966781e Mon Sep 17 00:00:00 2001
From: Arkadi Sharshevsky <arkadis@mellanox.com>
Date: Thu, 24 Aug 2017 08:40:00 +0200
Subject: devlink: Add IPv4 header for dpipe

This will be used by the IPv4 host table which will be introduced in the
following patches. This header is global and can be reused by many
drivers.

Signed-off-by: Arkadi Sharshevsky <arkadis@mellanox.com>
Signed-off-by: Jiri Pirko <jiri@mellanox.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/devlink.h        |  1 +
 include/uapi/linux/devlink.h |  5 +++++
 net/core/devlink.c           | 17 +++++++++++++++++
 3 files changed, 23 insertions(+)

(limited to 'include/uapi/linux')

diff --git a/include/net/devlink.h b/include/net/devlink.h
index ddb8b5227580..8ff8a6f77f29 100644
--- a/include/net/devlink.h
+++ b/include/net/devlink.h
@@ -329,6 +329,7 @@ int devlink_dpipe_action_put(struct sk_buff *skb,
 int devlink_dpipe_match_put(struct sk_buff *skb,
 			    struct devlink_dpipe_match *match);
 extern struct devlink_dpipe_header devlink_dpipe_header_ethernet;
+extern struct devlink_dpipe_header devlink_dpipe_header_ipv4;
 
 #else
 
diff --git a/include/uapi/linux/devlink.h b/include/uapi/linux/devlink.h
index a855f8dcc8ee..6c172548589d 100644
--- a/include/uapi/linux/devlink.h
+++ b/include/uapi/linux/devlink.h
@@ -230,8 +230,13 @@ enum devlink_dpipe_field_ethernet_id {
 	DEVLINK_DPIPE_FIELD_ETHERNET_DST_MAC,
 };
 
+enum devlink_dpipe_field_ipv4_id {
+	DEVLINK_DPIPE_FIELD_IPV4_DST_IP,
+};
+
 enum devlink_dpipe_header_id {
 	DEVLINK_DPIPE_HEADER_ETHERNET,
+	DEVLINK_DPIPE_HEADER_IPV4,
 };
 
 #endif /* _UAPI_LINUX_DEVLINK_H_ */
diff --git a/net/core/devlink.c b/net/core/devlink.c
index 4f6f3d601785..fcf5dd662882 100644
--- a/net/core/devlink.c
+++ b/net/core/devlink.c
@@ -46,6 +46,23 @@ struct devlink_dpipe_header devlink_dpipe_header_ethernet = {
 };
 EXPORT_SYMBOL(devlink_dpipe_header_ethernet);
 
+static struct devlink_dpipe_field devlink_dpipe_fields_ipv4[] = {
+	{
+		.name = "destination ip",
+		.id = DEVLINK_DPIPE_FIELD_IPV4_DST_IP,
+		.bitwidth = 32,
+	},
+};
+
+struct devlink_dpipe_header devlink_dpipe_header_ipv4 = {
+	.name = "ipv4",
+	.id = DEVLINK_DPIPE_HEADER_IPV4,
+	.fields = devlink_dpipe_fields_ipv4,
+	.fields_count = ARRAY_SIZE(devlink_dpipe_fields_ipv4),
+	.global = true,
+};
+EXPORT_SYMBOL(devlink_dpipe_header_ipv4);
+
 EXPORT_TRACEPOINT_SYMBOL_GPL(devlink_hwmsg);
 
 static LIST_HEAD(devlink_list);
-- 
cgit v1.2.3-71-gd317


From 6ae5fa61d27dcb055f4198bcf6c8dbbf1bb33f52 Mon Sep 17 00:00:00 2001
From: Andi Kleen <ak@linux.intel.com>
Date: Wed, 16 Aug 2017 15:21:54 -0700
Subject: perf/x86: Fix data source decoding for Skylake

Skylake changed the encoding of the PEBS data source field.
Some combinations are not available anymore, but some new cases
e.g. for L4 cache hit are added.

Fix up the conversion table for Skylake, similar as had been done
for Nehalem.

On Skylake server the encoding for L4 actually means persistent
memory. Handle this case too.

To properly describe it in the abstracted perf format I had to add
some new fields. Since a hit can have only one level add a new
field that is an enumeration, not a bit field to describe
the level. It can describe any level. Some numbers are also
used to describe PMEM and LFB.

Also add a new generic remote flag that can be combined with
the generic level to signify a remote cache.

And there is an extension field for the snoop indication to handle
the Forward state.

I didn't add a generic flag for hops because it's not needed
for Skylake.

I changed the existing encodings for older CPUs to also fill in the
new level and remote fields.

Signed-off-by: Andi Kleen <ak@linux.intel.com>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Madhavan Srinivasan <maddy@linux.vnet.ibm.com>
Cc: Michael Ellerman <mpe@ellerman.id.au>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: acme@kernel.org
Cc: jolsa@kernel.org
Link: http://lkml.kernel.org/r/20170816222156.19953-3-andi@firstfloor.org
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 arch/x86/events/intel/core.c    |  2 ++
 arch/x86/events/intel/ds.c      | 51 ++++++++++++++++++++++++++---------------
 arch/x86/events/perf_event.h    |  2 ++
 include/uapi/linux/perf_event.h | 30 ++++++++++++++++++++++--
 4 files changed, 64 insertions(+), 21 deletions(-)

(limited to 'include/uapi/linux')

diff --git a/arch/x86/events/intel/core.c b/arch/x86/events/intel/core.c
index c3439a36dcf9..6f342001ec6a 100644
--- a/arch/x86/events/intel/core.c
+++ b/arch/x86/events/intel/core.c
@@ -4208,6 +4208,8 @@ __init int intel_pmu_init(void)
 						  skl_format_attr);
 		WARN_ON(!x86_pmu.format_attrs);
 		x86_pmu.cpu_events = hsw_events_attrs;
+		intel_pmu_pebs_data_source_skl(
+			boot_cpu_data.x86_model == INTEL_FAM6_SKYLAKE_X);
 		pr_cont("Skylake events, ");
 		break;
 
diff --git a/arch/x86/events/intel/ds.c b/arch/x86/events/intel/ds.c
index 3ccdf8cb4495..98e36e0c791c 100644
--- a/arch/x86/events/intel/ds.c
+++ b/arch/x86/events/intel/ds.c
@@ -49,34 +49,47 @@ union intel_x86_pebs_dse {
  */
 #define P(a, b) PERF_MEM_S(a, b)
 #define OP_LH (P(OP, LOAD) | P(LVL, HIT))
+#define LEVEL(x) P(LVLNUM, x)
+#define REM P(REMOTE, REMOTE)
 #define SNOOP_NONE_MISS (P(SNOOP, NONE) | P(SNOOP, MISS))
 
 /* Version for Sandy Bridge and later */
 static u64 pebs_data_source[] = {
-	P(OP, LOAD) | P(LVL, MISS) | P(LVL, L3) | P(SNOOP, NA),/* 0x00:ukn L3 */
-	OP_LH | P(LVL, L1)  | P(SNOOP, NONE),	/* 0x01: L1 local */
-	OP_LH | P(LVL, LFB) | P(SNOOP, NONE),	/* 0x02: LFB hit */
-	OP_LH | P(LVL, L2)  | P(SNOOP, NONE),	/* 0x03: L2 hit */
-	OP_LH | P(LVL, L3)  | P(SNOOP, NONE),	/* 0x04: L3 hit */
-	OP_LH | P(LVL, L3)  | P(SNOOP, MISS),	/* 0x05: L3 hit, snoop miss */
-	OP_LH | P(LVL, L3)  | P(SNOOP, HIT),	/* 0x06: L3 hit, snoop hit */
-	OP_LH | P(LVL, L3)  | P(SNOOP, HITM),	/* 0x07: L3 hit, snoop hitm */
-	OP_LH | P(LVL, REM_CCE1) | P(SNOOP, HIT),  /* 0x08: L3 miss snoop hit */
-	OP_LH | P(LVL, REM_CCE1) | P(SNOOP, HITM), /* 0x09: L3 miss snoop hitm*/
-	OP_LH | P(LVL, LOC_RAM)  | P(SNOOP, HIT),  /* 0x0a: L3 miss, shared */
-	OP_LH | P(LVL, REM_RAM1) | P(SNOOP, HIT),  /* 0x0b: L3 miss, shared */
-	OP_LH | P(LVL, LOC_RAM)  | SNOOP_NONE_MISS,/* 0x0c: L3 miss, excl */
-	OP_LH | P(LVL, REM_RAM1) | SNOOP_NONE_MISS,/* 0x0d: L3 miss, excl */
-	OP_LH | P(LVL, IO)  | P(SNOOP, NONE), /* 0x0e: I/O */
-	OP_LH | P(LVL, UNC) | P(SNOOP, NONE), /* 0x0f: uncached */
+	P(OP, LOAD) | P(LVL, MISS) | LEVEL(L3) | P(SNOOP, NA),/* 0x00:ukn L3 */
+	OP_LH | P(LVL, L1)  | LEVEL(L1) | P(SNOOP, NONE),  /* 0x01: L1 local */
+	OP_LH | P(LVL, LFB) | LEVEL(LFB) | P(SNOOP, NONE), /* 0x02: LFB hit */
+	OP_LH | P(LVL, L2)  | LEVEL(L2) | P(SNOOP, NONE),  /* 0x03: L2 hit */
+	OP_LH | P(LVL, L3)  | LEVEL(L3) | P(SNOOP, NONE),  /* 0x04: L3 hit */
+	OP_LH | P(LVL, L3)  | LEVEL(L3) | P(SNOOP, MISS),  /* 0x05: L3 hit, snoop miss */
+	OP_LH | P(LVL, L3)  | LEVEL(L3) | P(SNOOP, HIT),   /* 0x06: L3 hit, snoop hit */
+	OP_LH | P(LVL, L3)  | LEVEL(L3) | P(SNOOP, HITM),  /* 0x07: L3 hit, snoop hitm */
+	OP_LH | P(LVL, REM_CCE1) | REM | LEVEL(L3) | P(SNOOP, HIT),  /* 0x08: L3 miss snoop hit */
+	OP_LH | P(LVL, REM_CCE1) | REM | LEVEL(L3) | P(SNOOP, HITM), /* 0x09: L3 miss snoop hitm*/
+	OP_LH | P(LVL, LOC_RAM)  | LEVEL(RAM) | P(SNOOP, HIT),       /* 0x0a: L3 miss, shared */
+	OP_LH | P(LVL, REM_RAM1) | REM | LEVEL(L3) | P(SNOOP, HIT),  /* 0x0b: L3 miss, shared */
+	OP_LH | P(LVL, LOC_RAM)  | LEVEL(RAM) | SNOOP_NONE_MISS,     /* 0x0c: L3 miss, excl */
+	OP_LH | P(LVL, REM_RAM1) | LEVEL(RAM) | REM | SNOOP_NONE_MISS, /* 0x0d: L3 miss, excl */
+	OP_LH | P(LVL, IO)  | LEVEL(NA) | P(SNOOP, NONE), /* 0x0e: I/O */
+	OP_LH | P(LVL, UNC) | LEVEL(NA) | P(SNOOP, NONE), /* 0x0f: uncached */
 };
 
 /* Patch up minor differences in the bits */
 void __init intel_pmu_pebs_data_source_nhm(void)
 {
-	pebs_data_source[0x05] = OP_LH | P(LVL, L3)  | P(SNOOP, HIT);
-	pebs_data_source[0x06] = OP_LH | P(LVL, L3)  | P(SNOOP, HITM);
-	pebs_data_source[0x07] = OP_LH | P(LVL, L3)  | P(SNOOP, HITM);
+	pebs_data_source[0x05] = OP_LH | P(LVL, L3) | LEVEL(L3) | P(SNOOP, HIT);
+	pebs_data_source[0x06] = OP_LH | P(LVL, L3) | LEVEL(L3) | P(SNOOP, HITM);
+	pebs_data_source[0x07] = OP_LH | P(LVL, L3) | LEVEL(L3) | P(SNOOP, HITM);
+}
+
+void __init intel_pmu_pebs_data_source_skl(bool pmem)
+{
+	u64 pmem_or_l4 = pmem ? LEVEL(PMEM) : LEVEL(L4);
+
+	pebs_data_source[0x08] = OP_LH | pmem_or_l4 | P(SNOOP, HIT);
+	pebs_data_source[0x09] = OP_LH | pmem_or_l4 | REM | P(SNOOP, HIT);
+	pebs_data_source[0x0b] = OP_LH | LEVEL(RAM) | REM | P(SNOOP, NONE);
+	pebs_data_source[0x0c] = OP_LH | LEVEL(ANY_CACHE) | REM | P(SNOOPX, FWD);
+	pebs_data_source[0x0d] = OP_LH | LEVEL(ANY_CACHE) | REM | P(SNOOP, HITM);
 }
 
 static u64 precise_store_data(u64 status)
diff --git a/arch/x86/events/perf_event.h b/arch/x86/events/perf_event.h
index 2e9636e4068f..0f7dad8bd358 100644
--- a/arch/x86/events/perf_event.h
+++ b/arch/x86/events/perf_event.h
@@ -948,6 +948,8 @@ void intel_pmu_lbr_init_knl(void);
 
 void intel_pmu_pebs_data_source_nhm(void);
 
+void intel_pmu_pebs_data_source_skl(bool pmem);
+
 int intel_pmu_setup_lbr_filter(struct perf_event *event);
 
 void intel_pt_interrupt(void);
diff --git a/include/uapi/linux/perf_event.h b/include/uapi/linux/perf_event.h
index 642db5fa3286..2a37ae925d85 100644
--- a/include/uapi/linux/perf_event.h
+++ b/include/uapi/linux/perf_event.h
@@ -954,14 +954,20 @@ union perf_mem_data_src {
 			mem_snoop:5,	/* snoop mode */
 			mem_lock:2,	/* lock instr */
 			mem_dtlb:7,	/* tlb access */
-			mem_rsvd:31;
+			mem_lvl_num:4,	/* memory hierarchy level number */
+			mem_remote:1,   /* remote */
+			mem_snoopx:2,	/* snoop mode, ext */
+			mem_rsvd:24;
 	};
 };
 #elif defined(__BIG_ENDIAN_BITFIELD)
 union perf_mem_data_src {
 	__u64 val;
 	struct {
-		__u64	mem_rsvd:31,
+		__u64	mem_rsvd:24,
+			mem_snoopx:2,	/* snoop mode, ext */
+			mem_remote:1,   /* remote */
+			mem_lvl_num:4,	/* memory hierarchy level number */
 			mem_dtlb:7,	/* tlb access */
 			mem_lock:2,	/* lock instr */
 			mem_snoop:5,	/* snoop mode */
@@ -998,6 +1004,22 @@ union perf_mem_data_src {
 #define PERF_MEM_LVL_UNC	0x2000 /* Uncached memory */
 #define PERF_MEM_LVL_SHIFT	5
 
+#define PERF_MEM_REMOTE_REMOTE	0x01  /* Remote */
+#define PERF_MEM_REMOTE_SHIFT	37
+
+#define PERF_MEM_LVLNUM_L1	0x01 /* L1 */
+#define PERF_MEM_LVLNUM_L2	0x02 /* L2 */
+#define PERF_MEM_LVLNUM_L3	0x03 /* L3 */
+#define PERF_MEM_LVLNUM_L4	0x04 /* L4 */
+/* 5-0xa available */
+#define PERF_MEM_LVLNUM_ANY_CACHE 0x0b /* Any cache */
+#define PERF_MEM_LVLNUM_LFB	0x0c /* LFB */
+#define PERF_MEM_LVLNUM_RAM	0x0d /* RAM */
+#define PERF_MEM_LVLNUM_PMEM	0x0e /* PMEM */
+#define PERF_MEM_LVLNUM_NA	0x0f /* N/A */
+
+#define PERF_MEM_LVLNUM_SHIFT	33
+
 /* snoop mode */
 #define PERF_MEM_SNOOP_NA	0x01 /* not available */
 #define PERF_MEM_SNOOP_NONE	0x02 /* no snoop */
@@ -1006,6 +1028,10 @@ union perf_mem_data_src {
 #define PERF_MEM_SNOOP_HITM	0x10 /* snoop hit modified */
 #define PERF_MEM_SNOOP_SHIFT	19
 
+#define PERF_MEM_SNOOPX_FWD	0x01 /* forward */
+/* 1 free */
+#define PERF_MEM_SNOOPX_SHIFT	37
+
 /* locked instruction */
 #define PERF_MEM_LOCK_NA	0x01 /* not available */
 #define PERF_MEM_LOCK_LOCKED	0x02 /* locked transaction */
-- 
cgit v1.2.3-71-gd317


From 38ee7f2d47565689f35662d488d25e7afc43477d Mon Sep 17 00:00:00 2001
From: David Lebrun <david.lebrun@uclouvain.be>
Date: Fri, 25 Aug 2017 09:56:45 +0200
Subject: ipv6: sr: add support for encapsulation of L2 frames

This patch implements the L2 frame encapsulation mechanism, referred to
as T.Encaps.L2 in the SRv6 specifications [1].

A new type of SRv6 tunnel mode is added (SEG6_IPTUN_MODE_L2ENCAP). It only
accepts packets with an existing MAC header (i.e., it will not work for
locally generated packets). The resulting packet looks like IPv6 -> SRH ->
Ethernet -> original L3 payload. The next header field of the SRH is set to
NEXTHDR_NONE.

[1] https://tools.ietf.org/html/draft-filsfils-spring-srv6-network-programming-01

Signed-off-by: David Lebrun <david.lebrun@uclouvain.be>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/uapi/linux/seg6_iptunnel.h | 18 ++++++++++++++----
 net/ipv6/seg6_iptunnel.c           | 25 +++++++++++++++++++++++--
 2 files changed, 37 insertions(+), 6 deletions(-)

(limited to 'include/uapi/linux')

diff --git a/include/uapi/linux/seg6_iptunnel.h b/include/uapi/linux/seg6_iptunnel.h
index b6e5a0a1afd7..b23df9f58354 100644
--- a/include/uapi/linux/seg6_iptunnel.h
+++ b/include/uapi/linux/seg6_iptunnel.h
@@ -33,16 +33,26 @@ struct seg6_iptunnel_encap {
 enum {
 	SEG6_IPTUN_MODE_INLINE,
 	SEG6_IPTUN_MODE_ENCAP,
+	SEG6_IPTUN_MODE_L2ENCAP,
 };
 
 #ifdef __KERNEL__
 
 static inline size_t seg6_lwt_headroom(struct seg6_iptunnel_encap *tuninfo)
 {
-	int encap = (tuninfo->mode == SEG6_IPTUN_MODE_ENCAP);
-
-	return ((tuninfo->srh->hdrlen + 1) << 3) +
-	       (encap * sizeof(struct ipv6hdr));
+	int head = 0;
+
+	switch (tuninfo->mode) {
+	case SEG6_IPTUN_MODE_INLINE:
+		break;
+	case SEG6_IPTUN_MODE_ENCAP:
+		head = sizeof(struct ipv6hdr);
+		break;
+	case SEG6_IPTUN_MODE_L2ENCAP:
+		return 0;
+	}
+
+	return ((tuninfo->srh->hdrlen + 1) << 3) + head;
 }
 
 #endif
diff --git a/net/ipv6/seg6_iptunnel.c b/net/ipv6/seg6_iptunnel.c
index 5bec7817a7b9..bd6cc688bd19 100644
--- a/net/ipv6/seg6_iptunnel.c
+++ b/net/ipv6/seg6_iptunnel.c
@@ -238,6 +238,22 @@ static int seg6_do_srh(struct sk_buff *skb)
 		if (err)
 			return err;
 
+		skb->protocol = htons(ETH_P_IPV6);
+		break;
+	case SEG6_IPTUN_MODE_L2ENCAP:
+		if (!skb_mac_header_was_set(skb))
+			return -EINVAL;
+
+		if (pskb_expand_head(skb, skb->mac_len, 0, GFP_ATOMIC) < 0)
+			return -ENOMEM;
+
+		skb_mac_header_rebuild(skb);
+		skb_push(skb, skb->mac_len);
+
+		err = seg6_do_srh_encap(skb, tinfo->srh, NEXTHDR_NONE);
+		if (err)
+			return err;
+
 		skb->protocol = htons(ETH_P_IPV6);
 		break;
 	}
@@ -386,6 +402,8 @@ static int seg6_build_state(struct nlattr *nla,
 		break;
 	case SEG6_IPTUN_MODE_ENCAP:
 		break;
+	case SEG6_IPTUN_MODE_L2ENCAP:
+		break;
 	default:
 		return -EINVAL;
 	}
@@ -409,8 +427,11 @@ static int seg6_build_state(struct nlattr *nla,
 	memcpy(&slwt->tuninfo, tuninfo, tuninfo_len);
 
 	newts->type = LWTUNNEL_ENCAP_SEG6;
-	newts->flags |= LWTUNNEL_STATE_OUTPUT_REDIRECT |
-			LWTUNNEL_STATE_INPUT_REDIRECT;
+	newts->flags |= LWTUNNEL_STATE_INPUT_REDIRECT;
+
+	if (tuninfo->mode != SEG6_IPTUN_MODE_L2ENCAP)
+		newts->flags |= LWTUNNEL_STATE_OUTPUT_REDIRECT;
+
 	newts->headroom = seg6_lwt_headroom(tuninfo);
 
 	*ts = newts;
-- 
cgit v1.2.3-71-gd317


From d1b3437ed780cd97b4b1300db96a4d8faae6fea1 Mon Sep 17 00:00:00 2001
From: Sakari Ailus <sakari.ailus@linux.intel.com>
Date: Tue, 8 Aug 2017 09:29:58 -0400
Subject: media: v4l: Add packed Bayer raw12 pixel formats

These formats are compressed 12-bit raw bayer formats with four different
pixel orders. They are similar to 10-bit variants. The formats added by
this patch are

	V4L2_PIX_FMT_SBGGR12P
	V4L2_PIX_FMT_SGBRG12P
	V4L2_PIX_FMT_SGRBG12P
	V4L2_PIX_FMT_SRGGB12P

Signed-off-by: Sakari Ailus <sakari.ailus@linux.intel.com>
Signed-off-by: Hans Verkuil <hans.verkuil@cisco.com>
Signed-off-by: Mauro Carvalho Chehab <mchehab@s-opensource.com>
---
 Documentation/media/uapi/v4l/pixfmt-rgb.rst      |   1 +
 Documentation/media/uapi/v4l/pixfmt-srggb12p.rst | 103 +++++++++++++++++++++++
 drivers/media/v4l2-core/v4l2-ioctl.c             |  12 ++-
 include/uapi/linux/videodev2.h                   |   5 ++
 4 files changed, 117 insertions(+), 4 deletions(-)
 create mode 100644 Documentation/media/uapi/v4l/pixfmt-srggb12p.rst

(limited to 'include/uapi/linux')

diff --git a/Documentation/media/uapi/v4l/pixfmt-rgb.rst b/Documentation/media/uapi/v4l/pixfmt-rgb.rst
index b0f35136021e..4cc27195dc79 100644
--- a/Documentation/media/uapi/v4l/pixfmt-rgb.rst
+++ b/Documentation/media/uapi/v4l/pixfmt-rgb.rst
@@ -17,4 +17,5 @@ RGB Formats
     pixfmt-srggb10alaw8
     pixfmt-srggb10dpcm8
     pixfmt-srggb12
+    pixfmt-srggb12p
     pixfmt-srggb16
diff --git a/Documentation/media/uapi/v4l/pixfmt-srggb12p.rst b/Documentation/media/uapi/v4l/pixfmt-srggb12p.rst
new file mode 100644
index 000000000000..c0541e5acd01
--- /dev/null
+++ b/Documentation/media/uapi/v4l/pixfmt-srggb12p.rst
@@ -0,0 +1,103 @@
+.. -*- coding: utf-8; mode: rst -*-
+
+.. _V4L2-PIX-FMT-SRGGB12P:
+.. _v4l2-pix-fmt-sbggr12p:
+.. _v4l2-pix-fmt-sgbrg12p:
+.. _v4l2-pix-fmt-sgrbg12p:
+
+*******************************************************************************************************************************
+V4L2_PIX_FMT_SRGGB12P ('pRAA'), V4L2_PIX_FMT_SGRBG12P ('pgAA'), V4L2_PIX_FMT_SGBRG12P ('pGAA'), V4L2_PIX_FMT_SBGGR12P ('pBAA'),
+*******************************************************************************************************************************
+
+
+12-bit packed Bayer formats
+
+
+Description
+===========
+
+These four pixel formats are packed raw sRGB / Bayer formats with 12
+bits per colour. Every two consecutive samples are packed into three
+bytes. Each of the first two bytes contain the 8 high order bits of
+the pixels, and the third byte contains the four least significants
+bits of each pixel, in the same order.
+
+Each n-pixel row contains n/2 green samples and n/2 blue or red
+samples, with alternating green-red and green-blue rows. They are
+conventionally described as GRGR... BGBG..., RGRG... GBGB..., etc.
+Below is an example of a small V4L2_PIX_FMT_SBGGR12P image:
+
+**Byte Order.**
+Each cell is one byte.
+
+
+
+.. flat-table::
+    :header-rows:  0
+    :stub-columns: 0
+    :widths:       2 1 1 1 1 1 1
+
+
+    -  .. row 1
+
+       -  start + 0:
+
+       -  B\ :sub:`00high`
+
+       -  G\ :sub:`01high`
+
+       -  G\ :sub:`01low`\ (bits 7--4) B\ :sub:`00low`\ (bits 3--0)
+
+       -  B\ :sub:`02high`
+
+       -  G\ :sub:`03high`
+
+       -  G\ :sub:`03low`\ (bits 7--4) B\ :sub:`02low`\ (bits 3--0)
+
+    -  .. row 2
+
+       -  start + 6:
+
+       -  G\ :sub:`10high`
+
+       -  R\ :sub:`11high`
+
+       -  R\ :sub:`11low`\ (bits 7--4) G\ :sub:`10low`\ (bits 3--0)
+
+       -  G\ :sub:`12high`
+
+       -  R\ :sub:`13high`
+
+       -  R\ :sub:`13low`\ (bits 3--2) G\ :sub:`12low`\ (bits 3--0)
+
+    -  .. row 3
+
+       -  start + 12:
+
+       -  B\ :sub:`20high`
+
+       -  G\ :sub:`21high`
+
+       -  G\ :sub:`21low`\ (bits 7--4) B\ :sub:`20low`\ (bits 3--0)
+
+       -  B\ :sub:`22high`
+
+       -  G\ :sub:`23high`
+
+       -  G\ :sub:`23low`\ (bits 7--4) B\ :sub:`22low`\ (bits 3--0)
+
+    -  .. row 4
+
+       -  start + 18:
+
+       -  G\ :sub:`30high`
+
+       -  R\ :sub:`31high`
+
+       -  R\ :sub:`31low`\ (bits 7--4) G\ :sub:`30low`\ (bits 3--0)
+
+       -  G\ :sub:`32high`
+
+       -  R\ :sub:`33high`
+
+       -  R\ :sub:`33low`\ (bits 3--2) G\ :sub:`32low`\ (bits 3--0)
diff --git a/drivers/media/v4l2-core/v4l2-ioctl.c b/drivers/media/v4l2-core/v4l2-ioctl.c
index cab63bb49c97..b60a6b0841d1 100644
--- a/drivers/media/v4l2-core/v4l2-ioctl.c
+++ b/drivers/media/v4l2-core/v4l2-ioctl.c
@@ -1195,10 +1195,6 @@ static void v4l_fill_fmtdesc(struct v4l2_fmtdesc *fmt)
 	case V4L2_PIX_FMT_SGBRG10:	descr = "10-bit Bayer GBGB/RGRG"; break;
 	case V4L2_PIX_FMT_SGRBG10:	descr = "10-bit Bayer GRGR/BGBG"; break;
 	case V4L2_PIX_FMT_SRGGB10:	descr = "10-bit Bayer RGRG/GBGB"; break;
-	case V4L2_PIX_FMT_SBGGR12:	descr = "12-bit Bayer BGBG/GRGR"; break;
-	case V4L2_PIX_FMT_SGBRG12:	descr = "12-bit Bayer GBGB/RGRG"; break;
-	case V4L2_PIX_FMT_SGRBG12:	descr = "12-bit Bayer GRGR/BGBG"; break;
-	case V4L2_PIX_FMT_SRGGB12:	descr = "12-bit Bayer RGRG/GBGB"; break;
 	case V4L2_PIX_FMT_SBGGR10P:	descr = "10-bit Bayer BGBG/GRGR Packed"; break;
 	case V4L2_PIX_FMT_SGBRG10P:	descr = "10-bit Bayer GBGB/RGRG Packed"; break;
 	case V4L2_PIX_FMT_SGRBG10P:	descr = "10-bit Bayer GRGR/BGBG Packed"; break;
@@ -1211,6 +1207,14 @@ static void v4l_fill_fmtdesc(struct v4l2_fmtdesc *fmt)
 	case V4L2_PIX_FMT_SGBRG10DPCM8:	descr = "8-bit Bayer GBGB/RGRG (DPCM)"; break;
 	case V4L2_PIX_FMT_SGRBG10DPCM8:	descr = "8-bit Bayer GRGR/BGBG (DPCM)"; break;
 	case V4L2_PIX_FMT_SRGGB10DPCM8:	descr = "8-bit Bayer RGRG/GBGB (DPCM)"; break;
+	case V4L2_PIX_FMT_SBGGR12:	descr = "12-bit Bayer BGBG/GRGR"; break;
+	case V4L2_PIX_FMT_SGBRG12:	descr = "12-bit Bayer GBGB/RGRG"; break;
+	case V4L2_PIX_FMT_SGRBG12:	descr = "12-bit Bayer GRGR/BGBG"; break;
+	case V4L2_PIX_FMT_SRGGB12:	descr = "12-bit Bayer RGRG/GBGB"; break;
+	case V4L2_PIX_FMT_SBGGR12P:	descr = "12-bit Bayer BGBG/GRGR Packed"; break;
+	case V4L2_PIX_FMT_SGBRG12P:	descr = "12-bit Bayer GBGB/RGRG Packed"; break;
+	case V4L2_PIX_FMT_SGRBG12P:	descr = "12-bit Bayer GRGR/BGBG Packed"; break;
+	case V4L2_PIX_FMT_SRGGB12P:	descr = "12-bit Bayer RGRG/GBGB Packed"; break;
 	case V4L2_PIX_FMT_SBGGR16:	descr = "16-bit Bayer BGBG/GRGR"; break;
 	case V4L2_PIX_FMT_SGBRG16:	descr = "16-bit Bayer GBGB/RGRG"; break;
 	case V4L2_PIX_FMT_SGRBG16:	descr = "16-bit Bayer GRGR/BGBG"; break;
diff --git a/include/uapi/linux/videodev2.h b/include/uapi/linux/videodev2.h
index 45cf7359822c..185d6a0acc06 100644
--- a/include/uapi/linux/videodev2.h
+++ b/include/uapi/linux/videodev2.h
@@ -603,6 +603,11 @@ struct v4l2_pix_format {
 #define V4L2_PIX_FMT_SGBRG12 v4l2_fourcc('G', 'B', '1', '2') /* 12  GBGB.. RGRG.. */
 #define V4L2_PIX_FMT_SGRBG12 v4l2_fourcc('B', 'A', '1', '2') /* 12  GRGR.. BGBG.. */
 #define V4L2_PIX_FMT_SRGGB12 v4l2_fourcc('R', 'G', '1', '2') /* 12  RGRG.. GBGB.. */
+	/* 12bit raw bayer packed, 6 bytes for every 4 pixels */
+#define V4L2_PIX_FMT_SBGGR12P v4l2_fourcc('p', 'B', 'C', 'C')
+#define V4L2_PIX_FMT_SGBRG12P v4l2_fourcc('p', 'G', 'C', 'C')
+#define V4L2_PIX_FMT_SGRBG12P v4l2_fourcc('p', 'g', 'C', 'C')
+#define V4L2_PIX_FMT_SRGGB12P v4l2_fourcc('p', 'R', 'C', 'C')
 #define V4L2_PIX_FMT_SBGGR16 v4l2_fourcc('B', 'Y', 'R', '2') /* 16  BGBG.. GRGR.. */
 #define V4L2_PIX_FMT_SGBRG16 v4l2_fourcc('G', 'B', '1', '6') /* 16  GBGB.. RGRG.. */
 #define V4L2_PIX_FMT_SGRBG16 v4l2_fourcc('G', 'R', '1', '6') /* 16  GRGR.. BGBG.. */
-- 
cgit v1.2.3-71-gd317


From a9e4998073d49a762a154a6b48a332ec6cb8e6b1 Mon Sep 17 00:00:00 2001
From: Colin Ian King <colin.king@canonical.com>
Date: Thu, 20 Jul 2017 18:12:07 -0400
Subject: media: dvb_frontend: ensure that inital front end status initialized

The fe_status variable s is not initialized meaning it can have any
random garbage status.  This could be problematic if fe->ops.tune is
false as s is not updated by the call to fe->ops.tune() and a
subsequent check on the change status will using a garbage value.
Fix this by adding FE_NONE to the enum fe_status and initializing
s to this.

Detected by CoverityScan, CID#112887 ("Uninitialized scalar variable")

Signed-off-by: Colin Ian King <colin.king@canonical.com>
Reviewed-by: Shuah Khan <shuahkh@osg.samsung.com>
Signed-off-by: Mauro Carvalho Chehab <mchehab@s-opensource.com>
---
 drivers/media/dvb-core/dvb_frontend.c | 2 +-
 include/uapi/linux/dvb/frontend.h     | 1 +
 2 files changed, 2 insertions(+), 1 deletion(-)

(limited to 'include/uapi/linux')

diff --git a/drivers/media/dvb-core/dvb_frontend.c b/drivers/media/dvb-core/dvb_frontend.c
index e3fff8f64d37..18cc3bbc699c 100644
--- a/drivers/media/dvb-core/dvb_frontend.c
+++ b/drivers/media/dvb-core/dvb_frontend.c
@@ -631,7 +631,7 @@ static int dvb_frontend_thread(void *data)
 	struct dvb_frontend *fe = data;
 	struct dtv_frontend_properties *c = &fe->dtv_property_cache;
 	struct dvb_frontend_private *fepriv = fe->frontend_priv;
-	enum fe_status s;
+	enum fe_status s = FE_NONE;
 	enum dvbfe_algo algo;
 	bool re_tune = false;
 	bool semheld = false;
diff --git a/include/uapi/linux/dvb/frontend.h b/include/uapi/linux/dvb/frontend.h
index 00a20cd21ee2..afc3972b0879 100644
--- a/include/uapi/linux/dvb/frontend.h
+++ b/include/uapi/linux/dvb/frontend.h
@@ -127,6 +127,7 @@ enum fe_sec_mini_cmd {
  *			to reset DiSEqC, tone and parameters
  */
 enum fe_status {
+	FE_NONE			= 0x00,
 	FE_HAS_SIGNAL		= 0x01,
 	FE_HAS_CARRIER		= 0x02,
 	FE_HAS_VITERBI		= 0x04,
-- 
cgit v1.2.3-71-gd317


From ddc088238cd6988bb4ac3776f403d7ff9d3c7a63 Mon Sep 17 00:00:00 2001
From: Pawel Baldysiak <pawel.baldysiak@intel.com>
Date: Wed, 16 Aug 2017 17:13:45 +0200
Subject: md: Runtime support for multiple ppls

Increase PPL area to 1MB and use it as circular buffer to store PPL. The
entry with highest generation number is the latest one. If PPL to be
written is larger then space left in a buffer, rewind the buffer to the
start (don't wrap it).

Signed-off-by: Pawel Baldysiak <pawel.baldysiak@intel.com>
Signed-off-by: Artur Paszkiewicz <artur.paszkiewicz@intel.com>
Signed-off-by: Shaohua Li <shli@fb.com>
---
 drivers/md/md.c                | 16 +++++++++++++---
 drivers/md/md.h                |  1 +
 drivers/md/raid0.c             |  3 ++-
 drivers/md/raid1.c             |  3 ++-
 drivers/md/raid5-ppl.c         | 43 +++++++++++++++++++++++++++++++++++++++---
 drivers/md/raid5.c             |  1 +
 include/uapi/linux/raid/md_p.h |  4 +++-
 7 files changed, 62 insertions(+), 9 deletions(-)

(limited to 'include/uapi/linux')

diff --git a/drivers/md/md.c b/drivers/md/md.c
index a74dc9963822..a7876237de10 100644
--- a/drivers/md/md.c
+++ b/drivers/md/md.c
@@ -1536,7 +1536,8 @@ static int super_1_load(struct md_rdev *rdev, struct md_rdev *refdev, int minor_
 	} else if (sb->bblog_offset != 0)
 		rdev->badblocks.shift = 0;
 
-	if (le32_to_cpu(sb->feature_map) & MD_FEATURE_PPL) {
+	if ((le32_to_cpu(sb->feature_map) &
+	    (MD_FEATURE_PPL | MD_FEATURE_MULTIPLE_PPLS))) {
 		rdev->ppl.offset = (__s16)le16_to_cpu(sb->ppl.offset);
 		rdev->ppl.size = le16_to_cpu(sb->ppl.size);
 		rdev->ppl.sector = rdev->sb_start + rdev->ppl.offset;
@@ -1655,10 +1656,15 @@ static int super_1_validate(struct mddev *mddev, struct md_rdev *rdev)
 		if (le32_to_cpu(sb->feature_map) & MD_FEATURE_JOURNAL)
 			set_bit(MD_HAS_JOURNAL, &mddev->flags);
 
-		if (le32_to_cpu(sb->feature_map) & MD_FEATURE_PPL) {
+		if (le32_to_cpu(sb->feature_map) &
+		    (MD_FEATURE_PPL | MD_FEATURE_MULTIPLE_PPLS)) {
 			if (le32_to_cpu(sb->feature_map) &
 			    (MD_FEATURE_BITMAP_OFFSET | MD_FEATURE_JOURNAL))
 				return -EINVAL;
+			if ((le32_to_cpu(sb->feature_map) & MD_FEATURE_PPL) &&
+			    (le32_to_cpu(sb->feature_map) &
+					    MD_FEATURE_MULTIPLE_PPLS))
+				return -EINVAL;
 			set_bit(MD_HAS_PPL, &mddev->flags);
 		}
 	} else if (mddev->pers == NULL) {
@@ -1875,7 +1881,11 @@ retry:
 		sb->feature_map |= cpu_to_le32(MD_FEATURE_JOURNAL);
 
 	if (test_bit(MD_HAS_PPL, &mddev->flags)) {
-		sb->feature_map |= cpu_to_le32(MD_FEATURE_PPL);
+		if (test_bit(MD_HAS_MULTIPLE_PPLS, &mddev->flags))
+			sb->feature_map |=
+			    cpu_to_le32(MD_FEATURE_MULTIPLE_PPLS);
+		else
+			sb->feature_map |= cpu_to_le32(MD_FEATURE_PPL);
 		sb->ppl.offset = cpu_to_le16(rdev->ppl.offset);
 		sb->ppl.size = cpu_to_le16(rdev->ppl.size);
 	}
diff --git a/drivers/md/md.h b/drivers/md/md.h
index 09db03455801..d4bdfa5c223b 100644
--- a/drivers/md/md.h
+++ b/drivers/md/md.h
@@ -236,6 +236,7 @@ enum mddev_flags {
 				 * never cause the array to become failed.
 				 */
 	MD_HAS_PPL,		/* The raid array has PPL feature set */
+	MD_HAS_MULTIPLE_PPLS,	/* The raid array has multiple PPLs feature set */
 };
 
 enum mddev_sb_flags {
diff --git a/drivers/md/raid0.c b/drivers/md/raid0.c
index 6fb81704aff4..fd5e8e5efbef 100644
--- a/drivers/md/raid0.c
+++ b/drivers/md/raid0.c
@@ -30,7 +30,8 @@
 	((1L << MD_HAS_JOURNAL) |	\
 	 (1L << MD_JOURNAL_CLEAN) |	\
 	 (1L << MD_FAILFAST_SUPPORTED) |\
-	 (1L << MD_HAS_PPL))
+	 (1L << MD_HAS_PPL) |		\
+	 (1L << MD_HAS_MULTIPLE_PPLS))
 
 static int raid0_congested(struct mddev *mddev, int bits)
 {
diff --git a/drivers/md/raid1.c b/drivers/md/raid1.c
index 79474f47eeef..1f5bd9475dc1 100644
--- a/drivers/md/raid1.c
+++ b/drivers/md/raid1.c
@@ -48,7 +48,8 @@
 #define UNSUPPORTED_MDDEV_FLAGS		\
 	((1L << MD_HAS_JOURNAL) |	\
 	 (1L << MD_JOURNAL_CLEAN) |	\
-	 (1L << MD_HAS_PPL))
+	 (1L << MD_HAS_PPL) |		\
+	 (1L << MD_HAS_MULTIPLE_PPLS))
 
 /*
  * Number of guaranteed r1bios in case of extreme VM load:
diff --git a/drivers/md/raid5-ppl.c b/drivers/md/raid5-ppl.c
index 44ad5baf3206..b313f17a6260 100644
--- a/drivers/md/raid5-ppl.c
+++ b/drivers/md/raid5-ppl.c
@@ -87,6 +87,8 @@
  * The current io_unit accepting new stripes is always at the end of the list.
  */
 
+#define PPL_SPACE_SIZE (128 * 1024)
+
 struct ppl_conf {
 	struct mddev *mddev;
 
@@ -122,6 +124,10 @@ struct ppl_log {
 					 * always at the end of io_list */
 	spinlock_t io_list_lock;
 	struct list_head io_list;	/* all io_units of this log */
+
+	sector_t next_io_sector;
+	unsigned int entry_space;
+	bool use_multippl;
 };
 
 #define PPL_IO_INLINE_BVECS 32
@@ -264,13 +270,12 @@ static int ppl_log_stripe(struct ppl_log *log, struct stripe_head *sh)
 	int i;
 	sector_t data_sector = 0;
 	int data_disks = 0;
-	unsigned int entry_space = (log->rdev->ppl.size << 9) - PPL_HEADER_SIZE;
 	struct r5conf *conf = sh->raid_conf;
 
 	pr_debug("%s: stripe: %llu\n", __func__, (unsigned long long)sh->sector);
 
 	/* check if current io_unit is full */
-	if (io && (io->pp_size == entry_space ||
+	if (io && (io->pp_size == log->entry_space ||
 		   io->entries_count == PPL_HDR_MAX_ENTRIES)) {
 		pr_debug("%s: add io_unit blocked by seq: %llu\n",
 			 __func__, io->seq);
@@ -451,12 +456,25 @@ static void ppl_submit_iounit(struct ppl_io_unit *io)
 	pplhdr->entries_count = cpu_to_le32(io->entries_count);
 	pplhdr->checksum = cpu_to_le32(~crc32c_le(~0, pplhdr, PPL_HEADER_SIZE));
 
+	/* Rewind the buffer if current PPL is larger then remaining space */
+	if (log->use_multippl &&
+	    log->rdev->ppl.sector + log->rdev->ppl.size - log->next_io_sector <
+	    (PPL_HEADER_SIZE + io->pp_size) >> 9)
+		log->next_io_sector = log->rdev->ppl.sector;
+
+
 	bio->bi_end_io = ppl_log_endio;
 	bio->bi_opf = REQ_OP_WRITE | REQ_FUA;
 	bio->bi_bdev = log->rdev->bdev;
-	bio->bi_iter.bi_sector = log->rdev->ppl.sector;
+	bio->bi_iter.bi_sector = log->next_io_sector;
 	bio_add_page(bio, io->header_page, PAGE_SIZE, 0);
 
+	pr_debug("%s: log->current_io_sector: %llu\n", __func__,
+	    (unsigned long long)log->next_io_sector);
+
+	if (log->use_multippl)
+		log->next_io_sector += (PPL_HEADER_SIZE + io->pp_size) >> 9;
+
 	list_for_each_entry(sh, &io->stripe_list, log_list) {
 		/* entries for full stripe writes have no partial parity */
 		if (test_bit(STRIPE_FULL_WRITE, &sh->state))
@@ -1031,6 +1049,7 @@ static int ppl_load(struct ppl_conf *ppl_conf)
 static void __ppl_exit_log(struct ppl_conf *ppl_conf)
 {
 	clear_bit(MD_HAS_PPL, &ppl_conf->mddev->flags);
+	clear_bit(MD_HAS_MULTIPLE_PPLS, &ppl_conf->mddev->flags);
 
 	kfree(ppl_conf->child_logs);
 
@@ -1099,6 +1118,22 @@ static int ppl_validate_rdev(struct md_rdev *rdev)
 	return 0;
 }
 
+static void ppl_init_child_log(struct ppl_log *log, struct md_rdev *rdev)
+{
+	if ((rdev->ppl.size << 9) >= (PPL_SPACE_SIZE +
+				      PPL_HEADER_SIZE) * 2) {
+		log->use_multippl = true;
+		set_bit(MD_HAS_MULTIPLE_PPLS,
+			&log->ppl_conf->mddev->flags);
+		log->entry_space = PPL_SPACE_SIZE;
+	} else {
+		log->use_multippl = false;
+		log->entry_space = (log->rdev->ppl.size << 9) -
+				   PPL_HEADER_SIZE;
+	}
+	log->next_io_sector = rdev->ppl.sector;
+}
+
 int ppl_init_log(struct r5conf *conf)
 {
 	struct ppl_conf *ppl_conf;
@@ -1196,6 +1231,7 @@ int ppl_init_log(struct r5conf *conf)
 			q = bdev_get_queue(rdev->bdev);
 			if (test_bit(QUEUE_FLAG_WC, &q->queue_flags))
 				need_cache_flush = true;
+			ppl_init_child_log(log, rdev);
 		}
 	}
 
@@ -1261,6 +1297,7 @@ int ppl_modify_log(struct r5conf *conf, struct md_rdev *rdev, bool add)
 		if (!ret) {
 			log->rdev = rdev;
 			ret = ppl_write_empty_header(log);
+			ppl_init_child_log(log, rdev);
 		}
 	} else {
 		log->rdev = NULL;
diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c
index 6af57c6c0533..049a958d3c1e 100644
--- a/drivers/md/raid5.c
+++ b/drivers/md/raid5.c
@@ -7236,6 +7236,7 @@ static int raid5_run(struct mddev *mddev)
 		pr_warn("md/raid:%s: using journal device and PPL not allowed - disabling PPL\n",
 			mdname(mddev));
 		clear_bit(MD_HAS_PPL, &mddev->flags);
+		clear_bit(MD_HAS_MULTIPLE_PPLS, &mddev->flags);
 	}
 
 	if (mddev->private == NULL)
diff --git a/include/uapi/linux/raid/md_p.h b/include/uapi/linux/raid/md_p.h
index d500bd224979..b9197976b660 100644
--- a/include/uapi/linux/raid/md_p.h
+++ b/include/uapi/linux/raid/md_p.h
@@ -324,9 +324,10 @@ struct mdp_superblock_1 {
 #define	MD_FEATURE_RECOVERY_BITMAP	128 /* recovery that is happening
 					     * is guided by bitmap.
 					     */
-#define MD_FEATURE_CLUSTERED		256 /* clustered MD */
+#define	MD_FEATURE_CLUSTERED		256 /* clustered MD */
 #define	MD_FEATURE_JOURNAL		512 /* support write cache */
 #define	MD_FEATURE_PPL			1024 /* support PPL */
+#define	MD_FEATURE_MULTIPLE_PPLS	2048 /* support for multiple PPLs */
 #define	MD_FEATURE_ALL			(MD_FEATURE_BITMAP_OFFSET	\
 					|MD_FEATURE_RECOVERY_OFFSET	\
 					|MD_FEATURE_RESHAPE_ACTIVE	\
@@ -338,6 +339,7 @@ struct mdp_superblock_1 {
 					|MD_FEATURE_CLUSTERED		\
 					|MD_FEATURE_JOURNAL		\
 					|MD_FEATURE_PPL			\
+					|MD_FEATURE_MULTIPLE_PPLS	\
 					)
 
 struct r5l_payload_header {
-- 
cgit v1.2.3-71-gd317


From 7a14724f54bf9889fcb1a9f1d4aa4e1d2e969d93 Mon Sep 17 00:00:00 2001
From: Dan Williams <dan.j.williams@intel.com>
Date: Mon, 28 Aug 2017 08:33:20 -0700
Subject: libnvdimm: clean up command definitions

Remove the command payloads that do not have an associated libnvdimm
ioctl. I.e. remove the payloads that would only ever be carried in the
ND_CMD_CALL envelope. This prevents userspace from growing unnecessary
dependencies on this kernel header when userspace already has everything
it needs to craft and send these commands.

Cc: Jerry Hoemann <jerry.hoemann@hpe.com>
Reported-by: Yasunori Goto <y-goto@jp.fujitsu.com>
Signed-off-by: Dan Williams <dan.j.williams@intel.com>
---
 include/uapi/linux/ndctl.h | 37 -------------------------------------
 1 file changed, 37 deletions(-)

(limited to 'include/uapi/linux')

diff --git a/include/uapi/linux/ndctl.h b/include/uapi/linux/ndctl.h
index 6d3c54264d8e..3f03567631cb 100644
--- a/include/uapi/linux/ndctl.h
+++ b/include/uapi/linux/ndctl.h
@@ -145,43 +145,6 @@ struct nd_cmd_clear_error {
 	__u64 cleared;
 } __packed;
 
-struct nd_cmd_trans_spa {
-	__u64 spa;
-	__u32 status;
-	__u8  flags;
-	__u8  _reserved[3];
-	__u64 trans_length;
-	__u32 num_nvdimms;
-	struct nd_nvdimm_device {
-		__u32 nfit_device_handle;
-		__u32 _reserved;
-		__u64 dpa;
-	} __packed devices[0];
-
-} __packed;
-
-struct nd_cmd_ars_err_inj {
-	__u64 err_inj_spa_range_base;
-	__u64 err_inj_spa_range_length;
-	__u8  err_inj_options;
-	__u32 status;
-} __packed;
-
-struct nd_cmd_ars_err_inj_clr {
-	__u64 err_inj_clr_spa_range_base;
-	__u64 err_inj_clr_spa_range_length;
-	__u32 status;
-} __packed;
-
-struct nd_cmd_ars_err_inj_stat {
-	__u32 status;
-	__u32 inj_err_rec_count;
-	struct nd_error_stat_query_record {
-		__u64 err_inj_stat_spa_range_base;
-		__u64 err_inj_stat_spa_range_length;
-	} __packed record[0];
-} __packed;
-
 enum {
 	ND_CMD_IMPLEMENTED = 0,
 
-- 
cgit v1.2.3-71-gd317


From 464bc0fd6273d518aee79fbd37211dd9bc35d863 Mon Sep 17 00:00:00 2001
From: John Fastabend <john.fastabend@gmail.com>
Date: Mon, 28 Aug 2017 07:10:04 -0700
Subject: bpf: convert sockmap field attach_bpf_fd2 to type

In the initial sockmap API we provided strparser and verdict programs
using a single attach command by extending the attach API with a the
attach_bpf_fd2 field.

However, if we add other programs in the future we will be adding a
field for every new possible type, attach_bpf_fd(3,4,..). This
seems a bit clumsy for an API. So lets push the programs using two
new type fields.

   BPF_SK_SKB_STREAM_PARSER
   BPF_SK_SKB_STREAM_VERDICT

This has the advantage of having a readable name and can easily be
extended in the future.

Updates to samples and sockmap included here also generalize tests
slightly to support upcoming patch for multiple map support.

Signed-off-by: John Fastabend <john.fastabend@gmail.com>
Fixes: 174a79ff9515 ("bpf: sockmap with sk redirect support")
Suggested-by: Alexei Starovoitov <ast@kernel.org>
Acked-by: Alexei Starovoitov <ast@kernel.org>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/bpf.h                                |  10 +-
 include/uapi/linux/bpf.h                           |   9 +-
 kernel/bpf/sockmap.c                               |  25 ++--
 kernel/bpf/syscall.c                               |  38 ++----
 samples/sockmap/sockmap_kern.c                     |   6 +-
 samples/sockmap/sockmap_user.c                     |  12 +-
 tools/include/uapi/linux/bpf.h                     |   9 +-
 tools/lib/bpf/bpf.c                                |  14 +--
 tools/lib/bpf/bpf.h                                |   4 -
 tools/testing/selftests/bpf/bpf_helpers.h          |   3 +-
 tools/testing/selftests/bpf/sockmap_parse_prog.c   |   2 +-
 tools/testing/selftests/bpf/sockmap_verdict_prog.c |   2 +-
 tools/testing/selftests/bpf/test_maps.c            | 133 +++++++++------------
 13 files changed, 116 insertions(+), 151 deletions(-)

(limited to 'include/uapi/linux')

diff --git a/include/linux/bpf.h b/include/linux/bpf.h
index 830f472d8df5..c2cb1b5c094e 100644
--- a/include/linux/bpf.h
+++ b/include/linux/bpf.h
@@ -39,8 +39,6 @@ struct bpf_map_ops {
 	void (*map_fd_put_ptr)(void *ptr);
 	u32 (*map_gen_lookup)(struct bpf_map *map, struct bpf_insn *insn_buf);
 	u32 (*map_fd_sys_lookup_elem)(void *ptr);
-	int (*map_attach)(struct bpf_map *map,
-			  struct bpf_prog *p1, struct bpf_prog *p2);
 };
 
 struct bpf_map {
@@ -387,11 +385,19 @@ static inline void __dev_map_flush(struct bpf_map *map)
 
 #if defined(CONFIG_STREAM_PARSER) && defined(CONFIG_BPF_SYSCALL)
 struct sock  *__sock_map_lookup_elem(struct bpf_map *map, u32 key);
+int sock_map_attach_prog(struct bpf_map *map, struct bpf_prog *prog, u32 type);
 #else
 static inline struct sock  *__sock_map_lookup_elem(struct bpf_map *map, u32 key)
 {
 	return NULL;
 }
+
+static inline int sock_map_attach_prog(struct bpf_map *map,
+				       struct bpf_prog *prog,
+				       u32 type)
+{
+	return -EOPNOTSUPP;
+}
 #endif
 
 /* verifier prototypes for helper functions called from eBPF programs */
diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h
index 843818dff96d..97227be3690c 100644
--- a/include/uapi/linux/bpf.h
+++ b/include/uapi/linux/bpf.h
@@ -136,7 +136,8 @@ enum bpf_attach_type {
 	BPF_CGROUP_INET_EGRESS,
 	BPF_CGROUP_INET_SOCK_CREATE,
 	BPF_CGROUP_SOCK_OPS,
-	BPF_CGROUP_SMAP_INGRESS,
+	BPF_SK_SKB_STREAM_PARSER,
+	BPF_SK_SKB_STREAM_VERDICT,
 	__MAX_BPF_ATTACH_TYPE
 };
 
@@ -224,7 +225,6 @@ union bpf_attr {
 		__u32		attach_bpf_fd;	/* eBPF program to attach */
 		__u32		attach_type;
 		__u32		attach_flags;
-		__u32		attach_bpf_fd2;
 	};
 
 	struct { /* anonymous struct used by BPF_PROG_TEST_RUN command */
@@ -580,14 +580,11 @@ union bpf_attr {
  *     @flags: reserved for future use
  *     Return: SK_REDIRECT
  *
- * int bpf_sock_map_update(skops, map, key, flags, map_flags)
+ * int bpf_sock_map_update(skops, map, key, flags)
  *	@skops: pointer to bpf_sock_ops
  *	@map: pointer to sockmap to update
  *	@key: key to insert/update sock in map
  *	@flags: same flags as map update elem
- *	@map_flags: sock map specific flags
- *	   bit 1: Enable strparser
- *	   other bits: reserved
  */
 #define __BPF_FUNC_MAPPER(FN)		\
 	FN(unspec),			\
diff --git a/kernel/bpf/sockmap.c b/kernel/bpf/sockmap.c
index 617c239590c2..cf570d108fd5 100644
--- a/kernel/bpf/sockmap.c
+++ b/kernel/bpf/sockmap.c
@@ -723,20 +723,24 @@ out:
 	return err;
 }
 
-static int sock_map_attach_prog(struct bpf_map *map,
-				struct bpf_prog *parse,
-				struct bpf_prog *verdict)
+int sock_map_attach_prog(struct bpf_map *map, struct bpf_prog *prog, u32 type)
 {
 	struct bpf_stab *stab = container_of(map, struct bpf_stab, map);
-	struct bpf_prog *_parse, *_verdict;
+	struct bpf_prog *orig;
 
-	_parse = xchg(&stab->bpf_parse, parse);
-	_verdict = xchg(&stab->bpf_verdict, verdict);
+	switch (type) {
+	case BPF_SK_SKB_STREAM_PARSER:
+		orig = xchg(&stab->bpf_parse, prog);
+		break;
+	case BPF_SK_SKB_STREAM_VERDICT:
+		orig = xchg(&stab->bpf_verdict, prog);
+		break;
+	default:
+		return -EOPNOTSUPP;
+	}
 
-	if (_parse)
-		bpf_prog_put(_parse);
-	if (_verdict)
-		bpf_prog_put(_verdict);
+	if (orig)
+		bpf_prog_put(orig);
 
 	return 0;
 }
@@ -777,7 +781,6 @@ const struct bpf_map_ops sock_map_ops = {
 	.map_get_next_key = sock_map_get_next_key,
 	.map_update_elem = sock_map_update_elem,
 	.map_delete_elem = sock_map_delete_elem,
-	.map_attach = sock_map_attach_prog,
 };
 
 BPF_CALL_5(bpf_sock_map_update, struct bpf_sock_ops_kern *, bpf_sock,
diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c
index 9378f3ba2cbf..021a05d9d800 100644
--- a/kernel/bpf/syscall.c
+++ b/kernel/bpf/syscall.c
@@ -1093,12 +1093,12 @@ static int bpf_obj_get(const union bpf_attr *attr)
 
 #ifdef CONFIG_CGROUP_BPF
 
-#define BPF_PROG_ATTACH_LAST_FIELD attach_bpf_fd2
+#define BPF_PROG_ATTACH_LAST_FIELD attach_flags
 
-static int sockmap_get_from_fd(const union bpf_attr *attr, int ptype)
+static int sockmap_get_from_fd(const union bpf_attr *attr)
 {
-	struct bpf_prog *prog1, *prog2;
 	int ufd = attr->target_fd;
+	struct bpf_prog *prog;
 	struct bpf_map *map;
 	struct fd f;
 	int err;
@@ -1108,29 +1108,16 @@ static int sockmap_get_from_fd(const union bpf_attr *attr, int ptype)
 	if (IS_ERR(map))
 		return PTR_ERR(map);
 
-	if (!map->ops->map_attach) {
-		fdput(f);
-		return -EOPNOTSUPP;
-	}
-
-	prog1 = bpf_prog_get_type(attr->attach_bpf_fd, ptype);
-	if (IS_ERR(prog1)) {
+	prog = bpf_prog_get_type(attr->attach_bpf_fd, BPF_PROG_TYPE_SK_SKB);
+	if (IS_ERR(prog)) {
 		fdput(f);
-		return PTR_ERR(prog1);
-	}
-
-	prog2 = bpf_prog_get_type(attr->attach_bpf_fd2, ptype);
-	if (IS_ERR(prog2)) {
-		fdput(f);
-		bpf_prog_put(prog1);
-		return PTR_ERR(prog2);
+		return PTR_ERR(prog);
 	}
 
-	err = map->ops->map_attach(map, prog1, prog2);
+	err = sock_map_attach_prog(map, prog, attr->attach_type);
 	if (err) {
 		fdput(f);
-		bpf_prog_put(prog1);
-		bpf_prog_put(prog2);
+		bpf_prog_put(prog);
 		return err;
 	}
 
@@ -1165,16 +1152,13 @@ static int bpf_prog_attach(const union bpf_attr *attr)
 	case BPF_CGROUP_SOCK_OPS:
 		ptype = BPF_PROG_TYPE_SOCK_OPS;
 		break;
-	case BPF_CGROUP_SMAP_INGRESS:
-		ptype = BPF_PROG_TYPE_SK_SKB;
-		break;
+	case BPF_SK_SKB_STREAM_PARSER:
+	case BPF_SK_SKB_STREAM_VERDICT:
+		return sockmap_get_from_fd(attr);
 	default:
 		return -EINVAL;
 	}
 
-	if (attr->attach_type == BPF_CGROUP_SMAP_INGRESS)
-		return sockmap_get_from_fd(attr, ptype);
-
 	prog = bpf_prog_get_type(attr->attach_bpf_fd, ptype);
 	if (IS_ERR(prog))
 		return PTR_ERR(prog);
diff --git a/samples/sockmap/sockmap_kern.c b/samples/sockmap/sockmap_kern.c
index 6ff986f7059b..f9b38ef82dc2 100644
--- a/samples/sockmap/sockmap_kern.c
+++ b/samples/sockmap/sockmap_kern.c
@@ -82,8 +82,7 @@ int bpf_sockmap(struct bpf_sock_ops *skops)
 		if (lport == 10000) {
 			ret = 1;
 			err = bpf_sock_map_update(skops, &sock_map, &ret,
-						  BPF_NOEXIST,
-						  BPF_SOCKMAP_STRPARSER);
+						  BPF_NOEXIST);
 			bpf_printk("passive(%i -> %i) map ctx update err: %d\n",
 				   lport, bpf_ntohl(rport), err);
 		}
@@ -95,8 +94,7 @@ int bpf_sockmap(struct bpf_sock_ops *skops)
 		if (bpf_ntohl(rport) == 10001) {
 			ret = 10;
 			err = bpf_sock_map_update(skops, &sock_map, &ret,
-						  BPF_NOEXIST,
-						  BPF_SOCKMAP_STRPARSER);
+						  BPF_NOEXIST);
 			bpf_printk("active(%i -> %i) map ctx update err: %d\n",
 				   lport, bpf_ntohl(rport), err);
 		}
diff --git a/samples/sockmap/sockmap_user.c b/samples/sockmap/sockmap_user.c
index fb78f5abefb4..7cc9d228216f 100644
--- a/samples/sockmap/sockmap_user.c
+++ b/samples/sockmap/sockmap_user.c
@@ -256,8 +256,16 @@ int main(int argc, char **argv)
 	}
 
 	/* Attach programs to sockmap */
-	err = __bpf_prog_attach(prog_fd[0], prog_fd[1], map_fd[0],
-				BPF_CGROUP_SMAP_INGRESS, 0);
+	err = bpf_prog_attach(prog_fd[0], map_fd[0],
+				BPF_SK_SKB_STREAM_PARSER, 0);
+	if (err) {
+		fprintf(stderr, "ERROR: bpf_prog_attach (sockmap): %d (%s)\n",
+			err, strerror(errno));
+		return err;
+	}
+
+	err = bpf_prog_attach(prog_fd[1], map_fd[0],
+				BPF_SK_SKB_STREAM_VERDICT, 0);
 	if (err) {
 		fprintf(stderr, "ERROR: bpf_prog_attach (sockmap): %d (%s)\n",
 			err, strerror(errno));
diff --git a/tools/include/uapi/linux/bpf.h b/tools/include/uapi/linux/bpf.h
index f8f6377fd541..09ac590eefb1 100644
--- a/tools/include/uapi/linux/bpf.h
+++ b/tools/include/uapi/linux/bpf.h
@@ -136,7 +136,8 @@ enum bpf_attach_type {
 	BPF_CGROUP_INET_EGRESS,
 	BPF_CGROUP_INET_SOCK_CREATE,
 	BPF_CGROUP_SOCK_OPS,
-	BPF_CGROUP_SMAP_INGRESS,
+	BPF_SK_SKB_STREAM_PARSER,
+	BPF_SK_SKB_STREAM_VERDICT,
 	__MAX_BPF_ATTACH_TYPE
 };
 
@@ -227,7 +228,6 @@ union bpf_attr {
 		__u32		attach_bpf_fd;	/* eBPF program to attach */
 		__u32		attach_type;
 		__u32		attach_flags;
-		__u32		attach_bpf_fd2;
 	};
 
 	struct { /* anonymous struct used by BPF_PROG_TEST_RUN command */
@@ -572,14 +572,11 @@ union bpf_attr {
  *     @flags: reserved for future use
  *     Return: SK_REDIRECT
  *
- * int bpf_sock_map_update(skops, map, key, flags, map_flags)
+ * int bpf_sock_map_update(skops, map, key, flags)
  *	@skops: pointer to bpf_sock_ops
  *	@map: pointer to sockmap to update
  *	@key: key to insert/update sock in map
  *	@flags: same flags as map update elem
- *	@map_flags: sock map specific flags
- *	   bit 1: Enable strparser
- *	   other bits: reserved
  */
 #define __BPF_FUNC_MAPPER(FN)		\
 	FN(unspec),			\
diff --git a/tools/lib/bpf/bpf.c b/tools/lib/bpf/bpf.c
index a0717610b116..1d6907d379c9 100644
--- a/tools/lib/bpf/bpf.c
+++ b/tools/lib/bpf/bpf.c
@@ -235,28 +235,20 @@ int bpf_obj_get(const char *pathname)
 	return sys_bpf(BPF_OBJ_GET, &attr, sizeof(attr));
 }
 
-int __bpf_prog_attach(int prog_fd1, int prog_fd2, int target_fd,
-		      enum bpf_attach_type type,
-		      unsigned int flags)
+int bpf_prog_attach(int prog_fd, int target_fd, enum bpf_attach_type type,
+		    unsigned int flags)
 {
 	union bpf_attr attr;
 
 	bzero(&attr, sizeof(attr));
 	attr.target_fd	   = target_fd;
-	attr.attach_bpf_fd = prog_fd1;
-	attr.attach_bpf_fd2 = prog_fd2;
+	attr.attach_bpf_fd = prog_fd;
 	attr.attach_type   = type;
 	attr.attach_flags  = flags;
 
 	return sys_bpf(BPF_PROG_ATTACH, &attr, sizeof(attr));
 }
 
-int bpf_prog_attach(int prog_fd, int target_fd, enum bpf_attach_type type,
-		    unsigned int flags)
-{
-	return __bpf_prog_attach(prog_fd, 0, target_fd, type, flags);
-}
-
 int bpf_prog_detach(int target_fd, enum bpf_attach_type type)
 {
 	union bpf_attr attr;
diff --git a/tools/lib/bpf/bpf.h b/tools/lib/bpf/bpf.h
index 90e9d4e85d08..b8ea5843c39e 100644
--- a/tools/lib/bpf/bpf.h
+++ b/tools/lib/bpf/bpf.h
@@ -56,10 +56,6 @@ int bpf_obj_pin(int fd, const char *pathname);
 int bpf_obj_get(const char *pathname);
 int bpf_prog_attach(int prog_fd, int attachable_fd, enum bpf_attach_type type,
 		    unsigned int flags);
-int __bpf_prog_attach(int prog1, int prog2,
-		      int attachable_fd,
-		      enum bpf_attach_type type,
-		      unsigned int flags);
 int bpf_prog_detach(int attachable_fd, enum bpf_attach_type type);
 int bpf_prog_test_run(int prog_fd, int repeat, void *data, __u32 size,
 		      void *data_out, __u32 *size_out, __u32 *retval,
diff --git a/tools/testing/selftests/bpf/bpf_helpers.h b/tools/testing/selftests/bpf/bpf_helpers.h
index 98f3be26d390..36fb9161b34a 100644
--- a/tools/testing/selftests/bpf/bpf_helpers.h
+++ b/tools/testing/selftests/bpf/bpf_helpers.h
@@ -68,8 +68,7 @@ static int (*bpf_setsockopt)(void *ctx, int level, int optname, void *optval,
 static int (*bpf_sk_redirect_map)(void *map, int key, int flags) =
 	(void *) BPF_FUNC_sk_redirect_map;
 static int (*bpf_sock_map_update)(void *map, void *key, void *value,
-				  unsigned long long flags,
-				  unsigned long long map_lags) =
+				  unsigned long long flags) =
 	(void *) BPF_FUNC_sock_map_update;
 
 
diff --git a/tools/testing/selftests/bpf/sockmap_parse_prog.c b/tools/testing/selftests/bpf/sockmap_parse_prog.c
index 8b5453158399..710f43f42dc4 100644
--- a/tools/testing/selftests/bpf/sockmap_parse_prog.c
+++ b/tools/testing/selftests/bpf/sockmap_parse_prog.c
@@ -30,7 +30,7 @@ int bpf_prog1(struct __sk_buff *skb)
 	 */
 	d[0] = 1;
 
-	bpf_printk("data[0] = (%u): local_port %i remote %i\n",
+	bpf_printk("parse: data[0] = (%u): local_port %i remote %i\n",
 		   d[0], lport, bpf_ntohl(rport));
 	return skb->len;
 }
diff --git a/tools/testing/selftests/bpf/sockmap_verdict_prog.c b/tools/testing/selftests/bpf/sockmap_verdict_prog.c
index d5f9447b3808..0573c1db2519 100644
--- a/tools/testing/selftests/bpf/sockmap_verdict_prog.c
+++ b/tools/testing/selftests/bpf/sockmap_verdict_prog.c
@@ -40,7 +40,7 @@ int bpf_prog2(struct __sk_buff *skb)
 	d[6] = 0xe;
 	d[7] = 0xf;
 
-	bpf_printk("data[0] = (%u): local_port %i remote %i\n",
+	bpf_printk("verdict: data[0] = (%u): local_port %i remote %i redirect 5\n",
 		   d[0], lport, bpf_ntohl(rport));
 	return bpf_sk_redirect_map(&sock_map, 5, 0);
 }
diff --git a/tools/testing/selftests/bpf/test_maps.c b/tools/testing/selftests/bpf/test_maps.c
index 40b2d1faf02b..6df6e6257424 100644
--- a/tools/testing/selftests/bpf/test_maps.c
+++ b/tools/testing/selftests/bpf/test_maps.c
@@ -547,20 +547,26 @@ static void test_sockmap(int task, void *data)
 		goto out_sockmap;
 	}
 
-	/* Nothing attached so these should fail */
+	/* Test update without programs */
 	for (i = 0; i < 6; i++) {
 		err = bpf_map_update_elem(fd, &i, &sfd[i], BPF_ANY);
-		if (!err) {
-			printf("Failed invalid update sockmap '%i:%i'\n",
+		if (err) {
+			printf("Failed noprog update sockmap '%i:%i'\n",
 			       i, sfd[i]);
 			goto out_sockmap;
 		}
 	}
 
 	/* Test attaching bad fds */
-	err = __bpf_prog_attach(-1, -2, fd, BPF_CGROUP_SMAP_INGRESS, 0);
+	err = bpf_prog_attach(-1, fd, BPF_SK_SKB_STREAM_PARSER, 0);
 	if (!err) {
-		printf("Failed invalid prog attach\n");
+		printf("Failed invalid parser prog attach\n");
+		goto out_sockmap;
+	}
+
+	err = bpf_prog_attach(-1, fd, BPF_SK_SKB_STREAM_VERDICT, 0);
+	if (!err) {
+		printf("Failed invalid verdict prog attach\n");
 		goto out_sockmap;
 	}
 
@@ -591,14 +597,21 @@ static void test_sockmap(int task, void *data)
 		goto out_sockmap;
 	}
 
-	err = __bpf_prog_attach(parse_prog, verdict_prog, map_fd,
-				BPF_CGROUP_SMAP_INGRESS, 0);
+	err = bpf_prog_attach(parse_prog, map_fd,
+		      BPF_SK_SKB_STREAM_PARSER, 0);
+	if (err) {
+		printf("Failed bpf prog attach\n");
+		goto out_sockmap;
+	}
+
+	err = bpf_prog_attach(verdict_prog, map_fd,
+			      BPF_SK_SKB_STREAM_VERDICT, 0);
 	if (err) {
 		printf("Failed bpf prog attach\n");
 		goto out_sockmap;
 	}
 
-	/* Test map update elem */
+	/* Test map update elem afterwards fd lives in fd and map_fd */
 	for (i = 0; i < 6; i++) {
 		err = bpf_map_update_elem(map_fd, &i, &sfd[i], BPF_ANY);
 		if (err) {
@@ -649,96 +662,68 @@ static void test_sockmap(int task, void *data)
 		goto out_sockmap;
 	}
 
-	/* Delete the reset of the elems include some NULL elems */
-	for (i = 0; i < 6; i++) {
-		err = bpf_map_delete_elem(map_fd, &i);
-		if (err && (i == 0 || i == 1 || i >= 4)) {
-			printf("Failed delete  sockmap %i '%i:%i'\n",
-			       err, i, sfd[i]);
-			goto out_sockmap;
-		} else if (!err && (i == 2 || i == 3)) {
-			printf("Failed null delete sockmap %i '%i:%i'\n",
-			       err, i, sfd[i]);
-			goto out_sockmap;
-		}
-	}
-
-	/* Test having multiple SMAPs open and active on same fds */
-	err = __bpf_prog_attach(parse_prog, verdict_prog, fd,
-				BPF_CGROUP_SMAP_INGRESS, 0);
-	if (err) {
-		printf("Failed fd bpf prog attach\n");
-		goto out_sockmap;
-	}
-
-	for (i = 0; i < 6; i++) {
-		err = bpf_map_update_elem(fd, &i, &sfd[i], BPF_ANY);
-		if (err) {
-			printf("Failed fd update sockmap %i '%i:%i'\n",
-			       err, i, sfd[i]);
-			goto out_sockmap;
-		}
-	}
-
-	/* Test duplicate socket add of NOEXIST, ANY and EXIST */
-	i = 0;
+	/* Push fd into same slot */
+	i = 2;
 	err = bpf_map_update_elem(fd, &i, &sfd[i], BPF_NOEXIST);
 	if (!err) {
-		printf("Failed BPF_NOEXIST create\n");
+		printf("Failed allowed sockmap dup slot BPF_NOEXIST\n");
 		goto out_sockmap;
 	}
 
 	err = bpf_map_update_elem(fd, &i, &sfd[i], BPF_ANY);
 	if (err) {
-		printf("Failed sockmap update BPF_ANY\n");
+		printf("Failed sockmap update new slot BPF_ANY\n");
 		goto out_sockmap;
 	}
 
 	err = bpf_map_update_elem(fd, &i, &sfd[i], BPF_EXIST);
 	if (err) {
-		printf("Failed sockmap update BPF_EXIST\n");
+		printf("Failed sockmap update new slot BPF_EXIST\n");
 		goto out_sockmap;
 	}
 
-	/* The above were pushing fd into same slot try different slot now */
-	i = 2;
-	err = bpf_map_update_elem(fd, &i, &sfd[i], BPF_NOEXIST);
-	if (!err) {
-		printf("Failed BPF_NOEXIST create\n");
-		goto out_sockmap;
+	/* Delete the elems without programs */
+	for (i = 0; i < 6; i++) {
+		err = bpf_map_delete_elem(fd, &i);
+		if (err) {
+			printf("Failed delete sockmap %i '%i:%i'\n",
+			       err, i, sfd[i]);
+		}
 	}
 
-	err = bpf_map_update_elem(fd, &i, &sfd[i], BPF_ANY);
+	/* Test having multiple maps open and set with programs on same fds */
+	err = bpf_prog_attach(parse_prog, fd,
+			      BPF_SK_SKB_STREAM_PARSER, 0);
 	if (err) {
-		printf("Failed sockmap update BPF_ANY\n");
+		printf("Failed fd bpf parse prog attach\n");
 		goto out_sockmap;
 	}
-
-	err = bpf_map_update_elem(fd, &i, &sfd[i], BPF_EXIST);
+	err = bpf_prog_attach(verdict_prog, fd,
+			      BPF_SK_SKB_STREAM_VERDICT, 0);
 	if (err) {
-		printf("Failed sockmap update BPF_EXIST\n");
+		printf("Failed fd bpf verdict prog attach\n");
 		goto out_sockmap;
 	}
 
-	/* Try pushing fd into different map, this is not allowed at the
-	 * moment. Which programs would we use?
-	 */
-	err = bpf_map_update_elem(map_fd, &i, &sfd[i], BPF_NOEXIST);
-	if (!err) {
-		printf("Failed BPF_NOEXIST create\n");
-		goto out_sockmap;
-	}
-
-	err = bpf_map_update_elem(map_fd, &i, &sfd[i], BPF_ANY);
-	if (!err) {
-		printf("Failed sockmap update BPF_ANY\n");
-		goto out_sockmap;
-	}
-
-	err = bpf_map_update_elem(map_fd, &i, &sfd[i], BPF_EXIST);
-	if (!err) {
-		printf("Failed sockmap update BPF_EXIST\n");
-		goto out_sockmap;
+	for (i = 4; i < 6; i++) {
+		err = bpf_map_update_elem(fd, &i, &sfd[i], BPF_ANY);
+		if (!err) {
+			printf("Failed allowed duplicate programs in update ANY sockmap %i '%i:%i'\n",
+			       err, i, sfd[i]);
+			goto out_sockmap;
+		}
+		err = bpf_map_update_elem(fd, &i, &sfd[i], BPF_NOEXIST);
+		if (!err) {
+			printf("Failed allowed duplicate program in update NOEXIST sockmap  %i '%i:%i'\n",
+			       err, i, sfd[i]);
+			goto out_sockmap;
+		}
+		err = bpf_map_update_elem(fd, &i, &sfd[i], BPF_EXIST);
+		if (!err) {
+			printf("Failed allowed duplicate program in update EXIST sockmap  %i '%i:%i'\n",
+			       err, i, sfd[i]);
+			goto out_sockmap;
+		}
 	}
 
 	/* Test map close sockets */
-- 
cgit v1.2.3-71-gd317


From 2f857d04601a1bb56958b95a9f180bce0e91e5e6 Mon Sep 17 00:00:00 2001
From: John Fastabend <john.fastabend@gmail.com>
Date: Mon, 28 Aug 2017 07:10:25 -0700
Subject: bpf: sockmap, remove STRPARSER map_flags and add multi-map support

The addition of map_flags BPF_SOCKMAP_STRPARSER flags was to handle a
specific use case where we want to have BPF parse program disabled on
an entry in a sockmap.

However, Alexei found the API a bit cumbersome and I agreed. Lets
remove the STRPARSER flag and support the use case by allowing socks
to be in multiple maps. This allows users to create two maps one with
programs attached and one without. When socks are added to maps they
now inherit any programs attached to the map. This is a nice
generalization and IMO improves the API.

The API rules are less ambiguous and do not need a flag:

  - When a sock is added to a sockmap we have two cases,

     i. The sock map does not have any attached programs so
        we can add sock to map without inheriting bpf programs.
        The sock may exist in 0 or more other maps.

    ii. The sock map has an attached BPF program. To avoid duplicate
        bpf programs we only add the sock entry if it does not have
        an existing strparser/verdict attached, returning -EBUSY if
        a program is already attached. Otherwise attach the program
        and inherit strparser/verdict programs from the sock map.

This allows for socks to be in a multiple maps for redirects and
inherit a BPF program from a single map.

Also this patch simplifies the logic around BPF_{EXIST|NOEXIST|ANY}
flags. In the original patch I tried to be extra clever and only
update map entries when necessary. Now I've decided the complexity
is not worth it. If users constantly update an entry with the same
sock for no reason (i.e. update an entry without actually changing
any parameters on map or sock) we still do an alloc/release. Using
this and allowing multiple entries of a sock to exist in a map the
logic becomes much simpler.

Note: Now that multiple maps are supported the "maps" pointer called
when a socket is closed becomes a list of maps to remove the sock from.
To keep the map up to date when a sock is added to the sockmap we must
add the map/elem in the list. Likewise when it is removed we must
remove it from the list. This results in searching the per psock list
on delete operation. On TCP_CLOSE events we walk the list and remove
the psock from all map/entry locations. I don't see any perf
implications in this because at most I have a psock in two maps. If
a psock were to be in many maps its possibly this might be noticeable
on delete but I can't think of a reason to dup a psock in many maps.
The sk_callback_lock is used to protect read/writes to the list. This
was convenient because in all locations we were taking the lock
anyways just after working on the list. Also the lock is per sock so
in normal cases we shouldn't see any contention.

Suggested-by: Alexei Starovoitov <ast@kernel.org>
Fixes: 174a79ff9515 ("bpf: sockmap with sk redirect support")
Signed-off-by: John Fastabend <john.fastabend@gmail.com>
Acked-by: Alexei Starovoitov <ast@kernel.org>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/uapi/linux/bpf.h |   3 -
 kernel/bpf/sockmap.c     | 269 +++++++++++++++++++++++++++++------------------
 2 files changed, 165 insertions(+), 107 deletions(-)

(limited to 'include/uapi/linux')

diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h
index 97227be3690c..08c206a863e1 100644
--- a/include/uapi/linux/bpf.h
+++ b/include/uapi/linux/bpf.h
@@ -143,9 +143,6 @@ enum bpf_attach_type {
 
 #define MAX_BPF_ATTACH_TYPE __MAX_BPF_ATTACH_TYPE
 
-/* If BPF_SOCKMAP_STRPARSER is used sockmap will use strparser on receive */
-#define BPF_SOCKMAP_STRPARSER	(1U << 0)
-
 /* If BPF_F_ALLOW_OVERRIDE flag is used in BPF_PROG_ATTACH command
  * to the given target_fd cgroup the descendent cgroup will be able to
  * override effective bpf program that was inherited from this cgroup
diff --git a/kernel/bpf/sockmap.c b/kernel/bpf/sockmap.c
index cf570d108fd5..a6882e54930b 100644
--- a/kernel/bpf/sockmap.c
+++ b/kernel/bpf/sockmap.c
@@ -13,15 +13,16 @@
 /* A BPF sock_map is used to store sock objects. This is primarly used
  * for doing socket redirect with BPF helper routines.
  *
- * A sock map may have two BPF programs attached to it, a program used
- * to parse packets and a program to provide a verdict and redirect
- * decision on the packet. If no BPF parse program is provided it is
- * assumed that every skb is a "message" (skb->len). Otherwise the
- * parse program is attached to strparser and used to build messages
- * that may span multiple skbs. The verdict program will either select
- * a socket to send/receive the skb on or provide the drop code indicating
- * the skb should be dropped. More actions may be added later as needed.
- * The default program will drop packets.
+ * A sock map may have BPF programs attached to it, currently a program
+ * used to parse packets and a program to provide a verdict and redirect
+ * decision on the packet are supported. Any programs attached to a sock
+ * map are inherited by sock objects when they are added to the map. If
+ * no BPF programs are attached the sock object may only be used for sock
+ * redirect.
+ *
+ * A sock object may be in multiple maps, but can only inherit a single
+ * parse or verdict program. If adding a sock object to a map would result
+ * in having multiple parsing programs the update will return an EBUSY error.
  *
  * For reference this program is similar to devmap used in XDP context
  * reviewing these together may be useful. For an example please review
@@ -44,15 +45,21 @@ struct bpf_stab {
 	struct sock **sock_map;
 	struct bpf_prog *bpf_parse;
 	struct bpf_prog *bpf_verdict;
-	refcount_t refcnt;
 };
 
 enum smap_psock_state {
 	SMAP_TX_RUNNING,
 };
 
+struct smap_psock_map_entry {
+	struct list_head list;
+	struct sock **entry;
+};
+
 struct smap_psock {
 	struct rcu_head	rcu;
+	/* refcnt is used inside sk_callback_lock */
+	u32 refcnt;
 
 	/* datapath variables */
 	struct sk_buff_head rxqueue;
@@ -66,10 +73,9 @@ struct smap_psock {
 	struct strparser strp;
 	struct bpf_prog *bpf_parse;
 	struct bpf_prog *bpf_verdict;
-	struct bpf_stab *stab;
+	struct list_head maps;
 
 	/* Back reference used when sock callback trigger sockmap operations */
-	int key;
 	struct sock *sock;
 	unsigned long state;
 
@@ -83,7 +89,7 @@ struct smap_psock {
 
 static inline struct smap_psock *smap_psock_sk(const struct sock *sk)
 {
-	return (struct smap_psock *)rcu_dereference_sk_user_data(sk);
+	return rcu_dereference_sk_user_data(sk);
 }
 
 static int smap_verdict_func(struct smap_psock *psock, struct sk_buff *skb)
@@ -149,11 +155,12 @@ static void smap_report_sk_error(struct smap_psock *psock, int err)
 	sk->sk_error_report(sk);
 }
 
-static void smap_release_sock(struct sock *sock);
+static void smap_release_sock(struct smap_psock *psock, struct sock *sock);
 
 /* Called with lock_sock(sk) held */
 static void smap_state_change(struct sock *sk)
 {
+	struct smap_psock_map_entry *e, *tmp;
 	struct smap_psock *psock;
 	struct sock *osk;
 
@@ -184,9 +191,15 @@ static void smap_state_change(struct sock *sk)
 		psock = smap_psock_sk(sk);
 		if (unlikely(!psock))
 			break;
-		osk = cmpxchg(&psock->stab->sock_map[psock->key], sk, NULL);
-		if (osk == sk)
-			smap_release_sock(sk);
+		write_lock_bh(&sk->sk_callback_lock);
+		list_for_each_entry_safe(e, tmp, &psock->maps, list) {
+			osk = cmpxchg(e->entry, sk, NULL);
+			if (osk == sk) {
+				list_del(&e->list);
+				smap_release_sock(psock, sk);
+			}
+		}
+		write_unlock_bh(&sk->sk_callback_lock);
 		break;
 	default:
 		psock = smap_psock_sk(sk);
@@ -289,9 +302,8 @@ static void smap_write_space(struct sock *sk)
 
 static void smap_stop_sock(struct smap_psock *psock, struct sock *sk)
 {
-	write_lock_bh(&sk->sk_callback_lock);
 	if (!psock->strp_enabled)
-		goto out;
+		return;
 	sk->sk_data_ready = psock->save_data_ready;
 	sk->sk_write_space = psock->save_write_space;
 	sk->sk_state_change = psock->save_state_change;
@@ -300,8 +312,6 @@ static void smap_stop_sock(struct smap_psock *psock, struct sock *sk)
 	psock->save_state_change = NULL;
 	strp_stop(&psock->strp);
 	psock->strp_enabled = false;
-out:
-	write_unlock_bh(&sk->sk_callback_lock);
 }
 
 static void smap_destroy_psock(struct rcu_head *rcu)
@@ -318,9 +328,11 @@ static void smap_destroy_psock(struct rcu_head *rcu)
 	schedule_work(&psock->gc_work);
 }
 
-static void smap_release_sock(struct sock *sock)
+static void smap_release_sock(struct smap_psock *psock, struct sock *sock)
 {
-	struct smap_psock *psock = smap_psock_sk(sock);
+	psock->refcnt--;
+	if (psock->refcnt)
+		return;
 
 	smap_stop_sock(psock, sock);
 	clear_bit(SMAP_TX_RUNNING, &psock->state);
@@ -414,6 +426,7 @@ static void sock_map_remove_complete(struct bpf_stab *stab)
 
 static void smap_gc_work(struct work_struct *w)
 {
+	struct smap_psock_map_entry *e, *tmp;
 	struct smap_psock *psock;
 
 	psock = container_of(w, struct smap_psock, gc_work);
@@ -431,8 +444,10 @@ static void smap_gc_work(struct work_struct *w)
 	if (psock->bpf_verdict)
 		bpf_prog_put(psock->bpf_verdict);
 
-	if (refcount_dec_and_test(&psock->stab->refcnt))
-		sock_map_remove_complete(psock->stab);
+	list_for_each_entry_safe(e, tmp, &psock->maps, list) {
+		list_del(&e->list);
+		kfree(e);
+	}
 
 	sock_put(psock->sock);
 	kfree(psock);
@@ -453,6 +468,8 @@ static struct smap_psock *smap_init_psock(struct sock *sock,
 	skb_queue_head_init(&psock->rxqueue);
 	INIT_WORK(&psock->tx_work, smap_tx_work);
 	INIT_WORK(&psock->gc_work, smap_gc_work);
+	INIT_LIST_HEAD(&psock->maps);
+	psock->refcnt = 1;
 
 	rcu_assign_sk_user_data(sock, psock);
 	sock_hold(sock);
@@ -503,13 +520,24 @@ static struct bpf_map *sock_map_alloc(union bpf_attr *attr)
 	if (!stab->sock_map)
 		goto free_stab;
 
-	refcount_set(&stab->refcnt, 1);
 	return &stab->map;
 free_stab:
 	kfree(stab);
 	return ERR_PTR(err);
 }
 
+static void smap_list_remove(struct smap_psock *psock, struct sock **entry)
+{
+	struct smap_psock_map_entry *e, *tmp;
+
+	list_for_each_entry_safe(e, tmp, &psock->maps, list) {
+		if (e->entry == entry) {
+			list_del(&e->list);
+			break;
+		}
+	}
+}
+
 static void sock_map_free(struct bpf_map *map)
 {
 	struct bpf_stab *stab = container_of(map, struct bpf_stab, map);
@@ -526,13 +554,18 @@ static void sock_map_free(struct bpf_map *map)
 	 */
 	rcu_read_lock();
 	for (i = 0; i < stab->map.max_entries; i++) {
+		struct smap_psock *psock;
 		struct sock *sock;
 
 		sock = xchg(&stab->sock_map[i], NULL);
 		if (!sock)
 			continue;
 
-		smap_release_sock(sock);
+		write_lock_bh(&sock->sk_callback_lock);
+		psock = smap_psock_sk(sock);
+		smap_list_remove(psock, &stab->sock_map[i]);
+		smap_release_sock(psock, sock);
+		write_unlock_bh(&sock->sk_callback_lock);
 	}
 	rcu_read_unlock();
 
@@ -541,8 +574,7 @@ static void sock_map_free(struct bpf_map *map)
 	if (stab->bpf_parse)
 		bpf_prog_put(stab->bpf_parse);
 
-	if (refcount_dec_and_test(&stab->refcnt))
-		sock_map_remove_complete(stab);
+	sock_map_remove_complete(stab);
 }
 
 static int sock_map_get_next_key(struct bpf_map *map, void *key, void *next_key)
@@ -576,6 +608,7 @@ struct sock  *__sock_map_lookup_elem(struct bpf_map *map, u32 key)
 static int sock_map_delete_elem(struct bpf_map *map, void *key)
 {
 	struct bpf_stab *stab = container_of(map, struct bpf_stab, map);
+	struct smap_psock *psock;
 	int k = *(u32 *)key;
 	struct sock *sock;
 
@@ -586,7 +619,17 @@ static int sock_map_delete_elem(struct bpf_map *map, void *key)
 	if (!sock)
 		return -EINVAL;
 
-	smap_release_sock(sock);
+	write_lock_bh(&sock->sk_callback_lock);
+	psock = smap_psock_sk(sock);
+	if (!psock)
+		goto out;
+
+	if (psock->bpf_parse)
+		smap_stop_sock(psock, sock);
+	smap_list_remove(psock, &stab->sock_map[k]);
+	smap_release_sock(psock, sock);
+out:
+	write_unlock_bh(&sock->sk_callback_lock);
 	return 0;
 }
 
@@ -601,29 +644,34 @@ static int sock_map_delete_elem(struct bpf_map *map, void *key)
  * and syncd so we are certain all references from the update/lookup/delete
  * operations as well as references in the data path are no longer in use.
  *
- * A psock object holds a refcnt on the sockmap it is attached to and this is
- * not decremented until after a RCU grace period and garbage collection occurs.
- * This ensures the map is not free'd until psocks linked to it are removed. The
- * map link is used when the independent sock events trigger map deletion.
+ * Psocks may exist in multiple maps, but only a single set of parse/verdict
+ * programs may be inherited from the maps it belongs to. A reference count
+ * is kept with the total number of references to the psock from all maps. The
+ * psock will not be released until this reaches zero. The psock and sock
+ * user data data use the sk_callback_lock to protect critical data structures
+ * from concurrent access. This allows us to avoid two updates from modifying
+ * the user data in sock and the lock is required anyways for modifying
+ * callbacks, we simply increase its scope slightly.
  *
- * Psocks may only participate in one sockmap at a time. Users that try to
- * join a single sock to multiple maps will get an error.
- *
- * Last, but not least, it is possible the socket is closed while running
- * an update on an existing psock. This will release the psock, but again
- * not until the update has completed due to rcu grace period rules.
+ * Rules to follow,
+ *  - psock must always be read inside RCU critical section
+ *  - sk_user_data must only be modified inside sk_callback_lock and read
+ *    inside RCU critical section.
+ *  - psock->maps list must only be read & modified inside sk_callback_lock
+ *  - sock_map must use READ_ONCE and (cmp)xchg operations
+ *  - BPF verdict/parse programs must use READ_ONCE and xchg operations
  */
 static int sock_map_ctx_update_elem(struct bpf_sock_ops_kern *skops,
 				    struct bpf_map *map,
-				    void *key, u64 flags, u64 map_flags)
+				    void *key, u64 flags)
 {
 	struct bpf_stab *stab = container_of(map, struct bpf_stab, map);
+	struct smap_psock_map_entry *e = NULL;
 	struct bpf_prog *verdict, *parse;
-	struct smap_psock *psock = NULL;
-	struct sock *old_sock, *sock;
+	struct sock *osock, *sock;
+	struct smap_psock *psock;
 	u32 i = *(u32 *)key;
-	bool update = false;
-	int err = 0;
+	int err;
 
 	if (unlikely(flags > BPF_EXIST))
 		return -EINVAL;
@@ -631,35 +679,22 @@ static int sock_map_ctx_update_elem(struct bpf_sock_ops_kern *skops,
 	if (unlikely(i >= stab->map.max_entries))
 		return -E2BIG;
 
-	if (unlikely(map_flags > BPF_SOCKMAP_STRPARSER))
-		return -EINVAL;
-
-	verdict = parse = NULL;
 	sock = READ_ONCE(stab->sock_map[i]);
-
-	if (flags == BPF_EXIST || flags == BPF_ANY) {
-		if (!sock && flags == BPF_EXIST) {
-			return -ENOENT;
-		} else if (sock && sock != skops->sk) {
-			return -EINVAL;
-		} else if (sock) {
-			psock = smap_psock_sk(sock);
-			if (unlikely(!psock))
-				return -EBUSY;
-			update = true;
-		}
-	} else if (sock && BPF_NOEXIST) {
+	if (flags == BPF_EXIST && !sock)
+		return -ENOENT;
+	else if (flags == BPF_NOEXIST && sock)
 		return -EEXIST;
-	}
 
-	/* reserve BPF programs early so can abort easily on failures */
-	if (map_flags & BPF_SOCKMAP_STRPARSER) {
-		verdict = READ_ONCE(stab->bpf_verdict);
-		parse = READ_ONCE(stab->bpf_parse);
+	sock = skops->sk;
 
-		if (!verdict || !parse)
-			return -ENOENT;
+	/* 1. If sock map has BPF programs those will be inherited by the
+	 * sock being added. If the sock is already attached to BPF programs
+	 * this results in an error.
+	 */
+	verdict = READ_ONCE(stab->bpf_verdict);
+	parse = READ_ONCE(stab->bpf_parse);
 
+	if (parse && verdict) {
 		/* bpf prog refcnt may be zero if a concurrent attach operation
 		 * removes the program after the above READ_ONCE() but before
 		 * we increment the refcnt. If this is the case abort with an
@@ -676,50 +711,78 @@ static int sock_map_ctx_update_elem(struct bpf_sock_ops_kern *skops,
 		}
 	}
 
-	if (!psock) {
-		sock = skops->sk;
-		if (rcu_dereference_sk_user_data(sock))
-			return -EEXIST;
+	write_lock_bh(&sock->sk_callback_lock);
+	psock = smap_psock_sk(sock);
+
+	/* 2. Do not allow inheriting programs if psock exists and has
+	 * already inherited programs. This would create confusion on
+	 * which parser/verdict program is running. If no psock exists
+	 * create one. Inside sk_callback_lock to ensure concurrent create
+	 * doesn't update user data.
+	 */
+	if (psock) {
+		if (READ_ONCE(psock->bpf_parse) && parse) {
+			err = -EBUSY;
+			goto out_progs;
+		}
+		psock->refcnt++;
+	} else {
 		psock = smap_init_psock(sock, stab);
 		if (IS_ERR(psock)) {
-			if (verdict)
-				bpf_prog_put(verdict);
-			if (parse)
-				bpf_prog_put(parse);
-			return PTR_ERR(psock);
+			err = PTR_ERR(psock);
+			goto out_progs;
 		}
-		psock->key = i;
-		psock->stab = stab;
-		refcount_inc(&stab->refcnt);
+
 		set_bit(SMAP_TX_RUNNING, &psock->state);
 	}
 
-	if (map_flags & BPF_SOCKMAP_STRPARSER) {
-		write_lock_bh(&sock->sk_callback_lock);
-		if (psock->strp_enabled)
-			goto start_done;
+	e = kzalloc(sizeof(*e), GFP_ATOMIC | __GFP_NOWARN);
+	if (!e) {
+		err = -ENOMEM;
+		goto out_progs;
+	}
+	e->entry = &stab->sock_map[i];
+
+	/* 3. At this point we have a reference to a valid psock that is
+	 * running. Attach any BPF programs needed.
+	 */
+	if (parse && verdict && !psock->strp_enabled) {
 		err = smap_init_sock(psock, sock);
 		if (err)
-			goto out;
+			goto out_free;
 		smap_init_progs(psock, stab, verdict, parse);
 		smap_start_sock(psock, sock);
-start_done:
-		write_unlock_bh(&sock->sk_callback_lock);
-	} else if (update) {
-		smap_stop_sock(psock, sock);
 	}
 
-	if (!update) {
-		old_sock = xchg(&stab->sock_map[i], skops->sk);
-		if (old_sock)
-			smap_release_sock(old_sock);
-	}
+	/* 4. Place psock in sockmap for use and stop any programs on
+	 * the old sock assuming its not the same sock we are replacing
+	 * it with. Because we can only have a single set of programs if
+	 * old_sock has a strp we can stop it.
+	 */
+	list_add_tail(&e->list, &psock->maps);
+	write_unlock_bh(&sock->sk_callback_lock);
 
+	osock = xchg(&stab->sock_map[i], sock);
+	if (osock) {
+		struct smap_psock *opsock = smap_psock_sk(osock);
+
+		write_lock_bh(&osock->sk_callback_lock);
+		if (osock != sock && parse)
+			smap_stop_sock(opsock, osock);
+		smap_list_remove(opsock, &stab->sock_map[i]);
+		smap_release_sock(opsock, osock);
+		write_unlock_bh(&osock->sk_callback_lock);
+	}
 	return 0;
-out:
+out_free:
+	smap_release_sock(psock, sock);
+out_progs:
+	if (verdict)
+		bpf_prog_put(verdict);
+	if (parse)
+		bpf_prog_put(parse);
 	write_unlock_bh(&sock->sk_callback_lock);
-	if (!update)
-		smap_release_sock(sock);
+	kfree(e);
 	return err;
 }
 
@@ -768,8 +831,7 @@ static int sock_map_update_elem(struct bpf_map *map,
 		return -EINVAL;
 	}
 
-	err = sock_map_ctx_update_elem(&skops, map, key,
-				       flags, BPF_SOCKMAP_STRPARSER);
+	err = sock_map_ctx_update_elem(&skops, map, key, flags);
 	fput(socket->file);
 	return err;
 }
@@ -783,11 +845,11 @@ const struct bpf_map_ops sock_map_ops = {
 	.map_delete_elem = sock_map_delete_elem,
 };
 
-BPF_CALL_5(bpf_sock_map_update, struct bpf_sock_ops_kern *, bpf_sock,
-	   struct bpf_map *, map, void *, key, u64, flags, u64, map_flags)
+BPF_CALL_4(bpf_sock_map_update, struct bpf_sock_ops_kern *, bpf_sock,
+	   struct bpf_map *, map, void *, key, u64, flags)
 {
 	WARN_ON_ONCE(!rcu_read_lock_held());
-	return sock_map_ctx_update_elem(bpf_sock, map, key, flags, map_flags);
+	return sock_map_ctx_update_elem(bpf_sock, map, key, flags);
 }
 
 const struct bpf_func_proto bpf_sock_map_update_proto = {
@@ -799,5 +861,4 @@ const struct bpf_func_proto bpf_sock_map_update_proto = {
 	.arg2_type	= ARG_CONST_MAP_PTR,
 	.arg3_type	= ARG_PTR_TO_MAP_KEY,
 	.arg4_type	= ARG_ANYTHING,
-	.arg5_type	= ARG_ANYTHING,
 };
-- 
cgit v1.2.3-71-gd317


From 63e8d4394a2d226803f47abd7287dbb6d21bf8e4 Mon Sep 17 00:00:00 2001
From: Andy Shevchenko <andriy.shevchenko@linux.intel.com>
Date: Tue, 22 Aug 2017 16:58:20 +0300
Subject: serial: pch_uart: Make port type explicit

It used to be a gap in port definitions after PORT_MAX_8250. Since the
new drivers are coming the gap become shorter and shorter until the
commit a2d6a987bfe4 ("serial: 8250: Add new port type for TI DA8xx/66AK2x")
completely removed it.

So, while type here is just a formality, make things a little bit more
explicit for this driver and move port types to UAPI header. Note,
it uses two types for now.

Fixes: fddceb8b5399 ("tty: 8250: Add 64byte UART support for FSL platforms")
Cc: Priyanka Jain <Priyanka.Jain@freescale.com>
Cc: Poonam Aggrwal <poonam.aggrwal@freescale.com>
Signed-off-by: Andy Shevchenko <andriy.shevchenko@linux.intel.com>
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 drivers/tty/serial/pch_uart.c    | 35 +++++++++++++++--------------------
 include/uapi/linux/serial_core.h |  5 ++++-
 2 files changed, 19 insertions(+), 21 deletions(-)

(limited to 'include/uapi/linux')

diff --git a/drivers/tty/serial/pch_uart.c b/drivers/tty/serial/pch_uart.c
index 3788198f105b..ae8cfc81ffc5 100644
--- a/drivers/tty/serial/pch_uart.c
+++ b/drivers/tty/serial/pch_uart.c
@@ -46,11 +46,6 @@ enum {
 	PCH_UART_HANDLED_LS_INT_SHIFT,
 };
 
-enum {
-	PCH_UART_8LINE,
-	PCH_UART_2LINE,
-};
-
 #define PCH_UART_DRIVER_DEVICE "ttyPCH"
 
 /* Set the max number of UART port
@@ -267,7 +262,7 @@ struct eg20t_port {
 
 /**
  * struct pch_uart_driver_data - private data structure for UART-DMA
- * @port_type:			The number of DMA channel
+ * @port_type:			The type of UART port
  * @line_no:			UART port line number (0, 1, 2...)
  */
 struct pch_uart_driver_data {
@@ -290,17 +285,17 @@ enum pch_uart_num_t {
 };
 
 static struct pch_uart_driver_data drv_dat[] = {
-	[pch_et20t_uart0] = {PCH_UART_8LINE, 0},
-	[pch_et20t_uart1] = {PCH_UART_2LINE, 1},
-	[pch_et20t_uart2] = {PCH_UART_2LINE, 2},
-	[pch_et20t_uart3] = {PCH_UART_2LINE, 3},
-	[pch_ml7213_uart0] = {PCH_UART_8LINE, 0},
-	[pch_ml7213_uart1] = {PCH_UART_2LINE, 1},
-	[pch_ml7213_uart2] = {PCH_UART_2LINE, 2},
-	[pch_ml7223_uart0] = {PCH_UART_8LINE, 0},
-	[pch_ml7223_uart1] = {PCH_UART_2LINE, 1},
-	[pch_ml7831_uart0] = {PCH_UART_8LINE, 0},
-	[pch_ml7831_uart1] = {PCH_UART_2LINE, 1},
+	[pch_et20t_uart0] = {PORT_PCH_8LINE, 0},
+	[pch_et20t_uart1] = {PORT_PCH_2LINE, 1},
+	[pch_et20t_uart2] = {PORT_PCH_2LINE, 2},
+	[pch_et20t_uart3] = {PORT_PCH_2LINE, 3},
+	[pch_ml7213_uart0] = {PORT_PCH_8LINE, 0},
+	[pch_ml7213_uart1] = {PORT_PCH_2LINE, 1},
+	[pch_ml7213_uart2] = {PORT_PCH_2LINE, 2},
+	[pch_ml7223_uart0] = {PORT_PCH_8LINE, 0},
+	[pch_ml7223_uart1] = {PORT_PCH_2LINE, 1},
+	[pch_ml7831_uart0] = {PORT_PCH_8LINE, 0},
+	[pch_ml7831_uart1] = {PORT_PCH_2LINE, 1},
 };
 
 #ifdef CONFIG_SERIAL_PCH_UART_CONSOLE
@@ -1777,10 +1772,10 @@ static struct eg20t_port *pch_uart_init_port(struct pci_dev *pdev,
 		goto init_port_free_txbuf;
 
 	switch (port_type) {
-	case PORT_UNKNOWN:
+	case PORT_PCH_8LINE:
 		fifosize = 256; /* EG20T/ML7213: UART0 */
 		break;
-	case PORT_8250:
+	case PORT_PCH_2LINE:
 		fifosize = 64; /* EG20T:UART1~3  ML7213: UART1~2*/
 		break;
 	default:
@@ -1804,7 +1799,7 @@ static struct eg20t_port *pch_uart_init_port(struct pci_dev *pdev,
 
 	priv->fifo_size = fifosize;
 	priv->uartclk = pch_uart_get_uartclk();
-	priv->port_type = PORT_MAX_8250 + port_type + 1;
+	priv->port_type = port_type;
 	priv->port.dev = &pdev->dev;
 	priv->port.iobase = iobase;
 	priv->port.membase = NULL;
diff --git a/include/uapi/linux/serial_core.h b/include/uapi/linux/serial_core.h
index 38bea3217ead..502aa23c7e15 100644
--- a/include/uapi/linux/serial_core.h
+++ b/include/uapi/linux/serial_core.h
@@ -57,7 +57,6 @@
 #define PORT_RT2880	29	/* Ralink RT2880 internal UART */
 #define PORT_16550A_FSL64 30	/* Freescale 16550 UART with 64 FIFOs */
 #define PORT_DA830	31	/* TI DA8xx/66AK2x */
-#define PORT_MAX_8250	31	/* max port ID */
 
 /*
  * ARM specific type numbers.  These are not currently guaranteed
@@ -77,6 +76,10 @@
 #define PORT_SUNZILOG	38
 #define PORT_SUNSAB	39
 
+/* Intel EG20 */
+#define PORT_PCH_8LINE	44
+#define PORT_PCH_2LINE	45
+
 /* DEC */
 #define PORT_DZ		46
 #define PORT_ZS		47
-- 
cgit v1.2.3-71-gd317


From 3f3dac7e4d815cb7f929c0ed98c3a45a86852e53 Mon Sep 17 00:00:00 2001
From: Andy Shevchenko <andriy.shevchenko@linux.intel.com>
Date: Tue, 22 Aug 2017 16:58:21 +0300
Subject: serial: Remove unused port type

PORT_MFD is not in use since commit

	1bd187de5364 ("x86, intel-mid: remove Intel MID specific serial support")

Remove leftover.

Fixes: 1bd187de5364 ("x86, intel-mid: remove Intel MID specific serial support")
Signed-off-by: Andy Shevchenko <andriy.shevchenko@linux.intel.com>
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 include/uapi/linux/serial_core.h | 3 ---
 1 file changed, 3 deletions(-)

(limited to 'include/uapi/linux')

diff --git a/include/uapi/linux/serial_core.h b/include/uapi/linux/serial_core.h
index 502aa23c7e15..00d335634271 100644
--- a/include/uapi/linux/serial_core.h
+++ b/include/uapi/linux/serial_core.h
@@ -209,9 +209,6 @@
 /* MAX310X */
 #define PORT_MAX310X	94
 
-/* High Speed UART for Medfield */
-#define PORT_MFD	95
-
 /* TI OMAP-UART */
 #define PORT_OMAP	96
 
-- 
cgit v1.2.3-71-gd317


From ee1c90cc2cea80638f559c552371ee6893ca9d9e Mon Sep 17 00:00:00 2001
From: Andy Shevchenko <andriy.shevchenko@linux.intel.com>
Date: Tue, 22 Aug 2017 16:58:22 +0300
Subject: serial: Fix port type numbering for TI DA8xx

The UAPI has a global list of unique numbers for different port types.
The commit
	a2d6a987bfe4 ("serial: 8250: Add new port type for TI DA8xx/66AK2x")
introduced a new port type and brought the collision with two other port
types.

Reuse 95 for it instead.

Fixes: a2d6a987bfe4 ("serial: 8250: Add new port type for TI DA8xx/66AK2x")
Cc: David Lechner <david@lechnology.com>
Cc: Sekhar Nori <nsekhar@ti.com>
Signed-off-by: Andy Shevchenko <andriy.shevchenko@linux.intel.com>
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 include/uapi/linux/serial_core.h | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

(limited to 'include/uapi/linux')

diff --git a/include/uapi/linux/serial_core.h b/include/uapi/linux/serial_core.h
index 00d335634271..dc2d7cb766ab 100644
--- a/include/uapi/linux/serial_core.h
+++ b/include/uapi/linux/serial_core.h
@@ -56,7 +56,6 @@
 #define PORT_ALTR_16550_F128 28 /* Altera 16550 UART with 128 FIFOs */
 #define PORT_RT2880	29	/* Ralink RT2880 internal UART */
 #define PORT_16550A_FSL64 30	/* Freescale 16550 UART with 64 FIFOs */
-#define PORT_DA830	31	/* TI DA8xx/66AK2x */
 
 /*
  * ARM specific type numbers.  These are not currently guaranteed
@@ -209,6 +208,9 @@
 /* MAX310X */
 #define PORT_MAX310X	94
 
+/* TI DA8xx/66AK2x */
+#define PORT_DA830	95
+
 /* TI OMAP-UART */
 #define PORT_OMAP	96
 
-- 
cgit v1.2.3-71-gd317


From 1c16ae65e2502da05310b2ec56b3a1fd3efe6f4d Mon Sep 17 00:00:00 2001
From: Sean Wang <sean.wang@mediatek.com>
Date: Mon, 21 Aug 2017 01:17:56 +0800
Subject: serial: 8250: of: Add new port type for MediaTek BTIF controller on
 MT7622/23 SoC

MediaTek BTIF controller is the serial interface similar to UART but it
works only as the digital device which is mainly used to communicate with
the connectivity module called CONNSYS inside the SoC which could be mostly
found on those MediaTek SoCs with Bluetooth feature such as MT7622 and
MT7623 SoCs.

And the controller is made as being compatible with the 8250 register
layout with extra registers such as DMA enablement so it tends to be
integrated with reusing 8250 OF driver. However, DMA mode is not being
supported yet in the current driver.

Signed-off-by: Sean Wang <sean.wang@mediatek.com>
Suggested-by: Andy Shevchenko <andriy.shevchenko@linux.intel.com>
Acked-by: Arnd Bergmann <arnd@arndb.de>
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 drivers/tty/serial/8250/8250_of.c   | 2 ++
 drivers/tty/serial/8250/8250_port.c | 8 ++++++++
 include/uapi/linux/serial_core.h    | 3 +++
 3 files changed, 13 insertions(+)

(limited to 'include/uapi/linux')

diff --git a/drivers/tty/serial/8250/8250_of.c b/drivers/tty/serial/8250/8250_of.c
index 30eacbffaa51..1222c005fb98 100644
--- a/drivers/tty/serial/8250/8250_of.c
+++ b/drivers/tty/serial/8250/8250_of.c
@@ -313,6 +313,8 @@ static const struct of_device_id of_platform_serial_table[] = {
 		.data = (void *)PORT_ALTR_16550_F64, },
 	{ .compatible = "altr,16550-FIFO128",
 		.data = (void *)PORT_ALTR_16550_F128, },
+	{ .compatible = "mediatek,mtk-btif",
+		.data = (void *)PORT_MTK_BTIF, },
 	{ .compatible = "mrvl,mmp-uart",
 		.data = (void *)PORT_XSCALE, },
 	{ .compatible = "ti,da830-uart", .data = (void *)PORT_DA830, },
diff --git a/drivers/tty/serial/8250/8250_port.c b/drivers/tty/serial/8250/8250_port.c
index 6b745e440b81..4726aa276968 100644
--- a/drivers/tty/serial/8250/8250_port.c
+++ b/drivers/tty/serial/8250/8250_port.c
@@ -289,6 +289,14 @@ static const struct serial8250_config uart_config[] = {
 		.rxtrig_bytes	= {1, 4, 8, 14},
 		.flags		= UART_CAP_FIFO | UART_CAP_AFE,
 	},
+	[PORT_MTK_BTIF] = {
+		.name		= "MediaTek BTIF",
+		.fifo_size	= 16,
+		.tx_loadsz	= 16,
+		.fcr		= UART_FCR_ENABLE_FIFO |
+				  UART_FCR_CLEAR_RCVR | UART_FCR_CLEAR_XMIT,
+		.flags		= UART_CAP_FIFO,
+	},
 };
 
 /* Uart divisor latch read */
diff --git a/include/uapi/linux/serial_core.h b/include/uapi/linux/serial_core.h
index dc2d7cb766ab..50d71c436323 100644
--- a/include/uapi/linux/serial_core.h
+++ b/include/uapi/linux/serial_core.h
@@ -274,4 +274,7 @@
 /* MPS2 UART */
 #define PORT_MPS2UART	116
 
+/* MediaTek BTIF */
+#define PORT_MTK_BTIF	117
+
 #endif /* _UAPILINUX_SERIAL_CORE_H */
-- 
cgit v1.2.3-71-gd317


From fc7ce9c74c3ad232b084d80148654f926d01ece7 Mon Sep 17 00:00:00 2001
From: Kan Liang <kan.liang@intel.com>
Date: Mon, 28 Aug 2017 20:52:49 -0400
Subject: perf/core, x86: Add PERF_SAMPLE_PHYS_ADDR

For understanding how the workload maps to memory channels and hardware
behavior, it's very important to collect address maps with physical
addresses. For example, 3D XPoint access can only be found by filtering
the physical address.

Add a new sample type for physical address.

perf already has a facility to collect data virtual address. This patch
introduces a function to convert the virtual address to physical address.
The function is quite generic and can be extended to any architecture as
long as a virtual address is provided.

 - For kernel direct mapping addresses, virt_to_phys is used to convert
   the virtual addresses to physical address.

 - For user virtual addresses, __get_user_pages_fast is used to walk the
   pages tables for user physical address.

 - This does not work for vmalloc addresses right now. These are not
   resolved, but code to do that could be added.

The new sample type requires collecting the virtual address. The
virtual address will not be output unless SAMPLE_ADDR is applied.

For security, the physical address can only be exposed to root or
privileged user.

Tested-by: Madhavan Srinivasan <maddy@linux.vnet.ibm.com>
Signed-off-by: Kan Liang <kan.liang@intel.com>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Cc: Alexander Shishkin <alexander.shishkin@linux.intel.com>
Cc: Arnaldo Carvalho de Melo <acme@redhat.com>
Cc: Jiri Olsa <jolsa@redhat.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Stephane Eranian <eranian@google.com>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: Vince Weaver <vincent.weaver@maine.edu>
Cc: acme@kernel.org
Cc: mpe@ellerman.id.au
Link: http://lkml.kernel.org/r/1503967969-48278-1-git-send-email-kan.liang@intel.com
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 arch/powerpc/perf/core-book3s.c |  3 ++-
 arch/x86/events/intel/ds.c      |  2 +-
 arch/x86/events/perf_event.h    |  2 +-
 include/linux/perf_event.h      |  2 ++
 include/uapi/linux/perf_event.h |  4 +++-
 kernel/events/core.c            | 46 +++++++++++++++++++++++++++++++++++++++++
 6 files changed, 55 insertions(+), 4 deletions(-)

(limited to 'include/uapi/linux')

diff --git a/arch/powerpc/perf/core-book3s.c b/arch/powerpc/perf/core-book3s.c
index 6c2d4168daec..2e3eb7431571 100644
--- a/arch/powerpc/perf/core-book3s.c
+++ b/arch/powerpc/perf/core-book3s.c
@@ -2039,7 +2039,8 @@ static void record_and_restart(struct perf_event *event, unsigned long val,
 
 		perf_sample_data_init(&data, ~0ULL, event->hw.last_period);
 
-		if (event->attr.sample_type & PERF_SAMPLE_ADDR)
+		if (event->attr.sample_type &
+		    (PERF_SAMPLE_ADDR | PERF_SAMPLE_PHYS_ADDR))
 			perf_get_data_addr(regs, &data.addr);
 
 		if (event->attr.sample_type & PERF_SAMPLE_BRANCH_STACK) {
diff --git a/arch/x86/events/intel/ds.c b/arch/x86/events/intel/ds.c
index 98e36e0c791c..e1965e5ff570 100644
--- a/arch/x86/events/intel/ds.c
+++ b/arch/x86/events/intel/ds.c
@@ -1185,7 +1185,7 @@ static void setup_pebs_sample_data(struct perf_event *event,
 	else
 		regs->flags &= ~PERF_EFLAGS_EXACT;
 
-	if ((sample_type & PERF_SAMPLE_ADDR) &&
+	if ((sample_type & (PERF_SAMPLE_ADDR | PERF_SAMPLE_PHYS_ADDR)) &&
 	    x86_pmu.intel_cap.pebs_format >= 1)
 		data->addr = pebs->dla;
 
diff --git a/arch/x86/events/perf_event.h b/arch/x86/events/perf_event.h
index 9337589014cc..4196f81ec0e1 100644
--- a/arch/x86/events/perf_event.h
+++ b/arch/x86/events/perf_event.h
@@ -91,7 +91,7 @@ struct amd_nb {
 	(PERF_SAMPLE_IP | PERF_SAMPLE_TID | PERF_SAMPLE_ADDR | \
 	PERF_SAMPLE_ID | PERF_SAMPLE_CPU | PERF_SAMPLE_STREAM_ID | \
 	PERF_SAMPLE_DATA_SRC | PERF_SAMPLE_IDENTIFIER | \
-	PERF_SAMPLE_TRANSACTION)
+	PERF_SAMPLE_TRANSACTION | PERF_SAMPLE_PHYS_ADDR)
 
 /*
  * A debug store configuration.
diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h
index adda0aaae6c8..718ba163c1b9 100644
--- a/include/linux/perf_event.h
+++ b/include/linux/perf_event.h
@@ -943,6 +943,8 @@ struct perf_sample_data {
 
 	struct perf_regs		regs_intr;
 	u64				stack_user_size;
+
+	u64				phys_addr;
 } ____cacheline_aligned;
 
 /* default value for data source */
diff --git a/include/uapi/linux/perf_event.h b/include/uapi/linux/perf_event.h
index 2a37ae925d85..140ae638cfd6 100644
--- a/include/uapi/linux/perf_event.h
+++ b/include/uapi/linux/perf_event.h
@@ -139,8 +139,9 @@ enum perf_event_sample_format {
 	PERF_SAMPLE_IDENTIFIER			= 1U << 16,
 	PERF_SAMPLE_TRANSACTION			= 1U << 17,
 	PERF_SAMPLE_REGS_INTR			= 1U << 18,
+	PERF_SAMPLE_PHYS_ADDR			= 1U << 19,
 
-	PERF_SAMPLE_MAX = 1U << 19,		/* non-ABI */
+	PERF_SAMPLE_MAX = 1U << 20,		/* non-ABI */
 };
 
 /*
@@ -814,6 +815,7 @@ enum perf_event_type {
 	 *	{ u64			transaction; } && PERF_SAMPLE_TRANSACTION
 	 *	{ u64			abi; # enum perf_sample_regs_abi
 	 *	  u64			regs[weight(mask)]; } && PERF_SAMPLE_REGS_INTR
+	 *	{ u64			phys_addr;} && PERF_SAMPLE_PHYS_ADDR
 	 * };
 	 */
 	PERF_RECORD_SAMPLE			= 9,
diff --git a/kernel/events/core.c b/kernel/events/core.c
index 77fd6b11ef22..ce64f3fed5c6 100644
--- a/kernel/events/core.c
+++ b/kernel/events/core.c
@@ -1575,6 +1575,9 @@ static void __perf_event_header_size(struct perf_event *event, u64 sample_type)
 	if (sample_type & PERF_SAMPLE_TRANSACTION)
 		size += sizeof(data->txn);
 
+	if (sample_type & PERF_SAMPLE_PHYS_ADDR)
+		size += sizeof(data->phys_addr);
+
 	event->header_size = size;
 }
 
@@ -6017,6 +6020,9 @@ void perf_output_sample(struct perf_output_handle *handle,
 		}
 	}
 
+	if (sample_type & PERF_SAMPLE_PHYS_ADDR)
+		perf_output_put(handle, data->phys_addr);
+
 	if (!event->attr.watermark) {
 		int wakeup_events = event->attr.wakeup_events;
 
@@ -6032,6 +6038,38 @@ void perf_output_sample(struct perf_output_handle *handle,
 	}
 }
 
+static u64 perf_virt_to_phys(u64 virt)
+{
+	u64 phys_addr = 0;
+	struct page *p = NULL;
+
+	if (!virt)
+		return 0;
+
+	if (virt >= TASK_SIZE) {
+		/* If it's vmalloc()d memory, leave phys_addr as 0 */
+		if (virt_addr_valid((void *)(uintptr_t)virt) &&
+		    !(virt >= VMALLOC_START && virt < VMALLOC_END))
+			phys_addr = (u64)virt_to_phys((void *)(uintptr_t)virt);
+	} else {
+		/*
+		 * Walking the pages tables for user address.
+		 * Interrupts are disabled, so it prevents any tear down
+		 * of the page tables.
+		 * Try IRQ-safe __get_user_pages_fast first.
+		 * If failed, leave phys_addr as 0.
+		 */
+		if ((current->mm != NULL) &&
+		    (__get_user_pages_fast(virt, 1, 0, &p) == 1))
+			phys_addr = page_to_phys(p) + virt % PAGE_SIZE;
+
+		if (p)
+			put_page(p);
+	}
+
+	return phys_addr;
+}
+
 void perf_prepare_sample(struct perf_event_header *header,
 			 struct perf_sample_data *data,
 			 struct perf_event *event,
@@ -6150,6 +6188,9 @@ void perf_prepare_sample(struct perf_event_header *header,
 
 		header->size += size;
 	}
+
+	if (sample_type & PERF_SAMPLE_PHYS_ADDR)
+		data->phys_addr = perf_virt_to_phys(data->addr);
 }
 
 static void __always_inline
@@ -9909,6 +9950,11 @@ SYSCALL_DEFINE5(perf_event_open,
 			return -EINVAL;
 	}
 
+	/* Only privileged users can get physical addresses */
+	if ((attr.sample_type & PERF_SAMPLE_PHYS_ADDR) &&
+	    perf_paranoid_kernel() && !capable(CAP_SYS_ADMIN))
+		return -EACCES;
+
 	if (!attr.sample_max_stack)
 		attr.sample_max_stack = sysctl_perf_event_max_stack;
 
-- 
cgit v1.2.3-71-gd317


From 2804fd3af6ba5ae5737705b27146455eabe2e2f8 Mon Sep 17 00:00:00 2001
From: Alexander Aring <aring@mojatatu.com>
Date: Mon, 28 Aug 2017 15:03:13 -0400
Subject: if_ether: add forces ife lfb type

This patch adds the forces IFE lfb type according to IEEE registered
ethertypes. See http://standards-oui.ieee.org/ethertype/eth.txt for more
information. Since there exists the IFE subsystem it can be used there.

This patch also use the correct word "ForCES" instead of "FoRCES" which
is a spelling error inside the IEEE ethertype specification.

Signed-off-by: Alexander Aring <aring@mojatatu.com>
Acked-by: Jamal Hadi Salim <jhs@mojatatu.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/uapi/linux/if_ether.h | 1 +
 1 file changed, 1 insertion(+)

(limited to 'include/uapi/linux')

diff --git a/include/uapi/linux/if_ether.h b/include/uapi/linux/if_ether.h
index efeb1190c2ca..f68f6bf4a253 100644
--- a/include/uapi/linux/if_ether.h
+++ b/include/uapi/linux/if_ether.h
@@ -104,6 +104,7 @@
 #define ETH_P_QINQ2	0x9200		/* deprecated QinQ VLAN [ NOT AN OFFICIALLY REGISTERED ID ] */
 #define ETH_P_QINQ3	0x9300		/* deprecated QinQ VLAN [ NOT AN OFFICIALLY REGISTERED ID ] */
 #define ETH_P_EDSA	0xDADA		/* Ethertype DSA [ NOT AN OFFICIALLY REGISTERED ID ] */
+#define ETH_P_IFE	0xED3E		/* ForCES inter-FE LFB type */
 #define ETH_P_AF_IUCV   0xFBFB		/* IBM af_iucv [ NOT AN OFFICIALLY REGISTERED ID ] */
 
 #define ETH_P_802_3_MIN	0x0600		/* If the value in the ethernet type is less than this value
-- 
cgit v1.2.3-71-gd317


From 155e6f649757c902901e599c268f8b575ddac1f8 Mon Sep 17 00:00:00 2001
From: Jiri Benc <jbenc@redhat.com>
Date: Mon, 28 Aug 2017 21:43:21 +0200
Subject: ether: add NSH ethertype

The NSH draft says:

   An IEEE EtherType, 0x894F, has been allocated for NSH.

Signed-off-by: Jiri Benc <jbenc@redhat.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/uapi/linux/if_ether.h | 1 +
 1 file changed, 1 insertion(+)

(limited to 'include/uapi/linux')

diff --git a/include/uapi/linux/if_ether.h b/include/uapi/linux/if_ether.h
index f68f6bf4a253..61f7ccce5b69 100644
--- a/include/uapi/linux/if_ether.h
+++ b/include/uapi/linux/if_ether.h
@@ -99,6 +99,7 @@
 #define ETH_P_FIP	0x8914		/* FCoE Initialization Protocol */
 #define ETH_P_80221	0x8917		/* IEEE 802.21 Media Independent Handover Protocol */
 #define ETH_P_HSR	0x892F		/* IEC 62439-3 HSRv1	*/
+#define ETH_P_NSH	0x894F		/* Network Service Header */
 #define ETH_P_LOOPBACK	0x9000		/* Ethernet loopback packet, per IEEE 802.3 */
 #define ETH_P_QINQ1	0x9100		/* deprecated QinQ VLAN [ NOT AN OFFICIALLY REGISTERED ID ] */
 #define ETH_P_QINQ2	0x9200		/* deprecated QinQ VLAN [ NOT AN OFFICIALLY REGISTERED ID ] */
-- 
cgit v1.2.3-71-gd317


From 31770e34e43d6c8dee129bfee77e56c34e61f0e5 Mon Sep 17 00:00:00 2001
From: Florian Westphal <fw@strlen.de>
Date: Wed, 30 Aug 2017 19:24:58 +0200
Subject: tcp: Revert "tcp: remove header prediction"

This reverts commit 45f119bf936b1f9f546a0b139c5b56f9bb2bdc78.

Eric Dumazet says:
  We found at Google a significant regression caused by
  45f119bf936b1f9f546a0b139c5b56f9bb2bdc78 tcp: remove header prediction

  In typical RPC  (TCP_RR), when a TCP socket receives data, we now call
  tcp_ack() while we used to not call it.

  This touches enough cache lines to cause a slowdown.

so problem does not seem to be HP removal itself but the tcp_ack()
call.  Therefore, it might be possible to remove HP after all, provided
one finds a way to elide tcp_ack for most cases.

Reported-by: Eric Dumazet <edumazet@google.com>
Signed-off-by: Florian Westphal <fw@strlen.de>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/tcp.h       |   6 ++
 include/net/tcp.h         |  23 ++++++
 include/uapi/linux/snmp.h |   2 +
 net/ipv4/proc.c           |   2 +
 net/ipv4/tcp.c            |   4 +-
 net/ipv4/tcp_input.c      | 188 ++++++++++++++++++++++++++++++++++++++++++++--
 net/ipv4/tcp_minisocks.c  |   2 +
 net/ipv4/tcp_output.c     |   2 +
 8 files changed, 223 insertions(+), 6 deletions(-)

(limited to 'include/uapi/linux')

diff --git a/include/linux/tcp.h b/include/linux/tcp.h
index 267164a1d559..4aa40ef02d32 100644
--- a/include/linux/tcp.h
+++ b/include/linux/tcp.h
@@ -147,6 +147,12 @@ struct tcp_sock {
 	u16	tcp_header_len;	/* Bytes of tcp header to send		*/
 	u16	gso_segs;	/* Max number of segs per GSO packet	*/
 
+/*
+ *	Header prediction flags
+ *	0x5?10 << 16 + snd_wnd in net byte order
+ */
+	__be32	pred_flags;
+
 /*
  *	RFC793 variables by their proper names. This means you can
  *	read the code and the spec side by side (and laugh ...)
diff --git a/include/net/tcp.h b/include/net/tcp.h
index c546d13ffbca..9c3db054e47f 100644
--- a/include/net/tcp.h
+++ b/include/net/tcp.h
@@ -634,6 +634,29 @@ static inline u32 __tcp_set_rto(const struct tcp_sock *tp)
 	return usecs_to_jiffies((tp->srtt_us >> 3) + tp->rttvar_us);
 }
 
+static inline void __tcp_fast_path_on(struct tcp_sock *tp, u32 snd_wnd)
+{
+	tp->pred_flags = htonl((tp->tcp_header_len << 26) |
+			       ntohl(TCP_FLAG_ACK) |
+			       snd_wnd);
+}
+
+static inline void tcp_fast_path_on(struct tcp_sock *tp)
+{
+	__tcp_fast_path_on(tp, tp->snd_wnd >> tp->rx_opt.snd_wscale);
+}
+
+static inline void tcp_fast_path_check(struct sock *sk)
+{
+	struct tcp_sock *tp = tcp_sk(sk);
+
+	if (RB_EMPTY_ROOT(&tp->out_of_order_queue) &&
+	    tp->rcv_wnd &&
+	    atomic_read(&sk->sk_rmem_alloc) < sk->sk_rcvbuf &&
+	    !tp->urg_data)
+		tcp_fast_path_on(tp);
+}
+
 /* Compute the actual rto_min value */
 static inline u32 tcp_rto_min(struct sock *sk)
 {
diff --git a/include/uapi/linux/snmp.h b/include/uapi/linux/snmp.h
index b3f346fb9fe3..758f12b58541 100644
--- a/include/uapi/linux/snmp.h
+++ b/include/uapi/linux/snmp.h
@@ -184,7 +184,9 @@ enum
 	LINUX_MIB_DELAYEDACKLOST,		/* DelayedACKLost */
 	LINUX_MIB_LISTENOVERFLOWS,		/* ListenOverflows */
 	LINUX_MIB_LISTENDROPS,			/* ListenDrops */
+	LINUX_MIB_TCPHPHITS,			/* TCPHPHits */
 	LINUX_MIB_TCPPUREACKS,			/* TCPPureAcks */
+	LINUX_MIB_TCPHPACKS,			/* TCPHPAcks */
 	LINUX_MIB_TCPRENORECOVERY,		/* TCPRenoRecovery */
 	LINUX_MIB_TCPSACKRECOVERY,		/* TCPSackRecovery */
 	LINUX_MIB_TCPSACKRENEGING,		/* TCPSACKReneging */
diff --git a/net/ipv4/proc.c b/net/ipv4/proc.c
index b6d3fe03feb3..127153f1ed8a 100644
--- a/net/ipv4/proc.c
+++ b/net/ipv4/proc.c
@@ -206,7 +206,9 @@ static const struct snmp_mib snmp4_net_list[] = {
 	SNMP_MIB_ITEM("DelayedACKLost", LINUX_MIB_DELAYEDACKLOST),
 	SNMP_MIB_ITEM("ListenOverflows", LINUX_MIB_LISTENOVERFLOWS),
 	SNMP_MIB_ITEM("ListenDrops", LINUX_MIB_LISTENDROPS),
+	SNMP_MIB_ITEM("TCPHPHits", LINUX_MIB_TCPHPHITS),
 	SNMP_MIB_ITEM("TCPPureAcks", LINUX_MIB_TCPPUREACKS),
+	SNMP_MIB_ITEM("TCPHPAcks", LINUX_MIB_TCPHPACKS),
 	SNMP_MIB_ITEM("TCPRenoRecovery", LINUX_MIB_TCPRENORECOVERY),
 	SNMP_MIB_ITEM("TCPSackRecovery", LINUX_MIB_TCPSACKRECOVERY),
 	SNMP_MIB_ITEM("TCPSACKReneging", LINUX_MIB_TCPSACKRENEGING),
diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c
index 566083ee2654..21ca2df274c5 100644
--- a/net/ipv4/tcp.c
+++ b/net/ipv4/tcp.c
@@ -1963,8 +1963,10 @@ int tcp_recvmsg(struct sock *sk, struct msghdr *msg, size_t len, int nonblock,
 		tcp_rcv_space_adjust(sk);
 
 skip_copy:
-		if (tp->urg_data && after(tp->copied_seq, tp->urg_seq))
+		if (tp->urg_data && after(tp->copied_seq, tp->urg_seq)) {
 			tp->urg_data = 0;
+			tcp_fast_path_check(sk);
+		}
 		if (used + offset < skb->len)
 			continue;
 
diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c
index a0e436366d31..c5d7656beeee 100644
--- a/net/ipv4/tcp_input.c
+++ b/net/ipv4/tcp_input.c
@@ -103,6 +103,7 @@ int sysctl_tcp_invalid_ratelimit __read_mostly = HZ/2;
 #define FLAG_DATA_SACKED	0x20 /* New SACK.				*/
 #define FLAG_ECE		0x40 /* ECE in this ACK				*/
 #define FLAG_LOST_RETRANS	0x80 /* This ACK marks some retransmission lost */
+#define FLAG_SLOWPATH		0x100 /* Do not skip RFC checks for window update.*/
 #define FLAG_ORIG_SACK_ACKED	0x200 /* Never retransmitted data are (s)acked	*/
 #define FLAG_SND_UNA_ADVANCED	0x400 /* Snd_una was changed (!= FLAG_DATA_ACKED) */
 #define FLAG_DSACKING_ACK	0x800 /* SACK blocks contained D-SACK info */
@@ -3371,6 +3372,12 @@ static int tcp_ack_update_window(struct sock *sk, const struct sk_buff *skb, u32
 		if (tp->snd_wnd != nwin) {
 			tp->snd_wnd = nwin;
 
+			/* Note, it is the only place, where
+			 * fast path is recovered for sending TCP.
+			 */
+			tp->pred_flags = 0;
+			tcp_fast_path_check(sk);
+
 			if (tcp_send_head(sk))
 				tcp_slow_start_after_idle_check(sk);
 
@@ -3592,7 +3599,19 @@ static int tcp_ack(struct sock *sk, const struct sk_buff *skb, int flag)
 	if (flag & FLAG_UPDATE_TS_RECENT)
 		tcp_replace_ts_recent(tp, TCP_SKB_CB(skb)->seq);
 
-	{
+	if (!(flag & FLAG_SLOWPATH) && after(ack, prior_snd_una)) {
+		/* Window is constant, pure forward advance.
+		 * No more checks are required.
+		 * Note, we use the fact that SND.UNA>=SND.WL2.
+		 */
+		tcp_update_wl(tp, ack_seq);
+		tcp_snd_una_update(tp, ack);
+		flag |= FLAG_WIN_UPDATE;
+
+		tcp_in_ack_event(sk, CA_ACK_WIN_UPDATE);
+
+		NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPHPACKS);
+	} else {
 		u32 ack_ev_flags = CA_ACK_SLOWPATH;
 
 		if (ack_seq != TCP_SKB_CB(skb)->end_seq)
@@ -4407,6 +4426,8 @@ static void tcp_data_queue_ofo(struct sock *sk, struct sk_buff *skb)
 	if (TCP_SKB_CB(skb)->has_rxtstamp)
 		TCP_SKB_CB(skb)->swtstamp = skb->tstamp;
 
+	/* Disable header prediction. */
+	tp->pred_flags = 0;
 	inet_csk_schedule_ack(sk);
 
 	NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPOFOQUEUE);
@@ -4647,6 +4668,8 @@ queue_and_out:
 		if (tp->rx_opt.num_sacks)
 			tcp_sack_remove(tp);
 
+		tcp_fast_path_check(sk);
+
 		if (eaten > 0)
 			kfree_skb_partial(skb, fragstolen);
 		if (!sock_flag(sk, SOCK_DEAD))
@@ -4972,6 +4995,7 @@ static int tcp_prune_queue(struct sock *sk)
 	NET_INC_STATS(sock_net(sk), LINUX_MIB_RCVPRUNED);
 
 	/* Massive buffer overcommit. */
+	tp->pred_flags = 0;
 	return -1;
 }
 
@@ -5143,6 +5167,9 @@ static void tcp_check_urg(struct sock *sk, const struct tcphdr *th)
 
 	tp->urg_data = TCP_URG_NOTYET;
 	tp->urg_seq = ptr;
+
+	/* Disable header prediction. */
+	tp->pred_flags = 0;
 }
 
 /* This is the 'fast' part of urgent handling. */
@@ -5301,6 +5328,26 @@ discard:
 
 /*
  *	TCP receive function for the ESTABLISHED state.
+ *
+ *	It is split into a fast path and a slow path. The fast path is
+ * 	disabled when:
+ *	- A zero window was announced from us - zero window probing
+ *        is only handled properly in the slow path.
+ *	- Out of order segments arrived.
+ *	- Urgent data is expected.
+ *	- There is no buffer space left
+ *	- Unexpected TCP flags/window values/header lengths are received
+ *	  (detected by checking the TCP header against pred_flags)
+ *	- Data is sent in both directions. Fast path only supports pure senders
+ *	  or pure receivers (this means either the sequence number or the ack
+ *	  value must stay constant)
+ *	- Unexpected TCP option.
+ *
+ *	When these conditions are not satisfied it drops into a standard
+ *	receive procedure patterned after RFC793 to handle all cases.
+ *	The first three cases are guaranteed by proper pred_flags setting,
+ *	the rest is checked inline. Fast processing is turned on in
+ *	tcp_data_queue when everything is OK.
  */
 void tcp_rcv_established(struct sock *sk, struct sk_buff *skb,
 			 const struct tcphdr *th)
@@ -5311,19 +5358,144 @@ void tcp_rcv_established(struct sock *sk, struct sk_buff *skb,
 	tcp_mstamp_refresh(tp);
 	if (unlikely(!sk->sk_rx_dst))
 		inet_csk(sk)->icsk_af_ops->sk_rx_dst_set(sk, skb);
+	/*
+	 *	Header prediction.
+	 *	The code loosely follows the one in the famous
+	 *	"30 instruction TCP receive" Van Jacobson mail.
+	 *
+	 *	Van's trick is to deposit buffers into socket queue
+	 *	on a device interrupt, to call tcp_recv function
+	 *	on the receive process context and checksum and copy
+	 *	the buffer to user space. smart...
+	 *
+	 *	Our current scheme is not silly either but we take the
+	 *	extra cost of the net_bh soft interrupt processing...
+	 *	We do checksum and copy also but from device to kernel.
+	 */
 
 	tp->rx_opt.saw_tstamp = 0;
 
+	/*	pred_flags is 0xS?10 << 16 + snd_wnd
+	 *	if header_prediction is to be made
+	 *	'S' will always be tp->tcp_header_len >> 2
+	 *	'?' will be 0 for the fast path, otherwise pred_flags is 0 to
+	 *  turn it off	(when there are holes in the receive
+	 *	 space for instance)
+	 *	PSH flag is ignored.
+	 */
+
+	if ((tcp_flag_word(th) & TCP_HP_BITS) == tp->pred_flags &&
+	    TCP_SKB_CB(skb)->seq == tp->rcv_nxt &&
+	    !after(TCP_SKB_CB(skb)->ack_seq, tp->snd_nxt)) {
+		int tcp_header_len = tp->tcp_header_len;
+
+		/* Timestamp header prediction: tcp_header_len
+		 * is automatically equal to th->doff*4 due to pred_flags
+		 * match.
+		 */
+
+		/* Check timestamp */
+		if (tcp_header_len == sizeof(struct tcphdr) + TCPOLEN_TSTAMP_ALIGNED) {
+			/* No? Slow path! */
+			if (!tcp_parse_aligned_timestamp(tp, th))
+				goto slow_path;
+
+			/* If PAWS failed, check it more carefully in slow path */
+			if ((s32)(tp->rx_opt.rcv_tsval - tp->rx_opt.ts_recent) < 0)
+				goto slow_path;
+
+			/* DO NOT update ts_recent here, if checksum fails
+			 * and timestamp was corrupted part, it will result
+			 * in a hung connection since we will drop all
+			 * future packets due to the PAWS test.
+			 */
+		}
+
+		if (len <= tcp_header_len) {
+			/* Bulk data transfer: sender */
+			if (len == tcp_header_len) {
+				/* Predicted packet is in window by definition.
+				 * seq == rcv_nxt and rcv_wup <= rcv_nxt.
+				 * Hence, check seq<=rcv_wup reduces to:
+				 */
+				if (tcp_header_len ==
+				    (sizeof(struct tcphdr) + TCPOLEN_TSTAMP_ALIGNED) &&
+				    tp->rcv_nxt == tp->rcv_wup)
+					tcp_store_ts_recent(tp);
+
+				/* We know that such packets are checksummed
+				 * on entry.
+				 */
+				tcp_ack(sk, skb, 0);
+				__kfree_skb(skb);
+				tcp_data_snd_check(sk);
+				return;
+			} else { /* Header too small */
+				TCP_INC_STATS(sock_net(sk), TCP_MIB_INERRS);
+				goto discard;
+			}
+		} else {
+			int eaten = 0;
+			bool fragstolen = false;
+
+			if (tcp_checksum_complete(skb))
+				goto csum_error;
+
+			if ((int)skb->truesize > sk->sk_forward_alloc)
+				goto step5;
+
+			/* Predicted packet is in window by definition.
+			 * seq == rcv_nxt and rcv_wup <= rcv_nxt.
+			 * Hence, check seq<=rcv_wup reduces to:
+			 */
+			if (tcp_header_len ==
+			    (sizeof(struct tcphdr) + TCPOLEN_TSTAMP_ALIGNED) &&
+			    tp->rcv_nxt == tp->rcv_wup)
+				tcp_store_ts_recent(tp);
+
+			tcp_rcv_rtt_measure_ts(sk, skb);
+
+			NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPHPHITS);
+
+			/* Bulk data transfer: receiver */
+			eaten = tcp_queue_rcv(sk, skb, tcp_header_len,
+					      &fragstolen);
+
+			tcp_event_data_recv(sk, skb);
+
+			if (TCP_SKB_CB(skb)->ack_seq != tp->snd_una) {
+				/* Well, only one small jumplet in fast path... */
+				tcp_ack(sk, skb, FLAG_DATA);
+				tcp_data_snd_check(sk);
+				if (!inet_csk_ack_scheduled(sk))
+					goto no_ack;
+			}
+
+			__tcp_ack_snd_check(sk, 0);
+no_ack:
+			if (eaten)
+				kfree_skb_partial(skb, fragstolen);
+			sk->sk_data_ready(sk);
+			return;
+		}
+	}
+
+slow_path:
 	if (len < (th->doff << 2) || tcp_checksum_complete(skb))
 		goto csum_error;
 
 	if (!th->ack && !th->rst && !th->syn)
 		goto discard;
 
+	/*
+	 *	Standard slow path.
+	 */
+
 	if (!tcp_validate_incoming(sk, skb, th, 1))
 		return;
 
-	if (tcp_ack(sk, skb, FLAG_UPDATE_TS_RECENT) < 0)
+step5:
+	if (tcp_ack(sk, skb, FLAG_SLOWPATH | FLAG_UPDATE_TS_RECENT) < 0)
 		goto discard;
 
 	tcp_rcv_rtt_measure_ts(sk, skb);
@@ -5376,6 +5548,11 @@ void tcp_finish_connect(struct sock *sk, struct sk_buff *skb)
 
 	if (sock_flag(sk, SOCK_KEEPOPEN))
 		inet_csk_reset_keepalive_timer(sk, keepalive_time_when(tp));
+
+	if (!tp->rx_opt.snd_wscale)
+		__tcp_fast_path_on(tp, tp->snd_wnd);
+	else
+		tp->pred_flags = 0;
 }
 
 static bool tcp_rcv_fastopen_synack(struct sock *sk, struct sk_buff *synack,
@@ -5504,7 +5681,7 @@ static int tcp_rcv_synsent_state_process(struct sock *sk, struct sk_buff *skb,
 		tcp_ecn_rcv_synack(tp, th);
 
 		tcp_init_wl(tp, TCP_SKB_CB(skb)->seq);
-		tcp_ack(sk, skb, 0);
+		tcp_ack(sk, skb, FLAG_SLOWPATH);
 
 		/* Ok.. it's good. Set up sequence numbers and
 		 * move to established.
@@ -5740,8 +5917,8 @@ int tcp_rcv_state_process(struct sock *sk, struct sk_buff *skb)
 		return 0;
 
 	/* step 5: check the ACK field */
-
-	acceptable = tcp_ack(sk, skb, FLAG_UPDATE_TS_RECENT |
+	acceptable = tcp_ack(sk, skb, FLAG_SLOWPATH |
+				      FLAG_UPDATE_TS_RECENT |
 				      FLAG_NO_CHALLENGE_ACK) > 0;
 
 	if (!acceptable) {
@@ -5809,6 +5986,7 @@ int tcp_rcv_state_process(struct sock *sk, struct sk_buff *skb)
 		tp->lsndtime = tcp_jiffies32;
 
 		tcp_initialize_rcv_mss(sk);
+		tcp_fast_path_on(tp);
 		break;
 
 	case TCP_FIN_WAIT1: {
diff --git a/net/ipv4/tcp_minisocks.c b/net/ipv4/tcp_minisocks.c
index 1537b87c657f..188a6f31356d 100644
--- a/net/ipv4/tcp_minisocks.c
+++ b/net/ipv4/tcp_minisocks.c
@@ -436,6 +436,8 @@ struct sock *tcp_create_openreq_child(const struct sock *sk,
 		struct tcp_sock *newtp = tcp_sk(newsk);
 
 		/* Now setup tcp_sock */
+		newtp->pred_flags = 0;
+
 		newtp->rcv_wup = newtp->copied_seq =
 		newtp->rcv_nxt = treq->rcv_isn + 1;
 		newtp->segs_in = 1;
diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c
index 3e0d19631534..5b6690d05abb 100644
--- a/net/ipv4/tcp_output.c
+++ b/net/ipv4/tcp_output.c
@@ -295,7 +295,9 @@ static u16 tcp_select_window(struct sock *sk)
 	/* RFC1323 scaling applied */
 	new_win >>= tp->rx_opt.rcv_wscale;
 
+	/* If we advertise zero window, disable fast path. */
 	if (new_win == 0) {
+		tp->pred_flags = 0;
 		if (old_win)
 			NET_INC_STATS(sock_net(sk),
 				      LINUX_MIB_TCPTOZEROWINDOWADV);
-- 
cgit v1.2.3-71-gd317


From 7373ae7e8f0bf2c0718422481da986db5058b005 Mon Sep 17 00:00:00 2001
From: Subash Abhinov Kasiviswanathan <subashab@codeaurora.org>
Date: Tue, 29 Aug 2017 22:44:16 -0600
Subject: net: ether: Add support for multiplexing and aggregation type

Define the Qualcomm multiplexing and aggregation (MAP) ether type 0x00F9.
This is needed for receiving data in the MAP protocol like RMNET. This is
not an officially registered ID.

Signed-off-by: Subash Abhinov Kasiviswanathan <subashab@codeaurora.org>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/uapi/linux/if_ether.h | 3 +++
 1 file changed, 3 insertions(+)

(limited to 'include/uapi/linux')

diff --git a/include/uapi/linux/if_ether.h b/include/uapi/linux/if_ether.h
index 61f7ccce5b69..9037065e23d0 100644
--- a/include/uapi/linux/if_ether.h
+++ b/include/uapi/linux/if_ether.h
@@ -140,6 +140,9 @@
 #define ETH_P_IEEE802154 0x00F6		/* IEEE802.15.4 frame		*/
 #define ETH_P_CAIF	0x00F7		/* ST-Ericsson CAIF protocol	*/
 #define ETH_P_XDSA	0x00F8		/* Multiplexed DSA protocol	*/
+#define ETH_P_MAP	0x00F9		/* Qualcomm multiplexing and
+					 * aggregation protocol
+					 */
 
 /*
  *	This is an Ethernet frame header.
-- 
cgit v1.2.3-71-gd317


From cdf4969c42a6c1a376dd03a9e846cf638d3cd4b1 Mon Sep 17 00:00:00 2001
From: Subash Abhinov Kasiviswanathan <subashab@codeaurora.org>
Date: Tue, 29 Aug 2017 22:44:17 -0600
Subject: net: arp: Add support for raw IP device

Define the raw IP type. This is needed for raw IP net devices
like rmnet.

Signed-off-by: Subash Abhinov Kasiviswanathan <subashab@codeaurora.org>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/uapi/linux/if_arp.h | 1 +
 1 file changed, 1 insertion(+)

(limited to 'include/uapi/linux')

diff --git a/include/uapi/linux/if_arp.h b/include/uapi/linux/if_arp.h
index cf73510b9238..a2a635620600 100644
--- a/include/uapi/linux/if_arp.h
+++ b/include/uapi/linux/if_arp.h
@@ -59,6 +59,7 @@
 #define ARPHRD_LAPB	516		/* LAPB				*/
 #define ARPHRD_DDCMP    517		/* Digital's DDCMP protocol     */
 #define ARPHRD_RAWHDLC	518		/* Raw HDLC			*/
+#define ARPHRD_RAWIP    519		/* Raw IP                       */
 
 #define ARPHRD_TUNNEL	768		/* IPIP tunnel			*/
 #define ARPHRD_TUNNEL6	769		/* IP6IP6 tunnel       		*/
-- 
cgit v1.2.3-71-gd317


From e3bfed1df379c18f20feb06427d952b766e2c00f Mon Sep 17 00:00:00 2001
From: Paul Mackerras <paulus@ozlabs.org>
Date: Fri, 25 Aug 2017 19:53:39 +1000
Subject: KVM: PPC: Book3S HV: Report storage key support to userspace

This adds information about storage keys to the struct returned by
the KVM_PPC_GET_SMMU_INFO ioctl.  The new fields replace a pad field,
which was zeroed by previous kernel versions.  Thus userspace that
knows about the new fields will see zeroes when running on an older
kernel, indicating that storage keys are not supported.  The size of
the structure has not changed.

The number of keys is hard-coded for the CPUs supported by HV KVM,
which is just POWER7, POWER8 and POWER9.

Signed-off-by: Paul Mackerras <paulus@ozlabs.org>
Reviewed-by: David Gibson <david@gibson.dropbear.id.au>
Signed-off-by: Paul Mackerras <paulus@ozlabs.org>
---
 arch/powerpc/kvm/book3s_hv.c | 8 ++++++++
 include/uapi/linux/kvm.h     | 3 ++-
 2 files changed, 10 insertions(+), 1 deletion(-)

(limited to 'include/uapi/linux')

diff --git a/arch/powerpc/kvm/book3s_hv.c b/arch/powerpc/kvm/book3s_hv.c
index 1182cfd79857..f62ad2e9085f 100644
--- a/arch/powerpc/kvm/book3s_hv.c
+++ b/arch/powerpc/kvm/book3s_hv.c
@@ -3331,6 +3331,14 @@ static int kvm_vm_ioctl_get_smmu_info_hv(struct kvm *kvm,
 	if (radix_enabled())
 		return -EINVAL;
 
+	/*
+	 * POWER7, POWER8 and POWER9 all support 32 storage keys for data.
+	 * POWER7 doesn't support keys for instruction accesses,
+	 * POWER8 and POWER9 do.
+	 */
+	info->data_keys = 32;
+	info->instr_keys = cpu_has_feature(CPU_FTR_ARCH_207S) ? 32 : 0;
+
 	info->flags = KVM_PPC_PAGE_SIZES_REAL;
 	if (mmu_has_feature(MMU_FTR_1T_SEGMENT))
 		info->flags |= KVM_PPC_1T_SEGMENTS;
diff --git a/include/uapi/linux/kvm.h b/include/uapi/linux/kvm.h
index 6cd63c18708a..838887587411 100644
--- a/include/uapi/linux/kvm.h
+++ b/include/uapi/linux/kvm.h
@@ -711,7 +711,8 @@ struct kvm_ppc_one_seg_page_size {
 struct kvm_ppc_smmu_info {
 	__u64 flags;
 	__u32 slb_size;
-	__u32 pad;
+	__u16 data_keys;	/* # storage keys supported for data */
+	__u16 instr_keys;	/* # storage keys supported for instructions */
 	struct kvm_ppc_one_seg_page_size sps[KVM_PPC_PAGE_SIZES_MAX_SZ];
 };
 
-- 
cgit v1.2.3-71-gd317


From 8fc614c0ae5cb5df11d6aa9559e63baacf20a840 Mon Sep 17 00:00:00 2001
From: Bjorn Helgaas <bhelgaas@google.com>
Date: Thu, 31 Aug 2017 14:12:39 -0500
Subject: PCI/AER: Reformat AER register definitions

Reformat so comments fit on same line as definition.  No functional change
intended.

Signed-off-by: Bjorn Helgaas <bhelgaas@google.com>
---
 include/uapi/linux/pci_regs.h | 26 ++++++++++----------------
 1 file changed, 10 insertions(+), 16 deletions(-)

(limited to 'include/uapi/linux')

diff --git a/include/uapi/linux/pci_regs.h b/include/uapi/linux/pci_regs.h
index c22d3ebaca20..46632aaee1c0 100644
--- a/include/uapi/linux/pci_regs.h
+++ b/include/uapi/linux/pci_regs.h
@@ -733,23 +733,17 @@
 #define  PCI_ERR_CAP_ECRC_CHKE	0x00000100	/* ECRC Check Enable */
 #define PCI_ERR_HEADER_LOG	28	/* Header Log Register (16 bytes) */
 #define PCI_ERR_ROOT_COMMAND	44	/* Root Error Command */
-/* Correctable Err Reporting Enable */
-#define PCI_ERR_ROOT_CMD_COR_EN		0x00000001
-/* Non-fatal Err Reporting Enable */
-#define PCI_ERR_ROOT_CMD_NONFATAL_EN	0x00000002
-/* Fatal Err Reporting Enable */
-#define PCI_ERR_ROOT_CMD_FATAL_EN	0x00000004
+#define PCI_ERR_ROOT_CMD_COR_EN		0x00000001 /* Correctable Err Reporting Enable */
+#define PCI_ERR_ROOT_CMD_NONFATAL_EN	0x00000002 /* Non-Fatal Err Reporting Enable */
+#define PCI_ERR_ROOT_CMD_FATAL_EN	0x00000004 /* Fatal Err Reporting Enable */
 #define PCI_ERR_ROOT_STATUS	48
-#define PCI_ERR_ROOT_COR_RCV		0x00000001	/* ERR_COR Received */
-/* Multi ERR_COR Received */
-#define PCI_ERR_ROOT_MULTI_COR_RCV	0x00000002
-/* ERR_FATAL/NONFATAL Received */
-#define PCI_ERR_ROOT_UNCOR_RCV		0x00000004
-/* Multi ERR_FATAL/NONFATAL Received */
-#define PCI_ERR_ROOT_MULTI_UNCOR_RCV	0x00000008
-#define PCI_ERR_ROOT_FIRST_FATAL	0x00000010	/* First Fatal */
-#define PCI_ERR_ROOT_NONFATAL_RCV	0x00000020	/* Non-Fatal Received */
-#define PCI_ERR_ROOT_FATAL_RCV		0x00000040	/* Fatal Received */
+#define PCI_ERR_ROOT_COR_RCV		0x00000001 /* ERR_COR Received */
+#define PCI_ERR_ROOT_MULTI_COR_RCV	0x00000002 /* Multiple ERR_COR */
+#define PCI_ERR_ROOT_UNCOR_RCV		0x00000004 /* ERR_FATAL/NONFATAL */
+#define PCI_ERR_ROOT_MULTI_UNCOR_RCV	0x00000008 /* Multiple FATAL/NONFATAL */
+#define PCI_ERR_ROOT_FIRST_FATAL	0x00000010 /* First UNC is Fatal */
+#define PCI_ERR_ROOT_NONFATAL_RCV	0x00000020 /* Non-Fatal Received */
+#define PCI_ERR_ROOT_FATAL_RCV		0x00000040 /* Fatal Received */
 #define PCI_ERR_ROOT_ERR_SRC	52	/* Error Source Identification */
 
 /* Virtual Channel */
-- 
cgit v1.2.3-71-gd317


From 89e4fdecb51cf5535867026274bc97de9480ade5 Mon Sep 17 00:00:00 2001
From: Omar Sandoval <osandov@fb.com>
Date: Thu, 24 Aug 2017 00:03:43 -0700
Subject: loop: add ioctl for changing logical block size

This is a different approach from the first attempt in f2c6df7dbf9a
("loop: support 4k physical blocksize"). Rather than extending
LOOP_{GET,SET}_STATUS, add a separate ioctl just for setting the block
size.

Reviewed-by: Ming Lei <ming.lei@redhat.com>
Reviewed-by: Hannes Reinecke <hare@suse.com>
Signed-off-by: Omar Sandoval <osandov@fb.com>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 drivers/block/loop.c      | 24 ++++++++++++++++++++++++
 include/uapi/linux/loop.h |  1 +
 2 files changed, 25 insertions(+)

(limited to 'include/uapi/linux')

diff --git a/drivers/block/loop.c b/drivers/block/loop.c
index e3f190016d4f..ac106b287d75 100644
--- a/drivers/block/loop.c
+++ b/drivers/block/loop.c
@@ -1047,6 +1047,7 @@ static int loop_clr_fd(struct loop_device *lo)
 	memset(lo->lo_encrypt_key, 0, LO_KEY_SIZE);
 	memset(lo->lo_crypt_name, 0, LO_NAME_SIZE);
 	memset(lo->lo_file_name, 0, LO_NAME_SIZE);
+	blk_queue_logical_block_size(lo->lo_queue, 512);
 	if (bdev) {
 		bdput(bdev);
 		invalidate_bdev(bdev);
@@ -1330,6 +1331,24 @@ static int loop_set_dio(struct loop_device *lo, unsigned long arg)
 	return error;
 }
 
+static int loop_set_block_size(struct loop_device *lo, unsigned long arg)
+{
+	if (lo->lo_state != Lo_bound)
+		return -ENXIO;
+
+	if (arg < 512 || arg > PAGE_SIZE || !is_power_of_2(arg))
+		return -EINVAL;
+
+	blk_mq_freeze_queue(lo->lo_queue);
+
+	blk_queue_logical_block_size(lo->lo_queue, arg);
+	loop_update_dio(lo);
+
+	blk_mq_unfreeze_queue(lo->lo_queue);
+
+	return 0;
+}
+
 static int lo_ioctl(struct block_device *bdev, fmode_t mode,
 	unsigned int cmd, unsigned long arg)
 {
@@ -1378,6 +1397,11 @@ static int lo_ioctl(struct block_device *bdev, fmode_t mode,
 		if ((mode & FMODE_WRITE) || capable(CAP_SYS_ADMIN))
 			err = loop_set_dio(lo, arg);
 		break;
+	case LOOP_SET_BLOCK_SIZE:
+		err = -EPERM;
+		if ((mode & FMODE_WRITE) || capable(CAP_SYS_ADMIN))
+			err = loop_set_block_size(lo, arg);
+		break;
 	default:
 		err = lo->ioctl ? lo->ioctl(lo, cmd, arg) : -EINVAL;
 	}
diff --git a/include/uapi/linux/loop.h b/include/uapi/linux/loop.h
index c8125ec1f4f2..23158dbe2424 100644
--- a/include/uapi/linux/loop.h
+++ b/include/uapi/linux/loop.h
@@ -88,6 +88,7 @@ struct loop_info64 {
 #define LOOP_CHANGE_FD		0x4C06
 #define LOOP_SET_CAPACITY	0x4C07
 #define LOOP_SET_DIRECT_IO	0x4C08
+#define LOOP_SET_BLOCK_SIZE	0x4C09
 
 /* /dev/loop-control interface */
 #define LOOP_CTL_ADD		0x4C80
-- 
cgit v1.2.3-71-gd317


From ddef7ed2b5cbafae692d1d580bb5a07808926a9c Mon Sep 17 00:00:00 2001
From: Christoph Hellwig <hch@lst.de>
Date: Thu, 6 Jul 2017 18:58:37 +0200
Subject: annotate RWF_... flags

[AV: added missing annotations in syscalls.h/compat.h]

Signed-off-by: Christoph Hellwig <hch@lst.de>
Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 fs/nfsd/vfs.c                |  2 +-
 fs/read_write.c              | 50 ++++++++++++++++++++++----------------------
 include/linux/compat.h       | 16 ++++++++++++--
 include/linux/fs.h           | 12 ++++++-----
 include/linux/syscalls.h     |  4 ++--
 include/uapi/linux/aio_abi.h | 21 ++++++++++---------
 include/uapi/linux/fs.h      | 28 ++++++++++++++++++-------
 7 files changed, 80 insertions(+), 53 deletions(-)

(limited to 'include/uapi/linux')

diff --git a/fs/nfsd/vfs.c b/fs/nfsd/vfs.c
index 38d0383dc7f9..bc69d40c4e8b 100644
--- a/fs/nfsd/vfs.c
+++ b/fs/nfsd/vfs.c
@@ -969,7 +969,7 @@ nfsd_vfs_write(struct svc_rqst *rqstp, struct svc_fh *fhp, struct file *file,
 	int			use_wgather;
 	loff_t			pos = offset;
 	unsigned int		pflags = current->flags;
-	int			flags = 0;
+	rwf_t			flags = 0;
 
 	if (test_bit(RQ_LOCAL, &rqstp->rq_flags))
 		/*
diff --git a/fs/read_write.c b/fs/read_write.c
index 0cc7033aa413..61b58c7b6531 100644
--- a/fs/read_write.c
+++ b/fs/read_write.c
@@ -33,7 +33,7 @@ const struct file_operations generic_ro_fops = {
 
 EXPORT_SYMBOL(generic_ro_fops);
 
-static inline int unsigned_offsets(struct file *file)
+static inline bool unsigned_offsets(struct file *file)
 {
 	return file->f_mode & FMODE_UNSIGNED_OFFSET;
 }
@@ -633,7 +633,7 @@ unsigned long iov_shorten(struct iovec *iov, unsigned long nr_segs, size_t to)
 EXPORT_SYMBOL(iov_shorten);
 
 static ssize_t do_iter_readv_writev(struct file *filp, struct iov_iter *iter,
-		loff_t *ppos, int type, int flags)
+		loff_t *ppos, int type, rwf_t flags)
 {
 	struct kiocb kiocb;
 	ssize_t ret;
@@ -655,7 +655,7 @@ static ssize_t do_iter_readv_writev(struct file *filp, struct iov_iter *iter,
 
 /* Do it by hand, with file-ops */
 static ssize_t do_loop_readv_writev(struct file *filp, struct iov_iter *iter,
-		loff_t *ppos, int type, int flags)
+		loff_t *ppos, int type, rwf_t flags)
 {
 	ssize_t ret = 0;
 
@@ -871,7 +871,7 @@ out:
 #endif
 
 static ssize_t do_iter_read(struct file *file, struct iov_iter *iter,
-		loff_t *pos, int flags)
+		loff_t *pos, rwf_t flags)
 {
 	size_t tot_len;
 	ssize_t ret = 0;
@@ -899,7 +899,7 @@ out:
 }
 
 ssize_t vfs_iter_read(struct file *file, struct iov_iter *iter, loff_t *ppos,
-		int flags)
+		rwf_t flags)
 {
 	if (!file->f_op->read_iter)
 		return -EINVAL;
@@ -908,7 +908,7 @@ ssize_t vfs_iter_read(struct file *file, struct iov_iter *iter, loff_t *ppos,
 EXPORT_SYMBOL(vfs_iter_read);
 
 static ssize_t do_iter_write(struct file *file, struct iov_iter *iter,
-		loff_t *pos, int flags)
+		loff_t *pos, rwf_t flags)
 {
 	size_t tot_len;
 	ssize_t ret = 0;
@@ -935,7 +935,7 @@ static ssize_t do_iter_write(struct file *file, struct iov_iter *iter,
 }
 
 ssize_t vfs_iter_write(struct file *file, struct iov_iter *iter, loff_t *ppos,
-		int flags)
+		rwf_t flags)
 {
 	if (!file->f_op->write_iter)
 		return -EINVAL;
@@ -944,7 +944,7 @@ ssize_t vfs_iter_write(struct file *file, struct iov_iter *iter, loff_t *ppos,
 EXPORT_SYMBOL(vfs_iter_write);
 
 ssize_t vfs_readv(struct file *file, const struct iovec __user *vec,
-		  unsigned long vlen, loff_t *pos, int flags)
+		  unsigned long vlen, loff_t *pos, rwf_t flags)
 {
 	struct iovec iovstack[UIO_FASTIOV];
 	struct iovec *iov = iovstack;
@@ -962,7 +962,7 @@ ssize_t vfs_readv(struct file *file, const struct iovec __user *vec,
 EXPORT_SYMBOL(vfs_readv);
 
 ssize_t vfs_writev(struct file *file, const struct iovec __user *vec,
-		   unsigned long vlen, loff_t *pos, int flags)
+		   unsigned long vlen, loff_t *pos, rwf_t flags)
 {
 	struct iovec iovstack[UIO_FASTIOV];
 	struct iovec *iov = iovstack;
@@ -981,7 +981,7 @@ ssize_t vfs_writev(struct file *file, const struct iovec __user *vec,
 EXPORT_SYMBOL(vfs_writev);
 
 static ssize_t do_readv(unsigned long fd, const struct iovec __user *vec,
-			unsigned long vlen, int flags)
+			unsigned long vlen, rwf_t flags)
 {
 	struct fd f = fdget_pos(fd);
 	ssize_t ret = -EBADF;
@@ -1001,7 +1001,7 @@ static ssize_t do_readv(unsigned long fd, const struct iovec __user *vec,
 }
 
 static ssize_t do_writev(unsigned long fd, const struct iovec __user *vec,
-			 unsigned long vlen, int flags)
+			 unsigned long vlen, rwf_t flags)
 {
 	struct fd f = fdget_pos(fd);
 	ssize_t ret = -EBADF;
@@ -1027,7 +1027,7 @@ static inline loff_t pos_from_hilo(unsigned long high, unsigned long low)
 }
 
 static ssize_t do_preadv(unsigned long fd, const struct iovec __user *vec,
-			 unsigned long vlen, loff_t pos, int flags)
+			 unsigned long vlen, loff_t pos, rwf_t flags)
 {
 	struct fd f;
 	ssize_t ret = -EBADF;
@@ -1050,7 +1050,7 @@ static ssize_t do_preadv(unsigned long fd, const struct iovec __user *vec,
 }
 
 static ssize_t do_pwritev(unsigned long fd, const struct iovec __user *vec,
-			  unsigned long vlen, loff_t pos, int flags)
+			  unsigned long vlen, loff_t pos, rwf_t flags)
 {
 	struct fd f;
 	ssize_t ret = -EBADF;
@@ -1094,7 +1094,7 @@ SYSCALL_DEFINE5(preadv, unsigned long, fd, const struct iovec __user *, vec,
 
 SYSCALL_DEFINE6(preadv2, unsigned long, fd, const struct iovec __user *, vec,
 		unsigned long, vlen, unsigned long, pos_l, unsigned long, pos_h,
-		int, flags)
+		rwf_t, flags)
 {
 	loff_t pos = pos_from_hilo(pos_h, pos_l);
 
@@ -1114,7 +1114,7 @@ SYSCALL_DEFINE5(pwritev, unsigned long, fd, const struct iovec __user *, vec,
 
 SYSCALL_DEFINE6(pwritev2, unsigned long, fd, const struct iovec __user *, vec,
 		unsigned long, vlen, unsigned long, pos_l, unsigned long, pos_h,
-		int, flags)
+		rwf_t, flags)
 {
 	loff_t pos = pos_from_hilo(pos_h, pos_l);
 
@@ -1127,7 +1127,7 @@ SYSCALL_DEFINE6(pwritev2, unsigned long, fd, const struct iovec __user *, vec,
 #ifdef CONFIG_COMPAT
 static size_t compat_readv(struct file *file,
 			   const struct compat_iovec __user *vec,
-			   unsigned long vlen, loff_t *pos, int flags)
+			   unsigned long vlen, loff_t *pos, rwf_t flags)
 {
 	struct iovec iovstack[UIO_FASTIOV];
 	struct iovec *iov = iovstack;
@@ -1147,7 +1147,7 @@ static size_t compat_readv(struct file *file,
 
 static size_t do_compat_readv(compat_ulong_t fd,
 				 const struct compat_iovec __user *vec,
-				 compat_ulong_t vlen, int flags)
+				 compat_ulong_t vlen, rwf_t flags)
 {
 	struct fd f = fdget_pos(fd);
 	ssize_t ret;
@@ -1173,7 +1173,7 @@ COMPAT_SYSCALL_DEFINE3(readv, compat_ulong_t, fd,
 
 static long do_compat_preadv64(unsigned long fd,
 				  const struct compat_iovec __user *vec,
-				  unsigned long vlen, loff_t pos, int flags)
+				  unsigned long vlen, loff_t pos, rwf_t flags)
 {
 	struct fd f;
 	ssize_t ret;
@@ -1211,7 +1211,7 @@ COMPAT_SYSCALL_DEFINE5(preadv, compat_ulong_t, fd,
 #ifdef __ARCH_WANT_COMPAT_SYS_PREADV64V2
 COMPAT_SYSCALL_DEFINE5(preadv64v2, unsigned long, fd,
 		const struct compat_iovec __user *,vec,
-		unsigned long, vlen, loff_t, pos, int, flags)
+		unsigned long, vlen, loff_t, pos, rwf_t, flags)
 {
 	return do_compat_preadv64(fd, vec, vlen, pos, flags);
 }
@@ -1220,7 +1220,7 @@ COMPAT_SYSCALL_DEFINE5(preadv64v2, unsigned long, fd,
 COMPAT_SYSCALL_DEFINE6(preadv2, compat_ulong_t, fd,
 		const struct compat_iovec __user *,vec,
 		compat_ulong_t, vlen, u32, pos_low, u32, pos_high,
-		int, flags)
+		rwf_t, flags)
 {
 	loff_t pos = ((loff_t)pos_high << 32) | pos_low;
 
@@ -1232,7 +1232,7 @@ COMPAT_SYSCALL_DEFINE6(preadv2, compat_ulong_t, fd,
 
 static size_t compat_writev(struct file *file,
 			    const struct compat_iovec __user *vec,
-			    unsigned long vlen, loff_t *pos, int flags)
+			    unsigned long vlen, loff_t *pos, rwf_t flags)
 {
 	struct iovec iovstack[UIO_FASTIOV];
 	struct iovec *iov = iovstack;
@@ -1254,7 +1254,7 @@ static size_t compat_writev(struct file *file,
 
 static size_t do_compat_writev(compat_ulong_t fd,
 				  const struct compat_iovec __user* vec,
-				  compat_ulong_t vlen, int flags)
+				  compat_ulong_t vlen, rwf_t flags)
 {
 	struct fd f = fdget_pos(fd);
 	ssize_t ret;
@@ -1279,7 +1279,7 @@ COMPAT_SYSCALL_DEFINE3(writev, compat_ulong_t, fd,
 
 static long do_compat_pwritev64(unsigned long fd,
 				   const struct compat_iovec __user *vec,
-				   unsigned long vlen, loff_t pos, int flags)
+				   unsigned long vlen, loff_t pos, rwf_t flags)
 {
 	struct fd f;
 	ssize_t ret;
@@ -1317,7 +1317,7 @@ COMPAT_SYSCALL_DEFINE5(pwritev, compat_ulong_t, fd,
 #ifdef __ARCH_WANT_COMPAT_SYS_PWRITEV64V2
 COMPAT_SYSCALL_DEFINE5(pwritev64v2, unsigned long, fd,
 		const struct compat_iovec __user *,vec,
-		unsigned long, vlen, loff_t, pos, int, flags)
+		unsigned long, vlen, loff_t, pos, rwf_t, flags)
 {
 	return do_compat_pwritev64(fd, vec, vlen, pos, flags);
 }
@@ -1325,7 +1325,7 @@ COMPAT_SYSCALL_DEFINE5(pwritev64v2, unsigned long, fd,
 
 COMPAT_SYSCALL_DEFINE6(pwritev2, compat_ulong_t, fd,
 		const struct compat_iovec __user *,vec,
-		compat_ulong_t, vlen, u32, pos_low, u32, pos_high, int, flags)
+		compat_ulong_t, vlen, u32, pos_low, u32, pos_high, rwf_t, flags)
 {
 	loff_t pos = ((loff_t)pos_high << 32) | pos_low;
 
diff --git a/include/linux/compat.h b/include/linux/compat.h
index e5d3fbe24f7d..3fc433303d7a 100644
--- a/include/linux/compat.h
+++ b/include/linux/compat.h
@@ -365,10 +365,10 @@ asmlinkage ssize_t compat_sys_pwritev(compat_ulong_t fd,
 		compat_ulong_t vlen, u32 pos_low, u32 pos_high);
 asmlinkage ssize_t compat_sys_preadv2(compat_ulong_t fd,
 		const struct compat_iovec __user *vec,
-		compat_ulong_t vlen, u32 pos_low, u32 pos_high, int flags);
+		compat_ulong_t vlen, u32 pos_low, u32 pos_high, rwf_t flags);
 asmlinkage ssize_t compat_sys_pwritev2(compat_ulong_t fd,
 		const struct compat_iovec __user *vec,
-		compat_ulong_t vlen, u32 pos_low, u32 pos_high, int flags);
+		compat_ulong_t vlen, u32 pos_low, u32 pos_high, rwf_t flags);
 
 #ifdef __ARCH_WANT_COMPAT_SYS_PREADV64
 asmlinkage long compat_sys_preadv64(unsigned long fd,
@@ -382,6 +382,18 @@ asmlinkage long compat_sys_pwritev64(unsigned long fd,
 		unsigned long vlen, loff_t pos);
 #endif
 
+#ifdef __ARCH_WANT_COMPAT_SYS_PREADV64V2
+asmlinkage long  compat_sys_readv64v2(unsigned long fd,
+		const struct compat_iovec __user *vec,
+		unsigned long vlen, loff_t pos, rwf_t flags);
+#endif
+
+#ifdef __ARCH_WANT_COMPAT_SYS_PWRITEV64V2
+asmlinkage long compat_sys_pwritev64v2(unsigned long fd,
+		const struct compat_iovec __user *vec,
+		unsigned long vlen, loff_t pos, rwf_t flags);
+#endif
+
 asmlinkage long compat_sys_lseek(unsigned int, compat_off_t, unsigned int);
 
 asmlinkage long compat_sys_execve(const char __user *filename, const compat_uptr_t __user *argv,
diff --git a/include/linux/fs.h b/include/linux/fs.h
index cbfe127bccf8..2625fc47c7e5 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -72,6 +72,8 @@ extern int leases_enable, lease_break_time;
 extern int sysctl_protected_symlinks;
 extern int sysctl_protected_hardlinks;
 
+typedef __kernel_rwf_t rwf_t;
+
 struct buffer_head;
 typedef int (get_block_t)(struct inode *inode, sector_t iblock,
 			struct buffer_head *bh_result, int create);
@@ -1758,9 +1760,9 @@ extern ssize_t __vfs_write(struct file *, const char __user *, size_t, loff_t *)
 extern ssize_t vfs_read(struct file *, char __user *, size_t, loff_t *);
 extern ssize_t vfs_write(struct file *, const char __user *, size_t, loff_t *);
 extern ssize_t vfs_readv(struct file *, const struct iovec __user *,
-		unsigned long, loff_t *, int);
+		unsigned long, loff_t *, rwf_t);
 extern ssize_t vfs_writev(struct file *, const struct iovec __user *,
-		unsigned long, loff_t *, int);
+		unsigned long, loff_t *, rwf_t);
 extern ssize_t vfs_copy_file_range(struct file *, loff_t , struct file *,
 				   loff_t, size_t, unsigned int);
 extern int vfs_clone_file_prep_inodes(struct inode *inode_in, loff_t pos_in,
@@ -2874,9 +2876,9 @@ extern ssize_t generic_file_direct_write(struct kiocb *, struct iov_iter *);
 extern ssize_t generic_perform_write(struct file *, struct iov_iter *, loff_t);
 
 ssize_t vfs_iter_read(struct file *file, struct iov_iter *iter, loff_t *ppos,
-		int flags);
+		rwf_t flags);
 ssize_t vfs_iter_write(struct file *file, struct iov_iter *iter, loff_t *ppos,
-		int flags);
+		rwf_t flags);
 
 /* fs/block_dev.c */
 extern ssize_t blkdev_read_iter(struct kiocb *iocb, struct iov_iter *to);
@@ -3143,7 +3145,7 @@ static inline int iocb_flags(struct file *file)
 	return res;
 }
 
-static inline int kiocb_set_rw_flags(struct kiocb *ki, int flags)
+static inline int kiocb_set_rw_flags(struct kiocb *ki, rwf_t flags)
 {
 	if (unlikely(flags & ~RWF_SUPPORTED))
 		return -EOPNOTSUPP;
diff --git a/include/linux/syscalls.h b/include/linux/syscalls.h
index 0bc1d2e8cc17..138c94535864 100644
--- a/include/linux/syscalls.h
+++ b/include/linux/syscalls.h
@@ -579,12 +579,12 @@ asmlinkage long sys_preadv(unsigned long fd, const struct iovec __user *vec,
 			   unsigned long vlen, unsigned long pos_l, unsigned long pos_h);
 asmlinkage long sys_preadv2(unsigned long fd, const struct iovec __user *vec,
 			    unsigned long vlen, unsigned long pos_l, unsigned long pos_h,
-			    int flags);
+			    rwf_t flags);
 asmlinkage long sys_pwritev(unsigned long fd, const struct iovec __user *vec,
 			    unsigned long vlen, unsigned long pos_l, unsigned long pos_h);
 asmlinkage long sys_pwritev2(unsigned long fd, const struct iovec __user *vec,
 			    unsigned long vlen, unsigned long pos_l, unsigned long pos_h,
-			    int flags);
+			    rwf_t flags);
 asmlinkage long sys_getcwd(char __user *buf, unsigned long size);
 asmlinkage long sys_mkdir(const char __user *pathname, umode_t mode);
 asmlinkage long sys_chdir(const char __user *filename);
diff --git a/include/uapi/linux/aio_abi.h b/include/uapi/linux/aio_abi.h
index a2d4a8ac94ca..a04adbc70ddf 100644
--- a/include/uapi/linux/aio_abi.h
+++ b/include/uapi/linux/aio_abi.h
@@ -28,6 +28,7 @@
 #define __LINUX__AIO_ABI_H
 
 #include <linux/types.h>
+#include <linux/fs.h>
 #include <asm/byteorder.h>
 
 typedef __kernel_ulong_t aio_context_t;
@@ -62,14 +63,6 @@ struct io_event {
 	__s64		res2;		/* secondary result */
 };
 
-#if defined(__BYTE_ORDER) ? __BYTE_ORDER == __LITTLE_ENDIAN : defined(__LITTLE_ENDIAN)
-#define PADDED(x,y)	x, y
-#elif defined(__BYTE_ORDER) ? __BYTE_ORDER == __BIG_ENDIAN : defined(__BIG_ENDIAN)
-#define PADDED(x,y)	y, x
-#else
-#error edit for your odd byteorder.
-#endif
-
 /*
  * we always use a 64bit off_t when communicating
  * with userland.  its up to libraries to do the
@@ -79,8 +72,16 @@ struct io_event {
 struct iocb {
 	/* these are internal to the kernel/libc. */
 	__u64	aio_data;	/* data to be returned in event's data */
-	__u32	PADDED(aio_key, aio_rw_flags);
-				/* the kernel sets aio_key to the req # */
+
+#if defined(__BYTE_ORDER) ? __BYTE_ORDER == __LITTLE_ENDIAN : defined(__LITTLE_ENDIAN)
+	__u32	aio_key;	/* the kernel sets aio_key to the req # */
+	__kernel_rwf_t aio_rw_flags;	/* RWF_* flags */
+#elif defined(__BYTE_ORDER) ? __BYTE_ORDER == __BIG_ENDIAN : defined(__BIG_ENDIAN)
+	__kernel_rwf_t aio_rw_flags;	/* RWF_* flags */
+	__u32	aio_key;	/* the kernel sets aio_key to the req # */
+#else
+#error edit for your odd byteorder.
+#endif
 
 	/* common fields */
 	__u16	aio_lio_opcode;	/* see IOCB_CMD_ above */
diff --git a/include/uapi/linux/fs.h b/include/uapi/linux/fs.h
index b7495d05e8de..56235dddea7d 100644
--- a/include/uapi/linux/fs.h
+++ b/include/uapi/linux/fs.h
@@ -358,13 +358,25 @@ struct fscrypt_key {
 #define SYNC_FILE_RANGE_WRITE		2
 #define SYNC_FILE_RANGE_WAIT_AFTER	4
 
-/* flags for preadv2/pwritev2: */
-#define RWF_HIPRI			0x00000001 /* high priority request, poll if possible */
-#define RWF_DSYNC			0x00000002 /* per-IO O_DSYNC */
-#define RWF_SYNC			0x00000004 /* per-IO O_SYNC */
-#define RWF_NOWAIT			0x00000008 /* per-IO, return -EAGAIN if operation would block */
-
-#define RWF_SUPPORTED			(RWF_HIPRI | RWF_DSYNC | RWF_SYNC |\
-					 RWF_NOWAIT)
+/*
+ * Flags for preadv2/pwritev2:
+ */
+
+typedef int __bitwise __kernel_rwf_t;
+
+/* high priority request, poll if possible */
+#define RWF_HIPRI	((__force __kernel_rwf_t)0x00000001)
+
+/* per-IO O_DSYNC */
+#define RWF_DSYNC	((__force __kernel_rwf_t)0x00000002)
+
+/* per-IO O_SYNC */
+#define RWF_SYNC	((__force __kernel_rwf_t)0x00000004)
+
+/* per-IO, return -EAGAIN if operation would block */
+#define RWF_NOWAIT	((__force __kernel_rwf_t)0x00000008)
+
+/* mask of flags supported by the kernel */
+#define RWF_SUPPORTED	(RWF_HIPRI | RWF_DSYNC | RWF_SYNC | RWF_NOWAIT)
 
 #endif /* _UAPI_LINUX_FS_H */
-- 
cgit v1.2.3-71-gd317


From 1797f5b3cf0b3a73c42b89f7a8fd897417373730 Mon Sep 17 00:00:00 2001
From: Arkadi Sharshevsky <arkadis@mellanox.com>
Date: Thu, 31 Aug 2017 17:59:12 +0200
Subject: devlink: Add IPv6 header for dpipe

This will be used by the IPv6 host table which will be introduced in the
following patches. The fields in the header are added per-use. This header
is global and can be reused by many drivers.

Signed-off-by: Arkadi Sharshevsky <arkadis@mellanox.com>
Signed-off-by: Jiri Pirko <jiri@mellanox.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/devlink.h        |  1 +
 include/uapi/linux/devlink.h |  5 +++++
 net/core/devlink.c           | 17 +++++++++++++++++
 3 files changed, 23 insertions(+)

(limited to 'include/uapi/linux')

diff --git a/include/net/devlink.h b/include/net/devlink.h
index aaf7178127a2..b9654e133599 100644
--- a/include/net/devlink.h
+++ b/include/net/devlink.h
@@ -330,6 +330,7 @@ int devlink_dpipe_match_put(struct sk_buff *skb,
 			    struct devlink_dpipe_match *match);
 extern struct devlink_dpipe_header devlink_dpipe_header_ethernet;
 extern struct devlink_dpipe_header devlink_dpipe_header_ipv4;
+extern struct devlink_dpipe_header devlink_dpipe_header_ipv6;
 
 #else
 
diff --git a/include/uapi/linux/devlink.h b/include/uapi/linux/devlink.h
index 6c172548589d..0cbca96c66b9 100644
--- a/include/uapi/linux/devlink.h
+++ b/include/uapi/linux/devlink.h
@@ -234,9 +234,14 @@ enum devlink_dpipe_field_ipv4_id {
 	DEVLINK_DPIPE_FIELD_IPV4_DST_IP,
 };
 
+enum devlink_dpipe_field_ipv6_id {
+	DEVLINK_DPIPE_FIELD_IPV6_DST_IP,
+};
+
 enum devlink_dpipe_header_id {
 	DEVLINK_DPIPE_HEADER_ETHERNET,
 	DEVLINK_DPIPE_HEADER_IPV4,
+	DEVLINK_DPIPE_HEADER_IPV6,
 };
 
 #endif /* _UAPI_LINUX_DEVLINK_H_ */
diff --git a/net/core/devlink.c b/net/core/devlink.c
index cbc4b0461b0f..7d430c1d9c3e 100644
--- a/net/core/devlink.c
+++ b/net/core/devlink.c
@@ -63,6 +63,23 @@ struct devlink_dpipe_header devlink_dpipe_header_ipv4 = {
 };
 EXPORT_SYMBOL(devlink_dpipe_header_ipv4);
 
+static struct devlink_dpipe_field devlink_dpipe_fields_ipv6[] = {
+	{
+		.name = "destination ip",
+		.id = DEVLINK_DPIPE_FIELD_IPV6_DST_IP,
+		.bitwidth = 128,
+	},
+};
+
+struct devlink_dpipe_header devlink_dpipe_header_ipv6 = {
+	.name = "ipv6",
+	.id = DEVLINK_DPIPE_HEADER_IPV6,
+	.fields = devlink_dpipe_fields_ipv6,
+	.fields_count = ARRAY_SIZE(devlink_dpipe_fields_ipv6),
+	.global = true,
+};
+EXPORT_SYMBOL(devlink_dpipe_header_ipv6);
+
 EXPORT_TRACEPOINT_SYMBOL_GPL(devlink_hwmsg);
 
 static LIST_HEAD(devlink_list);
-- 
cgit v1.2.3-71-gd317


From 482dca939fb7ee35ba20b944b4c2476133dbf0df Mon Sep 17 00:00:00 2001
From: David Ahern <dsahern@gmail.com>
Date: Thu, 31 Aug 2017 15:05:44 -0700
Subject: bpf: Add mark and priority to sock options that can be set

Add socket mark and priority to fields that can be set by
ebpf program when a socket is created.

Signed-off-by: David Ahern <dsahern@gmail.com>
Acked-by: Alexei Starovoitov <ast@kernel.org>
Acked-by: Daniel Borkmann <daniel@iogearbox.net>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/uapi/linux/bpf.h |  2 ++
 net/core/filter.c        | 26 ++++++++++++++++++++++++++
 2 files changed, 28 insertions(+)

(limited to 'include/uapi/linux')

diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h
index 08c206a863e1..ba848b761cfb 100644
--- a/include/uapi/linux/bpf.h
+++ b/include/uapi/linux/bpf.h
@@ -758,6 +758,8 @@ struct bpf_sock {
 	__u32 family;
 	__u32 type;
 	__u32 protocol;
+	__u32 mark;
+	__u32 priority;
 };
 
 #define XDP_PACKET_HEADROOM 256
diff --git a/net/core/filter.c b/net/core/filter.c
index c6a37fe0285b..f51b9690adf3 100644
--- a/net/core/filter.c
+++ b/net/core/filter.c
@@ -3455,6 +3455,10 @@ static bool sock_filter_is_valid_access(int off, int size,
 		switch (off) {
 		case offsetof(struct bpf_sock, bound_dev_if):
 			break;
+		case offsetof(struct bpf_sock, mark):
+			break;
+		case offsetof(struct bpf_sock, priority):
+			break;
 		default:
 			return false;
 		}
@@ -3958,6 +3962,28 @@ static u32 sock_filter_convert_ctx_access(enum bpf_access_type type,
 				      offsetof(struct sock, sk_bound_dev_if));
 		break;
 
+	case offsetof(struct bpf_sock, mark):
+		BUILD_BUG_ON(FIELD_SIZEOF(struct sock, sk_mark) != 4);
+
+		if (type == BPF_WRITE)
+			*insn++ = BPF_STX_MEM(BPF_W, si->dst_reg, si->src_reg,
+					offsetof(struct sock, sk_mark));
+		else
+			*insn++ = BPF_LDX_MEM(BPF_W, si->dst_reg, si->src_reg,
+				      offsetof(struct sock, sk_mark));
+		break;
+
+	case offsetof(struct bpf_sock, priority):
+		BUILD_BUG_ON(FIELD_SIZEOF(struct sock, sk_priority) != 4);
+
+		if (type == BPF_WRITE)
+			*insn++ = BPF_STX_MEM(BPF_W, si->dst_reg, si->src_reg,
+					offsetof(struct sock, sk_priority));
+		else
+			*insn++ = BPF_LDX_MEM(BPF_W, si->dst_reg, si->src_reg,
+				      offsetof(struct sock, sk_priority));
+		break;
+
 	case offsetof(struct bpf_sock, family):
 		BUILD_BUG_ON(FIELD_SIZEOF(struct sock, sk_family) != 2);
 
-- 
cgit v1.2.3-71-gd317


From abcc61537e3566cae7f1fd225f2dcb82b3595fe3 Mon Sep 17 00:00:00 2001
From: Colin Cross <ccross@android.com>
Date: Thu, 31 Aug 2017 10:04:24 +0200
Subject: ANDROID: binder: Add BINDER_GET_NODE_DEBUG_INFO ioctl

The BINDER_GET_NODE_DEBUG_INFO ioctl will return debug info on
a node.  Each successive call reusing the previous return value
will return the next node.  The data will be used by
libmemunreachable to mark the pointers with kernel references
as reachable.

Signed-off-by: Colin Cross <ccross@android.com>
Signed-off-by: Martijn Coenen <maco@android.com>
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 drivers/android/binder.c            | 43 +++++++++++++++++++++++++++++++++++++
 include/uapi/linux/android/binder.h | 14 ++++++++++++
 2 files changed, 57 insertions(+)

(limited to 'include/uapi/linux')

diff --git a/drivers/android/binder.c b/drivers/android/binder.c
index 8f2031c52ea4..eb8bd8d7c4c9 100644
--- a/drivers/android/binder.c
+++ b/drivers/android/binder.c
@@ -4389,6 +4389,31 @@ out:
 	return ret;
 }
 
+static int binder_ioctl_get_node_debug_info(struct binder_proc *proc,
+				struct binder_node_debug_info *info)
+{
+	struct rb_node *n;
+	binder_uintptr_t ptr = info->ptr;
+
+	memset(info, 0, sizeof(*info));
+
+	binder_inner_proc_lock(proc);
+	for (n = rb_first(&proc->nodes); n != NULL; n = rb_next(n)) {
+		struct binder_node *node = rb_entry(n, struct binder_node,
+						    rb_node);
+		if (node->ptr > ptr) {
+			info->ptr = node->ptr;
+			info->cookie = node->cookie;
+			info->has_strong_ref = node->has_strong_ref;
+			info->has_weak_ref = node->has_weak_ref;
+			break;
+		}
+	}
+	binder_inner_proc_unlock(proc);
+
+	return 0;
+}
+
 static long binder_ioctl(struct file *filp, unsigned int cmd, unsigned long arg)
 {
 	int ret;
@@ -4458,6 +4483,24 @@ static long binder_ioctl(struct file *filp, unsigned int cmd, unsigned long arg)
 		}
 		break;
 	}
+	case BINDER_GET_NODE_DEBUG_INFO: {
+		struct binder_node_debug_info info;
+
+		if (copy_from_user(&info, ubuf, sizeof(info))) {
+			ret = -EFAULT;
+			goto err;
+		}
+
+		ret = binder_ioctl_get_node_debug_info(proc, &info);
+		if (ret < 0)
+			goto err;
+
+		if (copy_to_user(ubuf, &info, sizeof(info))) {
+			ret = -EFAULT;
+			goto err;
+		}
+		break;
+	}
 	default:
 		ret = -EINVAL;
 		goto err;
diff --git a/include/uapi/linux/android/binder.h b/include/uapi/linux/android/binder.h
index 7668b5791c91..84a9a0944e13 100644
--- a/include/uapi/linux/android/binder.h
+++ b/include/uapi/linux/android/binder.h
@@ -186,6 +186,19 @@ struct binder_version {
 #define BINDER_CURRENT_PROTOCOL_VERSION 8
 #endif
 
+/*
+ * Use with BINDER_GET_NODE_DEBUG_INFO, driver reads ptr, writes to all fields.
+ * Set ptr to NULL for the first call to get the info for the first node, and
+ * then repeat the call passing the previously returned value to get the next
+ * nodes.  ptr will be 0 when there are no more nodes.
+ */
+struct binder_node_debug_info {
+	binder_uintptr_t ptr;
+	binder_uintptr_t cookie;
+	__u32            has_strong_ref;
+	__u32            has_weak_ref;
+};
+
 #define BINDER_WRITE_READ		_IOWR('b', 1, struct binder_write_read)
 #define BINDER_SET_IDLE_TIMEOUT		_IOW('b', 3, __s64)
 #define BINDER_SET_MAX_THREADS		_IOW('b', 5, __u32)
@@ -193,6 +206,7 @@ struct binder_version {
 #define BINDER_SET_CONTEXT_MGR		_IOW('b', 7, __s32)
 #define BINDER_THREAD_EXIT		_IOW('b', 8, __s32)
 #define BINDER_VERSION			_IOWR('b', 9, struct binder_version)
+#define BINDER_GET_NODE_DEBUG_INFO	_IOWR('b', 11, struct binder_node_debug_info)
 
 /*
  * NOTE: Two special error codes you should check for when calling
-- 
cgit v1.2.3-71-gd317


From 8db6c34f1dbc8e06aa016a9b829b06902c3e1340 Mon Sep 17 00:00:00 2001
From: "Serge E. Hallyn" <serge@hallyn.com>
Date: Mon, 8 May 2017 13:11:56 -0500
Subject: Introduce v3 namespaced file capabilities

Root in a non-initial user ns cannot be trusted to write a traditional
security.capability xattr.  If it were allowed to do so, then any
unprivileged user on the host could map his own uid to root in a private
namespace, write the xattr, and execute the file with privilege on the
host.

However supporting file capabilities in a user namespace is very
desirable.  Not doing so means that any programs designed to run with
limited privilege must continue to support other methods of gaining and
dropping privilege.  For instance a program installer must detect
whether file capabilities can be assigned, and assign them if so but set
setuid-root otherwise.  The program in turn must know how to drop
partial capabilities, and do so only if setuid-root.

This patch introduces v3 of the security.capability xattr.  It builds a
vfs_ns_cap_data struct by appending a uid_t rootid to struct
vfs_cap_data.  This is the absolute uid_t (that is, the uid_t in user
namespace which mounted the filesystem, usually init_user_ns) of the
root id in whose namespaces the file capabilities may take effect.

When a task asks to write a v2 security.capability xattr, if it is
privileged with respect to the userns which mounted the filesystem, then
nothing should change.  Otherwise, the kernel will transparently rewrite
the xattr as a v3 with the appropriate rootid.  This is done during the
execution of setxattr() to catch user-space-initiated capability writes.
Subsequently, any task executing the file which has the noted kuid as
its root uid, or which is in a descendent user_ns of such a user_ns,
will run the file with capabilities.

Similarly when asking to read file capabilities, a v3 capability will
be presented as v2 if it applies to the caller's namespace.

If a task writes a v3 security.capability, then it can provide a uid for
the xattr so long as the uid is valid in its own user namespace, and it
is privileged with CAP_SETFCAP over its namespace.  The kernel will
translate that rootid to an absolute uid, and write that to disk.  After
this, a task in the writer's namespace will not be able to use those
capabilities (unless rootid was 0), but a task in a namespace where the
given uid is root will.

Only a single security.capability xattr may exist at a time for a given
file.  A task may overwrite an existing xattr so long as it is
privileged over the inode.  Note this is a departure from previous
semantics, which required privilege to remove a security.capability
xattr.  This check can be re-added if deemed useful.

This allows a simple setxattr to work, allows tar/untar to work, and
allows us to tar in one namespace and untar in another while preserving
the capability, without risking leaking privilege into a parent
namespace.

Example using tar:

 $ cp /bin/sleep sleepx
 $ mkdir b1 b2
 $ lxc-usernsexec -m b:0:100000:1 -m b:1:$(id -u):1 -- chown 0:0 b1
 $ lxc-usernsexec -m b:0:100001:1 -m b:1:$(id -u):1 -- chown 0:0 b2
 $ lxc-usernsexec -m b:0:100000:1000 -- tar --xattrs-include=security.capability --xattrs -cf b1/sleepx.tar sleepx
 $ lxc-usernsexec -m b:0:100001:1000 -- tar --xattrs-include=security.capability --xattrs -C b2 -xf b1/sleepx.tar
 $ lxc-usernsexec -m b:0:100001:1000 -- getcap b2/sleepx
   b2/sleepx = cap_sys_admin+ep
 # /opt/ltp/testcases/bin/getv3xattr b2/sleepx
   v3 xattr, rootid is 100001

A patch to linux-test-project adding a new set of tests for this
functionality is in the nsfscaps branch at github.com/hallyn/ltp

Changelog:
   Nov 02 2016: fix invalid check at refuse_fcap_overwrite()
   Nov 07 2016: convert rootid from and to fs user_ns
   (From ebiederm: mar 28 2017)
     commoncap.c: fix typos - s/v4/v3
     get_vfs_caps_from_disk: clarify the fs_ns root access check
     nsfscaps: change the code split for cap_inode_setxattr()
   Apr 09 2017:
       don't return v3 cap for caps owned by current root.
      return a v2 cap for a true v2 cap in non-init ns
   Apr 18 2017:
      . Change the flow of fscap writing to support s_user_ns writing.
      . Remove refuse_fcap_overwrite().  The value of the previous
        xattr doesn't matter.
   Apr 24 2017:
      . incorporate Eric's incremental diff
      . move cap_convert_nscap to setxattr and simplify its usage
   May 8, 2017:
      . fix leaking dentry refcount in cap_inode_getsecurity

Signed-off-by: Serge Hallyn <serge@hallyn.com>
Signed-off-by: Eric W. Biederman <ebiederm@xmission.com>
---
 fs/xattr.c                      |   6 +
 include/linux/capability.h      |   2 +
 include/linux/security.h        |   2 +
 include/uapi/linux/capability.h |  22 +++-
 security/commoncap.c            | 270 +++++++++++++++++++++++++++++++++++++---
 5 files changed, 280 insertions(+), 22 deletions(-)

(limited to 'include/uapi/linux')

diff --git a/fs/xattr.c b/fs/xattr.c
index 464c94bf65f9..7b03df6b8be2 100644
--- a/fs/xattr.c
+++ b/fs/xattr.c
@@ -441,6 +441,12 @@ setxattr(struct dentry *d, const char __user *name, const void __user *value,
 		if ((strcmp(kname, XATTR_NAME_POSIX_ACL_ACCESS) == 0) ||
 		    (strcmp(kname, XATTR_NAME_POSIX_ACL_DEFAULT) == 0))
 			posix_acl_fix_xattr_from_user(kvalue, size);
+		else if (strcmp(kname, XATTR_NAME_CAPS) == 0) {
+			error = cap_convert_nscap(d, &kvalue, size);
+			if (error < 0)
+				goto out;
+			size = error;
+		}
 	}
 
 	error = vfs_setxattr(d, kname, kvalue, size, flags);
diff --git a/include/linux/capability.h b/include/linux/capability.h
index 6ffb67e10c06..b52e278e4744 100644
--- a/include/linux/capability.h
+++ b/include/linux/capability.h
@@ -248,4 +248,6 @@ extern bool ptracer_capable(struct task_struct *tsk, struct user_namespace *ns);
 /* audit system wants to get cap info from files as well */
 extern int get_vfs_caps_from_disk(const struct dentry *dentry, struct cpu_vfs_cap_data *cpu_caps);
 
+extern int cap_convert_nscap(struct dentry *dentry, void **ivalue, size_t size);
+
 #endif /* !_LINUX_CAPABILITY_H */
diff --git a/include/linux/security.h b/include/linux/security.h
index b6ea1dc9cc9d..6fff8c924718 100644
--- a/include/linux/security.h
+++ b/include/linux/security.h
@@ -91,6 +91,8 @@ extern int cap_inode_setxattr(struct dentry *dentry, const char *name,
 extern int cap_inode_removexattr(struct dentry *dentry, const char *name);
 extern int cap_inode_need_killpriv(struct dentry *dentry);
 extern int cap_inode_killpriv(struct dentry *dentry);
+extern int cap_inode_getsecurity(struct inode *inode, const char *name,
+				 void **buffer, bool alloc);
 extern int cap_mmap_addr(unsigned long addr);
 extern int cap_mmap_file(struct file *file, unsigned long reqprot,
 			 unsigned long prot, unsigned long flags);
diff --git a/include/uapi/linux/capability.h b/include/uapi/linux/capability.h
index 6fe14d001f68..230e05d35191 100644
--- a/include/uapi/linux/capability.h
+++ b/include/uapi/linux/capability.h
@@ -60,9 +60,13 @@ typedef struct __user_cap_data_struct {
 #define VFS_CAP_U32_2           2
 #define XATTR_CAPS_SZ_2         (sizeof(__le32)*(1 + 2*VFS_CAP_U32_2))
 
-#define XATTR_CAPS_SZ           XATTR_CAPS_SZ_2
-#define VFS_CAP_U32             VFS_CAP_U32_2
-#define VFS_CAP_REVISION	VFS_CAP_REVISION_2
+#define VFS_CAP_REVISION_3	0x03000000
+#define VFS_CAP_U32_3           2
+#define XATTR_CAPS_SZ_3         (sizeof(__le32)*(2 + 2*VFS_CAP_U32_3))
+
+#define XATTR_CAPS_SZ           XATTR_CAPS_SZ_3
+#define VFS_CAP_U32             VFS_CAP_U32_3
+#define VFS_CAP_REVISION	VFS_CAP_REVISION_3
 
 struct vfs_cap_data {
 	__le32 magic_etc;            /* Little endian */
@@ -72,6 +76,18 @@ struct vfs_cap_data {
 	} data[VFS_CAP_U32];
 };
 
+/*
+ * same as vfs_cap_data but with a rootid at the end
+ */
+struct vfs_ns_cap_data {
+	__le32 magic_etc;
+	struct {
+		__le32 permitted;    /* Little endian */
+		__le32 inheritable;  /* Little endian */
+	} data[VFS_CAP_U32];
+	__le32 rootid;
+};
+
 #ifndef __KERNEL__
 
 /*
diff --git a/security/commoncap.c b/security/commoncap.c
index d59320282294..c37d27dd1e2c 100644
--- a/security/commoncap.c
+++ b/security/commoncap.c
@@ -335,6 +335,209 @@ int cap_inode_killpriv(struct dentry *dentry)
 	return error;
 }
 
+static bool rootid_owns_currentns(kuid_t kroot)
+{
+	struct user_namespace *ns;
+
+	if (!uid_valid(kroot))
+		return false;
+
+	for (ns = current_user_ns(); ; ns = ns->parent) {
+		if (from_kuid(ns, kroot) == 0)
+			return true;
+		if (ns == &init_user_ns)
+			break;
+	}
+
+	return false;
+}
+
+static __u32 sansflags(__u32 m)
+{
+	return m & ~VFS_CAP_FLAGS_EFFECTIVE;
+}
+
+static bool is_v2header(size_t size, __le32 magic)
+{
+	__u32 m = le32_to_cpu(magic);
+	if (size != XATTR_CAPS_SZ_2)
+		return false;
+	return sansflags(m) == VFS_CAP_REVISION_2;
+}
+
+static bool is_v3header(size_t size, __le32 magic)
+{
+	__u32 m = le32_to_cpu(magic);
+
+	if (size != XATTR_CAPS_SZ_3)
+		return false;
+	return sansflags(m) == VFS_CAP_REVISION_3;
+}
+
+/*
+ * getsecurity: We are called for security.* before any attempt to read the
+ * xattr from the inode itself.
+ *
+ * This gives us a chance to read the on-disk value and convert it.  If we
+ * return -EOPNOTSUPP, then vfs_getxattr() will call the i_op handler.
+ *
+ * Note we are not called by vfs_getxattr_alloc(), but that is only called
+ * by the integrity subsystem, which really wants the unconverted values -
+ * so that's good.
+ */
+int cap_inode_getsecurity(struct inode *inode, const char *name, void **buffer,
+			  bool alloc)
+{
+	int size, ret;
+	kuid_t kroot;
+	uid_t root, mappedroot;
+	char *tmpbuf = NULL;
+	struct vfs_cap_data *cap;
+	struct vfs_ns_cap_data *nscap;
+	struct dentry *dentry;
+	struct user_namespace *fs_ns;
+
+	if (strcmp(name, "capability") != 0)
+		return -EOPNOTSUPP;
+
+	dentry = d_find_alias(inode);
+	if (!dentry)
+		return -EINVAL;
+
+	size = sizeof(struct vfs_ns_cap_data);
+	ret = (int) vfs_getxattr_alloc(dentry, XATTR_NAME_CAPS,
+				 &tmpbuf, size, GFP_NOFS);
+	dput(dentry);
+
+	if (ret < 0)
+		return ret;
+
+	fs_ns = inode->i_sb->s_user_ns;
+	cap = (struct vfs_cap_data *) tmpbuf;
+	if (is_v2header((size_t) ret, cap->magic_etc)) {
+		/* If this is sizeof(vfs_cap_data) then we're ok with the
+		 * on-disk value, so return that.  */
+		if (alloc)
+			*buffer = tmpbuf;
+		else
+			kfree(tmpbuf);
+		return ret;
+	} else if (!is_v3header((size_t) ret, cap->magic_etc)) {
+		kfree(tmpbuf);
+		return -EINVAL;
+	}
+
+	nscap = (struct vfs_ns_cap_data *) tmpbuf;
+	root = le32_to_cpu(nscap->rootid);
+	kroot = make_kuid(fs_ns, root);
+
+	/* If the root kuid maps to a valid uid in current ns, then return
+	 * this as a nscap. */
+	mappedroot = from_kuid(current_user_ns(), kroot);
+	if (mappedroot != (uid_t)-1 && mappedroot != (uid_t)0) {
+		if (alloc) {
+			*buffer = tmpbuf;
+			nscap->rootid = cpu_to_le32(mappedroot);
+		} else
+			kfree(tmpbuf);
+		return size;
+	}
+
+	if (!rootid_owns_currentns(kroot)) {
+		kfree(tmpbuf);
+		return -EOPNOTSUPP;
+	}
+
+	/* This comes from a parent namespace.  Return as a v2 capability */
+	size = sizeof(struct vfs_cap_data);
+	if (alloc) {
+		*buffer = kmalloc(size, GFP_ATOMIC);
+		if (*buffer) {
+			struct vfs_cap_data *cap = *buffer;
+			__le32 nsmagic, magic;
+			magic = VFS_CAP_REVISION_2;
+			nsmagic = le32_to_cpu(nscap->magic_etc);
+			if (nsmagic & VFS_CAP_FLAGS_EFFECTIVE)
+				magic |= VFS_CAP_FLAGS_EFFECTIVE;
+			memcpy(&cap->data, &nscap->data, sizeof(__le32) * 2 * VFS_CAP_U32);
+			cap->magic_etc = cpu_to_le32(magic);
+		}
+	}
+	kfree(tmpbuf);
+	return size;
+}
+
+static kuid_t rootid_from_xattr(const void *value, size_t size,
+				struct user_namespace *task_ns)
+{
+	const struct vfs_ns_cap_data *nscap = value;
+	uid_t rootid = 0;
+
+	if (size == XATTR_CAPS_SZ_3)
+		rootid = le32_to_cpu(nscap->rootid);
+
+	return make_kuid(task_ns, rootid);
+}
+
+static bool validheader(size_t size, __le32 magic)
+{
+	return is_v2header(size, magic) || is_v3header(size, magic);
+}
+
+/*
+ * User requested a write of security.capability.  If needed, update the
+ * xattr to change from v2 to v3, or to fixup the v3 rootid.
+ *
+ * If all is ok, we return the new size, on error return < 0.
+ */
+int cap_convert_nscap(struct dentry *dentry, void **ivalue, size_t size)
+{
+	struct vfs_ns_cap_data *nscap;
+	uid_t nsrootid;
+	const struct vfs_cap_data *cap = *ivalue;
+	__u32 magic, nsmagic;
+	struct inode *inode = d_backing_inode(dentry);
+	struct user_namespace *task_ns = current_user_ns(),
+		*fs_ns = inode->i_sb->s_user_ns;
+	kuid_t rootid;
+	size_t newsize;
+
+	if (!*ivalue)
+		return -EINVAL;
+	if (!validheader(size, cap->magic_etc))
+		return -EINVAL;
+	if (!capable_wrt_inode_uidgid(inode, CAP_SETFCAP))
+		return -EPERM;
+	if (size == XATTR_CAPS_SZ_2)
+		if (ns_capable(inode->i_sb->s_user_ns, CAP_SETFCAP))
+			/* user is privileged, just write the v2 */
+			return size;
+
+	rootid = rootid_from_xattr(*ivalue, size, task_ns);
+	if (!uid_valid(rootid))
+		return -EINVAL;
+
+	nsrootid = from_kuid(fs_ns, rootid);
+	if (nsrootid == -1)
+		return -EINVAL;
+
+	newsize = sizeof(struct vfs_ns_cap_data);
+	nscap = kmalloc(newsize, GFP_ATOMIC);
+	if (!nscap)
+		return -ENOMEM;
+	nscap->rootid = cpu_to_le32(nsrootid);
+	nsmagic = VFS_CAP_REVISION_3;
+	magic = le32_to_cpu(cap->magic_etc);
+	if (magic & VFS_CAP_FLAGS_EFFECTIVE)
+		nsmagic |= VFS_CAP_FLAGS_EFFECTIVE;
+	nscap->magic_etc = cpu_to_le32(nsmagic);
+	memcpy(&nscap->data, &cap->data, sizeof(__le32) * 2 * VFS_CAP_U32);
+
+	kvfree(*ivalue);
+	*ivalue = nscap;
+	return newsize;
+}
+
 /*
  * Calculate the new process capability sets from the capability sets attached
  * to a file.
@@ -388,7 +591,10 @@ int get_vfs_caps_from_disk(const struct dentry *dentry, struct cpu_vfs_cap_data
 	__u32 magic_etc;
 	unsigned tocopy, i;
 	int size;
-	struct vfs_cap_data caps;
+	struct vfs_ns_cap_data data, *nscaps = &data;
+	struct vfs_cap_data *caps = (struct vfs_cap_data *) &data;
+	kuid_t rootkuid;
+	struct user_namespace *fs_ns = inode->i_sb->s_user_ns;
 
 	memset(cpu_caps, 0, sizeof(struct cpu_vfs_cap_data));
 
@@ -396,18 +602,20 @@ int get_vfs_caps_from_disk(const struct dentry *dentry, struct cpu_vfs_cap_data
 		return -ENODATA;
 
 	size = __vfs_getxattr((struct dentry *)dentry, inode,
-			      XATTR_NAME_CAPS, &caps, XATTR_CAPS_SZ);
+			      XATTR_NAME_CAPS, &data, XATTR_CAPS_SZ);
 	if (size == -ENODATA || size == -EOPNOTSUPP)
 		/* no data, that's ok */
 		return -ENODATA;
+
 	if (size < 0)
 		return size;
 
 	if (size < sizeof(magic_etc))
 		return -EINVAL;
 
-	cpu_caps->magic_etc = magic_etc = le32_to_cpu(caps.magic_etc);
+	cpu_caps->magic_etc = magic_etc = le32_to_cpu(caps->magic_etc);
 
+	rootkuid = make_kuid(fs_ns, 0);
 	switch (magic_etc & VFS_CAP_REVISION_MASK) {
 	case VFS_CAP_REVISION_1:
 		if (size != XATTR_CAPS_SZ_1)
@@ -419,15 +627,27 @@ int get_vfs_caps_from_disk(const struct dentry *dentry, struct cpu_vfs_cap_data
 			return -EINVAL;
 		tocopy = VFS_CAP_U32_2;
 		break;
+	case VFS_CAP_REVISION_3:
+		if (size != XATTR_CAPS_SZ_3)
+			return -EINVAL;
+		tocopy = VFS_CAP_U32_3;
+		rootkuid = make_kuid(fs_ns, le32_to_cpu(nscaps->rootid));
+		break;
+
 	default:
 		return -EINVAL;
 	}
+	/* Limit the caps to the mounter of the filesystem
+	 * or the more limited uid specified in the xattr.
+	 */
+	if (!rootid_owns_currentns(rootkuid))
+		return -ENODATA;
 
 	CAP_FOR_EACH_U32(i) {
 		if (i >= tocopy)
 			break;
-		cpu_caps->permitted.cap[i] = le32_to_cpu(caps.data[i].permitted);
-		cpu_caps->inheritable.cap[i] = le32_to_cpu(caps.data[i].inheritable);
+		cpu_caps->permitted.cap[i] = le32_to_cpu(caps->data[i].permitted);
+		cpu_caps->inheritable.cap[i] = le32_to_cpu(caps->data[i].inheritable);
 	}
 
 	cpu_caps->permitted.cap[CAP_LAST_U32] &= CAP_LAST_U32_VALID_MASK;
@@ -465,8 +685,8 @@ static int get_file_caps(struct linux_binprm *bprm, bool *effective, bool *has_c
 	rc = get_vfs_caps_from_disk(bprm->file->f_path.dentry, &vcaps);
 	if (rc < 0) {
 		if (rc == -EINVAL)
-			printk(KERN_NOTICE "%s: get_vfs_caps_from_disk returned %d for %s\n",
-				__func__, rc, bprm->filename);
+			printk(KERN_NOTICE "Invalid argument reading file caps for %s\n",
+					bprm->filename);
 		else if (rc == -ENODATA)
 			rc = 0;
 		goto out;
@@ -663,15 +883,19 @@ int cap_bprm_secureexec(struct linux_binprm *bprm)
 int cap_inode_setxattr(struct dentry *dentry, const char *name,
 		       const void *value, size_t size, int flags)
 {
-	if (!strcmp(name, XATTR_NAME_CAPS)) {
-		if (!capable(CAP_SETFCAP))
-			return -EPERM;
+	/* Ignore non-security xattrs */
+	if (strncmp(name, XATTR_SECURITY_PREFIX,
+			sizeof(XATTR_SECURITY_PREFIX) - 1) != 0)
+		return 0;
+
+	/*
+	 * For XATTR_NAME_CAPS the check will be done in
+	 * cap_convert_nscap(), called by setxattr()
+	 */
+	if (strcmp(name, XATTR_NAME_CAPS) == 0)
 		return 0;
-	}
 
-	if (!strncmp(name, XATTR_SECURITY_PREFIX,
-		     sizeof(XATTR_SECURITY_PREFIX) - 1) &&
-	    !capable(CAP_SYS_ADMIN))
+	if (!capable(CAP_SYS_ADMIN))
 		return -EPERM;
 	return 0;
 }
@@ -689,15 +913,22 @@ int cap_inode_setxattr(struct dentry *dentry, const char *name,
  */
 int cap_inode_removexattr(struct dentry *dentry, const char *name)
 {
-	if (!strcmp(name, XATTR_NAME_CAPS)) {
-		if (!capable(CAP_SETFCAP))
+	/* Ignore non-security xattrs */
+	if (strncmp(name, XATTR_SECURITY_PREFIX,
+			sizeof(XATTR_SECURITY_PREFIX) - 1) != 0)
+		return 0;
+
+	if (strcmp(name, XATTR_NAME_CAPS) == 0) {
+		/* security.capability gets namespaced */
+		struct inode *inode = d_backing_inode(dentry);
+		if (!inode)
+			return -EINVAL;
+		if (!capable_wrt_inode_uidgid(inode, CAP_SETFCAP))
 			return -EPERM;
 		return 0;
 	}
 
-	if (!strncmp(name, XATTR_SECURITY_PREFIX,
-		     sizeof(XATTR_SECURITY_PREFIX) - 1) &&
-	    !capable(CAP_SYS_ADMIN))
+	if (!capable(CAP_SYS_ADMIN))
 		return -EPERM;
 	return 0;
 }
@@ -1085,6 +1316,7 @@ struct security_hook_list capability_hooks[] __lsm_ro_after_init = {
 	LSM_HOOK_INIT(bprm_secureexec, cap_bprm_secureexec),
 	LSM_HOOK_INIT(inode_need_killpriv, cap_inode_need_killpriv),
 	LSM_HOOK_INIT(inode_killpriv, cap_inode_killpriv),
+	LSM_HOOK_INIT(inode_getsecurity, cap_inode_getsecurity),
 	LSM_HOOK_INIT(mmap_addr, cap_mmap_addr),
 	LSM_HOOK_INIT(mmap_file, cap_mmap_file),
 	LSM_HOOK_INIT(task_fix_setuid, cap_task_fix_setuid),
-- 
cgit v1.2.3-71-gd317


From d897246df9fc0a5df97a784bf7b072be4a6ae479 Mon Sep 17 00:00:00 2001
From: "Darrick J. Wong" <darrick.wong@oracle.com>
Date: Thu, 31 Aug 2017 13:47:35 -0700
Subject: fsmap: fix documentation of FMR_OF_LAST

The FMR_OF_LAST flag is set on the last fsmap record being returned for
the dataset requested, contrary to what the header file says.  Fix the
docs to reflect the behavior of all fsmap implementations.

Signed-off-by: Darrick J. Wong <darrick.wong@oracle.com>
Reviewed-by: Dave Chinner <dchinner@redhat.com>
---
 include/uapi/linux/fsmap.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'include/uapi/linux')

diff --git a/include/uapi/linux/fsmap.h b/include/uapi/linux/fsmap.h
index 7e8e5f0bd6d2..e5213c3e38b2 100644
--- a/include/uapi/linux/fsmap.h
+++ b/include/uapi/linux/fsmap.h
@@ -96,7 +96,7 @@ fsmap_advance(
 #define FMR_OF_EXTENT_MAP	0x4	/* segment = extent map */
 #define FMR_OF_SHARED		0x8	/* segment = shared with another file */
 #define FMR_OF_SPECIAL_OWNER	0x10	/* owner is a special value */
-#define FMR_OF_LAST		0x20	/* segment is the last in the FS */
+#define FMR_OF_LAST		0x20	/* segment is the last in the dataset */
 
 /* Each FS gets to define its own special owner codes. */
 #define FMR_OWNER(type, code)	(((__u64)type << 32) | \
-- 
cgit v1.2.3-71-gd317


From c03fa9bcacd9ac04595cc13f34f3445f0a5ecf13 Mon Sep 17 00:00:00 2001
From: Ivan Delalande <colona@arista.com>
Date: Thu, 31 Aug 2017 09:59:39 -0700
Subject: tcp_diag: report TCP MD5 signing keys and addresses

Report TCP MD5 (RFC2385) signing keys, addresses and address prefixes to
processes with CAP_NET_ADMIN requesting INET_DIAG_INFO. Currently it is
not possible to retrieve these from the kernel once they have been
configured on sockets.

Signed-off-by: Ivan Delalande <colona@arista.com>
Acked-by: Eric Dumazet <edumazet@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/uapi/linux/inet_diag.h |   1 +
 include/uapi/linux/tcp.h       |   9 ++++
 net/ipv4/tcp_diag.c            | 109 ++++++++++++++++++++++++++++++++++++++---
 3 files changed, 113 insertions(+), 6 deletions(-)

(limited to 'include/uapi/linux')

diff --git a/include/uapi/linux/inet_diag.h b/include/uapi/linux/inet_diag.h
index 678496897a68..f52ff62bfabe 100644
--- a/include/uapi/linux/inet_diag.h
+++ b/include/uapi/linux/inet_diag.h
@@ -143,6 +143,7 @@ enum {
 	INET_DIAG_MARK,
 	INET_DIAG_BBRINFO,
 	INET_DIAG_CLASS_ID,
+	INET_DIAG_MD5SIG,
 	__INET_DIAG_MAX,
 };
 
diff --git a/include/uapi/linux/tcp.h b/include/uapi/linux/tcp.h
index 030e594bab45..15c25eccab2b 100644
--- a/include/uapi/linux/tcp.h
+++ b/include/uapi/linux/tcp.h
@@ -256,4 +256,13 @@ struct tcp_md5sig {
 	__u8	tcpm_key[TCP_MD5SIG_MAXKEYLEN];		/* key (binary) */
 };
 
+/* INET_DIAG_MD5SIG */
+struct tcp_diag_md5sig {
+	__u8	tcpm_family;
+	__u8	tcpm_prefixlen;
+	__u16	tcpm_keylen;
+	__be32	tcpm_addr[4];
+	__u8	tcpm_key[TCP_MD5SIG_MAXKEYLEN];
+};
+
 #endif /* _UAPI_LINUX_TCP_H */
diff --git a/net/ipv4/tcp_diag.c b/net/ipv4/tcp_diag.c
index a748c74aa8b7..abbf0edcf6c2 100644
--- a/net/ipv4/tcp_diag.c
+++ b/net/ipv4/tcp_diag.c
@@ -16,6 +16,7 @@
 
 #include <linux/tcp.h>
 
+#include <net/netlink.h>
 #include <net/tcp.h>
 
 static void tcp_diag_get_info(struct sock *sk, struct inet_diag_msg *r,
@@ -36,6 +37,100 @@ static void tcp_diag_get_info(struct sock *sk, struct inet_diag_msg *r,
 		tcp_get_info(sk, info);
 }
 
+#ifdef CONFIG_TCP_MD5SIG
+static void tcp_diag_md5sig_fill(struct tcp_diag_md5sig *info,
+				 const struct tcp_md5sig_key *key)
+{
+	info->tcpm_family = key->family;
+	info->tcpm_prefixlen = key->prefixlen;
+	info->tcpm_keylen = key->keylen;
+	memcpy(info->tcpm_key, key->key, key->keylen);
+
+	if (key->family == AF_INET)
+		info->tcpm_addr[0] = key->addr.a4.s_addr;
+	#if IS_ENABLED(CONFIG_IPV6)
+	else if (key->family == AF_INET6)
+		memcpy(&info->tcpm_addr, &key->addr.a6,
+		       sizeof(info->tcpm_addr));
+	#endif
+}
+
+static int tcp_diag_put_md5sig(struct sk_buff *skb,
+			       const struct tcp_md5sig_info *md5sig)
+{
+	const struct tcp_md5sig_key *key;
+	struct tcp_diag_md5sig *info;
+	struct nlattr *attr;
+	int md5sig_count = 0;
+
+	hlist_for_each_entry_rcu(key, &md5sig->head, node)
+		md5sig_count++;
+	if (md5sig_count == 0)
+		return 0;
+
+	attr = nla_reserve(skb, INET_DIAG_MD5SIG,
+			   md5sig_count * sizeof(struct tcp_diag_md5sig));
+	if (!attr)
+		return -EMSGSIZE;
+
+	info = nla_data(attr);
+	memset(info, 0, md5sig_count * sizeof(struct tcp_diag_md5sig));
+	hlist_for_each_entry_rcu(key, &md5sig->head, node) {
+		tcp_diag_md5sig_fill(info++, key);
+		if (--md5sig_count == 0)
+			break;
+	}
+
+	return 0;
+}
+#endif
+
+static int tcp_diag_get_aux(struct sock *sk, bool net_admin,
+			    struct sk_buff *skb)
+{
+#ifdef CONFIG_TCP_MD5SIG
+	if (net_admin) {
+		struct tcp_md5sig_info *md5sig;
+		int err = 0;
+
+		rcu_read_lock();
+		md5sig = rcu_dereference(tcp_sk(sk)->md5sig_info);
+		if (md5sig)
+			err = tcp_diag_put_md5sig(skb, md5sig);
+		rcu_read_unlock();
+		if (err < 0)
+			return err;
+	}
+#endif
+
+	return 0;
+}
+
+static size_t tcp_diag_get_aux_size(struct sock *sk, bool net_admin)
+{
+	size_t size = 0;
+
+#ifdef CONFIG_TCP_MD5SIG
+	if (net_admin && sk_fullsock(sk)) {
+		const struct tcp_md5sig_info *md5sig;
+		const struct tcp_md5sig_key *key;
+		size_t md5sig_count = 0;
+
+		rcu_read_lock();
+		md5sig = rcu_dereference(tcp_sk(sk)->md5sig_info);
+		if (md5sig) {
+			hlist_for_each_entry_rcu(key, &md5sig->head, node)
+				md5sig_count++;
+		}
+		rcu_read_unlock();
+		size += nla_total_size(md5sig_count *
+				       sizeof(struct tcp_diag_md5sig));
+	}
+#endif
+
+	return size;
+}
+
 static void tcp_diag_dump(struct sk_buff *skb, struct netlink_callback *cb,
 			  const struct inet_diag_req_v2 *r, struct nlattr *bc)
 {
@@ -68,13 +163,15 @@ static int tcp_diag_destroy(struct sk_buff *in_skb,
 #endif
 
 static const struct inet_diag_handler tcp_diag_handler = {
-	.dump		 = tcp_diag_dump,
-	.dump_one	 = tcp_diag_dump_one,
-	.idiag_get_info	 = tcp_diag_get_info,
-	.idiag_type	 = IPPROTO_TCP,
-	.idiag_info_size = sizeof(struct tcp_info),
+	.dump			= tcp_diag_dump,
+	.dump_one		= tcp_diag_dump_one,
+	.idiag_get_info		= tcp_diag_get_info,
+	.idiag_get_aux		= tcp_diag_get_aux,
+	.idiag_get_aux_size	= tcp_diag_get_aux_size,
+	.idiag_type		= IPPROTO_TCP,
+	.idiag_info_size	= sizeof(struct tcp_info),
 #ifdef CONFIG_INET_DIAG_DESTROY
-	.destroy	 = tcp_diag_destroy,
+	.destroy		= tcp_diag_destroy,
 #endif
 };
 
-- 
cgit v1.2.3-71-gd317


From bea74641e3786d51dcf1175527cc1781420961c9 Mon Sep 17 00:00:00 2001
From: Vishwanath Pai <vpai@akamai.com>
Date: Fri, 18 Aug 2017 20:58:59 +0200
Subject: netfilter: xt_hashlimit: add rate match mode

This patch adds a new feature to hashlimit that allows matching on the
current packet/byte rate without rate limiting. This can be enabled
with a new flag --hashlimit-rate-match. The match returns true if the
current rate of packets is above/below the user specified value.

The main difference between the existing algorithm and the new one is
that the existing algorithm rate-limits the flow whereas the new
algorithm does not. Instead it *classifies* the flow based on whether
it is above or below a certain rate. I will demonstrate this with an
example below. Let us assume this rule:

iptables -A INPUT -m hashlimit --hashlimit-above 10/s -j new_chain

If the packet rate is 15/s, the existing algorithm would ACCEPT 10
packets every second and send 5 packets to "new_chain".

But with the new algorithm, as long as the rate of 15/s is sustained,
all packets will continue to match and every packet is sent to new_chain.

This new functionality will let us classify different flows based on
their current rate, so that further decisions can be made on them based on
what the current rate is.

This is how the new algorithm works:
We divide time into intervals of 1 (sec/min/hour) as specified by
the user. We keep track of the number of packets/bytes processed in the
current interval. After each interval we reset the counter to 0.

When we receive a packet for match, we look at the packet rate
during the current interval and the previous interval to make a
decision:

if [ prev_rate < user and cur_rate < user ]
        return Below
else
        return Above

Where cur_rate is the number of packets/bytes seen in the current
interval, prev is the number of packets/bytes seen in the previous
interval and 'user' is the rate specified by the user.

We also provide flexibility to the user for choosing the time
interval using the option --hashilmit-interval. For example the user can
keep a low rate like x/hour but still keep the interval as small as 1
second.

To preserve backwards compatibility we have to add this feature in a new
revision, so I've created revision 3 for hashlimit. The two new options
we add are:

--hashlimit-rate-match
--hashlimit-rate-interval

I have updated the help text to add these new options. Also added a few
tests for the new options.

Suggested-by: Igor Lubashev <ilubashe@akamai.com>
Reviewed-by: Josh Hunt <johunt@akamai.com>
Signed-off-by: Vishwanath Pai <vpai@akamai.com>
Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
---
 include/linux/netfilter/xt_hashlimit.h      |   3 +-
 include/uapi/linux/netfilter/xt_hashlimit.h |  36 +++-
 net/netfilter/xt_hashlimit.c                | 277 +++++++++++++++++++++++++---
 3 files changed, 285 insertions(+), 31 deletions(-)

(limited to 'include/uapi/linux')

diff --git a/include/linux/netfilter/xt_hashlimit.h b/include/linux/netfilter/xt_hashlimit.h
index 074790c0cf74..0fc458bde80b 100644
--- a/include/linux/netfilter/xt_hashlimit.h
+++ b/include/linux/netfilter/xt_hashlimit.h
@@ -5,5 +5,6 @@
 
 #define XT_HASHLIMIT_ALL (XT_HASHLIMIT_HASH_DIP | XT_HASHLIMIT_HASH_DPT | \
 			  XT_HASHLIMIT_HASH_SIP | XT_HASHLIMIT_HASH_SPT | \
-			  XT_HASHLIMIT_INVERT | XT_HASHLIMIT_BYTES)
+			  XT_HASHLIMIT_INVERT | XT_HASHLIMIT_BYTES |\
+			  XT_HASHLIMIT_RATE_MATCH)
 #endif /*_XT_HASHLIMIT_H*/
diff --git a/include/uapi/linux/netfilter/xt_hashlimit.h b/include/uapi/linux/netfilter/xt_hashlimit.h
index 79da349f1060..aa98573248b1 100644
--- a/include/uapi/linux/netfilter/xt_hashlimit.h
+++ b/include/uapi/linux/netfilter/xt_hashlimit.h
@@ -19,12 +19,13 @@
 struct xt_hashlimit_htable;
 
 enum {
-	XT_HASHLIMIT_HASH_DIP = 1 << 0,
-	XT_HASHLIMIT_HASH_DPT = 1 << 1,
-	XT_HASHLIMIT_HASH_SIP = 1 << 2,
-	XT_HASHLIMIT_HASH_SPT = 1 << 3,
-	XT_HASHLIMIT_INVERT   = 1 << 4,
-	XT_HASHLIMIT_BYTES    = 1 << 5,
+	XT_HASHLIMIT_HASH_DIP		= 1 << 0,
+	XT_HASHLIMIT_HASH_DPT		= 1 << 1,
+	XT_HASHLIMIT_HASH_SIP		= 1 << 2,
+	XT_HASHLIMIT_HASH_SPT		= 1 << 3,
+	XT_HASHLIMIT_INVERT		= 1 << 4,
+	XT_HASHLIMIT_BYTES		= 1 << 5,
+	XT_HASHLIMIT_RATE_MATCH		= 1 << 6,
 };
 
 struct hashlimit_cfg {
@@ -79,6 +80,21 @@ struct hashlimit_cfg2 {
 	__u8 srcmask, dstmask;
 };
 
+struct hashlimit_cfg3 {
+	__u64 avg;		/* Average secs between packets * scale */
+	__u64 burst;		/* Period multiplier for upper limit. */
+	__u32 mode;		/* bitmask of XT_HASHLIMIT_HASH_* */
+
+	/* user specified */
+	__u32 size;		/* how many buckets */
+	__u32 max;		/* max number of entries */
+	__u32 gc_interval;	/* gc interval */
+	__u32 expire;		/* when do entries expire? */
+
+	__u32 interval;
+	__u8 srcmask, dstmask;
+};
+
 struct xt_hashlimit_mtinfo1 {
 	char name[IFNAMSIZ];
 	struct hashlimit_cfg1 cfg;
@@ -95,4 +111,12 @@ struct xt_hashlimit_mtinfo2 {
 	struct xt_hashlimit_htable *hinfo __attribute__((aligned(8)));
 };
 
+struct xt_hashlimit_mtinfo3 {
+	char name[NAME_MAX];
+	struct hashlimit_cfg3 cfg;
+
+	/* Used internally by the kernel */
+	struct xt_hashlimit_htable *hinfo __attribute__((aligned(8)));
+};
+
 #endif /* _UAPI_XT_HASHLIMIT_H */
diff --git a/net/netfilter/xt_hashlimit.c b/net/netfilter/xt_hashlimit.c
index ffdb611e54a2..10d48234f5f4 100644
--- a/net/netfilter/xt_hashlimit.c
+++ b/net/netfilter/xt_hashlimit.c
@@ -56,6 +56,7 @@ static inline struct hashlimit_net *hashlimit_pernet(struct net *net)
 }
 
 /* need to declare this at the top */
+static const struct file_operations dl_file_ops_v2;
 static const struct file_operations dl_file_ops_v1;
 static const struct file_operations dl_file_ops;
 
@@ -87,8 +88,19 @@ struct dsthash_ent {
 	unsigned long expires;		/* precalculated expiry time */
 	struct {
 		unsigned long prev;	/* last modification */
-		u_int64_t credit;
-		u_int64_t credit_cap, cost;
+		union {
+			struct {
+				u_int64_t credit;
+				u_int64_t credit_cap;
+				u_int64_t cost;
+			};
+			struct {
+				u_int32_t interval, prev_window;
+				u_int64_t current_rate;
+				u_int64_t rate;
+				int64_t burst;
+			};
+		};
 	} rateinfo;
 	struct rcu_head rcu;
 };
@@ -99,7 +111,7 @@ struct xt_hashlimit_htable {
 	u_int8_t family;
 	bool rnd_initialized;
 
-	struct hashlimit_cfg2 cfg;	/* config */
+	struct hashlimit_cfg3 cfg;	/* config */
 
 	/* used internally */
 	spinlock_t lock;		/* lock for list_head */
@@ -116,10 +128,10 @@ struct xt_hashlimit_htable {
 };
 
 static int
-cfg_copy(struct hashlimit_cfg2 *to, void *from, int revision)
+cfg_copy(struct hashlimit_cfg3 *to, const void *from, int revision)
 {
 	if (revision == 1) {
-		struct hashlimit_cfg1 *cfg = from;
+		struct hashlimit_cfg1 *cfg = (struct hashlimit_cfg1 *)from;
 
 		to->mode = cfg->mode;
 		to->avg = cfg->avg;
@@ -131,7 +143,19 @@ cfg_copy(struct hashlimit_cfg2 *to, void *from, int revision)
 		to->srcmask = cfg->srcmask;
 		to->dstmask = cfg->dstmask;
 	} else if (revision == 2) {
-		memcpy(to, from, sizeof(struct hashlimit_cfg2));
+		struct hashlimit_cfg2 *cfg = (struct hashlimit_cfg2 *)from;
+
+		to->mode = cfg->mode;
+		to->avg = cfg->avg;
+		to->burst = cfg->burst;
+		to->size = cfg->size;
+		to->max = cfg->max;
+		to->gc_interval = cfg->gc_interval;
+		to->expire = cfg->expire;
+		to->srcmask = cfg->srcmask;
+		to->dstmask = cfg->dstmask;
+	} else if (revision == 3) {
+		memcpy(to, from, sizeof(struct hashlimit_cfg3));
 	} else {
 		return -EINVAL;
 	}
@@ -240,13 +264,14 @@ dsthash_free(struct xt_hashlimit_htable *ht, struct dsthash_ent *ent)
 }
 static void htable_gc(struct work_struct *work);
 
-static int htable_create(struct net *net, struct hashlimit_cfg2 *cfg,
+static int htable_create(struct net *net, struct hashlimit_cfg3 *cfg,
 			 const char *name, u_int8_t family,
 			 struct xt_hashlimit_htable **out_hinfo,
 			 int revision)
 {
 	struct hashlimit_net *hashlimit_net = hashlimit_pernet(net);
 	struct xt_hashlimit_htable *hinfo;
+	const struct file_operations *fops;
 	unsigned int size, i;
 	int ret;
 
@@ -268,7 +293,7 @@ static int htable_create(struct net *net, struct hashlimit_cfg2 *cfg,
 	*out_hinfo = hinfo;
 
 	/* copy match config into hashtable config */
-	ret = cfg_copy(&hinfo->cfg, (void *)cfg, 2);
+	ret = cfg_copy(&hinfo->cfg, (void *)cfg, 3);
 
 	if (ret)
 		return ret;
@@ -293,11 +318,21 @@ static int htable_create(struct net *net, struct hashlimit_cfg2 *cfg,
 	}
 	spin_lock_init(&hinfo->lock);
 
+	switch (revision) {
+	case 1:
+		fops = &dl_file_ops_v1;
+		break;
+	case 2:
+		fops = &dl_file_ops_v2;
+		break;
+	default:
+		fops = &dl_file_ops;
+	}
+
 	hinfo->pde = proc_create_data(name, 0,
 		(family == NFPROTO_IPV4) ?
 		hashlimit_net->ipt_hashlimit : hashlimit_net->ip6t_hashlimit,
-		(revision == 1) ? &dl_file_ops_v1 : &dl_file_ops,
-		hinfo);
+		fops, hinfo);
 	if (hinfo->pde == NULL) {
 		kfree(hinfo->name);
 		vfree(hinfo);
@@ -482,6 +517,25 @@ static u32 user2credits_byte(u32 user)
 	return (u32) (us >> 32);
 }
 
+static u64 user2rate(u64 user)
+{
+	if (user != 0) {
+		return div64_u64(XT_HASHLIMIT_SCALE_v2, user);
+	} else {
+		pr_warn("invalid rate from userspace: %llu\n", user);
+		return 0;
+	}
+}
+
+static u64 user2rate_bytes(u64 user)
+{
+	u64 r;
+
+	r = user ? 0xFFFFFFFFULL / user : 0xFFFFFFFFULL;
+	r = (r - 1) << 4;
+	return r;
+}
+
 static void rateinfo_recalc(struct dsthash_ent *dh, unsigned long now,
 			    u32 mode, int revision)
 {
@@ -491,6 +545,21 @@ static void rateinfo_recalc(struct dsthash_ent *dh, unsigned long now,
 	if (delta == 0)
 		return;
 
+	if (revision >= 3 && mode & XT_HASHLIMIT_RATE_MATCH) {
+		u64 interval = dh->rateinfo.interval * HZ;
+
+		if (delta < interval)
+			return;
+
+		dh->rateinfo.prev = now;
+		dh->rateinfo.prev_window =
+			((dh->rateinfo.current_rate * interval) >
+			 (delta * dh->rateinfo.rate));
+		dh->rateinfo.current_rate = 0;
+
+		return;
+	}
+
 	dh->rateinfo.prev = now;
 
 	if (mode & XT_HASHLIMIT_BYTES) {
@@ -515,7 +584,23 @@ static void rateinfo_init(struct dsthash_ent *dh,
 			  struct xt_hashlimit_htable *hinfo, int revision)
 {
 	dh->rateinfo.prev = jiffies;
-	if (hinfo->cfg.mode & XT_HASHLIMIT_BYTES) {
+	if (revision >= 3 && hinfo->cfg.mode & XT_HASHLIMIT_RATE_MATCH) {
+		dh->rateinfo.prev_window = 0;
+		dh->rateinfo.current_rate = 0;
+		if (hinfo->cfg.mode & XT_HASHLIMIT_BYTES) {
+			dh->rateinfo.rate = user2rate_bytes(hinfo->cfg.avg);
+			if (hinfo->cfg.burst)
+				dh->rateinfo.burst =
+					hinfo->cfg.burst * dh->rateinfo.rate;
+			else
+				dh->rateinfo.burst = dh->rateinfo.rate;
+		} else {
+			dh->rateinfo.rate = user2rate(hinfo->cfg.avg);
+			dh->rateinfo.burst =
+				hinfo->cfg.burst + dh->rateinfo.rate;
+		}
+		dh->rateinfo.interval = hinfo->cfg.interval;
+	} else if (hinfo->cfg.mode & XT_HASHLIMIT_BYTES) {
 		dh->rateinfo.credit = CREDITS_PER_JIFFY_BYTES * HZ;
 		dh->rateinfo.cost = user2credits_byte(hinfo->cfg.avg);
 		dh->rateinfo.credit_cap = hinfo->cfg.burst;
@@ -648,7 +733,7 @@ static u32 hashlimit_byte_cost(unsigned int len, struct dsthash_ent *dh)
 static bool
 hashlimit_mt_common(const struct sk_buff *skb, struct xt_action_param *par,
 		    struct xt_hashlimit_htable *hinfo,
-		    const struct hashlimit_cfg2 *cfg, int revision)
+		    const struct hashlimit_cfg3 *cfg, int revision)
 {
 	unsigned long now = jiffies;
 	struct dsthash_ent *dh;
@@ -680,6 +765,20 @@ hashlimit_mt_common(const struct sk_buff *skb, struct xt_action_param *par,
 		rateinfo_recalc(dh, now, hinfo->cfg.mode, revision);
 	}
 
+	if (cfg->mode & XT_HASHLIMIT_RATE_MATCH) {
+		cost = (cfg->mode & XT_HASHLIMIT_BYTES) ? skb->len : 1;
+		dh->rateinfo.current_rate += cost;
+
+		if (!dh->rateinfo.prev_window &&
+		    (dh->rateinfo.current_rate <= dh->rateinfo.burst)) {
+			spin_unlock(&dh->lock);
+			rcu_read_unlock_bh();
+			return !(cfg->mode & XT_HASHLIMIT_INVERT);
+		} else {
+			goto overlimit;
+		}
+	}
+
 	if (cfg->mode & XT_HASHLIMIT_BYTES)
 		cost = hashlimit_byte_cost(skb->len, dh);
 	else
@@ -693,6 +792,7 @@ hashlimit_mt_common(const struct sk_buff *skb, struct xt_action_param *par,
 		return !(cfg->mode & XT_HASHLIMIT_INVERT);
 	}
 
+overlimit:
 	spin_unlock(&dh->lock);
 	local_bh_enable();
 	/* default match is underlimit - so over the limit, we need to invert */
@@ -708,7 +808,7 @@ hashlimit_mt_v1(const struct sk_buff *skb, struct xt_action_param *par)
 {
 	const struct xt_hashlimit_mtinfo1 *info = par->matchinfo;
 	struct xt_hashlimit_htable *hinfo = info->hinfo;
-	struct hashlimit_cfg2 cfg = {};
+	struct hashlimit_cfg3 cfg = {};
 	int ret;
 
 	ret = cfg_copy(&cfg, (void *)&info->cfg, 1);
@@ -720,17 +820,33 @@ hashlimit_mt_v1(const struct sk_buff *skb, struct xt_action_param *par)
 }
 
 static bool
-hashlimit_mt(const struct sk_buff *skb, struct xt_action_param *par)
+hashlimit_mt_v2(const struct sk_buff *skb, struct xt_action_param *par)
 {
 	const struct xt_hashlimit_mtinfo2 *info = par->matchinfo;
 	struct xt_hashlimit_htable *hinfo = info->hinfo;
+	struct hashlimit_cfg3 cfg = {};
+	int ret;
+
+	ret = cfg_copy(&cfg, (void *)&info->cfg, 2);
+
+	if (ret)
+		return ret;
+
+	return hashlimit_mt_common(skb, par, hinfo, &cfg, 2);
+}
+
+static bool
+hashlimit_mt(const struct sk_buff *skb, struct xt_action_param *par)
+{
+	const struct xt_hashlimit_mtinfo3 *info = par->matchinfo;
+	struct xt_hashlimit_htable *hinfo = info->hinfo;
 
-	return hashlimit_mt_common(skb, par, hinfo, &info->cfg, 2);
+	return hashlimit_mt_common(skb, par, hinfo, &info->cfg, 3);
 }
 
 static int hashlimit_mt_check_common(const struct xt_mtchk_param *par,
 				     struct xt_hashlimit_htable **hinfo,
-				     struct hashlimit_cfg2 *cfg,
+				     struct hashlimit_cfg3 *cfg,
 				     const char *name, int revision)
 {
 	struct net *net = par->net;
@@ -753,7 +869,17 @@ static int hashlimit_mt_check_common(const struct xt_mtchk_param *par,
 	}
 
 	/* Check for overflow. */
-	if (cfg->mode & XT_HASHLIMIT_BYTES) {
+	if (revision >= 3 && cfg->mode & XT_HASHLIMIT_RATE_MATCH) {
+		if (cfg->avg == 0) {
+			pr_info("hashlimit invalid rate\n");
+			return -ERANGE;
+		}
+
+		if (cfg->interval == 0) {
+			pr_info("hashlimit invalid interval\n");
+			return -EINVAL;
+		}
+	} else if (cfg->mode & XT_HASHLIMIT_BYTES) {
 		if (user2credits_byte(cfg->avg) == 0) {
 			pr_info("overflow, rate too high: %llu\n", cfg->avg);
 			return -EINVAL;
@@ -784,7 +910,7 @@ static int hashlimit_mt_check_common(const struct xt_mtchk_param *par,
 static int hashlimit_mt_check_v1(const struct xt_mtchk_param *par)
 {
 	struct xt_hashlimit_mtinfo1 *info = par->matchinfo;
-	struct hashlimit_cfg2 cfg = {};
+	struct hashlimit_cfg3 cfg = {};
 	int ret;
 
 	if (info->name[sizeof(info->name) - 1] != '\0')
@@ -799,15 +925,40 @@ static int hashlimit_mt_check_v1(const struct xt_mtchk_param *par)
 					 &cfg, info->name, 1);
 }
 
-static int hashlimit_mt_check(const struct xt_mtchk_param *par)
+static int hashlimit_mt_check_v2(const struct xt_mtchk_param *par)
 {
 	struct xt_hashlimit_mtinfo2 *info = par->matchinfo;
+	struct hashlimit_cfg3 cfg = {};
+	int ret;
+
+	if (info->name[sizeof(info->name) - 1] != '\0')
+		return -EINVAL;
+
+	ret = cfg_copy(&cfg, (void *)&info->cfg, 2);
+
+	if (ret)
+		return ret;
+
+	return hashlimit_mt_check_common(par, &info->hinfo,
+					 &cfg, info->name, 2);
+}
+
+static int hashlimit_mt_check(const struct xt_mtchk_param *par)
+{
+	struct xt_hashlimit_mtinfo3 *info = par->matchinfo;
 
 	if (info->name[sizeof(info->name) - 1] != '\0')
 		return -EINVAL;
 
 	return hashlimit_mt_check_common(par, &info->hinfo, &info->cfg,
-					 info->name, 2);
+					 info->name, 3);
+}
+
+static void hashlimit_mt_destroy_v2(const struct xt_mtdtor_param *par)
+{
+	const struct xt_hashlimit_mtinfo2 *info = par->matchinfo;
+
+	htable_put(info->hinfo);
 }
 
 static void hashlimit_mt_destroy_v1(const struct xt_mtdtor_param *par)
@@ -819,7 +970,7 @@ static void hashlimit_mt_destroy_v1(const struct xt_mtdtor_param *par)
 
 static void hashlimit_mt_destroy(const struct xt_mtdtor_param *par)
 {
-	const struct xt_hashlimit_mtinfo2 *info = par->matchinfo;
+	const struct xt_hashlimit_mtinfo3 *info = par->matchinfo;
 
 	htable_put(info->hinfo);
 }
@@ -840,9 +991,20 @@ static struct xt_match hashlimit_mt_reg[] __read_mostly = {
 		.name           = "hashlimit",
 		.revision       = 2,
 		.family         = NFPROTO_IPV4,
-		.match          = hashlimit_mt,
+		.match          = hashlimit_mt_v2,
 		.matchsize      = sizeof(struct xt_hashlimit_mtinfo2),
 		.usersize	= offsetof(struct xt_hashlimit_mtinfo2, hinfo),
+		.checkentry     = hashlimit_mt_check_v2,
+		.destroy        = hashlimit_mt_destroy_v2,
+		.me             = THIS_MODULE,
+	},
+	{
+		.name           = "hashlimit",
+		.revision       = 3,
+		.family         = NFPROTO_IPV4,
+		.match          = hashlimit_mt,
+		.matchsize      = sizeof(struct xt_hashlimit_mtinfo3),
+		.usersize	= offsetof(struct xt_hashlimit_mtinfo3, hinfo),
 		.checkentry     = hashlimit_mt_check,
 		.destroy        = hashlimit_mt_destroy,
 		.me             = THIS_MODULE,
@@ -863,9 +1025,20 @@ static struct xt_match hashlimit_mt_reg[] __read_mostly = {
 		.name           = "hashlimit",
 		.revision       = 2,
 		.family         = NFPROTO_IPV6,
-		.match          = hashlimit_mt,
+		.match          = hashlimit_mt_v2,
 		.matchsize      = sizeof(struct xt_hashlimit_mtinfo2),
 		.usersize	= offsetof(struct xt_hashlimit_mtinfo2, hinfo),
+		.checkentry     = hashlimit_mt_check_v2,
+		.destroy        = hashlimit_mt_destroy_v2,
+		.me             = THIS_MODULE,
+	},
+	{
+		.name           = "hashlimit",
+		.revision       = 3,
+		.family         = NFPROTO_IPV6,
+		.match          = hashlimit_mt,
+		.matchsize      = sizeof(struct xt_hashlimit_mtinfo3),
+		.usersize	= offsetof(struct xt_hashlimit_mtinfo3, hinfo),
 		.checkentry     = hashlimit_mt_check,
 		.destroy        = hashlimit_mt_destroy,
 		.me             = THIS_MODULE,
@@ -947,6 +1120,21 @@ static void dl_seq_print(struct dsthash_ent *ent, u_int8_t family,
 	}
 }
 
+static int dl_seq_real_show_v2(struct dsthash_ent *ent, u_int8_t family,
+			       struct seq_file *s)
+{
+	const struct xt_hashlimit_htable *ht = s->private;
+
+	spin_lock(&ent->lock);
+	/* recalculate to show accurate numbers */
+	rateinfo_recalc(ent, jiffies, ht->cfg.mode, 2);
+
+	dl_seq_print(ent, family, s);
+
+	spin_unlock(&ent->lock);
+	return seq_has_overflowed(s);
+}
+
 static int dl_seq_real_show_v1(struct dsthash_ent *ent, u_int8_t family,
 			       struct seq_file *s)
 {
@@ -969,7 +1157,7 @@ static int dl_seq_real_show(struct dsthash_ent *ent, u_int8_t family,
 
 	spin_lock(&ent->lock);
 	/* recalculate to show accurate numbers */
-	rateinfo_recalc(ent, jiffies, ht->cfg.mode, 2);
+	rateinfo_recalc(ent, jiffies, ht->cfg.mode, 3);
 
 	dl_seq_print(ent, family, s);
 
@@ -977,6 +1165,20 @@ static int dl_seq_real_show(struct dsthash_ent *ent, u_int8_t family,
 	return seq_has_overflowed(s);
 }
 
+static int dl_seq_show_v2(struct seq_file *s, void *v)
+{
+	struct xt_hashlimit_htable *htable = s->private;
+	unsigned int *bucket = (unsigned int *)v;
+	struct dsthash_ent *ent;
+
+	if (!hlist_empty(&htable->hash[*bucket])) {
+		hlist_for_each_entry(ent, &htable->hash[*bucket], node)
+			if (dl_seq_real_show_v2(ent, htable->family, s))
+				return -1;
+	}
+	return 0;
+}
+
 static int dl_seq_show_v1(struct seq_file *s, void *v)
 {
 	struct xt_hashlimit_htable *htable = s->private;
@@ -1012,6 +1214,13 @@ static const struct seq_operations dl_seq_ops_v1 = {
 	.show  = dl_seq_show_v1
 };
 
+static const struct seq_operations dl_seq_ops_v2 = {
+	.start = dl_seq_start,
+	.next  = dl_seq_next,
+	.stop  = dl_seq_stop,
+	.show  = dl_seq_show_v2
+};
+
 static const struct seq_operations dl_seq_ops = {
 	.start = dl_seq_start,
 	.next  = dl_seq_next,
@@ -1019,6 +1228,18 @@ static const struct seq_operations dl_seq_ops = {
 	.show  = dl_seq_show
 };
 
+static int dl_proc_open_v2(struct inode *inode, struct file *file)
+{
+	int ret = seq_open(file, &dl_seq_ops_v2);
+
+	if (!ret) {
+		struct seq_file *sf = file->private_data;
+
+		sf->private = PDE_DATA(inode);
+	}
+	return ret;
+}
+
 static int dl_proc_open_v1(struct inode *inode, struct file *file)
 {
 	int ret = seq_open(file, &dl_seq_ops_v1);
@@ -1042,6 +1263,14 @@ static int dl_proc_open(struct inode *inode, struct file *file)
 	return ret;
 }
 
+static const struct file_operations dl_file_ops_v2 = {
+	.owner   = THIS_MODULE,
+	.open    = dl_proc_open_v2,
+	.read    = seq_read,
+	.llseek  = seq_lseek,
+	.release = seq_release
+};
+
 static const struct file_operations dl_file_ops_v1 = {
 	.owner   = THIS_MODULE,
 	.open    = dl_proc_open_v1,
-- 
cgit v1.2.3-71-gd317


From a691205571723cb0544110ca91653ac4b0eb5b17 Mon Sep 17 00:00:00 2001
From: "Pablo M. Bermudo Garay" <pablombg@gmail.com>
Date: Wed, 23 Aug 2017 22:41:25 +0200
Subject: netfilter: nft_limit: add stateful object type

Register a new limit stateful object type into the stateful object
infrastructure.

Signed-off-by: Pablo M. Bermudo Garay <pablombg@gmail.com>
Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
---
 include/uapi/linux/netfilter/nf_tables.h |   3 +-
 net/netfilter/nft_limit.c                | 122 ++++++++++++++++++++++++++++++-
 2 files changed, 123 insertions(+), 2 deletions(-)

(limited to 'include/uapi/linux')

diff --git a/include/uapi/linux/netfilter/nf_tables.h b/include/uapi/linux/netfilter/nf_tables.h
index b49da72efa68..871afa4871bf 100644
--- a/include/uapi/linux/netfilter/nf_tables.h
+++ b/include/uapi/linux/netfilter/nf_tables.h
@@ -1282,7 +1282,8 @@ enum nft_ct_helper_attributes {
 #define NFT_OBJECT_COUNTER	1
 #define NFT_OBJECT_QUOTA	2
 #define NFT_OBJECT_CT_HELPER	3
-#define __NFT_OBJECT_MAX	4
+#define NFT_OBJECT_LIMIT	4
+#define __NFT_OBJECT_MAX	5
 #define NFT_OBJECT_MAX		(__NFT_OBJECT_MAX - 1)
 
 /**
diff --git a/net/netfilter/nft_limit.c b/net/netfilter/nft_limit.c
index aae2d1ec27f3..a9fc298ef4c3 100644
--- a/net/netfilter/nft_limit.c
+++ b/net/netfilter/nft_limit.c
@@ -229,14 +229,133 @@ static struct nft_expr_type nft_limit_type __read_mostly = {
 	.owner		= THIS_MODULE,
 };
 
+static void nft_limit_obj_pkts_eval(struct nft_object *obj,
+				    struct nft_regs *regs,
+				    const struct nft_pktinfo *pkt)
+{
+	struct nft_limit_pkts *priv = nft_obj_data(obj);
+
+	if (nft_limit_eval(&priv->limit, priv->cost))
+		regs->verdict.code = NFT_BREAK;
+}
+
+static int nft_limit_obj_pkts_init(const struct nft_ctx *ctx,
+				   const struct nlattr * const tb[],
+				   struct nft_object *obj)
+{
+	struct nft_limit_pkts *priv = nft_obj_data(obj);
+	int err;
+
+	err = nft_limit_init(&priv->limit, tb);
+	if (err < 0)
+		return err;
+
+	priv->cost = div64_u64(priv->limit.nsecs, priv->limit.rate);
+	return 0;
+}
+
+static int nft_limit_obj_pkts_dump(struct sk_buff *skb,
+				   struct nft_object *obj,
+				   bool reset)
+{
+	const struct nft_limit_pkts *priv = nft_obj_data(obj);
+
+	return nft_limit_dump(skb, &priv->limit, NFT_LIMIT_PKTS);
+}
+
+static struct nft_object_type nft_limit_obj_type;
+static const struct nft_object_ops nft_limit_obj_pkts_ops = {
+	.type		= &nft_limit_obj_type,
+	.size		= NFT_EXPR_SIZE(sizeof(struct nft_limit_pkts)),
+	.init		= nft_limit_obj_pkts_init,
+	.eval		= nft_limit_obj_pkts_eval,
+	.dump		= nft_limit_obj_pkts_dump,
+};
+
+static void nft_limit_obj_bytes_eval(struct nft_object *obj,
+				     struct nft_regs *regs,
+				     const struct nft_pktinfo *pkt)
+{
+	struct nft_limit *priv = nft_obj_data(obj);
+	u64 cost = div64_u64(priv->nsecs * pkt->skb->len, priv->rate);
+
+	if (nft_limit_eval(priv, cost))
+		regs->verdict.code = NFT_BREAK;
+}
+
+static int nft_limit_obj_bytes_init(const struct nft_ctx *ctx,
+				    const struct nlattr * const tb[],
+				    struct nft_object *obj)
+{
+	struct nft_limit *priv = nft_obj_data(obj);
+
+	return nft_limit_init(priv, tb);
+}
+
+static int nft_limit_obj_bytes_dump(struct sk_buff *skb,
+				    struct nft_object *obj,
+				    bool reset)
+{
+	const struct nft_limit *priv = nft_obj_data(obj);
+
+	return nft_limit_dump(skb, priv, NFT_LIMIT_PKT_BYTES);
+}
+
+static struct nft_object_type nft_limit_obj_type;
+static const struct nft_object_ops nft_limit_obj_bytes_ops = {
+	.type		= &nft_limit_obj_type,
+	.size		= sizeof(struct nft_limit),
+	.init		= nft_limit_obj_bytes_init,
+	.eval		= nft_limit_obj_bytes_eval,
+	.dump		= nft_limit_obj_bytes_dump,
+};
+
+static const struct nft_object_ops *
+nft_limit_obj_select_ops(const struct nft_ctx *ctx,
+			 const struct nlattr * const tb[])
+{
+	if (!tb[NFTA_LIMIT_TYPE])
+		return &nft_limit_obj_pkts_ops;
+
+	switch (ntohl(nla_get_be32(tb[NFTA_LIMIT_TYPE]))) {
+	case NFT_LIMIT_PKTS:
+		return &nft_limit_obj_pkts_ops;
+	case NFT_LIMIT_PKT_BYTES:
+		return &nft_limit_obj_bytes_ops;
+	}
+	return ERR_PTR(-EOPNOTSUPP);
+}
+
+static struct nft_object_type nft_limit_obj_type __read_mostly = {
+	.select_ops	= nft_limit_obj_select_ops,
+	.type		= NFT_OBJECT_LIMIT,
+	.maxattr	= NFTA_LIMIT_MAX,
+	.policy		= nft_limit_policy,
+	.owner		= THIS_MODULE,
+};
+
 static int __init nft_limit_module_init(void)
 {
-	return nft_register_expr(&nft_limit_type);
+	int err;
+
+	err = nft_register_obj(&nft_limit_obj_type);
+	if (err < 0)
+		return err;
+
+	err = nft_register_expr(&nft_limit_type);
+	if (err < 0)
+		goto err1;
+
+	return 0;
+err1:
+	nft_unregister_obj(&nft_limit_obj_type);
+	return err;
 }
 
 static void __exit nft_limit_module_exit(void)
 {
 	nft_unregister_expr(&nft_limit_type);
+	nft_unregister_obj(&nft_limit_obj_type);
 }
 
 module_init(nft_limit_module_init);
@@ -245,3 +364,4 @@ module_exit(nft_limit_module_exit);
 MODULE_LICENSE("GPL");
 MODULE_AUTHOR("Patrick McHardy <kaber@trash.net>");
 MODULE_ALIAS_NFT_EXPR("limit");
+MODULE_ALIAS_NFT_OBJ(NFT_OBJECT_LIMIT);
-- 
cgit v1.2.3-71-gd317


From 2335ba704f32b855651d0cd15dd9b271ec565fb6 Mon Sep 17 00:00:00 2001
From: Pablo Neira Ayuso <pablo@netfilter.org>
Date: Sun, 3 Sep 2017 23:55:59 +0200
Subject: netlink: add NLM_F_NONREC flag for deletion requests

In the last NFWS in Faro, Portugal, we discussed that netlink is lacking
the semantics to request non recursive deletions, ie. do not delete an
object iff it has child objects that hang from this parent object that
the user requests to be deleted.

We need this new flag to solve a problem for the iptables-compat
backward compatibility utility, that runs iptables commands using the
existing nf_tables netlink interface. Specifically, custom chains in
iptables cannot be deleted if there are rules in it, however, nf_tables
allows to remove any chain that is populated with content. To sort out
this asymmetry, iptables-compat userspace sets this new NLM_F_NONREC
flag to obtain the same semantics that iptables provides.

This new flag should only be used for deletion requests. Note this new
flag value overlaps with the existing:

* NLM_F_ROOT for get requests.
* NLM_F_REPLACE for new requests.

However, those flags should not ever be used in deletion requests.

Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
---
 include/uapi/linux/netlink.h | 3 +++
 1 file changed, 3 insertions(+)

(limited to 'include/uapi/linux')

diff --git a/include/uapi/linux/netlink.h b/include/uapi/linux/netlink.h
index f4fc9c9e123d..e8af60a7c56d 100644
--- a/include/uapi/linux/netlink.h
+++ b/include/uapi/linux/netlink.h
@@ -69,6 +69,9 @@ struct nlmsghdr {
 #define NLM_F_CREATE	0x400	/* Create, if it does not exist	*/
 #define NLM_F_APPEND	0x800	/* Add to end of list		*/
 
+/* Modifiers to DELETE request */
+#define NLM_F_NONREC	0x100	/* Do not delete recursively	*/
+
 /* Flags for ACK message */
 #define NLM_F_CAPPED	0x100	/* request was capped */
 #define NLM_F_ACK_TLVS	0x200	/* extended ACK TVLs were included */
-- 
cgit v1.2.3-71-gd317


From c93022a72f01f8e53d6e1bc2a8d2c2824c2f36bc Mon Sep 17 00:00:00 2001
From: Mauro Carvalho Chehab <mchehab@s-opensource.com>
Date: Fri, 1 Sep 2017 05:43:39 -0400
Subject: media: ca.h: split typedefs from structs

Using typedefs inside the Kernel is against CodingStyle, and
there's no good usage here.

Just like we did at frontend.h, at commit 0df289a209e0 ("[media] dvb:
Get rid of typedev usage for enums"), let's keep those typedefs only
to provide userspace backward compatibility.

No functional changes.

Signed-off-by: Mauro Carvalho Chehab <mchehab@s-opensource.com>
---
 drivers/media/pci/ttpci/av7110.h    |  2 +-
 drivers/media/pci/ttpci/av7110_ca.c | 12 ++++-----
 include/uapi/linux/dvb/ca.h         | 51 +++++++++++++++++++++++--------------
 3 files changed, 39 insertions(+), 26 deletions(-)

(limited to 'include/uapi/linux')

diff --git a/drivers/media/pci/ttpci/av7110.h b/drivers/media/pci/ttpci/av7110.h
index 824c1e262fbb..347827925c14 100644
--- a/drivers/media/pci/ttpci/av7110.h
+++ b/drivers/media/pci/ttpci/av7110.h
@@ -177,7 +177,7 @@ struct av7110 {
 
 	/* CA */
 
-	ca_slot_info_t		ci_slot[2];
+	struct ca_slot_info	ci_slot[2];
 
 	enum av7110_video_mode	vidmode;
 	struct dmxdev		dmxdev;
diff --git a/drivers/media/pci/ttpci/av7110_ca.c b/drivers/media/pci/ttpci/av7110_ca.c
index f64723aea56b..1fe49171d823 100644
--- a/drivers/media/pci/ttpci/av7110_ca.c
+++ b/drivers/media/pci/ttpci/av7110_ca.c
@@ -119,7 +119,7 @@ static void ci_ll_release(struct dvb_ringbuffer *cirbuf, struct dvb_ringbuffer *
 }
 
 static int ci_ll_reset(struct dvb_ringbuffer *cibuf, struct file *file,
-		       int slots, ca_slot_info_t *slot)
+		       int slots, struct ca_slot_info *slot)
 {
 	int i;
 	int len = 0;
@@ -264,7 +264,7 @@ static int dvb_ca_ioctl(struct file *file, unsigned int cmd, void *parg)
 		break;
 	case CA_GET_CAP:
 	{
-		ca_caps_t cap;
+		struct ca_caps cap;
 
 		cap.slot_num = 2;
 		cap.slot_type = (FW_CI_LL_SUPPORT(av7110->arm_app) ?
@@ -277,7 +277,7 @@ static int dvb_ca_ioctl(struct file *file, unsigned int cmd, void *parg)
 
 	case CA_GET_SLOT_INFO:
 	{
-		ca_slot_info_t *info=(ca_slot_info_t *)parg;
+		struct ca_slot_info *info=(struct ca_slot_info *)parg;
 
 		if (info->num < 0 || info->num > 1) {
 			mutex_unlock(&av7110->ioctl_mutex);
@@ -286,7 +286,7 @@ static int dvb_ca_ioctl(struct file *file, unsigned int cmd, void *parg)
 		av7110->ci_slot[info->num].num = info->num;
 		av7110->ci_slot[info->num].type = FW_CI_LL_SUPPORT(av7110->arm_app) ?
 							CA_CI_LINK : CA_CI;
-		memcpy(info, &av7110->ci_slot[info->num], sizeof(ca_slot_info_t));
+		memcpy(info, &av7110->ci_slot[info->num], sizeof(struct ca_slot_info));
 		break;
 	}
 
@@ -298,7 +298,7 @@ static int dvb_ca_ioctl(struct file *file, unsigned int cmd, void *parg)
 
 	case CA_GET_DESCR_INFO:
 	{
-		ca_descr_info_t info;
+		struct ca_descr_info info;
 
 		info.num = 16;
 		info.type = CA_ECD;
@@ -308,7 +308,7 @@ static int dvb_ca_ioctl(struct file *file, unsigned int cmd, void *parg)
 
 	case CA_SET_DESCR:
 	{
-		ca_descr_t *descr = (ca_descr_t*) parg;
+		struct ca_descr *descr = (struct ca_descr*) parg;
 
 		if (descr->index >= 16 || descr->parity > 1) {
 			mutex_unlock(&av7110->ioctl_mutex);
diff --git a/include/uapi/linux/dvb/ca.h b/include/uapi/linux/dvb/ca.h
index c18537f3e449..00cf24587bea 100644
--- a/include/uapi/linux/dvb/ca.h
+++ b/include/uapi/linux/dvb/ca.h
@@ -26,7 +26,7 @@
 
 /* slot interface types and info */
 
-typedef struct ca_slot_info {
+struct ca_slot_info {
 	int num;               /* slot number */
 
 	int type;              /* CA interface this slot supports */
@@ -39,52 +39,65 @@ typedef struct ca_slot_info {
 	unsigned int flags;
 #define CA_CI_MODULE_PRESENT 1 /* module (or card) inserted */
 #define CA_CI_MODULE_READY   2
-} ca_slot_info_t;
+};
 
 
 /* descrambler types and info */
 
-typedef struct ca_descr_info {
+struct ca_descr_info {
 	unsigned int num;          /* number of available descramblers (keys) */
 	unsigned int type;         /* type of supported scrambling system */
 #define CA_ECD           1
 #define CA_NDS           2
 #define CA_DSS           4
-} ca_descr_info_t;
+};
 
-typedef struct ca_caps {
+struct ca_caps {
 	unsigned int slot_num;     /* total number of CA card and module slots */
 	unsigned int slot_type;    /* OR of all supported types */
 	unsigned int descr_num;    /* total number of descrambler slots (keys) */
 	unsigned int descr_type;   /* OR of all supported types */
-} ca_caps_t;
+};
 
 /* a message to/from a CI-CAM */
-typedef struct ca_msg {
+struct ca_msg {
 	unsigned int index;
 	unsigned int type;
 	unsigned int length;
 	unsigned char msg[256];
-} ca_msg_t;
+};
 
-typedef struct ca_descr {
+struct ca_descr {
 	unsigned int index;
 	unsigned int parity;	/* 0 == even, 1 == odd */
 	unsigned char cw[8];
-} ca_descr_t;
+};
 
-typedef struct ca_pid {
+struct ca_pid {
 	unsigned int pid;
 	int index;		/* -1 == disable*/
-} ca_pid_t;
+};
 
 #define CA_RESET          _IO('o', 128)
-#define CA_GET_CAP        _IOR('o', 129, ca_caps_t)
-#define CA_GET_SLOT_INFO  _IOR('o', 130, ca_slot_info_t)
-#define CA_GET_DESCR_INFO _IOR('o', 131, ca_descr_info_t)
-#define CA_GET_MSG        _IOR('o', 132, ca_msg_t)
-#define CA_SEND_MSG       _IOW('o', 133, ca_msg_t)
-#define CA_SET_DESCR      _IOW('o', 134, ca_descr_t)
-#define CA_SET_PID        _IOW('o', 135, ca_pid_t)
+#define CA_GET_CAP        _IOR('o', 129, struct ca_caps)
+#define CA_GET_SLOT_INFO  _IOR('o', 130, struct ca_slot_info)
+#define CA_GET_DESCR_INFO _IOR('o', 131, struct ca_descr_info)
+#define CA_GET_MSG        _IOR('o', 132, struct ca_msg)
+#define CA_SEND_MSG       _IOW('o', 133, struct ca_msg)
+#define CA_SET_DESCR      _IOW('o', 134, struct ca_descr)
+#define CA_SET_PID        _IOW('o', 135, struct ca_pid)
+
+#if !defined (__KERNEL__)
+
+/* This is needed for legacy userspace support */
+typedef struct ca_slot_info ca_slot_info_t;
+typedef struct ca_descr_info  ca_descr_info_t;
+typedef struct ca_caps  ca_caps_t;
+typedef struct ca_msg ca_msg_t;
+typedef struct ca_descr ca_descr_t;
+typedef struct ca_pid ca_pid_t;
+
+#endif
+
 
 #endif
-- 
cgit v1.2.3-71-gd317


From 3256b36ea36525945d8575c0100752819a309aaa Mon Sep 17 00:00:00 2001
From: Mauro Carvalho Chehab <mchehab@s-opensource.com>
Date: Fri, 1 Sep 2017 06:09:14 -0400
Subject: media: dmx.h: split typedefs from structs

Using typedefs inside the Kernel is against CodingStyle, and
there's no good usage here.

Just like we did at frontend.h, at commit 0df289a209e0
("[media] dvb: Get rid of typedev usage for enums"), let's keep
those typedefs only to provide userspace backward compatibility.

No functional changes.

Signed-off-by: Mauro Carvalho Chehab <mchehab@s-opensource.com>
---
 drivers/media/dvb-core/dmxdev.c |  4 +--
 include/uapi/linux/dvb/dmx.h    | 56 ++++++++++++++++++++++++-----------------
 2 files changed, 35 insertions(+), 25 deletions(-)

(limited to 'include/uapi/linux')

diff --git a/drivers/media/dvb-core/dmxdev.c b/drivers/media/dvb-core/dmxdev.c
index 45e91add73ba..16b0b74c3114 100644
--- a/drivers/media/dvb-core/dmxdev.c
+++ b/drivers/media/dvb-core/dmxdev.c
@@ -562,7 +562,7 @@ static int dvb_dmxdev_start_feed(struct dmxdev *dmxdev,
 {
 	ktime_t timeout = 0;
 	struct dmx_pes_filter_params *para = &filter->params.pes;
-	dmx_output_t otype;
+	enum dmx_output otype;
 	int ret;
 	int ts_type;
 	enum dmx_ts_pes ts_pes;
@@ -787,7 +787,7 @@ static int dvb_dmxdev_filter_free(struct dmxdev *dmxdev,
 	return 0;
 }
 
-static inline void invert_mode(dmx_filter_t *filter)
+static inline void invert_mode(struct dmx_filter *filter)
 {
 	int i;
 
diff --git a/include/uapi/linux/dvb/dmx.h b/include/uapi/linux/dvb/dmx.h
index 427e4899ed69..1bc4d6fb0f01 100644
--- a/include/uapi/linux/dvb/dmx.h
+++ b/include/uapi/linux/dvb/dmx.h
@@ -43,16 +43,14 @@ enum dmx_output
 	DMX_OUT_TSDEMUX_TAP /* Like TS_TAP but retrieved from the DMX device */
 };
 
-typedef enum dmx_output dmx_output_t;
-
-typedef enum dmx_input
+enum dmx_input
 {
 	DMX_IN_FRONTEND, /* Input from a front-end device.  */
 	DMX_IN_DVR       /* Input from the logical DVR device.  */
-} dmx_input_t;
+};
 
 
-typedef enum dmx_ts_pes
+enum dmx_ts_pes
 {
 	DMX_PES_AUDIO0,
 	DMX_PES_VIDEO0,
@@ -79,7 +77,7 @@ typedef enum dmx_ts_pes
 	DMX_PES_PCR3,
 
 	DMX_PES_OTHER
-} dmx_pes_type_t;
+};
 
 #define DMX_PES_AUDIO    DMX_PES_AUDIO0
 #define DMX_PES_VIDEO    DMX_PES_VIDEO0
@@ -88,20 +86,20 @@ typedef enum dmx_ts_pes
 #define DMX_PES_PCR      DMX_PES_PCR0
 
 
-typedef struct dmx_filter
+struct dmx_filter
 {
 	__u8  filter[DMX_FILTER_SIZE];
 	__u8  mask[DMX_FILTER_SIZE];
 	__u8  mode[DMX_FILTER_SIZE];
-} dmx_filter_t;
+};
 
 
 struct dmx_sct_filter_params
 {
-	__u16          pid;
-	dmx_filter_t   filter;
-	__u32          timeout;
-	__u32          flags;
+	__u16             pid;
+	struct dmx_filter filter;
+	__u32             timeout;
+	__u32             flags;
 #define DMX_CHECK_CRC       1
 #define DMX_ONESHOT         2
 #define DMX_IMMEDIATE_START 4
@@ -111,19 +109,19 @@ struct dmx_sct_filter_params
 
 struct dmx_pes_filter_params
 {
-	__u16          pid;
-	dmx_input_t    input;
-	dmx_output_t   output;
-	dmx_pes_type_t pes_type;
-	__u32          flags;
+	__u16           pid;
+	enum dmx_input  input;
+	enum dmx_output output;
+	enum dmx_ts_pes pes_type;
+	__u32           flags;
 };
 
-typedef struct dmx_caps {
+struct dmx_caps {
 	__u32 caps;
 	int num_decoders;
-} dmx_caps_t;
+};
 
-typedef enum dmx_source {
+enum dmx_source {
 	DMX_SOURCE_FRONT0 = 0,
 	DMX_SOURCE_FRONT1,
 	DMX_SOURCE_FRONT2,
@@ -132,7 +130,7 @@ typedef enum dmx_source {
 	DMX_SOURCE_DVR1,
 	DMX_SOURCE_DVR2,
 	DMX_SOURCE_DVR3
-} dmx_source_t;
+};
 
 struct dmx_stc {
 	unsigned int num;	/* input : which STC? 0..N */
@@ -146,10 +144,22 @@ struct dmx_stc {
 #define DMX_SET_PES_FILTER       _IOW('o', 44, struct dmx_pes_filter_params)
 #define DMX_SET_BUFFER_SIZE      _IO('o', 45)
 #define DMX_GET_PES_PIDS         _IOR('o', 47, __u16[5])
-#define DMX_GET_CAPS             _IOR('o', 48, dmx_caps_t)
-#define DMX_SET_SOURCE           _IOW('o', 49, dmx_source_t)
+#define DMX_GET_CAPS             _IOR('o', 48, struct dmx_caps)
+#define DMX_SET_SOURCE           _IOW('o', 49, enum dmx_source)
 #define DMX_GET_STC              _IOWR('o', 50, struct dmx_stc)
 #define DMX_ADD_PID              _IOW('o', 51, __u16)
 #define DMX_REMOVE_PID           _IOW('o', 52, __u16)
 
+#if !defined (__KERNEL__)
+
+/* This is needed for legacy userspace support */
+typedef enum dmx_output dmx_output_t;
+typedef enum dmx_input dmx_input_t;
+typedef enum dmx_ts_pes dmx_pes_type_t;
+typedef struct dmx_filter dmx_filter_t;
+typedef struct dmx_caps dmx_caps_t;
+typedef enum dmx_source  dmx_source_t;
+
+#endif
+
 #endif /* _UAPI_DVBDMX_H_ */
-- 
cgit v1.2.3-71-gd317


From f35afa4f60c868d7c7811ba747133acbf39410ac Mon Sep 17 00:00:00 2001
From: Mauro Carvalho Chehab <mchehab@s-opensource.com>
Date: Wed, 30 Aug 2017 07:55:47 -0400
Subject: media: dvb/frontend.h: move out a private internal structure

struct dtv_cmds_h is just an ancillary struct used by the
dvb_frontend.c to internally store frontend commands.

It doesn't belong to the userspace header, nor it is used anywhere,
except inside the DVB core. So, remove it from the header.

Signed-off-by: Mauro Carvalho Chehab <mchehab@s-opensource.com>
---
 drivers/media/dvb-core/dvb_frontend.c | 11 +++++++++++
 include/uapi/linux/dvb/frontend.h     | 11 -----------
 2 files changed, 11 insertions(+), 11 deletions(-)

(limited to 'include/uapi/linux')

diff --git a/drivers/media/dvb-core/dvb_frontend.c b/drivers/media/dvb-core/dvb_frontend.c
index 114994ca0929..2fcba1616168 100644
--- a/drivers/media/dvb-core/dvb_frontend.c
+++ b/drivers/media/dvb-core/dvb_frontend.c
@@ -1000,6 +1000,17 @@ static int dvb_frontend_clear_cache(struct dvb_frontend *fe)
 	.buffer = b \
 }
 
+struct dtv_cmds_h {
+	char	*name;		/* A display name for debugging purposes */
+
+	__u32	cmd;		/* A unique ID */
+
+	/* Flags */
+	__u32	set:1;		/* Either a set or get property */
+	__u32	buffer:1;	/* Does this property use the buffer? */
+	__u32	reserved:30;	/* Align */
+};
+
 static struct dtv_cmds_h dtv_cmds[DTV_MAX_COMMAND + 1] = {
 	_DTV_CMD(DTV_TUNE, 1, 0),
 	_DTV_CMD(DTV_CLEAR, 1, 0),
diff --git a/include/uapi/linux/dvb/frontend.h b/include/uapi/linux/dvb/frontend.h
index afc3972b0879..3a80f3d1da1c 100644
--- a/include/uapi/linux/dvb/frontend.h
+++ b/include/uapi/linux/dvb/frontend.h
@@ -384,17 +384,6 @@ enum atscmh_rs_code_mode {
 #define NO_STREAM_ID_FILTER	(~0U)
 #define LNA_AUTO                (~0U)
 
-struct dtv_cmds_h {
-	char	*name;		/* A display name for debugging purposes */
-
-	__u32	cmd;		/* A unique ID */
-
-	/* Flags */
-	__u32	set:1;		/* Either a set or get property */
-	__u32	buffer:1;	/* Does this property use the buffer? */
-	__u32	reserved:30;	/* Align */
-};
-
 /**
  * Scale types for the quality parameters.
  * @FE_SCALE_NOT_AVAILABLE: That QoS measure is not available. That
-- 
cgit v1.2.3-71-gd317


From 8220ead805b6bab4ade2839857a198e9708b07de Mon Sep 17 00:00:00 2001
From: Mauro Carvalho Chehab <mchehab@s-opensource.com>
Date: Wed, 30 Aug 2017 08:12:38 -0400
Subject: media: dvb/frontend.h: document the uAPI file

Most of the stuff at the Digital TV frontend header file
are documented only at the Documentation. However, a few
kernel-doc markups are there, several of them with parsing
issues.

Add the missing documentation, copying definitions from the
Documentation when it applies, fixing some bugs.

Please notice that DVBv3 stuff that were deprecated weren't
commented by purpose. Instead, they were clearly tagged as
such.

This patch prepares to move part of the documentation from
Documentation/ to kernel-doc comments.

Signed-off-by: Mauro Carvalho Chehab <mchehab@s-opensource.com>
---
 include/uapi/linux/dvb/frontend.h | 580 ++++++++++++++++++++++++++++++++------
 1 file changed, 498 insertions(+), 82 deletions(-)

(limited to 'include/uapi/linux')

diff --git a/include/uapi/linux/dvb/frontend.h b/include/uapi/linux/dvb/frontend.h
index 3a80f3d1da1c..16a318fc469a 100644
--- a/include/uapi/linux/dvb/frontend.h
+++ b/include/uapi/linux/dvb/frontend.h
@@ -28,13 +28,46 @@
 
 #include <linux/types.h>
 
-enum fe_type {
-	FE_QPSK,
-	FE_QAM,
-	FE_OFDM,
-	FE_ATSC
-};
-
+/**
+ * enum fe_caps - Frontend capabilities
+ *
+ * @FE_IS_STUPID:			There's something wrong at the
+ *					frontend, and it can't report its
+ *					capabilities.
+ * @FE_CAN_INVERSION_AUTO:		Can auto-detect frequency spectral
+ *					band inversion
+ * @FE_CAN_FEC_1_2:			Supports FEC 1/2
+ * @FE_CAN_FEC_2_3:			Supports FEC 2/3
+ * @FE_CAN_FEC_3_4:			Supports FEC 3/4
+ * @FE_CAN_FEC_4_5:			Supports FEC 4/5
+ * @FE_CAN_FEC_5_6:			Supports FEC 5/6
+ * @FE_CAN_FEC_6_7:			Supports FEC 6/7
+ * @FE_CAN_FEC_7_8:			Supports FEC 7/8
+ * @FE_CAN_FEC_8_9:			Supports FEC 8/9
+ * @FE_CAN_FEC_AUTO:			Can auto-detect FEC
+ * @FE_CAN_QPSK:			Supports QPSK modulation
+ * @FE_CAN_QAM_16:			Supports 16-QAM modulation
+ * @FE_CAN_QAM_32:			Supports 32-QAM modulation
+ * @FE_CAN_QAM_64:			Supports 64-QAM modulation
+ * @FE_CAN_QAM_128:			Supports 128-QAM modulation
+ * @FE_CAN_QAM_256:			Supports 256-QAM modulation
+ * @FE_CAN_QAM_AUTO:			Can auto-detect QAM modulation
+ * @FE_CAN_TRANSMISSION_MODE_AUTO:	Can auto-detect transmission mode
+ * @FE_CAN_BANDWIDTH_AUTO:		Can auto-detect bandwidth
+ * @FE_CAN_GUARD_INTERVAL_AUTO:		Can auto-detect guard interval
+ * @FE_CAN_HIERARCHY_AUTO:		Can auto-detect hierarchy
+ * @FE_CAN_8VSB:			Supports 8-VSB modulation
+ * @FE_CAN_16VSB:			Supporta 16-VSB modulation
+ * @FE_HAS_EXTENDED_CAPS:		Unused
+ * @FE_CAN_MULTISTREAM:			Supports multistream filtering
+ * @FE_CAN_TURBO_FEC:			Supports "turbo FEC" modulation
+ * @FE_CAN_2G_MODULATION:		Supports "2nd generation" modulation,
+ *					e. g. DVB-S2, DVB-T2, DVB-C2
+ * @FE_NEEDS_BENDING:			Unused
+ * @FE_CAN_RECOVER:			Can recover from a cable unplug
+ *					automatically
+ * @FE_CAN_MUTE_TS:			Can stop spurious TS data output
+ */
 enum fe_caps {
 	FE_IS_STUPID			= 0,
 	FE_CAN_INVERSION_AUTO		= 0x1,
@@ -60,15 +93,55 @@ enum fe_caps {
 	FE_CAN_HIERARCHY_AUTO		= 0x100000,
 	FE_CAN_8VSB			= 0x200000,
 	FE_CAN_16VSB			= 0x400000,
-	FE_HAS_EXTENDED_CAPS		= 0x800000,   /* We need more bitspace for newer APIs, indicate this. */
-	FE_CAN_MULTISTREAM		= 0x4000000,  /* frontend supports multistream filtering */
-	FE_CAN_TURBO_FEC		= 0x8000000,  /* frontend supports "turbo fec modulation" */
-	FE_CAN_2G_MODULATION		= 0x10000000, /* frontend supports "2nd generation modulation" (DVB-S2) */
-	FE_NEEDS_BENDING		= 0x20000000, /* not supported anymore, don't use (frontend requires frequency bending) */
-	FE_CAN_RECOVER			= 0x40000000, /* frontend can recover from a cable unplug automatically */
-	FE_CAN_MUTE_TS			= 0x80000000  /* frontend can stop spurious TS data output */
+	FE_HAS_EXTENDED_CAPS		= 0x800000,
+	FE_CAN_MULTISTREAM		= 0x4000000,
+	FE_CAN_TURBO_FEC		= 0x8000000,
+	FE_CAN_2G_MODULATION		= 0x10000000,
+	FE_NEEDS_BENDING		= 0x20000000,
+	FE_CAN_RECOVER			= 0x40000000,
+	FE_CAN_MUTE_TS			= 0x80000000
+};
+
+/*
+ * DEPRECATED: Should be kept just due to backward compatibility.
+ */
+enum fe_type {
+	FE_QPSK,
+	FE_QAM,
+	FE_OFDM,
+	FE_ATSC
 };
 
+/**
+ * struct dvb_frontend_info - Frontend properties and capabilities
+ *
+ * @name:			Name of the frontend
+ * @type:			**DEPRECATED**.
+ *				Should not be used on modern programs,
+ *				as a frontend may have more than one type.
+ *				In order to get the support types of a given
+ *				frontend, use :c:type:`DTV_ENUM_DELSYS`
+ *				instead.
+ * @frequency_min:		Minimal frequency supported by the frontend.
+ * @frequency_max:		Minimal frequency supported by the frontend.
+ * @frequency_stepsize:		All frequencies are multiple of this value.
+ * @frequency_tolerance:	Frequency tolerance.
+ * @symbol_rate_min:		Minimal symbol rate, in bauds
+ *				(for Cable/Satellite systems).
+ * @symbol_rate_max:		Maximal symbol rate, in bauds
+ *				(for Cable/Satellite systems).
+ * @symbol_rate_tolerance:	Maximal symbol rate tolerance, in ppm
+ *				(for Cable/Satellite systems).
+ * @notifier_delay:		**DEPRECATED**. Not used by any driver.
+ * @caps:			Capabilities supported by the frontend,
+ *				as specified in &enum fe_caps.
+ *
+ * .. note:
+ *
+ *    #. The frequencies are specified in Hz for Terrestrial and Cable
+ *       systems.
+ *    #. The frequencies are specified in kHz for Satellite systems.
+ */
 struct dvb_frontend_info {
 	char       name[128];
 	enum fe_type type;	/* DEPRECATED. Use DTV_ENUM_DELSYS instead */
@@ -78,53 +151,102 @@ struct dvb_frontend_info {
 	__u32      frequency_tolerance;
 	__u32      symbol_rate_min;
 	__u32      symbol_rate_max;
-	__u32      symbol_rate_tolerance;	/* ppm */
+	__u32      symbol_rate_tolerance;
 	__u32      notifier_delay;		/* DEPRECATED */
 	enum fe_caps caps;
 };
 
-
 /**
- *  Check out the DiSEqC bus spec available on http://www.eutelsat.org/ for
- *  the meaning of this struct...
+ * struct dvb_diseqc_master_cmd - DiSEqC master command
+ *
+ * @msg:
+ *	DiSEqC message to be sent. It contains a 3 bytes header with:
+ *	framing + address + command, and an optional argument
+ *	of up to 3 bytes of data.
+ * @msg_len:
+ *	Length of the DiSEqC message. Valid values are 3 to 6.
+ *
+ * Check out the DiSEqC bus spec available on http://www.eutelsat.org/ for
+ * the possible messages that can be used.
  */
 struct dvb_diseqc_master_cmd {
-	__u8 msg [6];	/*  { framing, address, command, data [3] } */
-	__u8 msg_len;	/*  valid values are 3...6  */
+	__u8 msg[6];
+	__u8 msg_len;
 };
 
+/**
+ * struct dvb_diseqc_slave_reply - DiSEqC received data
+ *
+ * @msg:
+ *	DiSEqC message buffer to store a message received via DiSEqC.
+ *	It contains one byte header with: framing and
+ *	an optional argument of up to 3 bytes of data.
+ * @msg_len:
+ *	Length of the DiSEqC message. Valid values are 0 to 4,
+ *	where 0 means no message.
+ * @timeout:
+ *	Return from ioctl after timeout ms with errorcode when
+ *	no message was received.
+ *
+ * Check out the DiSEqC bus spec available on http://www.eutelsat.org/ for
+ * the possible messages that can be used.
+ */
 struct dvb_diseqc_slave_reply {
-	__u8 msg [4];	/*  { framing, data [3] } */
-	__u8 msg_len;	/*  valid values are 0...4, 0 means no msg  */
-	int  timeout;	/*  return from ioctl after timeout ms with */
-};			/*  errorcode when no message was received  */
+	__u8 msg[4];
+	__u8 msg_len;
+	int  timeout;
+};
 
+/**
+ * enum fe_sec_voltage - DC Voltage used to feed the LNBf
+ *
+ * @SEC_VOLTAGE_13:	Output 13V to the LNBf
+ * @SEC_VOLTAGE_18:	Output 18V to the LNBf
+ * @SEC_VOLTAGE_OFF:	Don't feed the LNBf with a DC voltage
+ */
 enum fe_sec_voltage {
 	SEC_VOLTAGE_13,
 	SEC_VOLTAGE_18,
 	SEC_VOLTAGE_OFF
 };
 
+/**
+ * enum fe_sec_tone_mode - Type of tone to be send to the LNBf.
+ * @SEC_TONE_ON:	Sends a 22kHz tone burst to the antenna.
+ * @SEC_TONE_OFF:	Don't send a 22kHz tone to the antenna (except
+ *			if the ``FE_DISEQC_*`` ioctls are called).
+ */
 enum fe_sec_tone_mode {
 	SEC_TONE_ON,
 	SEC_TONE_OFF
 };
 
+/**
+ * enum fe_sec_mini_cmd - Type of mini burst to be sent
+ *
+ * @SEC_MINI_A:		Sends a mini-DiSEqC 22kHz '0' Tone Burst to select
+ *			satellite-A
+ * @SEC_MINI_B:		Sends a mini-DiSEqC 22kHz '1' Data Burst to select
+ *			satellite-B
+ */
 enum fe_sec_mini_cmd {
 	SEC_MINI_A,
 	SEC_MINI_B
 };
 
 /**
- * enum fe_status - enumerates the possible frontend status
- * @FE_HAS_SIGNAL:	found something above the noise level
- * @FE_HAS_CARRIER:	found a DVB signal
- * @FE_HAS_VITERBI:	FEC is stable
- * @FE_HAS_SYNC:	found sync bytes
- * @FE_HAS_LOCK:	everything's working
- * @FE_TIMEDOUT:	no lock within the last ~2 seconds
- * @FE_REINIT:		frontend was reinitialized, application is recommended
- *			to reset DiSEqC, tone and parameters
+ * enum fe_status - Enumerates the possible frontend status.
+ * @FE_NONE:		The frontend doesn't have any kind of lock.
+ *			That's the initial frontend status
+ * @FE_HAS_SIGNAL:	Has found something above the noise level.
+ * @FE_HAS_CARRIER:	Has found a DVB signal.
+ * @FE_HAS_VITERBI:	FEC inner coding (Viterbi, LDPC or other inner code).
+ *			is stable.
+ * @FE_HAS_SYNC:	Synchronization bytes was found.
+ * @FE_HAS_LOCK:	DVB were locked and everything is working.
+ * @FE_TIMEDOUT:	Fo lock within the last about 2 seconds.
+ * @FE_REINIT:		Frontend was reinitialized, application is recommended
+ *			to reset DiSEqC, tone and parameters.
  */
 enum fe_status {
 	FE_NONE			= 0x00,
@@ -137,12 +259,45 @@ enum fe_status {
 	FE_REINIT		= 0x40,
 };
 
+/**
+ * enum fe_spectral_inversion - Type of inversion band
+ *
+ * @INVERSION_OFF:	Don't do spectral band inversion.
+ * @INVERSION_ON:	Do spectral band inversion.
+ * @INVERSION_AUTO:	Autodetect spectral band inversion.
+ *
+ * This parameter indicates if spectral inversion should be presumed or
+ * not. In the automatic setting (``INVERSION_AUTO``) the hardware will try
+ * to figure out the correct setting by itself. If the hardware doesn't
+ * support, the DVB core will try to lock at the carrier first with
+ * inversion off. If it fails, it will try to enable inversion.
+ */
 enum fe_spectral_inversion {
 	INVERSION_OFF,
 	INVERSION_ON,
 	INVERSION_AUTO
 };
 
+/**
+ * enum fe_code_rate - Type of Forward Error Correction (FEC)
+ *
+ *
+ * @FEC_NONE: No Forward Error Correction Code
+ * @FEC_1_2:  Forward Error Correction Code 1/2
+ * @FEC_2_3:  Forward Error Correction Code 2/3
+ * @FEC_3_4:  Forward Error Correction Code 3/4
+ * @FEC_4_5:  Forward Error Correction Code 4/5
+ * @FEC_5_6:  Forward Error Correction Code 5/6
+ * @FEC_6_7:  Forward Error Correction Code 6/7
+ * @FEC_7_8:  Forward Error Correction Code 7/8
+ * @FEC_8_9:  Forward Error Correction Code 8/9
+ * @FEC_AUTO: Autodetect Error Correction Code
+ * @FEC_3_5:  Forward Error Correction Code 3/5
+ * @FEC_9_10: Forward Error Correction Code 9/10
+ * @FEC_2_5:  Forward Error Correction Code 2/5
+ *
+ * Please note that not all FEC types are supported by a given standard.
+ */
 enum fe_code_rate {
 	FEC_NONE = 0,
 	FEC_1_2,
@@ -159,6 +314,26 @@ enum fe_code_rate {
 	FEC_2_5,
 };
 
+/**
+ * enum fe_modulation - Type of modulation/constellation
+ * @QPSK:	QPSK modulation
+ * @QAM_16:	16-QAM modulation
+ * @QAM_32:	32-QAM modulation
+ * @QAM_64:	64-QAM modulation
+ * @QAM_128:	128-QAM modulation
+ * @QAM_256:	256-QAM modulation
+ * @QAM_AUTO:	Autodetect QAM modulation
+ * @VSB_8:	8-VSB modulation
+ * @VSB_16:	16-VSB modulation
+ * @PSK_8:	8-PSK modulation
+ * @APSK_16:	16-APSK modulation
+ * @APSK_32:	32-APSK modulation
+ * @DQPSK:	DQPSK modulation
+ * @QAM_4_NR:	4-QAM-NR modulation
+ *
+ * Please note that not all modulations are supported by a given standard.
+ *
+ */
 enum fe_modulation {
 	QPSK,
 	QAM_16,
@@ -176,6 +351,32 @@ enum fe_modulation {
 	QAM_4_NR,
 };
 
+/**
+ * enum fe_transmit_mode - Transmission mode
+ *
+ * @TRANSMISSION_MODE_AUTO:
+ *	Autodetect transmission mode. The hardware will try to find the
+ *	correct FFT-size (if capable) to fill in the missing parameters.
+ * @TRANSMISSION_MODE_1K:
+ *	Transmission mode 1K
+ * @TRANSMISSION_MODE_2K:
+ *	Transmission mode 2K
+ * @TRANSMISSION_MODE_8K:
+ *	Transmission mode 8K
+ * @TRANSMISSION_MODE_4K:
+ *	Transmission mode 4K
+ * @TRANSMISSION_MODE_16K:
+ *	Transmission mode 16K
+ * @TRANSMISSION_MODE_32K:
+ *	Transmission mode 32K
+ * @TRANSMISSION_MODE_C1:
+ *	Single Carrier (C=1) transmission mode (DTMB only)
+ * @TRANSMISSION_MODE_C3780:
+ *	Multi Carrier (C=3780) transmission mode (DTMB only)
+ *
+ * Please note that not all transmission modes are supported by a given
+ * standard.
+ */
 enum fe_transmit_mode {
 	TRANSMISSION_MODE_2K,
 	TRANSMISSION_MODE_8K,
@@ -188,6 +389,23 @@ enum fe_transmit_mode {
 	TRANSMISSION_MODE_C3780,
 };
 
+/**
+ * enum fe_guard_interval - Guard interval
+ *
+ * @GUARD_INTERVAL_AUTO:	Autodetect the guard interval
+ * @GUARD_INTERVAL_1_128:	Guard interval 1/128
+ * @GUARD_INTERVAL_1_32:	Guard interval 1/32
+ * @GUARD_INTERVAL_1_16:	Guard interval 1/16
+ * @GUARD_INTERVAL_1_8:		Guard interval 1/8
+ * @GUARD_INTERVAL_1_4:		Guard interval 1/4
+ * @GUARD_INTERVAL_19_128:	Guard interval 19/128
+ * @GUARD_INTERVAL_19_256:	Guard interval 19/256
+ * @GUARD_INTERVAL_PN420:	PN length 420 (1/4)
+ * @GUARD_INTERVAL_PN595:	PN length 595 (1/6)
+ * @GUARD_INTERVAL_PN945:	PN length 945 (1/9)
+ *
+ * Please note that not all guard intervals are supported by a given standard.
+ */
 enum fe_guard_interval {
 	GUARD_INTERVAL_1_32,
 	GUARD_INTERVAL_1_16,
@@ -202,6 +420,16 @@ enum fe_guard_interval {
 	GUARD_INTERVAL_PN945,
 };
 
+/**
+ * enum fe_hierarchy - Hierarchy
+ * @HIERARCHY_NONE:	No hierarchy
+ * @HIERARCHY_AUTO:	Autodetect hierarchy (if supported)
+ * @HIERARCHY_1:	Hierarchy 1
+ * @HIERARCHY_2:	Hierarchy 2
+ * @HIERARCHY_4:	Hierarchy 4
+ *
+ * Please note that not all hierarchy types are supported by a given standard.
+ */
 enum fe_hierarchy {
 	HIERARCHY_NONE,
 	HIERARCHY_1,
@@ -210,6 +438,15 @@ enum fe_hierarchy {
 	HIERARCHY_AUTO
 };
 
+/**
+ * enum fe_interleaving - Interleaving
+ * @INTERLEAVING_NONE:	No interleaving.
+ * @INTERLEAVING_AUTO:	Auto-detect interleaving.
+ * @INTERLEAVING_240:	Interleaving of 240 symbols.
+ * @INTERLEAVING_720:	Interleaving of 720 symbols.
+ *
+ * Please note that, currently, only DTMB uses it.
+ */
 enum fe_interleaving {
 	INTERLEAVING_NONE,
 	INTERLEAVING_AUTO,
@@ -217,7 +454,8 @@ enum fe_interleaving {
 	INTERLEAVING_720,
 };
 
-/* S2API Commands */
+/* DVBv5 property Commands */
+
 #define DTV_UNDEFINED		0
 #define DTV_TUNE		1
 #define DTV_CLEAR		2
@@ -310,19 +548,79 @@ enum fe_interleaving {
 
 #define DTV_MAX_COMMAND		DTV_STAT_TOTAL_BLOCK_COUNT
 
+/**
+ * enum fe_pilot - Type of pilot tone
+ *
+ * @PILOT_ON:	Pilot tones enabled
+ * @PILOT_OFF:	Pilot tones disabled
+ * @PILOT_AUTO:	Autodetect pilot tones
+ */
 enum fe_pilot {
 	PILOT_ON,
 	PILOT_OFF,
 	PILOT_AUTO,
 };
 
+/**
+ * enum fe_rolloff - Rolloff factor (also known as alpha)
+ * @ROLLOFF_35:		Roloff factor: 35%
+ * @ROLLOFF_20:		Roloff factor: 20%
+ * @ROLLOFF_25:		Roloff factor: 25%
+ * @ROLLOFF_AUTO:	Auto-detect the roloff factor.
+ *
+ * .. note:
+ *
+ *    Roloff factor of 35% is implied on DVB-S. On DVB-S2, it is default.
+ */
 enum fe_rolloff {
-	ROLLOFF_35, /* Implied value in DVB-S, default for DVB-S2 */
+	ROLLOFF_35,
 	ROLLOFF_20,
 	ROLLOFF_25,
 	ROLLOFF_AUTO,
 };
 
+/**
+ * enum fe_delivery_system - Type of the delivery system
+ *
+ * @SYS_UNDEFINED:
+ *	Undefined standard. Generally, indicates an error
+ * @SYS_DVBC_ANNEX_A:
+ *	Cable TV: DVB-C following ITU-T J.83 Annex A spec
+ * @SYS_DVBC_ANNEX_B:
+ *	Cable TV: DVB-C following ITU-T J.83 Annex B spec (ClearQAM)
+ * @SYS_DVBC_ANNEX_C:
+ *	Cable TV: DVB-C following ITU-T J.83 Annex C spec
+ * @SYS_ISDBC:
+ *	Cable TV: ISDB-C (no drivers yet)
+ * @SYS_DVBT:
+ *	Terrestrial TV: DVB-T
+ * @SYS_DVBT2:
+ *	Terrestrial TV: DVB-T2
+ * @SYS_ISDBT:
+ *	Terrestrial TV: ISDB-T
+ * @SYS_ATSC:
+ *	Terrestrial TV: ATSC
+ * @SYS_ATSCMH:
+ *	Terrestrial TV (mobile): ATSC-M/H
+ * @SYS_DTMB:
+ *	Terrestrial TV: DTMB
+ * @SYS_DVBS:
+ *	Satellite TV: DVB-S
+ * @SYS_DVBS2:
+ *	Satellite TV: DVB-S2
+ * @SYS_TURBO:
+ *	Satellite TV: DVB-S Turbo
+ * @SYS_ISDBS:
+ *	Satellite TV: ISDB-S
+ * @SYS_DAB:
+ *	Digital audio: DAB (not fully supported)
+ * @SYS_DSS:
+ *	Satellite TV: DSS (not fully supported)
+ * @SYS_CMMB:
+ *	Terrestrial TV (mobile): CMMB (not fully supported)
+ * @SYS_DVBH:
+ *	Terrestrial TV (mobile): DVB-H (standard deprecated)
+ */
 enum fe_delivery_system {
 	SYS_UNDEFINED,
 	SYS_DVBC_ANNEX_A,
@@ -345,35 +643,85 @@ enum fe_delivery_system {
 	SYS_DVBC_ANNEX_C,
 };
 
-/* backward compatibility */
+/* backward compatibility definitions for delivery systems */
 #define SYS_DVBC_ANNEX_AC	SYS_DVBC_ANNEX_A
-#define SYS_DMBTH SYS_DTMB /* DMB-TH is legacy name, use DTMB instead */
+#define SYS_DMBTH		SYS_DTMB /* DMB-TH is legacy name, use DTMB */
 
-/* ATSC-MH */
+/* ATSC-MH specific parameters */
 
+/**
+ * enum atscmh_sccc_block_mode - Type of Series Concatenated Convolutional
+ *				 Code Block Mode.
+ *
+ * @ATSCMH_SCCC_BLK_SEP:
+ *	Separate SCCC: the SCCC outer code mode shall be set independently
+ *	for each Group Region (A, B, C, D)
+ * @ATSCMH_SCCC_BLK_COMB:
+ *	Combined SCCC: all four Regions shall have the same SCCC outer
+ *	code mode.
+ * @ATSCMH_SCCC_BLK_RES:
+ *	Reserved. Shouldn't be used.
+ */
 enum atscmh_sccc_block_mode {
 	ATSCMH_SCCC_BLK_SEP      = 0,
 	ATSCMH_SCCC_BLK_COMB     = 1,
 	ATSCMH_SCCC_BLK_RES      = 2,
 };
 
+/**
+ * enum atscmh_sccc_code_mode - Type of Series Concatenated Convolutional
+ *				Code Rate.
+ *
+ * @ATSCMH_SCCC_CODE_HLF:
+ *	The outer code rate of a SCCC Block is 1/2 rate.
+ * @ATSCMH_SCCC_CODE_QTR:
+ *	The outer code rate of a SCCC Block is 1/4 rate.
+ * @ATSCMH_SCCC_CODE_RES:
+ *	Reserved. Should not be used.
+ */
 enum atscmh_sccc_code_mode {
 	ATSCMH_SCCC_CODE_HLF     = 0,
 	ATSCMH_SCCC_CODE_QTR     = 1,
 	ATSCMH_SCCC_CODE_RES     = 2,
 };
 
+/**
+ * enum atscmh_rs_frame_ensemble - Reed Solomon(RS) frame ensemble.
+ *
+ * @ATSCMH_RSFRAME_ENS_PRI:	Primary Ensemble.
+ * @ATSCMH_RSFRAME_ENS_SEC:	Secondary Ensemble.
+ */
 enum atscmh_rs_frame_ensemble {
 	ATSCMH_RSFRAME_ENS_PRI   = 0,
 	ATSCMH_RSFRAME_ENS_SEC   = 1,
 };
 
+/**
+ * enum atscmh_rs_frame_mode - Reed Solomon (RS) frame mode.
+ *
+ * @ATSCMH_RSFRAME_PRI_ONLY:
+ *	Single Frame: There is only a primary RS Frame for all Group
+ *	Regions.
+ * @ATSCMH_RSFRAME_PRI_SEC:
+ *	Dual Frame: There are two separate RS Frames: Primary RS Frame for
+ *	Group Region A and B and Secondary RS Frame for Group Region C and
+ *	D.
+ * @ATSCMH_RSFRAME_RES:
+ *	Reserved. Shouldn't be used.
+ */
 enum atscmh_rs_frame_mode {
 	ATSCMH_RSFRAME_PRI_ONLY  = 0,
 	ATSCMH_RSFRAME_PRI_SEC   = 1,
 	ATSCMH_RSFRAME_RES       = 2,
 };
 
+/**
+ * enum atscmh_rs_code_mode
+ * @ATSCMH_RSCODE_211_187:	Reed Solomon code (211,187).
+ * @ATSCMH_RSCODE_223_187:	Reed Solomon code (223,187).
+ * @ATSCMH_RSCODE_235_187:	Reed Solomon code (235,187).
+ * @ATSCMH_RSCODE_RES:		Reserved. Shouldn't be used.
+ */
 enum atscmh_rs_code_mode {
 	ATSCMH_RSCODE_211_187    = 0,
 	ATSCMH_RSCODE_223_187    = 1,
@@ -385,16 +733,17 @@ enum atscmh_rs_code_mode {
 #define LNA_AUTO                (~0U)
 
 /**
- * Scale types for the quality parameters.
+ * enum fecap_scale_params - scale types for the quality parameters.
+ *
  * @FE_SCALE_NOT_AVAILABLE: That QoS measure is not available. That
  *			    could indicate a temporary or a permanent
  *			    condition.
  * @FE_SCALE_DECIBEL: The scale is measured in 0.001 dB steps, typically
- *		  used on signal measures.
+ *		      used on signal measures.
  * @FE_SCALE_RELATIVE: The scale is a relative percentual measure,
- *			ranging from 0 (0%) to 0xffff (100%).
+ *		       ranging from 0 (0%) to 0xffff (100%).
  * @FE_SCALE_COUNTER: The scale counts the occurrence of an event, like
- *			bit error, block error, lapsed time.
+ *		      bit error, block error, lapsed time.
  */
 enum fecap_scale_params {
 	FE_SCALE_NOT_AVAILABLE = 0,
@@ -406,24 +755,38 @@ enum fecap_scale_params {
 /**
  * struct dtv_stats - Used for reading a DTV status property
  *
- * @value:	value of the measure. Should range from 0 to 0xffff;
  * @scale:	Filled with enum fecap_scale_params - the scale
  *		in usage for that parameter
  *
+ * The ``{unnamed_union}`` may have either one of the values below:
+ *
+ * %svalue
+ *	integer value of the measure, for %FE_SCALE_DECIBEL,
+ *	used for dB measures. The unit is 0.001 dB.
+ *
+ * %uvalue
+ *	unsigned integer value of the measure, used when @scale is
+ *	either %FE_SCALE_RELATIVE or %FE_SCALE_COUNTER.
+ *
  * For most delivery systems, this will return a single value for each
  * parameter.
+ *
  * It should be noticed, however, that new OFDM delivery systems like
  * ISDB can use different modulation types for each group of carriers.
  * On such standards, up to 8 groups of statistics can be provided, one
  * for each carrier group (called "layer" on ISDB).
+ *
  * In order to be consistent with other delivery systems, the first
  * value refers to the entire set of carriers ("global").
- * dtv_status:scale should use the value FE_SCALE_NOT_AVAILABLE when
+ *
+ * @scale should use the value %FE_SCALE_NOT_AVAILABLE when
  * the value for the entire group of carriers or from one specific layer
  * is not provided by the hardware.
- * st.len should be filled with the latest filled status + 1.
  *
- * In other words, for ISDB, those values should be filled like:
+ * @len should be filled with the latest filled status + 1.
+ *
+ * In other words, for ISDB, those values should be filled like::
+ *
  *	u.st.stat.svalue[0] = global statistics;
  *	u.st.stat.scale[0] = FE_SCALE_DECIBEL;
  *	u.st.stat.value[1] = layer A statistics;
@@ -445,11 +808,39 @@ struct dtv_stats {
 
 #define MAX_DTV_STATS   4
 
+/**
+ * struct dtv_fe_stats - store Digital TV frontend statistics
+ *
+ * @len:	length of the statistics - if zero, stats is disabled.
+ * @stat:	array with digital TV statistics.
+ *
+ * On most standards, @len can either be 0 or 1. However, for ISDB, each
+ * layer is modulated in separate. So, each layer may have its own set
+ * of statistics. If so, stat[0] carries on a global value for the property.
+ * Indexes 1 to 3 means layer A to B.
+ */
 struct dtv_fe_stats {
 	__u8 len;
 	struct dtv_stats stat[MAX_DTV_STATS];
 } __attribute__ ((packed));
 
+/**
+ * struct dtv_property - store one of frontend command and its value
+ *
+ * @cmd:	Digital TV command.
+ * @reserved:	Not used.
+ * @u:		Union with the values for the command.
+ * @result:	Result of the command set (currently unused).
+ *
+ * The @u union may have either one of the values below:
+ *
+ * %data
+ *	an unsigned 32-bits number.
+ * %st
+ *	a &struct dtv_fe_stats array of statistics.
+ * %buffer
+ *	a buffer of up to 32 characters (currently unused).
+ */
 struct dtv_property {
 	__u32 cmd;
 	__u32 reserved[3];
@@ -469,17 +860,70 @@ struct dtv_property {
 /* num of properties cannot exceed DTV_IOCTL_MAX_MSGS per ioctl */
 #define DTV_IOCTL_MAX_MSGS 64
 
+/**
+ * struct dtv_properties - a set of command/value pairs.
+ *
+ * @num:	amount of commands stored at the struct.
+ * @props:	a pointer to &struct dtv_property.
+ */
 struct dtv_properties {
 	__u32 num;
 	struct dtv_property *props;
 };
 
+/*
+ * When set, this flag will disable any zigzagging or other "normal" tuning
+ * behavior. Additionally, there will be no automatic monitoring of the lock
+ * status, and hence no frontend events will be generated. If a frontend device
+ * is closed, this flag will be automatically turned off when the device is
+ * reopened read-write.
+ */
+#define FE_TUNE_MODE_ONESHOT 0x01
+
+/* Digital TV Frontend API calls */
+
+#define FE_GET_INFO		   _IOR('o', 61, struct dvb_frontend_info)
+
+#define FE_DISEQC_RESET_OVERLOAD   _IO('o', 62)
+#define FE_DISEQC_SEND_MASTER_CMD  _IOW('o', 63, struct dvb_diseqc_master_cmd)
+#define FE_DISEQC_RECV_SLAVE_REPLY _IOR('o', 64, struct dvb_diseqc_slave_reply)
+#define FE_DISEQC_SEND_BURST       _IO('o', 65)  /* fe_sec_mini_cmd_t */
+
+#define FE_SET_TONE		   _IO('o', 66)  /* fe_sec_tone_mode_t */
+#define FE_SET_VOLTAGE		   _IO('o', 67)  /* fe_sec_voltage_t */
+#define FE_ENABLE_HIGH_LNB_VOLTAGE _IO('o', 68)  /* int */
+
+#define FE_READ_STATUS		   _IOR('o', 69, fe_status_t)
+#define FE_READ_BER		   _IOR('o', 70, __u32)
+#define FE_READ_SIGNAL_STRENGTH    _IOR('o', 71, __u16)
+#define FE_READ_SNR		   _IOR('o', 72, __u16)
+#define FE_READ_UNCORRECTED_BLOCKS _IOR('o', 73, __u32)
+
+#define FE_SET_FRONTEND_TUNE_MODE  _IO('o', 81) /* unsigned int */
+#define FE_GET_EVENT		   _IOR('o', 78, struct dvb_frontend_event)
+
+#define FE_DISHNETWORK_SEND_LEGACY_CMD _IO('o', 80) /* unsigned int */
+
+#define FE_SET_PROPERTY		   _IOW('o', 82, struct dtv_properties)
+#define FE_GET_PROPERTY		   _IOR('o', 83, struct dtv_properties)
+
 #if defined(__DVB_CORE__) || !defined (__KERNEL__)
 
 /*
- * DEPRECATED: The DVBv3 ioctls, structs and enums should not be used on
- * newer programs, as it doesn't support the second generation of digital
- * TV standards, nor supports newer delivery systems.
+ * DEPRECATED: Everything below is deprecated in favor of DVBv5 API
+ *
+ * The DVBv3 only ioctls, structs and enums should not be used on
+ * newer programs, as it doesn't support the second generation of
+ * digital TV standards, nor supports newer delivery systems.
+ * They also don't support modern frontends with usually support multiple
+ * delivery systems.
+ *
+ * Drivers shouldn't use them.
+ *
+ * New applications should use DVBv5 delivery system instead
+ */
+
+/*
  */
 
 enum fe_bandwidth {
@@ -492,7 +936,7 @@ enum fe_bandwidth {
 	BANDWIDTH_1_712_MHZ,
 };
 
-/* This is needed for legacy userspace support */
+/* This is kept for legacy userspace support */
 typedef enum fe_sec_voltage fe_sec_voltage_t;
 typedef enum fe_caps fe_caps_t;
 typedef enum fe_type fe_type_t;
@@ -510,6 +954,8 @@ typedef enum fe_pilot fe_pilot_t;
 typedef enum fe_rolloff fe_rolloff_t;
 typedef enum fe_delivery_system fe_delivery_system_t;
 
+/* DVBv3 structs */
+
 struct dvb_qpsk_parameters {
 	__u32		symbol_rate;  /* symbol rate in Symbols per second */
 	fe_code_rate_t	fec_inner;    /* forward error correction (see above) */
@@ -551,42 +997,12 @@ struct dvb_frontend_event {
 	fe_status_t status;
 	struct dvb_frontend_parameters parameters;
 };
-#endif
-
-#define FE_SET_PROPERTY		   _IOW('o', 82, struct dtv_properties)
-#define FE_GET_PROPERTY		   _IOR('o', 83, struct dtv_properties)
-
-/**
- * When set, this flag will disable any zigzagging or other "normal" tuning
- * behaviour. Additionally, there will be no automatic monitoring of the lock
- * status, and hence no frontend events will be generated. If a frontend device
- * is closed, this flag will be automatically turned off when the device is
- * reopened read-write.
- */
-#define FE_TUNE_MODE_ONESHOT 0x01
 
-#define FE_GET_INFO		   _IOR('o', 61, struct dvb_frontend_info)
-
-#define FE_DISEQC_RESET_OVERLOAD   _IO('o', 62)
-#define FE_DISEQC_SEND_MASTER_CMD  _IOW('o', 63, struct dvb_diseqc_master_cmd)
-#define FE_DISEQC_RECV_SLAVE_REPLY _IOR('o', 64, struct dvb_diseqc_slave_reply)
-#define FE_DISEQC_SEND_BURST       _IO('o', 65)  /* fe_sec_mini_cmd_t */
-
-#define FE_SET_TONE		   _IO('o', 66)  /* fe_sec_tone_mode_t */
-#define FE_SET_VOLTAGE		   _IO('o', 67)  /* fe_sec_voltage_t */
-#define FE_ENABLE_HIGH_LNB_VOLTAGE _IO('o', 68)  /* int */
-
-#define FE_READ_STATUS		   _IOR('o', 69, fe_status_t)
-#define FE_READ_BER		   _IOR('o', 70, __u32)
-#define FE_READ_SIGNAL_STRENGTH    _IOR('o', 71, __u16)
-#define FE_READ_SNR		   _IOR('o', 72, __u16)
-#define FE_READ_UNCORRECTED_BLOCKS _IOR('o', 73, __u32)
+/* DVBv3 API calls */
 
 #define FE_SET_FRONTEND		   _IOW('o', 76, struct dvb_frontend_parameters)
 #define FE_GET_FRONTEND		   _IOR('o', 77, struct dvb_frontend_parameters)
-#define FE_SET_FRONTEND_TUNE_MODE  _IO('o', 81) /* unsigned int */
-#define FE_GET_EVENT		   _IOR('o', 78, struct dvb_frontend_event)
 
-#define FE_DISHNETWORK_SEND_LEGACY_CMD _IO('o', 80) /* unsigned int */
+#endif
 
 #endif /*_DVBFRONTEND_H_*/
-- 
cgit v1.2.3-71-gd317


From 9d5e27cbc117671959a9f625e51c754f5a0666e3 Mon Sep 17 00:00:00 2001
From: Mauro Carvalho Chehab <mchehab@s-opensource.com>
Date: Wed, 30 Aug 2017 13:45:20 -0400
Subject: media: dvb frontend docs: use kernel-doc documentation

Now that frontend.h contains most documentation for the frontend,
remove the duplicated information from Documentation/ and use the
kernel-doc auto-generated one instead.

That should simplify maintainership of DVB frontend uAPI, as most
of the documentation will stick with the header file.

Signed-off-by: Mauro Carvalho Chehab <mchehab@s-opensource.com>
---
 Documentation/media/frontend.h.rst.exceptions      |  185 +-
 Documentation/media/uapi/dvb/dtv-fe-stats.rst      |   17 -
 Documentation/media/uapi/dvb/dtv-properties.rst    |   15 -
 Documentation/media/uapi/dvb/dtv-property.rst      |   31 -
 Documentation/media/uapi/dvb/dtv-stats.rst         |   18 -
 Documentation/media/uapi/dvb/dvbproperty-006.rst   |   12 -
 Documentation/media/uapi/dvb/dvbproperty.rst       |   28 +-
 .../media/uapi/dvb/fe-diseqc-recv-slave-reply.rst  |   40 +-
 .../media/uapi/dvb/fe-diseqc-send-burst.rst        |   31 +-
 .../media/uapi/dvb/fe-diseqc-send-master-cmd.rst   |   29 +-
 Documentation/media/uapi/dvb/fe-get-info.rst       |  370 +---
 Documentation/media/uapi/dvb/fe-get-property.rst   |    2 +-
 Documentation/media/uapi/dvb/fe-read-status.rst    |   83 -
 Documentation/media/uapi/dvb/fe-set-tone.rst       |   30 -
 .../media/uapi/dvb/fe_property_parameters.rst      | 1847 +++++---------------
 Documentation/media/uapi/dvb/frontend-header.rst   |    4 +
 include/uapi/linux/dvb/frontend.h                  |    8 +-
 17 files changed, 602 insertions(+), 2148 deletions(-)
 delete mode 100644 Documentation/media/uapi/dvb/dtv-fe-stats.rst
 delete mode 100644 Documentation/media/uapi/dvb/dtv-properties.rst
 delete mode 100644 Documentation/media/uapi/dvb/dtv-property.rst
 delete mode 100644 Documentation/media/uapi/dvb/dtv-stats.rst
 delete mode 100644 Documentation/media/uapi/dvb/dvbproperty-006.rst
 create mode 100644 Documentation/media/uapi/dvb/frontend-header.rst

(limited to 'include/uapi/linux')

diff --git a/Documentation/media/frontend.h.rst.exceptions b/Documentation/media/frontend.h.rst.exceptions
index 7656770f1936..f7c4df620a52 100644
--- a/Documentation/media/frontend.h.rst.exceptions
+++ b/Documentation/media/frontend.h.rst.exceptions
@@ -25,19 +25,9 @@ ignore define DTV_MAX_COMMAND
 ignore define MAX_DTV_STATS
 ignore define DTV_IOCTL_MAX_MSGS
 
-# Stats enum is documented altogether
-replace enum fecap_scale_params :ref:`frontend-stat-properties`
-replace symbol FE_SCALE_COUNTER frontend-stat-properties
-replace symbol FE_SCALE_DECIBEL frontend-stat-properties
-replace symbol FE_SCALE_NOT_AVAILABLE frontend-stat-properties
-replace symbol FE_SCALE_RELATIVE frontend-stat-properties
-
 # the same reference is used for both get and set ioctls
 replace ioctl FE_SET_PROPERTY :c:type:`FE_GET_PROPERTY`
 
-# Ignore struct used only internally at Kernel
-ignore struct dtv_cmds_h
-
 # Typedefs that use the enum reference
 replace typedef fe_sec_voltage_t :c:type:`fe_sec_voltage`
 
@@ -45,3 +35,178 @@ replace typedef fe_sec_voltage_t :c:type:`fe_sec_voltage`
 replace define FE_TUNE_MODE_ONESHOT :c:func:`FE_SET_FRONTEND_TUNE_MODE`
 replace define LNA_AUTO dtv-lna
 replace define NO_STREAM_ID_FILTER dtv-stream-id
+
+# Those enums are defined at the frontend.h header, and not externally
+
+ignore symbol FE_IS_STUPID
+ignore symbol FE_CAN_INVERSION_AUTO
+ignore symbol FE_CAN_FEC_1_2
+ignore symbol FE_CAN_FEC_2_3
+ignore symbol FE_CAN_FEC_3_4
+ignore symbol FE_CAN_FEC_4_5
+ignore symbol FE_CAN_FEC_5_6
+ignore symbol FE_CAN_FEC_6_7
+ignore symbol FE_CAN_FEC_7_8
+ignore symbol FE_CAN_FEC_8_9
+ignore symbol FE_CAN_FEC_AUTO
+ignore symbol FE_CAN_QPSK
+ignore symbol FE_CAN_QAM_16
+ignore symbol FE_CAN_QAM_32
+ignore symbol FE_CAN_QAM_64
+ignore symbol FE_CAN_QAM_128
+ignore symbol FE_CAN_QAM_256
+ignore symbol FE_CAN_QAM_AUTO
+ignore symbol FE_CAN_TRANSMISSION_MODE_AUTO
+ignore symbol FE_CAN_BANDWIDTH_AUTO
+ignore symbol FE_CAN_GUARD_INTERVAL_AUTO
+ignore symbol FE_CAN_HIERARCHY_AUTO
+ignore symbol FE_CAN_8VSB
+ignore symbol FE_CAN_16VSB
+ignore symbol FE_HAS_EXTENDED_CAPS
+ignore symbol FE_CAN_MULTISTREAM
+ignore symbol FE_CAN_TURBO_FEC
+ignore symbol FE_CAN_2G_MODULATION
+ignore symbol FE_NEEDS_BENDING
+ignore symbol FE_CAN_RECOVER
+ignore symbol FE_CAN_MUTE_TS
+
+ignore symbol QPSK
+ignore symbol QAM_16
+ignore symbol QAM_32
+ignore symbol QAM_64
+ignore symbol QAM_128
+ignore symbol QAM_256
+ignore symbol QAM_AUTO
+ignore symbol VSB_8
+ignore symbol VSB_16
+ignore symbol PSK_8
+ignore symbol APSK_16
+ignore symbol APSK_32
+ignore symbol DQPSK
+ignore symbol QAM_4_NR
+
+ignore symbol SEC_VOLTAGE_13
+ignore symbol SEC_VOLTAGE_18
+ignore symbol SEC_VOLTAGE_OFF
+
+ignore symbol SEC_TONE_ON
+ignore symbol SEC_TONE_OFF
+
+ignore symbol SEC_MINI_A
+ignore symbol SEC_MINI_B
+
+ignore symbol FE_NONE
+ignore symbol FE_HAS_SIGNAL
+ignore symbol FE_HAS_CARRIER
+ignore symbol FE_HAS_VITERBI
+ignore symbol FE_HAS_SYNC
+ignore symbol FE_HAS_LOCK
+ignore symbol FE_REINIT
+ignore symbol FE_TIMEDOUT
+
+ignore symbol FEC_NONE
+ignore symbol FEC_1_2
+ignore symbol FEC_2_3
+ignore symbol FEC_3_4
+ignore symbol FEC_4_5
+ignore symbol FEC_5_6
+ignore symbol FEC_6_7
+ignore symbol FEC_7_8
+ignore symbol FEC_8_9
+ignore symbol FEC_AUTO
+ignore symbol FEC_3_5
+ignore symbol FEC_9_10
+ignore symbol FEC_2_5
+
+ignore symbol TRANSMISSION_MODE_AUTO
+ignore symbol TRANSMISSION_MODE_1K
+ignore symbol TRANSMISSION_MODE_2K
+ignore symbol TRANSMISSION_MODE_8K
+ignore symbol TRANSMISSION_MODE_4K
+ignore symbol TRANSMISSION_MODE_16K
+ignore symbol TRANSMISSION_MODE_32K
+ignore symbol TRANSMISSION_MODE_C1
+ignore symbol TRANSMISSION_MODE_C3780
+ignore symbol TRANSMISSION_MODE_2K
+ignore symbol TRANSMISSION_MODE_8K
+
+ignore symbol GUARD_INTERVAL_AUTO
+ignore symbol GUARD_INTERVAL_1_128
+ignore symbol GUARD_INTERVAL_1_32
+ignore symbol GUARD_INTERVAL_1_16
+ignore symbol GUARD_INTERVAL_1_8
+ignore symbol GUARD_INTERVAL_1_4
+ignore symbol GUARD_INTERVAL_19_128
+ignore symbol GUARD_INTERVAL_19_256
+ignore symbol GUARD_INTERVAL_PN420
+ignore symbol GUARD_INTERVAL_PN595
+ignore symbol GUARD_INTERVAL_PN945
+
+ignore symbol HIERARCHY_NONE
+ignore symbol HIERARCHY_AUTO
+ignore symbol HIERARCHY_1
+ignore symbol HIERARCHY_2
+ignore symbol HIERARCHY_4
+
+ignore symbol INTERLEAVING_NONE
+ignore symbol INTERLEAVING_AUTO
+ignore symbol INTERLEAVING_240
+ignore symbol INTERLEAVING_720
+
+ignore symbol PILOT_ON
+ignore symbol PILOT_OFF
+ignore symbol PILOT_AUTO
+
+ignore symbol ROLLOFF_35
+ignore symbol ROLLOFF_20
+ignore symbol ROLLOFF_25
+ignore symbol ROLLOFF_AUTO
+
+ignore symbol INVERSION_ON
+ignore symbol INVERSION_OFF
+ignore symbol INVERSION_AUTO
+
+ignore symbol SYS_UNDEFINED
+ignore symbol SYS_DVBC_ANNEX_A
+ignore symbol SYS_DVBC_ANNEX_B
+ignore symbol SYS_DVBC_ANNEX_C
+ignore symbol SYS_ISDBC
+ignore symbol SYS_DVBT
+ignore symbol SYS_DVBT2
+ignore symbol SYS_ISDBT
+ignore symbol SYS_ATSC
+ignore symbol SYS_ATSCMH
+ignore symbol SYS_DTMB
+ignore symbol SYS_DVBS
+ignore symbol SYS_DVBS2
+ignore symbol SYS_TURBO
+ignore symbol SYS_ISDBS
+ignore symbol SYS_DAB
+ignore symbol SYS_DSS
+ignore symbol SYS_CMMB
+ignore symbol SYS_DVBH
+
+ignore symbol ATSCMH_SCCC_BLK_SEP
+ignore symbol ATSCMH_SCCC_BLK_COMB
+ignore symbol ATSCMH_SCCC_BLK_RES
+
+ignore symbol ATSCMH_SCCC_CODE_HLF
+ignore symbol ATSCMH_SCCC_CODE_QTR
+ignore symbol ATSCMH_SCCC_CODE_RES
+
+ignore symbol ATSCMH_RSFRAME_ENS_PRI
+ignore symbol ATSCMH_RSFRAME_ENS_SEC
+
+ignore symbol ATSCMH_RSFRAME_PRI_ONLY
+ignore symbol ATSCMH_RSFRAME_PRI_SEC
+ignore symbol ATSCMH_RSFRAME_RES
+
+ignore symbol ATSCMH_RSCODE_211_187
+ignore symbol ATSCMH_RSCODE_223_187
+ignore symbol ATSCMH_RSCODE_235_187
+ignore symbol ATSCMH_RSCODE_RES
+
+ignore symbol FE_SCALE_NOT_AVAILABLE
+ignore symbol FE_SCALE_DECIBEL
+ignore symbol FE_SCALE_RELATIVE
+ignore symbol FE_SCALE_COUNTER
diff --git a/Documentation/media/uapi/dvb/dtv-fe-stats.rst b/Documentation/media/uapi/dvb/dtv-fe-stats.rst
deleted file mode 100644
index e8a02a1f138d..000000000000
--- a/Documentation/media/uapi/dvb/dtv-fe-stats.rst
+++ /dev/null
@@ -1,17 +0,0 @@
-.. -*- coding: utf-8; mode: rst -*-
-
-.. c:type:: dtv_fe_stats
-
-*******************
-struct dtv_fe_stats
-*******************
-
-
-.. code-block:: c
-
-    #define MAX_DTV_STATS   4
-
-    struct dtv_fe_stats {
-	__u8 len;
-	struct dtv_stats stat[MAX_DTV_STATS];
-    } __packed;
diff --git a/Documentation/media/uapi/dvb/dtv-properties.rst b/Documentation/media/uapi/dvb/dtv-properties.rst
deleted file mode 100644
index 48c4e834ad11..000000000000
--- a/Documentation/media/uapi/dvb/dtv-properties.rst
+++ /dev/null
@@ -1,15 +0,0 @@
-.. -*- coding: utf-8; mode: rst -*-
-
-.. c:type:: dtv_properties
-
-*********************
-struct dtv_properties
-*********************
-
-
-.. code-block:: c
-
-    struct dtv_properties {
-	__u32 num;
-	struct dtv_property *props;
-    };
diff --git a/Documentation/media/uapi/dvb/dtv-property.rst b/Documentation/media/uapi/dvb/dtv-property.rst
deleted file mode 100644
index 3ddc3474b00e..000000000000
--- a/Documentation/media/uapi/dvb/dtv-property.rst
+++ /dev/null
@@ -1,31 +0,0 @@
-.. -*- coding: utf-8; mode: rst -*-
-
-.. c:type:: dtv_property
-
-*******************
-struct dtv_property
-*******************
-
-
-.. code-block:: c
-
-    /* Reserved fields should be set to 0 */
-
-    struct dtv_property {
-	__u32 cmd;
-	__u32 reserved[3];
-	union {
-	    __u32 data;
-	    struct dtv_fe_stats st;
-	    struct {
-		__u8 data[32];
-		__u32 len;
-		__u32 reserved1[3];
-		void *reserved2;
-	    } buffer;
-	} u;
-	int result;
-    } __attribute__ ((packed));
-
-    /* num of properties cannot exceed DTV_IOCTL_MAX_MSGS per ioctl */
-    #define DTV_IOCTL_MAX_MSGS 64
diff --git a/Documentation/media/uapi/dvb/dtv-stats.rst b/Documentation/media/uapi/dvb/dtv-stats.rst
deleted file mode 100644
index 35239e72bf74..000000000000
--- a/Documentation/media/uapi/dvb/dtv-stats.rst
+++ /dev/null
@@ -1,18 +0,0 @@
-.. -*- coding: utf-8; mode: rst -*-
-
-.. c:type:: dtv_stats
-
-****************
-struct dtv_stats
-****************
-
-
-.. code-block:: c
-
-    struct dtv_stats {
-	__u8 scale; /* enum fecap_scale_params type */
-	union {
-	    __u64 uvalue;   /* for counters and relative scales */
-	    __s64 svalue;   /* for 1/1000 dB measures */
-	};
-    } __packed;
diff --git a/Documentation/media/uapi/dvb/dvbproperty-006.rst b/Documentation/media/uapi/dvb/dvbproperty-006.rst
deleted file mode 100644
index 3343a0f306fe..000000000000
--- a/Documentation/media/uapi/dvb/dvbproperty-006.rst
+++ /dev/null
@@ -1,12 +0,0 @@
-.. -*- coding: utf-8; mode: rst -*-
-
-**************
-Property types
-**************
-
-On :ref:`FE_GET_PROPERTY and FE_SET_PROPERTY <FE_GET_PROPERTY>`,
-the actual action is determined by the dtv_property cmd/data pairs.
-With one single ioctl, is possible to get/set up to 64 properties. The
-actual meaning of each property is described on the next sections.
-
-The available frontend property types are shown on the next section.
diff --git a/Documentation/media/uapi/dvb/dvbproperty.rst b/Documentation/media/uapi/dvb/dvbproperty.rst
index 843f1d70aff0..c40943be5925 100644
--- a/Documentation/media/uapi/dvb/dvbproperty.rst
+++ b/Documentation/media/uapi/dvb/dvbproperty.rst
@@ -2,8 +2,9 @@
 
 .. _frontend-properties:
 
-DVB Frontend properties
-=======================
+**************
+Property types
+**************
 
 Tuning into a Digital TV physical channel and starting decoding it
 requires changing a set of parameters, in order to control the tuner,
@@ -20,10 +21,15 @@ enough to group the structs that would be required for those new
 standards. Also, extending it would break userspace.
 
 So, the legacy union/struct based approach was deprecated, in favor
-of a properties set approach.
+of a properties set approach. On such approach,
+:ref:`FE_GET_PROPERTY and FE_SET_PROPERTY <FE_GET_PROPERTY>` are used
+to setup the frontend and read its status.
+
+The actual action is determined by a set of dtv_property cmd/data pairs.
+With one single ioctl, is possible to get/set up to 64 properties.
 
 This section describes the new and recommended way to set the frontend,
-with suppports all digital TV delivery systems.
+with supports all digital TV delivery systems.
 
 .. note::
 
@@ -63,12 +69,9 @@ Mbauds, those properties should be sent to
 The code that would that would do the above is show in
 :ref:`dtv-prop-example`.
 
-.. _dtv-prop-example:
-
-Example: Setting digital TV frontend properties
-===============================================
-
 .. code-block:: c
+    :caption: Example: Setting digital TV frontend properties
+    :name: dtv-prop-example
 
     #include <stdio.h>
     #include <fcntl.h>
@@ -112,17 +115,12 @@ Example: Setting digital TV frontend properties
    provides methods for usual operations like program scanning and to
    read/write channel descriptor files.
 
-
 .. toctree::
     :maxdepth: 1
 
-    dtv-stats
-    dtv-fe-stats
-    dtv-property
-    dtv-properties
-    dvbproperty-006
     fe_property_parameters
     frontend-stat-properties
     frontend-property-terrestrial-systems
     frontend-property-cable-systems
     frontend-property-satellite-systems
+    frontend-header
diff --git a/Documentation/media/uapi/dvb/fe-diseqc-recv-slave-reply.rst b/Documentation/media/uapi/dvb/fe-diseqc-recv-slave-reply.rst
index 302db2857f90..473855584d7f 100644
--- a/Documentation/media/uapi/dvb/fe-diseqc-recv-slave-reply.rst
+++ b/Documentation/media/uapi/dvb/fe-diseqc-recv-slave-reply.rst
@@ -26,8 +26,7 @@ Arguments
     File descriptor returned by :ref:`open() <frontend_f_open>`.
 
 ``argp``
-    pointer to struct
-    :c:type:`dvb_diseqc_slave_reply`
+    pointer to struct :c:type:`dvb_diseqc_slave_reply`.
 
 
 Description
@@ -35,42 +34,7 @@ Description
 
 Receives reply from a DiSEqC 2.0 command.
 
-.. c:type:: dvb_diseqc_slave_reply
-
-.. tabularcolumns:: |p{4.4cm}|p{4.4cm}|p{8.7cm}|
-
-.. flat-table:: struct dvb_diseqc_slave_reply
-    :header-rows:  0
-    :stub-columns: 0
-    :widths:       1 1 2
-
-
-    -  .. row 1
-
-       -  uint8_t
-
-       -  msg[4]
-
-       -  DiSEqC message (framing, data[3])
-
-    -  .. row 2
-
-       -  uint8_t
-
-       -  msg_len
-
-       -  Length of the DiSEqC message. Valid values are 0 to 4, where 0
-	  means no msg
-
-    -  .. row 3
-
-       -  int
-
-       -  timeout
-
-       -  Return from ioctl after timeout ms with errorcode when no message
-	  was received
-
+The received message is stored at the buffer pointed by ``argp``.
 
 Return Value
 ============
diff --git a/Documentation/media/uapi/dvb/fe-diseqc-send-burst.rst b/Documentation/media/uapi/dvb/fe-diseqc-send-burst.rst
index e962f6ec5aaf..54d35517e784 100644
--- a/Documentation/media/uapi/dvb/fe-diseqc-send-burst.rst
+++ b/Documentation/media/uapi/dvb/fe-diseqc-send-burst.rst
@@ -26,7 +26,7 @@ Arguments
     File descriptor returned by :ref:`open() <frontend_f_open>`.
 
 ``tone``
-    an integer enumered value described at :c:type:`fe_sec_mini_cmd`
+    An integer enumered value described at :c:type:`fe_sec_mini_cmd`.
 
 
 Description
@@ -39,35 +39,6 @@ read/write permissions.
 It provides support for what's specified at
 `Digital Satellite Equipment Control (DiSEqC) - Simple "ToneBurst" Detection Circuit specification. <http://www.eutelsat.com/files/contributed/satellites/pdf/Diseqc/associated%20docs/simple_tone_burst_detec.pdf>`__
 
-.. c:type:: fe_sec_mini_cmd
-
-.. flat-table:: enum fe_sec_mini_cmd
-    :header-rows:  1
-    :stub-columns: 0
-
-
-    -  .. row 1
-
-       -  ID
-
-       -  Description
-
-    -  .. row 2
-
-       -  .. _SEC-MINI-A:
-
-	  ``SEC_MINI_A``
-
-       -  Sends a mini-DiSEqC 22kHz '0' Tone Burst to select satellite-A
-
-    -  .. row 3
-
-       -  .. _SEC-MINI-B:
-
-	  ``SEC_MINI_B``
-
-       -  Sends a mini-DiSEqC 22kHz '1' Data Burst to select satellite-B
-
 
 Return Value
 ============
diff --git a/Documentation/media/uapi/dvb/fe-diseqc-send-master-cmd.rst b/Documentation/media/uapi/dvb/fe-diseqc-send-master-cmd.rst
index bbcab3df39b5..7392d6747ad6 100644
--- a/Documentation/media/uapi/dvb/fe-diseqc-send-master-cmd.rst
+++ b/Documentation/media/uapi/dvb/fe-diseqc-send-master-cmd.rst
@@ -33,34 +33,7 @@ Arguments
 Description
 ===========
 
-Sends a DiSEqC command to the antenna subsystem.
-
-
-.. c:type:: dvb_diseqc_master_cmd
-
-.. tabularcolumns:: |p{4.4cm}|p{4.4cm}|p{8.7cm}|
-
-.. flat-table:: struct dvb_diseqc_master_cmd
-    :header-rows:  0
-    :stub-columns: 0
-    :widths:       1 1 2
-
-
-    -  .. row 1
-
-       -  uint8_t
-
-       -  msg[6]
-
-       -  DiSEqC message (framing, address, command, data[3])
-
-    -  .. row 2
-
-       -  uint8_t
-
-       -  msg_len
-
-       -  Length of the DiSEqC message. Valid values are 3 to 6
+Sends the DiSEqC command pointed by ``argp`` to the antenna subsystem.
 
 Return Value
 ============
diff --git a/Documentation/media/uapi/dvb/fe-get-info.rst b/Documentation/media/uapi/dvb/fe-get-info.rst
index e3d64b251f61..30dde8a791f3 100644
--- a/Documentation/media/uapi/dvb/fe-get-info.rst
+++ b/Documentation/media/uapi/dvb/fe-get-info.rst
@@ -40,112 +40,6 @@ takes a pointer to dvb_frontend_info which is filled by the driver.
 When the driver is not compatible with this specification the ioctl
 returns an error.
 
-.. c:type:: dvb_frontend_info
-
-.. tabularcolumns:: |p{4.4cm}|p{4.4cm}|p{8.7cm}|
-
-.. flat-table:: struct dvb_frontend_info
-    :header-rows:  0
-    :stub-columns: 0
-    :widths:       1 1 2
-
-
-    -  .. row 1
-
-       -  char
-
-       -  name[128]
-
-       -  Name of the frontend
-
-    -  .. row 2
-
-       -  fe_type_t
-
-       -  type
-
-       -  **DEPRECATED**. DVBv3 type. Should not be used on modern programs,
-	  as a frontend may have more than one type. So, the DVBv5 API
-	  should be used instead to enumerate and select the frontend type.
-
-    -  .. row 3
-
-       -  uint32_t
-
-       -  frequency_min
-
-       -  Minimal frequency supported by the frontend
-
-    -  .. row 4
-
-       -  uint32_t
-
-       -  frequency_max
-
-       -  Maximal frequency supported by the frontend
-
-    -  .. row 5
-
-       -  uint32_t
-
-       -  frequency_stepsize
-
-       -  Frequency step - all frequencies are multiple of this value
-
-    -  .. row 6
-
-       -  uint32_t
-
-       -  frequency_tolerance
-
-       -  Tolerance of the frequency
-
-    -  .. row 7
-
-       -  uint32_t
-
-       -  symbol_rate_min
-
-       -  Minimal symbol rate (for Cable/Satellite systems), in bauds
-
-    -  .. row 8
-
-       -  uint32_t
-
-       -  symbol_rate_max
-
-       -  Maximal symbol rate (for Cable/Satellite systems), in bauds
-
-    -  .. row 9
-
-       -  uint32_t
-
-       -  symbol_rate_tolerance
-
-       -  Maximal symbol rate tolerance, in ppm
-
-    -  .. row 10
-
-       -  uint32_t
-
-       -  notifier_delay
-
-       -  **DEPRECATED**. Not used by any driver.
-
-    -  .. row 11
-
-       -  enum :c:type:`fe_caps`
-
-       -  caps
-
-       -  Capabilities supported by the frontend
-
-
-.. note::
-
-   The frequencies are specified in Hz for Terrestrial and Cable
-   systems. They're specified in kHz for Satellite systems
-
 
 frontend capabilities
 =====================
@@ -153,269 +47,7 @@ frontend capabilities
 Capabilities describe what a frontend can do. Some capabilities are
 supported only on some specific frontend types.
 
-.. c:type:: fe_caps
-
-.. tabularcolumns:: |p{6.5cm}|p{11.0cm}|
-
-.. flat-table:: enum fe_caps
-    :header-rows:  1
-    :stub-columns: 0
-
-
-    -  .. row 1
-
-       -  ID
-
-       -  Description
-
-    -  .. row 2
-
-       -  .. _FE-IS-STUPID:
-
-	  ``FE_IS_STUPID``
-
-       -  There's something wrong at the frontend, and it can't report its
-	  capabilities
-
-    -  .. row 3
-
-       -  .. _FE-CAN-INVERSION-AUTO:
-
-	  ``FE_CAN_INVERSION_AUTO``
-
-       -  The frontend is capable of auto-detecting inversion
-
-    -  .. row 4
-
-       -  .. _FE-CAN-FEC-1-2:
-
-	  ``FE_CAN_FEC_1_2``
-
-       -  The frontend supports FEC 1/2
-
-    -  .. row 5
-
-       -  .. _FE-CAN-FEC-2-3:
-
-	  ``FE_CAN_FEC_2_3``
-
-       -  The frontend supports FEC 2/3
-
-    -  .. row 6
-
-       -  .. _FE-CAN-FEC-3-4:
-
-	  ``FE_CAN_FEC_3_4``
-
-       -  The frontend supports FEC 3/4
-
-    -  .. row 7
-
-       -  .. _FE-CAN-FEC-4-5:
-
-	  ``FE_CAN_FEC_4_5``
-
-       -  The frontend supports FEC 4/5
-
-    -  .. row 8
-
-       -  .. _FE-CAN-FEC-5-6:
-
-	  ``FE_CAN_FEC_5_6``
-
-       -  The frontend supports FEC 5/6
-
-    -  .. row 9
-
-       -  .. _FE-CAN-FEC-6-7:
-
-	  ``FE_CAN_FEC_6_7``
-
-       -  The frontend supports FEC 6/7
-
-    -  .. row 10
-
-       -  .. _FE-CAN-FEC-7-8:
-
-	  ``FE_CAN_FEC_7_8``
-
-       -  The frontend supports FEC 7/8
-
-    -  .. row 11
-
-       -  .. _FE-CAN-FEC-8-9:
-
-	  ``FE_CAN_FEC_8_9``
-
-       -  The frontend supports FEC 8/9
-
-    -  .. row 12
-
-       -  .. _FE-CAN-FEC-AUTO:
-
-	  ``FE_CAN_FEC_AUTO``
-
-       -  The frontend can autodetect FEC.
-
-    -  .. row 13
-
-       -  .. _FE-CAN-QPSK:
-
-	  ``FE_CAN_QPSK``
-
-       -  The frontend supports QPSK modulation
-
-    -  .. row 14
-
-       -  .. _FE-CAN-QAM-16:
-
-	  ``FE_CAN_QAM_16``
-
-       -  The frontend supports 16-QAM modulation
-
-    -  .. row 15
-
-       -  .. _FE-CAN-QAM-32:
-
-	  ``FE_CAN_QAM_32``
-
-       -  The frontend supports 32-QAM modulation
-
-    -  .. row 16
-
-       -  .. _FE-CAN-QAM-64:
-
-	  ``FE_CAN_QAM_64``
-
-       -  The frontend supports 64-QAM modulation
-
-    -  .. row 17
-
-       -  .. _FE-CAN-QAM-128:
-
-	  ``FE_CAN_QAM_128``
-
-       -  The frontend supports 128-QAM modulation
-
-    -  .. row 18
-
-       -  .. _FE-CAN-QAM-256:
-
-	  ``FE_CAN_QAM_256``
-
-       -  The frontend supports 256-QAM modulation
-
-    -  .. row 19
-
-       -  .. _FE-CAN-QAM-AUTO:
-
-	  ``FE_CAN_QAM_AUTO``
-
-       -  The frontend can autodetect modulation
-
-    -  .. row 20
-
-       -  .. _FE-CAN-TRANSMISSION-MODE-AUTO:
-
-	  ``FE_CAN_TRANSMISSION_MODE_AUTO``
-
-       -  The frontend can autodetect the transmission mode
-
-    -  .. row 21
-
-       -  .. _FE-CAN-BANDWIDTH-AUTO:
-
-	  ``FE_CAN_BANDWIDTH_AUTO``
-
-       -  The frontend can autodetect the bandwidth
-
-    -  .. row 22
-
-       -  .. _FE-CAN-GUARD-INTERVAL-AUTO:
-
-	  ``FE_CAN_GUARD_INTERVAL_AUTO``
-
-       -  The frontend can autodetect the guard interval
-
-    -  .. row 23
-
-       -  .. _FE-CAN-HIERARCHY-AUTO:
-
-	  ``FE_CAN_HIERARCHY_AUTO``
-
-       -  The frontend can autodetect hierarch
-
-    -  .. row 24
-
-       -  .. _FE-CAN-8VSB:
-
-	  ``FE_CAN_8VSB``
-
-       -  The frontend supports 8-VSB modulation
-
-    -  .. row 25
-
-       -  .. _FE-CAN-16VSB:
-
-	  ``FE_CAN_16VSB``
-
-       -  The frontend supports 16-VSB modulation
-
-    -  .. row 26
-
-       -  .. _FE-HAS-EXTENDED-CAPS:
-
-	  ``FE_HAS_EXTENDED_CAPS``
-
-       -  Currently, unused
-
-    -  .. row 27
-
-       -  .. _FE-CAN-MULTISTREAM:
-
-	  ``FE_CAN_MULTISTREAM``
-
-       -  The frontend supports multistream filtering
-
-    -  .. row 28
-
-       -  .. _FE-CAN-TURBO-FEC:
-
-	  ``FE_CAN_TURBO_FEC``
-
-       -  The frontend supports turbo FEC modulation
-
-    -  .. row 29
-
-       -  .. _FE-CAN-2G-MODULATION:
-
-	  ``FE_CAN_2G_MODULATION``
-
-       -  The frontend supports "2nd generation modulation" (DVB-S2/T2)>
-
-    -  .. row 30
-
-       -  .. _FE-NEEDS-BENDING:
-
-	  ``FE_NEEDS_BENDING``
-
-       -  Not supported anymore, don't use it
-
-    -  .. row 31
-
-       -  .. _FE-CAN-RECOVER:
-
-	  ``FE_CAN_RECOVER``
-
-       -  The frontend can recover from a cable unplug automatically
-
-    -  .. row 32
-
-       -  .. _FE-CAN-MUTE-TS:
-
-	  ``FE_CAN_MUTE_TS``
-
-       -  The frontend can stop spurious TS data output
+The frontend capabilities are described at :c:type:`fe_caps`.
 
 
 Return Value
diff --git a/Documentation/media/uapi/dvb/fe-get-property.rst b/Documentation/media/uapi/dvb/fe-get-property.rst
index 015d4db597b5..b22e37c4a787 100644
--- a/Documentation/media/uapi/dvb/fe-get-property.rst
+++ b/Documentation/media/uapi/dvb/fe-get-property.rst
@@ -29,7 +29,7 @@ Arguments
     File descriptor returned by :ref:`open() <frontend_f_open>`.
 
 ``argp``
-    pointer to struct :c:type:`dtv_properties`
+    Pointer to struct :c:type:`dtv_properties`.
 
 
 Description
diff --git a/Documentation/media/uapi/dvb/fe-read-status.rst b/Documentation/media/uapi/dvb/fe-read-status.rst
index 76b93e386552..21b2db3591fd 100644
--- a/Documentation/media/uapi/dvb/fe-read-status.rst
+++ b/Documentation/media/uapi/dvb/fe-read-status.rst
@@ -52,89 +52,6 @@ The fe_status parameter is used to indicate the current state and/or
 state changes of the frontend hardware. It is produced using the enum
 :c:type:`fe_status` values on a bitmask
 
-.. c:type:: fe_status
-
-.. tabularcolumns:: |p{3.5cm}|p{14.0cm}|
-
-.. _fe-status:
-
-.. flat-table:: enum fe_status
-    :header-rows:  1
-    :stub-columns: 0
-
-
-    -  .. row 1
-
-       -  ID
-
-       -  Description
-
-    -  .. row 2
-
-       -  .. _FE-NONE:
-
-	  ``FE_NONE``
-
-       -  The frontend doesn't have any kind of lock. That's the initial frontend status
-
-    -  .. row 3
-
-       -  .. _FE-HAS-SIGNAL:
-
-	  ``FE_HAS_SIGNAL``
-
-       -  The frontend has found something above the noise level
-
-    -  .. row 4
-
-       -  .. _FE-HAS-CARRIER:
-
-	  ``FE_HAS_CARRIER``
-
-       -  The frontend has found a DVB signal
-
-    -  .. row 5
-
-       -  .. _FE-HAS-VITERBI:
-
-	  ``FE_HAS_VITERBI``
-
-       -  The frontend FEC inner coding (Viterbi, LDPC or other inner code)
-	  is stable
-
-    -  .. row 6
-
-       -  .. _FE-HAS-SYNC:
-
-	  ``FE_HAS_SYNC``
-
-       -  Synchronization bytes was found
-
-    -  .. row 7
-
-       -  .. _FE-HAS-LOCK:
-
-	  ``FE_HAS_LOCK``
-
-       -  The DVB were locked and everything is working
-
-    -  .. row 8
-
-       -  .. _FE-TIMEDOUT:
-
-	  ``FE_TIMEDOUT``
-
-       -  no lock within the last about 2 seconds
-
-    -  .. row 9
-
-       -  .. _FE-REINIT:
-
-	  ``FE_REINIT``
-
-       -  The frontend was reinitialized, application is recommended to
-	  reset DiSEqC, tone and parameters
-
 
 Return Value
 ============
diff --git a/Documentation/media/uapi/dvb/fe-set-tone.rst b/Documentation/media/uapi/dvb/fe-set-tone.rst
index 84e4da3fd4c9..d0950acbbe64 100644
--- a/Documentation/media/uapi/dvb/fe-set-tone.rst
+++ b/Documentation/media/uapi/dvb/fe-set-tone.rst
@@ -45,36 +45,6 @@ this is done using the DiSEqC ioctls.
    capability of selecting the band. So, it is recommended that applications
    would change to SEC_TONE_OFF when the device is not used.
 
-.. c:type:: fe_sec_tone_mode
-
-.. flat-table:: enum fe_sec_tone_mode
-    :header-rows:  1
-    :stub-columns: 0
-
-
-    -  .. row 1
-
-       -  ID
-
-       -  Description
-
-    -  .. row 2
-
-       -  .. _SEC-TONE-ON:
-
-	  ``SEC_TONE_ON``
-
-       -  Sends a 22kHz tone burst to the antenna
-
-    -  .. row 3
-
-       -  .. _SEC-TONE-OFF:
-
-	  ``SEC_TONE_OFF``
-
-       -  Don't send a 22kHz tone to the antenna (except if the
-	  FE_DISEQC_* ioctls are called)
-
 
 Return Value
 ============
diff --git a/Documentation/media/uapi/dvb/fe_property_parameters.rst b/Documentation/media/uapi/dvb/fe_property_parameters.rst
index 7bb7559c4500..c6eb74f59b00 100644
--- a/Documentation/media/uapi/dvb/fe_property_parameters.rst
+++ b/Documentation/media/uapi/dvb/fe_property_parameters.rst
@@ -6,6 +6,11 @@
 Digital TV property parameters
 ******************************
 
+There are several different Digital TV parameters that can be used by
+:ref:`FE_SET_PROPERTY and FE_GET_PROPERTY ioctls<FE_GET_PROPERTY>`.
+This section describes each of them. Please notice, however, that only
+a subset of them are needed to setup a frontend.
+
 
 .. _DTV-UNDEFINED:
 
@@ -67,144 +72,36 @@ DTV_MODULATION
 ==============
 
 Specifies the frontend modulation type for delivery systems that
-supports more than one modulation type. The modulation can be one of the
-types defined by enum :c:type:`fe_modulation`.
-
-
-.. c:type:: fe_modulation
-
-Modulation property
--------------------
-
-Most of the digital TV standards currently offers more than one possible
-modulation (sometimes called as "constellation" on some standards). This
-enum contains the values used by the Kernel. Please note that not all
-modulations are supported by a given standard.
-
-
-.. flat-table:: enum fe_modulation
-    :header-rows:  1
-    :stub-columns: 0
-
-
-    -  .. row 1
-
-       -  ID
-
-       -  Description
-
-    -  .. row 2
-
-       -  .. _QPSK:
-
-	  ``QPSK``
-
-       -  QPSK modulation
-
-    -  .. row 3
-
-       -  .. _QAM-16:
-
-	  ``QAM_16``
-
-       -  16-QAM modulation
-
-    -  .. row 4
-
-       -  .. _QAM-32:
-
-	  ``QAM_32``
-
-       -  32-QAM modulation
-
-    -  .. row 5
-
-       -  .. _QAM-64:
-
-	  ``QAM_64``
-
-       -  64-QAM modulation
-
-    -  .. row 6
-
-       -  .. _QAM-128:
-
-	  ``QAM_128``
-
-       -  128-QAM modulation
-
-    -  .. row 7
-
-       -  .. _QAM-256:
-
-	  ``QAM_256``
-
-       -  256-QAM modulation
-
-    -  .. row 8
-
-       -  .. _QAM-AUTO:
-
-	  ``QAM_AUTO``
-
-       -  Autodetect QAM modulation
-
-    -  .. row 9
-
-       -  .. _VSB-8:
-
-	  ``VSB_8``
-
-       -  8-VSB modulation
-
-    -  .. row 10
-
-       -  .. _VSB-16:
-
-	  ``VSB_16``
-
-       -  16-VSB modulation
-
-    -  .. row 11
-
-       -  .. _PSK-8:
-
-	  ``PSK_8``
-
-       -  8-PSK modulation
-
-    -  .. row 12
-
-       -  .. _APSK-16:
-
-	  ``APSK_16``
-
-       -  16-APSK modulation
-
-    -  .. row 13
-
-       -  .. _APSK-32:
+supports more multiple modulations.
+
+The modulation can be one of the types defined by enum :c:type:`fe_modulation`.
+
+Most of the digital TV standards offers more than one possible
+modulation type.
+
+The table below presents a summary of the types of modulation types
+supported by each delivery system, as currently defined by specs.
+
+======================= =======================================================
+Standard		Modulation types
+======================= =======================================================
+ATSC (version 1)	8-VSB and 16-VSB.
+DMTB			4-QAM, 16-QAM, 32-QAM, 64-QAM and 4-QAM-NR.
+DVB-C Annex A/C		16-QAM, 32-QAM, 64-QAM and 256-QAM.
+DVB-C Annex B		64-QAM.
+DVB-T			QPSK, 16-QAM and 64-QAM.
+DVB-T2			QPSK, 16-QAM, 64-QAM and 256-QAM.
+DVB-S			No need to set. It supports only QPSK.
+DVB-S2			QPSK, 8-PSK, 16-APSK and 32-APSK.
+ISDB-T			QPSK, DQPSK, 16-QAM and 64-QAM.
+ISDB-S			8-PSK, QPSK and BPSK.
+======================= =======================================================
 
-	  ``APSK_32``
-
-       -  32-APSK modulation
-
-    -  .. row 14
-
-       -  .. _DQPSK:
-
-	  ``DQPSK``
-
-       -  DQPSK modulation
-
-    -  .. row 15
-
-       -  .. _QAM-4-NR:
-
-	  ``QAM_4_NR``
-
-       -  4-QAM-NR modulation
+.. note::
 
+   Please notice that some of the above modulation types may not be
+   defined currently at the Kernel. The reason is simple: no driver
+   needed such definition yet.
 
 
 .. _DTV-BANDWIDTH-HZ:
@@ -249,54 +146,7 @@ DTV_INVERSION
 
 Specifies if the frontend should do spectral inversion or not.
 
-.. c:type:: fe_spectral_inversion
-
-enum fe_modulation: Frontend spectral inversion
------------------------------------------------
-
-This parameter indicates if spectral inversion should be presumed or
-not. In the automatic setting (``INVERSION_AUTO``) the hardware will try
-to figure out the correct setting by itself. If the hardware doesn't
-support, the DVB core will try to lock at the carrier first with
-inversion off. If it fails, it will try to enable inversion.
-
-
-.. flat-table:: enum fe_modulation
-    :header-rows:  1
-    :stub-columns: 0
-
-
-    -  .. row 1
-
-       -  ID
-
-       -  Description
-
-    -  .. row 2
-
-       -  .. _INVERSION-OFF:
-
-	  ``INVERSION_OFF``
-
-       -  Don't do spectral band inversion.
-
-    -  .. row 3
-
-       -  .. _INVERSION-ON:
-
-	  ``INVERSION_ON``
-
-       -  Do spectral band inversion.
-
-    -  .. row 4
-
-       -  .. _INVERSION-AUTO:
-
-	  ``INVERSION_AUTO``
-
-       -  Autodetect spectral band inversion.
-
-
+The acceptable values are defined by :c:type:`fe_spectral_inversion`.
 
 .. _DTV-DISEQC-MASTER:
 
@@ -320,1525 +170,671 @@ standards.
 DTV_INNER_FEC
 =============
 
-Used cable/satellite transmissions. The acceptable values are:
+Used cable/satellite transmissions.
 
-.. c:type:: fe_code_rate
+The acceptable values are defined by :c:type:`fe_code_rate`.
 
-enum fe_code_rate: type of the Forward Error Correction.
---------------------------------------------------------
 
-.. flat-table:: enum fe_code_rate
-    :header-rows:  1
-    :stub-columns: 0
+.. _DTV-VOLTAGE:
 
+DTV_VOLTAGE
+===========
 
-    -  .. row 1
+The voltage is usually used with non-DiSEqC capable LNBs to switch the
+polarzation (horizontal/vertical). When using DiSEqC epuipment this
+voltage has to be switched consistently to the DiSEqC commands as
+described in the DiSEqC spec.
 
-       -  ID
+The acceptable values are defined by :c:type:`fe_sec_voltage`.
 
-       -  Description
 
-    -  .. row 2
+.. _DTV-TONE:
 
-       -  .. _FEC-NONE:
+DTV_TONE
+========
 
-	  ``FEC_NONE``
+Currently not used.
 
-       -  No Forward Error Correction Code
 
-    -  .. row 3
+.. _DTV-PILOT:
 
-       -  .. _FEC-AUTO:
+DTV_PILOT
+=========
 
-	  ``FEC_AUTO``
+Sets DVB-S2 pilot.
 
-       -  Autodetect Error Correction Code
+The acceptable values are defined by :c:type:`fe_pilot`.
 
-    -  .. row 4
 
-       -  .. _FEC-1-2:
+.. _DTV-ROLLOFF:
 
-	  ``FEC_1_2``
+DTV_ROLLOFF
+===========
 
-       -  Forward Error Correction Code 1/2
+Sets DVB-S2 rolloff
 
-    -  .. row 5
+The acceptable values are defined by :c:type:`fe_rolloff`.
 
-       -  .. _FEC-2-3:
 
-	  ``FEC_2_3``
+.. _DTV-DISEQC-SLAVE-REPLY:
 
-       -  Forward Error Correction Code 2/3
+DTV_DISEQC_SLAVE_REPLY
+======================
 
-    -  .. row 6
+Currently not implemented.
 
-       -  .. _FEC-3-4:
 
-	  ``FEC_3_4``
+.. _DTV-FE-CAPABILITY-COUNT:
 
-       -  Forward Error Correction Code 3/4
+DTV_FE_CAPABILITY_COUNT
+=======================
 
-    -  .. row 7
+Currently not implemented.
 
-       -  .. _FEC-4-5:
 
-	  ``FEC_4_5``
+.. _DTV-FE-CAPABILITY:
 
-       -  Forward Error Correction Code 4/5
+DTV_FE_CAPABILITY
+=================
 
-    -  .. row 8
+Currently not implemented.
 
-       -  .. _FEC-5-6:
 
-	  ``FEC_5_6``
+.. _DTV-DELIVERY-SYSTEM:
 
-       -  Forward Error Correction Code 5/6
+DTV_DELIVERY_SYSTEM
+===================
 
-    -  .. row 9
+Specifies the type of Delivery system.
 
-       -  .. _FEC-6-7:
+The acceptable values are defined by :c:type:`fe_delivery_system`.
 
-	  ``FEC_6_7``
 
-       -  Forward Error Correction Code 6/7
+.. _DTV-ISDBT-PARTIAL-RECEPTION:
 
-    -  .. row 10
+DTV_ISDBT_PARTIAL_RECEPTION
+===========================
 
-       -  .. _FEC-7-8:
+If ``DTV_ISDBT_SOUND_BROADCASTING`` is '0' this bit-field represents
+whether the channel is in partial reception mode or not.
 
-	  ``FEC_7_8``
+If '1' ``DTV_ISDBT_LAYERA_*`` values are assigned to the center segment
+and ``DTV_ISDBT_LAYERA_SEGMENT_COUNT`` has to be '1'.
 
-       -  Forward Error Correction Code 7/8
+If in addition ``DTV_ISDBT_SOUND_BROADCASTING`` is '1'
+``DTV_ISDBT_PARTIAL_RECEPTION`` represents whether this ISDB-Tsb channel
+is consisting of one segment and layer or three segments and two layers.
 
-    -  .. row 11
+Possible values: 0, 1, -1 (AUTO)
 
-       -  .. _FEC-8-9:
 
-	  ``FEC_8_9``
+.. _DTV-ISDBT-SOUND-BROADCASTING:
 
-       -  Forward Error Correction Code 8/9
+DTV_ISDBT_SOUND_BROADCASTING
+============================
 
-    -  .. row 12
+This field represents whether the other DTV_ISDBT_*-parameters are
+referring to an ISDB-T and an ISDB-Tsb channel. (See also
+``DTV_ISDBT_PARTIAL_RECEPTION``).
 
-       -  .. _FEC-9-10:
+Possible values: 0, 1, -1 (AUTO)
 
-	  ``FEC_9_10``
 
-       -  Forward Error Correction Code 9/10
+.. _DTV-ISDBT-SB-SUBCHANNEL-ID:
 
-    -  .. row 13
+DTV_ISDBT_SB_SUBCHANNEL_ID
+==========================
 
-       -  .. _FEC-2-5:
+This field only applies if ``DTV_ISDBT_SOUND_BROADCASTING`` is '1'.
 
-	  ``FEC_2_5``
+(Note of the author: This might not be the correct description of the
+``SUBCHANNEL-ID`` in all details, but it is my understanding of the
+technical background needed to program a device)
 
-       -  Forward Error Correction Code 2/5
+An ISDB-Tsb channel (1 or 3 segments) can be broadcasted alone or in a
+set of connected ISDB-Tsb channels. In this set of channels every
+channel can be received independently. The number of connected ISDB-Tsb
+segment can vary, e.g. depending on the frequency spectrum bandwidth
+available.
 
-    -  .. row 14
+Example: Assume 8 ISDB-Tsb connected segments are broadcasted. The
+broadcaster has several possibilities to put those channels in the air:
+Assuming a normal 13-segment ISDB-T spectrum he can align the 8 segments
+from position 1-8 to 5-13 or anything in between.
 
-       -  .. _FEC-3-5:
+The underlying layer of segments are subchannels: each segment is
+consisting of several subchannels with a predefined IDs. A sub-channel
+is used to help the demodulator to synchronize on the channel.
 
-	  ``FEC_3_5``
+An ISDB-T channel is always centered over all sub-channels. As for the
+example above, in ISDB-Tsb it is no longer as simple as that.
 
-       -  Forward Error Correction Code 3/5
+``The DTV_ISDBT_SB_SUBCHANNEL_ID`` parameter is used to give the
+sub-channel ID of the segment to be demodulated.
 
+Possible values: 0 .. 41, -1 (AUTO)
 
 
-.. _DTV-VOLTAGE:
+.. _DTV-ISDBT-SB-SEGMENT-IDX:
 
-DTV_VOLTAGE
-===========
+DTV_ISDBT_SB_SEGMENT_IDX
+========================
 
-The voltage is usually used with non-DiSEqC capable LNBs to switch the
-polarzation (horizontal/vertical). When using DiSEqC epuipment this
-voltage has to be switched consistently to the DiSEqC commands as
-described in the DiSEqC spec.
+This field only applies if ``DTV_ISDBT_SOUND_BROADCASTING`` is '1'.
 
+``DTV_ISDBT_SB_SEGMENT_IDX`` gives the index of the segment to be
+demodulated for an ISDB-Tsb channel where several of them are
+transmitted in the connected manner.
 
-.. c:type:: fe_sec_voltage
+Possible values: 0 .. ``DTV_ISDBT_SB_SEGMENT_COUNT`` - 1
 
-.. flat-table:: enum fe_sec_voltage
-    :header-rows:  1
-    :stub-columns: 0
+Note: This value cannot be determined by an automatic channel search.
 
 
-    -  .. row 1
+.. _DTV-ISDBT-SB-SEGMENT-COUNT:
 
-       -  ID
+DTV_ISDBT_SB_SEGMENT_COUNT
+==========================
 
-       -  Description
+This field only applies if ``DTV_ISDBT_SOUND_BROADCASTING`` is '1'.
 
-    -  .. row 2
+``DTV_ISDBT_SB_SEGMENT_COUNT`` gives the total count of connected
+ISDB-Tsb channels.
 
-       -  .. _SEC-VOLTAGE-13:
+Possible values: 1 .. 13
 
-	  ``SEC_VOLTAGE_13``
+Note: This value cannot be determined by an automatic channel search.
 
-       -  Set DC voltage level to 13V
 
-    -  .. row 3
+.. _isdb-hierq-layers:
 
-       -  .. _SEC-VOLTAGE-18:
+DTV-ISDBT-LAYER[A-C] parameters
+===============================
 
-	  ``SEC_VOLTAGE_18``
+ISDB-T channels can be coded hierarchically. As opposed to DVB-T in
+ISDB-T hierarchical layers can be decoded simultaneously. For that
+reason a ISDB-T demodulator has 3 Viterbi and 3 Reed-Solomon decoders.
 
-       -  Set DC voltage level to 18V
+ISDB-T has 3 hierarchical layers which each can use a part of the
+available segments. The total number of segments over all layers has to
+13 in ISDB-T.
 
-    -  .. row 4
+There are 3 parameter sets, for Layers A, B and C.
 
-       -  .. _SEC-VOLTAGE-OFF:
 
-	  ``SEC_VOLTAGE_OFF``
+.. _DTV-ISDBT-LAYER-ENABLED:
 
-       -  Don't send any voltage to the antenna
+DTV_ISDBT_LAYER_ENABLED
+-----------------------
 
+Hierarchical reception in ISDB-T is achieved by enabling or disabling
+layers in the decoding process. Setting all bits of
+``DTV_ISDBT_LAYER_ENABLED`` to '1' forces all layers (if applicable) to
+be demodulated. This is the default.
 
+If the channel is in the partial reception mode
+(``DTV_ISDBT_PARTIAL_RECEPTION`` = 1) the central segment can be decoded
+independently of the other 12 segments. In that mode layer A has to have
+a ``SEGMENT_COUNT`` of 1.
 
-.. _DTV-TONE:
+In ISDB-Tsb only layer A is used, it can be 1 or 3 in ISDB-Tsb according
+to ``DTV_ISDBT_PARTIAL_RECEPTION``. ``SEGMENT_COUNT`` must be filled
+accordingly.
 
-DTV_TONE
-========
+Only the values of the first 3 bits are used. Other bits will be silently ignored:
 
-Currently not used.
+``DTV_ISDBT_LAYER_ENABLED`` bit 0: layer A enabled
 
+``DTV_ISDBT_LAYER_ENABLED`` bit 1: layer B enabled
 
-.. _DTV-PILOT:
+``DTV_ISDBT_LAYER_ENABLED`` bit 2: layer C enabled
 
-DTV_PILOT
-=========
+``DTV_ISDBT_LAYER_ENABLED`` bits 3-31: unused
 
-Sets DVB-S2 pilot
 
+.. _DTV-ISDBT-LAYER-FEC:
 
-.. c:type:: fe_pilot
+DTV_ISDBT_LAYER[A-C]_FEC
+------------------------
 
-fe_pilot type
--------------
+The Forward Error Correction mechanism used by a given ISDB Layer, as
+defined by :c:type:`fe_code_rate`.
 
 
-.. flat-table:: enum fe_pilot
-    :header-rows:  1
-    :stub-columns: 0
+Possible values are: ``FEC_AUTO``, ``FEC_1_2``, ``FEC_2_3``, ``FEC_3_4``,
+``FEC_5_6``, ``FEC_7_8``
 
 
-    -  .. row 1
-
-       -  ID
-
-       -  Description
-
-    -  .. row 2
-
-       -  .. _PILOT-ON:
-
-	  ``PILOT_ON``
-
-       -  Pilot tones enabled
-
-    -  .. row 3
-
-       -  .. _PILOT-OFF:
-
-	  ``PILOT_OFF``
-
-       -  Pilot tones disabled
-
-    -  .. row 4
-
-       -  .. _PILOT-AUTO:
-
-	  ``PILOT_AUTO``
-
-       -  Autodetect pilot tones
-
-
-
-.. _DTV-ROLLOFF:
-
-DTV_ROLLOFF
-===========
-
-Sets DVB-S2 rolloff
-
-
-.. c:type:: fe_rolloff
-
-fe_rolloff type
----------------
-
-
-.. flat-table:: enum fe_rolloff
-    :header-rows:  1
-    :stub-columns: 0
-
-
-    -  .. row 1
-
-       -  ID
-
-       -  Description
-
-    -  .. row 2
-
-       -  .. _ROLLOFF-35:
-
-	  ``ROLLOFF_35``
-
-       -  Roloff factor: α=35%
-
-    -  .. row 3
-
-       -  .. _ROLLOFF-20:
-
-	  ``ROLLOFF_20``
-
-       -  Roloff factor: α=20%
-
-    -  .. row 4
-
-       -  .. _ROLLOFF-25:
-
-	  ``ROLLOFF_25``
-
-       -  Roloff factor: α=25%
-
-    -  .. row 5
-
-       -  .. _ROLLOFF-AUTO:
-
-	  ``ROLLOFF_AUTO``
-
-       -  Auto-detect the roloff factor.
-
-
-
-.. _DTV-DISEQC-SLAVE-REPLY:
-
-DTV_DISEQC_SLAVE_REPLY
-======================
-
-Currently not implemented.
-
-
-.. _DTV-FE-CAPABILITY-COUNT:
-
-DTV_FE_CAPABILITY_COUNT
-=======================
-
-Currently not implemented.
-
-
-.. _DTV-FE-CAPABILITY:
-
-DTV_FE_CAPABILITY
-=================
-
-Currently not implemented.
-
-
-.. _DTV-DELIVERY-SYSTEM:
-
-DTV_DELIVERY_SYSTEM
-===================
-
-Specifies the type of Delivery system
-
-
-.. c:type:: fe_delivery_system
-
-fe_delivery_system type
------------------------
-
-Possible values:
-
-
-.. flat-table:: enum fe_delivery_system
-    :header-rows:  1
-    :stub-columns: 0
-
-
-    -  .. row 1
-
-       -  ID
-
-       -  Description
-
-    -  .. row 2
-
-       -  .. _SYS-UNDEFINED:
-
-	  ``SYS_UNDEFINED``
-
-       -  Undefined standard. Generally, indicates an error
-
-    -  .. row 3
-
-       -  .. _SYS-DVBC-ANNEX-A:
-
-	  ``SYS_DVBC_ANNEX_A``
-
-       -  Cable TV: DVB-C following ITU-T J.83 Annex A spec
-
-    -  .. row 4
-
-       -  .. _SYS-DVBC-ANNEX-B:
-
-	  ``SYS_DVBC_ANNEX_B``
-
-       -  Cable TV: DVB-C following ITU-T J.83 Annex B spec (ClearQAM)
-
-    -  .. row 5
-
-       -  .. _SYS-DVBC-ANNEX-C:
-
-	  ``SYS_DVBC_ANNEX_C``
-
-       -  Cable TV: DVB-C following ITU-T J.83 Annex C spec
-
-    -  .. row 6
-
-       -  .. _SYS-ISDBC:
-
-	  ``SYS_ISDBC``
-
-       -  Cable TV: ISDB-C (no drivers yet)
-
-    -  .. row 7
-
-       -  .. _SYS-DVBT:
-
-	  ``SYS_DVBT``
-
-       -  Terrestral TV: DVB-T
-
-    -  .. row 8
-
-       -  .. _SYS-DVBT2:
-
-	  ``SYS_DVBT2``
-
-       -  Terrestral TV: DVB-T2
-
-    -  .. row 9
-
-       -  .. _SYS-ISDBT:
-
-	  ``SYS_ISDBT``
-
-       -  Terrestral TV: ISDB-T
-
-    -  .. row 10
-
-       -  .. _SYS-ATSC:
-
-	  ``SYS_ATSC``
-
-       -  Terrestral TV: ATSC
-
-    -  .. row 11
-
-       -  .. _SYS-ATSCMH:
-
-	  ``SYS_ATSCMH``
-
-       -  Terrestral TV (mobile): ATSC-M/H
-
-    -  .. row 12
-
-       -  .. _SYS-DTMB:
-
-	  ``SYS_DTMB``
-
-       -  Terrestrial TV: DTMB
-
-    -  .. row 13
-
-       -  .. _SYS-DVBS:
-
-	  ``SYS_DVBS``
-
-       -  Satellite TV: DVB-S
-
-    -  .. row 14
-
-       -  .. _SYS-DVBS2:
-
-	  ``SYS_DVBS2``
-
-       -  Satellite TV: DVB-S2
-
-    -  .. row 15
-
-       -  .. _SYS-TURBO:
-
-	  ``SYS_TURBO``
-
-       -  Satellite TV: DVB-S Turbo
-
-    -  .. row 16
-
-       -  .. _SYS-ISDBS:
-
-	  ``SYS_ISDBS``
-
-       -  Satellite TV: ISDB-S
-
-    -  .. row 17
-
-       -  .. _SYS-DAB:
-
-	  ``SYS_DAB``
-
-       -  Digital audio: DAB (not fully supported)
-
-    -  .. row 18
-
-       -  .. _SYS-DSS:
-
-	  ``SYS_DSS``
-
-       -  Satellite TV:"DSS (not fully supported)
-
-    -  .. row 19
-
-       -  .. _SYS-CMMB:
-
-	  ``SYS_CMMB``
-
-       -  Terrestral TV (mobile):CMMB (not fully supported)
-
-    -  .. row 20
-
-       -  .. _SYS-DVBH:
-
-	  ``SYS_DVBH``
-
-       -  Terrestral TV (mobile): DVB-H (standard deprecated)
-
-
-
-.. _DTV-ISDBT-PARTIAL-RECEPTION:
-
-DTV_ISDBT_PARTIAL_RECEPTION
-===========================
-
-If ``DTV_ISDBT_SOUND_BROADCASTING`` is '0' this bit-field represents
-whether the channel is in partial reception mode or not.
-
-If '1' ``DTV_ISDBT_LAYERA_*`` values are assigned to the center segment
-and ``DTV_ISDBT_LAYERA_SEGMENT_COUNT`` has to be '1'.
-
-If in addition ``DTV_ISDBT_SOUND_BROADCASTING`` is '1'
-``DTV_ISDBT_PARTIAL_RECEPTION`` represents whether this ISDB-Tsb channel
-is consisting of one segment and layer or three segments and two layers.
-
-Possible values: 0, 1, -1 (AUTO)
-
-
-.. _DTV-ISDBT-SOUND-BROADCASTING:
-
-DTV_ISDBT_SOUND_BROADCASTING
-============================
-
-This field represents whether the other DTV_ISDBT_*-parameters are
-referring to an ISDB-T and an ISDB-Tsb channel. (See also
-``DTV_ISDBT_PARTIAL_RECEPTION``).
-
-Possible values: 0, 1, -1 (AUTO)
-
-
-.. _DTV-ISDBT-SB-SUBCHANNEL-ID:
-
-DTV_ISDBT_SB_SUBCHANNEL_ID
-==========================
-
-This field only applies if ``DTV_ISDBT_SOUND_BROADCASTING`` is '1'.
-
-(Note of the author: This might not be the correct description of the
-``SUBCHANNEL-ID`` in all details, but it is my understanding of the
-technical background needed to program a device)
-
-An ISDB-Tsb channel (1 or 3 segments) can be broadcasted alone or in a
-set of connected ISDB-Tsb channels. In this set of channels every
-channel can be received independently. The number of connected ISDB-Tsb
-segment can vary, e.g. depending on the frequency spectrum bandwidth
-available.
-
-Example: Assume 8 ISDB-Tsb connected segments are broadcasted. The
-broadcaster has several possibilities to put those channels in the air:
-Assuming a normal 13-segment ISDB-T spectrum he can align the 8 segments
-from position 1-8 to 5-13 or anything in between.
-
-The underlying layer of segments are subchannels: each segment is
-consisting of several subchannels with a predefined IDs. A sub-channel
-is used to help the demodulator to synchronize on the channel.
-
-An ISDB-T channel is always centered over all sub-channels. As for the
-example above, in ISDB-Tsb it is no longer as simple as that.
-
-``The DTV_ISDBT_SB_SUBCHANNEL_ID`` parameter is used to give the
-sub-channel ID of the segment to be demodulated.
-
-Possible values: 0 .. 41, -1 (AUTO)
-
-
-.. _DTV-ISDBT-SB-SEGMENT-IDX:
-
-DTV_ISDBT_SB_SEGMENT_IDX
-========================
-
-This field only applies if ``DTV_ISDBT_SOUND_BROADCASTING`` is '1'.
-
-``DTV_ISDBT_SB_SEGMENT_IDX`` gives the index of the segment to be
-demodulated for an ISDB-Tsb channel where several of them are
-transmitted in the connected manner.
-
-Possible values: 0 .. ``DTV_ISDBT_SB_SEGMENT_COUNT`` - 1
-
-Note: This value cannot be determined by an automatic channel search.
-
-
-.. _DTV-ISDBT-SB-SEGMENT-COUNT:
-
-DTV_ISDBT_SB_SEGMENT_COUNT
-==========================
-
-This field only applies if ``DTV_ISDBT_SOUND_BROADCASTING`` is '1'.
-
-``DTV_ISDBT_SB_SEGMENT_COUNT`` gives the total count of connected
-ISDB-Tsb channels.
-
-Possible values: 1 .. 13
-
-Note: This value cannot be determined by an automatic channel search.
-
-
-.. _isdb-hierq-layers:
-
-DTV-ISDBT-LAYER[A-C] parameters
-===============================
-
-ISDB-T channels can be coded hierarchically. As opposed to DVB-T in
-ISDB-T hierarchical layers can be decoded simultaneously. For that
-reason a ISDB-T demodulator has 3 Viterbi and 3 Reed-Solomon decoders.
-
-ISDB-T has 3 hierarchical layers which each can use a part of the
-available segments. The total number of segments over all layers has to
-13 in ISDB-T.
-
-There are 3 parameter sets, for Layers A, B and C.
-
-
-.. _DTV-ISDBT-LAYER-ENABLED:
-
-DTV_ISDBT_LAYER_ENABLED
------------------------
-
-Hierarchical reception in ISDB-T is achieved by enabling or disabling
-layers in the decoding process. Setting all bits of
-``DTV_ISDBT_LAYER_ENABLED`` to '1' forces all layers (if applicable) to
-be demodulated. This is the default.
-
-If the channel is in the partial reception mode
-(``DTV_ISDBT_PARTIAL_RECEPTION`` = 1) the central segment can be decoded
-independently of the other 12 segments. In that mode layer A has to have
-a ``SEGMENT_COUNT`` of 1.
-
-In ISDB-Tsb only layer A is used, it can be 1 or 3 in ISDB-Tsb according
-to ``DTV_ISDBT_PARTIAL_RECEPTION``. ``SEGMENT_COUNT`` must be filled
-accordingly.
-
-Only the values of the first 3 bits are used. Other bits will be silently ignored:
-
-``DTV_ISDBT_LAYER_ENABLED`` bit 0: layer A enabled
-
-``DTV_ISDBT_LAYER_ENABLED`` bit 1: layer B enabled
-
-``DTV_ISDBT_LAYER_ENABLED`` bit 2: layer C enabled
-
-``DTV_ISDBT_LAYER_ENABLED`` bits 3-31: unused
-
-
-.. _DTV-ISDBT-LAYER-FEC:
-
-DTV_ISDBT_LAYER[A-C]_FEC
-------------------------
-
-Possible values: ``FEC_AUTO``, ``FEC_1_2``, ``FEC_2_3``, ``FEC_3_4``,
-``FEC_5_6``, ``FEC_7_8``
-
-
-.. _DTV-ISDBT-LAYER-MODULATION:
-
-DTV_ISDBT_LAYER[A-C]_MODULATION
--------------------------------
-
-Possible values: ``QAM_AUTO``, QP\ ``SK, QAM_16``, ``QAM_64``, ``DQPSK``
-
-Note: If layer C is ``DQPSK`` layer B has to be ``DQPSK``. If layer B is
-``DQPSK`` and ``DTV_ISDBT_PARTIAL_RECEPTION``\ =0 layer has to be
-``DQPSK``.
-
-
-.. _DTV-ISDBT-LAYER-SEGMENT-COUNT:
-
-DTV_ISDBT_LAYER[A-C]_SEGMENT_COUNT
-----------------------------------
-
-Possible values: 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, -1 (AUTO)
-
-Note: Truth table for ``DTV_ISDBT_SOUND_BROADCASTING`` and
-``DTV_ISDBT_PARTIAL_RECEPTION`` and ``LAYER[A-C]_SEGMENT_COUNT``
-
-.. _isdbt-layer_seg-cnt-table:
-
-.. flat-table:: Truth table for ISDB-T Sound Broadcasting
-    :header-rows:  0
-    :stub-columns: 0
-
-
-    -  .. row 1
-
-       -  PR
-
-       -  SB
-
-       -  Layer A width
-
-       -  Layer B width
-
-       -  Layer C width
-
-       -  total width
-
-    -  .. row 2
-
-       -  0
-
-       -  0
-
-       -  1 .. 13
-
-       -  1 .. 13
-
-       -  1 .. 13
-
-       -  13
-
-    -  .. row 3
-
-       -  1
-
-       -  0
-
-       -  1
-
-       -  1 .. 13
-
-       -  1 .. 13
-
-       -  13
-
-    -  .. row 4
-
-       -  0
-
-       -  1
-
-       -  1
-
-       -  0
-
-       -  0
-
-       -  1
-
-    -  .. row 5
-
-       -  1
-
-       -  1
-
-       -  1
-
-       -  2
-
-       -  0
-
-       -  13
-
-
-
-.. _DTV-ISDBT-LAYER-TIME-INTERLEAVING:
-
-DTV_ISDBT_LAYER[A-C]_TIME_INTERLEAVING
---------------------------------------
-
-Valid values: 0, 1, 2, 4, -1 (AUTO)
-
-when DTV_ISDBT_SOUND_BROADCASTING is active, value 8 is also valid.
-
-Note: The real time interleaving length depends on the mode (fft-size).
-The values here are referring to what can be found in the
-TMCC-structure, as shown in the table below.
-
-
-.. c:type:: isdbt_layer_interleaving_table
-
-.. flat-table:: ISDB-T time interleaving modes
-    :header-rows:  0
-    :stub-columns: 0
-
-
-    -  .. row 1
-
-       -  ``DTV_ISDBT_LAYER[A-C]_TIME_INTERLEAVING``
-
-       -  Mode 1 (2K FFT)
-
-       -  Mode 2 (4K FFT)
-
-       -  Mode 3 (8K FFT)
-
-    -  .. row 2
-
-       -  0
-
-       -  0
-
-       -  0
-
-       -  0
-
-    -  .. row 3
-
-       -  1
-
-       -  4
-
-       -  2
-
-       -  1
-
-    -  .. row 4
-
-       -  2
-
-       -  8
-
-       -  4
-
-       -  2
-
-    -  .. row 5
-
-       -  4
-
-       -  16
-
-       -  8
-
-       -  4
-
-
-
-.. _DTV-ATSCMH-FIC-VER:
-
-DTV_ATSCMH_FIC_VER
-------------------
-
-Version number of the FIC (Fast Information Channel) signaling data.
-
-FIC is used for relaying information to allow rapid service acquisition
-by the receiver.
-
-Possible values: 0, 1, 2, 3, ..., 30, 31
-
-
-.. _DTV-ATSCMH-PARADE-ID:
-
-DTV_ATSCMH_PARADE_ID
---------------------
-
-Parade identification number
-
-A parade is a collection of up to eight MH groups, conveying one or two
-ensembles.
-
-Possible values: 0, 1, 2, 3, ..., 126, 127
-
-
-.. _DTV-ATSCMH-NOG:
-
-DTV_ATSCMH_NOG
---------------
-
-Number of MH groups per MH subframe for a designated parade.
-
-Possible values: 1, 2, 3, 4, 5, 6, 7, 8
-
-
-.. _DTV-ATSCMH-TNOG:
-
-DTV_ATSCMH_TNOG
----------------
-
-Total number of MH groups including all MH groups belonging to all MH
-parades in one MH subframe.
-
-Possible values: 0, 1, 2, 3, ..., 30, 31
-
-
-.. _DTV-ATSCMH-SGN:
-
-DTV_ATSCMH_SGN
---------------
-
-Start group number.
-
-Possible values: 0, 1, 2, 3, ..., 14, 15
-
-
-.. _DTV-ATSCMH-PRC:
-
-DTV_ATSCMH_PRC
---------------
-
-Parade repetition cycle.
-
-Possible values: 1, 2, 3, 4, 5, 6, 7, 8
-
-
-.. _DTV-ATSCMH-RS-FRAME-MODE:
-
-DTV_ATSCMH_RS_FRAME_MODE
-------------------------
-
-Reed Solomon (RS) frame mode.
-
-Possible values are:
-
-.. tabularcolumns:: |p{5.0cm}|p{12.5cm}|
-
-.. c:type:: atscmh_rs_frame_mode
-
-.. flat-table:: enum atscmh_rs_frame_mode
-    :header-rows:  1
-    :stub-columns: 0
-
-
-    -  .. row 1
-
-       -  ID
-
-       -  Description
-
-    -  .. row 2
-
-       -  .. _ATSCMH-RSFRAME-PRI-ONLY:
-
-	  ``ATSCMH_RSFRAME_PRI_ONLY``
-
-       -  Single Frame: There is only a primary RS Frame for all Group
-	  Regions.
-
-    -  .. row 3
-
-       -  .. _ATSCMH-RSFRAME-PRI-SEC:
-
-	  ``ATSCMH_RSFRAME_PRI_SEC``
-
-       -  Dual Frame: There are two separate RS Frames: Primary RS Frame for
-	  Group Region A and B and Secondary RS Frame for Group Region C and
-	  D.
-
-
-
-.. _DTV-ATSCMH-RS-FRAME-ENSEMBLE:
-
-DTV_ATSCMH_RS_FRAME_ENSEMBLE
-----------------------------
-
-Reed Solomon(RS) frame ensemble.
-
-Possible values are:
-
-
-.. c:type:: atscmh_rs_frame_ensemble
-
-.. flat-table:: enum atscmh_rs_frame_ensemble
-    :header-rows:  1
-    :stub-columns: 0
-
-
-    -  .. row 1
-
-       -  ID
-
-       -  Description
-
-    -  .. row 2
-
-       -  .. _ATSCMH-RSFRAME-ENS-PRI:
-
-	  ``ATSCMH_RSFRAME_ENS_PRI``
-
-       -  Primary Ensemble.
-
-    -  .. row 3
-
-       -  .. _ATSCMH-RSFRAME-ENS-SEC:
-
-	  ``AATSCMH_RSFRAME_PRI_SEC``
-
-       -  Secondary Ensemble.
-
-    -  .. row 4
-
-       -  .. _ATSCMH-RSFRAME-RES:
-
-	  ``AATSCMH_RSFRAME_RES``
-
-       -  Reserved. Shouldn't be used.
-
-
-
-.. _DTV-ATSCMH-RS-CODE-MODE-PRI:
-
-DTV_ATSCMH_RS_CODE_MODE_PRI
----------------------------
-
-Reed Solomon (RS) code mode (primary).
-
-Possible values are:
-
-
-.. c:type:: atscmh_rs_code_mode
-
-.. flat-table:: enum atscmh_rs_code_mode
-    :header-rows:  1
-    :stub-columns: 0
-
-
-    -  .. row 1
-
-       -  ID
-
-       -  Description
-
-    -  .. row 2
-
-       -  .. _ATSCMH-RSCODE-211-187:
-
-	  ``ATSCMH_RSCODE_211_187``
-
-       -  Reed Solomon code (211,187).
-
-    -  .. row 3
-
-       -  .. _ATSCMH-RSCODE-223-187:
-
-	  ``ATSCMH_RSCODE_223_187``
-
-       -  Reed Solomon code (223,187).
-
-    -  .. row 4
-
-       -  .. _ATSCMH-RSCODE-235-187:
-
-	  ``ATSCMH_RSCODE_235_187``
-
-       -  Reed Solomon code (235,187).
-
-    -  .. row 5
-
-       -  .. _ATSCMH-RSCODE-RES:
-
-	  ``ATSCMH_RSCODE_RES``
-
-       -  Reserved. Shouldn't be used.
-
-
-
-.. _DTV-ATSCMH-RS-CODE-MODE-SEC:
-
-DTV_ATSCMH_RS_CODE_MODE_SEC
----------------------------
-
-Reed Solomon (RS) code mode (secondary).
-
-Possible values are the same as documented on enum
-:c:type:`atscmh_rs_code_mode`:
-
-
-.. _DTV-ATSCMH-SCCC-BLOCK-MODE:
-
-DTV_ATSCMH_SCCC_BLOCK_MODE
---------------------------
-
-Series Concatenated Convolutional Code Block Mode.
-
-Possible values are:
-
-.. tabularcolumns:: |p{4.5cm}|p{13.0cm}|
-
-.. c:type:: atscmh_sccc_block_mode
-
-.. flat-table:: enum atscmh_scc_block_mode
-    :header-rows:  1
-    :stub-columns: 0
-
-
-    -  .. row 1
-
-       -  ID
-
-       -  Description
-
-    -  .. row 2
-
-       -  .. _ATSCMH-SCCC-BLK-SEP:
-
-	  ``ATSCMH_SCCC_BLK_SEP``
-
-       -  Separate SCCC: the SCCC outer code mode shall be set independently
-	  for each Group Region (A, B, C, D)
-
-    -  .. row 3
-
-       -  .. _ATSCMH-SCCC-BLK-COMB:
-
-	  ``ATSCMH_SCCC_BLK_COMB``
-
-       -  Combined SCCC: all four Regions shall have the same SCCC outer
-	  code mode.
+.. _DTV-ISDBT-LAYER-MODULATION:
 
-    -  .. row 4
+DTV_ISDBT_LAYER[A-C]_MODULATION
+-------------------------------
 
-       -  .. _ATSCMH-SCCC-BLK-RES:
+The modulation used by a given ISDB Layer, as defined by
+:c:type:`fe_modulation`.
 
-	  ``ATSCMH_SCCC_BLK_RES``
+Possible values are: ``QAM_AUTO``, ``QPSK``, ``QAM_16``, ``QAM_64``, ``DQPSK``
 
-       -  Reserved. Shouldn't be used.
+.. note::
 
+   #. If layer C is ``DQPSK``, then layer B has to be ``DQPSK``.
 
+   #. If layer B is ``DQPSK`` and ``DTV_ISDBT_PARTIAL_RECEPTION``\ = 0,
+      then layer has to be ``DQPSK``.
 
-.. _DTV-ATSCMH-SCCC-CODE-MODE-A:
 
-DTV_ATSCMH_SCCC_CODE_MODE_A
----------------------------
+.. _DTV-ISDBT-LAYER-SEGMENT-COUNT:
 
-Series Concatenated Convolutional Code Rate.
+DTV_ISDBT_LAYER[A-C]_SEGMENT_COUNT
+----------------------------------
 
-Possible values are:
+Possible values: 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, -1 (AUTO)
 
+Note: Truth table for ``DTV_ISDBT_SOUND_BROADCASTING`` and
+``DTV_ISDBT_PARTIAL_RECEPTION`` and ``LAYER[A-C]_SEGMENT_COUNT``
 
-.. c:type:: atscmh_sccc_code_mode
+.. _isdbt-layer_seg-cnt-table:
 
-.. flat-table:: enum atscmh_sccc_code_mode
+.. flat-table:: Truth table for ISDB-T Sound Broadcasting
     :header-rows:  1
     :stub-columns: 0
 
 
     -  .. row 1
 
-       -  ID
-
-       -  Description
-
-    -  .. row 2
-
-       -  .. _ATSCMH-SCCC-CODE-HLF:
-
-	  ``ATSCMH_SCCC_CODE_HLF``
-
-       -  The outer code rate of a SCCC Block is 1/2 rate.
-
-    -  .. row 3
-
-       -  .. _ATSCMH-SCCC-CODE-QTR:
+       -  Partial Reception
 
-	  ``ATSCMH_SCCC_CODE_QTR``
+       -  Sound Broadcasting
 
-       -  The outer code rate of a SCCC Block is 1/4 rate.
+       -  Layer A width
 
-    -  .. row 4
+       -  Layer B width
 
-       -  .. _ATSCMH-SCCC-CODE-RES:
+       -  Layer C width
 
-	  ``ATSCMH_SCCC_CODE_RES``
+       -  total width
 
-       -  to be documented.
+    -  .. row 2
 
+       -  0
 
+       -  0
 
-.. _DTV-ATSCMH-SCCC-CODE-MODE-B:
+       -  1 .. 13
 
-DTV_ATSCMH_SCCC_CODE_MODE_B
----------------------------
+       -  1 .. 13
 
-Series Concatenated Convolutional Code Rate.
+       -  1 .. 13
 
-Possible values are the same as documented on enum
-:c:type:`atscmh_sccc_code_mode`.
+       -  13
 
+    -  .. row 3
 
-.. _DTV-ATSCMH-SCCC-CODE-MODE-C:
+       -  1
 
-DTV_ATSCMH_SCCC_CODE_MODE_C
----------------------------
+       -  0
 
-Series Concatenated Convolutional Code Rate.
+       -  1
 
-Possible values are the same as documented on enum
-:c:type:`atscmh_sccc_code_mode`.
+       -  1 .. 13
 
+       -  1 .. 13
 
-.. _DTV-ATSCMH-SCCC-CODE-MODE-D:
+       -  13
 
-DTV_ATSCMH_SCCC_CODE_MODE_D
----------------------------
+    -  .. row 4
 
-Series Concatenated Convolutional Code Rate.
+       -  0
 
-Possible values are the same as documented on enum
-:c:type:`atscmh_sccc_code_mode`.
+       -  1
 
+       -  1
 
-.. _DTV-API-VERSION:
+       -  0
 
-DTV_API_VERSION
-===============
+       -  0
 
-Returns the major/minor version of the DVB API
+       -  1
 
+    -  .. row 5
 
-.. _DTV-CODE-RATE-HP:
+       -  1
 
-DTV_CODE_RATE_HP
-================
+       -  1
 
-Used on terrestrial transmissions. The acceptable values are the ones
-described at :c:type:`fe_transmit_mode`.
+       -  1
 
+       -  2
 
-.. _DTV-CODE-RATE-LP:
+       -  0
 
-DTV_CODE_RATE_LP
-================
+       -  13
 
-Used on terrestrial transmissions. The acceptable values are the ones
-described at :c:type:`fe_transmit_mode`.
 
 
-.. _DTV-GUARD-INTERVAL:
+.. _DTV-ISDBT-LAYER-TIME-INTERLEAVING:
 
-DTV_GUARD_INTERVAL
-==================
+DTV_ISDBT_LAYER[A-C]_TIME_INTERLEAVING
+--------------------------------------
 
-Possible values are:
+Valid values: 0, 1, 2, 4, -1 (AUTO)
 
+when DTV_ISDBT_SOUND_BROADCASTING is active, value 8 is also valid.
 
-.. c:type:: fe_guard_interval
+Note: The real time interleaving length depends on the mode (fft-size).
+The values here are referring to what can be found in the
+TMCC-structure, as shown in the table below.
 
-Modulation guard interval
--------------------------
 
+.. c:type:: isdbt_layer_interleaving_table
 
-.. flat-table:: enum fe_guard_interval
+.. flat-table:: ISDB-T time interleaving modes
     :header-rows:  1
     :stub-columns: 0
 
 
     -  .. row 1
 
-       -  ID
-
-       -  Description
-
-    -  .. row 2
-
-       -  .. _GUARD-INTERVAL-AUTO:
-
-	  ``GUARD_INTERVAL_AUTO``
+       -  ``DTV_ISDBT_LAYER[A-C]_TIME_INTERLEAVING``
 
-       -  Autodetect the guard interval
+       -  Mode 1 (2K FFT)
 
-    -  .. row 3
+       -  Mode 2 (4K FFT)
 
-       -  .. _GUARD-INTERVAL-1-128:
+       -  Mode 3 (8K FFT)
 
-	  ``GUARD_INTERVAL_1_128``
+    -  .. row 2
 
-       -  Guard interval 1/128
+       -  0
 
-    -  .. row 4
+       -  0
 
-       -  .. _GUARD-INTERVAL-1-32:
+       -  0
 
-	  ``GUARD_INTERVAL_1_32``
+       -  0
 
-       -  Guard interval 1/32
+    -  .. row 3
 
-    -  .. row 5
+       -  1
 
-       -  .. _GUARD-INTERVAL-1-16:
+       -  4
 
-	  ``GUARD_INTERVAL_1_16``
+       -  2
 
-       -  Guard interval 1/16
+       -  1
 
-    -  .. row 6
+    -  .. row 4
 
-       -  .. _GUARD-INTERVAL-1-8:
+       -  2
 
-	  ``GUARD_INTERVAL_1_8``
+       -  8
 
-       -  Guard interval 1/8
+       -  4
 
-    -  .. row 7
+       -  2
 
-       -  .. _GUARD-INTERVAL-1-4:
+    -  .. row 5
 
-	  ``GUARD_INTERVAL_1_4``
+       -  4
 
-       -  Guard interval 1/4
+       -  16
 
-    -  .. row 8
+       -  8
 
-       -  .. _GUARD-INTERVAL-19-128:
+       -  4
 
-	  ``GUARD_INTERVAL_19_128``
 
-       -  Guard interval 19/128
 
-    -  .. row 9
+.. _DTV-ATSCMH-FIC-VER:
 
-       -  .. _GUARD-INTERVAL-19-256:
+DTV_ATSCMH_FIC_VER
+------------------
 
-	  ``GUARD_INTERVAL_19_256``
+Version number of the FIC (Fast Information Channel) signaling data.
 
-       -  Guard interval 19/256
+FIC is used for relaying information to allow rapid service acquisition
+by the receiver.
 
-    -  .. row 10
+Possible values: 0, 1, 2, 3, ..., 30, 31
 
-       -  .. _GUARD-INTERVAL-PN420:
 
-	  ``GUARD_INTERVAL_PN420``
+.. _DTV-ATSCMH-PARADE-ID:
 
-       -  PN length 420 (1/4)
+DTV_ATSCMH_PARADE_ID
+--------------------
 
-    -  .. row 11
+Parade identification number
 
-       -  .. _GUARD-INTERVAL-PN595:
+A parade is a collection of up to eight MH groups, conveying one or two
+ensembles.
 
-	  ``GUARD_INTERVAL_PN595``
+Possible values: 0, 1, 2, 3, ..., 126, 127
 
-       -  PN length 595 (1/6)
 
-    -  .. row 12
+.. _DTV-ATSCMH-NOG:
 
-       -  .. _GUARD-INTERVAL-PN945:
+DTV_ATSCMH_NOG
+--------------
 
-	  ``GUARD_INTERVAL_PN945``
+Number of MH groups per MH subframe for a designated parade.
 
-       -  PN length 945 (1/9)
+Possible values: 1, 2, 3, 4, 5, 6, 7, 8
 
 
-Notes:
+.. _DTV-ATSCMH-TNOG:
 
-1) If ``DTV_GUARD_INTERVAL`` is set the ``GUARD_INTERVAL_AUTO`` the
-hardware will try to find the correct guard interval (if capable) and
-will use TMCC to fill in the missing parameters.
+DTV_ATSCMH_TNOG
+---------------
 
-2) Intervals 1/128, 19/128 and 19/256 are used only for DVB-T2 at
-present
+Total number of MH groups including all MH groups belonging to all MH
+parades in one MH subframe.
 
-3) DTMB specifies PN420, PN595 and PN945.
+Possible values: 0, 1, 2, 3, ..., 30, 31
 
 
-.. _DTV-TRANSMISSION-MODE:
+.. _DTV-ATSCMH-SGN:
 
-DTV_TRANSMISSION_MODE
-=====================
+DTV_ATSCMH_SGN
+--------------
 
-Specifies the number of carriers used by the standard. This is used only
-on OFTM-based standards, e. g. DVB-T/T2, ISDB-T, DTMB
+Start group number.
 
+Possible values: 0, 1, 2, 3, ..., 14, 15
 
-.. c:type:: fe_transmit_mode
 
-enum fe_transmit_mode: Number of carriers per channel
------------------------------------------------------
+.. _DTV-ATSCMH-PRC:
 
-.. tabularcolumns:: |p{5.0cm}|p{12.5cm}|
+DTV_ATSCMH_PRC
+--------------
 
-.. flat-table:: enum fe_transmit_mode
-    :header-rows:  1
-    :stub-columns: 0
+Parade repetition cycle.
 
+Possible values: 1, 2, 3, 4, 5, 6, 7, 8
 
-    -  .. row 1
 
-       -  ID
+.. _DTV-ATSCMH-RS-FRAME-MODE:
 
-       -  Description
+DTV_ATSCMH_RS_FRAME_MODE
+------------------------
 
-    -  .. row 2
+Reed Solomon (RS) frame mode.
 
-       -  .. _TRANSMISSION-MODE-AUTO:
+The acceptable values are defined by :c:type:`atscmh_rs_frame_mode`.
 
-	  ``TRANSMISSION_MODE_AUTO``
 
-       -  Autodetect transmission mode. The hardware will try to find the
-	  correct FFT-size (if capable) to fill in the missing parameters.
+.. _DTV-ATSCMH-RS-FRAME-ENSEMBLE:
 
-    -  .. row 3
+DTV_ATSCMH_RS_FRAME_ENSEMBLE
+----------------------------
 
-       -  .. _TRANSMISSION-MODE-1K:
+Reed Solomon(RS) frame ensemble.
 
-	  ``TRANSMISSION_MODE_1K``
+The acceptable values are defined by :c:type:`atscmh_rs_frame_ensemble`.
 
-       -  Transmission mode 1K
 
-    -  .. row 4
+.. _DTV-ATSCMH-RS-CODE-MODE-PRI:
 
-       -  .. _TRANSMISSION-MODE-2K:
+DTV_ATSCMH_RS_CODE_MODE_PRI
+---------------------------
 
-	  ``TRANSMISSION_MODE_2K``
+Reed Solomon (RS) code mode (primary).
 
-       -  Transmission mode 2K
+The acceptable values are defined by :c:type:`atscmh_rs_code_mode`.
 
-    -  .. row 5
 
-       -  .. _TRANSMISSION-MODE-8K:
+.. _DTV-ATSCMH-RS-CODE-MODE-SEC:
 
-	  ``TRANSMISSION_MODE_8K``
+DTV_ATSCMH_RS_CODE_MODE_SEC
+---------------------------
 
-       -  Transmission mode 8K
+Reed Solomon (RS) code mode (secondary).
 
-    -  .. row 6
+The acceptable values are defined by :c:type:`atscmh_rs_code_mode`.
 
-       -  .. _TRANSMISSION-MODE-4K:
 
-	  ``TRANSMISSION_MODE_4K``
+.. _DTV-ATSCMH-SCCC-BLOCK-MODE:
 
-       -  Transmission mode 4K
+DTV_ATSCMH_SCCC_BLOCK_MODE
+--------------------------
 
-    -  .. row 7
+Series Concatenated Convolutional Code Block Mode.
 
-       -  .. _TRANSMISSION-MODE-16K:
+The acceptable values are defined by :c:type:`atscmh_sccc_block_mode`.
 
-	  ``TRANSMISSION_MODE_16K``
 
-       -  Transmission mode 16K
+.. _DTV-ATSCMH-SCCC-CODE-MODE-A:
 
-    -  .. row 8
+DTV_ATSCMH_SCCC_CODE_MODE_A
+---------------------------
 
-       -  .. _TRANSMISSION-MODE-32K:
+Series Concatenated Convolutional Code Rate.
 
-	  ``TRANSMISSION_MODE_32K``
+The acceptable values are defined by :c:type:`atscmh_sccc_code_mode`.
 
-       -  Transmission mode 32K
+.. _DTV-ATSCMH-SCCC-CODE-MODE-B:
 
-    -  .. row 9
+DTV_ATSCMH_SCCC_CODE_MODE_B
+---------------------------
 
-       -  .. _TRANSMISSION-MODE-C1:
+Series Concatenated Convolutional Code Rate.
 
-	  ``TRANSMISSION_MODE_C1``
+Possible values are the same as documented on enum
+:c:type:`atscmh_sccc_code_mode`.
 
-       -  Single Carrier (C=1) transmission mode (DTMB)
 
-    -  .. row 10
+.. _DTV-ATSCMH-SCCC-CODE-MODE-C:
 
-       -  .. _TRANSMISSION-MODE-C3780:
+DTV_ATSCMH_SCCC_CODE_MODE_C
+---------------------------
 
-	  ``TRANSMISSION_MODE_C3780``
+Series Concatenated Convolutional Code Rate.
 
-       -  Multi Carrier (C=3780) transmission mode (DTMB)
+Possible values are the same as documented on enum
+:c:type:`atscmh_sccc_code_mode`.
 
 
-Notes:
+.. _DTV-ATSCMH-SCCC-CODE-MODE-D:
 
-1) ISDB-T supports three carrier/symbol-size: 8K, 4K, 2K. It is called
-'mode' in the standard: Mode 1 is 2K, mode 2 is 4K, mode 3 is 8K
+DTV_ATSCMH_SCCC_CODE_MODE_D
+---------------------------
 
-2) If ``DTV_TRANSMISSION_MODE`` is set the ``TRANSMISSION_MODE_AUTO``
-the hardware will try to find the correct FFT-size (if capable) and will
-use TMCC to fill in the missing parameters.
+Series Concatenated Convolutional Code Rate.
 
-3) DVB-T specifies 2K and 8K as valid sizes.
+Possible values are the same as documented on enum
+:c:type:`atscmh_sccc_code_mode`.
 
-4) DVB-T2 specifies 1K, 2K, 4K, 8K, 16K and 32K.
 
-5) DTMB specifies C1 and C3780.
+.. _DTV-API-VERSION:
 
+DTV_API_VERSION
+===============
 
-.. _DTV-HIERARCHY:
+Returns the major/minor version of the DVB API
 
-DTV_HIERARCHY
-=============
 
-Frontend hierarchy
+.. _DTV-CODE-RATE-HP:
 
+DTV_CODE_RATE_HP
+================
 
-.. c:type:: fe_hierarchy
+Used on terrestrial transmissions.
 
-Frontend hierarchy
-------------------
+The acceptable values are defined by :c:type:`fe_transmit_mode`.
 
 
-.. flat-table:: enum fe_hierarchy
-    :header-rows:  1
-    :stub-columns: 0
+.. _DTV-CODE-RATE-LP:
 
+DTV_CODE_RATE_LP
+================
 
-    -  .. row 1
+Used on terrestrial transmissions.
 
-       -  ID
+The acceptable values are defined by :c:type:`fe_transmit_mode`.
 
-       -  Description
 
-    -  .. row 2
+.. _DTV-GUARD-INTERVAL:
 
-       -  .. _HIERARCHY-NONE:
+DTV_GUARD_INTERVAL
+==================
 
-	  ``HIERARCHY_NONE``
+The acceptable values are defined by :c:type:`fe_guard_interval`.
 
-       -  No hierarchy
+.. note::
 
-    -  .. row 3
+   #. If ``DTV_GUARD_INTERVAL`` is set the ``GUARD_INTERVAL_AUTO`` the
+      hardware will try to find the correct guard interval (if capable) and
+      will use TMCC to fill in the missing parameters.
+   #. Intervals ``GUARD_INTERVAL_1_128``, ``GUARD_INTERVAL_19_128``
+      and ``GUARD_INTERVAL_19_256`` are used only for DVB-T2 at
+      present.
+   #. Intervals ``GUARD_INTERVAL_PN420``, ``GUARD_INTERVAL_PN595`` and
+      ``GUARD_INTERVAL_PN945`` are used only for DMTB at the present.
+      On such standard, only those intervals and ``GUARD_INTERVAL_AUTO``
+      are valid.
 
-       -  .. _HIERARCHY-AUTO:
+.. _DTV-TRANSMISSION-MODE:
 
-	  ``HIERARCHY_AUTO``
+DTV_TRANSMISSION_MODE
+=====================
 
-       -  Autodetect hierarchy (if supported)
+Specifies the FFT size (with corresponds to the approximate number of
+carriers) used by the standard. This is used only on OFTM-based standards,
+e. g. DVB-T/T2, ISDB-T, DTMB.
 
-    -  .. row 4
+The acceptable values are defined by :c:type:`fe_transmit_mode`.
 
-       -  .. _HIERARCHY-1:
+.. note::
 
-	  ``HIERARCHY_1``
+   #. ISDB-T supports three carrier/symbol-size: 8K, 4K, 2K. It is called
+      **mode** on such standard, and are numbered from 1 to 3:
 
-       -  Hierarchy 1
+      ====	========	========================
+      Mode	FFT size	Transmission mode
+      ====	========	========================
+      1		2K		``TRANSMISSION_MODE_2K``
+      2		4K		``TRANSMISSION_MODE_4K``
+      3		8K		``TRANSMISSION_MODE_8K``
+      ====	========	========================
 
-    -  .. row 5
+   #. If ``DTV_TRANSMISSION_MODE`` is set the ``TRANSMISSION_MODE_AUTO``
+      the hardware will try to find the correct FFT-size (if capable) and
+      will use TMCC to fill in the missing parameters.
 
-       -  .. _HIERARCHY-2:
+   #. DVB-T specifies 2K and 8K as valid sizes.
 
-	  ``HIERARCHY_2``
+   #. DVB-T2 specifies 1K, 2K, 4K, 8K, 16K and 32K.
 
-       -  Hierarchy 2
+   #. DTMB specifies C1 and C3780.
 
-    -  .. row 6
 
-       -  .. _HIERARCHY-4:
+.. _DTV-HIERARCHY:
 
-	  ``HIERARCHY_4``
+DTV_HIERARCHY
+=============
 
-       -  Hierarchy 4
+Frontend hierarchy.
 
+The acceptable values are defined by :c:type:`fe_hierarchy`.
 
 
 .. _DTV-STREAM-ID:
@@ -1884,60 +880,17 @@ with it, rather than trying to use FE_GET_INFO. In the case of a
 legacy frontend, the result is just the same as with FE_GET_INFO, but
 in a more structured format
 
+The acceptable values are defined by :c:type:`fe_delivery_system`.
+
 
 .. _DTV-INTERLEAVING:
 
 DTV_INTERLEAVING
 ================
 
-Time interleaving to be used. Currently, used only on DTMB.
-
-
-.. c:type:: fe_interleaving
-
-.. flat-table:: enum fe_interleaving
-    :header-rows:  1
-    :stub-columns: 0
-
-
-    -  .. row 1
-
-       -  ID
-
-       -  Description
-
-    -  .. row 2
-
-       -  .. _INTERLEAVING-NONE:
-
-	  ``INTERLEAVING_NONE``
-
-       -  No interleaving.
-
-    -  .. row 3
-
-       -  .. _INTERLEAVING-AUTO:
-
-	  ``INTERLEAVING_AUTO``
-
-       -  Auto-detect interleaving.
-
-    -  .. row 4
-
-       -  .. _INTERLEAVING-240:
-
-	  ``INTERLEAVING_240``
-
-       -  Interleaving of 240 symbols.
-
-    -  .. row 5
-
-       -  .. _INTERLEAVING-720:
-
-	  ``INTERLEAVING_720``
-
-       -  Interleaving of 720 symbols.
+Time interleaving to be used.
 
+The acceptable values are defined by :c:type:`fe_interleaving`.
 
 
 .. _DTV-LNA:
diff --git a/Documentation/media/uapi/dvb/frontend-header.rst b/Documentation/media/uapi/dvb/frontend-header.rst
new file mode 100644
index 000000000000..8d8433cf1e12
--- /dev/null
+++ b/Documentation/media/uapi/dvb/frontend-header.rst
@@ -0,0 +1,4 @@
+Frontend uAPI data types
+========================
+
+.. kernel-doc:: include/uapi/linux/dvb/frontend.h
diff --git a/include/uapi/linux/dvb/frontend.h b/include/uapi/linux/dvb/frontend.h
index 16a318fc469a..e7c29d0bdee4 100644
--- a/include/uapi/linux/dvb/frontend.h
+++ b/include/uapi/linux/dvb/frontend.h
@@ -562,10 +562,10 @@ enum fe_pilot {
 };
 
 /**
- * enum fe_rolloff - Rolloff factor (also known as alpha)
- * @ROLLOFF_35:		Roloff factor: 35%
- * @ROLLOFF_20:		Roloff factor: 20%
- * @ROLLOFF_25:		Roloff factor: 25%
+ * enum fe_rolloff - Rolloff factor
+ * @ROLLOFF_35:		Roloff factor: α=35%
+ * @ROLLOFF_20:		Roloff factor: α=20%
+ * @ROLLOFF_25:		Roloff factor: α=25%
  * @ROLLOFF_AUTO:	Auto-detect the roloff factor.
  *
  * .. note:
-- 
cgit v1.2.3-71-gd317


From 791edca5685b26d4575e59f5420ba3e206f5cebb Mon Sep 17 00:00:00 2001
From: Mauro Carvalho Chehab <mchehab@s-opensource.com>
Date: Thu, 31 Aug 2017 12:52:45 -0400
Subject: media: dmx.h: get rid of unused DMX_KERNEL_CLIENT

There's a flag defined for Digital TV demux that is not used
anywhere, called DMX_KERNEL_CLIENT. Get rid of it.

Signed-off-by: Mauro Carvalho Chehab <mchehab@s-opensource.com>
---
 Documentation/media/dmx.h.rst.exceptions   | 1 -
 Documentation/media/uapi/dvb/dmx_types.rst | 1 -
 include/uapi/linux/dvb/dmx.h               | 1 -
 3 files changed, 3 deletions(-)

(limited to 'include/uapi/linux')

diff --git a/Documentation/media/dmx.h.rst.exceptions b/Documentation/media/dmx.h.rst.exceptions
index 2fdb458564ba..933ca5a61ce1 100644
--- a/Documentation/media/dmx.h.rst.exceptions
+++ b/Documentation/media/dmx.h.rst.exceptions
@@ -56,7 +56,6 @@ replace symbol DMX_SOURCE_DVR3 :c:type:`dmx_source`
 replace define DMX_CHECK_CRC :c:type:`dmx_sct_filter_params`
 replace define DMX_ONESHOT :c:type:`dmx_sct_filter_params`
 replace define DMX_IMMEDIATE_START :c:type:`dmx_sct_filter_params`
-replace define DMX_KERNEL_CLIENT :c:type:`dmx_sct_filter_params`
 
 # some typedefs should point to struct/enums
 replace typedef dmx_caps_t :c:type:`dmx_caps`
diff --git a/Documentation/media/uapi/dvb/dmx_types.rst b/Documentation/media/uapi/dvb/dmx_types.rst
index 80dd659860d7..0f0113205c94 100644
--- a/Documentation/media/uapi/dvb/dmx_types.rst
+++ b/Documentation/media/uapi/dvb/dmx_types.rst
@@ -147,7 +147,6 @@ struct dmx_sct_filter_params
     #define DMX_CHECK_CRC       1
     #define DMX_ONESHOT         2
     #define DMX_IMMEDIATE_START 4
-    #define DMX_KERNEL_CLIENT   0x8000
     };
 
 
diff --git a/include/uapi/linux/dvb/dmx.h b/include/uapi/linux/dvb/dmx.h
index 1bc4d6fb0f01..1702f923d425 100644
--- a/include/uapi/linux/dvb/dmx.h
+++ b/include/uapi/linux/dvb/dmx.h
@@ -103,7 +103,6 @@ struct dmx_sct_filter_params
 #define DMX_CHECK_CRC       1
 #define DMX_ONESHOT         2
 #define DMX_IMMEDIATE_START 4
-#define DMX_KERNEL_CLIENT   0x8000
 };
 
 
-- 
cgit v1.2.3-71-gd317


From 286fe1ca3fa1b6fcc7ce8695b7c8d681e6e1c3b7 Mon Sep 17 00:00:00 2001
From: Mauro Carvalho Chehab <mchehab@s-opensource.com>
Date: Thu, 31 Aug 2017 14:11:34 -0400
Subject: media: dmx.h: get rid of DMX_GET_CAPS

There's no driver currently using it; it is also not
documented about what it would be supposed to do.

So, get rid of it.

Signed-off-by: Mauro Carvalho Chehab <mchehab@s-opensource.com>
---
 Documentation/media/dmx.h.rst.exceptions      |  1 -
 Documentation/media/uapi/dvb/dmx-get-caps.rst | 41 ---------------------------
 Documentation/media/uapi/dvb/dmx_fcalls.rst   |  1 -
 Documentation/media/uapi/dvb/dmx_types.rst    | 12 --------
 include/uapi/linux/dvb/dmx.h                  |  7 -----
 5 files changed, 62 deletions(-)
 delete mode 100644 Documentation/media/uapi/dvb/dmx-get-caps.rst

(limited to 'include/uapi/linux')

diff --git a/Documentation/media/dmx.h.rst.exceptions b/Documentation/media/dmx.h.rst.exceptions
index 933ca5a61ce1..5572d2dc9d0e 100644
--- a/Documentation/media/dmx.h.rst.exceptions
+++ b/Documentation/media/dmx.h.rst.exceptions
@@ -58,7 +58,6 @@ replace define DMX_ONESHOT :c:type:`dmx_sct_filter_params`
 replace define DMX_IMMEDIATE_START :c:type:`dmx_sct_filter_params`
 
 # some typedefs should point to struct/enums
-replace typedef dmx_caps_t :c:type:`dmx_caps`
 replace typedef dmx_filter_t :c:type:`dmx_filter`
 replace typedef dmx_pes_type_t :c:type:`dmx_pes_type`
 replace typedef dmx_input_t :c:type:`dmx_input`
diff --git a/Documentation/media/uapi/dvb/dmx-get-caps.rst b/Documentation/media/uapi/dvb/dmx-get-caps.rst
deleted file mode 100644
index 145fb520d779..000000000000
--- a/Documentation/media/uapi/dvb/dmx-get-caps.rst
+++ /dev/null
@@ -1,41 +0,0 @@
-.. -*- coding: utf-8; mode: rst -*-
-
-.. _DMX_GET_CAPS:
-
-============
-DMX_GET_CAPS
-============
-
-Name
-----
-
-DMX_GET_CAPS
-
-
-Synopsis
---------
-
-.. c:function:: int ioctl(fd, DMX_GET_CAPS, struct dmx_caps *caps)
-    :name: DMX_GET_CAPS
-
-Arguments
----------
-
-``fd``
-    File descriptor returned by :c:func:`open() <dvb-dmx-open>`.
-
-``caps``
-    Pointer to struct :c:type:`dmx_caps`
-
-
-Description
------------
-
-.. note:: This ioctl is undocumented. Documentation is welcome.
-
-Return Value
-------------
-
-On success 0 is returned, on error -1 and the ``errno`` variable is set
-appropriately. The generic error codes are described at the
-:ref:`Generic Error Codes <gen-errors>` chapter.
diff --git a/Documentation/media/uapi/dvb/dmx_fcalls.rst b/Documentation/media/uapi/dvb/dmx_fcalls.rst
index 77a1554d9834..49e013d4540f 100644
--- a/Documentation/media/uapi/dvb/dmx_fcalls.rst
+++ b/Documentation/media/uapi/dvb/dmx_fcalls.rst
@@ -21,7 +21,6 @@ Demux Function Calls
     dmx-get-event
     dmx-get-stc
     dmx-get-pes-pids
-    dmx-get-caps
     dmx-set-source
     dmx-add-pid
     dmx-remove-pid
diff --git a/Documentation/media/uapi/dvb/dmx_types.rst b/Documentation/media/uapi/dvb/dmx_types.rst
index 0f0113205c94..9e907b85cf16 100644
--- a/Documentation/media/uapi/dvb/dmx_types.rst
+++ b/Documentation/media/uapi/dvb/dmx_types.rst
@@ -199,18 +199,6 @@ struct dmx_stc
     };
 
 
-struct dmx_caps
-===============
-
-.. c:type:: dmx_caps
-
-.. code-block:: c
-
-     typedef struct dmx_caps {
-	__u32 caps;
-	int num_decoders;
-    } dmx_caps_t;
-
 
 enum dmx_source
 ===============
diff --git a/include/uapi/linux/dvb/dmx.h b/include/uapi/linux/dvb/dmx.h
index 1702f923d425..db8bd00c93de 100644
--- a/include/uapi/linux/dvb/dmx.h
+++ b/include/uapi/linux/dvb/dmx.h
@@ -115,11 +115,6 @@ struct dmx_pes_filter_params
 	__u32           flags;
 };
 
-struct dmx_caps {
-	__u32 caps;
-	int num_decoders;
-};
-
 enum dmx_source {
 	DMX_SOURCE_FRONT0 = 0,
 	DMX_SOURCE_FRONT1,
@@ -143,7 +138,6 @@ struct dmx_stc {
 #define DMX_SET_PES_FILTER       _IOW('o', 44, struct dmx_pes_filter_params)
 #define DMX_SET_BUFFER_SIZE      _IO('o', 45)
 #define DMX_GET_PES_PIDS         _IOR('o', 47, __u16[5])
-#define DMX_GET_CAPS             _IOR('o', 48, struct dmx_caps)
 #define DMX_SET_SOURCE           _IOW('o', 49, enum dmx_source)
 #define DMX_GET_STC              _IOWR('o', 50, struct dmx_stc)
 #define DMX_ADD_PID              _IOW('o', 51, __u16)
@@ -156,7 +150,6 @@ typedef enum dmx_output dmx_output_t;
 typedef enum dmx_input dmx_input_t;
 typedef enum dmx_ts_pes dmx_pes_type_t;
 typedef struct dmx_filter dmx_filter_t;
-typedef struct dmx_caps dmx_caps_t;
 typedef enum dmx_source  dmx_source_t;
 
 #endif
-- 
cgit v1.2.3-71-gd317


From 13adefbe9e566c6db91579e4ce17f1e5193d6f2c Mon Sep 17 00:00:00 2001
From: Mauro Carvalho Chehab <mchehab@s-opensource.com>
Date: Thu, 31 Aug 2017 14:21:43 -0400
Subject: media: dmx.h: get rid of DMX_SET_SOURCE

No driver uses this ioctl, nor it is documented anywhere.

So, get rid of it.

Signed-off-by: Mauro Carvalho Chehab <mchehab@s-opensource.com>
---
 Documentation/media/dmx.h.rst.exceptions        | 13 --------
 Documentation/media/uapi/dvb/dmx-set-source.rst | 44 -------------------------
 Documentation/media/uapi/dvb/dmx_fcalls.rst     |  1 -
 Documentation/media/uapi/dvb/dmx_types.rst      | 20 -----------
 include/uapi/linux/dvb/dmx.h                    | 12 -------
 5 files changed, 90 deletions(-)
 delete mode 100644 Documentation/media/uapi/dvb/dmx-set-source.rst

(limited to 'include/uapi/linux')

diff --git a/Documentation/media/dmx.h.rst.exceptions b/Documentation/media/dmx.h.rst.exceptions
index 5572d2dc9d0e..d2dac35bb84b 100644
--- a/Documentation/media/dmx.h.rst.exceptions
+++ b/Documentation/media/dmx.h.rst.exceptions
@@ -40,18 +40,6 @@ replace enum dmx_input :c:type:`dmx_input`
 replace symbol DMX_IN_FRONTEND :c:type:`dmx_input`
 replace symbol DMX_IN_DVR :c:type:`dmx_input`
 
-# dmx_source_t symbols
-replace enum dmx_source :c:type:`dmx_source`
-replace symbol DMX_SOURCE_FRONT0 :c:type:`dmx_source`
-replace symbol DMX_SOURCE_FRONT1 :c:type:`dmx_source`
-replace symbol DMX_SOURCE_FRONT2 :c:type:`dmx_source`
-replace symbol DMX_SOURCE_FRONT3 :c:type:`dmx_source`
-replace symbol DMX_SOURCE_DVR0 :c:type:`dmx_source`
-replace symbol DMX_SOURCE_DVR1 :c:type:`dmx_source`
-replace symbol DMX_SOURCE_DVR2 :c:type:`dmx_source`
-replace symbol DMX_SOURCE_DVR3 :c:type:`dmx_source`
-
-
 # Flags for struct dmx_sct_filter_params
 replace define DMX_CHECK_CRC :c:type:`dmx_sct_filter_params`
 replace define DMX_ONESHOT :c:type:`dmx_sct_filter_params`
@@ -61,4 +49,3 @@ replace define DMX_IMMEDIATE_START :c:type:`dmx_sct_filter_params`
 replace typedef dmx_filter_t :c:type:`dmx_filter`
 replace typedef dmx_pes_type_t :c:type:`dmx_pes_type`
 replace typedef dmx_input_t :c:type:`dmx_input`
-replace typedef dmx_source_t :c:type:`dmx_source`
diff --git a/Documentation/media/uapi/dvb/dmx-set-source.rst b/Documentation/media/uapi/dvb/dmx-set-source.rst
deleted file mode 100644
index ac7f77b25e06..000000000000
--- a/Documentation/media/uapi/dvb/dmx-set-source.rst
+++ /dev/null
@@ -1,44 +0,0 @@
-.. -*- coding: utf-8; mode: rst -*-
-
-.. _DMX_SET_SOURCE:
-
-==============
-DMX_SET_SOURCE
-==============
-
-Name
-----
-
-DMX_SET_SOURCE
-
-
-Synopsis
---------
-
-.. c:function:: int ioctl(fd, DMX_SET_SOURCE, struct dmx_source *src)
-    :name: DMX_SET_SOURCE
-
-
-Arguments
----------
-
-
-``fd``
-    File descriptor returned by :c:func:`open() <dvb-dmx-open>`.
-
-``src``
-   Undocumented.
-
-
-Description
------------
-
-.. note:: This ioctl is undocumented. Documentation is welcome.
-
-
-Return Value
-------------
-
-On success 0 is returned, on error -1 and the ``errno`` variable is set
-appropriately. The generic error codes are described at the
-:ref:`Generic Error Codes <gen-errors>` chapter.
diff --git a/Documentation/media/uapi/dvb/dmx_fcalls.rst b/Documentation/media/uapi/dvb/dmx_fcalls.rst
index 49e013d4540f..be98d60877f2 100644
--- a/Documentation/media/uapi/dvb/dmx_fcalls.rst
+++ b/Documentation/media/uapi/dvb/dmx_fcalls.rst
@@ -21,6 +21,5 @@ Demux Function Calls
     dmx-get-event
     dmx-get-stc
     dmx-get-pes-pids
-    dmx-set-source
     dmx-add-pid
     dmx-remove-pid
diff --git a/Documentation/media/uapi/dvb/dmx_types.rst b/Documentation/media/uapi/dvb/dmx_types.rst
index 9e907b85cf16..a205c02ccdc1 100644
--- a/Documentation/media/uapi/dvb/dmx_types.rst
+++ b/Documentation/media/uapi/dvb/dmx_types.rst
@@ -197,23 +197,3 @@ struct dmx_stc
 	unsigned int base;  /* output: divisor for stc to get 90 kHz clock */
 	__u64 stc;      /* output: stc in 'base'*90 kHz units */
     };
-
-
-
-enum dmx_source
-===============
-
-.. c:type:: dmx_source
-
-.. code-block:: c
-
-    typedef enum dmx_source {
-	DMX_SOURCE_FRONT0 = 0,
-	DMX_SOURCE_FRONT1,
-	DMX_SOURCE_FRONT2,
-	DMX_SOURCE_FRONT3,
-	DMX_SOURCE_DVR0   = 16,
-	DMX_SOURCE_DVR1,
-	DMX_SOURCE_DVR2,
-	DMX_SOURCE_DVR3
-    } dmx_source_t;
diff --git a/include/uapi/linux/dvb/dmx.h b/include/uapi/linux/dvb/dmx.h
index db8bd00c93de..08dc17060321 100644
--- a/include/uapi/linux/dvb/dmx.h
+++ b/include/uapi/linux/dvb/dmx.h
@@ -115,16 +115,6 @@ struct dmx_pes_filter_params
 	__u32           flags;
 };
 
-enum dmx_source {
-	DMX_SOURCE_FRONT0 = 0,
-	DMX_SOURCE_FRONT1,
-	DMX_SOURCE_FRONT2,
-	DMX_SOURCE_FRONT3,
-	DMX_SOURCE_DVR0   = 16,
-	DMX_SOURCE_DVR1,
-	DMX_SOURCE_DVR2,
-	DMX_SOURCE_DVR3
-};
 
 struct dmx_stc {
 	unsigned int num;	/* input : which STC? 0..N */
@@ -138,7 +128,6 @@ struct dmx_stc {
 #define DMX_SET_PES_FILTER       _IOW('o', 44, struct dmx_pes_filter_params)
 #define DMX_SET_BUFFER_SIZE      _IO('o', 45)
 #define DMX_GET_PES_PIDS         _IOR('o', 47, __u16[5])
-#define DMX_SET_SOURCE           _IOW('o', 49, enum dmx_source)
 #define DMX_GET_STC              _IOWR('o', 50, struct dmx_stc)
 #define DMX_ADD_PID              _IOW('o', 51, __u16)
 #define DMX_REMOVE_PID           _IOW('o', 52, __u16)
@@ -150,7 +139,6 @@ typedef enum dmx_output dmx_output_t;
 typedef enum dmx_input dmx_input_t;
 typedef enum dmx_ts_pes dmx_pes_type_t;
 typedef struct dmx_filter dmx_filter_t;
-typedef enum dmx_source  dmx_source_t;
 
 #endif
 
-- 
cgit v1.2.3-71-gd317


From bb98e6d280e00a1180f47d3391ee0bd1f312b5f6 Mon Sep 17 00:00:00 2001
From: Mauro Carvalho Chehab <mchehab@s-opensource.com>
Date: Thu, 31 Aug 2017 12:28:52 -0400
Subject: media: dmx.h: add kernel-doc markups and use it at Documentation/

The demux documentation is pretty poor nowadays: most of the
structs and enums aren't documented at all.

Add proper kernel-doc markups for them and use it.

Now, the demux API data structures are fully documented :-)

Signed-off-by: Mauro Carvalho Chehab <mchehab@s-opensource.com>
---
 Documentation/media/dmx.h.rst.exceptions   |   5 +
 Documentation/media/uapi/dvb/dmx_types.rst | 173 +----------------------------
 include/uapi/linux/dvb/dmx.h               | 139 ++++++++++++++++++-----
 3 files changed, 120 insertions(+), 197 deletions(-)

(limited to 'include/uapi/linux')

diff --git a/Documentation/media/dmx.h.rst.exceptions b/Documentation/media/dmx.h.rst.exceptions
index d2dac35bb84b..629db384104a 100644
--- a/Documentation/media/dmx.h.rst.exceptions
+++ b/Documentation/media/dmx.h.rst.exceptions
@@ -49,3 +49,8 @@ replace define DMX_IMMEDIATE_START :c:type:`dmx_sct_filter_params`
 replace typedef dmx_filter_t :c:type:`dmx_filter`
 replace typedef dmx_pes_type_t :c:type:`dmx_pes_type`
 replace typedef dmx_input_t :c:type:`dmx_input`
+
+ignore symbol DMX_OUT_DECODER
+ignore symbol DMX_OUT_TAP
+ignore symbol DMX_OUT_TS_TAP
+ignore symbol DMX_OUT_TSDEMUX_TAP
diff --git a/Documentation/media/uapi/dvb/dmx_types.rst b/Documentation/media/uapi/dvb/dmx_types.rst
index 171205ed86a4..2a023a4f516c 100644
--- a/Documentation/media/uapi/dvb/dmx_types.rst
+++ b/Documentation/media/uapi/dvb/dmx_types.rst
@@ -6,175 +6,4 @@
 Demux Data Types
 ****************
 
-Output for the demux
-====================
-
-.. c:type:: dmx_output
-
-.. tabularcolumns:: |p{5.0cm}|p{12.5cm}|
-
-.. flat-table:: enum dmx_output
-    :header-rows:  1
-    :stub-columns: 0
-
-
-    -  .. row 1
-
-       -  ID
-
-       -  Description
-
-    -  .. row 2
-
-       -  .. _DMX-OUT-DECODER:
-
-	  DMX_OUT_DECODER
-
-       -  Streaming directly to decoder.
-
-    -  .. row 3
-
-       -  .. _DMX-OUT-TAP:
-
-	  DMX_OUT_TAP
-
-       -  Output going to a memory buffer (to be retrieved via the read
-	  command). Delivers the stream output to the demux device on which
-	  the ioctl is called.
-
-    -  .. row 4
-
-       -  .. _DMX-OUT-TS-TAP:
-
-	  DMX_OUT_TS_TAP
-
-       -  Output multiplexed into a new TS (to be retrieved by reading from
-	  the logical DVR device). Routes output to the logical DVR device
-	  ``/dev/dvb/adapter?/dvr?``, which delivers a TS multiplexed from
-	  all filters for which ``DMX_OUT_TS_TAP`` was specified.
-
-    -  .. row 5
-
-       -  .. _DMX-OUT-TSDEMUX-TAP:
-
-	  DMX_OUT_TSDEMUX_TAP
-
-       -  Like :ref:`DMX_OUT_TS_TAP <DMX-OUT-TS-TAP>` but retrieved
-	  from the DMX device.
-
-
-dmx_input_t
-===========
-
-.. c:type:: dmx_input
-
-.. code-block:: c
-
-    typedef enum
-    {
-	DMX_IN_FRONTEND, /* Input from a front-end device.  */
-	DMX_IN_DVR       /* Input from the logical DVR device.  */
-    } dmx_input_t;
-
-
-dmx_pes_type_t
-==============
-
-.. c:type:: dmx_pes_type
-
-
-.. code-block:: c
-
-    typedef enum
-    {
-	DMX_PES_AUDIO0,
-	DMX_PES_VIDEO0,
-	DMX_PES_TELETEXT0,
-	DMX_PES_SUBTITLE0,
-	DMX_PES_PCR0,
-
-	DMX_PES_AUDIO1,
-	DMX_PES_VIDEO1,
-	DMX_PES_TELETEXT1,
-	DMX_PES_SUBTITLE1,
-	DMX_PES_PCR1,
-
-	DMX_PES_AUDIO2,
-	DMX_PES_VIDEO2,
-	DMX_PES_TELETEXT2,
-	DMX_PES_SUBTITLE2,
-	DMX_PES_PCR2,
-
-	DMX_PES_AUDIO3,
-	DMX_PES_VIDEO3,
-	DMX_PES_TELETEXT3,
-	DMX_PES_SUBTITLE3,
-	DMX_PES_PCR3,
-
-	DMX_PES_OTHER
-    } dmx_pes_type_t;
-
-
-struct dmx_filter
-=================
-
-.. c:type:: dmx_filter
-
-.. code-block:: c
-
-     typedef struct dmx_filter
-    {
-	__u8  filter[DMX_FILTER_SIZE];
-	__u8  mask[DMX_FILTER_SIZE];
-	__u8  mode[DMX_FILTER_SIZE];
-    } dmx_filter_t;
-
-
-.. c:type:: dmx_sct_filter_params
-
-struct dmx_sct_filter_params
-============================
-
-
-.. code-block:: c
-
-    struct dmx_sct_filter_params
-    {
-	__u16          pid;
-	dmx_filter_t   filter;
-	__u32          timeout;
-	__u32          flags;
-    #define DMX_CHECK_CRC       1
-    #define DMX_ONESHOT         2
-    #define DMX_IMMEDIATE_START 4
-    };
-
-
-struct dmx_pes_filter_params
-============================
-
-.. c:type:: dmx_pes_filter_params
-
-.. code-block:: c
-
-    struct dmx_pes_filter_params
-    {
-	__u16          pid;
-	dmx_input_t    input;
-	dmx_output_t   output;
-	dmx_pes_type_t pes_type;
-	__u32          flags;
-    };
-
-struct dmx_stc
-==============
-
-.. c:type:: dmx_stc
-
-.. code-block:: c
-
-    struct dmx_stc {
-	unsigned int num;   /* input : which STC? 0..N */
-	unsigned int base;  /* output: divisor for stc to get 90 kHz clock */
-	__u64 stc;      /* output: stc in 'base'*90 kHz units */
-    };
+.. kernel-doc:: include/uapi/linux/dvb/dmx.h
diff --git a/include/uapi/linux/dvb/dmx.h b/include/uapi/linux/dvb/dmx.h
index 08dc17060321..4e3f3a2fe83f 100644
--- a/include/uapi/linux/dvb/dmx.h
+++ b/include/uapi/linux/dvb/dmx.h
@@ -32,26 +32,74 @@
 
 #define DMX_FILTER_SIZE 16
 
-enum dmx_output
-{
-	DMX_OUT_DECODER, /* Streaming directly to decoder. */
-	DMX_OUT_TAP,     /* Output going to a memory buffer */
-			 /* (to be retrieved via the read command).*/
-	DMX_OUT_TS_TAP,  /* Output multiplexed into a new TS  */
-			 /* (to be retrieved by reading from the */
-			 /* logical DVR device).                 */
-	DMX_OUT_TSDEMUX_TAP /* Like TS_TAP but retrieved from the DMX device */
+/**
+ * enum dmx_output - Output for the demux.
+ *
+ * @DMX_OUT_DECODER:
+ *	Streaming directly to decoder.
+ * @DMX_OUT_TAP:
+ *	Output going to a memory buffer (to be retrieved via the read command).
+ *	Delivers the stream output to the demux device on which the ioctl
+ *	is called.
+ * @DMX_OUT_TS_TAP:
+ *	Output multiplexed into a new TS (to be retrieved by reading from the
+ *	logical DVR device). Routes output to the logical DVR device
+ *	``/dev/dvb/adapter?/dvr?``, which delivers a TS multiplexed from all
+ *	filters for which @DMX_OUT_TS_TAP was specified.
+ * @DMX_OUT_TSDEMUX_TAP:
+ *	Like @DMX_OUT_TS_TAP but retrieved from the DMX device.
+ */
+enum dmx_output {
+	DMX_OUT_DECODER,
+	DMX_OUT_TAP,
+	DMX_OUT_TS_TAP,
+	DMX_OUT_TSDEMUX_TAP
 };
 
-enum dmx_input
-{
-	DMX_IN_FRONTEND, /* Input from a front-end device.  */
-	DMX_IN_DVR       /* Input from the logical DVR device.  */
+
+/**
+ * enum dmx_input - Input from the demux.
+ *
+ * @DMX_IN_FRONTEND:	Input from a front-end device.
+ * @DMX_IN_DVR:		Input from the logical DVR device.
+ */
+enum dmx_input {
+	DMX_IN_FRONTEND,
+	DMX_IN_DVR
 };
 
+/**
+ * enum dmx_ts_pes - type of the PES filter.
+ *
+ * @DMX_PES_AUDIO0:	first audio PID. Also referred as @DMX_PES_AUDIO.
+ * @DMX_PES_VIDEO0:	first video PID. Also referred as @DMX_PES_VIDEO.
+ * @DMX_PES_TELETEXT0:	first teletext PID. Also referred as @DMX_PES_TELETEXT.
+ * @DMX_PES_SUBTITLE0:	first subtitle PID. Also referred as @DMX_PES_SUBTITLE.
+ * @DMX_PES_PCR0:	first Program Clock Reference PID.
+ *			Also referred as @DMX_PES_PCR.
+ *
+ * @DMX_PES_AUDIO1:	second audio PID.
+ * @DMX_PES_VIDEO1:	second video PID.
+ * @DMX_PES_TELETEXT1:	second teletext PID.
+ * @DMX_PES_SUBTITLE1:	second subtitle PID.
+ * @DMX_PES_PCR1:	second Program Clock Reference PID.
+ *
+ * @DMX_PES_AUDIO2:	third audio PID.
+ * @DMX_PES_VIDEO2:	third video PID.
+ * @DMX_PES_TELETEXT2:	third teletext PID.
+ * @DMX_PES_SUBTITLE2:	third subtitle PID.
+ * @DMX_PES_PCR2:	third Program Clock Reference PID.
+ *
+ * @DMX_PES_AUDIO3:	fourth audio PID.
+ * @DMX_PES_VIDEO3:	fourth video PID.
+ * @DMX_PES_TELETEXT3:	fourth teletext PID.
+ * @DMX_PES_SUBTITLE3:	fourth subtitle PID.
+ * @DMX_PES_PCR3:	fourth Program Clock Reference PID.
+ *
+ * @DMX_PES_OTHER:	any other PID.
+ */
 
-enum dmx_ts_pes
-{
+enum dmx_ts_pes {
 	DMX_PES_AUDIO0,
 	DMX_PES_VIDEO0,
 	DMX_PES_TELETEXT0,
@@ -86,16 +134,42 @@ enum dmx_ts_pes
 #define DMX_PES_PCR      DMX_PES_PCR0
 
 
-struct dmx_filter
-{
+
+/**
+ * struct dmx_filter - Specifies a section header filter.
+ *
+ * @filter: bit array with bits to be matched at the section header.
+ * @mask: bits that are valid at the filter bit array.
+ * @mode: mode of match: if bit is zero, it will match if equal (positive
+ *	  match); if bit is one, it will match if the bit is negated.
+ *
+ * Note: All arrays in this struct have a size of DMX_FILTER_SIZE (16 bytes).
+ */
+struct dmx_filter {
 	__u8  filter[DMX_FILTER_SIZE];
 	__u8  mask[DMX_FILTER_SIZE];
 	__u8  mode[DMX_FILTER_SIZE];
 };
 
-
-struct dmx_sct_filter_params
-{
+/**
+ * struct dmx_sct_filter_params - Specifies a section filter.
+ *
+ * @pid: PID to be filtered.
+ * @filter: section header filter, as defined by &struct dmx_filter.
+ * @timeout: maximum time to filter, in milliseconds.
+ * @flags: extra flags for the section filter.
+ *
+ * Carries the configuration for a MPEG-TS section filter.
+ *
+ * The @flags can be:
+ *
+ *	- %DMX_CHECK_CRC - only deliver sections where the CRC check succeeded;
+ *	- %DMX_ONESHOT - disable the section filter after one section
+ *	  has been delivered;
+ *	- %DMX_IMMEDIATE_START - Start filter immediately without requiring a
+ *	  :ref:`DMX_START`.
+ */
+struct dmx_sct_filter_params {
 	__u16             pid;
 	struct dmx_filter filter;
 	__u32             timeout;
@@ -105,7 +179,16 @@ struct dmx_sct_filter_params
 #define DMX_IMMEDIATE_START 4
 };
 
-
+/**
+ * struct dmx_pes_filter_params - Specifies Packetized Elementary Stream (PES)
+ *	filter parameters.
+ *
+ * @pid:	PID to be filtered.
+ * @input:	Demux input, as specified by &enum dmx_input.
+ * @output:	Demux output, as specified by &enum dmx_output.
+ * @pes_type:	Type of the pes filter, as specified by &enum dmx_pes_type.
+ * @flags:	Demux PES flags.
+ */
 struct dmx_pes_filter_params
 {
 	__u16           pid;
@@ -115,11 +198,17 @@ struct dmx_pes_filter_params
 	__u32           flags;
 };
 
-
+/**
+ * struct dmx_stc - Stores System Time Counter (STC) information.
+ *
+ * @num: input data: number of the STC, from 0 to N.
+ * @base: output: divisor for STC to get 90 kHz clock.
+ * @stc: output: stc in @base * 90 kHz units.
+ */
 struct dmx_stc {
-	unsigned int num;	/* input : which STC? 0..N */
-	unsigned int base;	/* output: divisor for stc to get 90 kHz clock */
-	__u64 stc;		/* output: stc in 'base'*90 kHz units */
+	unsigned int num;
+	unsigned int base;
+	__u64 stc;
 };
 
 #define DMX_START                _IO('o', 41)
-- 
cgit v1.2.3-71-gd317


From 833ff5e7feda1a042b83e82208cef3d212ca0ef1 Mon Sep 17 00:00:00 2001
From: Mauro Carvalho Chehab <mchehab@s-opensource.com>
Date: Fri, 1 Sep 2017 07:41:49 -0400
Subject: media: ca.h: get rid of CA_SET_PID

This ioctl seems to be some attempt to support a feature
at the bt8xx dst_ca driver. Yet, as said there, it
"needs more work". Right now, the code there is just
a boilerplate.

At the end of the day, no driver uses this ioctl, nor it is
documented anywhere (except for "needs more work").

So, get rid of it.

Signed-off-by: Mauro Carvalho Chehab <mchehab@s-opensource.com>
---
 Documentation/media/ca.h.rst.exceptions            |  1 -
 Documentation/media/dvb-drivers/ci.rst             |  1 -
 Documentation/media/uapi/dvb/ca-set-pid.rst        | 60 ----------------------
 Documentation/media/uapi/dvb/ca_data_types.rst     | 14 -----
 Documentation/media/uapi/dvb/ca_function_calls.rst |  1 -
 drivers/media/pci/bt8xx/dst_ca.c                   | 16 ------
 include/uapi/linux/dvb/ca.h                        |  7 ---
 7 files changed, 100 deletions(-)
 delete mode 100644 Documentation/media/uapi/dvb/ca-set-pid.rst

(limited to 'include/uapi/linux')

diff --git a/Documentation/media/ca.h.rst.exceptions b/Documentation/media/ca.h.rst.exceptions
index d7c9fed8c004..553559cc6ad7 100644
--- a/Documentation/media/ca.h.rst.exceptions
+++ b/Documentation/media/ca.h.rst.exceptions
@@ -16,7 +16,6 @@ replace define CA_NDS :c:type:`ca_descr_info`
 replace define CA_DSS :c:type:`ca_descr_info`
 
 # some typedefs should point to struct/enums
-replace typedef ca_pid_t :c:type:`ca_pid`
 replace typedef ca_slot_info_t :c:type:`ca_slot_info`
 replace typedef ca_descr_info_t :c:type:`ca_descr_info`
 replace typedef ca_caps_t :c:type:`ca_caps`
diff --git a/Documentation/media/dvb-drivers/ci.rst b/Documentation/media/dvb-drivers/ci.rst
index 69b07e9d1816..87f3748c49b9 100644
--- a/Documentation/media/dvb-drivers/ci.rst
+++ b/Documentation/media/dvb-drivers/ci.rst
@@ -143,7 +143,6 @@ All these ioctls are also valid for the High level CI interface
 #define CA_GET_MSG        _IOR('o', 132, ca_msg_t)
 #define CA_SEND_MSG       _IOW('o', 133, ca_msg_t)
 #define CA_SET_DESCR      _IOW('o', 134, ca_descr_t)
-#define CA_SET_PID        _IOW('o', 135, ca_pid_t)
 
 
 On querying the device, the device yields information thus:
diff --git a/Documentation/media/uapi/dvb/ca-set-pid.rst b/Documentation/media/uapi/dvb/ca-set-pid.rst
deleted file mode 100644
index 891c1c72ef24..000000000000
--- a/Documentation/media/uapi/dvb/ca-set-pid.rst
+++ /dev/null
@@ -1,60 +0,0 @@
-.. -*- coding: utf-8; mode: rst -*-
-
-.. _CA_SET_PID:
-
-==========
-CA_SET_PID
-==========
-
-Name
-----
-
-CA_SET_PID
-
-
-Synopsis
---------
-
-.. c:function:: int ioctl(fd, CA_SET_PID, struct ca_pid *pid)
-    :name: CA_SET_PID
-
-
-Arguments
----------
-
-``fd``
-  File descriptor returned by a previous call to :c:func:`open() <dvb-ca-open>`.
-
-``pid``
-  Pointer to struct :c:type:`ca_pid`.
-
-.. c:type:: ca_pid
-
-.. flat-table:: struct ca_pid
-    :header-rows:  1
-    :stub-columns: 0
-
-    -
-       - unsigned int
-       - pid
-       - Program ID
-
-    -
-       - int
-       - index
-       - PID index. Use -1 to disable.
-
-
-
-Description
------------
-
-.. note:: This ioctl is undocumented. Documentation is welcome.
-
-
-Return Value
-------------
-
-On success 0 is returned, on error -1 and the ``errno`` variable is set
-appropriately. The generic error codes are described at the
-:ref:`Generic Error Codes <gen-errors>` chapter.
diff --git a/Documentation/media/uapi/dvb/ca_data_types.rst b/Documentation/media/uapi/dvb/ca_data_types.rst
index d9e27c77426c..555b5137936b 100644
--- a/Documentation/media/uapi/dvb/ca_data_types.rst
+++ b/Documentation/media/uapi/dvb/ca_data_types.rst
@@ -94,17 +94,3 @@ ca_descr_t
 	unsigned int parity;
 	unsigned char cw[8];
     } ca_descr_t;
-
-
-.. c:type:: ca_pid
-
-ca-pid
-======
-
-
-.. code-block:: c
-
-    typedef struct ca_pid {
-	unsigned int pid;
-	int index;      /* -1 == disable*/
-    } ca_pid_t;
diff --git a/Documentation/media/uapi/dvb/ca_function_calls.rst b/Documentation/media/uapi/dvb/ca_function_calls.rst
index c085a0ebbc05..87d697851e82 100644
--- a/Documentation/media/uapi/dvb/ca_function_calls.rst
+++ b/Documentation/media/uapi/dvb/ca_function_calls.rst
@@ -18,4 +18,3 @@ CA Function Calls
     ca-get-msg
     ca-send-msg
     ca-set-descr
-    ca-set-pid
diff --git a/drivers/media/pci/bt8xx/dst_ca.c b/drivers/media/pci/bt8xx/dst_ca.c
index 90f4263452d3..7db47d8bbe15 100644
--- a/drivers/media/pci/bt8xx/dst_ca.c
+++ b/drivers/media/pci/bt8xx/dst_ca.c
@@ -64,13 +64,6 @@ static int ca_set_slot_descr(void)
 	return -EOPNOTSUPP;
 }
 
-/*	Need some more work	*/
-static int ca_set_pid(void)
-{
-	/*	We could make this more graceful ?	*/
-	return -EOPNOTSUPP;
-}
-
 static void put_command_and_length(u8 *data, int command, int length)
 {
 	data[0] = (command >> 16) & 0xff;
@@ -629,15 +622,6 @@ static long dst_ca_ioctl(struct file *file, unsigned int cmd, unsigned long ioct
 		}
 		dprintk(verbose, DST_CA_INFO, 1, " -->CA_SET_DESCR Success !");
 		break;
-	case CA_SET_PID:
-		dprintk(verbose, DST_CA_INFO, 1, " Setting PID");
-		if ((ca_set_pid()) < 0) {
-			dprintk(verbose, DST_CA_ERROR, 1, " -->CA_SET_PID Failed !");
-			result = -1;
-			goto free_mem_and_exit;
-		}
-		dprintk(verbose, DST_CA_INFO, 1, " -->CA_SET_PID Success !");
-		break;
 	default:
 		result = -EOPNOTSUPP;
 	}
diff --git a/include/uapi/linux/dvb/ca.h b/include/uapi/linux/dvb/ca.h
index 00cf24587bea..859f6c0c4751 100644
--- a/include/uapi/linux/dvb/ca.h
+++ b/include/uapi/linux/dvb/ca.h
@@ -73,11 +73,6 @@ struct ca_descr {
 	unsigned char cw[8];
 };
 
-struct ca_pid {
-	unsigned int pid;
-	int index;		/* -1 == disable*/
-};
-
 #define CA_RESET          _IO('o', 128)
 #define CA_GET_CAP        _IOR('o', 129, struct ca_caps)
 #define CA_GET_SLOT_INFO  _IOR('o', 130, struct ca_slot_info)
@@ -85,7 +80,6 @@ struct ca_pid {
 #define CA_GET_MSG        _IOR('o', 132, struct ca_msg)
 #define CA_SEND_MSG       _IOW('o', 133, struct ca_msg)
 #define CA_SET_DESCR      _IOW('o', 134, struct ca_descr)
-#define CA_SET_PID        _IOW('o', 135, struct ca_pid)
 
 #if !defined (__KERNEL__)
 
@@ -95,7 +89,6 @@ typedef struct ca_descr_info  ca_descr_info_t;
 typedef struct ca_caps  ca_caps_t;
 typedef struct ca_msg ca_msg_t;
 typedef struct ca_descr ca_descr_t;
-typedef struct ca_pid ca_pid_t;
 
 #endif
 
-- 
cgit v1.2.3-71-gd317


From fed7c4fe8bd0b131cc3f19ba2744061935cdcdb7 Mon Sep 17 00:00:00 2001
From: Mauro Carvalho Chehab <mchehab@s-opensource.com>
Date: Fri, 1 Sep 2017 07:48:02 -0400
Subject: media: ca.h: document most CA data types

For most of the stuff there, documenting is easy, as the
header file contains information.

Yet, I was unable to document two data structs:
	ca_msg and ca_descr

As those two structs are used by a few drivers, keep them.

Signed-off-by: Mauro Carvalho Chehab <mchehab@s-opensource.com>
---
 Documentation/media/uapi/dvb/ca_data_types.rst | 75 ++++---------------------
 include/uapi/linux/dvb/ca.h                    | 78 ++++++++++++++++++++------
 2 files changed, 70 insertions(+), 83 deletions(-)

(limited to 'include/uapi/linux')

diff --git a/Documentation/media/uapi/dvb/ca_data_types.rst b/Documentation/media/uapi/dvb/ca_data_types.rst
index 555b5137936b..aa57dd176825 100644
--- a/Documentation/media/uapi/dvb/ca_data_types.rst
+++ b/Documentation/media/uapi/dvb/ca_data_types.rst
@@ -6,91 +6,36 @@
 CA Data Types
 *************
 
+.. kernel-doc:: include/uapi/linux/dvb/ca.h
 
-.. c:type:: ca_slot_info
-
-ca_slot_info_t
-==============
-
-
-.. code-block:: c
-
-    typedef struct ca_slot_info {
-	int num;               /* slot number */
-
-	int type;              /* CA interface this slot supports */
-    #define CA_CI            1     /* CI high level interface */
-    #define CA_CI_LINK       2     /* CI link layer level interface */
-    #define CA_CI_PHYS       4     /* CI physical layer level interface */
-    #define CA_DESCR         8     /* built-in descrambler */
-    #define CA_SC          128     /* simple smart card interface */
-
-	unsigned int flags;
-    #define CA_CI_MODULE_PRESENT 1 /* module (or card) inserted */
-    #define CA_CI_MODULE_READY   2
-    } ca_slot_info_t;
-
-
-.. c:type:: ca_descr_info
-
-ca_descr_info_t
-===============
-
-
-.. code-block:: c
-
-    typedef struct ca_descr_info {
-	unsigned int num;  /* number of available descramblers (keys) */
-	unsigned int type; /* type of supported scrambling system */
-    #define CA_ECD           1
-    #define CA_NDS           2
-    #define CA_DSS           4
-    } ca_descr_info_t;
-
-
-.. c:type:: ca_caps
-
-ca_caps_t
-=========
-
+.. c:type:: ca_msg
 
-.. code-block:: c
+Undocumented data types
+=======================
 
-    typedef struct ca_caps {
-	unsigned int slot_num;  /* total number of CA card and module slots */
-	unsigned int slot_type; /* OR of all supported types */
-	unsigned int descr_num; /* total number of descrambler slots (keys) */
-	unsigned int descr_type;/* OR of all supported types */
-     } ca_cap_t;
+.. note::
 
+   Those data types are undocumented. Documentation is welcome.
 
 .. c:type:: ca_msg
 
-ca_msg_t
-========
-
-
 .. code-block:: c
 
     /* a message to/from a CI-CAM */
-    typedef struct ca_msg {
+    struct ca_msg {
 	unsigned int index;
 	unsigned int type;
 	unsigned int length;
 	unsigned char msg[256];
-    } ca_msg_t;
+    };
 
 
 .. c:type:: ca_descr
 
-ca_descr_t
-==========
-
-
 .. code-block:: c
 
-    typedef struct ca_descr {
+    struct ca_descr {
 	unsigned int index;
 	unsigned int parity;
 	unsigned char cw[8];
-    } ca_descr_t;
+    };
diff --git a/include/uapi/linux/dvb/ca.h b/include/uapi/linux/dvb/ca.h
index 859f6c0c4751..7ee641b4124c 100644
--- a/include/uapi/linux/dvb/ca.h
+++ b/include/uapi/linux/dvb/ca.h
@@ -24,39 +24,81 @@
 #ifndef _DVBCA_H_
 #define _DVBCA_H_
 
-/* slot interface types and info */
+/**
+ * struct ca_slot_info - CA slot interface types and info.
+ *
+ * @num:	slot number.
+ * @type:	slot type.
+ * @flags:	flags applicable to the slot.
+ *
+ * This struct stores the CA slot information.
+ *
+ * @type can be:
+ *
+ *	- %CA_CI - CI high level interface;
+ *	- %CA_CI_LINK - CI link layer level interface;
+ *	- %CA_CI_PHYS - CI physical layer level interface;
+ *	- %CA_DESCR - built-in descrambler;
+ *	- %CA_SC -simple smart card interface.
+ *
+ * @flags can be:
+ *
+ *	- %CA_CI_MODULE_PRESENT - module (or card) inserted;
+ *	- %CA_CI_MODULE_READY - module is ready for usage.
+ */
 
 struct ca_slot_info {
-	int num;               /* slot number */
-
-	int type;              /* CA interface this slot supports */
-#define CA_CI            1     /* CI high level interface */
-#define CA_CI_LINK       2     /* CI link layer level interface */
-#define CA_CI_PHYS       4     /* CI physical layer level interface */
-#define CA_DESCR         8     /* built-in descrambler */
-#define CA_SC          128     /* simple smart card interface */
+	int num;
+	int type;
+#define CA_CI            1
+#define CA_CI_LINK       2
+#define CA_CI_PHYS       4
+#define CA_DESCR         8
+#define CA_SC          128
 
 	unsigned int flags;
-#define CA_CI_MODULE_PRESENT 1 /* module (or card) inserted */
+#define CA_CI_MODULE_PRESENT 1
 #define CA_CI_MODULE_READY   2
 };
 
 
-/* descrambler types and info */
-
+/**
+ * struct ca_descr_info - descrambler types and info.
+ *
+ * @num:	number of available descramblers (keys).
+ * @type:	type of supported scrambling system.
+ *
+ * Identifies the number of descramblers and their type.
+ *
+ * @type can be:
+ *
+ *	- %CA_ECD - European Common Descrambler (ECD) hardware;
+ *	- %CA_NDS - Videoguard (NDS) hardware;
+ *	- %CA_DSS - Distributed Sample Scrambling (DSS) hardware.
+ */
 struct ca_descr_info {
-	unsigned int num;          /* number of available descramblers (keys) */
-	unsigned int type;         /* type of supported scrambling system */
+	unsigned int num;
+	unsigned int type;
 #define CA_ECD           1
 #define CA_NDS           2
 #define CA_DSS           4
 };
 
+/**
+ * struct ca_caps - CA slot interface capabilities.
+ *
+ * @slot_num:	total number of CA card and module slots.
+ * @slot_type:	bitmap with all supported types as defined at
+ *		&struct ca_slot_info (e. g. %CA_CI, %CA_CI_LINK, etc).
+ * @descr_num:	total number of descrambler slots (keys)
+ * @descr_type:	bitmap with all supported types as defined at
+ *		&struct ca_descr_info (e. g. %CA_ECD, %CA_NDS, etc).
+ */
 struct ca_caps {
-	unsigned int slot_num;     /* total number of CA card and module slots */
-	unsigned int slot_type;    /* OR of all supported types */
-	unsigned int descr_num;    /* total number of descrambler slots (keys) */
-	unsigned int descr_type;   /* OR of all supported types */
+	unsigned int slot_num;
+	unsigned int slot_type;
+	unsigned int descr_num;
+	unsigned int descr_type;
 };
 
 /* a message to/from a CI-CAM */
-- 
cgit v1.2.3-71-gd317


From 5176d6eefd5d58fbb787f96c2140cffb2e826b17 Mon Sep 17 00:00:00 2001
From: Mauro Carvalho Chehab <mchehab@s-opensource.com>
Date: Fri, 1 Sep 2017 15:05:28 -0400
Subject: media: frontend.h: Avoid the term DVB when doesn't refer to a
 delivery system

The DVB term can either refer to the subsystem or to a delivery
system. Avoid it in the first case at the kernel-doc markups.

Signed-off-by: Mauro Carvalho Chehab <mchehab@s-opensource.com>
---
 include/uapi/linux/dvb/frontend.h | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

(limited to 'include/uapi/linux')

diff --git a/include/uapi/linux/dvb/frontend.h b/include/uapi/linux/dvb/frontend.h
index e7c29d0bdee4..fc2edb6014fe 100644
--- a/include/uapi/linux/dvb/frontend.h
+++ b/include/uapi/linux/dvb/frontend.h
@@ -239,11 +239,11 @@ enum fe_sec_mini_cmd {
  * @FE_NONE:		The frontend doesn't have any kind of lock.
  *			That's the initial frontend status
  * @FE_HAS_SIGNAL:	Has found something above the noise level.
- * @FE_HAS_CARRIER:	Has found a DVB signal.
+ * @FE_HAS_CARRIER:	Has found a signal.
  * @FE_HAS_VITERBI:	FEC inner coding (Viterbi, LDPC or other inner code).
  *			is stable.
  * @FE_HAS_SYNC:	Synchronization bytes was found.
- * @FE_HAS_LOCK:	DVB were locked and everything is working.
+ * @FE_HAS_LOCK:	Digital TV were locked and everything is working.
  * @FE_TIMEDOUT:	Fo lock within the last about 2 seconds.
  * @FE_REINIT:		Frontend was reinitialized, application is recommended
  *			to reset DiSEqC, tone and parameters.
@@ -269,7 +269,7 @@ enum fe_status {
  * This parameter indicates if spectral inversion should be presumed or
  * not. In the automatic setting (``INVERSION_AUTO``) the hardware will try
  * to figure out the correct setting by itself. If the hardware doesn't
- * support, the DVB core will try to lock at the carrier first with
+ * support, the %dvb_frontend will try to lock at the carrier first with
  * inversion off. If it fails, it will try to enable inversion.
  */
 enum fe_spectral_inversion {
-- 
cgit v1.2.3-71-gd317


From 56d51b65bcc7a5780663abd579fb6f039616b347 Mon Sep 17 00:00:00 2001
From: Mauro Carvalho Chehab <mchehab@s-opensource.com>
Date: Fri, 1 Sep 2017 15:45:47 -0400
Subject: media: net.h: add kernel-doc and use it at Documentation/

As we did with frontend.h, ca.h and dmx.h, move the struct
definition to net.h.

That should help to keep it updated, as more stuff gets
added there.

Signed-off-by: Mauro Carvalho Chehab <mchehab@s-opensource.com>
---
 Documentation/media/uapi/dvb/net-add-if.rst | 34 -----------------------------
 Documentation/media/uapi/dvb/net-types.rst  |  9 ++++++++
 Documentation/media/uapi/dvb/net.rst        |  1 +
 include/uapi/linux/dvb/net.h                | 15 +++++++++++++
 4 files changed, 25 insertions(+), 34 deletions(-)
 create mode 100644 Documentation/media/uapi/dvb/net-types.rst

(limited to 'include/uapi/linux')

diff --git a/Documentation/media/uapi/dvb/net-add-if.rst b/Documentation/media/uapi/dvb/net-add-if.rst
index 1087efb9baa0..6749b70246c5 100644
--- a/Documentation/media/uapi/dvb/net-add-if.rst
+++ b/Documentation/media/uapi/dvb/net-add-if.rst
@@ -41,40 +41,6 @@ created.
 The struct :c:type:`dvb_net_if`::ifnum field will be
 filled with the number of the created interface.
 
-.. c:type:: dvb_net_if
-
-.. flat-table:: struct dvb_net_if
-    :header-rows:  1
-    :stub-columns: 0
-
-
-    -  .. row 1
-
-       -  ID
-
-       -  Description
-
-    -  .. row 2
-
-       -  pid
-
-       -  Packet ID (PID) of the MPEG-TS that contains data
-
-    -  .. row 3
-
-       -  ifnum
-
-       -  number of the Digital TV interface.
-
-    -  .. row 4
-
-       -  feedtype
-
-       -  Encapsulation type of the feed. It can be:
-	  ``DVB_NET_FEEDTYPE_MPE`` for MPE encoding or
-	  ``DVB_NET_FEEDTYPE_ULE`` for ULE encoding.
-
-
 Return Value
 ============
 
diff --git a/Documentation/media/uapi/dvb/net-types.rst b/Documentation/media/uapi/dvb/net-types.rst
new file mode 100644
index 000000000000..e1177bdcd623
--- /dev/null
+++ b/Documentation/media/uapi/dvb/net-types.rst
@@ -0,0 +1,9 @@
+.. -*- coding: utf-8; mode: rst -*-
+
+.. _dmx_types:
+
+**************
+Net Data Types
+**************
+
+.. kernel-doc:: include/uapi/linux/dvb/net.h
diff --git a/Documentation/media/uapi/dvb/net.rst b/Documentation/media/uapi/dvb/net.rst
index fdb5301a4b1f..e0cd4e402627 100644
--- a/Documentation/media/uapi/dvb/net.rst
+++ b/Documentation/media/uapi/dvb/net.rst
@@ -35,6 +35,7 @@ Digital TV net Function Calls
 .. toctree::
     :maxdepth: 1
 
+    net-types
     net-add-if
     net-remove-if
     net-get-if
diff --git a/include/uapi/linux/dvb/net.h b/include/uapi/linux/dvb/net.h
index f451e7eb0b0b..89d805f9a5a6 100644
--- a/include/uapi/linux/dvb/net.h
+++ b/include/uapi/linux/dvb/net.h
@@ -26,6 +26,21 @@
 
 #include <linux/types.h>
 
+/**
+ * struct dvb_net_if - describes a DVB network interface
+ *
+ * @pid: Packet ID (PID) of the MPEG-TS that contains data
+ * @if_num: number of the Digital TV interface.
+ * @feedtype: Encapsulation type of the feed.
+ *
+ * A MPEG-TS stream may contain packet IDs with IP packages on it.
+ * This struct describes it, and the type of encoding.
+ *
+ * @feedtype can be:
+ *
+ *	- %DVB_NET_FEEDTYPE_MPE for MPE encoding
+ *	- %DVB_NET_FEEDTYPE_ULE for ULE encoding.
+ */
 struct dvb_net_if {
 	__u16 pid;
 	__u16 if_num;
-- 
cgit v1.2.3-71-gd317


From bd9049edc66e13e868f819c39844f60443e70817 Mon Sep 17 00:00:00 2001
From: Mauro Carvalho Chehab <mchehab@s-opensource.com>
Date: Sun, 3 Sep 2017 20:50:17 -0400
Subject: media: ca docs: document CA_SET_DESCR ioctl and structs
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

The av7110 driver uses CA_SET_DESCR to store the descrambler
control words at the CA descrambler slots.

Document it.

Thanks-to: Honza Petrouš <jpetrous@gmail.com>
Signed-off-by: Mauro Carvalho Chehab <mchehab@s-opensource.com>
---
 Documentation/media/uapi/dvb/ca-set-descr.rst | 15 ++-------------
 include/uapi/linux/dvb/ca.h                   |  9 ++++++++-
 2 files changed, 10 insertions(+), 14 deletions(-)

(limited to 'include/uapi/linux')

diff --git a/Documentation/media/uapi/dvb/ca-set-descr.rst b/Documentation/media/uapi/dvb/ca-set-descr.rst
index 9c484317d55c..a6c47205ffd8 100644
--- a/Documentation/media/uapi/dvb/ca-set-descr.rst
+++ b/Documentation/media/uapi/dvb/ca-set-descr.rst
@@ -28,22 +28,11 @@ Arguments
 ``msg``
   Pointer to struct :c:type:`ca_descr`.
 
-.. c:type:: ca_descr
-
-.. code-block:: c
-
-    struct ca_descr {
-	unsigned int index;
-	unsigned int parity;
-	unsigned char cw[8];
-    };
-
-
 Description
 -----------
 
-.. note:: This ioctl is undocumented. Documentation is welcome.
-
+CA_SET_DESCR is used for feeding descrambler CA slots with descrambling
+keys (refered as control words).
 
 Return Value
 ------------
diff --git a/include/uapi/linux/dvb/ca.h b/include/uapi/linux/dvb/ca.h
index 7ee641b4124c..c36fdb8e2733 100644
--- a/include/uapi/linux/dvb/ca.h
+++ b/include/uapi/linux/dvb/ca.h
@@ -109,9 +109,16 @@ struct ca_msg {
 	unsigned char msg[256];
 };
 
+/**
+ * struct ca_descr - CA descrambler control words info
+ *
+ * @index: CA Descrambler slot
+ * @parity: control words parity, where 0 means even and 1 means odd
+ * @cw: CA Descrambler control words
+ */
 struct ca_descr {
 	unsigned int index;
-	unsigned int parity;	/* 0 == even, 1 == odd */
+	unsigned int parity;
 	unsigned char cw[8];
 };
 
-- 
cgit v1.2.3-71-gd317


From 7e6854a9bfea9ed6553acd0204da5101c9a2e6a0 Mon Sep 17 00:00:00 2001
From: Mauro Carvalho Chehab <mchehab@s-opensource.com>
Date: Mon, 4 Sep 2017 08:03:40 -0400
Subject: media: ca.h: document ca_msg and the corresponding ioctls

Usually, CA messages are sent/received via reading/writing at
the CA device node. However, two drivers (dst_ca and firedtv-ci)
also implement it via ioctls.

Apparently, on both cases, the net result is the same.

Anyway, let's document it.

Signed-off-by: Mauro Carvalho Chehab <mchehab@s-opensource.com>
---
 Documentation/media/uapi/dvb/ca-get-msg.rst  | 19 ++++++-------------
 Documentation/media/uapi/dvb/ca-send-msg.rst |  6 +++++-
 include/uapi/linux/dvb/ca.h                  | 11 ++++++++++-
 3 files changed, 21 insertions(+), 15 deletions(-)

(limited to 'include/uapi/linux')

diff --git a/Documentation/media/uapi/dvb/ca-get-msg.rst b/Documentation/media/uapi/dvb/ca-get-msg.rst
index bdb116552068..ceeda623ce93 100644
--- a/Documentation/media/uapi/dvb/ca-get-msg.rst
+++ b/Documentation/media/uapi/dvb/ca-get-msg.rst
@@ -28,22 +28,15 @@ Arguments
 ``msg``
   Pointer to struct :c:type:`ca_msg`.
 
-.. c:type:: ca_msg
-
-.. code-block:: c
-
-    /* a message to/from a CI-CAM */
-    struct ca_msg {
-	unsigned int index;
-	unsigned int type;
-	unsigned int length;
-	unsigned char msg[256];
-    };
-
 Description
 -----------
 
-.. note:: This ioctl is undocumented. Documentation is welcome.
+Receives a message via a CI CA module.
+
+.. note::
+
+   Please notice that, on most drivers, this is done by reading from
+   the /dev/adapter?/ca? device node.
 
 
 Return Value
diff --git a/Documentation/media/uapi/dvb/ca-send-msg.rst b/Documentation/media/uapi/dvb/ca-send-msg.rst
index 644b6bda1aea..9e91287b7bbc 100644
--- a/Documentation/media/uapi/dvb/ca-send-msg.rst
+++ b/Documentation/media/uapi/dvb/ca-send-msg.rst
@@ -32,8 +32,12 @@ Arguments
 Description
 -----------
 
-.. note:: This ioctl is undocumented. Documentation is welcome.
+Sends a message via a CI CA module.
 
+.. note::
+
+   Please notice that, on most drivers, this is done by writing
+   to the /dev/adapter?/ca? device node.
 
 Return Value
 ------------
diff --git a/include/uapi/linux/dvb/ca.h b/include/uapi/linux/dvb/ca.h
index c36fdb8e2733..24fc38efbc2b 100644
--- a/include/uapi/linux/dvb/ca.h
+++ b/include/uapi/linux/dvb/ca.h
@@ -101,7 +101,16 @@ struct ca_caps {
 	unsigned int descr_type;
 };
 
-/* a message to/from a CI-CAM */
+/**
+ * struct ca_msg - a message to/from a CI-CAM
+ *
+ * @index:	unused
+ * @type:	unused
+ * @length:	length of the message
+ * @msg:	message
+ *
+ * This struct carries a message to be send/received from a CI CA module.
+ */
 struct ca_msg {
 	unsigned int index;
 	unsigned int type;
-- 
cgit v1.2.3-71-gd317


From e4faa09b0dae4f8f429922190e9aa99a564ff785 Mon Sep 17 00:00:00 2001
From: Mauro Carvalho Chehab <mchehab@s-opensource.com>
Date: Tue, 5 Sep 2017 07:02:44 -0400
Subject: media: dvb headers: make checkpatch happier

Adjust dvb ca.h, dmx.h and frontend.h in order to make
checkpatch happier. Now, it only complains about the typedefs,
and those are there just to provide backward userspace
compatibility.

Signed-off-by: Mauro Carvalho Chehab <mchehab@s-opensource.com>
---
 include/uapi/linux/dvb/ca.h       | 2 +-
 include/uapi/linux/dvb/dmx.h      | 5 ++---
 include/uapi/linux/dvb/frontend.h | 6 +++---
 3 files changed, 6 insertions(+), 7 deletions(-)

(limited to 'include/uapi/linux')

diff --git a/include/uapi/linux/dvb/ca.h b/include/uapi/linux/dvb/ca.h
index 24fc38efbc2b..cb150029fdff 100644
--- a/include/uapi/linux/dvb/ca.h
+++ b/include/uapi/linux/dvb/ca.h
@@ -139,7 +139,7 @@ struct ca_descr {
 #define CA_SEND_MSG       _IOW('o', 133, struct ca_msg)
 #define CA_SET_DESCR      _IOW('o', 134, struct ca_descr)
 
-#if !defined (__KERNEL__)
+#if !defined(__KERNEL__)
 
 /* This is needed for legacy userspace support */
 typedef struct ca_slot_info ca_slot_info_t;
diff --git a/include/uapi/linux/dvb/dmx.h b/include/uapi/linux/dvb/dmx.h
index 4e3f3a2fe83f..4aa5f6a1815a 100644
--- a/include/uapi/linux/dvb/dmx.h
+++ b/include/uapi/linux/dvb/dmx.h
@@ -189,8 +189,7 @@ struct dmx_sct_filter_params {
  * @pes_type:	Type of the pes filter, as specified by &enum dmx_pes_type.
  * @flags:	Demux PES flags.
  */
-struct dmx_pes_filter_params
-{
+struct dmx_pes_filter_params {
 	__u16           pid;
 	enum dmx_input  input;
 	enum dmx_output output;
@@ -221,7 +220,7 @@ struct dmx_stc {
 #define DMX_ADD_PID              _IOW('o', 51, __u16)
 #define DMX_REMOVE_PID           _IOW('o', 52, __u16)
 
-#if !defined (__KERNEL__)
+#if !defined(__KERNEL__)
 
 /* This is needed for legacy userspace support */
 typedef enum dmx_output dmx_output_t;
diff --git a/include/uapi/linux/dvb/frontend.h b/include/uapi/linux/dvb/frontend.h
index fc2edb6014fe..861cacd5711f 100644
--- a/include/uapi/linux/dvb/frontend.h
+++ b/include/uapi/linux/dvb/frontend.h
@@ -907,7 +907,7 @@ struct dtv_properties {
 #define FE_SET_PROPERTY		   _IOW('o', 82, struct dtv_properties)
 #define FE_GET_PROPERTY		   _IOR('o', 83, struct dtv_properties)
 
-#if defined(__DVB_CORE__) || !defined (__KERNEL__)
+#if defined(__DVB_CORE__) || !defined(__KERNEL__)
 
 /*
  * DEPRECATED: Everything below is deprecated in favor of DVBv5 API
@@ -982,8 +982,8 @@ struct dvb_ofdm_parameters {
 };
 
 struct dvb_frontend_parameters {
-	__u32 frequency;     /* (absolute) frequency in Hz for DVB-C/DVB-T/ATSC */
-			     /* intermediate frequency in kHz for DVB-S */
+	__u32 frequency;  /* (absolute) frequency in Hz for DVB-C/DVB-T/ATSC */
+			  /* intermediate frequency in kHz for DVB-S */
 	fe_spectral_inversion_t inversion;
 	union {
 		struct dvb_qpsk_parameters qpsk;	/* DVB-S */
-- 
cgit v1.2.3-71-gd317


From aafd4562dfee81a40ba21b5ea3cf5e06664bc7f6 Mon Sep 17 00:00:00 2001
From: Mike Kravetz <mike.kravetz@oracle.com>
Date: Wed, 6 Sep 2017 16:23:29 -0700
Subject: mm: arch: consolidate mmap hugetlb size encodings

A non-default huge page size can be encoded in the flags argument of the
mmap system call.  The definitions for these encodings are in arch
specific header files.  However, all architectures use the same values.

Consolidate all the definitions in the primary user header file
(uapi/linux/mman.h).  Include definitions for all known huge page sizes.
Use the generic encoding definitions in hugetlb_encode.h as the basis
for these definitions.

Link: http://lkml.kernel.org/r/1501527386-10736-3-git-send-email-mike.kravetz@oracle.com
Signed-off-by: Mike Kravetz <mike.kravetz@oracle.com>
Acked-by: Michal Hocko <mhocko@suse.com>
Cc: Andi Kleen <ak@linux.intel.com>
Cc: Andrea Arcangeli <aarcange@redhat.com>
Cc: Aneesh Kumar K.V <aneesh.kumar@linux.vnet.ibm.com>
Cc: Anshuman Khandual <khandual@linux.vnet.ibm.com>
Cc: Arnd Bergmann <arnd@arndb.de>
Cc: Davidlohr Bueso <dbueso@suse.de>
Cc: Matthew Wilcox <willy@infradead.org>
Cc: Michael Kerrisk <mtk.manpages@gmail.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 arch/alpha/include/uapi/asm/mman.h     | 11 -----------
 arch/mips/include/uapi/asm/mman.h      | 11 -----------
 arch/parisc/include/uapi/asm/mman.h    | 11 -----------
 arch/powerpc/include/uapi/asm/mman.h   | 16 ----------------
 arch/x86/include/uapi/asm/mman.h       |  3 ---
 arch/xtensa/include/uapi/asm/mman.h    | 11 -----------
 include/uapi/asm-generic/mman-common.h | 11 -----------
 include/uapi/linux/mman.h              | 22 ++++++++++++++++++++++
 8 files changed, 22 insertions(+), 74 deletions(-)

(limited to 'include/uapi/linux')

diff --git a/arch/alpha/include/uapi/asm/mman.h b/arch/alpha/include/uapi/asm/mman.h
index 02760f6e6ca4..13b52aad3c43 100644
--- a/arch/alpha/include/uapi/asm/mman.h
+++ b/arch/alpha/include/uapi/asm/mman.h
@@ -67,17 +67,6 @@
 /* compatibility flags */
 #define MAP_FILE	0
 
-/*
- * When MAP_HUGETLB is set bits [26:31] encode the log2 of the huge page size.
- * This gives us 6 bits, which is enough until someone invents 128 bit address
- * spaces.
- *
- * Assume these are all power of twos.
- * When 0 use the default page size.
- */
-#define MAP_HUGE_SHIFT	26
-#define MAP_HUGE_MASK	0x3f
-
 #define PKEY_DISABLE_ACCESS	0x1
 #define PKEY_DISABLE_WRITE	0x2
 #define PKEY_ACCESS_MASK	(PKEY_DISABLE_ACCESS |\
diff --git a/arch/mips/include/uapi/asm/mman.h b/arch/mips/include/uapi/asm/mman.h
index 655e2fb5395b..398eebcc3541 100644
--- a/arch/mips/include/uapi/asm/mman.h
+++ b/arch/mips/include/uapi/asm/mman.h
@@ -94,17 +94,6 @@
 /* compatibility flags */
 #define MAP_FILE	0
 
-/*
- * When MAP_HUGETLB is set bits [26:31] encode the log2 of the huge page size.
- * This gives us 6 bits, which is enough until someone invents 128 bit address
- * spaces.
- *
- * Assume these are all power of twos.
- * When 0 use the default page size.
- */
-#define MAP_HUGE_SHIFT	26
-#define MAP_HUGE_MASK	0x3f
-
 #define PKEY_DISABLE_ACCESS	0x1
 #define PKEY_DISABLE_WRITE	0x2
 #define PKEY_ACCESS_MASK	(PKEY_DISABLE_ACCESS |\
diff --git a/arch/parisc/include/uapi/asm/mman.h b/arch/parisc/include/uapi/asm/mman.h
index 9a9c2fe4be50..b87fbe3f338a 100644
--- a/arch/parisc/include/uapi/asm/mman.h
+++ b/arch/parisc/include/uapi/asm/mman.h
@@ -64,17 +64,6 @@
 #define MAP_FILE	0
 #define MAP_VARIABLE	0
 
-/*
- * When MAP_HUGETLB is set bits [26:31] encode the log2 of the huge page size.
- * This gives us 6 bits, which is enough until someone invents 128 bit address
- * spaces.
- *
- * Assume these are all power of twos.
- * When 0 use the default page size.
- */
-#define MAP_HUGE_SHIFT	26
-#define MAP_HUGE_MASK	0x3f
-
 #define PKEY_DISABLE_ACCESS	0x1
 #define PKEY_DISABLE_WRITE	0x2
 #define PKEY_ACCESS_MASK	(PKEY_DISABLE_ACCESS |\
diff --git a/arch/powerpc/include/uapi/asm/mman.h b/arch/powerpc/include/uapi/asm/mman.h
index ab45cc2f3101..03c06ba7464f 100644
--- a/arch/powerpc/include/uapi/asm/mman.h
+++ b/arch/powerpc/include/uapi/asm/mman.h
@@ -29,20 +29,4 @@
 #define MAP_STACK	0x20000		/* give out an address that is best suited for process/thread stacks */
 #define MAP_HUGETLB	0x40000		/* create a huge page mapping */
 
-/*
- * When MAP_HUGETLB is set, bits [26:31] of the flags argument to mmap(2),
- * encode the log2 of the huge page size. A value of zero indicates that the
- * default huge page size should be used. To use a non-default huge page size,
- * one of these defines can be used, or the size can be encoded by hand. Note
- * that on most systems only a subset, or possibly none, of these sizes will be
- * available.
- */
-#define MAP_HUGE_512KB	(19 << MAP_HUGE_SHIFT)	/* 512KB HugeTLB Page */
-#define MAP_HUGE_1MB	(20 << MAP_HUGE_SHIFT)	/* 1MB   HugeTLB Page */
-#define MAP_HUGE_2MB	(21 << MAP_HUGE_SHIFT)	/* 2MB   HugeTLB Page */
-#define MAP_HUGE_8MB	(23 << MAP_HUGE_SHIFT)	/* 8MB   HugeTLB Page */
-#define MAP_HUGE_16MB	(24 << MAP_HUGE_SHIFT)	/* 16MB  HugeTLB Page */
-#define MAP_HUGE_1GB	(30 << MAP_HUGE_SHIFT)	/* 1GB   HugeTLB Page */
-#define MAP_HUGE_16GB	(34 << MAP_HUGE_SHIFT)	/* 16GB  HugeTLB Page */
-
 #endif /* _UAPI_ASM_POWERPC_MMAN_H */
diff --git a/arch/x86/include/uapi/asm/mman.h b/arch/x86/include/uapi/asm/mman.h
index 39bca7fac087..3be08f07695c 100644
--- a/arch/x86/include/uapi/asm/mman.h
+++ b/arch/x86/include/uapi/asm/mman.h
@@ -3,9 +3,6 @@
 
 #define MAP_32BIT	0x40		/* only give out 32bit addresses */
 
-#define MAP_HUGE_2MB    (21 << MAP_HUGE_SHIFT)
-#define MAP_HUGE_1GB    (30 << MAP_HUGE_SHIFT)
-
 #ifdef CONFIG_X86_INTEL_MEMORY_PROTECTION_KEYS
 /*
  * Take the 4 protection key bits out of the vma->vm_flags
diff --git a/arch/xtensa/include/uapi/asm/mman.h b/arch/xtensa/include/uapi/asm/mman.h
index 24365b30aae9..8ce77a2e9bab 100644
--- a/arch/xtensa/include/uapi/asm/mman.h
+++ b/arch/xtensa/include/uapi/asm/mman.h
@@ -106,17 +106,6 @@
 /* compatibility flags */
 #define MAP_FILE	0
 
-/*
- * When MAP_HUGETLB is set bits [26:31] encode the log2 of the huge page size.
- * This gives us 6 bits, which is enough until someone invents 128 bit address
- * spaces.
- *
- * Assume these are all power of twos.
- * When 0 use the default page size.
- */
-#define MAP_HUGE_SHIFT	26
-#define MAP_HUGE_MASK	0x3f
-
 #define PKEY_DISABLE_ACCESS	0x1
 #define PKEY_DISABLE_WRITE	0x2
 #define PKEY_ACCESS_MASK	(PKEY_DISABLE_ACCESS |\
diff --git a/include/uapi/asm-generic/mman-common.h b/include/uapi/asm-generic/mman-common.h
index 8c27db0c5c08..d248f3c335b5 100644
--- a/include/uapi/asm-generic/mman-common.h
+++ b/include/uapi/asm-generic/mman-common.h
@@ -61,17 +61,6 @@
 /* compatibility flags */
 #define MAP_FILE	0
 
-/*
- * When MAP_HUGETLB is set bits [26:31] encode the log2 of the huge page size.
- * This gives us 6 bits, which is enough until someone invents 128 bit address
- * spaces.
- *
- * Assume these are all power of twos.
- * When 0 use the default page size.
- */
-#define MAP_HUGE_SHIFT	26
-#define MAP_HUGE_MASK	0x3f
-
 #define PKEY_DISABLE_ACCESS	0x1
 #define PKEY_DISABLE_WRITE	0x2
 #define PKEY_ACCESS_MASK	(PKEY_DISABLE_ACCESS |\
diff --git a/include/uapi/linux/mman.h b/include/uapi/linux/mman.h
index ade4acd3a90c..a937480d7cd3 100644
--- a/include/uapi/linux/mman.h
+++ b/include/uapi/linux/mman.h
@@ -2,6 +2,7 @@
 #define _UAPI_LINUX_MMAN_H
 
 #include <asm/mman.h>
+#include <asm-generic/hugetlb_encode.h>
 
 #define MREMAP_MAYMOVE	1
 #define MREMAP_FIXED	2
@@ -10,4 +11,25 @@
 #define OVERCOMMIT_ALWAYS		1
 #define OVERCOMMIT_NEVER		2
 
+/*
+ * Huge page size encoding when MAP_HUGETLB is specified, and a huge page
+ * size other than the default is desired.  See hugetlb_encode.h.
+ * All known huge page size encodings are provided here.  It is the
+ * responsibility of the application to know which sizes are supported on
+ * the running system.  See mmap(2) man page for details.
+ */
+#define MAP_HUGE_SHIFT	HUGETLB_FLAG_ENCODE_SHIFT
+#define MAP_HUGE_MASK	HUGETLB_FLAG_ENCODE_MASK
+
+#define MAP_HUGE_64KB	HUGETLB_FLAG_ENCODE_64KB
+#define MAP_HUGE_512KB	HUGETLB_FLAG_ENCODE_512KB
+#define MAP_HUGE_1MB	HUGETLB_FLAG_ENCODE_1MB
+#define MAP_HUGE_2MB	HUGETLB_FLAG_ENCODE_2MB
+#define MAP_HUGE_8MB	HUGETLB_FLAG_ENCODE_8MB
+#define MAP_HUGE_16MB	HUGETLB_FLAG_ENCODE_16MB
+#define MAP_HUGE_256MB	HUGETLB_FLAG_ENCODE_256MB
+#define MAP_HUGE_1GB	HUGETLB_FLAG_ENCODE_1GB
+#define MAP_HUGE_2GB	HUGETLB_FLAG_ENCODE_2GB
+#define MAP_HUGE_16GB	HUGETLB_FLAG_ENCODE_16GB
+
 #endif /* _UAPI_LINUX_MMAN_H */
-- 
cgit v1.2.3-71-gd317


From 4da243ac1cf6aeb30b7c555d56208982d66d6d33 Mon Sep 17 00:00:00 2001
From: Mike Kravetz <mike.kravetz@oracle.com>
Date: Wed, 6 Sep 2017 16:23:33 -0700
Subject: mm: shm: use new hugetlb size encoding definitions

Use the common definitions from hugetlb_encode.h header file for
encoding hugetlb size definitions in shmget system call flags.

In addition, move these definitions from the internal (kernel) to user
(uapi) header file.

Link: http://lkml.kernel.org/r/1501527386-10736-4-git-send-email-mike.kravetz@oracle.com
Signed-off-by: Mike Kravetz <mike.kravetz@oracle.com>
Suggested-by: Matthew Wilcox <willy@infradead.org>
Acked-by: Michal Hocko <mhocko@suse.com>
Cc: Andi Kleen <ak@linux.intel.com>
Cc: Andrea Arcangeli <aarcange@redhat.com>
Cc: Aneesh Kumar K.V <aneesh.kumar@linux.vnet.ibm.com>
Cc: Anshuman Khandual <khandual@linux.vnet.ibm.com>
Cc: Arnd Bergmann <arnd@arndb.de>
Cc: Davidlohr Bueso <dbueso@suse.de>
Cc: Michael Kerrisk <mtk.manpages@gmail.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/shm.h      | 17 -----------------
 include/uapi/linux/shm.h | 31 +++++++++++++++++++++++++++++--
 2 files changed, 29 insertions(+), 19 deletions(-)

(limited to 'include/uapi/linux')

diff --git a/include/linux/shm.h b/include/linux/shm.h
index 0fb7061ec54c..21a5e6c43385 100644
--- a/include/linux/shm.h
+++ b/include/linux/shm.h
@@ -27,23 +27,6 @@ struct shmid_kernel /* private to the kernel */
 /* shm_mode upper byte flags */
 #define	SHM_DEST	01000	/* segment will be destroyed on last detach */
 #define SHM_LOCKED      02000   /* segment will not be swapped */
-#define SHM_HUGETLB     04000   /* segment will use huge TLB pages */
-#define SHM_NORESERVE   010000  /* don't check for reservations */
-
-/* Bits [26:31] are reserved */
-
-/*
- * When SHM_HUGETLB is set bits [26:31] encode the log2 of the huge page size.
- * This gives us 6 bits, which is enough until someone invents 128 bit address
- * spaces.
- *
- * Assume these are all power of twos.
- * When 0 use the default page size.
- */
-#define SHM_HUGE_SHIFT  26
-#define SHM_HUGE_MASK   0x3f
-#define SHM_HUGE_2MB    (21 << SHM_HUGE_SHIFT)
-#define SHM_HUGE_1GB    (30 << SHM_HUGE_SHIFT)
 
 #ifdef CONFIG_SYSVIPC
 struct sysv_shm {
diff --git a/include/uapi/linux/shm.h b/include/uapi/linux/shm.h
index 1fbf24ea37fd..cf23c873719d 100644
--- a/include/uapi/linux/shm.h
+++ b/include/uapi/linux/shm.h
@@ -3,6 +3,7 @@
 
 #include <linux/ipc.h>
 #include <linux/errno.h>
+#include <asm-generic/hugetlb_encode.h>
 #ifndef __KERNEL__
 #include <unistd.h>
 #endif
@@ -40,11 +41,37 @@ struct shmid_ds {
 /* Include the definition of shmid64_ds and shminfo64 */
 #include <asm/shmbuf.h>
 
-/* permission flag for shmget */
+/*
+ * shmget() shmflg values.
+ */
+/* The bottom nine bits are the same as open(2) mode flags */
 #define SHM_R		0400	/* or S_IRUGO from <linux/stat.h> */
 #define SHM_W		0200	/* or S_IWUGO from <linux/stat.h> */
+/* Bits 9 & 10 are IPC_CREAT and IPC_EXCL */
+#define SHM_HUGETLB	04000	/* segment will use huge TLB pages */
+#define SHM_NORESERVE	010000	/* don't check for reservations */
+
+/*
+ * Huge page size encoding when SHM_HUGETLB is specified, and a huge page
+ * size other than the default is desired.  See hugetlb_encode.h
+ */
+#define SHM_HUGE_SHIFT	HUGETLB_FLAG_ENCODE_SHIFT
+#define SHM_HUGE_MASK	HUGETLB_FLAG_ENCODE_MASK
+
+#define SHM_HUGE_64KB	HUGETLB_FLAG_ENCODE_64KB
+#define SHM_HUGE_512KB	HUGETLB_FLAG_ENCODE_512KB
+#define SHM_HUGE_1MB	HUGETLB_FLAG_ENCODE_1MB
+#define SHM_HUGE_2MB	HUGETLB_FLAG_ENCODE_2MB
+#define SHM_HUGE_8MB	HUGETLB_FLAG_ENCODE_8MB
+#define SHM_HUGE_16MB	HUGETLB_FLAG_ENCODE_16MB
+#define SHM_HUGE_256MB	HUGETLB_FLAG_ENCODE_256MB
+#define SHM_HUGE_1GB	HUGETLB_FLAG_ENCODE_1GB
+#define SHM_HUGE_2GB	HUGETLB_FLAG_ENCODE_2GB
+#define SHM_HUGE_16GB	HUGETLB_FLAG_ENCODE_16GB
 
-/* mode for attach */
+/*
+ * shmat() shmflg values
+ */
 #define	SHM_RDONLY	010000	/* read-only access */
 #define	SHM_RND		020000	/* round attach address to SHMLBA boundary */
 #define	SHM_REMAP	040000	/* take-over region on attach */
-- 
cgit v1.2.3-71-gd317


From 2d6d6f5a09a96cc1fec7ed992b825e05f64cb50e Mon Sep 17 00:00:00 2001
From: Prakash Sangappa <prakash.sangappa@oracle.com>
Date: Wed, 6 Sep 2017 16:23:39 -0700
Subject: mm: userfaultfd: add feature to request for a signal delivery

In some cases, userfaultfd mechanism should just deliver a SIGBUS signal
to the faulting process, instead of the page-fault event.  Dealing with
page-fault event using a monitor thread can be an overhead in these
cases.  For example applications like the database could use the
signaling mechanism for robustness purpose.

Database uses hugetlbfs for performance reason.  Files on hugetlbfs
filesystem are created and huge pages allocated using fallocate() API.
Pages are deallocated/freed using fallocate() hole punching support.
These files are mmapped and accessed by many processes as shared memory.
The database keeps track of which offsets in the hugetlbfs file have
pages allocated.

Any access to mapped address over holes in the file, which can occur due
to bugs in the application, is considered invalid and expect the process
to simply receive a SIGBUS.  However, currently when a hole in the file
is accessed via the mapped address, kernel/mm attempts to automatically
allocate a page at page fault time, resulting in implicitly filling the
hole in the file.  This may not be the desired behavior for applications
like the database that want to explicitly manage page allocations of
hugetlbfs files.

Using userfaultfd mechanism with this support to get a signal, database
application can prevent pages from being allocated implicitly when
processes access mapped address over holes in the file.

This patch adds UFFD_FEATURE_SIGBUS feature to userfaultfd mechnism to
request for a SIGBUS signal.

See following for previous discussion about the database requirement
leading to this proposal as suggested by Andrea.

http://www.spinics.net/lists/linux-mm/msg129224.html

Link: http://lkml.kernel.org/r/1501552446-748335-2-git-send-email-prakash.sangappa@oracle.com
Signed-off-by: Prakash Sangappa <prakash.sangappa@oracle.com>
Reviewed-by: Mike Rapoport <rppt@linux.vnet.ibm.com>
Reviewed-by: Andrea Arcangeli <aarcange@redhat.com>
Cc: Mike Kravetz <mike.kravetz@oracle.com>
Cc: Shuah Khan <shuah@kernel.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 fs/userfaultfd.c                 |  3 +++
 include/uapi/linux/userfaultfd.h | 10 +++++++++-
 2 files changed, 12 insertions(+), 1 deletion(-)

(limited to 'include/uapi/linux')

diff --git a/fs/userfaultfd.c b/fs/userfaultfd.c
index 01a85e2660b8..5fd4d846691f 100644
--- a/fs/userfaultfd.c
+++ b/fs/userfaultfd.c
@@ -370,6 +370,9 @@ int handle_userfault(struct vm_fault *vmf, unsigned long reason)
 	VM_BUG_ON(reason & ~(VM_UFFD_MISSING|VM_UFFD_WP));
 	VM_BUG_ON(!(reason & VM_UFFD_MISSING) ^ !!(reason & VM_UFFD_WP));
 
+	if (ctx->features & UFFD_FEATURE_SIGBUS)
+		goto out;
+
 	/*
 	 * If it's already released don't get it. This avoids to loop
 	 * in __get_user_pages if userfaultfd_release waits on the
diff --git a/include/uapi/linux/userfaultfd.h b/include/uapi/linux/userfaultfd.h
index 3b059530dac9..d39d5db56771 100644
--- a/include/uapi/linux/userfaultfd.h
+++ b/include/uapi/linux/userfaultfd.h
@@ -23,7 +23,8 @@
 			   UFFD_FEATURE_EVENT_REMOVE |	\
 			   UFFD_FEATURE_EVENT_UNMAP |		\
 			   UFFD_FEATURE_MISSING_HUGETLBFS |	\
-			   UFFD_FEATURE_MISSING_SHMEM)
+			   UFFD_FEATURE_MISSING_SHMEM |		\
+			   UFFD_FEATURE_SIGBUS)
 #define UFFD_API_IOCTLS				\
 	((__u64)1 << _UFFDIO_REGISTER |		\
 	 (__u64)1 << _UFFDIO_UNREGISTER |	\
@@ -153,6 +154,12 @@ struct uffdio_api {
 	 * UFFD_FEATURE_MISSING_SHMEM works the same as
 	 * UFFD_FEATURE_MISSING_HUGETLBFS, but it applies to shmem
 	 * (i.e. tmpfs and other shmem based APIs).
+	 *
+	 * UFFD_FEATURE_SIGBUS feature means no page-fault
+	 * (UFFD_EVENT_PAGEFAULT) event will be delivered, instead
+	 * a SIGBUS signal will be sent to the faulting process.
+	 * The application process can enable this behavior by adding
+	 * it to uffdio_api.features.
 	 */
 #define UFFD_FEATURE_PAGEFAULT_FLAG_WP		(1<<0)
 #define UFFD_FEATURE_EVENT_FORK			(1<<1)
@@ -161,6 +168,7 @@ struct uffdio_api {
 #define UFFD_FEATURE_MISSING_HUGETLBFS		(1<<4)
 #define UFFD_FEATURE_MISSING_SHMEM		(1<<5)
 #define UFFD_FEATURE_EVENT_UNMAP		(1<<6)
+#define UFFD_FEATURE_SIGBUS			(1<<7)
 	__u64 features;
 
 	__u64 ioctls;
-- 
cgit v1.2.3-71-gd317


From 9d4ac934829ac58c5109c49e6dfe677300e5e652 Mon Sep 17 00:00:00 2001
From: Alexey Perevalov <a.perevalov@samsung.com>
Date: Wed, 6 Sep 2017 16:23:56 -0700
Subject: userfaultfd: provide pid in userfault msg

It could be useful for calculating downtime during postcopy live
migration per vCPU.  Side observer or application itself will be
informed about proper task's sleep during userfaultfd processing.

Process's thread id is being provided when user requeste it by setting
UFFD_FEATURE_THREAD_ID bit into uffdio_api.features.

Link: http://lkml.kernel.org/r/20170802165145.22628-6-aarcange@redhat.com
Signed-off-by: Alexey Perevalov <a.perevalov@samsung.com>
Signed-off-by: Andrea Arcangeli <aarcange@redhat.com>
Cc: "Dr. David Alan Gilbert" <dgilbert@redhat.com>
Cc: Maxime Coquelin <maxime.coquelin@redhat.com>
Cc: Mike Kravetz <mike.kravetz@oracle.com>
Cc: Mike Rapoport <rppt@linux.vnet.ibm.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 fs/userfaultfd.c                 |  8 ++++++--
 include/uapi/linux/userfaultfd.h | 10 +++++++---
 2 files changed, 13 insertions(+), 5 deletions(-)

(limited to 'include/uapi/linux')

diff --git a/fs/userfaultfd.c b/fs/userfaultfd.c
index 5fd4d846691f..665bf7a930b2 100644
--- a/fs/userfaultfd.c
+++ b/fs/userfaultfd.c
@@ -178,7 +178,8 @@ static inline void msg_init(struct uffd_msg *msg)
 
 static inline struct uffd_msg userfault_msg(unsigned long address,
 					    unsigned int flags,
-					    unsigned long reason)
+					    unsigned long reason,
+					    unsigned int features)
 {
 	struct uffd_msg msg;
 	msg_init(&msg);
@@ -202,6 +203,8 @@ static inline struct uffd_msg userfault_msg(unsigned long address,
 		 * write protect fault.
 		 */
 		msg.arg.pagefault.flags |= UFFD_PAGEFAULT_FLAG_WP;
+	if (features & UFFD_FEATURE_THREAD_ID)
+		msg.arg.pagefault.ptid = task_pid_vnr(current);
 	return msg;
 }
 
@@ -422,7 +425,8 @@ int handle_userfault(struct vm_fault *vmf, unsigned long reason)
 
 	init_waitqueue_func_entry(&uwq.wq, userfaultfd_wake_function);
 	uwq.wq.private = current;
-	uwq.msg = userfault_msg(vmf->address, vmf->flags, reason);
+	uwq.msg = userfault_msg(vmf->address, vmf->flags, reason,
+			ctx->features);
 	uwq.ctx = ctx;
 	uwq.waken = false;
 
diff --git a/include/uapi/linux/userfaultfd.h b/include/uapi/linux/userfaultfd.h
index d39d5db56771..2b24c28d99a7 100644
--- a/include/uapi/linux/userfaultfd.h
+++ b/include/uapi/linux/userfaultfd.h
@@ -24,7 +24,8 @@
 			   UFFD_FEATURE_EVENT_UNMAP |		\
 			   UFFD_FEATURE_MISSING_HUGETLBFS |	\
 			   UFFD_FEATURE_MISSING_SHMEM |		\
-			   UFFD_FEATURE_SIGBUS)
+			   UFFD_FEATURE_SIGBUS |		\
+			   UFFD_FEATURE_THREAD_ID)
 #define UFFD_API_IOCTLS				\
 	((__u64)1 << _UFFDIO_REGISTER |		\
 	 (__u64)1 << _UFFDIO_UNREGISTER |	\
@@ -79,6 +80,7 @@ struct uffd_msg {
 		struct {
 			__u64	flags;
 			__u64	address;
+			__u32   ptid;
 		} pagefault;
 
 		struct {
@@ -158,8 +160,9 @@ struct uffdio_api {
 	 * UFFD_FEATURE_SIGBUS feature means no page-fault
 	 * (UFFD_EVENT_PAGEFAULT) event will be delivered, instead
 	 * a SIGBUS signal will be sent to the faulting process.
-	 * The application process can enable this behavior by adding
-	 * it to uffdio_api.features.
+	 *
+	 * UFFD_FEATURE_THREAD_ID pid of the page faulted task_struct will
+	 * be returned, if feature is not requested 0 will be returned.
 	 */
 #define UFFD_FEATURE_PAGEFAULT_FLAG_WP		(1<<0)
 #define UFFD_FEATURE_EVENT_FORK			(1<<1)
@@ -169,6 +172,7 @@ struct uffdio_api {
 #define UFFD_FEATURE_MISSING_SHMEM		(1<<5)
 #define UFFD_FEATURE_EVENT_UNMAP		(1<<6)
 #define UFFD_FEATURE_SIGBUS			(1<<7)
+#define UFFD_FEATURE_THREAD_ID			(1<<8)
 	__u64 features;
 
 	__u64 ioctls;
-- 
cgit v1.2.3-71-gd317


From a36985d31a65d5c0559fb582719e32eaf0ccec3b Mon Sep 17 00:00:00 2001
From: Andrea Arcangeli <aarcange@redhat.com>
Date: Wed, 6 Sep 2017 16:23:59 -0700
Subject: userfaultfd: provide pid in userfault msg - add feat union

No ABI change, but this will make it more explicit to software that ptid
is only available if requested by passing UFFD_FEATURE_THREAD_ID to
UFFDIO_API.  The fact it's a union will also self document it shouldn't
be taken for granted there's a tpid there.

Link: http://lkml.kernel.org/r/20170802165145.22628-7-aarcange@redhat.com
Signed-off-by: Andrea Arcangeli <aarcange@redhat.com>
Cc: "Dr. David Alan Gilbert" <dgilbert@redhat.com>
Cc: Alexey Perevalov <a.perevalov@samsung.com>
Cc: Maxime Coquelin <maxime.coquelin@redhat.com>
Cc: Mike Kravetz <mike.kravetz@oracle.com>
Cc: Mike Rapoport <rppt@linux.vnet.ibm.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 fs/userfaultfd.c                 | 2 +-
 include/uapi/linux/userfaultfd.h | 4 +++-
 2 files changed, 4 insertions(+), 2 deletions(-)

(limited to 'include/uapi/linux')

diff --git a/fs/userfaultfd.c b/fs/userfaultfd.c
index 665bf7a930b2..5419e7da82ba 100644
--- a/fs/userfaultfd.c
+++ b/fs/userfaultfd.c
@@ -204,7 +204,7 @@ static inline struct uffd_msg userfault_msg(unsigned long address,
 		 */
 		msg.arg.pagefault.flags |= UFFD_PAGEFAULT_FLAG_WP;
 	if (features & UFFD_FEATURE_THREAD_ID)
-		msg.arg.pagefault.ptid = task_pid_vnr(current);
+		msg.arg.pagefault.feat.ptid = task_pid_vnr(current);
 	return msg;
 }
 
diff --git a/include/uapi/linux/userfaultfd.h b/include/uapi/linux/userfaultfd.h
index 2b24c28d99a7..d6d1f65cb3c3 100644
--- a/include/uapi/linux/userfaultfd.h
+++ b/include/uapi/linux/userfaultfd.h
@@ -80,7 +80,9 @@ struct uffd_msg {
 		struct {
 			__u64	flags;
 			__u64	address;
-			__u32   ptid;
+			union {
+				__u32 ptid;
+			} feat;
 		} pagefault;
 
 		struct {
-- 
cgit v1.2.3-71-gd317


From 749df87bd7bee5a79cef073f5d032ddb2b211de8 Mon Sep 17 00:00:00 2001
From: Mike Kravetz <mike.kravetz@oracle.com>
Date: Wed, 6 Sep 2017 16:24:16 -0700
Subject: mm/shmem: add hugetlbfs support to memfd_create()

This patch came out of discussions in this e-mail thread:
  http://lkml.kernel.org/r/1499357846-7481-1-git-send-email-mike.kravetz%40oracle.com

The Oracle JVM team is developing a new garbage collection model.  This
new model requires multiple mappings of the same anonymous memory.  One
straight forward way to accomplish this is with memfd_create.  They can
use the returned fd to create multiple mappings of the same memory.

The JVM today has an option to use (static hugetlb) huge pages.  If this
option is specified, they would like to use the same garbage collection
model requiring multiple mappings to the same memory.  Using hugetlbfs,
it is possible to explicitly mount a filesystem and specify file paths
in order to get an fd that can be used for multiple mappings.  However,
this introduces additional system admin work and coordination.

Ideally they would like to get a hugetlbfs fd without requiring explicit
mounting of a filesystem.  Today, mmap and shmget can make use of
hugetlbfs without explicitly mounting a filesystem.  The patch adds this
functionality to memfd_create.

Add a new flag MFD_HUGETLB to memfd_create() that will specify the file
to be created resides in the hugetlbfs filesystem.  This is the generic
hugetlbfs filesystem not associated with any specific mount point.  As
with other system calls that request hugetlbfs backed pages, there is
the ability to encode huge page size in the flag arguments.

hugetlbfs does not support sealing operations, therefore specifying
MFD_ALLOW_SEALING with MFD_HUGETLB will result in EINVAL.

Of course, the memfd_man page would need updating if this type of
functionality moves forward.

Link: http://lkml.kernel.org/r/1502149672-7759-2-git-send-email-mike.kravetz@oracle.com
Signed-off-by: Mike Kravetz <mike.kravetz@oracle.com>
Acked-by: Michal Hocko <mhocko@suse.com>
Cc: Hugh Dickins <hughd@google.com>
Cc: Andrea Arcangeli <aarcange@redhat.com>
Cc: "Kirill A . Shutemov" <kirill.shutemov@linux.intel.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/uapi/linux/memfd.h | 24 ++++++++++++++++++++++++
 mm/shmem.c                 | 37 +++++++++++++++++++++++++++++++------
 2 files changed, 55 insertions(+), 6 deletions(-)

(limited to 'include/uapi/linux')

diff --git a/include/uapi/linux/memfd.h b/include/uapi/linux/memfd.h
index 534e364bda92..7f3a722dbd72 100644
--- a/include/uapi/linux/memfd.h
+++ b/include/uapi/linux/memfd.h
@@ -1,8 +1,32 @@
 #ifndef _UAPI_LINUX_MEMFD_H
 #define _UAPI_LINUX_MEMFD_H
 
+#include <asm-generic/hugetlb_encode.h>
+
 /* flags for memfd_create(2) (unsigned int) */
 #define MFD_CLOEXEC		0x0001U
 #define MFD_ALLOW_SEALING	0x0002U
+#define MFD_HUGETLB		0x0004U
+
+/*
+ * Huge page size encoding when MFD_HUGETLB is specified, and a huge page
+ * size other than the default is desired.  See hugetlb_encode.h.
+ * All known huge page size encodings are provided here.  It is the
+ * responsibility of the application to know which sizes are supported on
+ * the running system.  See mmap(2) man page for details.
+ */
+#define MFD_HUGE_SHIFT	HUGETLB_FLAG_ENCODE_SHIFT
+#define MFD_HUGE_MASK	HUGETLB_FLAG_ENCODE_MASK
+
+#define MFD_HUGE_64KB	HUGETLB_FLAG_ENCODE_64KB
+#define MFD_HUGE_512KB	HUGETLB_FLAG_ENCODE_512KB
+#define MFD_HUGE_1MB	HUGETLB_FLAG_ENCODE_1MB
+#define MFD_HUGE_2MB	HUGETLB_FLAG_ENCODE_2MB
+#define MFD_HUGE_8MB	HUGETLB_FLAG_ENCODE_8MB
+#define MFD_HUGE_16MB	HUGETLB_FLAG_ENCODE_16MB
+#define MFD_HUGE_256MB	HUGETLB_FLAG_ENCODE_256MB
+#define MFD_HUGE_1GB	HUGETLB_FLAG_ENCODE_1GB
+#define MFD_HUGE_2GB	HUGETLB_FLAG_ENCODE_2GB
+#define MFD_HUGE_16GB	HUGETLB_FLAG_ENCODE_16GB
 
 #endif /* _UAPI_LINUX_MEMFD_H */
diff --git a/mm/shmem.c b/mm/shmem.c
index 64bdc91187f7..47179bbe9ee7 100644
--- a/mm/shmem.c
+++ b/mm/shmem.c
@@ -34,6 +34,7 @@
 #include <linux/swap.h>
 #include <linux/uio.h>
 #include <linux/khugepaged.h>
+#include <linux/hugetlb.h>
 
 #include <asm/tlbflush.h> /* for arch/microblaze update_mmu_cache() */
 
@@ -3652,7 +3653,7 @@ static int shmem_show_options(struct seq_file *seq, struct dentry *root)
 #define MFD_NAME_PREFIX_LEN (sizeof(MFD_NAME_PREFIX) - 1)
 #define MFD_NAME_MAX_LEN (NAME_MAX - MFD_NAME_PREFIX_LEN)
 
-#define MFD_ALL_FLAGS (MFD_CLOEXEC | MFD_ALLOW_SEALING)
+#define MFD_ALL_FLAGS (MFD_CLOEXEC | MFD_ALLOW_SEALING | MFD_HUGETLB)
 
 SYSCALL_DEFINE2(memfd_create,
 		const char __user *, uname,
@@ -3664,8 +3665,18 @@ SYSCALL_DEFINE2(memfd_create,
 	char *name;
 	long len;
 
-	if (flags & ~(unsigned int)MFD_ALL_FLAGS)
-		return -EINVAL;
+	if (!(flags & MFD_HUGETLB)) {
+		if (flags & ~(unsigned int)MFD_ALL_FLAGS)
+			return -EINVAL;
+	} else {
+		/* Sealing not supported in hugetlbfs (MFD_HUGETLB) */
+		if (flags & MFD_ALLOW_SEALING)
+			return -EINVAL;
+		/* Allow huge page size encoding in flags. */
+		if (flags & ~(unsigned int)(MFD_ALL_FLAGS |
+				(MFD_HUGE_MASK << MFD_HUGE_SHIFT)))
+			return -EINVAL;
+	}
 
 	/* length includes terminating zero */
 	len = strnlen_user(uname, MFD_NAME_MAX_LEN + 1);
@@ -3696,16 +3707,30 @@ SYSCALL_DEFINE2(memfd_create,
 		goto err_name;
 	}
 
-	file = shmem_file_setup(name, 0, VM_NORESERVE);
+	if (flags & MFD_HUGETLB) {
+		struct user_struct *user = NULL;
+
+		file = hugetlb_file_setup(name, 0, VM_NORESERVE, &user,
+					HUGETLB_ANONHUGE_INODE,
+					(flags >> MFD_HUGE_SHIFT) &
+					MFD_HUGE_MASK);
+	} else
+		file = shmem_file_setup(name, 0, VM_NORESERVE);
 	if (IS_ERR(file)) {
 		error = PTR_ERR(file);
 		goto err_fd;
 	}
-	info = SHMEM_I(file_inode(file));
 	file->f_mode |= FMODE_LSEEK | FMODE_PREAD | FMODE_PWRITE;
 	file->f_flags |= O_RDWR | O_LARGEFILE;
-	if (flags & MFD_ALLOW_SEALING)
+
+	if (flags & MFD_ALLOW_SEALING) {
+		/*
+		 * flags check at beginning of function ensures
+		 * this is not a hugetlbfs (MFD_HUGETLB) file.
+		 */
+		info = SHMEM_I(file_inode(file));
 		info->seals &= ~F_SEAL_SEAL;
+	}
 
 	fd_install(fd, file);
 	kfree(name);
-- 
cgit v1.2.3-71-gd317


From 3dd8f7c3b78b9556582fd64bf5c9986723f9dca1 Mon Sep 17 00:00:00 2001
From: Ian Kent <raven@themaw.net>
Date: Fri, 8 Sep 2017 16:16:30 -0700
Subject: autofs: make dev ioctl version and ismountpoint user accessible

Some of the autofs miscellaneous device ioctls need to be accessable to
user space applications without CAP_SYS_ADMIN to get information about
autofs mounts.

Link: http://lkml.kernel.org/r/150216642517.11652.2338933266137331637.stgit@pluto.themaw.net
Signed-off-by: Ian Kent <raven@themaw.net>
Cc: Colin Walters <walters@redhat.com>
Cc: Ondrej Holy <oholy@redhat.com>
Cc: David Howells <dhowells@redhat.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 fs/autofs4/dev-ioctl.c              | 12 ++++++++----
 include/uapi/linux/auto_dev-ioctl.h |  2 +-
 2 files changed, 9 insertions(+), 5 deletions(-)

(limited to 'include/uapi/linux')

diff --git a/fs/autofs4/dev-ioctl.c b/fs/autofs4/dev-ioctl.c
index 218a4ecc75cc..ea8b3a1cddd2 100644
--- a/fs/autofs4/dev-ioctl.c
+++ b/fs/autofs4/dev-ioctl.c
@@ -628,10 +628,6 @@ static int _autofs_dev_ioctl(unsigned int command,
 	ioctl_fn fn = NULL;
 	int err = 0;
 
-	/* only root can play with this */
-	if (!capable(CAP_SYS_ADMIN))
-		return -EPERM;
-
 	cmd_first = _IOC_NR(AUTOFS_DEV_IOCTL_IOC_FIRST);
 	cmd = _IOC_NR(command);
 
@@ -640,6 +636,14 @@ static int _autofs_dev_ioctl(unsigned int command,
 		return -ENOTTY;
 	}
 
+	/* Only root can use ioctls other than AUTOFS_DEV_IOCTL_VERSION_CMD
+	 * and AUTOFS_DEV_IOCTL_ISMOUNTPOINT_CMD
+	 */
+	if (cmd != AUTOFS_DEV_IOCTL_VERSION_CMD &&
+	    cmd != AUTOFS_DEV_IOCTL_ISMOUNTPOINT_CMD &&
+	    !capable(CAP_SYS_ADMIN))
+		return -EPERM;
+
 	/* Copy the parameters into kernel space. */
 	param = copy_dev_ioctl(user);
 	if (IS_ERR(param))
diff --git a/include/uapi/linux/auto_dev-ioctl.h b/include/uapi/linux/auto_dev-ioctl.h
index 744b3d060968..5558db8e6646 100644
--- a/include/uapi/linux/auto_dev-ioctl.h
+++ b/include/uapi/linux/auto_dev-ioctl.h
@@ -16,7 +16,7 @@
 #define AUTOFS_DEVICE_NAME		"autofs"
 
 #define AUTOFS_DEV_IOCTL_VERSION_MAJOR	1
-#define AUTOFS_DEV_IOCTL_VERSION_MINOR	0
+#define AUTOFS_DEV_IOCTL_VERSION_MINOR	1
 
 #define AUTOFS_DEV_IOCTL_SIZE		sizeof(struct autofs_dev_ioctl)
 
-- 
cgit v1.2.3-71-gd317


From 1f28c5d055032e7e8ee5e48198dca7e125d0eec6 Mon Sep 17 00:00:00 2001
From: Tomohiro Kusumi <tkusumi@tuxera.com>
Date: Fri, 8 Sep 2017 16:16:34 -0700
Subject: autofs: remove unused AUTOFS_IOC_EXPIRE_DIRECT/INDIRECT

These are not used by either kernel or userspace, although
AUTOFS_IOC_EXPIRE_DIRECT once seems to have been used by userspace in
around 2006-2008, which was technically just an alias of the existing
ioctl AUTOFS_IOC_EXPIRE_MULTI.

ioctls for autofs are already complicated enough that they could be
removed unless these are staying here to be able to compile userspace code
of certain period of time from a decade ago.

Edit: raven@themaw.net
Yes, this is indeed very old and anything that still uses must
be updated becuase it will be using broken functionality.
End edit: raven@themaw.net

Link: http://lkml.kernel.org/r/150285067347.4670.11494624644273072003.stgit@pluto.themaw.net
Signed-off-by: Tomohiro Kusumi <tkusumi@tuxera.com>
Signed-off-by: Ian Kent <raven@themaw.net>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/uapi/linux/auto_fs4.h | 2 --
 1 file changed, 2 deletions(-)

(limited to 'include/uapi/linux')

diff --git a/include/uapi/linux/auto_fs4.h b/include/uapi/linux/auto_fs4.h
index 7c6da423d54e..9453e9a07c9d 100644
--- a/include/uapi/linux/auto_fs4.h
+++ b/include/uapi/linux/auto_fs4.h
@@ -155,8 +155,6 @@ enum {
 };
 
 #define AUTOFS_IOC_EXPIRE_MULTI    _IOW(AUTOFS_IOCTL, AUTOFS_IOC_EXPIRE_MULTI_CMD, int)
-#define AUTOFS_IOC_EXPIRE_INDIRECT AUTOFS_IOC_EXPIRE_MULTI
-#define AUTOFS_IOC_EXPIRE_DIRECT   AUTOFS_IOC_EXPIRE_MULTI
 #define AUTOFS_IOC_PROTOSUBVER     _IOR(AUTOFS_IOCTL, AUTOFS_IOC_PROTOSUBVER_CMD, int)
 #define AUTOFS_IOC_ASKUMOUNT       _IOR(AUTOFS_IOCTL, AUTOFS_IOC_ASKUMOUNT_CMD, int)
 
-- 
cgit v1.2.3-71-gd317


From a2d818030135c293f878fbb772cf40e7a14c5acc Mon Sep 17 00:00:00 2001
From: "Robert P. J. Day" <rpjday@crashcourse.ca>
Date: Fri, 8 Sep 2017 16:17:19 -0700
Subject: drivers/pps: aesthetic tweaks to PPS-related content

Collection of aesthetic adjustments to various PPS-related files,
directories and Documentation, some quite minor just for the sake of
consistency, including:

 * Updated example of pps device tree node (courtesy Rodolfo G.)
 * "PPS-API" -> "PPS API"
 * "pps_source_info_s" -> "pps_source_info"
 * "ktimer driver" -> "pps-ktimer driver"
 * "ppstest /dev/pps0" -> "ppstest /dev/pps1" to match example
 * Add missing PPS-related entries to MAINTAINERS file
 * Other trivialities

Link: http://lkml.kernel.org/r/alpine.LFD.2.20.1708261048220.8106@localhost.localdomain
Signed-off-by: Robert P. J. Day <rpjday@crashcourse.ca>
Acked-by: Rodolfo Giometti <giometti@enneenne.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 Documentation/devicetree/bindings/pps/pps-gpio.txt |  8 +++-
 Documentation/pps/pps.txt                          | 44 +++++++++++-----------
 MAINTAINERS                                        |  3 ++
 include/linux/pps-gpio.h                           |  2 +-
 include/linux/pps_kernel.h                         | 16 ++++----
 include/uapi/linux/pps.h                           |  4 +-
 kernel/time/timekeeping.c                          |  2 +-
 7 files changed, 43 insertions(+), 36 deletions(-)

(limited to 'include/uapi/linux')

diff --git a/Documentation/devicetree/bindings/pps/pps-gpio.txt b/Documentation/devicetree/bindings/pps/pps-gpio.txt
index 40bf9c3564a5..0de23b793657 100644
--- a/Documentation/devicetree/bindings/pps/pps-gpio.txt
+++ b/Documentation/devicetree/bindings/pps/pps-gpio.txt
@@ -13,8 +13,12 @@ Optional properties:
 
 Example:
 	pps {
-		compatible = "pps-gpio";
-		gpios = <&gpio2 6 0>;
+		pinctrl-names = "default";
+		pinctrl-0 = <&pinctrl_pps>;
 
+		gpios = <&gpio1 26 GPIO_ACTIVE_HIGH>;
 		assert-falling-edge;
+
+		compatible = "pps-gpio";
+		status = "okay";
 	};
diff --git a/Documentation/pps/pps.txt b/Documentation/pps/pps.txt
index 1fdbd5447216..99f5d8c4c652 100644
--- a/Documentation/pps/pps.txt
+++ b/Documentation/pps/pps.txt
@@ -48,12 +48,12 @@ problem:
    time_pps_create().
 
 This implies that the source has a /dev/... entry. This assumption is
-ok for the serial and parallel port, where you can do something
+OK for the serial and parallel port, where you can do something
 useful besides(!) the gathering of timestamps as it is the central
-task for a PPS-API. But this assumption does not work for a single
+task for a PPS API. But this assumption does not work for a single
 purpose GPIO line. In this case even basic file-related functionality
 (like read() and write()) makes no sense at all and should not be a
-precondition for the use of a PPS-API.
+precondition for the use of a PPS API.
 
 The problem can be simply solved if you consider that a PPS source is
 not always connected with a GPS data source.
@@ -88,13 +88,13 @@ Coding example
 --------------
 
 To register a PPS source into the kernel you should define a struct
-pps_source_info_s as follows:
+pps_source_info as follows:
 
     static struct pps_source_info pps_ktimer_info = {
 	    .name         = "ktimer",
 	    .path         = "",
-	    .mode         = PPS_CAPTUREASSERT | PPS_OFFSETASSERT | \
-			    PPS_ECHOASSERT | \
+	    .mode         = PPS_CAPTUREASSERT | PPS_OFFSETASSERT |
+			    PPS_ECHOASSERT |
 			    PPS_CANWAIT | PPS_TSFMT_TSPEC,
 	    .echo         = pps_ktimer_echo,
 	    .owner        = THIS_MODULE,
@@ -108,13 +108,13 @@ initialization routine as follows:
 
 The pps_register_source() prototype is:
 
-  int pps_register_source(struct pps_source_info_s *info, int default_params)
+  int pps_register_source(struct pps_source_info *info, int default_params)
 
 where "info" is a pointer to a structure that describes a particular
 PPS source, "default_params" tells the system what the initial default
 parameters for the device should be (it is obvious that these parameters
 must be a subset of ones defined in the struct
-pps_source_info_s which describe the capabilities of the driver).
+pps_source_info which describe the capabilities of the driver).
 
 Once you have registered a new PPS source into the system you can
 signal an assert event (for example in the interrupt handler routine)
@@ -142,8 +142,10 @@ If the SYSFS filesystem is enabled in the kernel it provides a new class:
 Every directory is the ID of a PPS sources defined in the system and
 inside you find several files:
 
-   $ ls /sys/class/pps/pps0/
-   assert	clear  echo  mode  name  path  subsystem@  uevent
+   $ ls -F /sys/class/pps/pps0/
+   assert     dev        mode       path       subsystem@
+   clear      echo       name       power/     uevent
+
 
 Inside each "assert" and "clear" file you can find the timestamp and a
 sequence number:
@@ -154,32 +156,32 @@ sequence number:
 Where before the "#" is the timestamp in seconds; after it is the
 sequence number. Other files are:
 
-* echo: reports if the PPS source has an echo function or not;
+ * echo: reports if the PPS source has an echo function or not;
 
-* mode: reports available PPS functioning modes;
+ * mode: reports available PPS functioning modes;
 
-* name: reports the PPS source's name;
+ * name: reports the PPS source's name;
 
-* path: reports the PPS source's device path, that is the device the
-  PPS source is connected to (if it exists).
+ * path: reports the PPS source's device path, that is the device the
+   PPS source is connected to (if it exists).
 
 
 Testing the PPS support
 -----------------------
 
 In order to test the PPS support even without specific hardware you can use
-the ktimer driver (see the client subsection in the PPS configuration menu)
+the pps-ktimer driver (see the client subsection in the PPS configuration menu)
 and the userland tools available in your distribution's pps-tools package,
-http://linuxpps.org , or https://github.com/ago/pps-tools .
+http://linuxpps.org , or https://github.com/redlab-i/pps-tools.
 
-Once you have enabled the compilation of ktimer just modprobe it (if
+Once you have enabled the compilation of pps-ktimer just modprobe it (if
 not statically compiled):
 
-   # modprobe ktimer
+   # modprobe pps-ktimer
 
 and the run ppstest as follow:
 
-   $ ./ppstest /dev/pps0
+   $ ./ppstest /dev/pps1
    trying PPS source "/dev/pps1"
    found PPS source "/dev/pps1"
    ok, found 1 source(s), now start fetching data...
@@ -187,7 +189,7 @@ and the run ppstest as follow:
    source 0 - assert 1186592700.388931295, sequence: 365 - clear  0.000000000, sequence: 0
    source 0 - assert 1186592701.389032765, sequence: 366 - clear  0.000000000, sequence: 0
 
-Please, note that to compile userland programs you need the file timepps.h .
+Please note that to compile userland programs, you need the file timepps.h.
 This is available in the pps-tools repository mentioned above.
 
 
diff --git a/MAINTAINERS b/MAINTAINERS
index ff3a349f24e4..109c5d9a04c4 100644
--- a/MAINTAINERS
+++ b/MAINTAINERS
@@ -10725,8 +10725,11 @@ W:	http://wiki.enneenne.com/index.php/LinuxPPS_support
 L:	linuxpps@ml.enneenne.com (subscribers-only)
 S:	Maintained
 F:	Documentation/pps/
+F:	Documentation/devicetree/bindings/pps/pps-gpio.txt
+F:	Documentation/ABI/testing/sysfs-pps
 F:	drivers/pps/
 F:	include/linux/pps*.h
+F:	include/uapi/linux/pps.h
 
 PPTP DRIVER
 M:	Dmitry Kozlov <xeb@mail.ru>
diff --git a/include/linux/pps-gpio.h b/include/linux/pps-gpio.h
index 0035abe41b9a..56f35dd3d01d 100644
--- a/include/linux/pps-gpio.h
+++ b/include/linux/pps-gpio.h
@@ -29,4 +29,4 @@ struct pps_gpio_platform_data {
 	const char *gpio_label;
 };
 
-#endif
+#endif /* _PPS_GPIO_H */
diff --git a/include/linux/pps_kernel.h b/include/linux/pps_kernel.h
index 35ac903956c7..80a980cc8d95 100644
--- a/include/linux/pps_kernel.h
+++ b/include/linux/pps_kernel.h
@@ -22,7 +22,6 @@
 #define LINUX_PPS_KERNEL_H
 
 #include <linux/pps.h>
-
 #include <linux/cdev.h>
 #include <linux/device.h>
 #include <linux/time.h>
@@ -35,9 +34,9 @@ struct pps_device;
 
 /* The specific PPS source info */
 struct pps_source_info {
-	char name[PPS_MAX_NAME_LEN];		/* simbolic name */
+	char name[PPS_MAX_NAME_LEN];		/* symbolic name */
 	char path[PPS_MAX_NAME_LEN];		/* path of connected device */
-	int mode;				/* PPS's allowed mode */
+	int mode;				/* PPS allowed mode */
 
 	void (*echo)(struct pps_device *pps,
 			int event, void *data);	/* PPS echo function */
@@ -57,10 +56,10 @@ struct pps_event_time {
 struct pps_device {
 	struct pps_source_info info;		/* PSS source info */
 
-	struct pps_kparams params;		/* PPS's current params */
+	struct pps_kparams params;		/* PPS current params */
 
-	__u32 assert_sequence;			/* PPS' assert event seq # */
-	__u32 clear_sequence;			/* PPS' clear event seq # */
+	__u32 assert_sequence;			/* PPS assert event seq # */
+	__u32 clear_sequence;			/* PPS clear event seq # */
 	struct pps_ktime assert_tu;
 	struct pps_ktime clear_tu;
 	int current_mode;			/* PPS mode at event time */
@@ -69,7 +68,7 @@ struct pps_device {
 	wait_queue_head_t queue;		/* PPS event queue */
 
 	unsigned int id;			/* PPS source unique ID */
-	void const *lookup_cookie;		/* pps_lookup_dev only */
+	void const *lookup_cookie;		/* For pps_lookup_dev() only */
 	struct cdev cdev;
 	struct device *dev;
 	struct fasync_struct *async_queue;	/* fasync method */
@@ -101,7 +100,7 @@ extern struct pps_device *pps_register_source(
 extern void pps_unregister_source(struct pps_device *pps);
 extern void pps_event(struct pps_device *pps,
 		struct pps_event_time *ts, int event, void *data);
-/* Look up a pps device by magic cookie */
+/* Look up a pps_device by magic cookie */
 struct pps_device *pps_lookup_dev(void const *cookie);
 
 static inline void timespec_to_pps_ktime(struct pps_ktime *kt,
@@ -132,4 +131,3 @@ static inline void pps_sub_ts(struct pps_event_time *ts, struct timespec64 delta
 }
 
 #endif /* LINUX_PPS_KERNEL_H */
-
diff --git a/include/uapi/linux/pps.h b/include/uapi/linux/pps.h
index c1cb3825a8bc..c29d6b791c08 100644
--- a/include/uapi/linux/pps.h
+++ b/include/uapi/linux/pps.h
@@ -95,8 +95,8 @@ struct pps_kparams {
 #define PPS_CAPTURECLEAR	0x02	/* capture clear events */
 #define PPS_CAPTUREBOTH		0x03	/* capture assert and clear events */
 
-#define PPS_OFFSETASSERT	0x10	/* apply compensation for assert ev. */
-#define PPS_OFFSETCLEAR		0x20	/* apply compensation for clear ev. */
+#define PPS_OFFSETASSERT	0x10	/* apply compensation for assert event */
+#define PPS_OFFSETCLEAR		0x20	/* apply compensation for clear event */
 
 #define PPS_CANWAIT		0x100	/* can we wait for an event? */
 #define PPS_CANPOLL		0x200	/* bit reserved for future use */
diff --git a/kernel/time/timekeeping.c b/kernel/time/timekeeping.c
index 8ea4fb315719..2cafb49aa65e 100644
--- a/kernel/time/timekeeping.c
+++ b/kernel/time/timekeeping.c
@@ -2316,7 +2316,7 @@ void hardpps(const struct timespec64 *phase_ts, const struct timespec64 *raw_ts)
 	raw_spin_unlock_irqrestore(&timekeeper_lock, flags);
 }
 EXPORT_SYMBOL(hardpps);
-#endif
+#endif /* CONFIG_NTP_PPS */
 
 /**
  * xtime_update() - advances the timekeeping infrastructure
-- 
cgit v1.2.3-71-gd317


From 9beb8bedb05c5f3a353dba62b8fa7cbbb9ec685e Mon Sep 17 00:00:00 2001
From: Daniel Borkmann <daniel@iogearbox.net>
Date: Sat, 9 Sep 2017 01:40:35 +0200
Subject: bpf: make error reporting in bpf_warn_invalid_xdp_action more clear

Differ between illegal XDP action code and just driver
unsupported one to provide better feedback when we throw
a one-time warning here. Reason is that with 814abfabef3c
("xdp: add bpf_redirect helper function") not all drivers
support the new XDP return code yet and thus they will
fall into their 'default' case when checking for return
codes after program return, which then triggers a
bpf_warn_invalid_xdp_action() stating that the return
code is illegal, but from XDP perspective it's not.

I decided not to place something like a XDP_ACT_MAX define
into uapi i) given we don't have this either for all other
program types, ii) future action codes could have further
encoding there, which would render such define unsuitable
and we wouldn't be able to rip it out again, and iii) we
rarely add new action codes.

Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
Acked-by: Alexei Starovoitov <ast@kernel.org>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/uapi/linux/bpf.h | 4 ++--
 net/core/filter.c        | 6 +++++-
 2 files changed, 7 insertions(+), 3 deletions(-)

(limited to 'include/uapi/linux')

diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h
index ba848b761cfb..43ab5c402f98 100644
--- a/include/uapi/linux/bpf.h
+++ b/include/uapi/linux/bpf.h
@@ -766,8 +766,8 @@ struct bpf_sock {
 
 /* User return codes for XDP prog type.
  * A valid XDP program must return one of these defined values. All other
- * return codes are reserved for future use. Unknown return codes will result
- * in packet drop.
+ * return codes are reserved for future use. Unknown return codes will
+ * result in packet drops and a warning via bpf_warn_invalid_xdp_action().
  */
 enum xdp_action {
 	XDP_ABORTED = 0,
diff --git a/net/core/filter.c b/net/core/filter.c
index 0848df2cd9bf..3a50a9b021e2 100644
--- a/net/core/filter.c
+++ b/net/core/filter.c
@@ -3609,7 +3609,11 @@ static bool xdp_is_valid_access(int off, int size,
 
 void bpf_warn_invalid_xdp_action(u32 act)
 {
-	WARN_ONCE(1, "Illegal XDP return value %u, expect packet loss\n", act);
+	const u32 act_max = XDP_REDIRECT;
+
+	WARN_ONCE(1, "%s XDP return value %u, expect packet loss!\n",
+		  act > act_max ? "Illegal" : "Driver unsupported",
+		  act);
 }
 EXPORT_SYMBOL_GPL(bpf_warn_invalid_xdp_action);
 
-- 
cgit v1.2.3-71-gd317


From 19cab8872692960535aa6d12e3a295ac51d1a648 Mon Sep 17 00:00:00 2001
From: Florian Fainelli <f.fainelli@gmail.com>
Date: Wed, 20 Sep 2017 15:52:13 -0700
Subject: net: ethtool: Add back transceiver type

Commit 3f1ac7a700d0 ("net: ethtool: add new ETHTOOL_xLINKSETTINGS API")
deprecated the ethtool_cmd::transceiver field, which was fine in
premise, except that the PHY library was actually using it to report the
type of transceiver: internal or external.

Use the first word of the reserved field to put this __u8 transceiver
field back in. It is made read-only, and we don't expect the
ETHTOOL_xLINKSETTINGS API to be doing anything with this anyway, so this
is mostly for the legacy path where we do:

ethtool_get_settings()
-> dev->ethtool_ops->get_link_ksettings()
   -> convert_link_ksettings_to_legacy_settings()

to have no information loss compared to the legacy get_settings API.

Fixes: 3f1ac7a700d0 ("net: ethtool: add new ETHTOOL_xLINKSETTINGS API")
Signed-off-by: Florian Fainelli <f.fainelli@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/uapi/linux/ethtool.h | 6 +++++-
 net/core/ethtool.c           | 2 ++
 2 files changed, 7 insertions(+), 1 deletion(-)

(limited to 'include/uapi/linux')

diff --git a/include/uapi/linux/ethtool.h b/include/uapi/linux/ethtool.h
index 9c041dae8e2c..5bd1b1de4ea0 100644
--- a/include/uapi/linux/ethtool.h
+++ b/include/uapi/linux/ethtool.h
@@ -1753,6 +1753,8 @@ enum ethtool_reset_flags {
  *	%ethtool_link_mode_bit_indices for the link modes, and other
  *	link features that the link partner advertised through
  *	autonegotiation; 0 if unknown or not applicable.  Read-only.
+ * @transceiver: Used to distinguish different possible PHY types,
+ *	reported consistently by PHYLIB.  Read-only.
  *
  * If autonegotiation is disabled, the speed and @duplex represent the
  * fixed link mode and are writable if the driver supports multiple
@@ -1804,7 +1806,9 @@ struct ethtool_link_settings {
 	__u8	eth_tp_mdix;
 	__u8	eth_tp_mdix_ctrl;
 	__s8	link_mode_masks_nwords;
-	__u32	reserved[8];
+	__u8	transceiver;
+	__u8	reserved1[3];
+	__u32	reserved[7];
 	__u32	link_mode_masks[0];
 	/* layout of link_mode_masks fields:
 	 * __u32 map_supported[link_mode_masks_nwords];
diff --git a/net/core/ethtool.c b/net/core/ethtool.c
index 6a582ae4c5d9..3228411ada0f 100644
--- a/net/core/ethtool.c
+++ b/net/core/ethtool.c
@@ -525,6 +525,8 @@ convert_link_ksettings_to_legacy_settings(
 		= link_ksettings->base.eth_tp_mdix;
 	legacy_settings->eth_tp_mdix_ctrl
 		= link_ksettings->base.eth_tp_mdix_ctrl;
+	legacy_settings->transceiver
+		= link_ksettings->base.transceiver;
 	return retval;
 }
 
-- 
cgit v1.2.3-71-gd317