From 3713131345fbea291cbd859d248e06ed77815962 Mon Sep 17 00:00:00 2001 From: Radim Krčmář Date: Tue, 12 Jul 2016 22:09:27 +0200 Subject: KVM: x86: add KVM_CAP_X2APIC_API MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit KVM_CAP_X2APIC_API is a capability for features related to x2APIC enablement. KVM_X2APIC_API_32BIT_FORMAT feature can be enabled to extend APIC ID in get/set ioctl and MSI addresses to 32 bits. Both are needed to support x2APIC. The feature has to be enableable and disabled by default, because get/set ioctl shifted and truncated APIC ID to 8 bits by using a non-standard protocol inspired by xAPIC and the change is not backward-compatible. Changes to MSI addresses follow the format used by interrupt remapping unit. The upper address word, that used to be 0, contains upper 24 bits of the LAPIC address in its upper 24 bits. Lower 8 bits are reserved as 0. Using the upper address word is not backward-compatible either as we didn't check that userspace zeroed the word. Reserved bits are still not explicitly checked, but non-zero data will affect LAPIC addresses, which will cause a bug. Signed-off-by: Radim Krčmář Signed-off-by: Paolo Bonzini --- include/uapi/linux/kvm.h | 3 +++ 1 file changed, 3 insertions(+) (limited to 'include/uapi/linux') diff --git a/include/uapi/linux/kvm.h b/include/uapi/linux/kvm.h index 05ebf475104c..f704403e19a0 100644 --- a/include/uapi/linux/kvm.h +++ b/include/uapi/linux/kvm.h @@ -866,6 +866,7 @@ struct kvm_ppc_smmu_info { #define KVM_CAP_ARM_PMU_V3 126 #define KVM_CAP_VCPU_ATTRIBUTES 127 #define KVM_CAP_MAX_VCPU_ID 128 +#define KVM_CAP_X2APIC_API 129 #ifdef KVM_CAP_IRQ_ROUTING @@ -1313,4 +1314,6 @@ struct kvm_assigned_msix_entry { __u16 padding[3]; }; +#define KVM_X2APIC_API_USE_32BIT_IDS (1ULL << 0) + #endif /* __LINUX_KVM_H */ -- cgit v1.2.3-71-gd317 From c519265f2aa348b2f1b9ecf8fbe20bb7c0fb102e Mon Sep 17 00:00:00 2001 From: Radim Krčmář Date: Tue, 12 Jul 2016 22:09:28 +0200 Subject: KVM: x86: add a flag to disable KVM x2apic broadcast quirk MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Add KVM_X2APIC_API_DISABLE_BROADCAST_QUIRK as a feature flag to KVM_CAP_X2APIC_API. The quirk made KVM interpret 0xff as a broadcast even in x2APIC mode. The enableable capability is needed in order to support standard x2APIC and remain backward compatible. Signed-off-by: Radim Krčmář [Expand kvm_apic_mda comment. - Paolo] Signed-off-by: Paolo Bonzini --- Documentation/virtual/kvm/api.txt | 6 +++++ arch/x86/include/asm/kvm_host.h | 1 + arch/x86/kvm/lapic.c | 53 +++++++++++++++++++++++++++++---------- arch/x86/kvm/x86.c | 5 +++- include/uapi/linux/kvm.h | 1 + 5 files changed, 52 insertions(+), 14 deletions(-) (limited to 'include/uapi/linux') diff --git a/Documentation/virtual/kvm/api.txt b/Documentation/virtual/kvm/api.txt index e34e51fa28b0..c4d2fb0e28de 100644 --- a/Documentation/virtual/kvm/api.txt +++ b/Documentation/virtual/kvm/api.txt @@ -3844,12 +3844,18 @@ Returns: 0 on success, -EINVAL when args[0] contains invalid features Valid feature flags in args[0] are #define KVM_X2APIC_API_USE_32BIT_IDS (1ULL << 0) +#define KVM_X2APIC_API_DISABLE_BROADCAST_QUIRK (1ULL << 1) Enabling KVM_X2APIC_API_USE_32BIT_IDS changes the behavior of KVM_SET_GSI_ROUTING, KVM_SIGNAL_MSI, KVM_SET_LAPIC, and KVM_GET_LAPIC, allowing the use of 32-bit APIC IDs. See KVM_CAP_X2APIC_API in their respective sections. 
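To make the new address format concrete, here is a hedged sketch (not part of the patch) of how a VMM could signal an MSI to an APIC ID above 255 once KVM_X2APIC_API_USE_32BIT_IDS has been enabled; vm_fd and the destination value are assumptions of the illustration:

    /* needs <linux/kvm.h>, <sys/ioctl.h>, <err.h>, <stdint.h> */
    uint32_t dest = 300;                      /* only reachable with 32-bit IDs */
    struct kvm_msi msi = {
            .address_lo = 0xfee00000u | ((dest & 0xff) << 12),
            .address_hi = dest & 0xffffff00u, /* upper 24 bits of the ID, low 8 bits reserved as 0 */
            .data       = 0x40,               /* vector 0x40, fixed delivery mode */
    };

    if (ioctl(vm_fd, KVM_SIGNAL_MSI, &msi) < 0)
            err(1, "KVM_SIGNAL_MSI");

The address_hi encoding mirrors the interrupt-remapping format described in the changelog: the upper address word carries the upper 24 bits of the destination and keeps its low 8 bits zero.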
+KVM_X2APIC_API_DISABLE_BROADCAST_QUIRK must be enabled for x2APIC to work +in logical mode or with more than 255 VCPUs. Otherwise, KVM treats 0xff +as a broadcast even in x2APIC mode in order to support physical x2APIC +without interrupt remapping. This is undesirable in logical mode, +where 0xff represents CPUs 0-7 in cluster 0. 8. Other capabilities. diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index 7c00ba3242d7..074b5c760327 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h @@ -784,6 +784,7 @@ struct kvm_arch { struct page *avic_physical_id_table_page; bool x2apic_format; + bool x2apic_broadcast_quirk_disabled; }; struct kvm_vm_stat { diff --git a/arch/x86/kvm/lapic.c b/arch/x86/kvm/lapic.c index d27a7829a4ce..a16e0bb95d28 100644 --- a/arch/x86/kvm/lapic.c +++ b/arch/x86/kvm/lapic.c @@ -616,17 +616,30 @@ static bool kvm_apic_match_logical_addr(struct kvm_lapic *apic, u32 mda) } } -/* KVM APIC implementation has two quirks - * - dest always begins at 0 while xAPIC MDA has offset 24, - * - IOxAPIC messages have to be delivered (directly) to x2APIC. +/* The KVM local APIC implementation has two quirks: + * + * - the xAPIC MDA stores the destination at bits 24-31, while this + * is not true of struct kvm_lapic_irq's dest_id field. This is + * just a quirk in the API and is not problematic. + * + * - in-kernel IOAPIC messages have to be delivered directly to + * x2APIC, because the kernel does not support interrupt remapping. + * In order to support broadcast without interrupt remapping, x2APIC + * rewrites the destination of non-IPI messages from APIC_BROADCAST + * to X2APIC_BROADCAST. + * + * The broadcast quirk can be disabled with KVM_CAP_X2APIC_API. This is + * important when userspace wants to use x2APIC-format MSIs, because + * APIC_BROADCAST (0xff) is a legal route for "cluster 0, CPUs 0-7". */ -static u32 kvm_apic_mda(unsigned int dest_id, struct kvm_lapic *source, - struct kvm_lapic *target) +static u32 kvm_apic_mda(struct kvm_vcpu *vcpu, unsigned int dest_id, + struct kvm_lapic *source, struct kvm_lapic *target) { bool ipi = source != NULL; bool x2apic_mda = apic_x2apic_mode(ipi ? source : target); - if (!ipi && dest_id == APIC_BROADCAST && x2apic_mda) + if (!vcpu->kvm->arch.x2apic_broadcast_quirk_disabled && + !ipi && dest_id == APIC_BROADCAST && x2apic_mda) return X2APIC_BROADCAST; return x2apic_mda ? dest_id : SET_APIC_DEST_FIELD(dest_id); @@ -636,7 +649,7 @@ bool kvm_apic_match_dest(struct kvm_vcpu *vcpu, struct kvm_lapic *source, int short_hand, unsigned int dest, int dest_mode) { struct kvm_lapic *target = vcpu->arch.apic; - u32 mda = kvm_apic_mda(dest, source, target); + u32 mda = kvm_apic_mda(vcpu, dest, source, target); apic_debug("target %p, source %p, dest 0x%x, " "dest_mode 0x%x, short_hand 0x%x\n", @@ -688,6 +701,25 @@ static void kvm_apic_disabled_lapic_found(struct kvm *kvm) } } +static bool kvm_apic_is_broadcast_dest(struct kvm *kvm, struct kvm_lapic **src, + struct kvm_lapic_irq *irq, struct kvm_apic_map *map) +{ + if (kvm->arch.x2apic_broadcast_quirk_disabled) { + if ((irq->dest_id == APIC_BROADCAST && + map->mode != KVM_APIC_MODE_X2APIC)) + return true; + if (irq->dest_id == X2APIC_BROADCAST) + return true; + } else { + bool x2apic_ipi = src && *src && apic_x2apic_mode(*src); + if (irq->dest_id == (x2apic_ipi ? 
+ X2APIC_BROADCAST : APIC_BROADCAST)) + return true; + } + + return false; +} + /* Return true if the interrupt can be handled by using *bitmap as index mask * for valid destinations in *dst array. * Return false if kvm_apic_map_get_dest_lapic did nothing useful. @@ -701,7 +733,6 @@ static inline bool kvm_apic_map_get_dest_lapic(struct kvm *kvm, unsigned long *bitmap) { int i, lowest; - bool x2apic_ipi; if (irq->shorthand == APIC_DEST_SELF && src) { *dst = src; @@ -710,11 +741,7 @@ static inline bool kvm_apic_map_get_dest_lapic(struct kvm *kvm, } else if (irq->shorthand) return false; - x2apic_ipi = src && *src && apic_x2apic_mode(*src); - if (irq->dest_id == (x2apic_ipi ? X2APIC_BROADCAST : APIC_BROADCAST)) - return false; - - if (!map) + if (!map || kvm_apic_is_broadcast_dest(kvm, src, irq, map)) return false; if (irq->dest_mode == APIC_DEST_PHYSICAL) { diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index d86f563a6896..f0d23622bc4e 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -90,7 +90,8 @@ static u64 __read_mostly efer_reserved_bits = ~((u64)EFER_SCE); #define VM_STAT(x) offsetof(struct kvm, stat.x), KVM_STAT_VM #define VCPU_STAT(x) offsetof(struct kvm_vcpu, stat.x), KVM_STAT_VCPU -#define KVM_X2APIC_API_VALID_FLAGS (KVM_X2APIC_API_USE_32BIT_IDS) +#define KVM_X2APIC_API_VALID_FLAGS (KVM_X2APIC_API_USE_32BIT_IDS | \ + KVM_X2APIC_API_DISABLE_BROADCAST_QUIRK) static void update_cr8_intercept(struct kvm_vcpu *vcpu); static void process_nmi(struct kvm_vcpu *vcpu); @@ -3811,6 +3812,8 @@ split_irqchip_unlock: if (cap->args[0] & KVM_X2APIC_API_USE_32BIT_IDS) kvm->arch.x2apic_format = true; + if (cap->args[0] & KVM_X2APIC_API_DISABLE_BROADCAST_QUIRK) + kvm->arch.x2apic_broadcast_quirk_disabled = true; r = 0; break; diff --git a/include/uapi/linux/kvm.h b/include/uapi/linux/kvm.h index f704403e19a0..4f8030e5b05d 100644 --- a/include/uapi/linux/kvm.h +++ b/include/uapi/linux/kvm.h @@ -1315,5 +1315,6 @@ struct kvm_assigned_msix_entry { }; #define KVM_X2APIC_API_USE_32BIT_IDS (1ULL << 0) +#define KVM_X2APIC_API_DISABLE_BROADCAST_QUIRK (1ULL << 1) #endif /* __LINUX_KVM_H */ -- cgit v1.2.3-71-gd317 From 6502a34cfd6695929086187f63fe670cc3050e68 Mon Sep 17 00:00:00 2001 From: David Hildenbrand Date: Tue, 21 Jun 2016 14:19:51 +0200 Subject: KVM: s390: allow user space to handle instr 0x0000 We will use illegal instruction 0x0000 for handling 2 byte sw breakpoints from user space. As it can be enabled dynamically via a capability, let's move setting of ICTL_OPEREXC to the post creation step, so we avoid any races when enabling that capability just while adding new cpus. Acked-by: Janosch Frank Reviewed-by: Cornelia Huck Signed-off-by: David Hildenbrand Signed-off-by: Christian Borntraeger --- Documentation/virtual/kvm/api.txt | 13 +++++++++++++ arch/s390/include/asm/kvm_host.h | 2 ++ arch/s390/kvm/intercept.c | 3 +++ arch/s390/kvm/kvm-s390.c | 26 ++++++++++++++++++++++++-- include/uapi/linux/kvm.h | 1 + 5 files changed, 43 insertions(+), 2 deletions(-) (limited to 'include/uapi/linux') diff --git a/Documentation/virtual/kvm/api.txt b/Documentation/virtual/kvm/api.txt index c4d2fb0e28de..299306db5d84 100644 --- a/Documentation/virtual/kvm/api.txt +++ b/Documentation/virtual/kvm/api.txt @@ -3857,6 +3857,19 @@ as a broadcast even in x2APIC mode in order to support physical x2APIC without interrupt remapping. This is undesirable in logical mode, where 0xff represents CPUs 0-7 in cluster 0. 
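A VMM that wants architecturally correct x2APIC behaviour (logical mode, or more than 255 VCPUs) would therefore enable both feature flags in one call; a minimal sketch, assuming vm_fd is an open VM file descriptor:

    struct kvm_enable_cap cap = {
            .cap = KVM_CAP_X2APIC_API,
            .args[0] = KVM_X2APIC_API_USE_32BIT_IDS |
                       KVM_X2APIC_API_DISABLE_BROADCAST_QUIRK,
    };

    if (ioctl(vm_fd, KVM_ENABLE_CAP, &cap))
            err(1, "KVM_ENABLE_CAP(KVM_CAP_X2APIC_API)");

Unknown bits in args[0] are rejected with -EINVAL, so this call also works as a probe on kernels that only know a subset of the flags.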
+7.8 KVM_CAP_S390_USER_INSTR0 + +Architectures: s390 +Parameters: none + +With this capability enabled, all illegal instructions 0x0000 (2 bytes) will +be intercepted and forwarded to user space. User space can use this +mechanism e.g. to realize 2-byte software breakpoints. The kernel will +not inject an operating exception for these instructions, user space has +to take care of that. + +This capability can be enabled dynamically even if VCPUs were already +created and are running. 8. Other capabilities. ---------------------- diff --git a/arch/s390/include/asm/kvm_host.h b/arch/s390/include/asm/kvm_host.h index 946fc86202fd..183b01727de4 100644 --- a/arch/s390/include/asm/kvm_host.h +++ b/arch/s390/include/asm/kvm_host.h @@ -43,6 +43,7 @@ /* s390-specific vcpu->requests bit members */ #define KVM_REQ_ENABLE_IBS 8 #define KVM_REQ_DISABLE_IBS 9 +#define KVM_REQ_ICPT_OPEREXC 10 #define SIGP_CTRL_C 0x80 #define SIGP_CTRL_SCN_MASK 0x3f @@ -666,6 +667,7 @@ struct kvm_arch{ int user_cpu_state_ctrl; int user_sigp; int user_stsi; + int user_instr0; struct s390_io_adapter *adapters[MAX_S390_IO_ADAPTERS]; wait_queue_head_t ipte_wq; int ipte_lock_count; diff --git a/arch/s390/kvm/intercept.c b/arch/s390/kvm/intercept.c index 850be47c4cc9..7a2f1551bc39 100644 --- a/arch/s390/kvm/intercept.c +++ b/arch/s390/kvm/intercept.c @@ -359,6 +359,9 @@ static int handle_operexc(struct kvm_vcpu *vcpu) test_kvm_facility(vcpu->kvm, 74)) return handle_sthyi(vcpu); + if (vcpu->arch.sie_block->ipa == 0 && vcpu->kvm->arch.user_instr0) + return -EOPNOTSUPP; + return kvm_s390_inject_program_int(vcpu, PGM_OPERATION); } diff --git a/arch/s390/kvm/kvm-s390.c b/arch/s390/kvm/kvm-s390.c index d42428c11794..63ac7c1641a7 100644 --- a/arch/s390/kvm/kvm-s390.c +++ b/arch/s390/kvm/kvm-s390.c @@ -364,6 +364,7 @@ int kvm_vm_ioctl_check_extension(struct kvm *kvm, long ext) case KVM_CAP_S390_USER_STSI: case KVM_CAP_S390_SKEYS: case KVM_CAP_S390_IRQ_STATE: + case KVM_CAP_S390_USER_INSTR0: r = 1; break; case KVM_CAP_S390_MEM_OP: @@ -456,6 +457,16 @@ out: return r; } +static void icpt_operexc_on_all_vcpus(struct kvm *kvm) +{ + unsigned int i; + struct kvm_vcpu *vcpu; + + kvm_for_each_vcpu(i, vcpu, kvm) { + kvm_s390_sync_request(KVM_REQ_ICPT_OPEREXC, vcpu); + } +} + static int kvm_vm_ioctl_enable_cap(struct kvm *kvm, struct kvm_enable_cap *cap) { int r; @@ -507,6 +518,12 @@ static int kvm_vm_ioctl_enable_cap(struct kvm *kvm, struct kvm_enable_cap *cap) kvm->arch.user_stsi = 1; r = 0; break; + case KVM_CAP_S390_USER_INSTR0: + VM_EVENT(kvm, 3, "%s", "ENABLE: CAP_S390_USER_INSTR0"); + kvm->arch.user_instr0 = 1; + icpt_operexc_on_all_vcpus(kvm); + r = 0; + break; default: r = -EINVAL; break; @@ -1836,6 +1853,8 @@ void kvm_arch_vcpu_postcreate(struct kvm_vcpu *vcpu) vcpu->arch.gmap = vcpu->kvm->arch.gmap; sca_add_vcpu(vcpu); } + if (test_kvm_facility(vcpu->kvm, 74) || vcpu->kvm->arch.user_instr0) + vcpu->arch.sie_block->ictl |= ICTL_OPEREXC; /* make vcpu_load load the right gmap on the first trigger */ vcpu->arch.enabled_gmap = vcpu->arch.gmap; } @@ -1923,8 +1942,6 @@ int kvm_arch_vcpu_setup(struct kvm_vcpu *vcpu) } vcpu->arch.sie_block->riccbd = (unsigned long) &vcpu->run->s.regs.riccb; vcpu->arch.sie_block->ictl |= ICTL_ISKE | ICTL_SSKE | ICTL_RRBE; - if (test_kvm_facility(vcpu->kvm, 74)) - vcpu->arch.sie_block->ictl |= ICTL_OPEREXC; if (vcpu->kvm->arch.use_cmma) { rc = kvm_s390_vcpu_setup_cmma(vcpu); @@ -2369,6 +2386,11 @@ retry: goto retry; } + if (kvm_check_request(KVM_REQ_ICPT_OPEREXC, vcpu)) { + vcpu->arch.sie_block->ictl |= 
ICTL_OPEREXC; + goto retry; + } + /* nothing to do, just clear the request */ clear_bit(KVM_REQ_UNHALT, &vcpu->requests); diff --git a/include/uapi/linux/kvm.h b/include/uapi/linux/kvm.h index 4f8030e5b05d..70941f4ab6d8 100644 --- a/include/uapi/linux/kvm.h +++ b/include/uapi/linux/kvm.h @@ -867,6 +867,7 @@ struct kvm_ppc_smmu_info { #define KVM_CAP_VCPU_ATTRIBUTES 127 #define KVM_CAP_MAX_VCPU_ID 128 #define KVM_CAP_X2APIC_API 129 +#define KVM_CAP_S390_USER_INSTR0 130 #ifdef KVM_CAP_IRQ_ROUTING -- cgit v1.2.3-71-gd317 From 2b8ddd9337ee0d001b22507f95596648a1a90992 Mon Sep 17 00:00:00 2001 From: Andre Przywara Date: Fri, 15 Jul 2016 12:43:24 +0100 Subject: KVM: Extend struct kvm_msi to hold a 32-bit device ID The ARM GICv3 ITS MSI controller requires a device ID to be able to assign the proper interrupt vector. On real hardware, this ID is sampled from the bus. To be able to emulate an ITS controller, extend the KVM MSI interface to let userspace provide such a device ID. For PCI devices, the device ID is simply the 16-bit bus-device-function triplet, which should be easily available to the userland tool. Also there is a new KVM capability which advertises whether the current VM requires a device ID to be set along with the MSI data. This flag is still reported as not available everywhere, later we will enable it when ITS emulation is used. Signed-off-by: Andre Przywara Reviewed-by: Eric Auger Reviewed-by: Marc Zyngier Acked-by: Christoffer Dall Acked-by: Paolo Bonzini Tested-by: Eric Auger Signed-off-by: Marc Zyngier --- Documentation/virtual/kvm/api.txt | 12 ++++++++++-- include/uapi/linux/kvm.h | 5 ++++- 2 files changed, 14 insertions(+), 3 deletions(-) (limited to 'include/uapi/linux') diff --git a/Documentation/virtual/kvm/api.txt b/Documentation/virtual/kvm/api.txt index 09efa9eb3926..65513119fee8 100644 --- a/Documentation/virtual/kvm/api.txt +++ b/Documentation/virtual/kvm/api.txt @@ -2175,10 +2175,18 @@ struct kvm_msi { __u32 address_hi; __u32 data; __u32 flags; - __u8 pad[16]; + __u32 devid; + __u8 pad[12]; }; -No flags are defined so far. The corresponding field must be 0. +flags: KVM_MSI_VALID_DEVID: devid contains a valid value +devid: If KVM_MSI_VALID_DEVID is set, contains a unique device identifier + for the device that wrote the MSI message. + For PCI, this is usually a BFD identifier in the lower 16 bits. + +The per-VM KVM_CAP_MSI_DEVID capability advertises the need to provide +the device ID. If this capability is not set, userland cannot rely on +the kernel to allow the KVM_MSI_VALID_DEVID flag being set. 4.71 KVM_CREATE_PIT2 diff --git a/include/uapi/linux/kvm.h b/include/uapi/linux/kvm.h index 05ebf475104c..7de96f5bb92c 100644 --- a/include/uapi/linux/kvm.h +++ b/include/uapi/linux/kvm.h @@ -866,6 +866,7 @@ struct kvm_ppc_smmu_info { #define KVM_CAP_ARM_PMU_V3 126 #define KVM_CAP_VCPU_ATTRIBUTES 127 #define KVM_CAP_MAX_VCPU_ID 128 +#define KVM_CAP_MSI_DEVID 129 #ifdef KVM_CAP_IRQ_ROUTING @@ -1024,12 +1025,14 @@ struct kvm_one_reg { __u64 addr; }; +#define KVM_MSI_VALID_DEVID (1U << 0) struct kvm_msi { __u32 address_lo; __u32 address_hi; __u32 data; __u32 flags; - __u8 pad[16]; + __u32 devid; + __u8 pad[12]; }; struct kvm_arm_device_addr { -- cgit v1.2.3-71-gd317 From 1085fdc68c6097244627a02a56bd2d8fe58a1a9c Mon Sep 17 00:00:00 2001 From: Andre Przywara Date: Fri, 15 Jul 2016 12:43:31 +0100 Subject: KVM: arm64: vgic-its: Introduce new KVM ITS device Introduce a new KVM device that represents an ARM Interrupt Translation Service (ITS) controller. 
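The device IDs added to struct kvm_msi above are exactly what such an ITS keys its translations on. As a hedged illustration (not taken from the patch), userspace injection of an MSI on behalf of an emulated PCI device could look like the sketch below; vm_fd, the doorbell address and the BDF are all assumed inputs:

    /* needs <linux/kvm.h>, <sys/ioctl.h>, <stdint.h> */
    static int inject_device_msi(int vm_fd, uint64_t doorbell,
                                 uint32_t event, uint32_t bdf)
    {
            struct kvm_msi msi = {
                    .address_lo = (uint32_t)doorbell,        /* e.g. GITS_TRANSLATER */
                    .address_hi = (uint32_t)(doorbell >> 32),
                    .data       = event,
                    .flags      = KVM_MSI_VALID_DEVID,
                    .devid      = bdf,                       /* bus << 8 | devfn */
            };

            return ioctl(vm_fd, KVM_SIGNAL_MSI, &msi);
    }

Userspace should only set KVM_MSI_VALID_DEVID once KVM_CAP_MSI_DEVID has been advertised for the VM.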
Since there can be multiple of this per guest, we can't piggy back on the existing GICv3 distributor device, but create a new type of KVM device. On the KVM_CREATE_DEVICE ioctl we allocate and initialize the ITS data structure and store the pointer in the kvm_device data. Upon an explicit init ioctl from userland (after having setup the MMIO address) we register the handlers with the kvm_io_bus framework. Any reference to an ITS thus has to go via this interface. Signed-off-by: Andre Przywara Reviewed-by: Marc Zyngier Tested-by: Eric Auger Signed-off-by: Marc Zyngier --- Documentation/virtual/kvm/devices/arm-vgic.txt | 25 +++-- arch/arm/kvm/arm.c | 1 + arch/arm64/include/uapi/asm/kvm.h | 2 + include/kvm/arm_vgic.h | 3 + include/uapi/linux/kvm.h | 2 + virt/kvm/arm/vgic/vgic-its.c | 135 +++++++++++++++++++++++++ virt/kvm/arm/vgic/vgic-kvm-device.c | 4 +- virt/kvm/arm/vgic/vgic-mmio-v3.c | 2 +- virt/kvm/arm/vgic/vgic.h | 3 + 9 files changed, 168 insertions(+), 9 deletions(-) (limited to 'include/uapi/linux') diff --git a/Documentation/virtual/kvm/devices/arm-vgic.txt b/Documentation/virtual/kvm/devices/arm-vgic.txt index 59541d49e15c..89182f80cc7f 100644 --- a/Documentation/virtual/kvm/devices/arm-vgic.txt +++ b/Documentation/virtual/kvm/devices/arm-vgic.txt @@ -4,16 +4,22 @@ ARM Virtual Generic Interrupt Controller (VGIC) Device types supported: KVM_DEV_TYPE_ARM_VGIC_V2 ARM Generic Interrupt Controller v2.0 KVM_DEV_TYPE_ARM_VGIC_V3 ARM Generic Interrupt Controller v3.0 + KVM_DEV_TYPE_ARM_VGIC_ITS ARM Interrupt Translation Service Controller -Only one VGIC instance may be instantiated through either this API or the -legacy KVM_CREATE_IRQCHIP api. The created VGIC will act as the VM interrupt -controller, requiring emulated user-space devices to inject interrupts to the -VGIC instead of directly to CPUs. +Only one VGIC instance of the V2/V3 types above may be instantiated through +either this API or the legacy KVM_CREATE_IRQCHIP api. The created VGIC will +act as the VM interrupt controller, requiring emulated user-space devices to +inject interrupts to the VGIC instead of directly to CPUs. Creating a guest GICv3 device requires a host GICv3 as well. GICv3 implementations with hardware compatibility support allow a guest GICv2 as well. +Creating a virtual ITS controller requires a host GICv3 (but does not depend +on having physical ITS controllers). +There can be multiple ITS controllers per guest, each of them has to have +a separate, non-overlapping MMIO region. + Groups: KVM_DEV_ARM_VGIC_GRP_ADDR Attributes: @@ -39,6 +45,13 @@ Groups: Only valid for KVM_DEV_TYPE_ARM_VGIC_V3. This address needs to be 64K aligned. + KVM_VGIC_V3_ADDR_TYPE_ITS (rw, 64-bit) + Base address in the guest physical address space of the GICv3 ITS + control register frame. The ITS allows MSI(-X) interrupts to be + injected into guests. This extension is optional. If the kernel + does not support the ITS, the call returns -ENODEV. + Only valid for KVM_DEV_TYPE_ARM_VGIC_ITS. + This address needs to be 64K aligned and the region covers 128K. KVM_DEV_ARM_VGIC_GRP_DIST_REGS Attributes: @@ -109,8 +122,8 @@ Groups: KVM_DEV_ARM_VGIC_GRP_CTRL Attributes: KVM_DEV_ARM_VGIC_CTRL_INIT - request the initialization of the VGIC, no additional parameter in - kvm_device_attr.addr. + request the initialization of the VGIC or ITS, no additional parameter + in kvm_device_attr.addr. 
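Putting the attributes above together, the expected userspace bring-up sequence for an ITS is: create the device, program its base address, then request KVM_DEV_ARM_VGIC_CTRL_INIT. A hedged sketch follows; vm_fd and the 64K-aligned guest address are assumptions of the example, not values mandated by the patch:

    struct kvm_create_device cd = { .type = KVM_DEV_TYPE_ARM_VGIC_ITS };
    if (ioctl(vm_fd, KVM_CREATE_DEVICE, &cd) < 0)
            err(1, "KVM_CREATE_DEVICE");

    __u64 its_base = 0x08080000;              /* example guest PA, 64K aligned */
    struct kvm_device_attr attr = {
            .group = KVM_DEV_ARM_VGIC_GRP_ADDR,
            .attr  = KVM_VGIC_ITS_ADDR_TYPE,
            .addr  = (__u64)(unsigned long)&its_base,
    };
    if (ioctl(cd.fd, KVM_SET_DEVICE_ATTR, &attr) < 0)
            err(1, "KVM_VGIC_ITS_ADDR_TYPE");

    attr.group = KVM_DEV_ARM_VGIC_GRP_CTRL;
    attr.attr  = KVM_DEV_ARM_VGIC_CTRL_INIT;
    attr.addr  = 0;
    if (ioctl(cd.fd, KVM_SET_DEVICE_ATTR, &attr) < 0)
            err(1, "KVM_DEV_ARM_VGIC_CTRL_INIT");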
Errors: -ENXIO: VGIC not properly configured as required prior to calling this attribute diff --git a/arch/arm/kvm/arm.c b/arch/arm/kvm/arm.c index 972075cc111c..fb4661cf896e 100644 --- a/arch/arm/kvm/arm.c +++ b/arch/arm/kvm/arm.c @@ -20,6 +20,7 @@ #include #include #include +#include #include #include #include diff --git a/arch/arm64/include/uapi/asm/kvm.h b/arch/arm64/include/uapi/asm/kvm.h index f209ea151dca..3051f86a9b5f 100644 --- a/arch/arm64/include/uapi/asm/kvm.h +++ b/arch/arm64/include/uapi/asm/kvm.h @@ -87,9 +87,11 @@ struct kvm_regs { /* Supported VGICv3 address types */ #define KVM_VGIC_V3_ADDR_TYPE_DIST 2 #define KVM_VGIC_V3_ADDR_TYPE_REDIST 3 +#define KVM_VGIC_ITS_ADDR_TYPE 4 #define KVM_VGIC_V3_DIST_SIZE SZ_64K #define KVM_VGIC_V3_REDIST_SIZE (2 * SZ_64K) +#define KVM_VGIC_V3_ITS_SIZE (2 * SZ_64K) #define KVM_ARM_VCPU_POWER_OFF 0 /* CPU is started in OFF state */ #define KVM_ARM_VCPU_EL1_32BIT 1 /* CPU running a 32bit VM */ diff --git a/include/kvm/arm_vgic.h b/include/kvm/arm_vgic.h index 685f33975ce4..8609faced83e 100644 --- a/include/kvm/arm_vgic.h +++ b/include/kvm/arm_vgic.h @@ -134,6 +134,7 @@ struct vgic_its { gpa_t vgic_its_base; bool enabled; + bool initialized; struct vgic_io_device iodev; }; @@ -167,6 +168,8 @@ struct vgic_dist { struct vgic_io_device dist_iodev; + bool has_its; + /* * Contains the attributes and gpa of the LPI configuration table. * Since we report GICR_TYPER.CommonLPIAff as 0b00, we can share diff --git a/include/uapi/linux/kvm.h b/include/uapi/linux/kvm.h index 7de96f5bb92c..d8c4c324cfae 100644 --- a/include/uapi/linux/kvm.h +++ b/include/uapi/linux/kvm.h @@ -1077,6 +1077,8 @@ enum kvm_device_type { #define KVM_DEV_TYPE_FLIC KVM_DEV_TYPE_FLIC KVM_DEV_TYPE_ARM_VGIC_V3, #define KVM_DEV_TYPE_ARM_VGIC_V3 KVM_DEV_TYPE_ARM_VGIC_V3 + KVM_DEV_TYPE_ARM_VGIC_ITS, +#define KVM_DEV_TYPE_ARM_VGIC_ITS KVM_DEV_TYPE_ARM_VGIC_ITS KVM_DEV_TYPE_MAX, }; diff --git a/virt/kvm/arm/vgic/vgic-its.c b/virt/kvm/arm/vgic/vgic-its.c index 4654d6edf6a6..6b47b3674690 100644 --- a/virt/kvm/arm/vgic/vgic-its.c +++ b/virt/kvm/arm/vgic/vgic-its.c @@ -21,6 +21,7 @@ #include #include #include +#include #include @@ -84,6 +85,9 @@ static int vgic_its_init_its(struct kvm *kvm, struct vgic_its *its) struct vgic_io_device *iodev = &its->iodev; int ret; + if (its->initialized) + return 0; + if (IS_VGIC_ADDR_UNDEF(its->vgic_its_base)) return -ENXIO; @@ -99,5 +103,136 @@ static int vgic_its_init_its(struct kvm *kvm, struct vgic_its *its) KVM_VGIC_V3_ITS_SIZE, &iodev->dev); mutex_unlock(&kvm->slots_lock); + if (!ret) + its->initialized = true; + return ret; } + +static int vgic_its_create(struct kvm_device *dev, u32 type) +{ + struct vgic_its *its; + + if (type != KVM_DEV_TYPE_ARM_VGIC_ITS) + return -ENODEV; + + its = kzalloc(sizeof(struct vgic_its), GFP_KERNEL); + if (!its) + return -ENOMEM; + + its->vgic_its_base = VGIC_ADDR_UNDEF; + + dev->kvm->arch.vgic.has_its = true; + its->initialized = false; + its->enabled = false; + + dev->private = its; + + return 0; +} + +static void vgic_its_destroy(struct kvm_device *kvm_dev) +{ + struct vgic_its *its = kvm_dev->private; + + kfree(its); +} + +static int vgic_its_has_attr(struct kvm_device *dev, + struct kvm_device_attr *attr) +{ + switch (attr->group) { + case KVM_DEV_ARM_VGIC_GRP_ADDR: + switch (attr->attr) { + case KVM_VGIC_ITS_ADDR_TYPE: + return 0; + } + break; + case KVM_DEV_ARM_VGIC_GRP_CTRL: + switch (attr->attr) { + case KVM_DEV_ARM_VGIC_CTRL_INIT: + return 0; + } + break; + } + return -ENXIO; +} + +static int 
vgic_its_set_attr(struct kvm_device *dev, + struct kvm_device_attr *attr) +{ + struct vgic_its *its = dev->private; + int ret; + + switch (attr->group) { + case KVM_DEV_ARM_VGIC_GRP_ADDR: { + u64 __user *uaddr = (u64 __user *)(long)attr->addr; + unsigned long type = (unsigned long)attr->attr; + u64 addr; + + if (type != KVM_VGIC_ITS_ADDR_TYPE) + return -ENODEV; + + if (its->initialized) + return -EBUSY; + + if (copy_from_user(&addr, uaddr, sizeof(addr))) + return -EFAULT; + + ret = vgic_check_ioaddr(dev->kvm, &its->vgic_its_base, + addr, SZ_64K); + if (ret) + return ret; + + its->vgic_its_base = addr; + + return 0; + } + case KVM_DEV_ARM_VGIC_GRP_CTRL: + switch (attr->attr) { + case KVM_DEV_ARM_VGIC_CTRL_INIT: + return vgic_its_init_its(dev->kvm, its); + } + break; + } + return -ENXIO; +} + +static int vgic_its_get_attr(struct kvm_device *dev, + struct kvm_device_attr *attr) +{ + switch (attr->group) { + case KVM_DEV_ARM_VGIC_GRP_ADDR: { + struct vgic_its *its = dev->private; + u64 addr = its->vgic_its_base; + u64 __user *uaddr = (u64 __user *)(long)attr->addr; + unsigned long type = (unsigned long)attr->attr; + + if (type != KVM_VGIC_ITS_ADDR_TYPE) + return -ENODEV; + + if (copy_to_user(uaddr, &addr, sizeof(addr))) + return -EFAULT; + break; + default: + return -ENXIO; + } + } + + return 0; +} + +static struct kvm_device_ops kvm_arm_vgic_its_ops = { + .name = "kvm-arm-vgic-its", + .create = vgic_its_create, + .destroy = vgic_its_destroy, + .set_attr = vgic_its_set_attr, + .get_attr = vgic_its_get_attr, + .has_attr = vgic_its_has_attr, +}; + +int kvm_vgic_register_its_device(void) +{ + return kvm_register_device_ops(&kvm_arm_vgic_its_ops, + KVM_DEV_TYPE_ARM_VGIC_ITS); +} diff --git a/virt/kvm/arm/vgic/vgic-kvm-device.c b/virt/kvm/arm/vgic/vgic-kvm-device.c index 2f24f13c6c90..561d2ba96a4f 100644 --- a/virt/kvm/arm/vgic/vgic-kvm-device.c +++ b/virt/kvm/arm/vgic/vgic-kvm-device.c @@ -21,8 +21,8 @@ /* common helpers */ -static int vgic_check_ioaddr(struct kvm *kvm, phys_addr_t *ioaddr, - phys_addr_t addr, phys_addr_t alignment) +int vgic_check_ioaddr(struct kvm *kvm, phys_addr_t *ioaddr, + phys_addr_t addr, phys_addr_t alignment) { if (addr & ~KVM_PHYS_MASK) return -E2BIG; diff --git a/virt/kvm/arm/vgic/vgic-mmio-v3.c b/virt/kvm/arm/vgic/vgic-mmio-v3.c index b92b7d6cabe6..a5c35050c786 100644 --- a/virt/kvm/arm/vgic/vgic-mmio-v3.c +++ b/virt/kvm/arm/vgic/vgic-mmio-v3.c @@ -49,7 +49,7 @@ bool vgic_has_its(struct kvm *kvm) if (dist->vgic_model != KVM_DEV_TYPE_ARM_VGIC_V3) return false; - return false; + return dist->has_its; } static unsigned long vgic_mmio_read_v3_misc(struct kvm_vcpu *vcpu, diff --git a/virt/kvm/arm/vgic/vgic.h b/virt/kvm/arm/vgic/vgic.h index 31807c166d2a..8192a293f119 100644 --- a/virt/kvm/arm/vgic/vgic.h +++ b/virt/kvm/arm/vgic/vgic.h @@ -42,6 +42,9 @@ void vgic_put_irq(struct kvm *kvm, struct vgic_irq *irq); bool vgic_queue_irq_unlock(struct kvm *kvm, struct vgic_irq *irq); void vgic_kick_vcpus(struct kvm *kvm); +int vgic_check_ioaddr(struct kvm *kvm, phys_addr_t *ioaddr, + phys_addr_t addr, phys_addr_t alignment); + void vgic_v2_process_maintenance(struct kvm_vcpu *vcpu); void vgic_v2_fold_lr_state(struct kvm_vcpu *vcpu); void vgic_v2_populate_lr(struct kvm_vcpu *vcpu, struct vgic_irq *irq, int lr); -- cgit v1.2.3-71-gd317 From 76a10b86785c5e3fc49bcee355502d035b07e47a Mon Sep 17 00:00:00 2001 From: Eric Auger Date: Fri, 22 Jul 2016 16:20:37 +0000 Subject: KVM: api: Pass the devid in the msi routing entry MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 
Content-Transfer-Encoding: 8bit On ARM, the MSI msg (address and data) comes along with out-of-band device ID information. The device ID encodes the device that writes the MSI msg. Let's convey the device id in kvm_irq_routing_msi and use KVM_MSI_VALID_DEVID flag value in kvm_irq_routing_entry to indicate the msi devid is populated. Signed-off-by: Eric Auger Reviewed-by: Andre Przywara Acked-by: Radim Krčmář Signed-off-by: Marc Zyngier --- Documentation/virtual/kvm/api.txt | 19 +++++++++++++++++-- include/uapi/linux/kvm.h | 5 ++++- 2 files changed, 21 insertions(+), 3 deletions(-) (limited to 'include/uapi/linux') diff --git a/Documentation/virtual/kvm/api.txt b/Documentation/virtual/kvm/api.txt index 07049eadb124..415cde1647e9 100644 --- a/Documentation/virtual/kvm/api.txt +++ b/Documentation/virtual/kvm/api.txt @@ -1468,7 +1468,11 @@ struct kvm_irq_routing_entry { #define KVM_IRQ_ROUTING_S390_ADAPTER 3 #define KVM_IRQ_ROUTING_HV_SINT 4 -No flags are specified so far, the corresponding field must be set to zero. +flags: +- KVM_MSI_VALID_DEVID: used along with KVM_IRQ_ROUTING_MSI + routing entry type, specifies that the devid field contains + a valid value. +- zero otherwise struct kvm_irq_routing_irqchip { __u32 irqchip; @@ -1479,9 +1483,20 @@ struct kvm_irq_routing_msi { __u32 address_lo; __u32 address_hi; __u32 data; - __u32 pad; + union { + __u32 pad; + __u32 devid; + }; }; +devid: If KVM_MSI_VALID_DEVID is set, contains a unique device identifier + for the device that wrote the MSI message. + For PCI, this is usually a BFD identifier in the lower 16 bits. + +The per-VM KVM_CAP_MSI_DEVID capability advertises the requirement to +provide the device ID. If this capability is not set, userland cannot +rely on the kernel to allow the KVM_MSI_VALID_DEVID flag being set. + struct kvm_irq_routing_s390_adapter { __u64 ind_addr; __u64 summary_addr; diff --git a/include/uapi/linux/kvm.h b/include/uapi/linux/kvm.h index d8c4c324cfae..eb2220895c6e 100644 --- a/include/uapi/linux/kvm.h +++ b/include/uapi/linux/kvm.h @@ -879,7 +879,10 @@ struct kvm_irq_routing_msi { __u32 address_lo; __u32 address_hi; __u32 data; - __u32 pad; + union { + __u32 pad; + __u32 devid; + }; }; struct kvm_irq_routing_s390_adapter { -- cgit v1.2.3-71-gd317 From 7af7c616fa2f1ce6c0d806b89898d2df098b4bd8 Mon Sep 17 00:00:00 2001 From: Hans van Kranenburg Date: Sun, 3 Jul 2016 23:23:06 +0200 Subject: Btrfs: use the correct struct for BTRFS_IOC_LOGICAL_INO BTRFS_IOC_LOGICAL_INO takes a btrfs_ioctl_logical_ino_args as argument, not a btrfs_ioctl_ino_path_args. The lines were probably copy/pasted when the code was written. Since btrfs_ioctl_logical_ino_args and btrfs_ioctl_ino_path_args have the same size, the actual IOCTL definition here does not change. But, it makes the code less confusing for the reader. 
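For readers unfamiliar with the ioctl, a hedged sketch of a caller, which is where the correct struct name matters; fd (any file on the btrfs filesystem, CAP_SYS_ADMIN required) and the logical address are assumptions of the example:

    /* needs <linux/btrfs.h>, <sys/ioctl.h>, <stdint.h>, <stdlib.h>, <string.h>, <err.h> */
    struct btrfs_data_container *inodes = calloc(1, 64 * 1024);
    struct btrfs_ioctl_logical_ino_args args;

    memset(&args, 0, sizeof(args));
    args.logical = logical;                   /* logical byte address to resolve */
    args.size    = 64 * 1024;                 /* size of the buffer behind .inodes */
    args.inodes  = (__u64)(uintptr_t)inodes;

    if (ioctl(fd, BTRFS_IOC_LOGICAL_INO, &args) < 0)
            err(1, "BTRFS_IOC_LOGICAL_INO");
    /* inodes->val[] now holds elem_cnt u64s, grouped as (inum, offset, root) */

Passing a btrfs_ioctl_ino_path_args here would only happen to work because the two structs share a size, which is precisely the confusion the patch removes.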
Signed-off-by: Hans van Kranenburg Signed-off-by: David Sterba --- include/uapi/linux/btrfs.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/uapi/linux') diff --git a/include/uapi/linux/btrfs.h b/include/uapi/linux/btrfs.h index 2bdd1e3e7007..ac5eacd3055b 100644 --- a/include/uapi/linux/btrfs.h +++ b/include/uapi/linux/btrfs.h @@ -798,7 +798,7 @@ static inline char *btrfs_err_str(enum btrfs_err_code err_code) #define BTRFS_IOC_INO_PATHS _IOWR(BTRFS_IOCTL_MAGIC, 35, \ struct btrfs_ioctl_ino_path_args) #define BTRFS_IOC_LOGICAL_INO _IOWR(BTRFS_IOCTL_MAGIC, 36, \ - struct btrfs_ioctl_ino_path_args) + struct btrfs_ioctl_logical_ino_args) #define BTRFS_IOC_SET_RECEIVED_SUBVOL _IOWR(BTRFS_IOCTL_MAGIC, 37, \ struct btrfs_ioctl_received_subvol_args) #define BTRFS_IOC_SEND _IOW(BTRFS_IOCTL_MAGIC, 38, struct btrfs_ioctl_send_args) -- cgit v1.2.3-71-gd317 From 0dc696bcf2e86f48a23fb95ca2f40c8708241e7e Mon Sep 17 00:00:00 2001 From: Anshuman Khandual Date: Thu, 28 Jul 2016 10:57:30 +0800 Subject: elf: Add powerpc specific core note sections This patch adds twelve ELF core note sections for powerpc architecture for various registers and register sets which need to be accessed from ptrace interface and then gdb. These additions include special purpose registers like TAR, PPR, DSCR, TM running and checkpointed state for various register sets, EBB related register set, performance monitor register set etc. Addition of these new ELF core note sections extends the existing ELF ABI on powerpc arch without affecting it in any manner. Signed-off-by: Anshuman Khandual Signed-off-by: Simon Guo Signed-off-by: Michael Ellerman --- include/uapi/linux/elf.h | 13 +++++++++++++ 1 file changed, 13 insertions(+) (limited to 'include/uapi/linux') diff --git a/include/uapi/linux/elf.h b/include/uapi/linux/elf.h index cb4a72f888d5..1be3c5f6183b 100644 --- a/include/uapi/linux/elf.h +++ b/include/uapi/linux/elf.h @@ -381,6 +381,19 @@ typedef struct elf64_shdr { #define NT_PPC_VMX 0x100 /* PowerPC Altivec/VMX registers */ #define NT_PPC_SPE 0x101 /* PowerPC SPE/EVR registers */ #define NT_PPC_VSX 0x102 /* PowerPC VSX registers */ +#define NT_PPC_TAR 0x103 /* Target Address Register */ +#define NT_PPC_PPR 0x104 /* Program Priority Register */ +#define NT_PPC_DSCR 0x105 /* Data Stream Control Register */ +#define NT_PPC_EBB 0x106 /* Event Based Branch Registers */ +#define NT_PPC_PMU 0x107 /* Performance Monitor Registers */ +#define NT_PPC_TM_CGPR 0x108 /* TM checkpointed GPR Registers */ +#define NT_PPC_TM_CFPR 0x109 /* TM checkpointed FPR Registers */ +#define NT_PPC_TM_CVMX 0x10a /* TM checkpointed VMX Registers */ +#define NT_PPC_TM_CVSX 0x10b /* TM checkpointed VSX Registers */ +#define NT_PPC_TM_SPR 0x10c /* TM Special Purpose Registers */ +#define NT_PPC_TM_CTAR 0x10d /* TM checkpointed Target Address Register */ +#define NT_PPC_TM_CPPR 0x10e /* TM checkpointed Program Priority Register */ +#define NT_PPC_TM_CDSCR 0x10f /* TM checkpointed Data Stream Control Register */ #define NT_386_TLS 0x200 /* i386 TLS slots (struct user_desc) */ #define NT_386_IOPERM 0x201 /* x86 io permission bitmap (1=deny) */ #define NT_X86_XSTATE 0x202 /* x86 extended state using xsave */ -- cgit v1.2.3-71-gd317 From 23528bb21ee2c9b27f3feddd77a2a3351a8df148 Mon Sep 17 00:00:00 2001 From: Sam Bobroff Date: Wed, 20 Jul 2016 13:41:36 +1000 Subject: KVM: PPC: Introduce KVM_CAP_PPC_HTM Introduce a new KVM capability, KVM_CAP_PPC_HTM, that can be queried to determine if a PowerPC KVM guest should use HTM (Hardware 
Transactional Memory). This will be used by QEMU to populate the pa-features bits in the guest's device tree. Signed-off-by: Sam Bobroff Reviewed-by: David Gibson Signed-off-by: Paolo Bonzini --- arch/powerpc/kvm/powerpc.c | 4 ++++ include/uapi/linux/kvm.h | 1 + 2 files changed, 5 insertions(+) (limited to 'include/uapi/linux') diff --git a/arch/powerpc/kvm/powerpc.c b/arch/powerpc/kvm/powerpc.c index 1ac036e45ed4..6ce40dd6fe51 100644 --- a/arch/powerpc/kvm/powerpc.c +++ b/arch/powerpc/kvm/powerpc.c @@ -588,6 +588,10 @@ int kvm_vm_ioctl_check_extension(struct kvm *kvm, long ext) r = 1; break; #endif + case KVM_CAP_PPC_HTM: + r = cpu_has_feature(CPU_FTR_TM_COMP) && + is_kvmppc_hv_enabled(kvm); + break; default: r = 0; break; diff --git a/include/uapi/linux/kvm.h b/include/uapi/linux/kvm.h index 8f2756c263d4..e98bb4cce639 100644 --- a/include/uapi/linux/kvm.h +++ b/include/uapi/linux/kvm.h @@ -869,6 +869,7 @@ struct kvm_ppc_smmu_info { #define KVM_CAP_X2APIC_API 129 #define KVM_CAP_S390_USER_INSTR0 130 #define KVM_CAP_MSI_DEVID 131 +#define KVM_CAP_PPC_HTM 132 #ifdef KVM_CAP_IRQ_ROUTING -- cgit v1.2.3-71-gd317 From 1a937693993ff10d7e80cca6ddd55f3000aa6376 Mon Sep 17 00:00:00 2001 From: "Michael S. Tsirkin" Date: Mon, 18 Apr 2016 12:58:14 +0300 Subject: virtio: new feature to detect IOMMU device quirk The interaction between virtio and IOMMUs is messy. On most systems with virtio, physical addresses match bus addresses, and it doesn't particularly matter which one we use to program the device. On some systems, including Xen and any system with a physical device that speaks virtio behind a physical IOMMU, we must program the IOMMU for virtio DMA to work at all. On other systems, including SPARC and PPC64, virtio-pci devices are enumerated as though they are behind an IOMMU, but the virtio host ignores the IOMMU, so we must either pretend that the IOMMU isn't there or somehow map everything as the identity. Add a feature bit to detect that quirk: VIRTIO_F_IOMMU_PLATFORM. Any device with this feature bit set to 0 needs a quirk and has to be passed physical addresses (as opposed to bus addresses) even though the device is behind an IOMMU. Note: it has to be a per-device quirk because for example, there could be a mix of passed-through and virtual virtio devices. As another example, some devices could be implemented by an out of process hypervisor backend (in case of qemu vhost, or vhost-user) and so support for an IOMMU needs to be coded up separately. It would be cleanest to handle this in IOMMU core code, but that needs per-device DMA ops. While we are waiting for that to be implemented, use a work-around in virtio core. Note: a "noiommu" feature is a quirk - add a wrapper to make that clear. Signed-off-by: Michael S. Tsirkin --- drivers/virtio/virtio_ring.c | 15 ++++++++++++++- include/linux/virtio_config.h | 13 +++++++++++++ include/uapi/linux/virtio_config.h | 10 +++++++++- 3 files changed, 36 insertions(+), 2 deletions(-) (limited to 'include/uapi/linux') diff --git a/drivers/virtio/virtio_ring.c b/drivers/virtio/virtio_ring.c index ca6bfddaacad..114a0c88afb8 100644 --- a/drivers/virtio/virtio_ring.c +++ b/drivers/virtio/virtio_ring.c @@ -117,7 +117,10 @@ struct vring_virtqueue { #define to_vvq(_vq) container_of(_vq, struct vring_virtqueue, vq) /* - * The interaction between virtio and a possible IOMMU is a mess. + * Modern virtio devices have feature bits to specify whether they need a + * quirk and bypass the IOMMU. If not there, just use the DMA API. 
+ * + * If there, the interaction between virtio and DMA API is messy. * * On most systems with virtio, physical addresses match bus addresses, * and it doesn't particularly matter whether we use the DMA API. @@ -133,10 +136,18 @@ struct vring_virtqueue { * * For the time being, we preserve historic behavior and bypass the DMA * API. + * + * TODO: install a per-device DMA ops structure that does the right thing + * taking into account all the above quirks, and use the DMA API + * unconditionally on data path. */ static bool vring_use_dma_api(struct virtio_device *vdev) { + if (!virtio_has_iommu_quirk(vdev)) + return true; + + /* Otherwise, we are left to guess. */ /* * In theory, it's possible to have a buggy QEMU-supposed * emulated Q35 IOMMU and Xen enabled at the same time. On @@ -1099,6 +1110,8 @@ void vring_transport_features(struct virtio_device *vdev) break; case VIRTIO_F_VERSION_1: break; + case VIRTIO_F_IOMMU_PLATFORM: + break; default: /* We don't understand this bit. */ __virtio_clear_bit(vdev, i); diff --git a/include/linux/virtio_config.h b/include/linux/virtio_config.h index 6e6cb0c9d7cb..26c155bb639b 100644 --- a/include/linux/virtio_config.h +++ b/include/linux/virtio_config.h @@ -149,6 +149,19 @@ static inline bool virtio_has_feature(const struct virtio_device *vdev, return __virtio_test_bit(vdev, fbit); } +/** + * virtio_has_iommu_quirk - determine whether this device has the iommu quirk + * @vdev: the device + */ +static inline bool virtio_has_iommu_quirk(const struct virtio_device *vdev) +{ + /* + * Note the reverse polarity of the quirk feature (compared to most + * other features), this is for compatibility with legacy systems. + */ + return !virtio_has_feature(vdev, VIRTIO_F_IOMMU_PLATFORM); +} + static inline struct virtqueue *virtio_find_single_vq(struct virtio_device *vdev, vq_callback_t *c, const char *n) diff --git a/include/uapi/linux/virtio_config.h b/include/uapi/linux/virtio_config.h index 4cb65bbfa654..308e2096291f 100644 --- a/include/uapi/linux/virtio_config.h +++ b/include/uapi/linux/virtio_config.h @@ -49,7 +49,7 @@ * transport being used (eg. virtio_ring), the rest are per-device feature * bits. */ #define VIRTIO_TRANSPORT_F_START 28 -#define VIRTIO_TRANSPORT_F_END 33 +#define VIRTIO_TRANSPORT_F_END 34 #ifndef VIRTIO_CONFIG_NO_LEGACY /* Do we get callbacks when the ring is completely used, even if we've @@ -63,4 +63,12 @@ /* v1.0 compliant. */ #define VIRTIO_F_VERSION_1 32 +/* + * If clear - device has the IOMMU bypass quirk feature. + * If set - use platform tools to detect the IOMMU. + * + * Note the reverse polarity (compared to most other features), + * this is for compatibility with legacy systems. + */ +#define VIRTIO_F_IOMMU_PLATFORM 33 #endif /* _UAPI_LINUX_VIRTIO_CONFIG_H */ -- cgit v1.2.3-71-gd317 From 06a8fc78367d070720af960dcecec917d3ae5f3b Mon Sep 17 00:00:00 2001 From: Asias He Date: Thu, 28 Jul 2016 15:36:32 +0100 Subject: VSOCK: Introduce virtio_vsock_common.ko This module contains the common code and header files for the following virtio_transporto and vhost_vsock kernel modules. Signed-off-by: Asias He Signed-off-by: Claudio Imbrenda Signed-off-by: Stefan Hajnoczi Signed-off-by: Michael S. 
Tsirkin --- MAINTAINERS | 10 + include/linux/virtio_vsock.h | 154 ++++ include/net/af_vsock.h | 2 + .../trace/events/vsock_virtio_transport_common.h | 144 +++ include/uapi/linux/Kbuild | 1 + include/uapi/linux/virtio_ids.h | 1 + include/uapi/linux/virtio_vsock.h | 94 ++ net/vmw_vsock/virtio_transport_common.c | 992 +++++++++++++++++++++ 8 files changed, 1398 insertions(+) create mode 100644 include/linux/virtio_vsock.h create mode 100644 include/trace/events/vsock_virtio_transport_common.h create mode 100644 include/uapi/linux/virtio_vsock.h create mode 100644 net/vmw_vsock/virtio_transport_common.c (limited to 'include/uapi/linux') diff --git a/MAINTAINERS b/MAINTAINERS index 8c20323d1277..b49ffb86ebf0 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -12138,6 +12138,16 @@ S: Maintained F: drivers/media/v4l2-core/videobuf2-* F: include/media/videobuf2-* +VIRTIO AND VHOST VSOCK DRIVER +M: Stefan Hajnoczi +L: kvm@vger.kernel.org +L: virtualization@lists.linux-foundation.org +L: netdev@vger.kernel.org +S: Maintained +F: include/linux/virtio_vsock.h +F: include/uapi/linux/virtio_vsock.h +F: net/vmw_vsock/virtio_transport_common.c + VIRTUAL SERIO DEVICE DRIVER M: Stephen Chandler Paul S: Maintained diff --git a/include/linux/virtio_vsock.h b/include/linux/virtio_vsock.h new file mode 100644 index 000000000000..9638bfeb0d1f --- /dev/null +++ b/include/linux/virtio_vsock.h @@ -0,0 +1,154 @@ +#ifndef _LINUX_VIRTIO_VSOCK_H +#define _LINUX_VIRTIO_VSOCK_H + +#include +#include +#include +#include + +#define VIRTIO_VSOCK_DEFAULT_MIN_BUF_SIZE 128 +#define VIRTIO_VSOCK_DEFAULT_BUF_SIZE (1024 * 256) +#define VIRTIO_VSOCK_DEFAULT_MAX_BUF_SIZE (1024 * 256) +#define VIRTIO_VSOCK_DEFAULT_RX_BUF_SIZE (1024 * 4) +#define VIRTIO_VSOCK_MAX_BUF_SIZE 0xFFFFFFFFUL +#define VIRTIO_VSOCK_MAX_PKT_BUF_SIZE (1024 * 64) + +enum { + VSOCK_VQ_RX = 0, /* for host to guest data */ + VSOCK_VQ_TX = 1, /* for guest to host data */ + VSOCK_VQ_EVENT = 2, + VSOCK_VQ_MAX = 3, +}; + +/* Per-socket state (accessed via vsk->trans) */ +struct virtio_vsock_sock { + struct vsock_sock *vsk; + + /* Protected by lock_sock(sk_vsock(trans->vsk)) */ + u32 buf_size; + u32 buf_size_min; + u32 buf_size_max; + + spinlock_t tx_lock; + spinlock_t rx_lock; + + /* Protected by tx_lock */ + u32 tx_cnt; + u32 buf_alloc; + u32 peer_fwd_cnt; + u32 peer_buf_alloc; + + /* Protected by rx_lock */ + u32 fwd_cnt; + u32 rx_bytes; + struct list_head rx_queue; +}; + +struct virtio_vsock_pkt { + struct virtio_vsock_hdr hdr; + struct work_struct work; + struct list_head list; + void *buf; + u32 len; + u32 off; + bool reply; +}; + +struct virtio_vsock_pkt_info { + u32 remote_cid, remote_port; + struct msghdr *msg; + u32 pkt_len; + u16 type; + u16 op; + u32 flags; + bool reply; +}; + +struct virtio_transport { + /* This must be the first field */ + struct vsock_transport transport; + + /* Takes ownership of the packet */ + int (*send_pkt)(struct virtio_vsock_pkt *pkt); +}; + +ssize_t +virtio_transport_stream_dequeue(struct vsock_sock *vsk, + struct msghdr *msg, + size_t len, + int type); +int +virtio_transport_dgram_dequeue(struct vsock_sock *vsk, + struct msghdr *msg, + size_t len, int flags); + +s64 virtio_transport_stream_has_data(struct vsock_sock *vsk); +s64 virtio_transport_stream_has_space(struct vsock_sock *vsk); + +int virtio_transport_do_socket_init(struct vsock_sock *vsk, + struct vsock_sock *psk); +u64 virtio_transport_get_buffer_size(struct vsock_sock *vsk); +u64 virtio_transport_get_min_buffer_size(struct vsock_sock *vsk); +u64 
virtio_transport_get_max_buffer_size(struct vsock_sock *vsk); +void virtio_transport_set_buffer_size(struct vsock_sock *vsk, u64 val); +void virtio_transport_set_min_buffer_size(struct vsock_sock *vsk, u64 val); +void virtio_transport_set_max_buffer_size(struct vsock_sock *vs, u64 val); +int +virtio_transport_notify_poll_in(struct vsock_sock *vsk, + size_t target, + bool *data_ready_now); +int +virtio_transport_notify_poll_out(struct vsock_sock *vsk, + size_t target, + bool *space_available_now); + +int virtio_transport_notify_recv_init(struct vsock_sock *vsk, + size_t target, struct vsock_transport_recv_notify_data *data); +int virtio_transport_notify_recv_pre_block(struct vsock_sock *vsk, + size_t target, struct vsock_transport_recv_notify_data *data); +int virtio_transport_notify_recv_pre_dequeue(struct vsock_sock *vsk, + size_t target, struct vsock_transport_recv_notify_data *data); +int virtio_transport_notify_recv_post_dequeue(struct vsock_sock *vsk, + size_t target, ssize_t copied, bool data_read, + struct vsock_transport_recv_notify_data *data); +int virtio_transport_notify_send_init(struct vsock_sock *vsk, + struct vsock_transport_send_notify_data *data); +int virtio_transport_notify_send_pre_block(struct vsock_sock *vsk, + struct vsock_transport_send_notify_data *data); +int virtio_transport_notify_send_pre_enqueue(struct vsock_sock *vsk, + struct vsock_transport_send_notify_data *data); +int virtio_transport_notify_send_post_enqueue(struct vsock_sock *vsk, + ssize_t written, struct vsock_transport_send_notify_data *data); + +u64 virtio_transport_stream_rcvhiwat(struct vsock_sock *vsk); +bool virtio_transport_stream_is_active(struct vsock_sock *vsk); +bool virtio_transport_stream_allow(u32 cid, u32 port); +int virtio_transport_dgram_bind(struct vsock_sock *vsk, + struct sockaddr_vm *addr); +bool virtio_transport_dgram_allow(u32 cid, u32 port); + +int virtio_transport_connect(struct vsock_sock *vsk); + +int virtio_transport_shutdown(struct vsock_sock *vsk, int mode); + +void virtio_transport_release(struct vsock_sock *vsk); + +ssize_t +virtio_transport_stream_enqueue(struct vsock_sock *vsk, + struct msghdr *msg, + size_t len); +int +virtio_transport_dgram_enqueue(struct vsock_sock *vsk, + struct sockaddr_vm *remote_addr, + struct msghdr *msg, + size_t len); + +void virtio_transport_destruct(struct vsock_sock *vsk); + +void virtio_transport_recv_pkt(struct virtio_vsock_pkt *pkt); +void virtio_transport_free_pkt(struct virtio_vsock_pkt *pkt); +void virtio_transport_inc_tx_pkt(struct virtio_vsock_sock *vvs, struct virtio_vsock_pkt *pkt); +u32 virtio_transport_get_credit(struct virtio_vsock_sock *vvs, u32 wanted); +void virtio_transport_put_credit(struct virtio_vsock_sock *vvs, u32 credit); + +#endif /* _LINUX_VIRTIO_VSOCK_H */ diff --git a/include/net/af_vsock.h b/include/net/af_vsock.h index 3af0b224f754..f2758964ce6f 100644 --- a/include/net/af_vsock.h +++ b/include/net/af_vsock.h @@ -63,6 +63,8 @@ struct vsock_sock { struct list_head accept_queue; bool rejected; struct delayed_work dwork; + struct delayed_work close_work; + bool close_work_scheduled; u32 peer_shutdown; bool sent_request; bool ignore_connecting_rst; diff --git a/include/trace/events/vsock_virtio_transport_common.h b/include/trace/events/vsock_virtio_transport_common.h new file mode 100644 index 000000000000..b7f1d6278280 --- /dev/null +++ b/include/trace/events/vsock_virtio_transport_common.h @@ -0,0 +1,144 @@ +#undef TRACE_SYSTEM +#define TRACE_SYSTEM vsock + +#if !defined(_TRACE_VSOCK_VIRTIO_TRANSPORT_COMMON_H) 
|| \ + defined(TRACE_HEADER_MULTI_READ) +#define _TRACE_VSOCK_VIRTIO_TRANSPORT_COMMON_H + +#include + +TRACE_DEFINE_ENUM(VIRTIO_VSOCK_TYPE_STREAM); + +#define show_type(val) \ + __print_symbolic(val, { VIRTIO_VSOCK_TYPE_STREAM, "STREAM" }) + +TRACE_DEFINE_ENUM(VIRTIO_VSOCK_OP_INVALID); +TRACE_DEFINE_ENUM(VIRTIO_VSOCK_OP_REQUEST); +TRACE_DEFINE_ENUM(VIRTIO_VSOCK_OP_RESPONSE); +TRACE_DEFINE_ENUM(VIRTIO_VSOCK_OP_RST); +TRACE_DEFINE_ENUM(VIRTIO_VSOCK_OP_SHUTDOWN); +TRACE_DEFINE_ENUM(VIRTIO_VSOCK_OP_RW); +TRACE_DEFINE_ENUM(VIRTIO_VSOCK_OP_CREDIT_UPDATE); +TRACE_DEFINE_ENUM(VIRTIO_VSOCK_OP_CREDIT_REQUEST); + +#define show_op(val) \ + __print_symbolic(val, \ + { VIRTIO_VSOCK_OP_INVALID, "INVALID" }, \ + { VIRTIO_VSOCK_OP_REQUEST, "REQUEST" }, \ + { VIRTIO_VSOCK_OP_RESPONSE, "RESPONSE" }, \ + { VIRTIO_VSOCK_OP_RST, "RST" }, \ + { VIRTIO_VSOCK_OP_SHUTDOWN, "SHUTDOWN" }, \ + { VIRTIO_VSOCK_OP_RW, "RW" }, \ + { VIRTIO_VSOCK_OP_CREDIT_UPDATE, "CREDIT_UPDATE" }, \ + { VIRTIO_VSOCK_OP_CREDIT_REQUEST, "CREDIT_REQUEST" }) + +TRACE_EVENT(virtio_transport_alloc_pkt, + TP_PROTO( + __u32 src_cid, __u32 src_port, + __u32 dst_cid, __u32 dst_port, + __u32 len, + __u16 type, + __u16 op, + __u32 flags + ), + TP_ARGS( + src_cid, src_port, + dst_cid, dst_port, + len, + type, + op, + flags + ), + TP_STRUCT__entry( + __field(__u32, src_cid) + __field(__u32, src_port) + __field(__u32, dst_cid) + __field(__u32, dst_port) + __field(__u32, len) + __field(__u16, type) + __field(__u16, op) + __field(__u32, flags) + ), + TP_fast_assign( + __entry->src_cid = src_cid; + __entry->src_port = src_port; + __entry->dst_cid = dst_cid; + __entry->dst_port = dst_port; + __entry->len = len; + __entry->type = type; + __entry->op = op; + __entry->flags = flags; + ), + TP_printk("%u:%u -> %u:%u len=%u type=%s op=%s flags=%#x", + __entry->src_cid, __entry->src_port, + __entry->dst_cid, __entry->dst_port, + __entry->len, + show_type(__entry->type), + show_op(__entry->op), + __entry->flags) +); + +TRACE_EVENT(virtio_transport_recv_pkt, + TP_PROTO( + __u32 src_cid, __u32 src_port, + __u32 dst_cid, __u32 dst_port, + __u32 len, + __u16 type, + __u16 op, + __u32 flags, + __u32 buf_alloc, + __u32 fwd_cnt + ), + TP_ARGS( + src_cid, src_port, + dst_cid, dst_port, + len, + type, + op, + flags, + buf_alloc, + fwd_cnt + ), + TP_STRUCT__entry( + __field(__u32, src_cid) + __field(__u32, src_port) + __field(__u32, dst_cid) + __field(__u32, dst_port) + __field(__u32, len) + __field(__u16, type) + __field(__u16, op) + __field(__u32, flags) + __field(__u32, buf_alloc) + __field(__u32, fwd_cnt) + ), + TP_fast_assign( + __entry->src_cid = src_cid; + __entry->src_port = src_port; + __entry->dst_cid = dst_cid; + __entry->dst_port = dst_port; + __entry->len = len; + __entry->type = type; + __entry->op = op; + __entry->flags = flags; + __entry->buf_alloc = buf_alloc; + __entry->fwd_cnt = fwd_cnt; + ), + TP_printk("%u:%u -> %u:%u len=%u type=%s op=%s flags=%#x " + "buf_alloc=%u fwd_cnt=%u", + __entry->src_cid, __entry->src_port, + __entry->dst_cid, __entry->dst_port, + __entry->len, + show_type(__entry->type), + show_op(__entry->op), + __entry->flags, + __entry->buf_alloc, + __entry->fwd_cnt) +); + +#endif /* _TRACE_VSOCK_VIRTIO_TRANSPORT_COMMON_H */ + +#undef TRACE_INCLUDE_FILE +#define TRACE_INCLUDE_FILE vsock_virtio_transport_common + +/* This part must be outside protection */ +#include diff --git a/include/uapi/linux/Kbuild b/include/uapi/linux/Kbuild index ec10cfef166a..3cf0116d9c2b 100644 --- a/include/uapi/linux/Kbuild +++ b/include/uapi/linux/Kbuild @@ 
-453,6 +453,7 @@ header-y += virtio_ring.h header-y += virtio_rng.h header-y += virtio_scsi.h header-y += virtio_types.h +header-y += virtio_vsock.h header-y += vm_sockets.h header-y += vt.h header-y += wait.h diff --git a/include/uapi/linux/virtio_ids.h b/include/uapi/linux/virtio_ids.h index 77925f587b15..3228d582234a 100644 --- a/include/uapi/linux/virtio_ids.h +++ b/include/uapi/linux/virtio_ids.h @@ -41,5 +41,6 @@ #define VIRTIO_ID_CAIF 12 /* Virtio caif */ #define VIRTIO_ID_GPU 16 /* virtio GPU */ #define VIRTIO_ID_INPUT 18 /* virtio input */ +#define VIRTIO_ID_VSOCK 19 /* virtio vsock transport */ #endif /* _LINUX_VIRTIO_IDS_H */ diff --git a/include/uapi/linux/virtio_vsock.h b/include/uapi/linux/virtio_vsock.h new file mode 100644 index 000000000000..6b011c19b50f --- /dev/null +++ b/include/uapi/linux/virtio_vsock.h @@ -0,0 +1,94 @@ +/* + * This header, excluding the #ifdef __KERNEL__ part, is BSD licensed so + * anyone can use the definitions to implement compatible drivers/servers: + * + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. Neither the name of IBM nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS ``AS IS'' + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL IBM OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. 
+ * + * Copyright (C) Red Hat, Inc., 2013-2015 + * Copyright (C) Asias He , 2013 + * Copyright (C) Stefan Hajnoczi , 2015 + */ + +#ifndef _UAPI_LINUX_VIRTIO_VSOCK_H +#define _UAPI_LINUX_VIRTIO_VOSCK_H + +#include +#include +#include + +struct virtio_vsock_config { + __le64 guest_cid; +} __attribute__((packed)); + +enum virtio_vsock_event_id { + VIRTIO_VSOCK_EVENT_TRANSPORT_RESET = 0, +}; + +struct virtio_vsock_event { + __le32 id; +} __attribute__((packed)); + +struct virtio_vsock_hdr { + __le64 src_cid; + __le64 dst_cid; + __le32 src_port; + __le32 dst_port; + __le32 len; + __le16 type; /* enum virtio_vsock_type */ + __le16 op; /* enum virtio_vsock_op */ + __le32 flags; + __le32 buf_alloc; + __le32 fwd_cnt; +} __attribute__((packed)); + +enum virtio_vsock_type { + VIRTIO_VSOCK_TYPE_STREAM = 1, +}; + +enum virtio_vsock_op { + VIRTIO_VSOCK_OP_INVALID = 0, + + /* Connect operations */ + VIRTIO_VSOCK_OP_REQUEST = 1, + VIRTIO_VSOCK_OP_RESPONSE = 2, + VIRTIO_VSOCK_OP_RST = 3, + VIRTIO_VSOCK_OP_SHUTDOWN = 4, + + /* To send payload */ + VIRTIO_VSOCK_OP_RW = 5, + + /* Tell the peer our credit info */ + VIRTIO_VSOCK_OP_CREDIT_UPDATE = 6, + /* Request the peer to send the credit info to us */ + VIRTIO_VSOCK_OP_CREDIT_REQUEST = 7, +}; + +/* VIRTIO_VSOCK_OP_SHUTDOWN flags values */ +enum virtio_vsock_shutdown { + VIRTIO_VSOCK_SHUTDOWN_RCV = 1, + VIRTIO_VSOCK_SHUTDOWN_SEND = 2, +}; + +#endif /* _UAPI_LINUX_VIRTIO_VSOCK_H */ diff --git a/net/vmw_vsock/virtio_transport_common.c b/net/vmw_vsock/virtio_transport_common.c new file mode 100644 index 000000000000..a53b3a16b4f1 --- /dev/null +++ b/net/vmw_vsock/virtio_transport_common.c @@ -0,0 +1,992 @@ +/* + * common code for virtio vsock + * + * Copyright (C) 2013-2015 Red Hat, Inc. + * Author: Asias He + * Stefan Hajnoczi + * + * This work is licensed under the terms of the GNU GPL, version 2. 
+ */ +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include + +#define CREATE_TRACE_POINTS +#include + +/* How long to wait for graceful shutdown of a connection */ +#define VSOCK_CLOSE_TIMEOUT (8 * HZ) + +static const struct virtio_transport *virtio_transport_get_ops(void) +{ + const struct vsock_transport *t = vsock_core_get_transport(); + + return container_of(t, struct virtio_transport, transport); +} + +struct virtio_vsock_pkt * +virtio_transport_alloc_pkt(struct virtio_vsock_pkt_info *info, + size_t len, + u32 src_cid, + u32 src_port, + u32 dst_cid, + u32 dst_port) +{ + struct virtio_vsock_pkt *pkt; + int err; + + pkt = kzalloc(sizeof(*pkt), GFP_KERNEL); + if (!pkt) + return NULL; + + pkt->hdr.type = cpu_to_le16(info->type); + pkt->hdr.op = cpu_to_le16(info->op); + pkt->hdr.src_cid = cpu_to_le64(src_cid); + pkt->hdr.dst_cid = cpu_to_le64(dst_cid); + pkt->hdr.src_port = cpu_to_le32(src_port); + pkt->hdr.dst_port = cpu_to_le32(dst_port); + pkt->hdr.flags = cpu_to_le32(info->flags); + pkt->len = len; + pkt->hdr.len = cpu_to_le32(len); + pkt->reply = info->reply; + + if (info->msg && len > 0) { + pkt->buf = kmalloc(len, GFP_KERNEL); + if (!pkt->buf) + goto out_pkt; + err = memcpy_from_msg(pkt->buf, info->msg, len); + if (err) + goto out; + } + + trace_virtio_transport_alloc_pkt(src_cid, src_port, + dst_cid, dst_port, + len, + info->type, + info->op, + info->flags); + + return pkt; + +out: + kfree(pkt->buf); +out_pkt: + kfree(pkt); + return NULL; +} +EXPORT_SYMBOL_GPL(virtio_transport_alloc_pkt); + +static int virtio_transport_send_pkt_info(struct vsock_sock *vsk, + struct virtio_vsock_pkt_info *info) +{ + u32 src_cid, src_port, dst_cid, dst_port; + struct virtio_vsock_sock *vvs; + struct virtio_vsock_pkt *pkt; + u32 pkt_len = info->pkt_len; + + src_cid = vm_sockets_get_local_cid(); + src_port = vsk->local_addr.svm_port; + if (!info->remote_cid) { + dst_cid = vsk->remote_addr.svm_cid; + dst_port = vsk->remote_addr.svm_port; + } else { + dst_cid = info->remote_cid; + dst_port = info->remote_port; + } + + vvs = vsk->trans; + + /* we can send less than pkt_len bytes */ + if (pkt_len > VIRTIO_VSOCK_DEFAULT_RX_BUF_SIZE) + pkt_len = VIRTIO_VSOCK_DEFAULT_RX_BUF_SIZE; + + /* virtio_transport_get_credit might return less than pkt_len credit */ + pkt_len = virtio_transport_get_credit(vvs, pkt_len); + + /* Do not send zero length OP_RW pkt */ + if (pkt_len == 0 && info->op == VIRTIO_VSOCK_OP_RW) + return pkt_len; + + pkt = virtio_transport_alloc_pkt(info, pkt_len, + src_cid, src_port, + dst_cid, dst_port); + if (!pkt) { + virtio_transport_put_credit(vvs, pkt_len); + return -ENOMEM; + } + + virtio_transport_inc_tx_pkt(vvs, pkt); + + return virtio_transport_get_ops()->send_pkt(pkt); +} + +static void virtio_transport_inc_rx_pkt(struct virtio_vsock_sock *vvs, + struct virtio_vsock_pkt *pkt) +{ + vvs->rx_bytes += pkt->len; +} + +static void virtio_transport_dec_rx_pkt(struct virtio_vsock_sock *vvs, + struct virtio_vsock_pkt *pkt) +{ + vvs->rx_bytes -= pkt->len; + vvs->fwd_cnt += pkt->len; +} + +void virtio_transport_inc_tx_pkt(struct virtio_vsock_sock *vvs, struct virtio_vsock_pkt *pkt) +{ + spin_lock_bh(&vvs->tx_lock); + pkt->hdr.fwd_cnt = cpu_to_le32(vvs->fwd_cnt); + pkt->hdr.buf_alloc = cpu_to_le32(vvs->buf_alloc); + spin_unlock_bh(&vvs->tx_lock); +} +EXPORT_SYMBOL_GPL(virtio_transport_inc_tx_pkt); + +u32 virtio_transport_get_credit(struct virtio_vsock_sock *vvs, u32 credit) +{ + u32 ret; + + spin_lock_bh(&vvs->tx_lock); + ret = vvs->peer_buf_alloc - 
(vvs->tx_cnt - vvs->peer_fwd_cnt); + if (ret > credit) + ret = credit; + vvs->tx_cnt += ret; + spin_unlock_bh(&vvs->tx_lock); + + return ret; +} +EXPORT_SYMBOL_GPL(virtio_transport_get_credit); + +void virtio_transport_put_credit(struct virtio_vsock_sock *vvs, u32 credit) +{ + spin_lock_bh(&vvs->tx_lock); + vvs->tx_cnt -= credit; + spin_unlock_bh(&vvs->tx_lock); +} +EXPORT_SYMBOL_GPL(virtio_transport_put_credit); + +static int virtio_transport_send_credit_update(struct vsock_sock *vsk, + int type, + struct virtio_vsock_hdr *hdr) +{ + struct virtio_vsock_pkt_info info = { + .op = VIRTIO_VSOCK_OP_CREDIT_UPDATE, + .type = type, + }; + + return virtio_transport_send_pkt_info(vsk, &info); +} + +static ssize_t +virtio_transport_stream_do_dequeue(struct vsock_sock *vsk, + struct msghdr *msg, + size_t len) +{ + struct virtio_vsock_sock *vvs = vsk->trans; + struct virtio_vsock_pkt *pkt; + size_t bytes, total = 0; + int err = -EFAULT; + + spin_lock_bh(&vvs->rx_lock); + while (total < len && !list_empty(&vvs->rx_queue)) { + pkt = list_first_entry(&vvs->rx_queue, + struct virtio_vsock_pkt, list); + + bytes = len - total; + if (bytes > pkt->len - pkt->off) + bytes = pkt->len - pkt->off; + + /* sk_lock is held by caller so no one else can dequeue. + * Unlock rx_lock since memcpy_to_msg() may sleep. + */ + spin_unlock_bh(&vvs->rx_lock); + + err = memcpy_to_msg(msg, pkt->buf + pkt->off, bytes); + if (err) + goto out; + + spin_lock_bh(&vvs->rx_lock); + + total += bytes; + pkt->off += bytes; + if (pkt->off == pkt->len) { + virtio_transport_dec_rx_pkt(vvs, pkt); + list_del(&pkt->list); + virtio_transport_free_pkt(pkt); + } + } + spin_unlock_bh(&vvs->rx_lock); + + /* Send a credit pkt to peer */ + virtio_transport_send_credit_update(vsk, VIRTIO_VSOCK_TYPE_STREAM, + NULL); + + return total; + +out: + if (total) + err = total; + return err; +} + +ssize_t +virtio_transport_stream_dequeue(struct vsock_sock *vsk, + struct msghdr *msg, + size_t len, int flags) +{ + if (flags & MSG_PEEK) + return -EOPNOTSUPP; + + return virtio_transport_stream_do_dequeue(vsk, msg, len); +} +EXPORT_SYMBOL_GPL(virtio_transport_stream_dequeue); + +int +virtio_transport_dgram_dequeue(struct vsock_sock *vsk, + struct msghdr *msg, + size_t len, int flags) +{ + return -EOPNOTSUPP; +} +EXPORT_SYMBOL_GPL(virtio_transport_dgram_dequeue); + +s64 virtio_transport_stream_has_data(struct vsock_sock *vsk) +{ + struct virtio_vsock_sock *vvs = vsk->trans; + s64 bytes; + + spin_lock_bh(&vvs->rx_lock); + bytes = vvs->rx_bytes; + spin_unlock_bh(&vvs->rx_lock); + + return bytes; +} +EXPORT_SYMBOL_GPL(virtio_transport_stream_has_data); + +static s64 virtio_transport_has_space(struct vsock_sock *vsk) +{ + struct virtio_vsock_sock *vvs = vsk->trans; + s64 bytes; + + bytes = vvs->peer_buf_alloc - (vvs->tx_cnt - vvs->peer_fwd_cnt); + if (bytes < 0) + bytes = 0; + + return bytes; +} + +s64 virtio_transport_stream_has_space(struct vsock_sock *vsk) +{ + struct virtio_vsock_sock *vvs = vsk->trans; + s64 bytes; + + spin_lock_bh(&vvs->tx_lock); + bytes = virtio_transport_has_space(vsk); + spin_unlock_bh(&vvs->tx_lock); + + return bytes; +} +EXPORT_SYMBOL_GPL(virtio_transport_stream_has_space); + +int virtio_transport_do_socket_init(struct vsock_sock *vsk, + struct vsock_sock *psk) +{ + struct virtio_vsock_sock *vvs; + + vvs = kzalloc(sizeof(*vvs), GFP_KERNEL); + if (!vvs) + return -ENOMEM; + + vsk->trans = vvs; + vvs->vsk = vsk; + if (psk) { + struct virtio_vsock_sock *ptrans = psk->trans; + + vvs->buf_size = ptrans->buf_size; + vvs->buf_size_min = 
ptrans->buf_size_min; + vvs->buf_size_max = ptrans->buf_size_max; + vvs->peer_buf_alloc = ptrans->peer_buf_alloc; + } else { + vvs->buf_size = VIRTIO_VSOCK_DEFAULT_BUF_SIZE; + vvs->buf_size_min = VIRTIO_VSOCK_DEFAULT_MIN_BUF_SIZE; + vvs->buf_size_max = VIRTIO_VSOCK_DEFAULT_MAX_BUF_SIZE; + } + + vvs->buf_alloc = vvs->buf_size; + + spin_lock_init(&vvs->rx_lock); + spin_lock_init(&vvs->tx_lock); + INIT_LIST_HEAD(&vvs->rx_queue); + + return 0; +} +EXPORT_SYMBOL_GPL(virtio_transport_do_socket_init); + +u64 virtio_transport_get_buffer_size(struct vsock_sock *vsk) +{ + struct virtio_vsock_sock *vvs = vsk->trans; + + return vvs->buf_size; +} +EXPORT_SYMBOL_GPL(virtio_transport_get_buffer_size); + +u64 virtio_transport_get_min_buffer_size(struct vsock_sock *vsk) +{ + struct virtio_vsock_sock *vvs = vsk->trans; + + return vvs->buf_size_min; +} +EXPORT_SYMBOL_GPL(virtio_transport_get_min_buffer_size); + +u64 virtio_transport_get_max_buffer_size(struct vsock_sock *vsk) +{ + struct virtio_vsock_sock *vvs = vsk->trans; + + return vvs->buf_size_max; +} +EXPORT_SYMBOL_GPL(virtio_transport_get_max_buffer_size); + +void virtio_transport_set_buffer_size(struct vsock_sock *vsk, u64 val) +{ + struct virtio_vsock_sock *vvs = vsk->trans; + + if (val > VIRTIO_VSOCK_MAX_BUF_SIZE) + val = VIRTIO_VSOCK_MAX_BUF_SIZE; + if (val < vvs->buf_size_min) + vvs->buf_size_min = val; + if (val > vvs->buf_size_max) + vvs->buf_size_max = val; + vvs->buf_size = val; + vvs->buf_alloc = val; +} +EXPORT_SYMBOL_GPL(virtio_transport_set_buffer_size); + +void virtio_transport_set_min_buffer_size(struct vsock_sock *vsk, u64 val) +{ + struct virtio_vsock_sock *vvs = vsk->trans; + + if (val > VIRTIO_VSOCK_MAX_BUF_SIZE) + val = VIRTIO_VSOCK_MAX_BUF_SIZE; + if (val > vvs->buf_size) + vvs->buf_size = val; + vvs->buf_size_min = val; +} +EXPORT_SYMBOL_GPL(virtio_transport_set_min_buffer_size); + +void virtio_transport_set_max_buffer_size(struct vsock_sock *vsk, u64 val) +{ + struct virtio_vsock_sock *vvs = vsk->trans; + + if (val > VIRTIO_VSOCK_MAX_BUF_SIZE) + val = VIRTIO_VSOCK_MAX_BUF_SIZE; + if (val < vvs->buf_size) + vvs->buf_size = val; + vvs->buf_size_max = val; +} +EXPORT_SYMBOL_GPL(virtio_transport_set_max_buffer_size); + +int +virtio_transport_notify_poll_in(struct vsock_sock *vsk, + size_t target, + bool *data_ready_now) +{ + if (vsock_stream_has_data(vsk)) + *data_ready_now = true; + else + *data_ready_now = false; + + return 0; +} +EXPORT_SYMBOL_GPL(virtio_transport_notify_poll_in); + +int +virtio_transport_notify_poll_out(struct vsock_sock *vsk, + size_t target, + bool *space_avail_now) +{ + s64 free_space; + + free_space = vsock_stream_has_space(vsk); + if (free_space > 0) + *space_avail_now = true; + else if (free_space == 0) + *space_avail_now = false; + + return 0; +} +EXPORT_SYMBOL_GPL(virtio_transport_notify_poll_out); + +int virtio_transport_notify_recv_init(struct vsock_sock *vsk, + size_t target, struct vsock_transport_recv_notify_data *data) +{ + return 0; +} +EXPORT_SYMBOL_GPL(virtio_transport_notify_recv_init); + +int virtio_transport_notify_recv_pre_block(struct vsock_sock *vsk, + size_t target, struct vsock_transport_recv_notify_data *data) +{ + return 0; +} +EXPORT_SYMBOL_GPL(virtio_transport_notify_recv_pre_block); + +int virtio_transport_notify_recv_pre_dequeue(struct vsock_sock *vsk, + size_t target, struct vsock_transport_recv_notify_data *data) +{ + return 0; +} +EXPORT_SYMBOL_GPL(virtio_transport_notify_recv_pre_dequeue); + +int virtio_transport_notify_recv_post_dequeue(struct vsock_sock *vsk, + size_t target, 
ssize_t copied, bool data_read, + struct vsock_transport_recv_notify_data *data) +{ + return 0; +} +EXPORT_SYMBOL_GPL(virtio_transport_notify_recv_post_dequeue); + +int virtio_transport_notify_send_init(struct vsock_sock *vsk, + struct vsock_transport_send_notify_data *data) +{ + return 0; +} +EXPORT_SYMBOL_GPL(virtio_transport_notify_send_init); + +int virtio_transport_notify_send_pre_block(struct vsock_sock *vsk, + struct vsock_transport_send_notify_data *data) +{ + return 0; +} +EXPORT_SYMBOL_GPL(virtio_transport_notify_send_pre_block); + +int virtio_transport_notify_send_pre_enqueue(struct vsock_sock *vsk, + struct vsock_transport_send_notify_data *data) +{ + return 0; +} +EXPORT_SYMBOL_GPL(virtio_transport_notify_send_pre_enqueue); + +int virtio_transport_notify_send_post_enqueue(struct vsock_sock *vsk, + ssize_t written, struct vsock_transport_send_notify_data *data) +{ + return 0; +} +EXPORT_SYMBOL_GPL(virtio_transport_notify_send_post_enqueue); + +u64 virtio_transport_stream_rcvhiwat(struct vsock_sock *vsk) +{ + struct virtio_vsock_sock *vvs = vsk->trans; + + return vvs->buf_size; +} +EXPORT_SYMBOL_GPL(virtio_transport_stream_rcvhiwat); + +bool virtio_transport_stream_is_active(struct vsock_sock *vsk) +{ + return true; +} +EXPORT_SYMBOL_GPL(virtio_transport_stream_is_active); + +bool virtio_transport_stream_allow(u32 cid, u32 port) +{ + return true; +} +EXPORT_SYMBOL_GPL(virtio_transport_stream_allow); + +int virtio_transport_dgram_bind(struct vsock_sock *vsk, + struct sockaddr_vm *addr) +{ + return -EOPNOTSUPP; +} +EXPORT_SYMBOL_GPL(virtio_transport_dgram_bind); + +bool virtio_transport_dgram_allow(u32 cid, u32 port) +{ + return false; +} +EXPORT_SYMBOL_GPL(virtio_transport_dgram_allow); + +int virtio_transport_connect(struct vsock_sock *vsk) +{ + struct virtio_vsock_pkt_info info = { + .op = VIRTIO_VSOCK_OP_REQUEST, + .type = VIRTIO_VSOCK_TYPE_STREAM, + }; + + return virtio_transport_send_pkt_info(vsk, &info); +} +EXPORT_SYMBOL_GPL(virtio_transport_connect); + +int virtio_transport_shutdown(struct vsock_sock *vsk, int mode) +{ + struct virtio_vsock_pkt_info info = { + .op = VIRTIO_VSOCK_OP_SHUTDOWN, + .type = VIRTIO_VSOCK_TYPE_STREAM, + .flags = (mode & RCV_SHUTDOWN ? + VIRTIO_VSOCK_SHUTDOWN_RCV : 0) | + (mode & SEND_SHUTDOWN ? 
+ VIRTIO_VSOCK_SHUTDOWN_SEND : 0), + }; + + return virtio_transport_send_pkt_info(vsk, &info); +} +EXPORT_SYMBOL_GPL(virtio_transport_shutdown); + +int +virtio_transport_dgram_enqueue(struct vsock_sock *vsk, + struct sockaddr_vm *remote_addr, + struct msghdr *msg, + size_t dgram_len) +{ + return -EOPNOTSUPP; +} +EXPORT_SYMBOL_GPL(virtio_transport_dgram_enqueue); + +ssize_t +virtio_transport_stream_enqueue(struct vsock_sock *vsk, + struct msghdr *msg, + size_t len) +{ + struct virtio_vsock_pkt_info info = { + .op = VIRTIO_VSOCK_OP_RW, + .type = VIRTIO_VSOCK_TYPE_STREAM, + .msg = msg, + .pkt_len = len, + }; + + return virtio_transport_send_pkt_info(vsk, &info); +} +EXPORT_SYMBOL_GPL(virtio_transport_stream_enqueue); + +void virtio_transport_destruct(struct vsock_sock *vsk) +{ + struct virtio_vsock_sock *vvs = vsk->trans; + + kfree(vvs); +} +EXPORT_SYMBOL_GPL(virtio_transport_destruct); + +static int virtio_transport_reset(struct vsock_sock *vsk, + struct virtio_vsock_pkt *pkt) +{ + struct virtio_vsock_pkt_info info = { + .op = VIRTIO_VSOCK_OP_RST, + .type = VIRTIO_VSOCK_TYPE_STREAM, + .reply = !!pkt, + }; + + /* Send RST only if the original pkt is not a RST pkt */ + if (pkt && le16_to_cpu(pkt->hdr.op) == VIRTIO_VSOCK_OP_RST) + return 0; + + return virtio_transport_send_pkt_info(vsk, &info); +} + +/* Normally packets are associated with a socket. There may be no socket if an + * attempt was made to connect to a socket that does not exist. + */ +static int virtio_transport_reset_no_sock(struct virtio_vsock_pkt *pkt) +{ + struct virtio_vsock_pkt_info info = { + .op = VIRTIO_VSOCK_OP_RST, + .type = le16_to_cpu(pkt->hdr.type), + .reply = true, + }; + + /* Send RST only if the original pkt is not a RST pkt */ + if (le16_to_cpu(pkt->hdr.op) == VIRTIO_VSOCK_OP_RST) + return 0; + + pkt = virtio_transport_alloc_pkt(&info, 0, + le32_to_cpu(pkt->hdr.dst_cid), + le32_to_cpu(pkt->hdr.dst_port), + le32_to_cpu(pkt->hdr.src_cid), + le32_to_cpu(pkt->hdr.src_port)); + if (!pkt) + return -ENOMEM; + + return virtio_transport_get_ops()->send_pkt(pkt); +} + +static void virtio_transport_wait_close(struct sock *sk, long timeout) +{ + if (timeout) { + DEFINE_WAIT(wait); + + do { + prepare_to_wait(sk_sleep(sk), &wait, + TASK_INTERRUPTIBLE); + if (sk_wait_event(sk, &timeout, + sock_flag(sk, SOCK_DONE))) + break; + } while (!signal_pending(current) && timeout); + + finish_wait(sk_sleep(sk), &wait); + } +} + +static void virtio_transport_do_close(struct vsock_sock *vsk, + bool cancel_timeout) +{ + struct sock *sk = sk_vsock(vsk); + + sock_set_flag(sk, SOCK_DONE); + vsk->peer_shutdown = SHUTDOWN_MASK; + if (vsock_stream_has_data(vsk) <= 0) + sk->sk_state = SS_DISCONNECTING; + sk->sk_state_change(sk); + + if (vsk->close_work_scheduled && + (!cancel_timeout || cancel_delayed_work(&vsk->close_work))) { + vsk->close_work_scheduled = false; + + vsock_remove_sock(vsk); + + /* Release refcnt obtained when we scheduled the timeout */ + sock_put(sk); + } +} + +static void virtio_transport_close_timeout(struct work_struct *work) +{ + struct vsock_sock *vsk = + container_of(work, struct vsock_sock, close_work.work); + struct sock *sk = sk_vsock(vsk); + + sock_hold(sk); + lock_sock(sk); + + if (!sock_flag(sk, SOCK_DONE)) { + (void)virtio_transport_reset(vsk, NULL); + + virtio_transport_do_close(vsk, false); + } + + vsk->close_work_scheduled = false; + + release_sock(sk); + sock_put(sk); +} + +/* User context, vsk->sk is locked */ +static bool virtio_transport_close(struct vsock_sock *vsk) +{ + struct sock *sk = &vsk->sk; + + if 
(!(sk->sk_state == SS_CONNECTED || + sk->sk_state == SS_DISCONNECTING)) + return true; + + /* Already received SHUTDOWN from peer, reply with RST */ + if ((vsk->peer_shutdown & SHUTDOWN_MASK) == SHUTDOWN_MASK) { + (void)virtio_transport_reset(vsk, NULL); + return true; + } + + if ((sk->sk_shutdown & SHUTDOWN_MASK) != SHUTDOWN_MASK) + (void)virtio_transport_shutdown(vsk, SHUTDOWN_MASK); + + if (sock_flag(sk, SOCK_LINGER) && !(current->flags & PF_EXITING)) + virtio_transport_wait_close(sk, sk->sk_lingertime); + + if (sock_flag(sk, SOCK_DONE)) { + return true; + } + + sock_hold(sk); + INIT_DELAYED_WORK(&vsk->close_work, + virtio_transport_close_timeout); + vsk->close_work_scheduled = true; + schedule_delayed_work(&vsk->close_work, VSOCK_CLOSE_TIMEOUT); + return false; +} + +void virtio_transport_release(struct vsock_sock *vsk) +{ + struct sock *sk = &vsk->sk; + bool remove_sock = true; + + lock_sock(sk); + if (sk->sk_type == SOCK_STREAM) + remove_sock = virtio_transport_close(vsk); + release_sock(sk); + + if (remove_sock) + vsock_remove_sock(vsk); +} +EXPORT_SYMBOL_GPL(virtio_transport_release); + +static int +virtio_transport_recv_connecting(struct sock *sk, + struct virtio_vsock_pkt *pkt) +{ + struct vsock_sock *vsk = vsock_sk(sk); + int err; + int skerr; + + switch (le16_to_cpu(pkt->hdr.op)) { + case VIRTIO_VSOCK_OP_RESPONSE: + sk->sk_state = SS_CONNECTED; + sk->sk_socket->state = SS_CONNECTED; + vsock_insert_connected(vsk); + sk->sk_state_change(sk); + break; + case VIRTIO_VSOCK_OP_INVALID: + break; + case VIRTIO_VSOCK_OP_RST: + skerr = ECONNRESET; + err = 0; + goto destroy; + default: + skerr = EPROTO; + err = -EINVAL; + goto destroy; + } + return 0; + +destroy: + virtio_transport_reset(vsk, pkt); + sk->sk_state = SS_UNCONNECTED; + sk->sk_err = skerr; + sk->sk_error_report(sk); + return err; +} + +static int +virtio_transport_recv_connected(struct sock *sk, + struct virtio_vsock_pkt *pkt) +{ + struct vsock_sock *vsk = vsock_sk(sk); + struct virtio_vsock_sock *vvs = vsk->trans; + int err = 0; + + switch (le16_to_cpu(pkt->hdr.op)) { + case VIRTIO_VSOCK_OP_RW: + pkt->len = le32_to_cpu(pkt->hdr.len); + pkt->off = 0; + + spin_lock_bh(&vvs->rx_lock); + virtio_transport_inc_rx_pkt(vvs, pkt); + list_add_tail(&pkt->list, &vvs->rx_queue); + spin_unlock_bh(&vvs->rx_lock); + + sk->sk_data_ready(sk); + return err; + case VIRTIO_VSOCK_OP_CREDIT_UPDATE: + sk->sk_write_space(sk); + break; + case VIRTIO_VSOCK_OP_SHUTDOWN: + if (le32_to_cpu(pkt->hdr.flags) & VIRTIO_VSOCK_SHUTDOWN_RCV) + vsk->peer_shutdown |= RCV_SHUTDOWN; + if (le32_to_cpu(pkt->hdr.flags) & VIRTIO_VSOCK_SHUTDOWN_SEND) + vsk->peer_shutdown |= SEND_SHUTDOWN; + if (vsk->peer_shutdown == SHUTDOWN_MASK && + vsock_stream_has_data(vsk) <= 0) + sk->sk_state = SS_DISCONNECTING; + if (le32_to_cpu(pkt->hdr.flags)) + sk->sk_state_change(sk); + break; + case VIRTIO_VSOCK_OP_RST: + virtio_transport_do_close(vsk, true); + break; + default: + err = -EINVAL; + break; + } + + virtio_transport_free_pkt(pkt); + return err; +} + +static void +virtio_transport_recv_disconnecting(struct sock *sk, + struct virtio_vsock_pkt *pkt) +{ + struct vsock_sock *vsk = vsock_sk(sk); + + if (le16_to_cpu(pkt->hdr.op) == VIRTIO_VSOCK_OP_RST) + virtio_transport_do_close(vsk, true); +} + +static int +virtio_transport_send_response(struct vsock_sock *vsk, + struct virtio_vsock_pkt *pkt) +{ + struct virtio_vsock_pkt_info info = { + .op = VIRTIO_VSOCK_OP_RESPONSE, + .type = VIRTIO_VSOCK_TYPE_STREAM, + .remote_cid = le32_to_cpu(pkt->hdr.src_cid), + .remote_port = 
le32_to_cpu(pkt->hdr.src_port), + .reply = true, + }; + + return virtio_transport_send_pkt_info(vsk, &info); +} + +/* Handle server socket */ +static int +virtio_transport_recv_listen(struct sock *sk, struct virtio_vsock_pkt *pkt) +{ + struct vsock_sock *vsk = vsock_sk(sk); + struct vsock_sock *vchild; + struct sock *child; + + if (le16_to_cpu(pkt->hdr.op) != VIRTIO_VSOCK_OP_REQUEST) { + virtio_transport_reset(vsk, pkt); + return -EINVAL; + } + + if (sk_acceptq_is_full(sk)) { + virtio_transport_reset(vsk, pkt); + return -ENOMEM; + } + + child = __vsock_create(sock_net(sk), NULL, sk, GFP_KERNEL, + sk->sk_type, 0); + if (!child) { + virtio_transport_reset(vsk, pkt); + return -ENOMEM; + } + + sk->sk_ack_backlog++; + + lock_sock_nested(child, SINGLE_DEPTH_NESTING); + + child->sk_state = SS_CONNECTED; + + vchild = vsock_sk(child); + vsock_addr_init(&vchild->local_addr, le32_to_cpu(pkt->hdr.dst_cid), + le32_to_cpu(pkt->hdr.dst_port)); + vsock_addr_init(&vchild->remote_addr, le32_to_cpu(pkt->hdr.src_cid), + le32_to_cpu(pkt->hdr.src_port)); + + vsock_insert_connected(vchild); + vsock_enqueue_accept(sk, child); + virtio_transport_send_response(vchild, pkt); + + release_sock(child); + + sk->sk_data_ready(sk); + return 0; +} + +static bool virtio_transport_space_update(struct sock *sk, + struct virtio_vsock_pkt *pkt) +{ + struct vsock_sock *vsk = vsock_sk(sk); + struct virtio_vsock_sock *vvs = vsk->trans; + bool space_available; + + /* buf_alloc and fwd_cnt is always included in the hdr */ + spin_lock_bh(&vvs->tx_lock); + vvs->peer_buf_alloc = le32_to_cpu(pkt->hdr.buf_alloc); + vvs->peer_fwd_cnt = le32_to_cpu(pkt->hdr.fwd_cnt); + space_available = virtio_transport_has_space(vsk); + spin_unlock_bh(&vvs->tx_lock); + return space_available; +} + +/* We are under the virtio-vsock's vsock->rx_lock or vhost-vsock's vq->mutex + * lock. 
+ */ +void virtio_transport_recv_pkt(struct virtio_vsock_pkt *pkt) +{ + struct sockaddr_vm src, dst; + struct vsock_sock *vsk; + struct sock *sk; + bool space_available; + + vsock_addr_init(&src, le32_to_cpu(pkt->hdr.src_cid), + le32_to_cpu(pkt->hdr.src_port)); + vsock_addr_init(&dst, le32_to_cpu(pkt->hdr.dst_cid), + le32_to_cpu(pkt->hdr.dst_port)); + + trace_virtio_transport_recv_pkt(src.svm_cid, src.svm_port, + dst.svm_cid, dst.svm_port, + le32_to_cpu(pkt->hdr.len), + le16_to_cpu(pkt->hdr.type), + le16_to_cpu(pkt->hdr.op), + le32_to_cpu(pkt->hdr.flags), + le32_to_cpu(pkt->hdr.buf_alloc), + le32_to_cpu(pkt->hdr.fwd_cnt)); + + if (le16_to_cpu(pkt->hdr.type) != VIRTIO_VSOCK_TYPE_STREAM) { + (void)virtio_transport_reset_no_sock(pkt); + goto free_pkt; + } + + /* The socket must be in connected or bound table + * otherwise send reset back + */ + sk = vsock_find_connected_socket(&src, &dst); + if (!sk) { + sk = vsock_find_bound_socket(&dst); + if (!sk) { + (void)virtio_transport_reset_no_sock(pkt); + goto free_pkt; + } + } + + vsk = vsock_sk(sk); + + space_available = virtio_transport_space_update(sk, pkt); + + lock_sock(sk); + + /* Update CID in case it has changed after a transport reset event */ + vsk->local_addr.svm_cid = dst.svm_cid; + + if (space_available) + sk->sk_write_space(sk); + + switch (sk->sk_state) { + case VSOCK_SS_LISTEN: + virtio_transport_recv_listen(sk, pkt); + virtio_transport_free_pkt(pkt); + break; + case SS_CONNECTING: + virtio_transport_recv_connecting(sk, pkt); + virtio_transport_free_pkt(pkt); + break; + case SS_CONNECTED: + virtio_transport_recv_connected(sk, pkt); + break; + case SS_DISCONNECTING: + virtio_transport_recv_disconnecting(sk, pkt); + virtio_transport_free_pkt(pkt); + break; + default: + virtio_transport_free_pkt(pkt); + break; + } + release_sock(sk); + + /* Release refcnt obtained when we fetched this socket out of the + * bound or connected list. + */ + sock_put(sk); + return; + +free_pkt: + virtio_transport_free_pkt(pkt); +} +EXPORT_SYMBOL_GPL(virtio_transport_recv_pkt); + +void virtio_transport_free_pkt(struct virtio_vsock_pkt *pkt) +{ + kfree(pkt->buf); + kfree(pkt); +} +EXPORT_SYMBOL_GPL(virtio_transport_free_pkt); + +MODULE_LICENSE("GPL v2"); +MODULE_AUTHOR("Asias He"); +MODULE_DESCRIPTION("common code for virtio vsock"); -- cgit v1.2.3-71-gd317 From 433fc58e6bf2c8bd97e57153ed28e64fd78207b8 Mon Sep 17 00:00:00 2001 From: Asias He Date: Thu, 28 Jul 2016 15:36:34 +0100 Subject: VSOCK: Introduce vhost_vsock.ko VM sockets vhost transport implementation. This driver runs on the host. Signed-off-by: Asias He Signed-off-by: Stefan Hajnoczi Signed-off-by: Michael S. Tsirkin --- MAINTAINERS | 2 + drivers/vhost/vsock.c | 722 +++++++++++++++++++++++++++++++++++++++++++++ include/uapi/linux/vhost.h | 5 + 3 files changed, 729 insertions(+) create mode 100644 drivers/vhost/vsock.c (limited to 'include/uapi/linux') diff --git a/MAINTAINERS b/MAINTAINERS index 7302663ae4c3..12c79e5dc27e 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -12148,6 +12148,8 @@ F: include/linux/virtio_vsock.h F: include/uapi/linux/virtio_vsock.h F: net/vmw_vsock/virtio_transport_common.c F: net/vmw_vsock/virtio_transport.c +F: drivers/vhost/vsock.c +F: drivers/vhost/vsock.h VIRTUAL SERIO DEVICE DRIVER M: Stephen Chandler Paul diff --git a/drivers/vhost/vsock.c b/drivers/vhost/vsock.c new file mode 100644 index 000000000000..028ca16c2d36 --- /dev/null +++ b/drivers/vhost/vsock.c @@ -0,0 +1,722 @@ +/* + * vhost transport for vsock + * + * Copyright (C) 2013-2015 Red Hat, Inc. 
+ * Author: Asias He + * Stefan Hajnoczi + * + * This work is licensed under the terms of the GNU GPL, version 2. + */ +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include "vhost.h" + +#define VHOST_VSOCK_DEFAULT_HOST_CID 2 + +enum { + VHOST_VSOCK_FEATURES = VHOST_FEATURES, +}; + +/* Used to track all the vhost_vsock instances on the system. */ +static DEFINE_SPINLOCK(vhost_vsock_lock); +static LIST_HEAD(vhost_vsock_list); + +struct vhost_vsock { + struct vhost_dev dev; + struct vhost_virtqueue vqs[2]; + + /* Link to global vhost_vsock_list, protected by vhost_vsock_lock */ + struct list_head list; + + struct vhost_work send_pkt_work; + spinlock_t send_pkt_list_lock; + struct list_head send_pkt_list; /* host->guest pending packets */ + + atomic_t queued_replies; + + u32 guest_cid; +}; + +static u32 vhost_transport_get_local_cid(void) +{ + return VHOST_VSOCK_DEFAULT_HOST_CID; +} + +static struct vhost_vsock *vhost_vsock_get(u32 guest_cid) +{ + struct vhost_vsock *vsock; + + spin_lock_bh(&vhost_vsock_lock); + list_for_each_entry(vsock, &vhost_vsock_list, list) { + u32 other_cid = vsock->guest_cid; + + /* Skip instances that have no CID yet */ + if (other_cid == 0) + continue; + + if (other_cid == guest_cid) { + spin_unlock_bh(&vhost_vsock_lock); + return vsock; + } + } + spin_unlock_bh(&vhost_vsock_lock); + + return NULL; +} + +static void +vhost_transport_do_send_pkt(struct vhost_vsock *vsock, + struct vhost_virtqueue *vq) +{ + struct vhost_virtqueue *tx_vq = &vsock->vqs[VSOCK_VQ_TX]; + bool added = false; + bool restart_tx = false; + + mutex_lock(&vq->mutex); + + if (!vq->private_data) + goto out; + + /* Avoid further vmexits, we're already processing the virtqueue */ + vhost_disable_notify(&vsock->dev, vq); + + for (;;) { + struct virtio_vsock_pkt *pkt; + struct iov_iter iov_iter; + unsigned out, in; + size_t nbytes; + size_t len; + int head; + + spin_lock_bh(&vsock->send_pkt_list_lock); + if (list_empty(&vsock->send_pkt_list)) { + spin_unlock_bh(&vsock->send_pkt_list_lock); + vhost_enable_notify(&vsock->dev, vq); + break; + } + + pkt = list_first_entry(&vsock->send_pkt_list, + struct virtio_vsock_pkt, list); + list_del_init(&pkt->list); + spin_unlock_bh(&vsock->send_pkt_list_lock); + + head = vhost_get_vq_desc(vq, vq->iov, ARRAY_SIZE(vq->iov), + &out, &in, NULL, NULL); + if (head < 0) { + spin_lock_bh(&vsock->send_pkt_list_lock); + list_add(&pkt->list, &vsock->send_pkt_list); + spin_unlock_bh(&vsock->send_pkt_list_lock); + break; + } + + if (head == vq->num) { + spin_lock_bh(&vsock->send_pkt_list_lock); + list_add(&pkt->list, &vsock->send_pkt_list); + spin_unlock_bh(&vsock->send_pkt_list_lock); + + /* We cannot finish yet if more buffers snuck in while + * re-enabling notify. 
+ */ + if (unlikely(vhost_enable_notify(&vsock->dev, vq))) { + vhost_disable_notify(&vsock->dev, vq); + continue; + } + break; + } + + if (out) { + virtio_transport_free_pkt(pkt); + vq_err(vq, "Expected 0 output buffers, got %u\n", out); + break; + } + + len = iov_length(&vq->iov[out], in); + iov_iter_init(&iov_iter, READ, &vq->iov[out], in, len); + + nbytes = copy_to_iter(&pkt->hdr, sizeof(pkt->hdr), &iov_iter); + if (nbytes != sizeof(pkt->hdr)) { + virtio_transport_free_pkt(pkt); + vq_err(vq, "Faulted on copying pkt hdr\n"); + break; + } + + nbytes = copy_to_iter(pkt->buf, pkt->len, &iov_iter); + if (nbytes != pkt->len) { + virtio_transport_free_pkt(pkt); + vq_err(vq, "Faulted on copying pkt buf\n"); + break; + } + + vhost_add_used(vq, head, sizeof(pkt->hdr) + pkt->len); + added = true; + + if (pkt->reply) { + int val; + + val = atomic_dec_return(&vsock->queued_replies); + + /* Do we have resources to resume tx processing? */ + if (val + 1 == tx_vq->num) + restart_tx = true; + } + + virtio_transport_free_pkt(pkt); + } + if (added) + vhost_signal(&vsock->dev, vq); + +out: + mutex_unlock(&vq->mutex); + + if (restart_tx) + vhost_poll_queue(&tx_vq->poll); +} + +static void vhost_transport_send_pkt_work(struct vhost_work *work) +{ + struct vhost_virtqueue *vq; + struct vhost_vsock *vsock; + + vsock = container_of(work, struct vhost_vsock, send_pkt_work); + vq = &vsock->vqs[VSOCK_VQ_RX]; + + vhost_transport_do_send_pkt(vsock, vq); +} + +static int +vhost_transport_send_pkt(struct virtio_vsock_pkt *pkt) +{ + struct vhost_vsock *vsock; + struct vhost_virtqueue *vq; + int len = pkt->len; + + /* Find the vhost_vsock according to guest context id */ + vsock = vhost_vsock_get(le64_to_cpu(pkt->hdr.dst_cid)); + if (!vsock) { + virtio_transport_free_pkt(pkt); + return -ENODEV; + } + + vq = &vsock->vqs[VSOCK_VQ_RX]; + + if (pkt->reply) + atomic_inc(&vsock->queued_replies); + + spin_lock_bh(&vsock->send_pkt_list_lock); + list_add_tail(&pkt->list, &vsock->send_pkt_list); + spin_unlock_bh(&vsock->send_pkt_list_lock); + + vhost_work_queue(&vsock->dev, &vsock->send_pkt_work); + return len; +} + +static struct virtio_vsock_pkt * +vhost_vsock_alloc_pkt(struct vhost_virtqueue *vq, + unsigned int out, unsigned int in) +{ + struct virtio_vsock_pkt *pkt; + struct iov_iter iov_iter; + size_t nbytes; + size_t len; + + if (in != 0) { + vq_err(vq, "Expected 0 input buffers, got %u\n", in); + return NULL; + } + + pkt = kzalloc(sizeof(*pkt), GFP_KERNEL); + if (!pkt) + return NULL; + + len = iov_length(vq->iov, out); + iov_iter_init(&iov_iter, WRITE, vq->iov, out, len); + + nbytes = copy_from_iter(&pkt->hdr, sizeof(pkt->hdr), &iov_iter); + if (nbytes != sizeof(pkt->hdr)) { + vq_err(vq, "Expected %zu bytes for pkt->hdr, got %zu bytes\n", + sizeof(pkt->hdr), nbytes); + kfree(pkt); + return NULL; + } + + if (le16_to_cpu(pkt->hdr.type) == VIRTIO_VSOCK_TYPE_STREAM) + pkt->len = le32_to_cpu(pkt->hdr.len); + + /* No payload */ + if (!pkt->len) + return pkt; + + /* The pkt is too big */ + if (pkt->len > VIRTIO_VSOCK_MAX_PKT_BUF_SIZE) { + kfree(pkt); + return NULL; + } + + pkt->buf = kmalloc(pkt->len, GFP_KERNEL); + if (!pkt->buf) { + kfree(pkt); + return NULL; + } + + nbytes = copy_from_iter(pkt->buf, pkt->len, &iov_iter); + if (nbytes != pkt->len) { + vq_err(vq, "Expected %u byte payload, got %zu bytes\n", + pkt->len, nbytes); + virtio_transport_free_pkt(pkt); + return NULL; + } + + return pkt; +} + +/* Is there space left for replies to rx packets? 
*/ +static bool vhost_vsock_more_replies(struct vhost_vsock *vsock) +{ + struct vhost_virtqueue *vq = &vsock->vqs[VSOCK_VQ_TX]; + int val; + + smp_rmb(); /* paired with atomic_inc() and atomic_dec_return() */ + val = atomic_read(&vsock->queued_replies); + + return val < vq->num; +} + +static void vhost_vsock_handle_tx_kick(struct vhost_work *work) +{ + struct vhost_virtqueue *vq = container_of(work, struct vhost_virtqueue, + poll.work); + struct vhost_vsock *vsock = container_of(vq->dev, struct vhost_vsock, + dev); + struct virtio_vsock_pkt *pkt; + int head; + unsigned int out, in; + bool added = false; + + mutex_lock(&vq->mutex); + + if (!vq->private_data) + goto out; + + vhost_disable_notify(&vsock->dev, vq); + for (;;) { + if (!vhost_vsock_more_replies(vsock)) { + /* Stop tx until the device processes already + * pending replies. Leave tx virtqueue + * callbacks disabled. + */ + goto no_more_replies; + } + + head = vhost_get_vq_desc(vq, vq->iov, ARRAY_SIZE(vq->iov), + &out, &in, NULL, NULL); + if (head < 0) + break; + + if (head == vq->num) { + if (unlikely(vhost_enable_notify(&vsock->dev, vq))) { + vhost_disable_notify(&vsock->dev, vq); + continue; + } + break; + } + + pkt = vhost_vsock_alloc_pkt(vq, out, in); + if (!pkt) { + vq_err(vq, "Faulted on pkt\n"); + continue; + } + + /* Only accept correctly addressed packets */ + if (le64_to_cpu(pkt->hdr.src_cid) == vsock->guest_cid) + virtio_transport_recv_pkt(pkt); + else + virtio_transport_free_pkt(pkt); + + vhost_add_used(vq, head, sizeof(pkt->hdr) + pkt->len); + added = true; + } + +no_more_replies: + if (added) + vhost_signal(&vsock->dev, vq); + +out: + mutex_unlock(&vq->mutex); +} + +static void vhost_vsock_handle_rx_kick(struct vhost_work *work) +{ + struct vhost_virtqueue *vq = container_of(work, struct vhost_virtqueue, + poll.work); + struct vhost_vsock *vsock = container_of(vq->dev, struct vhost_vsock, + dev); + + vhost_transport_do_send_pkt(vsock, vq); +} + +static int vhost_vsock_start(struct vhost_vsock *vsock) +{ + size_t i; + int ret; + + mutex_lock(&vsock->dev.mutex); + + ret = vhost_dev_check_owner(&vsock->dev); + if (ret) + goto err; + + for (i = 0; i < ARRAY_SIZE(vsock->vqs); i++) { + struct vhost_virtqueue *vq = &vsock->vqs[i]; + + mutex_lock(&vq->mutex); + + if (!vhost_vq_access_ok(vq)) { + ret = -EFAULT; + mutex_unlock(&vq->mutex); + goto err_vq; + } + + if (!vq->private_data) { + vq->private_data = vsock; + vhost_vq_init_access(vq); + } + + mutex_unlock(&vq->mutex); + } + + mutex_unlock(&vsock->dev.mutex); + return 0; + +err_vq: + for (i = 0; i < ARRAY_SIZE(vsock->vqs); i++) { + struct vhost_virtqueue *vq = &vsock->vqs[i]; + + mutex_lock(&vq->mutex); + vq->private_data = NULL; + mutex_unlock(&vq->mutex); + } +err: + mutex_unlock(&vsock->dev.mutex); + return ret; +} + +static int vhost_vsock_stop(struct vhost_vsock *vsock) +{ + size_t i; + int ret; + + mutex_lock(&vsock->dev.mutex); + + ret = vhost_dev_check_owner(&vsock->dev); + if (ret) + goto err; + + for (i = 0; i < ARRAY_SIZE(vsock->vqs); i++) { + struct vhost_virtqueue *vq = &vsock->vqs[i]; + + mutex_lock(&vq->mutex); + vq->private_data = NULL; + mutex_unlock(&vq->mutex); + } + +err: + mutex_unlock(&vsock->dev.mutex); + return ret; +} + +static void vhost_vsock_free(struct vhost_vsock *vsock) +{ + if (is_vmalloc_addr(vsock)) + vfree(vsock); + else + kfree(vsock); +} + +static int vhost_vsock_dev_open(struct inode *inode, struct file *file) +{ + struct vhost_virtqueue **vqs; + struct vhost_vsock *vsock; + int ret; + + /* This struct is large and allocation could 
fail, fall back to vmalloc + * if there is no other way. + */ + vsock = kzalloc(sizeof(*vsock), GFP_KERNEL | __GFP_NOWARN | __GFP_REPEAT); + if (!vsock) { + vsock = vmalloc(sizeof(*vsock)); + if (!vsock) + return -ENOMEM; + } + + vqs = kmalloc_array(ARRAY_SIZE(vsock->vqs), sizeof(*vqs), GFP_KERNEL); + if (!vqs) { + ret = -ENOMEM; + goto out; + } + + atomic_set(&vsock->queued_replies, 0); + + vqs[VSOCK_VQ_TX] = &vsock->vqs[VSOCK_VQ_TX]; + vqs[VSOCK_VQ_RX] = &vsock->vqs[VSOCK_VQ_RX]; + vsock->vqs[VSOCK_VQ_TX].handle_kick = vhost_vsock_handle_tx_kick; + vsock->vqs[VSOCK_VQ_RX].handle_kick = vhost_vsock_handle_rx_kick; + + vhost_dev_init(&vsock->dev, vqs, ARRAY_SIZE(vsock->vqs)); + + file->private_data = vsock; + spin_lock_init(&vsock->send_pkt_list_lock); + INIT_LIST_HEAD(&vsock->send_pkt_list); + vhost_work_init(&vsock->send_pkt_work, vhost_transport_send_pkt_work); + + spin_lock_bh(&vhost_vsock_lock); + list_add_tail(&vsock->list, &vhost_vsock_list); + spin_unlock_bh(&vhost_vsock_lock); + return 0; + +out: + vhost_vsock_free(vsock); + return ret; +} + +static void vhost_vsock_flush(struct vhost_vsock *vsock) +{ + int i; + + for (i = 0; i < ARRAY_SIZE(vsock->vqs); i++) + if (vsock->vqs[i].handle_kick) + vhost_poll_flush(&vsock->vqs[i].poll); + vhost_work_flush(&vsock->dev, &vsock->send_pkt_work); +} + +static void vhost_vsock_reset_orphans(struct sock *sk) +{ + struct vsock_sock *vsk = vsock_sk(sk); + + /* vmci_transport.c doesn't take sk_lock here either. At least we're + * under vsock_table_lock so the sock cannot disappear while we're + * executing. + */ + + if (!vhost_vsock_get(vsk->local_addr.svm_cid)) { + sock_set_flag(sk, SOCK_DONE); + vsk->peer_shutdown = SHUTDOWN_MASK; + sk->sk_state = SS_UNCONNECTED; + sk->sk_err = ECONNRESET; + sk->sk_error_report(sk); + } +} + +static int vhost_vsock_dev_release(struct inode *inode, struct file *file) +{ + struct vhost_vsock *vsock = file->private_data; + + spin_lock_bh(&vhost_vsock_lock); + list_del(&vsock->list); + spin_unlock_bh(&vhost_vsock_lock); + + /* Iterating over all connections for all CIDs to find orphans is + * inefficient. Room for improvement here. 
*/ + vsock_for_each_connected_socket(vhost_vsock_reset_orphans); + + vhost_vsock_stop(vsock); + vhost_vsock_flush(vsock); + vhost_dev_stop(&vsock->dev); + + spin_lock_bh(&vsock->send_pkt_list_lock); + while (!list_empty(&vsock->send_pkt_list)) { + struct virtio_vsock_pkt *pkt; + + pkt = list_first_entry(&vsock->send_pkt_list, + struct virtio_vsock_pkt, list); + list_del_init(&pkt->list); + virtio_transport_free_pkt(pkt); + } + spin_unlock_bh(&vsock->send_pkt_list_lock); + + vhost_dev_cleanup(&vsock->dev, false); + kfree(vsock->dev.vqs); + vhost_vsock_free(vsock); + return 0; +} + +static int vhost_vsock_set_cid(struct vhost_vsock *vsock, u64 guest_cid) +{ + struct vhost_vsock *other; + + /* Refuse reserved CIDs */ + if (guest_cid <= VMADDR_CID_HOST || + guest_cid == U32_MAX) + return -EINVAL; + + /* 64-bit CIDs are not yet supported */ + if (guest_cid > U32_MAX) + return -EINVAL; + + /* Refuse if CID is already in use */ + other = vhost_vsock_get(guest_cid); + if (other && other != vsock) + return -EADDRINUSE; + + spin_lock_bh(&vhost_vsock_lock); + vsock->guest_cid = guest_cid; + spin_unlock_bh(&vhost_vsock_lock); + + return 0; +} + +static int vhost_vsock_set_features(struct vhost_vsock *vsock, u64 features) +{ + struct vhost_virtqueue *vq; + int i; + + if (features & ~VHOST_VSOCK_FEATURES) + return -EOPNOTSUPP; + + mutex_lock(&vsock->dev.mutex); + if ((features & (1 << VHOST_F_LOG_ALL)) && + !vhost_log_access_ok(&vsock->dev)) { + mutex_unlock(&vsock->dev.mutex); + return -EFAULT; + } + + for (i = 0; i < ARRAY_SIZE(vsock->vqs); i++) { + vq = &vsock->vqs[i]; + mutex_lock(&vq->mutex); + vq->acked_features = features; + mutex_unlock(&vq->mutex); + } + mutex_unlock(&vsock->dev.mutex); + return 0; +} + +static long vhost_vsock_dev_ioctl(struct file *f, unsigned int ioctl, + unsigned long arg) +{ + struct vhost_vsock *vsock = f->private_data; + void __user *argp = (void __user *)arg; + u64 guest_cid; + u64 features; + int start; + int r; + + switch (ioctl) { + case VHOST_VSOCK_SET_GUEST_CID: + if (copy_from_user(&guest_cid, argp, sizeof(guest_cid))) + return -EFAULT; + return vhost_vsock_set_cid(vsock, guest_cid); + case VHOST_VSOCK_SET_RUNNING: + if (copy_from_user(&start, argp, sizeof(start))) + return -EFAULT; + if (start) + return vhost_vsock_start(vsock); + else + return vhost_vsock_stop(vsock); + case VHOST_GET_FEATURES: + features = VHOST_VSOCK_FEATURES; + if (copy_to_user(argp, &features, sizeof(features))) + return -EFAULT; + return 0; + case VHOST_SET_FEATURES: + if (copy_from_user(&features, argp, sizeof(features))) + return -EFAULT; + return vhost_vsock_set_features(vsock, features); + default: + mutex_lock(&vsock->dev.mutex); + r = vhost_dev_ioctl(&vsock->dev, ioctl, argp); + if (r == -ENOIOCTLCMD) + r = vhost_vring_ioctl(&vsock->dev, ioctl, argp); + else + vhost_vsock_flush(vsock); + mutex_unlock(&vsock->dev.mutex); + return r; + } +} + +static const struct file_operations vhost_vsock_fops = { + .owner = THIS_MODULE, + .open = vhost_vsock_dev_open, + .release = vhost_vsock_dev_release, + .llseek = noop_llseek, + .unlocked_ioctl = vhost_vsock_dev_ioctl, +}; + +static struct miscdevice vhost_vsock_misc = { + .minor = MISC_DYNAMIC_MINOR, + .name = "vhost-vsock", + .fops = &vhost_vsock_fops, +}; + +static struct virtio_transport vhost_transport = { + .transport = { + .get_local_cid = vhost_transport_get_local_cid, + + .init = virtio_transport_do_socket_init, + .destruct = virtio_transport_destruct, + .release = virtio_transport_release, + .connect = virtio_transport_connect, + 
.shutdown = virtio_transport_shutdown, + + .dgram_enqueue = virtio_transport_dgram_enqueue, + .dgram_dequeue = virtio_transport_dgram_dequeue, + .dgram_bind = virtio_transport_dgram_bind, + .dgram_allow = virtio_transport_dgram_allow, + + .stream_enqueue = virtio_transport_stream_enqueue, + .stream_dequeue = virtio_transport_stream_dequeue, + .stream_has_data = virtio_transport_stream_has_data, + .stream_has_space = virtio_transport_stream_has_space, + .stream_rcvhiwat = virtio_transport_stream_rcvhiwat, + .stream_is_active = virtio_transport_stream_is_active, + .stream_allow = virtio_transport_stream_allow, + + .notify_poll_in = virtio_transport_notify_poll_in, + .notify_poll_out = virtio_transport_notify_poll_out, + .notify_recv_init = virtio_transport_notify_recv_init, + .notify_recv_pre_block = virtio_transport_notify_recv_pre_block, + .notify_recv_pre_dequeue = virtio_transport_notify_recv_pre_dequeue, + .notify_recv_post_dequeue = virtio_transport_notify_recv_post_dequeue, + .notify_send_init = virtio_transport_notify_send_init, + .notify_send_pre_block = virtio_transport_notify_send_pre_block, + .notify_send_pre_enqueue = virtio_transport_notify_send_pre_enqueue, + .notify_send_post_enqueue = virtio_transport_notify_send_post_enqueue, + + .set_buffer_size = virtio_transport_set_buffer_size, + .set_min_buffer_size = virtio_transport_set_min_buffer_size, + .set_max_buffer_size = virtio_transport_set_max_buffer_size, + .get_buffer_size = virtio_transport_get_buffer_size, + .get_min_buffer_size = virtio_transport_get_min_buffer_size, + .get_max_buffer_size = virtio_transport_get_max_buffer_size, + }, + + .send_pkt = vhost_transport_send_pkt, +}; + +static int __init vhost_vsock_init(void) +{ + int ret; + + ret = vsock_core_init(&vhost_transport.transport); + if (ret < 0) + return ret; + return misc_register(&vhost_vsock_misc); +}; + +static void __exit vhost_vsock_exit(void) +{ + misc_deregister(&vhost_vsock_misc); + vsock_core_exit(); +}; + +module_init(vhost_vsock_init); +module_exit(vhost_vsock_exit); +MODULE_LICENSE("GPL v2"); +MODULE_AUTHOR("Asias He"); +MODULE_DESCRIPTION("vhost transport for vsock "); diff --git a/include/uapi/linux/vhost.h b/include/uapi/linux/vhost.h index 61a8777178c6..c4400b267716 100644 --- a/include/uapi/linux/vhost.h +++ b/include/uapi/linux/vhost.h @@ -175,4 +175,9 @@ struct vhost_scsi_target { #define VHOST_SCSI_SET_EVENTS_MISSED _IOW(VHOST_VIRTIO, 0x43, __u32) #define VHOST_SCSI_GET_EVENTS_MISSED _IOW(VHOST_VIRTIO, 0x44, __u32) +/* VHOST_VSOCK specific defines */ + +#define VHOST_VSOCK_SET_GUEST_CID _IOW(VHOST_VIRTIO, 0x60, __u64) +#define VHOST_VSOCK_SET_RUNNING _IOW(VHOST_VIRTIO, 0x61, int) + #endif -- cgit v1.2.3-71-gd317 From 6b1e6cc7855b09a0a9bfa1d9f30172ba366f161c Mon Sep 17 00:00:00 2001 From: Jason Wang Date: Thu, 23 Jun 2016 02:04:32 -0400 Subject: vhost: new device IOTLB API This patch tries to implement an device IOTLB for vhost. This could be used with userspace(qemu) implementation of DMA remapping to emulate an IOMMU for the guest. The idea is simple, cache the translation in a software device IOTLB (which is implemented as an interval tree) in vhost and use vhost_net file descriptor for reporting IOTLB miss and IOTLB update/invalidation. When vhost meets an IOTLB miss, the fault address, size and access can be read from the file. After userspace finishes the translation, it writes the translated address to the vhost_net file to update the device IOTLB. 
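A minimal userspace-side sketch of that miss/update handshake, assuming the struct vhost_msg / vhost_iotlb_msg layout and the VHOST_IOTLB_* and VHOST_ACCESS_* constants this patch adds to include/uapi/linux/vhost.h; translate_iova() is a hypothetical stand-in for the VMM's own vIOMMU lookup and is not part of the kernel API:

#include <stdint.h>
#include <string.h>
#include <unistd.h>
#include <linux/vhost.h>

/* Hypothetical VMM helper: map an IO virtual address to a host virtual
 * address for the chosen size, honouring the requested access. */
extern uint64_t translate_iova(uint64_t iova, uint64_t size, uint8_t perm);

static int handle_iotlb_miss(int vhost_fd)
{
	struct vhost_msg msg, reply;

	/* The kernel only completes reads/writes of a full struct vhost_msg. */
	if (read(vhost_fd, &msg, sizeof(msg)) != sizeof(msg))
		return -1;

	if (msg.type != VHOST_IOTLB_MSG || msg.iotlb.type != VHOST_IOTLB_MISS)
		return 0;

	/* The miss carries iova and perm; the size of the region to insert
	 * is chosen by userspace (a single 4K page here, as an example). */
	memset(&reply, 0, sizeof(reply));
	reply.type = VHOST_IOTLB_MSG;
	reply.iotlb.type = VHOST_IOTLB_UPDATE;
	reply.iotlb.iova = msg.iotlb.iova;
	reply.iotlb.size = 4096;
	reply.iotlb.uaddr = translate_iova(msg.iotlb.iova, reply.iotlb.size,
					   msg.iotlb.perm);
	reply.iotlb.perm = VHOST_ACCESS_RW;

	/* Writing the update wakes the virtqueue that reported the miss. */
	if (write(vhost_fd, &reply, sizeof(reply)) != sizeof(reply))
		return -1;

	return 0;
}

A VHOST_IOTLB_INVALIDATE message uses the same layout (iova plus size) and is what userspace writes back when its vIOMMU mappings are torn down.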
When device IOTLB is enabled by setting VIRTIO_F_IOMMU_PLATFORM all vq addresses set by ioctl are treated as iova instead of virtual address and the accessing can only be done through IOTLB instead of direct userspace memory access. Before each round or vq processing, all vq metadata is prefetched in device IOTLB to make sure no translation fault happens during vq processing. In most cases, virtqueues are contiguous even in virtual address space. The IOTLB translation for virtqueue itself may make it a little slower. We might add fast path cache on top of this patch. Signed-off-by: Jason Wang [mst: use virtio feature bit: VHOST_F_DEVICE_IOTLB -> VIRTIO_F_IOMMU_PLATFORM ] [mst: fix build warnings ] Signed-off-by: Michael S. Tsirkin [ weiyj.lk: missing unlock on error ] Signed-off-by: Wei Yongjun --- drivers/vhost/net.c | 59 ++++- drivers/vhost/vhost.c | 636 ++++++++++++++++++++++++++++++++++++++++++--- drivers/vhost/vhost.h | 32 ++- include/uapi/linux/vhost.h | 28 ++ 4 files changed, 705 insertions(+), 50 deletions(-) (limited to 'include/uapi/linux') diff --git a/drivers/vhost/net.c b/drivers/vhost/net.c index a6b270aff9ef..0965f869dc57 100644 --- a/drivers/vhost/net.c +++ b/drivers/vhost/net.c @@ -61,7 +61,8 @@ MODULE_PARM_DESC(experimental_zcopytx, "Enable Zero Copy TX;" enum { VHOST_NET_FEATURES = VHOST_FEATURES | (1ULL << VHOST_NET_F_VIRTIO_NET_HDR) | - (1ULL << VIRTIO_NET_F_MRG_RXBUF) + (1ULL << VIRTIO_NET_F_MRG_RXBUF) | + (1ULL << VIRTIO_F_IOMMU_PLATFORM) }; enum { @@ -308,7 +309,7 @@ static int vhost_net_tx_get_vq_desc(struct vhost_net *net, { unsigned long uninitialized_var(endtime); int r = vhost_get_vq_desc(vq, vq->iov, ARRAY_SIZE(vq->iov), - out_num, in_num, NULL, NULL); + out_num, in_num, NULL, NULL); if (r == vq->num && vq->busyloop_timeout) { preempt_disable(); @@ -318,7 +319,7 @@ static int vhost_net_tx_get_vq_desc(struct vhost_net *net, cpu_relax_lowlatency(); preempt_enable(); r = vhost_get_vq_desc(vq, vq->iov, ARRAY_SIZE(vq->iov), - out_num, in_num, NULL, NULL); + out_num, in_num, NULL, NULL); } return r; @@ -351,6 +352,9 @@ static void handle_tx(struct vhost_net *net) if (!sock) goto out; + if (!vq_iotlb_prefetch(vq)) + goto out; + vhost_disable_notify(&net->dev, vq); hdr_size = nvq->vhost_hlen; @@ -612,6 +616,10 @@ static void handle_rx(struct vhost_net *net) sock = vq->private_data; if (!sock) goto out; + + if (!vq_iotlb_prefetch(vq)) + goto out; + vhost_disable_notify(&net->dev, vq); vhost_hlen = nvq->vhost_hlen; @@ -1080,10 +1088,14 @@ static int vhost_net_set_features(struct vhost_net *n, u64 features) } mutex_lock(&n->dev.mutex); if ((features & (1 << VHOST_F_LOG_ALL)) && - !vhost_log_access_ok(&n->dev)) { - mutex_unlock(&n->dev.mutex); - return -EFAULT; + !vhost_log_access_ok(&n->dev)) + goto out_unlock; + + if ((features & (1ULL << VIRTIO_F_IOMMU_PLATFORM))) { + if (vhost_init_device_iotlb(&n->dev, true)) + goto out_unlock; } + for (i = 0; i < VHOST_NET_VQ_MAX; ++i) { mutex_lock(&n->vqs[i].vq.mutex); n->vqs[i].vq.acked_features = features; @@ -1093,6 +1105,10 @@ static int vhost_net_set_features(struct vhost_net *n, u64 features) } mutex_unlock(&n->dev.mutex); return 0; + +out_unlock: + mutex_unlock(&n->dev.mutex); + return -EFAULT; } static long vhost_net_set_owner(struct vhost_net *n) @@ -1166,9 +1182,40 @@ static long vhost_net_compat_ioctl(struct file *f, unsigned int ioctl, } #endif +static ssize_t vhost_net_chr_read_iter(struct kiocb *iocb, struct iov_iter *to) +{ + struct file *file = iocb->ki_filp; + struct vhost_net *n = file->private_data; + struct 
vhost_dev *dev = &n->dev; + int noblock = file->f_flags & O_NONBLOCK; + + return vhost_chr_read_iter(dev, to, noblock); +} + +static ssize_t vhost_net_chr_write_iter(struct kiocb *iocb, + struct iov_iter *from) +{ + struct file *file = iocb->ki_filp; + struct vhost_net *n = file->private_data; + struct vhost_dev *dev = &n->dev; + + return vhost_chr_write_iter(dev, from); +} + +static unsigned int vhost_net_chr_poll(struct file *file, poll_table *wait) +{ + struct vhost_net *n = file->private_data; + struct vhost_dev *dev = &n->dev; + + return vhost_chr_poll(file, dev, wait); +} + static const struct file_operations vhost_net_fops = { .owner = THIS_MODULE, .release = vhost_net_release, + .read_iter = vhost_net_chr_read_iter, + .write_iter = vhost_net_chr_write_iter, + .poll = vhost_net_chr_poll, .unlocked_ioctl = vhost_net_ioctl, #ifdef CONFIG_COMPAT .compat_ioctl = vhost_net_compat_ioctl, diff --git a/drivers/vhost/vhost.c b/drivers/vhost/vhost.c index 8071f3638db9..d02c1614921f 100644 --- a/drivers/vhost/vhost.c +++ b/drivers/vhost/vhost.c @@ -35,6 +35,10 @@ static ushort max_mem_regions = 64; module_param(max_mem_regions, ushort, 0444); MODULE_PARM_DESC(max_mem_regions, "Maximum number of memory regions in memory map. (default: 64)"); +static int max_iotlb_entries = 2048; +module_param(max_iotlb_entries, int, 0444); +MODULE_PARM_DESC(max_iotlb_entries, + "Maximum number of iotlb entries. (default: 2048)"); enum { VHOST_MEMORY_F_LOG = 0x1, @@ -306,6 +310,7 @@ static void vhost_vq_reset(struct vhost_dev *dev, vhost_disable_cross_endian(vq); vq->busyloop_timeout = 0; vq->umem = NULL; + vq->iotlb = NULL; } static int vhost_worker(void *data) @@ -400,9 +405,14 @@ void vhost_dev_init(struct vhost_dev *dev, dev->log_ctx = NULL; dev->log_file = NULL; dev->umem = NULL; + dev->iotlb = NULL; dev->mm = NULL; dev->worker = NULL; init_llist_head(&dev->work_list); + init_waitqueue_head(&dev->wait); + INIT_LIST_HEAD(&dev->read_list); + INIT_LIST_HEAD(&dev->pending_list); + spin_lock_init(&dev->iotlb_lock); for (i = 0; i < dev->nvqs; ++i) { @@ -550,6 +560,15 @@ void vhost_dev_stop(struct vhost_dev *dev) } EXPORT_SYMBOL_GPL(vhost_dev_stop); +static void vhost_umem_free(struct vhost_umem *umem, + struct vhost_umem_node *node) +{ + vhost_umem_interval_tree_remove(node, &umem->umem_tree); + list_del(&node->link); + kfree(node); + umem->numem--; +} + static void vhost_umem_clean(struct vhost_umem *umem) { struct vhost_umem_node *node, *tmp; @@ -557,14 +576,31 @@ static void vhost_umem_clean(struct vhost_umem *umem) if (!umem) return; - list_for_each_entry_safe(node, tmp, &umem->umem_list, link) { - vhost_umem_interval_tree_remove(node, &umem->umem_tree); - list_del(&node->link); - kvfree(node); - } + list_for_each_entry_safe(node, tmp, &umem->umem_list, link) + vhost_umem_free(umem, node); + kvfree(umem); } +static void vhost_clear_msg(struct vhost_dev *dev) +{ + struct vhost_msg_node *node, *n; + + spin_lock(&dev->iotlb_lock); + + list_for_each_entry_safe(node, n, &dev->read_list, node) { + list_del(&node->node); + kfree(node); + } + + list_for_each_entry_safe(node, n, &dev->pending_list, node) { + list_del(&node->node); + kfree(node); + } + + spin_unlock(&dev->iotlb_lock); +} + /* Caller should have device mutex if and only if locked is set */ void vhost_dev_cleanup(struct vhost_dev *dev, bool locked) { @@ -593,6 +629,10 @@ void vhost_dev_cleanup(struct vhost_dev *dev, bool locked) /* No one will access memory at this point */ vhost_umem_clean(dev->umem); dev->umem = NULL; + vhost_umem_clean(dev->iotlb); + 
dev->iotlb = NULL; + vhost_clear_msg(dev); + wake_up_interruptible_poll(&dev->wait, POLLIN | POLLRDNORM); WARN_ON(!llist_empty(&dev->work_list)); if (dev->worker) { kthread_stop(dev->worker); @@ -668,28 +708,381 @@ static int memory_access_ok(struct vhost_dev *d, struct vhost_umem *umem, return 1; } -#define vhost_put_user(vq, x, ptr) __put_user(x, ptr) +static int translate_desc(struct vhost_virtqueue *vq, u64 addr, u32 len, + struct iovec iov[], int iov_size, int access); static int vhost_copy_to_user(struct vhost_virtqueue *vq, void *to, const void *from, unsigned size) { - return __copy_to_user(to, from, size); -} + int ret; -#define vhost_get_user(vq, x, ptr) __get_user(x, ptr) + if (!vq->iotlb) + return __copy_to_user(to, from, size); + else { + /* This function should be called after iotlb + * prefetch, which means we're sure that all vq + * could be access through iotlb. So -EAGAIN should + * not happen in this case. + */ + /* TODO: more fast path */ + struct iov_iter t; + ret = translate_desc(vq, (u64)(uintptr_t)to, size, vq->iotlb_iov, + ARRAY_SIZE(vq->iotlb_iov), + VHOST_ACCESS_WO); + if (ret < 0) + goto out; + iov_iter_init(&t, WRITE, vq->iotlb_iov, ret, size); + ret = copy_to_iter(from, size, &t); + if (ret == size) + ret = 0; + } +out: + return ret; +} static int vhost_copy_from_user(struct vhost_virtqueue *vq, void *to, void *from, unsigned size) { - return __copy_from_user(to, from, size); + int ret; + + if (!vq->iotlb) + return __copy_from_user(to, from, size); + else { + /* This function should be called after iotlb + * prefetch, which means we're sure that vq + * could be access through iotlb. So -EAGAIN should + * not happen in this case. + */ + /* TODO: more fast path */ + struct iov_iter f; + ret = translate_desc(vq, (u64)(uintptr_t)from, size, vq->iotlb_iov, + ARRAY_SIZE(vq->iotlb_iov), + VHOST_ACCESS_RO); + if (ret < 0) { + vq_err(vq, "IOTLB translation failure: uaddr " + "%p size 0x%llx\n", from, + (unsigned long long) size); + goto out; + } + iov_iter_init(&f, READ, vq->iotlb_iov, ret, size); + ret = copy_from_iter(to, size, &f); + if (ret == size) + ret = 0; + } + +out: + return ret; +} + +static void __user *__vhost_get_user(struct vhost_virtqueue *vq, + void *addr, unsigned size) +{ + int ret; + + /* This function should be called after iotlb + * prefetch, which means we're sure that vq + * could be access through iotlb. So -EAGAIN should + * not happen in this case. 
+ */ + /* TODO: more fast path */ + ret = translate_desc(vq, (u64)(uintptr_t)addr, size, vq->iotlb_iov, + ARRAY_SIZE(vq->iotlb_iov), + VHOST_ACCESS_RO); + if (ret < 0) { + vq_err(vq, "IOTLB translation failure: uaddr " + "%p size 0x%llx\n", addr, + (unsigned long long) size); + return NULL; + } + + if (ret != 1 || vq->iotlb_iov[0].iov_len != size) { + vq_err(vq, "Non atomic userspace memory access: uaddr " + "%p size 0x%llx\n", addr, + (unsigned long long) size); + return NULL; + } + + return vq->iotlb_iov[0].iov_base; +} + +#define vhost_put_user(vq, x, ptr) \ +({ \ + int ret = -EFAULT; \ + if (!vq->iotlb) { \ + ret = __put_user(x, ptr); \ + } else { \ + __typeof__(ptr) to = \ + (__typeof__(ptr)) __vhost_get_user(vq, ptr, sizeof(*ptr)); \ + if (to != NULL) \ + ret = __put_user(x, to); \ + else \ + ret = -EFAULT; \ + } \ + ret; \ +}) + +#define vhost_get_user(vq, x, ptr) \ +({ \ + int ret; \ + if (!vq->iotlb) { \ + ret = __get_user(x, ptr); \ + } else { \ + __typeof__(ptr) from = \ + (__typeof__(ptr)) __vhost_get_user(vq, ptr, sizeof(*ptr)); \ + if (from != NULL) \ + ret = __get_user(x, from); \ + else \ + ret = -EFAULT; \ + } \ + ret; \ +}) + +static void vhost_dev_lock_vqs(struct vhost_dev *d) +{ + int i = 0; + for (i = 0; i < d->nvqs; ++i) + mutex_lock(&d->vqs[i]->mutex); +} + +static void vhost_dev_unlock_vqs(struct vhost_dev *d) +{ + int i = 0; + for (i = 0; i < d->nvqs; ++i) + mutex_unlock(&d->vqs[i]->mutex); +} + +static int vhost_new_umem_range(struct vhost_umem *umem, + u64 start, u64 size, u64 end, + u64 userspace_addr, int perm) +{ + struct vhost_umem_node *tmp, *node = kmalloc(sizeof(*node), GFP_ATOMIC); + + if (!node) + return -ENOMEM; + + if (umem->numem == max_iotlb_entries) { + tmp = list_first_entry(&umem->umem_list, typeof(*tmp), link); + vhost_umem_free(umem, tmp); + } + + node->start = start; + node->size = size; + node->last = end; + node->userspace_addr = userspace_addr; + node->perm = perm; + INIT_LIST_HEAD(&node->link); + list_add_tail(&node->link, &umem->umem_list); + vhost_umem_interval_tree_insert(node, &umem->umem_tree); + umem->numem++; + + return 0; +} + +static void vhost_del_umem_range(struct vhost_umem *umem, + u64 start, u64 end) +{ + struct vhost_umem_node *node; + + while ((node = vhost_umem_interval_tree_iter_first(&umem->umem_tree, + start, end))) + vhost_umem_free(umem, node); +} + +static void vhost_iotlb_notify_vq(struct vhost_dev *d, + struct vhost_iotlb_msg *msg) +{ + struct vhost_msg_node *node, *n; + + spin_lock(&d->iotlb_lock); + + list_for_each_entry_safe(node, n, &d->pending_list, node) { + struct vhost_iotlb_msg *vq_msg = &node->msg.iotlb; + if (msg->iova <= vq_msg->iova && + msg->iova + msg->size - 1 > vq_msg->iova && + vq_msg->type == VHOST_IOTLB_MISS) { + vhost_poll_queue(&node->vq->poll); + list_del(&node->node); + kfree(node); + } + } + + spin_unlock(&d->iotlb_lock); +} + +static int umem_access_ok(u64 uaddr, u64 size, int access) +{ + unsigned long a = uaddr; + + if ((access & VHOST_ACCESS_RO) && + !access_ok(VERIFY_READ, (void __user *)a, size)) + return -EFAULT; + if ((access & VHOST_ACCESS_WO) && + !access_ok(VERIFY_WRITE, (void __user *)a, size)) + return -EFAULT; + return 0; +} + +int vhost_process_iotlb_msg(struct vhost_dev *dev, + struct vhost_iotlb_msg *msg) +{ + int ret = 0; + + vhost_dev_lock_vqs(dev); + switch (msg->type) { + case VHOST_IOTLB_UPDATE: + if (!dev->iotlb) { + ret = -EFAULT; + break; + } + if (umem_access_ok(msg->uaddr, msg->size, msg->perm)) { + ret = -EFAULT; + break; + } + if (vhost_new_umem_range(dev->iotlb, 
msg->iova, msg->size, + msg->iova + msg->size - 1, + msg->uaddr, msg->perm)) { + ret = -ENOMEM; + break; + } + vhost_iotlb_notify_vq(dev, msg); + break; + case VHOST_IOTLB_INVALIDATE: + vhost_del_umem_range(dev->iotlb, msg->iova, + msg->iova + msg->size - 1); + break; + default: + ret = -EINVAL; + break; + } + + vhost_dev_unlock_vqs(dev); + return ret; +} +ssize_t vhost_chr_write_iter(struct vhost_dev *dev, + struct iov_iter *from) +{ + struct vhost_msg_node node; + unsigned size = sizeof(struct vhost_msg); + size_t ret; + int err; + + if (iov_iter_count(from) < size) + return 0; + ret = copy_from_iter(&node.msg, size, from); + if (ret != size) + goto done; + + switch (node.msg.type) { + case VHOST_IOTLB_MSG: + err = vhost_process_iotlb_msg(dev, &node.msg.iotlb); + if (err) + ret = err; + break; + default: + ret = -EINVAL; + break; + } + +done: + return ret; +} +EXPORT_SYMBOL(vhost_chr_write_iter); + +unsigned int vhost_chr_poll(struct file *file, struct vhost_dev *dev, + poll_table *wait) +{ + unsigned int mask = 0; + + poll_wait(file, &dev->wait, wait); + + if (!list_empty(&dev->read_list)) + mask |= POLLIN | POLLRDNORM; + + return mask; +} +EXPORT_SYMBOL(vhost_chr_poll); + +ssize_t vhost_chr_read_iter(struct vhost_dev *dev, struct iov_iter *to, + int noblock) +{ + DEFINE_WAIT(wait); + struct vhost_msg_node *node; + ssize_t ret = 0; + unsigned size = sizeof(struct vhost_msg); + + if (iov_iter_count(to) < size) + return 0; + + while (1) { + if (!noblock) + prepare_to_wait(&dev->wait, &wait, + TASK_INTERRUPTIBLE); + + node = vhost_dequeue_msg(dev, &dev->read_list); + if (node) + break; + if (noblock) { + ret = -EAGAIN; + break; + } + if (signal_pending(current)) { + ret = -ERESTARTSYS; + break; + } + if (!dev->iotlb) { + ret = -EBADFD; + break; + } + + schedule(); + } + + if (!noblock) + finish_wait(&dev->wait, &wait); + + if (node) { + ret = copy_to_iter(&node->msg, size, to); + + if (ret != size || node->msg.type != VHOST_IOTLB_MISS) { + kfree(node); + return ret; + } + + vhost_enqueue_msg(dev, &dev->pending_list, node); + } + + return ret; +} +EXPORT_SYMBOL_GPL(vhost_chr_read_iter); + +static int vhost_iotlb_miss(struct vhost_virtqueue *vq, u64 iova, int access) +{ + struct vhost_dev *dev = vq->dev; + struct vhost_msg_node *node; + struct vhost_iotlb_msg *msg; + + node = vhost_new_msg(vq, VHOST_IOTLB_MISS); + if (!node) + return -ENOMEM; + + msg = &node->msg.iotlb; + msg->type = VHOST_IOTLB_MISS; + msg->iova = iova; + msg->perm = access; + + vhost_enqueue_msg(dev, &dev->read_list, node); + + return 0; } static int vq_access_ok(struct vhost_virtqueue *vq, unsigned int num, struct vring_desc __user *desc, struct vring_avail __user *avail, struct vring_used __user *used) + { size_t s = vhost_has_feature(vq, VIRTIO_RING_F_EVENT_IDX) ? 
2 : 0; + return access_ok(VERIFY_READ, desc, num * sizeof *desc) && access_ok(VERIFY_READ, avail, sizeof *avail + num * sizeof *avail->ring + s) && @@ -697,6 +1090,54 @@ static int vq_access_ok(struct vhost_virtqueue *vq, unsigned int num, sizeof *used + num * sizeof *used->ring + s); } +static int iotlb_access_ok(struct vhost_virtqueue *vq, + int access, u64 addr, u64 len) +{ + const struct vhost_umem_node *node; + struct vhost_umem *umem = vq->iotlb; + u64 s = 0, size; + + while (len > s) { + node = vhost_umem_interval_tree_iter_first(&umem->umem_tree, + addr, + addr + len - 1); + if (node == NULL || node->start > addr) { + vhost_iotlb_miss(vq, addr, access); + return false; + } else if (!(node->perm & access)) { + /* Report the possible access violation by + * request another translation from userspace. + */ + return false; + } + + size = node->size - addr + node->start; + s += size; + addr += size; + } + + return true; +} + +int vq_iotlb_prefetch(struct vhost_virtqueue *vq) +{ + size_t s = vhost_has_feature(vq, VIRTIO_RING_F_EVENT_IDX) ? 2 : 0; + unsigned int num = vq->num; + + if (!vq->iotlb) + return 1; + + return iotlb_access_ok(vq, VHOST_ACCESS_RO, (u64)(uintptr_t)vq->desc, + num * sizeof *vq->desc) && + iotlb_access_ok(vq, VHOST_ACCESS_RO, (u64)(uintptr_t)vq->avail, + sizeof *vq->avail + + num * sizeof *vq->avail->ring + s) && + iotlb_access_ok(vq, VHOST_ACCESS_WO, (u64)(uintptr_t)vq->used, + sizeof *vq->used + + num * sizeof *vq->used->ring + s); +} +EXPORT_SYMBOL_GPL(vq_iotlb_prefetch); + /* Can we log writes? */ /* Caller should have device mutex but not vq mutex */ int vhost_log_access_ok(struct vhost_dev *dev) @@ -723,16 +1164,35 @@ static int vq_log_access_ok(struct vhost_virtqueue *vq, /* Caller should have vq mutex and device mutex */ int vhost_vq_access_ok(struct vhost_virtqueue *vq) { + if (vq->iotlb) { + /* When device IOTLB was used, the access validation + * will be validated during prefetching. 
+ */ + return 1; + } return vq_access_ok(vq, vq->num, vq->desc, vq->avail, vq->used) && vq_log_access_ok(vq, vq->log_base); } EXPORT_SYMBOL_GPL(vhost_vq_access_ok); +static struct vhost_umem *vhost_umem_alloc(void) +{ + struct vhost_umem *umem = vhost_kvzalloc(sizeof(*umem)); + + if (!umem) + return NULL; + + umem->umem_tree = RB_ROOT; + umem->numem = 0; + INIT_LIST_HEAD(&umem->umem_list); + + return umem; +} + static long vhost_set_memory(struct vhost_dev *d, struct vhost_memory __user *m) { struct vhost_memory mem, *newmem; struct vhost_memory_region *region; - struct vhost_umem_node *node; struct vhost_umem *newumem, *oldumem; unsigned long size = offsetof(struct vhost_memory, regions); int i; @@ -754,28 +1214,23 @@ static long vhost_set_memory(struct vhost_dev *d, struct vhost_memory __user *m) return -EFAULT; } - newumem = vhost_kvzalloc(sizeof(*newumem)); + newumem = vhost_umem_alloc(); if (!newumem) { kvfree(newmem); return -ENOMEM; } - newumem->umem_tree = RB_ROOT; - INIT_LIST_HEAD(&newumem->umem_list); - for (region = newmem->regions; region < newmem->regions + mem.nregions; region++) { - node = vhost_kvzalloc(sizeof(*node)); - if (!node) + if (vhost_new_umem_range(newumem, + region->guest_phys_addr, + region->memory_size, + region->guest_phys_addr + + region->memory_size - 1, + region->userspace_addr, + VHOST_ACCESS_RW)) goto err; - node->start = region->guest_phys_addr; - node->size = region->memory_size; - node->last = node->start + node->size - 1; - node->userspace_addr = region->userspace_addr; - INIT_LIST_HEAD(&node->link); - list_add_tail(&node->link, &newumem->umem_list); - vhost_umem_interval_tree_insert(node, &newumem->umem_tree); } if (!memory_access_ok(d, newumem, 0)) @@ -1019,6 +1474,30 @@ long vhost_vring_ioctl(struct vhost_dev *d, int ioctl, void __user *argp) } EXPORT_SYMBOL_GPL(vhost_vring_ioctl); +int vhost_init_device_iotlb(struct vhost_dev *d, bool enabled) +{ + struct vhost_umem *niotlb, *oiotlb; + int i; + + niotlb = vhost_umem_alloc(); + if (!niotlb) + return -ENOMEM; + + oiotlb = d->iotlb; + d->iotlb = niotlb; + + for (i = 0; i < d->nvqs; ++i) { + mutex_lock(&d->vqs[i]->mutex); + d->vqs[i]->iotlb = niotlb; + mutex_unlock(&d->vqs[i]->mutex); + } + + vhost_umem_clean(oiotlb); + + return 0; +} +EXPORT_SYMBOL_GPL(vhost_init_device_iotlb); + /* Caller must have device mutex */ long vhost_dev_ioctl(struct vhost_dev *d, unsigned int ioctl, void __user *argp) { @@ -1233,15 +1712,20 @@ int vhost_vq_init_access(struct vhost_virtqueue *vq) if (r) goto err; vq->signalled_used_valid = false; - if (!access_ok(VERIFY_READ, &vq->used->idx, sizeof vq->used->idx)) { + if (!vq->iotlb && + !access_ok(VERIFY_READ, &vq->used->idx, sizeof vq->used->idx)) { r = -EFAULT; goto err; } r = vhost_get_user(vq, last_used_idx, &vq->used->idx); - if (r) + if (r) { + vq_err(vq, "Can't access used idx at %p\n", + &vq->used->idx); goto err; + } vq->last_used_idx = vhost16_to_cpu(vq, last_used_idx); return 0; + err: vq->is_le = is_le; return r; @@ -1249,10 +1733,11 @@ err: EXPORT_SYMBOL_GPL(vhost_vq_init_access); static int translate_desc(struct vhost_virtqueue *vq, u64 addr, u32 len, - struct iovec iov[], int iov_size) + struct iovec iov[], int iov_size, int access) { const struct vhost_umem_node *node; - struct vhost_umem *umem = vq->umem; + struct vhost_dev *dev = vq->dev; + struct vhost_umem *umem = dev->iotlb ? 
dev->iotlb : dev->umem; struct iovec *_iov; u64 s = 0; int ret = 0; @@ -1263,12 +1748,21 @@ static int translate_desc(struct vhost_virtqueue *vq, u64 addr, u32 len, ret = -ENOBUFS; break; } + node = vhost_umem_interval_tree_iter_first(&umem->umem_tree, addr, addr + len - 1); if (node == NULL || node->start > addr) { - ret = -EFAULT; + if (umem != dev->iotlb) { + ret = -EFAULT; + break; + } + ret = -EAGAIN; + break; + } else if (!(node->perm & access)) { + ret = -EPERM; break; } + _iov = iov + ret; size = node->size - addr + node->start; _iov->iov_len = min((u64)len - s, size); @@ -1279,6 +1773,8 @@ static int translate_desc(struct vhost_virtqueue *vq, u64 addr, u32 len, ++ret; } + if (ret == -EAGAIN) + vhost_iotlb_miss(vq, addr, access); return ret; } @@ -1313,7 +1809,7 @@ static int get_indirect(struct vhost_virtqueue *vq, unsigned int i = 0, count, found = 0; u32 len = vhost32_to_cpu(vq, indirect->len); struct iov_iter from; - int ret; + int ret, access; /* Sanity check */ if (unlikely(len % sizeof desc)) { @@ -1325,9 +1821,10 @@ static int get_indirect(struct vhost_virtqueue *vq, } ret = translate_desc(vq, vhost64_to_cpu(vq, indirect->addr), len, vq->indirect, - UIO_MAXIOV); + UIO_MAXIOV, VHOST_ACCESS_RO); if (unlikely(ret < 0)) { - vq_err(vq, "Translation failure %d in indirect.\n", ret); + if (ret != -EAGAIN) + vq_err(vq, "Translation failure %d in indirect.\n", ret); return ret; } iov_iter_init(&from, READ, vq->indirect, ret, len); @@ -1365,16 +1862,22 @@ static int get_indirect(struct vhost_virtqueue *vq, return -EINVAL; } + if (desc.flags & cpu_to_vhost16(vq, VRING_DESC_F_WRITE)) + access = VHOST_ACCESS_WO; + else + access = VHOST_ACCESS_RO; + ret = translate_desc(vq, vhost64_to_cpu(vq, desc.addr), vhost32_to_cpu(vq, desc.len), iov + iov_count, - iov_size - iov_count); + iov_size - iov_count, access); if (unlikely(ret < 0)) { - vq_err(vq, "Translation failure %d indirect idx %d\n", - ret, i); + if (ret != -EAGAIN) + vq_err(vq, "Translation failure %d indirect idx %d\n", + ret, i); return ret; } /* If this is an input descriptor, increment that count. */ - if (desc.flags & cpu_to_vhost16(vq, VRING_DESC_F_WRITE)) { + if (access == VHOST_ACCESS_WO) { *in_num += ret; if (unlikely(log)) { log[*log_num].addr = vhost64_to_cpu(vq, desc.addr); @@ -1413,7 +1916,7 @@ int vhost_get_vq_desc(struct vhost_virtqueue *vq, u16 last_avail_idx; __virtio16 avail_idx; __virtio16 ring_head; - int ret; + int ret, access; /* Check it isn't doing very strange things with descriptor numbers. */ last_avail_idx = vq->last_avail_idx; @@ -1487,22 +1990,28 @@ int vhost_get_vq_desc(struct vhost_virtqueue *vq, out_num, in_num, log, log_num, &desc); if (unlikely(ret < 0)) { - vq_err(vq, "Failure detected " - "in indirect descriptor at idx %d\n", i); + if (ret != -EAGAIN) + vq_err(vq, "Failure detected " + "in indirect descriptor at idx %d\n", i); return ret; } continue; } + if (desc.flags & cpu_to_vhost16(vq, VRING_DESC_F_WRITE)) + access = VHOST_ACCESS_WO; + else + access = VHOST_ACCESS_RO; ret = translate_desc(vq, vhost64_to_cpu(vq, desc.addr), vhost32_to_cpu(vq, desc.len), iov + iov_count, - iov_size - iov_count); + iov_size - iov_count, access); if (unlikely(ret < 0)) { - vq_err(vq, "Translation failure %d descriptor idx %d\n", - ret, i); + if (ret != -EAGAIN) + vq_err(vq, "Translation failure %d descriptor idx %d\n", + ret, i); return ret; } - if (desc.flags & cpu_to_vhost16(vq, VRING_DESC_F_WRITE)) { + if (access == VHOST_ACCESS_WO) { /* If this is an input descriptor, * increment that count. 
*/ *in_num += ret; @@ -1768,6 +2277,47 @@ void vhost_disable_notify(struct vhost_dev *dev, struct vhost_virtqueue *vq) } EXPORT_SYMBOL_GPL(vhost_disable_notify); +/* Create a new message. */ +struct vhost_msg_node *vhost_new_msg(struct vhost_virtqueue *vq, int type) +{ + struct vhost_msg_node *node = kmalloc(sizeof *node, GFP_KERNEL); + if (!node) + return NULL; + node->vq = vq; + node->msg.type = type; + return node; +} +EXPORT_SYMBOL_GPL(vhost_new_msg); + +void vhost_enqueue_msg(struct vhost_dev *dev, struct list_head *head, + struct vhost_msg_node *node) +{ + spin_lock(&dev->iotlb_lock); + list_add_tail(&node->node, head); + spin_unlock(&dev->iotlb_lock); + + wake_up_interruptible_poll(&dev->wait, POLLIN | POLLRDNORM); +} +EXPORT_SYMBOL_GPL(vhost_enqueue_msg); + +struct vhost_msg_node *vhost_dequeue_msg(struct vhost_dev *dev, + struct list_head *head) +{ + struct vhost_msg_node *node = NULL; + + spin_lock(&dev->iotlb_lock); + if (!list_empty(head)) { + node = list_first_entry(head, struct vhost_msg_node, + node); + list_del(&node->node); + } + spin_unlock(&dev->iotlb_lock); + + return node; +} +EXPORT_SYMBOL_GPL(vhost_dequeue_msg); + + static int __init vhost_init(void) { return 0; diff --git a/drivers/vhost/vhost.h b/drivers/vhost/vhost.h index eaaf6df72218..78f3c5fc02e4 100644 --- a/drivers/vhost/vhost.h +++ b/drivers/vhost/vhost.h @@ -65,13 +65,15 @@ struct vhost_umem_node { __u64 last; __u64 size; __u64 userspace_addr; - __u64 flags_padding; + __u32 perm; + __u32 flags_padding; __u64 __subtree_last; }; struct vhost_umem { struct rb_root umem_tree; struct list_head umem_list; + int numem; }; /* The virtqueue structure describes a queue attached to a device. */ @@ -119,10 +121,12 @@ struct vhost_virtqueue { u64 log_addr; struct iovec iov[UIO_MAXIOV]; + struct iovec iotlb_iov[64]; struct iovec *indirect; struct vring_used_elem *heads; /* Protected by virtqueue mutex. */ struct vhost_umem *umem; + struct vhost_umem *iotlb; void *private_data; u64 acked_features; /* Log write descriptors */ @@ -139,6 +143,12 @@ struct vhost_virtqueue { u32 busyloop_timeout; }; +struct vhost_msg_node { + struct vhost_msg msg; + struct vhost_virtqueue *vq; + struct list_head node; +}; + struct vhost_dev { struct mm_struct *mm; struct mutex mutex; @@ -149,6 +159,11 @@ struct vhost_dev { struct llist_head work_list; struct task_struct *worker; struct vhost_umem *umem; + struct vhost_umem *iotlb; + spinlock_t iotlb_lock; + struct list_head read_list; + struct list_head pending_list; + wait_queue_head_t wait; }; void vhost_dev_init(struct vhost_dev *, struct vhost_virtqueue **vqs, int nvqs); @@ -185,6 +200,21 @@ bool vhost_enable_notify(struct vhost_dev *, struct vhost_virtqueue *); int vhost_log_write(struct vhost_virtqueue *vq, struct vhost_log *log, unsigned int log_num, u64 len); +int vq_iotlb_prefetch(struct vhost_virtqueue *vq); + +struct vhost_msg_node *vhost_new_msg(struct vhost_virtqueue *vq, int type); +void vhost_enqueue_msg(struct vhost_dev *dev, + struct list_head *head, + struct vhost_msg_node *node); +struct vhost_msg_node *vhost_dequeue_msg(struct vhost_dev *dev, + struct list_head *head); +unsigned int vhost_chr_poll(struct file *file, struct vhost_dev *dev, + poll_table *wait); +ssize_t vhost_chr_read_iter(struct vhost_dev *dev, struct iov_iter *to, + int noblock); +ssize_t vhost_chr_write_iter(struct vhost_dev *dev, + struct iov_iter *from); +int vhost_init_device_iotlb(struct vhost_dev *d, bool enabled); #define vq_err(vq, fmt, ...) 
do { \ pr_debug(pr_fmt(fmt), ##__VA_ARGS__); \ diff --git a/include/uapi/linux/vhost.h b/include/uapi/linux/vhost.h index c4400b267716..56b7ab584cc0 100644 --- a/include/uapi/linux/vhost.h +++ b/include/uapi/linux/vhost.h @@ -47,6 +47,32 @@ struct vhost_vring_addr { __u64 log_guest_addr; }; +/* no alignment requirement */ +struct vhost_iotlb_msg { + __u64 iova; + __u64 size; + __u64 uaddr; +#define VHOST_ACCESS_RO 0x1 +#define VHOST_ACCESS_WO 0x2 +#define VHOST_ACCESS_RW 0x3 + __u8 perm; +#define VHOST_IOTLB_MISS 1 +#define VHOST_IOTLB_UPDATE 2 +#define VHOST_IOTLB_INVALIDATE 3 +#define VHOST_IOTLB_ACCESS_FAIL 4 + __u8 type; +}; + +#define VHOST_IOTLB_MSG 0x1 + +struct vhost_msg { + int type; + union { + struct vhost_iotlb_msg iotlb; + __u8 padding[64]; + }; +}; + struct vhost_memory_region { __u64 guest_phys_addr; __u64 memory_size; /* bytes */ @@ -146,6 +172,8 @@ struct vhost_memory { #define VHOST_F_LOG_ALL 26 /* vhost-net should add virtio_net_hdr for RX, and strip for TX packets. */ #define VHOST_NET_F_VIRTIO_NET_HDR 27 +/* Vhost have device IOTLB */ +#define VHOST_F_DEVICE_IOTLB 63 /* VHOST_SCSI specific definitions */ -- cgit v1.2.3-71-gd317 From db3f60012482756f46cc4d7d9ad7d793ae30360c Mon Sep 17 00:00:00 2001 From: Alexey Dobriyan Date: Tue, 2 Aug 2016 14:03:36 -0700 Subject: uapi: move forward declarations of internal structures Don't use forward declarations of internal kernel structures in headers exported to userspace. Move "struct completion;". Move "struct task_struct;". Link: http://lkml.kernel.org/r/20160713215808.GA22486@p183.telecom.by Signed-off-by: Alexey Dobriyan Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/capability.h | 1 + include/linux/sysctl.h | 1 + include/uapi/linux/capability.h | 2 -- include/uapi/linux/sysctl.h | 2 -- 4 files changed, 2 insertions(+), 4 deletions(-) (limited to 'include/uapi/linux') diff --git a/include/linux/capability.h b/include/linux/capability.h index 5f3c63dde2d5..dbc21c719ce6 100644 --- a/include/linux/capability.h +++ b/include/linux/capability.h @@ -38,6 +38,7 @@ struct cpu_vfs_cap_data { struct file; struct inode; struct dentry; +struct task_struct; struct user_namespace; extern const kernel_cap_t __cap_empty_set; diff --git a/include/linux/sysctl.h b/include/linux/sysctl.h index fa7bc29925c9..697e160c78d0 100644 --- a/include/linux/sysctl.h +++ b/include/linux/sysctl.h @@ -28,6 +28,7 @@ #include /* For the /proc/sys support */ +struct completion; struct ctl_table; struct nsproxy; struct ctl_table_root; diff --git a/include/uapi/linux/capability.h b/include/uapi/linux/capability.h index 12c37a197d24..49bc06295398 100644 --- a/include/uapi/linux/capability.h +++ b/include/uapi/linux/capability.h @@ -15,8 +15,6 @@ #include -struct task_struct; - /* User-level do most of the mapping between kernel and user capabilities based on the version tag given by the kernel. The kernel might be somewhat backwards compatible, but don't bet on diff --git a/include/uapi/linux/sysctl.h b/include/uapi/linux/sysctl.h index 0956373b56db..d2b12152e358 100644 --- a/include/uapi/linux/sysctl.h +++ b/include/uapi/linux/sysctl.h @@ -26,8 +26,6 @@ #include #include -struct completion; - #define CTL_MAXNAME 10 /* how many path components do we allow in a call to sysctl?
In other words, what is the largest acceptable value for the nlen -- cgit v1.2.3-71-gd317 From e63e88bc53bac7e4c3f592f8126c51a7569be673 Mon Sep 17 00:00:00 2001 From: Ryusuke Konishi Date: Tue, 2 Aug 2016 14:05:30 -0700 Subject: nilfs2: move ioctl interface and disk layout to uapi separately The header file "include/linux/nilfs2_fs.h" is composed of parts for ioctl and disk format, and both are intended to be shared with user space programs. This moves them to the uapi directory "include/uapi/linux", splitting the file into "nilfs2_api.h" and "nilfs2_ondisk.h". The following minor changes accompany this migration: - nilfs_direct_node struct in nilfs2/direct.h is merged into nilfs2_ondisk.h because it's an on-disk structure. - inline functions nilfs_rec_len_from_disk() and nilfs_rec_len_to_disk() are moved to nilfs2/dir.c. Link: http://lkml.kernel.org/r/1465825507-3407-4-git-send-email-konishi.ryusuke@lab.ntt.co.jp Signed-off-by: Ryusuke Konishi Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- Documentation/filesystems/nilfs2.txt | 3 +- Documentation/ioctl/ioctl-number.txt | 2 +- MAINTAINERS | 3 +- fs/nilfs2/bmap.h | 2 +- fs/nilfs2/btree.h | 2 +- fs/nilfs2/cpfile.c | 1 - fs/nilfs2/cpfile.h | 3 +- fs/nilfs2/dat.h | 1 + fs/nilfs2/dir.c | 22 + fs/nilfs2/direct.h | 10 - fs/nilfs2/ifile.h | 1 - fs/nilfs2/ioctl.c | 1 - fs/nilfs2/nilfs.h | 3 +- fs/nilfs2/segment.h | 1 - fs/nilfs2/sufile.c | 1 - fs/nilfs2/sufile.h | 1 - include/linux/nilfs2_fs.h | 934 ----------------------------------- include/uapi/linux/nilfs2_api.h | 292 +++++++++++ include/uapi/linux/nilfs2_ondisk.h | 650 ++++++++++++++++++++++++ 19 files changed, 976 insertions(+), 957 deletions(-) delete mode 100644 include/linux/nilfs2_fs.h create mode 100644 include/uapi/linux/nilfs2_api.h create mode 100644 include/uapi/linux/nilfs2_ondisk.h (limited to 'include/uapi/linux') diff --git a/Documentation/filesystems/nilfs2.txt b/Documentation/filesystems/nilfs2.txt index 5b21ef76f751..c0727dc36271 100644 --- a/Documentation/filesystems/nilfs2.txt +++ b/Documentation/filesystems/nilfs2.txt @@ -267,7 +267,8 @@ among NILFS2 files can be depicted as follows: `-- file (ino=yy) ( regular file, directory, or symlink ) -For detail on the format of each file, please see include/linux/nilfs2_fs.h. +For detail on the format of each file, please see nilfs2_ondisk.h +located at include/uapi/linux directory. There are no patents or other intellectual property that we protect with regard to the design of NILFS2. It is allowed to replicate the diff --git a/Documentation/ioctl/ioctl-number.txt b/Documentation/ioctl/ioctl-number.txt index 56af5e43e9c0..81c7f2bb7daf 100644 --- a/Documentation/ioctl/ioctl-number.txt +++ b/Documentation/ioctl/ioctl-number.txt @@ -248,7 +248,7 @@ Code Seq#(hex) Include File Comments 'm' 00 drivers/scsi/megaraid/megaraid_ioctl.h conflict! 'm' 00-1F net/irda/irmod.h conflict! 'n' 00-7F linux/ncp_fs.h and fs/ncpfs/ioctl.c -'n' 80-8F linux/nilfs2_fs.h NILFS2 +'n' 80-8F uapi/linux/nilfs2_api.h NILFS2 'n' E0-FF linux/matroxfb.h matroxfb 'o' 00-1F fs/ocfs2/ocfs2_fs.h OCFS2 'o' 00-03 mtd/ubi-user.h conflict!
(OCFS2 and UBI overlaps) diff --git a/MAINTAINERS b/MAINTAINERS index bb51bbbc9e1d..e9eacacf0f08 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -8258,8 +8258,9 @@ T: git git://github.com/konis/nilfs2.git S: Supported F: Documentation/filesystems/nilfs2.txt F: fs/nilfs2/ -F: include/linux/nilfs2_fs.h F: include/trace/events/nilfs2.h +F: include/uapi/linux/nilfs2_api.h +F: include/uapi/linux/nilfs2_ondisk.h NINJA SCSI-3 / NINJA SCSI-32Bi (16bit/CardBus) PCMCIA SCSI HOST ADAPTER DRIVER M: YOKOTA Hiroshi diff --git a/fs/nilfs2/bmap.h b/fs/nilfs2/bmap.h index b6a4c8f93ac8..2b6ffbe5997a 100644 --- a/fs/nilfs2/bmap.h +++ b/fs/nilfs2/bmap.h @@ -22,7 +22,7 @@ #include #include #include -#include +#include /* nilfs_binfo, nilfs_inode, etc */ #include "alloc.h" #include "dat.h" diff --git a/fs/nilfs2/btree.h b/fs/nilfs2/btree.h index df1a25faa83b..2184e47fa4bf 100644 --- a/fs/nilfs2/btree.h +++ b/fs/nilfs2/btree.h @@ -22,7 +22,7 @@ #include #include #include -#include +#include /* nilfs_btree_node */ #include "btnode.h" #include "bmap.h" diff --git a/fs/nilfs2/cpfile.c b/fs/nilfs2/cpfile.c index 19d9f4ae8347..a15a1601e931 100644 --- a/fs/nilfs2/cpfile.c +++ b/fs/nilfs2/cpfile.c @@ -21,7 +21,6 @@ #include #include #include -#include #include "mdt.h" #include "cpfile.h" diff --git a/fs/nilfs2/cpfile.h b/fs/nilfs2/cpfile.h index 0249744ae234..6eca972f9673 100644 --- a/fs/nilfs2/cpfile.h +++ b/fs/nilfs2/cpfile.h @@ -21,7 +21,8 @@ #include #include -#include +#include /* nilfs_cpstat */ +#include /* nilfs_inode, nilfs_checkpoint */ int nilfs_cpfile_get_checkpoint(struct inode *, __u64, int, diff --git a/fs/nilfs2/dat.h b/fs/nilfs2/dat.h index abbfdabcabea..57dc6cf466d0 100644 --- a/fs/nilfs2/dat.h +++ b/fs/nilfs2/dat.h @@ -22,6 +22,7 @@ #include #include #include +#include /* nilfs_inode, nilfs_checkpoint */ struct nilfs_palloc_req; diff --git a/fs/nilfs2/dir.c b/fs/nilfs2/dir.c index 746956d2937a..908ebbf0ac7e 100644 --- a/fs/nilfs2/dir.c +++ b/fs/nilfs2/dir.c @@ -42,6 +42,28 @@ #include "nilfs.h" #include "page.h" +static inline unsigned int nilfs_rec_len_from_disk(__le16 dlen) +{ + unsigned int len = le16_to_cpu(dlen); + +#if (PAGE_SIZE >= 65536) + if (len == NILFS_MAX_REC_LEN) + return 1 << 16; +#endif + return len; +} + +static inline __le16 nilfs_rec_len_to_disk(unsigned int len) +{ +#if (PAGE_SIZE >= 65536) + if (len == (1 << 16)) + return cpu_to_le16(NILFS_MAX_REC_LEN); + + BUG_ON(len > (1 << 16)); +#endif + return cpu_to_le16(len); +} + /* * nilfs uses block-sized chunks. 
Arguably, sector-sized ones would be * more robust, but we have what we have diff --git a/fs/nilfs2/direct.h b/fs/nilfs2/direct.h index 3015a6e78724..cfe85e848bba 100644 --- a/fs/nilfs2/direct.h +++ b/fs/nilfs2/direct.h @@ -24,16 +24,6 @@ #include "bmap.h" -/** - * struct nilfs_direct_node - direct node - * @dn_flags: flags - * @dn_pad: padding - */ -struct nilfs_direct_node { - __u8 dn_flags; - __u8 pad[7]; -}; - #define NILFS_DIRECT_NBLOCKS (NILFS_BMAP_SIZE / sizeof(__le64) - 1) #define NILFS_DIRECT_KEY_MIN 0 #define NILFS_DIRECT_KEY_MAX (NILFS_DIRECT_NBLOCKS - 1) diff --git a/fs/nilfs2/ifile.h b/fs/nilfs2/ifile.h index 23ad2f091e76..188b94fe0ec5 100644 --- a/fs/nilfs2/ifile.h +++ b/fs/nilfs2/ifile.h @@ -23,7 +23,6 @@ #include #include -#include #include "mdt.h" #include "alloc.h" diff --git a/fs/nilfs2/ioctl.c b/fs/nilfs2/ioctl.c index 827283fe9525..f1d7989459fd 100644 --- a/fs/nilfs2/ioctl.c +++ b/fs/nilfs2/ioctl.c @@ -25,7 +25,6 @@ #include /* compat_ptr() */ #include /* mnt_want_write_file(), mnt_drop_write_file() */ #include -#include #include "nilfs.h" #include "segment.h" #include "bmap.h" diff --git a/fs/nilfs2/nilfs.h b/fs/nilfs2/nilfs.h index 2ba8a146af1f..33f8c8fc96e8 100644 --- a/fs/nilfs2/nilfs.h +++ b/fs/nilfs2/nilfs.h @@ -23,7 +23,8 @@ #include #include #include -#include +#include +#include #include "the_nilfs.h" #include "bmap.h" diff --git a/fs/nilfs2/segment.h b/fs/nilfs2/segment.h index 6565c10b7b76..1060949d7dd2 100644 --- a/fs/nilfs2/segment.h +++ b/fs/nilfs2/segment.h @@ -23,7 +23,6 @@ #include #include #include -#include #include "nilfs.h" struct nilfs_root; diff --git a/fs/nilfs2/sufile.c b/fs/nilfs2/sufile.c index 12d11de93602..1541a1e9221a 100644 --- a/fs/nilfs2/sufile.c +++ b/fs/nilfs2/sufile.c @@ -22,7 +22,6 @@ #include #include #include -#include #include "mdt.h" #include "sufile.h" diff --git a/fs/nilfs2/sufile.h b/fs/nilfs2/sufile.h index 46e89872294c..158a9190c8ec 100644 --- a/fs/nilfs2/sufile.h +++ b/fs/nilfs2/sufile.h @@ -21,7 +21,6 @@ #include #include -#include #include "mdt.h" diff --git a/include/linux/nilfs2_fs.h b/include/linux/nilfs2_fs.h deleted file mode 100644 index 5988dd57ba66..000000000000 --- a/include/linux/nilfs2_fs.h +++ /dev/null @@ -1,934 +0,0 @@ -/* - * nilfs2_fs.h - NILFS2 on-disk structures and common declarations. - * - * Copyright (C) 2005-2008 Nippon Telegraph and Telephone Corporation. - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU Lesser General Public License as published - * by the Free Software Foundation; either version 2.1 of the License, or - * (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU Lesser General Public License for more details. - * - * Written by Koji Sato and Ryusuke Konishi. 
- */ -/* - * linux/include/linux/ext2_fs.h - * - * Copyright (C) 1992, 1993, 1994, 1995 - * Remy Card (card@masi.ibp.fr) - * Laboratoire MASI - Institut Blaise Pascal - * Universite Pierre et Marie Curie (Paris VI) - * - * from - * - * linux/include/linux/minix_fs.h - * - * Copyright (C) 1991, 1992 Linus Torvalds - */ - -#ifndef _LINUX_NILFS_FS_H -#define _LINUX_NILFS_FS_H - -#include -#include -#include -#include - - -#define NILFS_INODE_BMAP_SIZE 7 -/** - * struct nilfs_inode - structure of an inode on disk - * @i_blocks: blocks count - * @i_size: size in bytes - * @i_ctime: creation time (seconds) - * @i_mtime: modification time (seconds) - * @i_ctime_nsec: creation time (nano seconds) - * @i_mtime_nsec: modification time (nano seconds) - * @i_uid: user id - * @i_gid: group id - * @i_mode: file mode - * @i_links_count: links count - * @i_flags: file flags - * @i_bmap: block mapping - * @i_xattr: extended attributes - * @i_generation: file generation (for NFS) - * @i_pad: padding - */ -struct nilfs_inode { - __le64 i_blocks; - __le64 i_size; - __le64 i_ctime; - __le64 i_mtime; - __le32 i_ctime_nsec; - __le32 i_mtime_nsec; - __le32 i_uid; - __le32 i_gid; - __le16 i_mode; - __le16 i_links_count; - __le32 i_flags; - __le64 i_bmap[NILFS_INODE_BMAP_SIZE]; -#define i_device_code i_bmap[0] - __le64 i_xattr; - __le32 i_generation; - __le32 i_pad; -}; - -#define NILFS_MIN_INODE_SIZE 128 - -/** - * struct nilfs_super_root - structure of super root - * @sr_sum: check sum - * @sr_bytes: byte count of the structure - * @sr_flags: flags (reserved) - * @sr_nongc_ctime: write time of the last segment not for cleaner operation - * @sr_dat: DAT file inode - * @sr_cpfile: checkpoint file inode - * @sr_sufile: segment usage file inode - */ -struct nilfs_super_root { - __le32 sr_sum; - __le16 sr_bytes; - __le16 sr_flags; - __le64 sr_nongc_ctime; - struct nilfs_inode sr_dat; - struct nilfs_inode sr_cpfile; - struct nilfs_inode sr_sufile; -}; - -#define NILFS_SR_MDT_OFFSET(inode_size, i) \ - ((unsigned long)&((struct nilfs_super_root *)0)->sr_dat + \ - (inode_size) * (i)) -#define NILFS_SR_DAT_OFFSET(inode_size) NILFS_SR_MDT_OFFSET(inode_size, 0) -#define NILFS_SR_CPFILE_OFFSET(inode_size) NILFS_SR_MDT_OFFSET(inode_size, 1) -#define NILFS_SR_SUFILE_OFFSET(inode_size) NILFS_SR_MDT_OFFSET(inode_size, 2) -#define NILFS_SR_BYTES(inode_size) NILFS_SR_MDT_OFFSET(inode_size, 3) - -/* - * Maximal mount counts - */ -#define NILFS_DFL_MAX_MNT_COUNT 50 /* 50 mounts */ - -/* - * File system states (sbp->s_state, nilfs->ns_mount_state) - */ -#define NILFS_VALID_FS 0x0001 /* Unmounted cleanly */ -#define NILFS_ERROR_FS 0x0002 /* Errors detected */ -#define NILFS_RESIZE_FS 0x0004 /* Resize required */ - -/* - * Mount flags (sbi->s_mount_opt) - */ -#define NILFS_MOUNT_ERROR_MODE 0x0070 /* Error mode mask */ -#define NILFS_MOUNT_ERRORS_CONT 0x0010 /* Continue on errors */ -#define NILFS_MOUNT_ERRORS_RO 0x0020 /* Remount fs ro on errors */ -#define NILFS_MOUNT_ERRORS_PANIC 0x0040 /* Panic on errors */ -#define NILFS_MOUNT_BARRIER 0x1000 /* Use block barriers */ -#define NILFS_MOUNT_STRICT_ORDER 0x2000 /* - * Apply strict in-order - * semantics also for data - */ -#define NILFS_MOUNT_NORECOVERY 0x4000 /* - * Disable write access during - * mount-time recovery - */ -#define NILFS_MOUNT_DISCARD 0x8000 /* Issue DISCARD requests */ - - -/** - * struct nilfs_super_block - structure of super block on disk - */ -struct nilfs_super_block { -/*00*/ __le32 s_rev_level; /* Revision level */ - __le16 s_minor_rev_level; /* minor revision 
level */ - __le16 s_magic; /* Magic signature */ - - __le16 s_bytes; /* - * Bytes count of CRC calculation - * for this structure. s_reserved - * is excluded. - */ - __le16 s_flags; /* flags */ - __le32 s_crc_seed; /* Seed value of CRC calculation */ -/*10*/ __le32 s_sum; /* Check sum of super block */ - - __le32 s_log_block_size; /* - * Block size represented as follows - * blocksize = - * 1 << (s_log_block_size + 10) - */ - __le64 s_nsegments; /* Number of segments in filesystem */ -/*20*/ __le64 s_dev_size; /* block device size in bytes */ - __le64 s_first_data_block; /* 1st seg disk block number */ -/*30*/ __le32 s_blocks_per_segment; /* number of blocks per full segment */ - __le32 s_r_segments_percentage; /* Reserved segments percentage */ - - __le64 s_last_cno; /* Last checkpoint number */ -/*40*/ __le64 s_last_pseg; /* disk block addr pseg written last */ - __le64 s_last_seq; /* seq. number of seg written last */ -/*50*/ __le64 s_free_blocks_count; /* Free blocks count */ - - __le64 s_ctime; /* - * Creation time (execution time of - * newfs) - */ -/*60*/ __le64 s_mtime; /* Mount time */ - __le64 s_wtime; /* Write time */ -/*70*/ __le16 s_mnt_count; /* Mount count */ - __le16 s_max_mnt_count; /* Maximal mount count */ - __le16 s_state; /* File system state */ - __le16 s_errors; /* Behaviour when detecting errors */ - __le64 s_lastcheck; /* time of last check */ - -/*80*/ __le32 s_checkinterval; /* max. time between checks */ - __le32 s_creator_os; /* OS */ - __le16 s_def_resuid; /* Default uid for reserved blocks */ - __le16 s_def_resgid; /* Default gid for reserved blocks */ - __le32 s_first_ino; /* First non-reserved inode */ - -/*90*/ __le16 s_inode_size; /* Size of an inode */ - __le16 s_dat_entry_size; /* Size of a dat entry */ - __le16 s_checkpoint_size; /* Size of a checkpoint */ - __le16 s_segment_usage_size; /* Size of a segment usage */ - -/*98*/ __u8 s_uuid[16]; /* 128-bit uuid for volume */ -/*A8*/ char s_volume_name[80]; /* volume name */ - -/*F8*/ __le32 s_c_interval; /* Commit interval of segment */ - __le32 s_c_block_max; /* - * Threshold of data amount for - * the segment construction - */ -/*100*/ __le64 s_feature_compat; /* Compatible feature set */ - __le64 s_feature_compat_ro; /* Read-only compatible feature set */ - __le64 s_feature_incompat; /* Incompatible feature set */ - __u32 s_reserved[186]; /* padding to the end of the block */ -}; - -/* - * Codes for operating systems - */ -#define NILFS_OS_LINUX 0 -/* Codes from 1 to 4 are reserved to keep compatibility with ext2 creator-OS */ - -/* - * Revision levels - */ -#define NILFS_CURRENT_REV 2 /* current major revision */ -#define NILFS_MINOR_REV 0 /* minor revision */ -#define NILFS_MIN_SUPP_REV 2 /* minimum supported revision */ - -/* - * Feature set definitions - * - * If there is a bit set in the incompatible feature set that the kernel - * doesn't know about, it should refuse to mount the filesystem. 
- */ -#define NILFS_FEATURE_COMPAT_RO_BLOCK_COUNT 0x00000001ULL - -#define NILFS_FEATURE_COMPAT_SUPP 0ULL -#define NILFS_FEATURE_COMPAT_RO_SUPP NILFS_FEATURE_COMPAT_RO_BLOCK_COUNT -#define NILFS_FEATURE_INCOMPAT_SUPP 0ULL - -/* - * Bytes count of super_block for CRC-calculation - */ -#define NILFS_SB_BYTES \ - ((long)&((struct nilfs_super_block *)0)->s_reserved) - -/* - * Special inode number - */ -#define NILFS_ROOT_INO 2 /* Root file inode */ -#define NILFS_DAT_INO 3 /* DAT file */ -#define NILFS_CPFILE_INO 4 /* checkpoint file */ -#define NILFS_SUFILE_INO 5 /* segment usage file */ -#define NILFS_IFILE_INO 6 /* ifile */ -#define NILFS_ATIME_INO 7 /* Atime file (reserved) */ -#define NILFS_XATTR_INO 8 /* Xattribute file (reserved) */ -#define NILFS_SKETCH_INO 10 /* Sketch file */ -#define NILFS_USER_INO 11 /* Fisrt user's file inode number */ - -#define NILFS_SB_OFFSET_BYTES 1024 /* byte offset of nilfs superblock */ - -#define NILFS_SEG_MIN_BLOCKS 16 /* - * Minimum number of blocks in - * a full segment - */ -#define NILFS_PSEG_MIN_BLOCKS 2 /* - * Minimum number of blocks in - * a partial segment - */ -#define NILFS_MIN_NRSVSEGS 8 /* - * Minimum number of reserved - * segments - */ - -/* - * We call DAT, cpfile, and sufile root metadata files. Inodes of - * these files are written in super root block instead of ifile, and - * garbage collector doesn't keep any past versions of these files. - */ -#define NILFS_ROOT_METADATA_FILE(ino) \ - ((ino) >= NILFS_DAT_INO && (ino) <= NILFS_SUFILE_INO) - -/* - * bytes offset of secondary super block - */ -#define NILFS_SB2_OFFSET_BYTES(devsize) ((((devsize) >> 12) - 1) << 12) - -/* - * Maximal count of links to a file - */ -#define NILFS_LINK_MAX 32000 - -/* - * Structure of a directory entry - * (Same as ext2) - */ - -#define NILFS_NAME_LEN 255 - -/* - * Block size limitations - */ -#define NILFS_MIN_BLOCK_SIZE 1024 -#define NILFS_MAX_BLOCK_SIZE 65536 - -/* - * The new version of the directory entry. Since V0 structures are - * stored in intel byte order, and the name_len field could never be - * bigger than 255 chars, it's safe to reclaim the extra byte for the - * file_type field. - */ -struct nilfs_dir_entry { - __le64 inode; /* Inode number */ - __le16 rec_len; /* Directory entry length */ - __u8 name_len; /* Name length */ - __u8 file_type; /* Dir entry type (file, dir, etc) */ - char name[NILFS_NAME_LEN]; /* File name */ - char pad; -}; - -/* - * NILFS directory file types. Only the low 3 bits are used. The - * other bits are reserved for now. 
- */ -enum { - NILFS_FT_UNKNOWN, - NILFS_FT_REG_FILE, - NILFS_FT_DIR, - NILFS_FT_CHRDEV, - NILFS_FT_BLKDEV, - NILFS_FT_FIFO, - NILFS_FT_SOCK, - NILFS_FT_SYMLINK, - NILFS_FT_MAX -}; - -/* - * NILFS_DIR_PAD defines the directory entries boundaries - * - * NOTE: It must be a multiple of 8 - */ -#define NILFS_DIR_PAD 8 -#define NILFS_DIR_ROUND (NILFS_DIR_PAD - 1) -#define NILFS_DIR_REC_LEN(name_len) (((name_len) + 12 + NILFS_DIR_ROUND) & \ - ~NILFS_DIR_ROUND) -#define NILFS_MAX_REC_LEN ((1<<16)-1) - -static inline unsigned int nilfs_rec_len_from_disk(__le16 dlen) -{ - unsigned int len = le16_to_cpu(dlen); - -#if !defined(__KERNEL__) || (PAGE_SIZE >= 65536) - if (len == NILFS_MAX_REC_LEN) - return 1 << 16; -#endif - return len; -} - -static inline __le16 nilfs_rec_len_to_disk(unsigned int len) -{ -#if !defined(__KERNEL__) || (PAGE_SIZE >= 65536) - if (len == (1 << 16)) - return cpu_to_le16(NILFS_MAX_REC_LEN); - else if (len > (1 << 16)) - BUG(); -#endif - return cpu_to_le16(len); -} - -/** - * struct nilfs_finfo - file information - * @fi_ino: inode number - * @fi_cno: checkpoint number - * @fi_nblocks: number of blocks (including intermediate blocks) - * @fi_ndatablk: number of file data blocks - */ -struct nilfs_finfo { - __le64 fi_ino; - __le64 fi_cno; - __le32 fi_nblocks; - __le32 fi_ndatablk; - /* array of virtual block numbers */ -}; - -/** - * struct nilfs_binfo_v - information for the block to which a virtual block number is assigned - * @bi_vblocknr: virtual block number - * @bi_blkoff: block offset - */ -struct nilfs_binfo_v { - __le64 bi_vblocknr; - __le64 bi_blkoff; -}; - -/** - * struct nilfs_binfo_dat - information for the block which belongs to the DAT file - * @bi_blkoff: block offset - * @bi_level: level - * @bi_pad: padding - */ -struct nilfs_binfo_dat { - __le64 bi_blkoff; - __u8 bi_level; - __u8 bi_pad[7]; -}; - -/** - * union nilfs_binfo: block information - * @bi_v: nilfs_binfo_v structure - * @bi_dat: nilfs_binfo_dat structure - */ -union nilfs_binfo { - struct nilfs_binfo_v bi_v; - struct nilfs_binfo_dat bi_dat; -}; - -/** - * struct nilfs_segment_summary - segment summary header - * @ss_datasum: checksum of data - * @ss_sumsum: checksum of segment summary - * @ss_magic: magic number - * @ss_bytes: size of this structure in bytes - * @ss_flags: flags - * @ss_seq: sequence number - * @ss_create: creation timestamp - * @ss_next: next segment - * @ss_nblocks: number of blocks - * @ss_nfinfo: number of finfo structures - * @ss_sumbytes: total size of segment summary in bytes - * @ss_pad: padding - * @ss_cno: checkpoint number - */ -struct nilfs_segment_summary { - __le32 ss_datasum; - __le32 ss_sumsum; - __le32 ss_magic; - __le16 ss_bytes; - __le16 ss_flags; - __le64 ss_seq; - __le64 ss_create; - __le64 ss_next; - __le32 ss_nblocks; - __le32 ss_nfinfo; - __le32 ss_sumbytes; - __le32 ss_pad; - __le64 ss_cno; - /* array of finfo structures */ -}; - -#define NILFS_SEGSUM_MAGIC 0x1eaffa11 /* segment summary magic number */ - -/* - * Segment summary flags - */ -#define NILFS_SS_LOGBGN 0x0001 /* begins a logical segment */ -#define NILFS_SS_LOGEND 0x0002 /* ends a logical segment */ -#define NILFS_SS_SR 0x0004 /* has super root */ -#define NILFS_SS_SYNDT 0x0008 /* includes data only updates */ -#define NILFS_SS_GC 0x0010 /* segment written for cleaner operation */ - -/** - * struct nilfs_btree_node - B-tree node - * @bn_flags: flags - * @bn_level: level - * @bn_nchildren: number of children - * @bn_pad: padding - */ -struct nilfs_btree_node { - __u8 bn_flags; - __u8 bn_level; - 
__le16 bn_nchildren; - __le32 bn_pad; -}; - -/* flags */ -#define NILFS_BTREE_NODE_ROOT 0x01 - -/* level */ -#define NILFS_BTREE_LEVEL_DATA 0 -#define NILFS_BTREE_LEVEL_NODE_MIN (NILFS_BTREE_LEVEL_DATA + 1) -#define NILFS_BTREE_LEVEL_MAX 14 /* Max level (exclusive) */ - -/** - * struct nilfs_palloc_group_desc - block group descriptor - * @pg_nfrees: number of free entries in block group - */ -struct nilfs_palloc_group_desc { - __le32 pg_nfrees; -}; - -/** - * struct nilfs_dat_entry - disk address translation entry - * @de_blocknr: block number - * @de_start: start checkpoint number - * @de_end: end checkpoint number - * @de_rsv: reserved for future use - */ -struct nilfs_dat_entry { - __le64 de_blocknr; - __le64 de_start; - __le64 de_end; - __le64 de_rsv; -}; - -#define NILFS_MIN_DAT_ENTRY_SIZE 32 - -/** - * struct nilfs_snapshot_list - snapshot list - * @ssl_next: next checkpoint number on snapshot list - * @ssl_prev: previous checkpoint number on snapshot list - */ -struct nilfs_snapshot_list { - __le64 ssl_next; - __le64 ssl_prev; -}; - -/** - * struct nilfs_checkpoint - checkpoint structure - * @cp_flags: flags - * @cp_checkpoints_count: checkpoints count in a block - * @cp_snapshot_list: snapshot list - * @cp_cno: checkpoint number - * @cp_create: creation timestamp - * @cp_nblk_inc: number of blocks incremented by this checkpoint - * @cp_inodes_count: inodes count - * @cp_blocks_count: blocks count - * @cp_ifile_inode: inode of ifile - */ -struct nilfs_checkpoint { - __le32 cp_flags; - __le32 cp_checkpoints_count; - struct nilfs_snapshot_list cp_snapshot_list; - __le64 cp_cno; - __le64 cp_create; - __le64 cp_nblk_inc; - __le64 cp_inodes_count; - __le64 cp_blocks_count; - - /* - * Do not change the byte offset of ifile inode. - * To keep the compatibility of the disk format, - * additional fields should be added behind cp_ifile_inode. 
- */ - struct nilfs_inode cp_ifile_inode; -}; - -#define NILFS_MIN_CHECKPOINT_SIZE (64 + NILFS_MIN_INODE_SIZE) - -/* checkpoint flags */ -enum { - NILFS_CHECKPOINT_SNAPSHOT, - NILFS_CHECKPOINT_INVALID, - NILFS_CHECKPOINT_SKETCH, - NILFS_CHECKPOINT_MINOR, -}; - -#define NILFS_CHECKPOINT_FNS(flag, name) \ -static inline void \ -nilfs_checkpoint_set_##name(struct nilfs_checkpoint *cp) \ -{ \ - cp->cp_flags = cpu_to_le32(le32_to_cpu(cp->cp_flags) | \ - (1UL << NILFS_CHECKPOINT_##flag)); \ -} \ -static inline void \ -nilfs_checkpoint_clear_##name(struct nilfs_checkpoint *cp) \ -{ \ - cp->cp_flags = cpu_to_le32(le32_to_cpu(cp->cp_flags) & \ - ~(1UL << NILFS_CHECKPOINT_##flag)); \ -} \ -static inline int \ -nilfs_checkpoint_##name(const struct nilfs_checkpoint *cp) \ -{ \ - return !!(le32_to_cpu(cp->cp_flags) & \ - (1UL << NILFS_CHECKPOINT_##flag)); \ -} - -NILFS_CHECKPOINT_FNS(SNAPSHOT, snapshot) -NILFS_CHECKPOINT_FNS(INVALID, invalid) -NILFS_CHECKPOINT_FNS(MINOR, minor) - -/** - * struct nilfs_cpinfo - checkpoint information - * @ci_flags: flags - * @ci_pad: padding - * @ci_cno: checkpoint number - * @ci_create: creation timestamp - * @ci_nblk_inc: number of blocks incremented by this checkpoint - * @ci_inodes_count: inodes count - * @ci_blocks_count: blocks count - * @ci_next: next checkpoint number in snapshot list - */ -struct nilfs_cpinfo { - __u32 ci_flags; - __u32 ci_pad; - __u64 ci_cno; - __u64 ci_create; - __u64 ci_nblk_inc; - __u64 ci_inodes_count; - __u64 ci_blocks_count; - __u64 ci_next; -}; - -#define NILFS_CPINFO_FNS(flag, name) \ -static inline int \ -nilfs_cpinfo_##name(const struct nilfs_cpinfo *cpinfo) \ -{ \ - return !!(cpinfo->ci_flags & (1UL << NILFS_CHECKPOINT_##flag)); \ -} - -NILFS_CPINFO_FNS(SNAPSHOT, snapshot) -NILFS_CPINFO_FNS(INVALID, invalid) -NILFS_CPINFO_FNS(MINOR, minor) - - -/** - * struct nilfs_cpfile_header - checkpoint file header - * @ch_ncheckpoints: number of checkpoints - * @ch_nsnapshots: number of snapshots - * @ch_snapshot_list: snapshot list - */ -struct nilfs_cpfile_header { - __le64 ch_ncheckpoints; - __le64 ch_nsnapshots; - struct nilfs_snapshot_list ch_snapshot_list; -}; - -#define NILFS_CPFILE_FIRST_CHECKPOINT_OFFSET \ - ((sizeof(struct nilfs_cpfile_header) + \ - sizeof(struct nilfs_checkpoint) - 1) / \ - sizeof(struct nilfs_checkpoint)) - -/** - * struct nilfs_segment_usage - segment usage - * @su_lastmod: last modified timestamp - * @su_nblocks: number of blocks in segment - * @su_flags: flags - */ -struct nilfs_segment_usage { - __le64 su_lastmod; - __le32 su_nblocks; - __le32 su_flags; -}; - -#define NILFS_MIN_SEGMENT_USAGE_SIZE 16 - -/* segment usage flag */ -enum { - NILFS_SEGMENT_USAGE_ACTIVE, - NILFS_SEGMENT_USAGE_DIRTY, - NILFS_SEGMENT_USAGE_ERROR, - - /* ... 
*/ -}; - -#define NILFS_SEGMENT_USAGE_FNS(flag, name) \ -static inline void \ -nilfs_segment_usage_set_##name(struct nilfs_segment_usage *su) \ -{ \ - su->su_flags = cpu_to_le32(le32_to_cpu(su->su_flags) | \ - (1UL << NILFS_SEGMENT_USAGE_##flag));\ -} \ -static inline void \ -nilfs_segment_usage_clear_##name(struct nilfs_segment_usage *su) \ -{ \ - su->su_flags = \ - cpu_to_le32(le32_to_cpu(su->su_flags) & \ - ~(1UL << NILFS_SEGMENT_USAGE_##flag)); \ -} \ -static inline int \ -nilfs_segment_usage_##name(const struct nilfs_segment_usage *su) \ -{ \ - return !!(le32_to_cpu(su->su_flags) & \ - (1UL << NILFS_SEGMENT_USAGE_##flag)); \ -} - -NILFS_SEGMENT_USAGE_FNS(ACTIVE, active) -NILFS_SEGMENT_USAGE_FNS(DIRTY, dirty) -NILFS_SEGMENT_USAGE_FNS(ERROR, error) - -static inline void -nilfs_segment_usage_set_clean(struct nilfs_segment_usage *su) -{ - su->su_lastmod = cpu_to_le64(0); - su->su_nblocks = cpu_to_le32(0); - su->su_flags = cpu_to_le32(0); -} - -static inline int -nilfs_segment_usage_clean(const struct nilfs_segment_usage *su) -{ - return !le32_to_cpu(su->su_flags); -} - -/** - * struct nilfs_sufile_header - segment usage file header - * @sh_ncleansegs: number of clean segments - * @sh_ndirtysegs: number of dirty segments - * @sh_last_alloc: last allocated segment number - */ -struct nilfs_sufile_header { - __le64 sh_ncleansegs; - __le64 sh_ndirtysegs; - __le64 sh_last_alloc; - /* ... */ -}; - -#define NILFS_SUFILE_FIRST_SEGMENT_USAGE_OFFSET \ - ((sizeof(struct nilfs_sufile_header) + \ - sizeof(struct nilfs_segment_usage) - 1) / \ - sizeof(struct nilfs_segment_usage)) - -/** - * nilfs_suinfo - segment usage information - * @sui_lastmod: timestamp of last modification - * @sui_nblocks: number of written blocks in segment - * @sui_flags: segment usage flags - */ -struct nilfs_suinfo { - __u64 sui_lastmod; - __u32 sui_nblocks; - __u32 sui_flags; -}; - -#define NILFS_SUINFO_FNS(flag, name) \ -static inline int \ -nilfs_suinfo_##name(const struct nilfs_suinfo *si) \ -{ \ - return si->sui_flags & (1UL << NILFS_SEGMENT_USAGE_##flag); \ -} - -NILFS_SUINFO_FNS(ACTIVE, active) -NILFS_SUINFO_FNS(DIRTY, dirty) -NILFS_SUINFO_FNS(ERROR, error) - -static inline int nilfs_suinfo_clean(const struct nilfs_suinfo *si) -{ - return !si->sui_flags; -} - -/* ioctl */ -/** - * nilfs_suinfo_update - segment usage information update - * @sup_segnum: segment number - * @sup_flags: flags for which fields are active in sup_sui - * @sup_reserved: reserved necessary for alignment - * @sup_sui: segment usage information - */ -struct nilfs_suinfo_update { - __u64 sup_segnum; - __u32 sup_flags; - __u32 sup_reserved; - struct nilfs_suinfo sup_sui; -}; - -enum { - NILFS_SUINFO_UPDATE_LASTMOD, - NILFS_SUINFO_UPDATE_NBLOCKS, - NILFS_SUINFO_UPDATE_FLAGS, - __NR_NILFS_SUINFO_UPDATE_FIELDS, -}; - -#define NILFS_SUINFO_UPDATE_FNS(flag, name) \ -static inline void \ -nilfs_suinfo_update_set_##name(struct nilfs_suinfo_update *sup) \ -{ \ - sup->sup_flags |= 1UL << NILFS_SUINFO_UPDATE_##flag; \ -} \ -static inline void \ -nilfs_suinfo_update_clear_##name(struct nilfs_suinfo_update *sup) \ -{ \ - sup->sup_flags &= ~(1UL << NILFS_SUINFO_UPDATE_##flag); \ -} \ -static inline int \ -nilfs_suinfo_update_##name(const struct nilfs_suinfo_update *sup) \ -{ \ - return !!(sup->sup_flags & (1UL << NILFS_SUINFO_UPDATE_##flag));\ -} - -NILFS_SUINFO_UPDATE_FNS(LASTMOD, lastmod) -NILFS_SUINFO_UPDATE_FNS(NBLOCKS, nblocks) -NILFS_SUINFO_UPDATE_FNS(FLAGS, flags) - -enum { - NILFS_CHECKPOINT, - NILFS_SNAPSHOT, -}; - -/** - * struct nilfs_cpmode - change 
checkpoint mode structure - * @cm_cno: checkpoint number - * @cm_mode: mode of checkpoint - * @cm_pad: padding - */ -struct nilfs_cpmode { - __u64 cm_cno; - __u32 cm_mode; - __u32 cm_pad; -}; - -/** - * struct nilfs_argv - argument vector - * @v_base: pointer on data array from userspace - * @v_nmembs: number of members in data array - * @v_size: size of data array in bytes - * @v_flags: flags - * @v_index: start number of target data items - */ -struct nilfs_argv { - __u64 v_base; - __u32 v_nmembs; /* number of members */ - __u16 v_size; /* size of members */ - __u16 v_flags; - __u64 v_index; -}; - -/** - * struct nilfs_period - period of checkpoint numbers - * @p_start: start checkpoint number (inclusive) - * @p_end: end checkpoint number (exclusive) - */ -struct nilfs_period { - __u64 p_start; - __u64 p_end; -}; - -/** - * struct nilfs_cpstat - checkpoint statistics - * @cs_cno: checkpoint number - * @cs_ncps: number of checkpoints - * @cs_nsss: number of snapshots - */ -struct nilfs_cpstat { - __u64 cs_cno; - __u64 cs_ncps; - __u64 cs_nsss; -}; - -/** - * struct nilfs_sustat - segment usage statistics - * @ss_nsegs: number of segments - * @ss_ncleansegs: number of clean segments - * @ss_ndirtysegs: number of dirty segments - * @ss_ctime: creation time of the last segment - * @ss_nongc_ctime: creation time of the last segment not for GC - * @ss_prot_seq: least sequence number of segments which must not be reclaimed - */ -struct nilfs_sustat { - __u64 ss_nsegs; - __u64 ss_ncleansegs; - __u64 ss_ndirtysegs; - __u64 ss_ctime; - __u64 ss_nongc_ctime; - __u64 ss_prot_seq; -}; - -/** - * struct nilfs_vinfo - virtual block number information - * @vi_vblocknr: virtual block number - * @vi_start: start checkpoint number (inclusive) - * @vi_end: end checkpoint number (exclusive) - * @vi_blocknr: disk block number - */ -struct nilfs_vinfo { - __u64 vi_vblocknr; - __u64 vi_start; - __u64 vi_end; - __u64 vi_blocknr; -}; - -/** - * struct nilfs_vdesc - descriptor of virtual block number - * @vd_ino: inode number - * @vd_cno: checkpoint number - * @vd_vblocknr: virtual block number - * @vd_period: period of checkpoint numbers - * @vd_blocknr: disk block number - * @vd_offset: logical block offset inside a file - * @vd_flags: flags (data or node block) - * @vd_pad: padding - */ -struct nilfs_vdesc { - __u64 vd_ino; - __u64 vd_cno; - __u64 vd_vblocknr; - struct nilfs_period vd_period; - __u64 vd_blocknr; - __u64 vd_offset; - __u32 vd_flags; - __u32 vd_pad; -}; - -/** - * struct nilfs_bdesc - descriptor of disk block number - * @bd_ino: inode number - * @bd_oblocknr: disk block address (for skipping dead blocks) - * @bd_blocknr: disk block address - * @bd_offset: logical block offset inside a file - * @bd_level: level in the b-tree organization - * @bd_pad: padding - */ -struct nilfs_bdesc { - __u64 bd_ino; - __u64 bd_oblocknr; - __u64 bd_blocknr; - __u64 bd_offset; - __u32 bd_level; - __u32 bd_pad; -}; - -#define NILFS_IOCTL_IDENT 'n' - -#define NILFS_IOCTL_CHANGE_CPMODE \ - _IOW(NILFS_IOCTL_IDENT, 0x80, struct nilfs_cpmode) -#define NILFS_IOCTL_DELETE_CHECKPOINT \ - _IOW(NILFS_IOCTL_IDENT, 0x81, __u64) -#define NILFS_IOCTL_GET_CPINFO \ - _IOR(NILFS_IOCTL_IDENT, 0x82, struct nilfs_argv) -#define NILFS_IOCTL_GET_CPSTAT \ - _IOR(NILFS_IOCTL_IDENT, 0x83, struct nilfs_cpstat) -#define NILFS_IOCTL_GET_SUINFO \ - _IOR(NILFS_IOCTL_IDENT, 0x84, struct nilfs_argv) -#define NILFS_IOCTL_GET_SUSTAT \ - _IOR(NILFS_IOCTL_IDENT, 0x85, struct nilfs_sustat) -#define NILFS_IOCTL_GET_VINFO \ - _IOWR(NILFS_IOCTL_IDENT, 
0x86, struct nilfs_argv) -#define NILFS_IOCTL_GET_BDESCS \ - _IOWR(NILFS_IOCTL_IDENT, 0x87, struct nilfs_argv) -#define NILFS_IOCTL_CLEAN_SEGMENTS \ - _IOW(NILFS_IOCTL_IDENT, 0x88, struct nilfs_argv[5]) -#define NILFS_IOCTL_SYNC \ - _IOR(NILFS_IOCTL_IDENT, 0x8A, __u64) -#define NILFS_IOCTL_RESIZE \ - _IOW(NILFS_IOCTL_IDENT, 0x8B, __u64) -#define NILFS_IOCTL_SET_ALLOC_RANGE \ - _IOW(NILFS_IOCTL_IDENT, 0x8C, __u64[2]) -#define NILFS_IOCTL_SET_SUINFO \ - _IOW(NILFS_IOCTL_IDENT, 0x8D, struct nilfs_argv) - -#endif /* _LINUX_NILFS_FS_H */ diff --git a/include/uapi/linux/nilfs2_api.h b/include/uapi/linux/nilfs2_api.h new file mode 100644 index 000000000000..ef4c1de89b11 --- /dev/null +++ b/include/uapi/linux/nilfs2_api.h @@ -0,0 +1,292 @@ +/* + * nilfs2_api.h - NILFS2 user space API + * + * Copyright (C) 2005-2008 Nippon Telegraph and Telephone Corporation. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU Lesser General Public License as published + * by the Free Software Foundation; either version 2.1 of the License, or + * (at your option) any later version. + */ + +#ifndef _LINUX_NILFS2_API_H +#define _LINUX_NILFS2_API_H + +#include +#include + +/** + * struct nilfs_cpinfo - checkpoint information + * @ci_flags: flags + * @ci_pad: padding + * @ci_cno: checkpoint number + * @ci_create: creation timestamp + * @ci_nblk_inc: number of blocks incremented by this checkpoint + * @ci_inodes_count: inodes count + * @ci_blocks_count: blocks count + * @ci_next: next checkpoint number in snapshot list + */ +struct nilfs_cpinfo { + __u32 ci_flags; + __u32 ci_pad; + __u64 ci_cno; + __u64 ci_create; + __u64 ci_nblk_inc; + __u64 ci_inodes_count; + __u64 ci_blocks_count; + __u64 ci_next; +}; + +/* checkpoint flags */ +enum { + NILFS_CPINFO_SNAPSHOT, + NILFS_CPINFO_INVALID, + NILFS_CPINFO_SKETCH, + NILFS_CPINFO_MINOR, +}; + +#define NILFS_CPINFO_FNS(flag, name) \ +static inline int \ +nilfs_cpinfo_##name(const struct nilfs_cpinfo *cpinfo) \ +{ \ + return !!(cpinfo->ci_flags & (1UL << NILFS_CPINFO_##flag)); \ +} + +NILFS_CPINFO_FNS(SNAPSHOT, snapshot) +NILFS_CPINFO_FNS(INVALID, invalid) +NILFS_CPINFO_FNS(MINOR, minor) + +/** + * nilfs_suinfo - segment usage information + * @sui_lastmod: timestamp of last modification + * @sui_nblocks: number of written blocks in segment + * @sui_flags: segment usage flags + */ +struct nilfs_suinfo { + __u64 sui_lastmod; + __u32 sui_nblocks; + __u32 sui_flags; +}; + +/* segment usage flags */ +enum { + NILFS_SUINFO_ACTIVE, + NILFS_SUINFO_DIRTY, + NILFS_SUINFO_ERROR, +}; + +#define NILFS_SUINFO_FNS(flag, name) \ +static inline int \ +nilfs_suinfo_##name(const struct nilfs_suinfo *si) \ +{ \ + return si->sui_flags & (1UL << NILFS_SUINFO_##flag); \ +} + +NILFS_SUINFO_FNS(ACTIVE, active) +NILFS_SUINFO_FNS(DIRTY, dirty) +NILFS_SUINFO_FNS(ERROR, error) + +static inline int nilfs_suinfo_clean(const struct nilfs_suinfo *si) +{ + return !si->sui_flags; +} + +/** + * nilfs_suinfo_update - segment usage information update + * @sup_segnum: segment number + * @sup_flags: flags for which fields are active in sup_sui + * @sup_reserved: reserved necessary for alignment + * @sup_sui: segment usage information + */ +struct nilfs_suinfo_update { + __u64 sup_segnum; + __u32 sup_flags; + __u32 sup_reserved; + struct nilfs_suinfo sup_sui; +}; + +enum { + NILFS_SUINFO_UPDATE_LASTMOD, + NILFS_SUINFO_UPDATE_NBLOCKS, + NILFS_SUINFO_UPDATE_FLAGS, + __NR_NILFS_SUINFO_UPDATE_FIELDS, +}; + +#define NILFS_SUINFO_UPDATE_FNS(flag, name) \ +static 
inline void \ +nilfs_suinfo_update_set_##name(struct nilfs_suinfo_update *sup) \ +{ \ + sup->sup_flags |= 1UL << NILFS_SUINFO_UPDATE_##flag; \ +} \ +static inline void \ +nilfs_suinfo_update_clear_##name(struct nilfs_suinfo_update *sup) \ +{ \ + sup->sup_flags &= ~(1UL << NILFS_SUINFO_UPDATE_##flag); \ +} \ +static inline int \ +nilfs_suinfo_update_##name(const struct nilfs_suinfo_update *sup) \ +{ \ + return !!(sup->sup_flags & (1UL << NILFS_SUINFO_UPDATE_##flag));\ +} + +NILFS_SUINFO_UPDATE_FNS(LASTMOD, lastmod) +NILFS_SUINFO_UPDATE_FNS(NBLOCKS, nblocks) +NILFS_SUINFO_UPDATE_FNS(FLAGS, flags) + +enum { + NILFS_CHECKPOINT, + NILFS_SNAPSHOT, +}; + +/** + * struct nilfs_cpmode - change checkpoint mode structure + * @cm_cno: checkpoint number + * @cm_mode: mode of checkpoint + * @cm_pad: padding + */ +struct nilfs_cpmode { + __u64 cm_cno; + __u32 cm_mode; + __u32 cm_pad; +}; + +/** + * struct nilfs_argv - argument vector + * @v_base: pointer on data array from userspace + * @v_nmembs: number of members in data array + * @v_size: size of data array in bytes + * @v_flags: flags + * @v_index: start number of target data items + */ +struct nilfs_argv { + __u64 v_base; + __u32 v_nmembs; /* number of members */ + __u16 v_size; /* size of members */ + __u16 v_flags; + __u64 v_index; +}; + +/** + * struct nilfs_period - period of checkpoint numbers + * @p_start: start checkpoint number (inclusive) + * @p_end: end checkpoint number (exclusive) + */ +struct nilfs_period { + __u64 p_start; + __u64 p_end; +}; + +/** + * struct nilfs_cpstat - checkpoint statistics + * @cs_cno: checkpoint number + * @cs_ncps: number of checkpoints + * @cs_nsss: number of snapshots + */ +struct nilfs_cpstat { + __u64 cs_cno; + __u64 cs_ncps; + __u64 cs_nsss; +}; + +/** + * struct nilfs_sustat - segment usage statistics + * @ss_nsegs: number of segments + * @ss_ncleansegs: number of clean segments + * @ss_ndirtysegs: number of dirty segments + * @ss_ctime: creation time of the last segment + * @ss_nongc_ctime: creation time of the last segment not for GC + * @ss_prot_seq: least sequence number of segments which must not be reclaimed + */ +struct nilfs_sustat { + __u64 ss_nsegs; + __u64 ss_ncleansegs; + __u64 ss_ndirtysegs; + __u64 ss_ctime; + __u64 ss_nongc_ctime; + __u64 ss_prot_seq; +}; + +/** + * struct nilfs_vinfo - virtual block number information + * @vi_vblocknr: virtual block number + * @vi_start: start checkpoint number (inclusive) + * @vi_end: end checkpoint number (exclusive) + * @vi_blocknr: disk block number + */ +struct nilfs_vinfo { + __u64 vi_vblocknr; + __u64 vi_start; + __u64 vi_end; + __u64 vi_blocknr; +}; + +/** + * struct nilfs_vdesc - descriptor of virtual block number + * @vd_ino: inode number + * @vd_cno: checkpoint number + * @vd_vblocknr: virtual block number + * @vd_period: period of checkpoint numbers + * @vd_blocknr: disk block number + * @vd_offset: logical block offset inside a file + * @vd_flags: flags (data or node block) + * @vd_pad: padding + */ +struct nilfs_vdesc { + __u64 vd_ino; + __u64 vd_cno; + __u64 vd_vblocknr; + struct nilfs_period vd_period; + __u64 vd_blocknr; + __u64 vd_offset; + __u32 vd_flags; + __u32 vd_pad; +}; + +/** + * struct nilfs_bdesc - descriptor of disk block number + * @bd_ino: inode number + * @bd_oblocknr: disk block address (for skipping dead blocks) + * @bd_blocknr: disk block address + * @bd_offset: logical block offset inside a file + * @bd_level: level in the b-tree organization + * @bd_pad: padding + */ +struct nilfs_bdesc { + __u64 bd_ino; + __u64 
bd_oblocknr; + __u64 bd_blocknr; + __u64 bd_offset; + __u32 bd_level; + __u32 bd_pad; +}; + +#define NILFS_IOCTL_IDENT 'n' + +#define NILFS_IOCTL_CHANGE_CPMODE \ + _IOW(NILFS_IOCTL_IDENT, 0x80, struct nilfs_cpmode) +#define NILFS_IOCTL_DELETE_CHECKPOINT \ + _IOW(NILFS_IOCTL_IDENT, 0x81, __u64) +#define NILFS_IOCTL_GET_CPINFO \ + _IOR(NILFS_IOCTL_IDENT, 0x82, struct nilfs_argv) +#define NILFS_IOCTL_GET_CPSTAT \ + _IOR(NILFS_IOCTL_IDENT, 0x83, struct nilfs_cpstat) +#define NILFS_IOCTL_GET_SUINFO \ + _IOR(NILFS_IOCTL_IDENT, 0x84, struct nilfs_argv) +#define NILFS_IOCTL_GET_SUSTAT \ + _IOR(NILFS_IOCTL_IDENT, 0x85, struct nilfs_sustat) +#define NILFS_IOCTL_GET_VINFO \ + _IOWR(NILFS_IOCTL_IDENT, 0x86, struct nilfs_argv) +#define NILFS_IOCTL_GET_BDESCS \ + _IOWR(NILFS_IOCTL_IDENT, 0x87, struct nilfs_argv) +#define NILFS_IOCTL_CLEAN_SEGMENTS \ + _IOW(NILFS_IOCTL_IDENT, 0x88, struct nilfs_argv[5]) +#define NILFS_IOCTL_SYNC \ + _IOR(NILFS_IOCTL_IDENT, 0x8A, __u64) +#define NILFS_IOCTL_RESIZE \ + _IOW(NILFS_IOCTL_IDENT, 0x8B, __u64) +#define NILFS_IOCTL_SET_ALLOC_RANGE \ + _IOW(NILFS_IOCTL_IDENT, 0x8C, __u64[2]) +#define NILFS_IOCTL_SET_SUINFO \ + _IOW(NILFS_IOCTL_IDENT, 0x8D, struct nilfs_argv) + +#endif /* _LINUX_NILFS2_API_H */ diff --git a/include/uapi/linux/nilfs2_ondisk.h b/include/uapi/linux/nilfs2_ondisk.h new file mode 100644 index 000000000000..2a8a3addb675 --- /dev/null +++ b/include/uapi/linux/nilfs2_ondisk.h @@ -0,0 +1,650 @@ +/* + * nilfs2_ondisk.h - NILFS2 on-disk structures + * + * Copyright (C) 2005-2008 Nippon Telegraph and Telephone Corporation. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU Lesser General Public License as published + * by the Free Software Foundation; either version 2.1 of the License, or + * (at your option) any later version. 
+ */ +/* + * linux/include/linux/ext2_fs.h + * + * Copyright (C) 1992, 1993, 1994, 1995 + * Remy Card (card@masi.ibp.fr) + * Laboratoire MASI - Institut Blaise Pascal + * Universite Pierre et Marie Curie (Paris VI) + * + * from + * + * linux/include/linux/minix_fs.h + * + * Copyright (C) 1991, 1992 Linus Torvalds + */ + +#ifndef _LINUX_NILFS2_ONDISK_H +#define _LINUX_NILFS2_ONDISK_H + +#include +#include + + +#define NILFS_INODE_BMAP_SIZE 7 + +/** + * struct nilfs_inode - structure of an inode on disk + * @i_blocks: blocks count + * @i_size: size in bytes + * @i_ctime: creation time (seconds) + * @i_mtime: modification time (seconds) + * @i_ctime_nsec: creation time (nano seconds) + * @i_mtime_nsec: modification time (nano seconds) + * @i_uid: user id + * @i_gid: group id + * @i_mode: file mode + * @i_links_count: links count + * @i_flags: file flags + * @i_bmap: block mapping + * @i_xattr: extended attributes + * @i_generation: file generation (for NFS) + * @i_pad: padding + */ +struct nilfs_inode { + __le64 i_blocks; + __le64 i_size; + __le64 i_ctime; + __le64 i_mtime; + __le32 i_ctime_nsec; + __le32 i_mtime_nsec; + __le32 i_uid; + __le32 i_gid; + __le16 i_mode; + __le16 i_links_count; + __le32 i_flags; + __le64 i_bmap[NILFS_INODE_BMAP_SIZE]; +#define i_device_code i_bmap[0] + __le64 i_xattr; + __le32 i_generation; + __le32 i_pad; +}; + +#define NILFS_MIN_INODE_SIZE 128 + +/** + * struct nilfs_super_root - structure of super root + * @sr_sum: check sum + * @sr_bytes: byte count of the structure + * @sr_flags: flags (reserved) + * @sr_nongc_ctime: write time of the last segment not for cleaner operation + * @sr_dat: DAT file inode + * @sr_cpfile: checkpoint file inode + * @sr_sufile: segment usage file inode + */ +struct nilfs_super_root { + __le32 sr_sum; + __le16 sr_bytes; + __le16 sr_flags; + __le64 sr_nongc_ctime; + struct nilfs_inode sr_dat; + struct nilfs_inode sr_cpfile; + struct nilfs_inode sr_sufile; +}; + +#define NILFS_SR_MDT_OFFSET(inode_size, i) \ + ((unsigned long)&((struct nilfs_super_root *)0)->sr_dat + \ + (inode_size) * (i)) +#define NILFS_SR_DAT_OFFSET(inode_size) NILFS_SR_MDT_OFFSET(inode_size, 0) +#define NILFS_SR_CPFILE_OFFSET(inode_size) NILFS_SR_MDT_OFFSET(inode_size, 1) +#define NILFS_SR_SUFILE_OFFSET(inode_size) NILFS_SR_MDT_OFFSET(inode_size, 2) +#define NILFS_SR_BYTES(inode_size) NILFS_SR_MDT_OFFSET(inode_size, 3) + +/* + * Maximal mount counts + */ +#define NILFS_DFL_MAX_MNT_COUNT 50 /* 50 mounts */ + +/* + * File system states (sbp->s_state, nilfs->ns_mount_state) + */ +#define NILFS_VALID_FS 0x0001 /* Unmounted cleanly */ +#define NILFS_ERROR_FS 0x0002 /* Errors detected */ +#define NILFS_RESIZE_FS 0x0004 /* Resize required */ + +/* + * Mount flags (sbi->s_mount_opt) + */ +#define NILFS_MOUNT_ERROR_MODE 0x0070 /* Error mode mask */ +#define NILFS_MOUNT_ERRORS_CONT 0x0010 /* Continue on errors */ +#define NILFS_MOUNT_ERRORS_RO 0x0020 /* Remount fs ro on errors */ +#define NILFS_MOUNT_ERRORS_PANIC 0x0040 /* Panic on errors */ +#define NILFS_MOUNT_BARRIER 0x1000 /* Use block barriers */ +#define NILFS_MOUNT_STRICT_ORDER 0x2000 /* + * Apply strict in-order + * semantics also for data + */ +#define NILFS_MOUNT_NORECOVERY 0x4000 /* + * Disable write access during + * mount-time recovery + */ +#define NILFS_MOUNT_DISCARD 0x8000 /* Issue DISCARD requests */ + + +/** + * struct nilfs_super_block - structure of super block on disk + */ +struct nilfs_super_block { +/*00*/ __le32 s_rev_level; /* Revision level */ + __le16 s_minor_rev_level; /* minor revision level */ + 
__le16 s_magic; /* Magic signature */ + + __le16 s_bytes; /* + * Bytes count of CRC calculation + * for this structure. s_reserved + * is excluded. + */ + __le16 s_flags; /* flags */ + __le32 s_crc_seed; /* Seed value of CRC calculation */ +/*10*/ __le32 s_sum; /* Check sum of super block */ + + __le32 s_log_block_size; /* + * Block size represented as follows + * blocksize = + * 1 << (s_log_block_size + 10) + */ + __le64 s_nsegments; /* Number of segments in filesystem */ +/*20*/ __le64 s_dev_size; /* block device size in bytes */ + __le64 s_first_data_block; /* 1st seg disk block number */ +/*30*/ __le32 s_blocks_per_segment; /* number of blocks per full segment */ + __le32 s_r_segments_percentage; /* Reserved segments percentage */ + + __le64 s_last_cno; /* Last checkpoint number */ +/*40*/ __le64 s_last_pseg; /* disk block addr pseg written last */ + __le64 s_last_seq; /* seq. number of seg written last */ +/*50*/ __le64 s_free_blocks_count; /* Free blocks count */ + + __le64 s_ctime; /* + * Creation time (execution time of + * newfs) + */ +/*60*/ __le64 s_mtime; /* Mount time */ + __le64 s_wtime; /* Write time */ +/*70*/ __le16 s_mnt_count; /* Mount count */ + __le16 s_max_mnt_count; /* Maximal mount count */ + __le16 s_state; /* File system state */ + __le16 s_errors; /* Behaviour when detecting errors */ + __le64 s_lastcheck; /* time of last check */ + +/*80*/ __le32 s_checkinterval; /* max. time between checks */ + __le32 s_creator_os; /* OS */ + __le16 s_def_resuid; /* Default uid for reserved blocks */ + __le16 s_def_resgid; /* Default gid for reserved blocks */ + __le32 s_first_ino; /* First non-reserved inode */ + +/*90*/ __le16 s_inode_size; /* Size of an inode */ + __le16 s_dat_entry_size; /* Size of a dat entry */ + __le16 s_checkpoint_size; /* Size of a checkpoint */ + __le16 s_segment_usage_size; /* Size of a segment usage */ + +/*98*/ __u8 s_uuid[16]; /* 128-bit uuid for volume */ +/*A8*/ char s_volume_name[80]; /* volume name */ + +/*F8*/ __le32 s_c_interval; /* Commit interval of segment */ + __le32 s_c_block_max; /* + * Threshold of data amount for + * the segment construction + */ +/*100*/ __le64 s_feature_compat; /* Compatible feature set */ + __le64 s_feature_compat_ro; /* Read-only compatible feature set */ + __le64 s_feature_incompat; /* Incompatible feature set */ + __u32 s_reserved[186]; /* padding to the end of the block */ +}; + +/* + * Codes for operating systems + */ +#define NILFS_OS_LINUX 0 +/* Codes from 1 to 4 are reserved to keep compatibility with ext2 creator-OS */ + +/* + * Revision levels + */ +#define NILFS_CURRENT_REV 2 /* current major revision */ +#define NILFS_MINOR_REV 0 /* minor revision */ +#define NILFS_MIN_SUPP_REV 2 /* minimum supported revision */ + +/* + * Feature set definitions + * + * If there is a bit set in the incompatible feature set that the kernel + * doesn't know about, it should refuse to mount the filesystem. 
+ */
+#define NILFS_FEATURE_COMPAT_RO_BLOCK_COUNT 0x00000001ULL
+
+#define NILFS_FEATURE_COMPAT_SUPP 0ULL
+#define NILFS_FEATURE_COMPAT_RO_SUPP NILFS_FEATURE_COMPAT_RO_BLOCK_COUNT
+#define NILFS_FEATURE_INCOMPAT_SUPP 0ULL
+
+/*
+ * Bytes count of super_block for CRC-calculation
+ */
+#define NILFS_SB_BYTES \
+ ((long)&((struct nilfs_super_block *)0)->s_reserved)
+
+/*
+ * Special inode number
+ */
+#define NILFS_ROOT_INO 2 /* Root file inode */
+#define NILFS_DAT_INO 3 /* DAT file */
+#define NILFS_CPFILE_INO 4 /* checkpoint file */
+#define NILFS_SUFILE_INO 5 /* segment usage file */
+#define NILFS_IFILE_INO 6 /* ifile */
+#define NILFS_ATIME_INO 7 /* Atime file (reserved) */
+#define NILFS_XATTR_INO 8 /* Xattribute file (reserved) */
+#define NILFS_SKETCH_INO 10 /* Sketch file */
+#define NILFS_USER_INO 11 /* First user's file inode number */
+
+#define NILFS_SB_OFFSET_BYTES 1024 /* byte offset of nilfs superblock */
+
+#define NILFS_SEG_MIN_BLOCKS 16 /*
+ * Minimum number of blocks in
+ * a full segment
+ */
+#define NILFS_PSEG_MIN_BLOCKS 2 /*
+ * Minimum number of blocks in
+ * a partial segment
+ */
+#define NILFS_MIN_NRSVSEGS 8 /*
+ * Minimum number of reserved
+ * segments
+ */
+
+/*
+ * We call DAT, cpfile, and sufile root metadata files. Inodes of
+ * these files are written in super root block instead of ifile, and
+ * garbage collector doesn't keep any past versions of these files.
+ */
+#define NILFS_ROOT_METADATA_FILE(ino) \
+ ((ino) >= NILFS_DAT_INO && (ino) <= NILFS_SUFILE_INO)
+
+/*
+ * byte offset of secondary super block
+ */
+#define NILFS_SB2_OFFSET_BYTES(devsize) ((((devsize) >> 12) - 1) << 12)
+
+/*
+ * Maximal count of links to a file
+ */
+#define NILFS_LINK_MAX 32000
+
+/*
+ * Structure of a directory entry
+ * (Same as ext2)
+ */
+
+#define NILFS_NAME_LEN 255
+
+/*
+ * Block size limitations
+ */
+#define NILFS_MIN_BLOCK_SIZE 1024
+#define NILFS_MAX_BLOCK_SIZE 65536
+
+/*
+ * The new version of the directory entry. Since V0 structures are
+ * stored in intel byte order, and the name_len field could never be
+ * bigger than 255 chars, it's safe to reclaim the extra byte for the
+ * file_type field.
+ */
+struct nilfs_dir_entry {
+ __le64 inode; /* Inode number */
+ __le16 rec_len; /* Directory entry length */
+ __u8 name_len; /* Name length */
+ __u8 file_type; /* Dir entry type (file, dir, etc) */
+ char name[NILFS_NAME_LEN]; /* File name */
+ char pad;
+};
+
+/*
+ * NILFS directory file types. Only the low 3 bits are used. The
+ * other bits are reserved for now.
+ */ +enum { + NILFS_FT_UNKNOWN, + NILFS_FT_REG_FILE, + NILFS_FT_DIR, + NILFS_FT_CHRDEV, + NILFS_FT_BLKDEV, + NILFS_FT_FIFO, + NILFS_FT_SOCK, + NILFS_FT_SYMLINK, + NILFS_FT_MAX +}; + +/* + * NILFS_DIR_PAD defines the directory entries boundaries + * + * NOTE: It must be a multiple of 8 + */ +#define NILFS_DIR_PAD 8 +#define NILFS_DIR_ROUND (NILFS_DIR_PAD - 1) +#define NILFS_DIR_REC_LEN(name_len) (((name_len) + 12 + NILFS_DIR_ROUND) & \ + ~NILFS_DIR_ROUND) +#define NILFS_MAX_REC_LEN ((1 << 16) - 1) + +/** + * struct nilfs_finfo - file information + * @fi_ino: inode number + * @fi_cno: checkpoint number + * @fi_nblocks: number of blocks (including intermediate blocks) + * @fi_ndatablk: number of file data blocks + */ +struct nilfs_finfo { + __le64 fi_ino; + __le64 fi_cno; + __le32 fi_nblocks; + __le32 fi_ndatablk; +}; + +/** + * struct nilfs_binfo_v - information on a data block (except DAT) + * @bi_vblocknr: virtual block number + * @bi_blkoff: block offset + */ +struct nilfs_binfo_v { + __le64 bi_vblocknr; + __le64 bi_blkoff; +}; + +/** + * struct nilfs_binfo_dat - information on a DAT node block + * @bi_blkoff: block offset + * @bi_level: level + * @bi_pad: padding + */ +struct nilfs_binfo_dat { + __le64 bi_blkoff; + __u8 bi_level; + __u8 bi_pad[7]; +}; + +/** + * union nilfs_binfo: block information + * @bi_v: nilfs_binfo_v structure + * @bi_dat: nilfs_binfo_dat structure + */ +union nilfs_binfo { + struct nilfs_binfo_v bi_v; + struct nilfs_binfo_dat bi_dat; +}; + +/** + * struct nilfs_segment_summary - segment summary header + * @ss_datasum: checksum of data + * @ss_sumsum: checksum of segment summary + * @ss_magic: magic number + * @ss_bytes: size of this structure in bytes + * @ss_flags: flags + * @ss_seq: sequence number + * @ss_create: creation timestamp + * @ss_next: next segment + * @ss_nblocks: number of blocks + * @ss_nfinfo: number of finfo structures + * @ss_sumbytes: total size of segment summary in bytes + * @ss_pad: padding + * @ss_cno: checkpoint number + */ +struct nilfs_segment_summary { + __le32 ss_datasum; + __le32 ss_sumsum; + __le32 ss_magic; + __le16 ss_bytes; + __le16 ss_flags; + __le64 ss_seq; + __le64 ss_create; + __le64 ss_next; + __le32 ss_nblocks; + __le32 ss_nfinfo; + __le32 ss_sumbytes; + __le32 ss_pad; + __le64 ss_cno; + /* array of finfo structures */ +}; + +#define NILFS_SEGSUM_MAGIC 0x1eaffa11 /* segment summary magic number */ + +/* + * Segment summary flags + */ +#define NILFS_SS_LOGBGN 0x0001 /* begins a logical segment */ +#define NILFS_SS_LOGEND 0x0002 /* ends a logical segment */ +#define NILFS_SS_SR 0x0004 /* has super root */ +#define NILFS_SS_SYNDT 0x0008 /* includes data only updates */ +#define NILFS_SS_GC 0x0010 /* segment written for cleaner operation */ + +/** + * struct nilfs_btree_node - header of B-tree node block + * @bn_flags: flags + * @bn_level: level + * @bn_nchildren: number of children + * @bn_pad: padding + */ +struct nilfs_btree_node { + __u8 bn_flags; + __u8 bn_level; + __le16 bn_nchildren; + __le32 bn_pad; +}; + +/* flags */ +#define NILFS_BTREE_NODE_ROOT 0x01 + +/* level */ +#define NILFS_BTREE_LEVEL_DATA 0 +#define NILFS_BTREE_LEVEL_NODE_MIN (NILFS_BTREE_LEVEL_DATA + 1) +#define NILFS_BTREE_LEVEL_MAX 14 /* Max level (exclusive) */ + +/** + * struct nilfs_direct_node - header of built-in bmap array + * @dn_flags: flags + * @dn_pad: padding + */ +struct nilfs_direct_node { + __u8 dn_flags; + __u8 pad[7]; +}; + +/** + * struct nilfs_palloc_group_desc - block group descriptor + * @pg_nfrees: number of free entries in block group 
+ */ +struct nilfs_palloc_group_desc { + __le32 pg_nfrees; +}; + +/** + * struct nilfs_dat_entry - disk address translation entry + * @de_blocknr: block number + * @de_start: start checkpoint number + * @de_end: end checkpoint number + * @de_rsv: reserved for future use + */ +struct nilfs_dat_entry { + __le64 de_blocknr; + __le64 de_start; + __le64 de_end; + __le64 de_rsv; +}; + +#define NILFS_MIN_DAT_ENTRY_SIZE 32 + +/** + * struct nilfs_snapshot_list - snapshot list + * @ssl_next: next checkpoint number on snapshot list + * @ssl_prev: previous checkpoint number on snapshot list + */ +struct nilfs_snapshot_list { + __le64 ssl_next; + __le64 ssl_prev; +}; + +/** + * struct nilfs_checkpoint - checkpoint structure + * @cp_flags: flags + * @cp_checkpoints_count: checkpoints count in a block + * @cp_snapshot_list: snapshot list + * @cp_cno: checkpoint number + * @cp_create: creation timestamp + * @cp_nblk_inc: number of blocks incremented by this checkpoint + * @cp_inodes_count: inodes count + * @cp_blocks_count: blocks count + * @cp_ifile_inode: inode of ifile + */ +struct nilfs_checkpoint { + __le32 cp_flags; + __le32 cp_checkpoints_count; + struct nilfs_snapshot_list cp_snapshot_list; + __le64 cp_cno; + __le64 cp_create; + __le64 cp_nblk_inc; + __le64 cp_inodes_count; + __le64 cp_blocks_count; + + /* + * Do not change the byte offset of ifile inode. + * To keep the compatibility of the disk format, + * additional fields should be added behind cp_ifile_inode. + */ + struct nilfs_inode cp_ifile_inode; +}; + +#define NILFS_MIN_CHECKPOINT_SIZE (64 + NILFS_MIN_INODE_SIZE) + +/* checkpoint flags */ +enum { + NILFS_CHECKPOINT_SNAPSHOT, + NILFS_CHECKPOINT_INVALID, + NILFS_CHECKPOINT_SKETCH, + NILFS_CHECKPOINT_MINOR, +}; + +#define NILFS_CHECKPOINT_FNS(flag, name) \ +static inline void \ +nilfs_checkpoint_set_##name(struct nilfs_checkpoint *cp) \ +{ \ + cp->cp_flags = cpu_to_le32(le32_to_cpu(cp->cp_flags) | \ + (1UL << NILFS_CHECKPOINT_##flag)); \ +} \ +static inline void \ +nilfs_checkpoint_clear_##name(struct nilfs_checkpoint *cp) \ +{ \ + cp->cp_flags = cpu_to_le32(le32_to_cpu(cp->cp_flags) & \ + ~(1UL << NILFS_CHECKPOINT_##flag)); \ +} \ +static inline int \ +nilfs_checkpoint_##name(const struct nilfs_checkpoint *cp) \ +{ \ + return !!(le32_to_cpu(cp->cp_flags) & \ + (1UL << NILFS_CHECKPOINT_##flag)); \ +} + +NILFS_CHECKPOINT_FNS(SNAPSHOT, snapshot) +NILFS_CHECKPOINT_FNS(INVALID, invalid) +NILFS_CHECKPOINT_FNS(MINOR, minor) + +/** + * struct nilfs_cpfile_header - checkpoint file header + * @ch_ncheckpoints: number of checkpoints + * @ch_nsnapshots: number of snapshots + * @ch_snapshot_list: snapshot list + */ +struct nilfs_cpfile_header { + __le64 ch_ncheckpoints; + __le64 ch_nsnapshots; + struct nilfs_snapshot_list ch_snapshot_list; +}; + +#define NILFS_CPFILE_FIRST_CHECKPOINT_OFFSET \ + ((sizeof(struct nilfs_cpfile_header) + \ + sizeof(struct nilfs_checkpoint) - 1) / \ + sizeof(struct nilfs_checkpoint)) + +/** + * struct nilfs_segment_usage - segment usage + * @su_lastmod: last modified timestamp + * @su_nblocks: number of blocks in segment + * @su_flags: flags + */ +struct nilfs_segment_usage { + __le64 su_lastmod; + __le32 su_nblocks; + __le32 su_flags; +}; + +#define NILFS_MIN_SEGMENT_USAGE_SIZE 16 + +/* segment usage flag */ +enum { + NILFS_SEGMENT_USAGE_ACTIVE, + NILFS_SEGMENT_USAGE_DIRTY, + NILFS_SEGMENT_USAGE_ERROR, +}; + +#define NILFS_SEGMENT_USAGE_FNS(flag, name) \ +static inline void \ +nilfs_segment_usage_set_##name(struct nilfs_segment_usage *su) \ +{ \ + su->su_flags = 
cpu_to_le32(le32_to_cpu(su->su_flags) | \ + (1UL << NILFS_SEGMENT_USAGE_##flag));\ +} \ +static inline void \ +nilfs_segment_usage_clear_##name(struct nilfs_segment_usage *su) \ +{ \ + su->su_flags = \ + cpu_to_le32(le32_to_cpu(su->su_flags) & \ + ~(1UL << NILFS_SEGMENT_USAGE_##flag)); \ +} \ +static inline int \ +nilfs_segment_usage_##name(const struct nilfs_segment_usage *su) \ +{ \ + return !!(le32_to_cpu(su->su_flags) & \ + (1UL << NILFS_SEGMENT_USAGE_##flag)); \ +} + +NILFS_SEGMENT_USAGE_FNS(ACTIVE, active) +NILFS_SEGMENT_USAGE_FNS(DIRTY, dirty) +NILFS_SEGMENT_USAGE_FNS(ERROR, error) + +static inline void +nilfs_segment_usage_set_clean(struct nilfs_segment_usage *su) +{ + su->su_lastmod = cpu_to_le64(0); + su->su_nblocks = cpu_to_le32(0); + su->su_flags = cpu_to_le32(0); +} + +static inline int +nilfs_segment_usage_clean(const struct nilfs_segment_usage *su) +{ + return !le32_to_cpu(su->su_flags); +} + +/** + * struct nilfs_sufile_header - segment usage file header + * @sh_ncleansegs: number of clean segments + * @sh_ndirtysegs: number of dirty segments + * @sh_last_alloc: last allocated segment number + */ +struct nilfs_sufile_header { + __le64 sh_ncleansegs; + __le64 sh_ndirtysegs; + __le64 sh_last_alloc; + /* ... */ +}; + +#define NILFS_SUFILE_FIRST_SEGMENT_USAGE_OFFSET \ + ((sizeof(struct nilfs_sufile_header) + \ + sizeof(struct nilfs_segment_usage) - 1) / \ + sizeof(struct nilfs_segment_usage)) + +#endif /* _LINUX_NILFS2_ONDISK_H */ -- cgit v1.2.3-71-gd317 From b6e8d4aa1110306378af0f3472a6b85a1f039a16 Mon Sep 17 00:00:00 2001 From: Alexandre Bounine Date: Tue, 2 Aug 2016 14:06:25 -0700 Subject: rapidio: add RapidIO channelized messaging driver Add channelized messaging driver to support native RapidIO messaging exchange between multiple senders/recipients on devices that use kernel RapidIO subsystem services. This device driver is the result of collaboration within the RapidIO.org Software Task Group (STG) between Texas Instruments, Prodrive Technologies, Nokia Networks, BAE and IDT. Additional input was received from other members of RapidIO.org. The objective was to create a character mode driver interface which exposes messaging capabilities of RapidIO endpoint devices (mports) directly to applications, in a manner that allows the numerous and varied RapidIO implementations to interoperate. This char mode device driver allows user-space applications to setup messaging communication channels using single shared RapidIO messaging mailbox. By default this driver uses RapidIO MBOX_1 (MBOX_0 is reserved for use by RIONET Ethernet emulation driver). 
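For illustration, on a RapidIO network where the default mailbox is already
claimed by another driver, or where low channel numbers are reserved for other
software, the module could be loaded on each node with a command such as
'modprobe rio_cm cmbox=2 chstart=1024', which selects mailbox 2 for channelized
messaging and keeps channel numbers below 1024 out of dynamic allocation (both
module parameters are defined in rio_cm.c and described in the documentation
file added below).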
[weiyj.lk@gmail.com: rapidio/rio_cm: fix return value check in riocm_init()]
Link: http://lkml.kernel.org/r/1469198221-21970-1-git-send-email-alexandre.bounine@idt.com
Link: http://lkml.kernel.org/r/1468952862-18056-1-git-send-email-alexandre.bounine@idt.com
Signed-off-by: Alexandre Bounine
Tested-by: Barry Wood
Cc: Matt Porter
Cc: Aurelien Jacquiot
Cc: Andre van Herk
Cc: Barry Wood
Signed-off-by: Andrew Morton
Signed-off-by: Linus Torvalds
---
 Documentation/rapidio/rio_cm.txt | 119 ++
 drivers/rapidio/Kconfig | 9 +
 drivers/rapidio/Makefile | 1 +
 drivers/rapidio/rio_cm.c | 2366 ++++++++++++++++++++++++++++++++++++++
 include/uapi/linux/Kbuild | 1 +
 include/uapi/linux/rio_cm_cdev.h | 78 ++
 6 files changed, 2574 insertions(+)
 create mode 100644 Documentation/rapidio/rio_cm.txt
 create mode 100644 drivers/rapidio/rio_cm.c
 create mode 100644 include/uapi/linux/rio_cm_cdev.h
(limited to 'include/uapi/linux')

diff --git a/Documentation/rapidio/rio_cm.txt b/Documentation/rapidio/rio_cm.txt
new file mode 100644
index 000000000000..27aa401f1126
--- /dev/null
+++ b/Documentation/rapidio/rio_cm.txt
@@ -0,0 +1,119 @@
+RapidIO subsystem Channelized Messaging character device driver (rio_cm.c)
+==========================================================================
+
+Version History:
+----------------
+  1.0.0 - Initial driver release.
+
+==========================================================================
+
+I. Overview
+
+This device driver is the result of collaboration within the RapidIO.org
+Software Task Group (STG) between Texas Instruments, Prodrive Technologies,
+Nokia Networks, BAE and IDT. Additional input was received from other members
+of RapidIO.org.
+
+The objective was to create a character mode driver interface which exposes
+messaging capabilities of RapidIO endpoint devices (mports) directly
+to applications, in a manner that allows the numerous and varied RapidIO
+implementations to interoperate.
+
+This driver (RIO_CM) gives user-space applications shared access to
+RapidIO mailbox messaging resources.
+
+The RapidIO specification (Part 2) allows endpoint devices to have up to four
+messaging mailboxes when multi-packet messages (up to 4KB) are used, and up to
+64 mailboxes when single-packet messages (up to 256 B) are used. In addition
+to these protocol limits, a particular hardware implementation can have a
+reduced number of messaging mailboxes. RapidIO-aware applications must
+therefore share the messaging resources of a RapidIO endpoint.
+
+The main purpose of this device driver is to provide RapidIO mailbox messaging
+capability to a large number of user-space processes by introducing socket-like
+operations that use a single messaging mailbox. This allows applications to
+use the limited RapidIO messaging hardware resources efficiently.
+
+Most of the device driver's operations are performed through 'ioctl' system
+calls.
+
+When loaded, this device driver creates a single file system node named rio_cm
+in the /dev directory that is common to all registered RapidIO mport devices.
+
+The following ioctl commands are available to user-space applications:
+
+- RIO_CM_MPORT_GET_LIST : Returns to the caller a list of local mport devices
+  that support messaging operations (up to RIO_MAX_MPORTS entries). Each list
+  entry is a combination of the mport's index in the system and the RapidIO
+  destination ID assigned to the port.
+- RIO_CM_EP_GET_LIST_SIZE : Returns the number of messaging-capable remote
+  endpoints in the RapidIO network associated with the specified mport device.
+- RIO_CM_EP_GET_LIST : Returns a list of RapidIO destination IDs of the
+  messaging-capable remote endpoints (peers) available in the RapidIO network
+  associated with the specified mport device.
+- RIO_CM_CHAN_CREATE : Creates a RapidIO message exchange channel data
+  structure, with the channel ID assigned automatically or as requested by the
+  caller.
+- RIO_CM_CHAN_BIND : Binds the specified channel data structure to the
+  specified mport device.
+- RIO_CM_CHAN_LISTEN : Enables listening for connection requests on the
+  specified channel.
+- RIO_CM_CHAN_ACCEPT : Accepts a connection request from a peer on the
+  specified channel. If a wait timeout for this request is specified by the
+  caller, this is a blocking call. If the timeout is set to 0 the call is
+  non-blocking: the ioctl handler checks for a pending connection request and,
+  if none is available, returns -EAGAIN immediately.
+- RIO_CM_CHAN_CONNECT : Sends a connection request to a remote peer/channel.
+- RIO_CM_CHAN_SEND : Sends a data message through the specified channel.
+  The handler for this request assumes that the message buffer specified by
+  the caller includes the space reserved for the packet header required by
+  this driver.
+- RIO_CM_CHAN_RECEIVE : Receives a data message through a connected channel.
+  If the channel does not have an incoming message ready to return, this ioctl
+  handler waits for a new message until the timeout specified by the caller
+  expires. If the timeout value is set to 0, the ioctl handler uses a default
+  value defined by MAX_SCHEDULE_TIMEOUT.
+- RIO_CM_CHAN_CLOSE : Closes the specified channel and frees the associated
+  buffers. If the channel is in the CONNECTED state, a close notification is
+  sent to the remote peer.
+
+The ioctl command codes and the corresponding data structures intended for use
+by user-space applications are defined in 'include/uapi/linux/rio_cm_cdev.h'.
+
+II. Hardware Compatibility
+
+This device driver uses the standard interfaces defined by the kernel RapidIO
+subsystem and therefore can be used with any mport device driver registered by
+the RapidIO subsystem, within the limits set by the mport hardware's
+implementation of messaging mailboxes.
+
+III. Module parameters
+
+- 'dbg_level' - Controls the amount of debug information generated by this
+  device driver. The parameter is formed by a set of bit masks that correspond
+  to specific functional blocks; for mask definitions see
+  'drivers/rapidio/rio_cm.c'. This parameter can be changed dynamically.
+  Use CONFIG_RAPIDIO_DEBUG=y to enable debug output at the top level.
+
+- 'cmbox' - Number of the RapidIO mailbox to use (default value is 1).
+  This parameter sets the messaging mailbox number that will be used within
+  the entire RapidIO network. It can be used when the default mailbox is
+  claimed by other device drivers or is not supported by some nodes in the
+  RapidIO network.
+
+- 'chstart' - Start channel number for dynamic assignment (default 256).
+  Excludes channel numbers below this value from dynamic allocation, to avoid
+  conflicts with software components that use reserved predefined channel
+  numbers.
+
+IV. Known problems
+
+  None.
+
+V. User-space Applications and API Library
+
+The messaging API library and the applications that use this device driver are
+available from RapidIO.org.
+
+VI. TODO List
+
+- Add support for system notification messages (reserved channel 0).
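As a concrete illustration of the ioctl sequence above, the following minimal
sketch shows how a server-side application might create, bind and listen on a
channel and then shut it down. It is a sketch only: it assumes the RIO_CM_*
command macros and struct rio_cm_channel from
'include/uapi/linux/rio_cm_cdev.h' (added by this patch but not reproduced
here), assumes RIO_CM_CHAN_LISTEN takes the channel number by pointer like
RIO_CM_CHAN_CLOSE, uses mport 0 purely as an example, and leaves out the
accept/send/receive requests whose argument layout is defined in that header.

  #include <stdio.h>
  #include <stdint.h>
  #include <fcntl.h>
  #include <unistd.h>
  #include <sys/ioctl.h>
  #include <linux/rio_cm_cdev.h>  /* RIO_CM_* ioctls, struct rio_cm_channel */

  int main(void)
  {
          struct rio_cm_channel chan = { 0 };
          uint16_t ch = 0;        /* 0 = let the driver assign a channel ID */
          int fd = open("/dev/rio_cm", O_RDWR);

          if (fd < 0) {
                  perror("open /dev/rio_cm");
                  return 1;
          }

          /* Create a channel; the driver writes the assigned ID back. */
          if (ioctl(fd, RIO_CM_CHAN_CREATE, &ch) < 0) {
                  perror("RIO_CM_CHAN_CREATE");
                  goto out;
          }

          /* Bind the channel to local mport 0 (example value). */
          chan.id = ch;
          chan.mport_id = 0;
          if (ioctl(fd, RIO_CM_CHAN_BIND, &chan) < 0) {
                  perror("RIO_CM_CHAN_BIND");
                  goto close_ch;
          }

          /*
           * Listen for connection requests; a real server would now take
           * them with RIO_CM_CHAN_ACCEPT and exchange data over the
           * returned channel.
           */
          if (ioctl(fd, RIO_CM_CHAN_LISTEN, &ch) < 0)
                  perror("RIO_CM_CHAN_LISTEN");

  close_ch:
          ioctl(fd, RIO_CM_CHAN_CLOSE, &ch);
  out:
          close(fd);
          return 0;
  }

A connecting peer would follow the same create/bind steps and then issue
RIO_CM_CHAN_CONNECT with the remote destination ID and channel number, as
described in the command list above.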
diff --git a/drivers/rapidio/Kconfig b/drivers/rapidio/Kconfig index b5a10d3c92c7..d6d2f20c4597 100644 --- a/drivers/rapidio/Kconfig +++ b/drivers/rapidio/Kconfig @@ -67,6 +67,15 @@ config RAPIDIO_ENUM_BASIC endchoice +config RAPIDIO_CHMAN + tristate "RapidIO Channelized Messaging driver" + depends on RAPIDIO + help + This option includes RapidIO channelized messaging driver which + provides socket-like interface to allow sharing of single RapidIO + messaging mailbox between multiple user-space applications. + See "Documentation/rapidio/rio_cm.txt" for driver description. + config RAPIDIO_MPORT_CDEV tristate "RapidIO /dev mport device driver" depends on RAPIDIO diff --git a/drivers/rapidio/Makefile b/drivers/rapidio/Makefile index 6271ada6993f..74dcea45ad49 100644 --- a/drivers/rapidio/Makefile +++ b/drivers/rapidio/Makefile @@ -5,6 +5,7 @@ obj-$(CONFIG_RAPIDIO) += rapidio.o rapidio-y := rio.o rio-access.o rio-driver.o rio-sysfs.o obj-$(CONFIG_RAPIDIO_ENUM_BASIC) += rio-scan.o +obj-$(CONFIG_RAPIDIO_CHMAN) += rio_cm.o obj-$(CONFIG_RAPIDIO) += switches/ obj-$(CONFIG_RAPIDIO) += devices/ diff --git a/drivers/rapidio/rio_cm.c b/drivers/rapidio/rio_cm.c new file mode 100644 index 000000000000..cecc15a880de --- /dev/null +++ b/drivers/rapidio/rio_cm.c @@ -0,0 +1,2366 @@ +/* + * rio_cm - RapidIO Channelized Messaging Driver + * + * Copyright 2013-2016 Integrated Device Technology, Inc. + * Copyright (c) 2015, Prodrive Technologies + * Copyright (c) 2015, RapidIO Trade Association + * + * This program is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License as published by the + * Free Software Foundation; either version 2 of the License, or (at your + * option) any later version. + * + * THIS PROGRAM IS DISTRIBUTED IN THE HOPE THAT IT WILL BE USEFUL, + * BUT WITHOUT ANY WARRANTY; WITHOUT EVEN THE IMPLIED WARRANTY OF + * MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE. SEE THE + * GNU GENERAL PUBLIC LICENSE FOR MORE DETAILS. + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#define DRV_NAME "rio_cm" +#define DRV_VERSION "1.0.0" +#define DRV_AUTHOR "Alexandre Bounine " +#define DRV_DESC "RapidIO Channelized Messaging Driver" +#define DEV_NAME "rio_cm" + +/* Debug output filtering masks */ +enum { + DBG_NONE = 0, + DBG_INIT = BIT(0), /* driver init */ + DBG_EXIT = BIT(1), /* driver exit */ + DBG_MPORT = BIT(2), /* mport add/remove */ + DBG_RDEV = BIT(3), /* RapidIO device add/remove */ + DBG_CHOP = BIT(4), /* channel operations */ + DBG_WAIT = BIT(5), /* waiting for events */ + DBG_TX = BIT(6), /* message TX */ + DBG_TX_EVENT = BIT(7), /* message TX event */ + DBG_RX_DATA = BIT(8), /* inbound data messages */ + DBG_RX_CMD = BIT(9), /* inbound REQ/ACK/NACK messages */ + DBG_ALL = ~0, +}; + +#ifdef DEBUG +#define riocm_debug(level, fmt, arg...) \ + do { \ + if (DBG_##level & dbg_level) \ + pr_debug(DRV_NAME ": %s " fmt "\n", \ + __func__, ##arg); \ + } while (0) +#else +#define riocm_debug(level, fmt, arg...) \ + no_printk(KERN_DEBUG pr_fmt(DRV_NAME fmt "\n"), ##arg) +#endif + +#define riocm_warn(fmt, arg...) \ + pr_warn(DRV_NAME ": %s WARNING " fmt "\n", __func__, ##arg) + +#define riocm_error(fmt, arg...) 
\ + pr_err(DRV_NAME ": %s ERROR " fmt "\n", __func__, ##arg) + + +static int cmbox = 1; +module_param(cmbox, int, S_IRUGO); +MODULE_PARM_DESC(cmbox, "RapidIO Mailbox number (default 1)"); + +static int chstart = 256; +module_param(chstart, int, S_IRUGO); +MODULE_PARM_DESC(chstart, + "Start channel number for dynamic allocation (default 256)"); + +#ifdef DEBUG +static u32 dbg_level = DBG_NONE; +module_param(dbg_level, uint, S_IWUSR | S_IRUGO); +MODULE_PARM_DESC(dbg_level, "Debugging output level (default 0 = none)"); +#endif + +MODULE_AUTHOR(DRV_AUTHOR); +MODULE_DESCRIPTION(DRV_DESC); +MODULE_LICENSE("GPL"); +MODULE_VERSION(DRV_VERSION); + +#define RIOCM_TX_RING_SIZE 128 +#define RIOCM_RX_RING_SIZE 128 +#define RIOCM_CONNECT_TO 3 /* connect response TO (in sec) */ + +#define RIOCM_MAX_CHNUM 0xffff /* Use full range of u16 field */ +#define RIOCM_CHNUM_AUTO 0 +#define RIOCM_MAX_EP_COUNT 0x10000 /* Max number of endpoints */ + +enum rio_cm_state { + RIO_CM_IDLE, + RIO_CM_CONNECT, + RIO_CM_CONNECTED, + RIO_CM_DISCONNECT, + RIO_CM_CHAN_BOUND, + RIO_CM_LISTEN, + RIO_CM_DESTROYING, +}; + +enum rio_cm_pkt_type { + RIO_CM_SYS = 0xaa, + RIO_CM_CHAN = 0x55, +}; + +enum rio_cm_chop { + CM_CONN_REQ, + CM_CONN_ACK, + CM_CONN_CLOSE, + CM_DATA_MSG, +}; + +struct rio_ch_base_bhdr { + u32 src_id; + u32 dst_id; +#define RIO_HDR_LETTER_MASK 0xffff0000 +#define RIO_HDR_MBOX_MASK 0x0000ffff + u8 src_mbox; + u8 dst_mbox; + u8 type; +} __attribute__((__packed__)); + +struct rio_ch_chan_hdr { + struct rio_ch_base_bhdr bhdr; + u8 ch_op; + u16 dst_ch; + u16 src_ch; + u16 msg_len; + u16 rsrvd; +} __attribute__((__packed__)); + +struct tx_req { + struct list_head node; + struct rio_dev *rdev; + void *buffer; + size_t len; +}; + +struct cm_dev { + struct list_head list; + struct rio_mport *mport; + void *rx_buf[RIOCM_RX_RING_SIZE]; + int rx_slots; + struct mutex rx_lock; + + void *tx_buf[RIOCM_TX_RING_SIZE]; + int tx_slot; + int tx_cnt; + int tx_ack_slot; + struct list_head tx_reqs; + spinlock_t tx_lock; + + struct list_head peers; + u32 npeers; + struct workqueue_struct *rx_wq; + struct work_struct rx_work; +}; + +struct chan_rx_ring { + void *buf[RIOCM_RX_RING_SIZE]; + int head; + int tail; + int count; + + /* Tracking RX buffers reported to upper level */ + void *inuse[RIOCM_RX_RING_SIZE]; + int inuse_cnt; +}; + +struct rio_channel { + u16 id; /* local channel ID */ + struct kref ref; /* channel refcount */ + struct file *filp; + struct cm_dev *cmdev; /* associated CM device object */ + struct rio_dev *rdev; /* remote RapidIO device */ + enum rio_cm_state state; + int error; + spinlock_t lock; + void *context; + u32 loc_destid; /* local destID */ + u32 rem_destid; /* remote destID */ + u16 rem_channel; /* remote channel ID */ + struct list_head accept_queue; + struct list_head ch_node; + struct completion comp; + struct completion comp_close; + struct chan_rx_ring rx_ring; +}; + +struct cm_peer { + struct list_head node; + struct rio_dev *rdev; +}; + +struct rio_cm_work { + struct work_struct work; + struct cm_dev *cm; + void *data; +}; + +struct conn_req { + struct list_head node; + u32 destid; /* requester destID */ + u16 chan; /* requester channel ID */ + struct cm_dev *cmdev; +}; + +/* + * A channel_dev structure represents a CM_CDEV + * @cdev Character device + * @dev Associated device object + */ +struct channel_dev { + struct cdev cdev; + struct device *dev; +}; + +static struct rio_channel *riocm_ch_alloc(u16 ch_num); +static void riocm_ch_free(struct kref *ref); +static int riocm_post_send(struct cm_dev 
*cm, struct rio_dev *rdev, + void *buffer, size_t len); +static int riocm_ch_close(struct rio_channel *ch); + +static DEFINE_SPINLOCK(idr_lock); +static DEFINE_IDR(ch_idr); + +static LIST_HEAD(cm_dev_list); +static DECLARE_RWSEM(rdev_sem); + +static struct class *dev_class; +static unsigned int dev_major; +static unsigned int dev_minor_base; +static dev_t dev_number; +static struct channel_dev riocm_cdev; + +#define is_msg_capable(src_ops, dst_ops) \ + ((src_ops & RIO_SRC_OPS_DATA_MSG) && \ + (dst_ops & RIO_DST_OPS_DATA_MSG)) +#define dev_cm_capable(dev) \ + is_msg_capable(dev->src_ops, dev->dst_ops) + +static int riocm_cmp(struct rio_channel *ch, enum rio_cm_state cmp) +{ + int ret; + + spin_lock_bh(&ch->lock); + ret = (ch->state == cmp); + spin_unlock_bh(&ch->lock); + return ret; +} + +static int riocm_cmp_exch(struct rio_channel *ch, + enum rio_cm_state cmp, enum rio_cm_state exch) +{ + int ret; + + spin_lock_bh(&ch->lock); + ret = (ch->state == cmp); + if (ret) + ch->state = exch; + spin_unlock_bh(&ch->lock); + return ret; +} + +static enum rio_cm_state riocm_exch(struct rio_channel *ch, + enum rio_cm_state exch) +{ + enum rio_cm_state old; + + spin_lock_bh(&ch->lock); + old = ch->state; + ch->state = exch; + spin_unlock_bh(&ch->lock); + return old; +} + +static struct rio_channel *riocm_get_channel(u16 nr) +{ + struct rio_channel *ch; + + spin_lock_bh(&idr_lock); + ch = idr_find(&ch_idr, nr); + if (ch) + kref_get(&ch->ref); + spin_unlock_bh(&idr_lock); + return ch; +} + +static void riocm_put_channel(struct rio_channel *ch) +{ + kref_put(&ch->ref, riocm_ch_free); +} + +static void *riocm_rx_get_msg(struct cm_dev *cm) +{ + void *msg; + int i; + + msg = rio_get_inb_message(cm->mport, cmbox); + if (msg) { + for (i = 0; i < RIOCM_RX_RING_SIZE; i++) { + if (cm->rx_buf[i] == msg) { + cm->rx_buf[i] = NULL; + cm->rx_slots++; + break; + } + } + + if (i == RIOCM_RX_RING_SIZE) + riocm_warn("no record for buffer 0x%p", msg); + } + + return msg; +} + +/* + * riocm_rx_fill - fills a ring of receive buffers for given cm device + * @cm: cm_dev object + * @nent: max number of entries to fill + * + * Returns: none + */ +static void riocm_rx_fill(struct cm_dev *cm, int nent) +{ + int i; + + if (cm->rx_slots == 0) + return; + + for (i = 0; i < RIOCM_RX_RING_SIZE && cm->rx_slots && nent; i++) { + if (cm->rx_buf[i] == NULL) { + cm->rx_buf[i] = kmalloc(RIO_MAX_MSG_SIZE, GFP_KERNEL); + if (cm->rx_buf[i] == NULL) + break; + rio_add_inb_buffer(cm->mport, cmbox, cm->rx_buf[i]); + cm->rx_slots--; + nent--; + } + } +} + +/* + * riocm_rx_free - frees all receive buffers associated with given cm device + * @cm: cm_dev object + * + * Returns: none + */ +static void riocm_rx_free(struct cm_dev *cm) +{ + int i; + + for (i = 0; i < RIOCM_RX_RING_SIZE; i++) { + if (cm->rx_buf[i] != NULL) { + kfree(cm->rx_buf[i]); + cm->rx_buf[i] = NULL; + } + } +} + +/* + * riocm_req_handler - connection request handler + * @cm: cm_dev object + * @req_data: pointer to the request packet + * + * Returns: 0 if success, or + * -EINVAL if channel is not in correct state, + * -ENODEV if cannot find a channel with specified ID, + * -ENOMEM if unable to allocate memory to store the request + */ +static int riocm_req_handler(struct cm_dev *cm, void *req_data) +{ + struct rio_channel *ch; + struct conn_req *req; + struct rio_ch_chan_hdr *hh = req_data; + u16 chnum; + + chnum = ntohs(hh->dst_ch); + + ch = riocm_get_channel(chnum); + + if (!ch) + return -ENODEV; + + if (ch->state != RIO_CM_LISTEN) { + riocm_debug(RX_CMD, "channel %d is not in 
listen state", chnum); + riocm_put_channel(ch); + return -EINVAL; + } + + req = kzalloc(sizeof(*req), GFP_KERNEL); + if (!req) { + riocm_put_channel(ch); + return -ENOMEM; + } + + req->destid = ntohl(hh->bhdr.src_id); + req->chan = ntohs(hh->src_ch); + req->cmdev = cm; + + spin_lock_bh(&ch->lock); + list_add_tail(&req->node, &ch->accept_queue); + spin_unlock_bh(&ch->lock); + complete(&ch->comp); + riocm_put_channel(ch); + + return 0; +} + +/* + * riocm_resp_handler - response to connection request handler + * @resp_data: pointer to the response packet + * + * Returns: 0 if success, or + * -EINVAL if channel is not in correct state, + * -ENODEV if cannot find a channel with specified ID, + */ +static int riocm_resp_handler(void *resp_data) +{ + struct rio_channel *ch; + struct rio_ch_chan_hdr *hh = resp_data; + u16 chnum; + + chnum = ntohs(hh->dst_ch); + ch = riocm_get_channel(chnum); + if (!ch) + return -ENODEV; + + if (ch->state != RIO_CM_CONNECT) { + riocm_put_channel(ch); + return -EINVAL; + } + + riocm_exch(ch, RIO_CM_CONNECTED); + ch->rem_channel = ntohs(hh->src_ch); + complete(&ch->comp); + riocm_put_channel(ch); + + return 0; +} + +/* + * riocm_close_handler - channel close request handler + * @req_data: pointer to the request packet + * + * Returns: 0 if success, or + * -ENODEV if cannot find a channel with specified ID, + * + error codes returned by riocm_ch_close. + */ +static int riocm_close_handler(void *data) +{ + struct rio_channel *ch; + struct rio_ch_chan_hdr *hh = data; + int ret; + + riocm_debug(RX_CMD, "for ch=%d", ntohs(hh->dst_ch)); + + spin_lock_bh(&idr_lock); + ch = idr_find(&ch_idr, ntohs(hh->dst_ch)); + if (!ch) { + spin_unlock_bh(&idr_lock); + return -ENODEV; + } + idr_remove(&ch_idr, ch->id); + spin_unlock_bh(&idr_lock); + + riocm_exch(ch, RIO_CM_DISCONNECT); + + ret = riocm_ch_close(ch); + if (ret) + riocm_debug(RX_CMD, "riocm_ch_close() returned %d", ret); + + return 0; +} + +/* + * rio_cm_handler - function that services request (non-data) packets + * @cm: cm_dev object + * @data: pointer to the packet + */ +static void rio_cm_handler(struct cm_dev *cm, void *data) +{ + struct rio_ch_chan_hdr *hdr; + + if (!rio_mport_is_running(cm->mport)) + goto out; + + hdr = data; + + riocm_debug(RX_CMD, "OP=%x for ch=%d from %d", + hdr->ch_op, ntohs(hdr->dst_ch), ntohs(hdr->src_ch)); + + switch (hdr->ch_op) { + case CM_CONN_REQ: + riocm_req_handler(cm, data); + break; + case CM_CONN_ACK: + riocm_resp_handler(data); + break; + case CM_CONN_CLOSE: + riocm_close_handler(data); + break; + default: + riocm_error("Invalid packet header"); + break; + } +out: + kfree(data); +} + +/* + * rio_rx_data_handler - received data packet handler + * @cm: cm_dev object + * @buf: data packet + * + * Returns: 0 if success, or + * -ENODEV if cannot find a channel with specified ID, + * -EIO if channel is not in CONNECTED state, + * -ENOMEM if channel RX queue is full (packet discarded) + */ +static int rio_rx_data_handler(struct cm_dev *cm, void *buf) +{ + struct rio_ch_chan_hdr *hdr; + struct rio_channel *ch; + + hdr = buf; + + riocm_debug(RX_DATA, "for ch=%d", ntohs(hdr->dst_ch)); + + ch = riocm_get_channel(ntohs(hdr->dst_ch)); + if (!ch) { + /* Discard data message for non-existing channel */ + kfree(buf); + return -ENODEV; + } + + /* Place pointer to the buffer into channel's RX queue */ + spin_lock(&ch->lock); + + if (ch->state != RIO_CM_CONNECTED) { + /* Channel is not ready to receive data, discard a packet */ + riocm_debug(RX_DATA, "ch=%d is in wrong state=%d", + ch->id, ch->state); + 
spin_unlock(&ch->lock); + kfree(buf); + riocm_put_channel(ch); + return -EIO; + } + + if (ch->rx_ring.count == RIOCM_RX_RING_SIZE) { + /* If RX ring is full, discard a packet */ + riocm_debug(RX_DATA, "ch=%d is full", ch->id); + spin_unlock(&ch->lock); + kfree(buf); + riocm_put_channel(ch); + return -ENOMEM; + } + + ch->rx_ring.buf[ch->rx_ring.head] = buf; + ch->rx_ring.head++; + ch->rx_ring.count++; + ch->rx_ring.head %= RIOCM_RX_RING_SIZE; + + complete(&ch->comp); + + spin_unlock(&ch->lock); + riocm_put_channel(ch); + + return 0; +} + +/* + * rio_ibmsg_handler - inbound message packet handler + */ +static void rio_ibmsg_handler(struct work_struct *work) +{ + struct cm_dev *cm = container_of(work, struct cm_dev, rx_work); + void *data; + struct rio_ch_chan_hdr *hdr; + + if (!rio_mport_is_running(cm->mport)) + return; + + while (1) { + mutex_lock(&cm->rx_lock); + data = riocm_rx_get_msg(cm); + if (data) + riocm_rx_fill(cm, 1); + mutex_unlock(&cm->rx_lock); + + if (data == NULL) + break; + + hdr = data; + + if (hdr->bhdr.type != RIO_CM_CHAN) { + /* For now simply discard packets other than channel */ + riocm_error("Unsupported TYPE code (0x%x). Msg dropped", + hdr->bhdr.type); + kfree(data); + continue; + } + + /* Process a channel message */ + if (hdr->ch_op == CM_DATA_MSG) + rio_rx_data_handler(cm, data); + else + rio_cm_handler(cm, data); + } +} + +static void riocm_inb_msg_event(struct rio_mport *mport, void *dev_id, + int mbox, int slot) +{ + struct cm_dev *cm = dev_id; + + if (rio_mport_is_running(cm->mport) && !work_pending(&cm->rx_work)) + queue_work(cm->rx_wq, &cm->rx_work); +} + +/* + * rio_txcq_handler - TX completion handler + * @cm: cm_dev object + * @slot: TX queue slot + * + * TX completion handler also ensures that pending request packets are placed + * into transmit queue as soon as a free slot becomes available. This is done + * to give higher priority to request packets during high intensity data flow. + */ +static void rio_txcq_handler(struct cm_dev *cm, int slot) +{ + int ack_slot; + + /* ATTN: Add TX completion notification if/when direct buffer + * transfer is implemented. At this moment only correct tracking + * of tx_count is important. 
+ */ + riocm_debug(TX_EVENT, "for mport_%d slot %d tx_cnt %d", + cm->mport->id, slot, cm->tx_cnt); + + spin_lock(&cm->tx_lock); + ack_slot = cm->tx_ack_slot; + + if (ack_slot == slot) + riocm_debug(TX_EVENT, "slot == ack_slot"); + + while (cm->tx_cnt && ((ack_slot != slot) || + (cm->tx_cnt == RIOCM_TX_RING_SIZE))) { + + cm->tx_buf[ack_slot] = NULL; + ++ack_slot; + ack_slot &= (RIOCM_TX_RING_SIZE - 1); + cm->tx_cnt--; + } + + if (cm->tx_cnt < 0 || cm->tx_cnt > RIOCM_TX_RING_SIZE) + riocm_error("tx_cnt %d out of sync", cm->tx_cnt); + + WARN_ON((cm->tx_cnt < 0) || (cm->tx_cnt > RIOCM_TX_RING_SIZE)); + + cm->tx_ack_slot = ack_slot; + + /* + * If there are pending requests, insert them into transmit queue + */ + if (!list_empty(&cm->tx_reqs) && (cm->tx_cnt < RIOCM_TX_RING_SIZE)) { + struct tx_req *req, *_req; + int rc; + + list_for_each_entry_safe(req, _req, &cm->tx_reqs, node) { + list_del(&req->node); + cm->tx_buf[cm->tx_slot] = req->buffer; + rc = rio_add_outb_message(cm->mport, req->rdev, cmbox, + req->buffer, req->len); + kfree(req->buffer); + kfree(req); + + ++cm->tx_cnt; + ++cm->tx_slot; + cm->tx_slot &= (RIOCM_TX_RING_SIZE - 1); + if (cm->tx_cnt == RIOCM_TX_RING_SIZE) + break; + } + } + + spin_unlock(&cm->tx_lock); +} + +static void riocm_outb_msg_event(struct rio_mport *mport, void *dev_id, + int mbox, int slot) +{ + struct cm_dev *cm = dev_id; + + if (cm && rio_mport_is_running(cm->mport)) + rio_txcq_handler(cm, slot); +} + +static int riocm_queue_req(struct cm_dev *cm, struct rio_dev *rdev, + void *buffer, size_t len) +{ + unsigned long flags; + struct tx_req *treq; + + treq = kzalloc(sizeof(*treq), GFP_KERNEL); + if (treq == NULL) + return -ENOMEM; + + treq->rdev = rdev; + treq->buffer = buffer; + treq->len = len; + + spin_lock_irqsave(&cm->tx_lock, flags); + list_add_tail(&treq->node, &cm->tx_reqs); + spin_unlock_irqrestore(&cm->tx_lock, flags); + return 0; +} + +/* + * riocm_post_send - helper function that places packet into msg TX queue + * @cm: cm_dev object + * @rdev: target RapidIO device object (required by outbound msg interface) + * @buffer: pointer to a packet buffer to send + * @len: length of data to transfer + * @req: request priority flag + * + * Returns: 0 if success, or error code otherwise. + */ +static int riocm_post_send(struct cm_dev *cm, struct rio_dev *rdev, + void *buffer, size_t len) +{ + int rc; + unsigned long flags; + + spin_lock_irqsave(&cm->tx_lock, flags); + + if (cm->mport == NULL) { + rc = -ENODEV; + goto err_out; + } + + if (cm->tx_cnt == RIOCM_TX_RING_SIZE) { + riocm_debug(TX, "Tx Queue is full"); + rc = -EBUSY; + goto err_out; + } + + cm->tx_buf[cm->tx_slot] = buffer; + rc = rio_add_outb_message(cm->mport, rdev, cmbox, buffer, len); + + riocm_debug(TX, "Add buf@%p destid=%x tx_slot=%d tx_cnt=%d", + buffer, rdev->destid, cm->tx_slot, cm->tx_cnt); + + ++cm->tx_cnt; + ++cm->tx_slot; + cm->tx_slot &= (RIOCM_TX_RING_SIZE - 1); + +err_out: + spin_unlock_irqrestore(&cm->tx_lock, flags); + return rc; +} + +/* + * riocm_ch_send - sends a data packet to a remote device + * @ch_id: local channel ID + * @buf: pointer to a data buffer to send (including CM header) + * @len: length of data to transfer (including CM header) + * + * ATTN: ASSUMES THAT THE HEADER SPACE IS RESERVED PART OF THE DATA PACKET + * + * Returns: 0 if success, or + * -EINVAL if one or more input parameters is/are not valid, + * -ENODEV if cannot find a channel with specified ID, + * -EAGAIN if a channel is not in CONNECTED state, + * + error codes returned by HW send routine. 
+ */ +static int riocm_ch_send(u16 ch_id, void *buf, int len) +{ + struct rio_channel *ch; + struct rio_ch_chan_hdr *hdr; + int ret; + + if (buf == NULL || ch_id == 0 || len == 0 || len > RIO_MAX_MSG_SIZE) + return -EINVAL; + + ch = riocm_get_channel(ch_id); + if (!ch) { + riocm_error("%s(%d) ch_%d not found", current->comm, + task_pid_nr(current), ch_id); + return -ENODEV; + } + + if (!riocm_cmp(ch, RIO_CM_CONNECTED)) { + ret = -EAGAIN; + goto err_out; + } + + /* + * Fill buffer header section with corresponding channel data + */ + hdr = buf; + + hdr->bhdr.src_id = htonl(ch->loc_destid); + hdr->bhdr.dst_id = htonl(ch->rem_destid); + hdr->bhdr.src_mbox = cmbox; + hdr->bhdr.dst_mbox = cmbox; + hdr->bhdr.type = RIO_CM_CHAN; + hdr->ch_op = CM_DATA_MSG; + hdr->dst_ch = htons(ch->rem_channel); + hdr->src_ch = htons(ch->id); + hdr->msg_len = htons((u16)len); + + /* ATTN: the function call below relies on the fact that underlying + * HW-specific add_outb_message() routine copies TX data into its own + * internal transfer buffer (true for all RIONET compatible mport + * drivers). Must be reviewed if mport driver uses the buffer directly. + */ + + ret = riocm_post_send(ch->cmdev, ch->rdev, buf, len); + if (ret) + riocm_debug(TX, "ch %d send_err=%d", ch->id, ret); +err_out: + riocm_put_channel(ch); + return ret; +} + +static int riocm_ch_free_rxbuf(struct rio_channel *ch, void *buf) +{ + int i, ret = -EINVAL; + + spin_lock_bh(&ch->lock); + + for (i = 0; i < RIOCM_RX_RING_SIZE; i++) { + if (ch->rx_ring.inuse[i] == buf) { + ch->rx_ring.inuse[i] = NULL; + ch->rx_ring.inuse_cnt--; + ret = 0; + break; + } + } + + spin_unlock_bh(&ch->lock); + + if (!ret) + kfree(buf); + + return ret; +} + +/* + * riocm_ch_receive - fetch a data packet received for the specified channel + * @ch: local channel ID + * @buf: pointer to a packet buffer + * @timeout: timeout to wait for incoming packet (in jiffies) + * + * Returns: 0 and valid buffer pointer if success, or NULL pointer and one of: + * -EAGAIN if a channel is not in CONNECTED state, + * -ENOMEM if in-use tracking queue is full, + * -ETIME if wait timeout expired, + * -EINTR if wait was interrupted. + */ +static int riocm_ch_receive(struct rio_channel *ch, void **buf, long timeout) +{ + void *rxmsg = NULL; + int i, ret = 0; + long wret; + + if (!riocm_cmp(ch, RIO_CM_CONNECTED)) { + ret = -EAGAIN; + goto out; + } + + if (ch->rx_ring.inuse_cnt == RIOCM_RX_RING_SIZE) { + /* If we do not have entries to track buffers given to upper + * layer, reject request. + */ + ret = -ENOMEM; + goto out; + } + + wret = wait_for_completion_interruptible_timeout(&ch->comp, timeout); + + riocm_debug(WAIT, "wait on %d returned %ld", ch->id, wret); + + if (!wret) + ret = -ETIME; + else if (wret == -ERESTARTSYS) + ret = -EINTR; + else + ret = riocm_cmp(ch, RIO_CM_CONNECTED) ? 
0 : -ECONNRESET; + + if (ret) + goto out; + + spin_lock_bh(&ch->lock); + + rxmsg = ch->rx_ring.buf[ch->rx_ring.tail]; + ch->rx_ring.buf[ch->rx_ring.tail] = NULL; + ch->rx_ring.count--; + ch->rx_ring.tail++; + ch->rx_ring.tail %= RIOCM_RX_RING_SIZE; + ret = -ENOMEM; + + for (i = 0; i < RIOCM_RX_RING_SIZE; i++) { + if (ch->rx_ring.inuse[i] == NULL) { + ch->rx_ring.inuse[i] = rxmsg; + ch->rx_ring.inuse_cnt++; + ret = 0; + break; + } + } + + if (ret) { + /* We have no entry to store pending message: drop it */ + kfree(rxmsg); + rxmsg = NULL; + } + + spin_unlock_bh(&ch->lock); +out: + *buf = rxmsg; + return ret; +} + +/* + * riocm_ch_connect - sends a connect request to a remote device + * @loc_ch: local channel ID + * @cm: CM device to send connect request + * @peer: target RapidIO device + * @rem_ch: remote channel ID + * + * Returns: 0 if success, or + * -EINVAL if the channel is not in IDLE state, + * -EAGAIN if no connection request available immediately, + * -ETIME if ACK response timeout expired, + * -EINTR if wait for response was interrupted. + */ +static int riocm_ch_connect(u16 loc_ch, struct cm_dev *cm, + struct cm_peer *peer, u16 rem_ch) +{ + struct rio_channel *ch = NULL; + struct rio_ch_chan_hdr *hdr; + int ret; + long wret; + + ch = riocm_get_channel(loc_ch); + if (!ch) + return -ENODEV; + + if (!riocm_cmp_exch(ch, RIO_CM_IDLE, RIO_CM_CONNECT)) { + ret = -EINVAL; + goto conn_done; + } + + ch->cmdev = cm; + ch->rdev = peer->rdev; + ch->context = NULL; + ch->loc_destid = cm->mport->host_deviceid; + ch->rem_channel = rem_ch; + + /* + * Send connect request to the remote RapidIO device + */ + + hdr = kzalloc(sizeof(*hdr), GFP_KERNEL); + if (hdr == NULL) { + ret = -ENOMEM; + goto conn_done; + } + + hdr->bhdr.src_id = htonl(ch->loc_destid); + hdr->bhdr.dst_id = htonl(peer->rdev->destid); + hdr->bhdr.src_mbox = cmbox; + hdr->bhdr.dst_mbox = cmbox; + hdr->bhdr.type = RIO_CM_CHAN; + hdr->ch_op = CM_CONN_REQ; + hdr->dst_ch = htons(rem_ch); + hdr->src_ch = htons(loc_ch); + + /* ATTN: the function call below relies on the fact that underlying + * HW-specific add_outb_message() routine copies TX data into its + * internal transfer buffer. Must be reviewed if mport driver uses + * this buffer directly. + */ + ret = riocm_post_send(cm, peer->rdev, hdr, sizeof(*hdr)); + + if (ret != -EBUSY) { + kfree(hdr); + } else { + ret = riocm_queue_req(cm, peer->rdev, hdr, sizeof(*hdr)); + if (ret) + kfree(hdr); + } + + if (ret) { + riocm_cmp_exch(ch, RIO_CM_CONNECT, RIO_CM_IDLE); + goto conn_done; + } + + /* Wait for connect response from the remote device */ + wret = wait_for_completion_interruptible_timeout(&ch->comp, + RIOCM_CONNECT_TO * HZ); + riocm_debug(WAIT, "wait on %d returns %ld", ch->id, wret); + + if (!wret) + ret = -ETIME; + else if (wret == -ERESTARTSYS) + ret = -EINTR; + else + ret = riocm_cmp(ch, RIO_CM_CONNECTED) ? 0 : -1; + +conn_done: + riocm_put_channel(ch); + return ret; +} + +static int riocm_send_ack(struct rio_channel *ch) +{ + struct rio_ch_chan_hdr *hdr; + int ret; + + hdr = kzalloc(sizeof(*hdr), GFP_KERNEL); + if (hdr == NULL) + return -ENOMEM; + + hdr->bhdr.src_id = htonl(ch->loc_destid); + hdr->bhdr.dst_id = htonl(ch->rem_destid); + hdr->dst_ch = htons(ch->rem_channel); + hdr->src_ch = htons(ch->id); + hdr->bhdr.src_mbox = cmbox; + hdr->bhdr.dst_mbox = cmbox; + hdr->bhdr.type = RIO_CM_CHAN; + hdr->ch_op = CM_CONN_ACK; + + /* ATTN: the function call below relies on the fact that underlying + * add_outb_message() routine copies TX data into its internal transfer + * buffer. 
Review if switching to direct buffer version. + */ + ret = riocm_post_send(ch->cmdev, ch->rdev, hdr, sizeof(*hdr)); + + if (ret == -EBUSY && !riocm_queue_req(ch->cmdev, + ch->rdev, hdr, sizeof(*hdr))) + return 0; + kfree(hdr); + + if (ret) + riocm_error("send ACK to ch_%d on %s failed (ret=%d)", + ch->id, rio_name(ch->rdev), ret); + return ret; +} + +/* + * riocm_ch_accept - accept incoming connection request + * @ch_id: channel ID + * @new_ch_id: local mport device + * @timeout: wait timeout (if 0 non-blocking call, do not wait if connection + * request is not available). + * + * Returns: pointer to new channel struct if success, or error-valued pointer: + * -ENODEV - cannot find specified channel or mport, + * -EINVAL - the channel is not in IDLE state, + * -EAGAIN - no connection request available immediately (timeout=0), + * -ENOMEM - unable to allocate new channel, + * -ETIME - wait timeout expired, + * -EINTR - wait was interrupted. + */ +static struct rio_channel *riocm_ch_accept(u16 ch_id, u16 *new_ch_id, + long timeout) +{ + struct rio_channel *ch = NULL; + struct rio_channel *new_ch = NULL; + struct conn_req *req; + struct cm_peer *peer; + int found = 0; + int err = 0; + long wret; + + ch = riocm_get_channel(ch_id); + if (!ch) + return ERR_PTR(-EINVAL); + + if (!riocm_cmp(ch, RIO_CM_LISTEN)) { + err = -EINVAL; + goto err_put; + } + + /* Don't sleep if this is a non blocking call */ + if (!timeout) { + if (!try_wait_for_completion(&ch->comp)) { + err = -EAGAIN; + goto err_put; + } + } else { + riocm_debug(WAIT, "on %d", ch->id); + + wret = wait_for_completion_interruptible_timeout(&ch->comp, + timeout); + if (!wret) { + err = -ETIME; + goto err_put; + } else if (wret == -ERESTARTSYS) { + err = -EINTR; + goto err_put; + } + } + + spin_lock_bh(&ch->lock); + + if (ch->state != RIO_CM_LISTEN) { + err = -ECANCELED; + } else if (list_empty(&ch->accept_queue)) { + riocm_debug(WAIT, "on %d accept_queue is empty on completion", + ch->id); + err = -EIO; + } + + spin_unlock_bh(&ch->lock); + + if (err) { + riocm_debug(WAIT, "on %d returns %d", ch->id, err); + goto err_put; + } + + /* Create new channel for this connection */ + new_ch = riocm_ch_alloc(RIOCM_CHNUM_AUTO); + + if (IS_ERR(new_ch)) { + riocm_error("failed to get channel for new req (%ld)", + PTR_ERR(new_ch)); + err = -ENOMEM; + goto err_put; + } + + spin_lock_bh(&ch->lock); + + req = list_first_entry(&ch->accept_queue, struct conn_req, node); + list_del(&req->node); + new_ch->cmdev = ch->cmdev; + new_ch->loc_destid = ch->loc_destid; + new_ch->rem_destid = req->destid; + new_ch->rem_channel = req->chan; + + spin_unlock_bh(&ch->lock); + riocm_put_channel(ch); + kfree(req); + + down_read(&rdev_sem); + /* Find requester's device object */ + list_for_each_entry(peer, &new_ch->cmdev->peers, node) { + if (peer->rdev->destid == new_ch->rem_destid) { + riocm_debug(RX_CMD, "found matching device(%s)", + rio_name(peer->rdev)); + found = 1; + break; + } + } + up_read(&rdev_sem); + + if (!found) { + /* If peer device object not found, simply ignore the request */ + err = -ENODEV; + goto err_nodev; + } + + new_ch->rdev = peer->rdev; + new_ch->state = RIO_CM_CONNECTED; + spin_lock_init(&new_ch->lock); + + /* Acknowledge the connection request. 
*/ + riocm_send_ack(new_ch); + + *new_ch_id = new_ch->id; + return new_ch; +err_put: + riocm_put_channel(ch); +err_nodev: + if (new_ch) { + spin_lock_bh(&idr_lock); + idr_remove(&ch_idr, new_ch->id); + spin_unlock_bh(&idr_lock); + riocm_put_channel(new_ch); + } + *new_ch_id = 0; + return ERR_PTR(err); +} + +/* + * riocm_ch_listen - puts a channel into LISTEN state + * @ch_id: channel ID + * + * Returns: 0 if success, or + * -EINVAL if the specified channel does not exists or + * is not in CHAN_BOUND state. + */ +static int riocm_ch_listen(u16 ch_id) +{ + struct rio_channel *ch = NULL; + int ret = 0; + + riocm_debug(CHOP, "(ch_%d)", ch_id); + + ch = riocm_get_channel(ch_id); + if (!ch || !riocm_cmp_exch(ch, RIO_CM_CHAN_BOUND, RIO_CM_LISTEN)) + ret = -EINVAL; + riocm_put_channel(ch); + return ret; +} + +/* + * riocm_ch_bind - associate a channel object and an mport device + * @ch_id: channel ID + * @mport_id: local mport device ID + * @context: pointer to the additional caller's context + * + * Returns: 0 if success, or + * -ENODEV if cannot find specified mport, + * -EINVAL if the specified channel does not exist or + * is not in IDLE state. + */ +static int riocm_ch_bind(u16 ch_id, u8 mport_id, void *context) +{ + struct rio_channel *ch = NULL; + struct cm_dev *cm; + int rc = -ENODEV; + + riocm_debug(CHOP, "ch_%d to mport_%d", ch_id, mport_id); + + /* Find matching cm_dev object */ + down_read(&rdev_sem); + list_for_each_entry(cm, &cm_dev_list, list) { + if ((cm->mport->id == mport_id) && + rio_mport_is_running(cm->mport)) { + rc = 0; + break; + } + } + + if (rc) + goto exit; + + ch = riocm_get_channel(ch_id); + if (!ch) { + rc = -EINVAL; + goto exit; + } + + spin_lock_bh(&ch->lock); + if (ch->state != RIO_CM_IDLE) { + spin_unlock_bh(&ch->lock); + rc = -EINVAL; + goto err_put; + } + + ch->cmdev = cm; + ch->loc_destid = cm->mport->host_deviceid; + ch->context = context; + ch->state = RIO_CM_CHAN_BOUND; + spin_unlock_bh(&ch->lock); +err_put: + riocm_put_channel(ch); +exit: + up_read(&rdev_sem); + return rc; +} + +/* + * riocm_ch_alloc - channel object allocation helper routine + * @ch_num: channel ID (1 ... RIOCM_MAX_CHNUM, 0 = automatic) + * + * Return value: pointer to newly created channel object, + * or error-valued pointer + */ +static struct rio_channel *riocm_ch_alloc(u16 ch_num) +{ + int id; + int start, end; + struct rio_channel *ch; + + ch = kzalloc(sizeof(*ch), GFP_KERNEL); + if (!ch) + return ERR_PTR(-ENOMEM); + + if (ch_num) { + /* If requested, try to obtain the specified channel ID */ + start = ch_num; + end = ch_num + 1; + } else { + /* Obtain channel ID from the dynamic allocation range */ + start = chstart; + end = RIOCM_MAX_CHNUM + 1; + } + + idr_preload(GFP_KERNEL); + spin_lock_bh(&idr_lock); + id = idr_alloc_cyclic(&ch_idr, ch, start, end, GFP_NOWAIT); + spin_unlock_bh(&idr_lock); + idr_preload_end(); + + if (id < 0) { + kfree(ch); + return ERR_PTR(id == -ENOSPC ? -EBUSY : id); + } + + ch->id = (u16)id; + ch->state = RIO_CM_IDLE; + spin_lock_init(&ch->lock); + INIT_LIST_HEAD(&ch->accept_queue); + INIT_LIST_HEAD(&ch->ch_node); + init_completion(&ch->comp); + init_completion(&ch->comp_close); + kref_init(&ch->ref); + ch->rx_ring.head = 0; + ch->rx_ring.tail = 0; + ch->rx_ring.count = 0; + ch->rx_ring.inuse_cnt = 0; + + return ch; +} + +/* + * riocm_ch_create - creates a new channel object and allocates ID for it + * @ch_num: channel ID (1 ... RIOCM_MAX_CHNUM, 0 = automatic) + * + * Allocates and initializes a new channel object. 
If the parameter ch_num > 0 + * and is within the valid range, riocm_ch_create tries to allocate the + * specified ID for the new channel. If ch_num = 0, channel ID will be assigned + * automatically from the range (chstart ... RIOCM_MAX_CHNUM). + * Module parameter 'chstart' defines start of an ID range available for dynamic + * allocation. Range below 'chstart' is reserved for pre-defined ID numbers. + * Available channel numbers are limited by 16-bit size of channel numbers used + * in the packet header. + * + * Return value: PTR to rio_channel structure if successful (with channel number + * updated via pointer) or error-valued pointer if error. + */ +static struct rio_channel *riocm_ch_create(u16 *ch_num) +{ + struct rio_channel *ch = NULL; + + ch = riocm_ch_alloc(*ch_num); + + if (IS_ERR(ch)) + riocm_debug(CHOP, "Failed to allocate channel %d (err=%ld)", + *ch_num, PTR_ERR(ch)); + else + *ch_num = ch->id; + + return ch; +} + +/* + * riocm_ch_free - channel object release routine + * @ref: pointer to a channel's kref structure + */ +static void riocm_ch_free(struct kref *ref) +{ + struct rio_channel *ch = container_of(ref, struct rio_channel, ref); + int i; + + riocm_debug(CHOP, "(ch_%d)", ch->id); + + if (ch->rx_ring.inuse_cnt) { + for (i = 0; + i < RIOCM_RX_RING_SIZE && ch->rx_ring.inuse_cnt; i++) { + if (ch->rx_ring.inuse[i] != NULL) { + kfree(ch->rx_ring.inuse[i]); + ch->rx_ring.inuse_cnt--; + } + } + } + + if (ch->rx_ring.count) + for (i = 0; i < RIOCM_RX_RING_SIZE && ch->rx_ring.count; i++) { + if (ch->rx_ring.buf[i] != NULL) { + kfree(ch->rx_ring.buf[i]); + ch->rx_ring.count--; + } + } + + complete(&ch->comp_close); +} + +static int riocm_send_close(struct rio_channel *ch) +{ + struct rio_ch_chan_hdr *hdr; + int ret; + + /* + * Send CH_CLOSE notification to the remote RapidIO device + */ + + hdr = kzalloc(sizeof(*hdr), GFP_KERNEL); + if (hdr == NULL) + return -ENOMEM; + + hdr->bhdr.src_id = htonl(ch->loc_destid); + hdr->bhdr.dst_id = htonl(ch->rem_destid); + hdr->bhdr.src_mbox = cmbox; + hdr->bhdr.dst_mbox = cmbox; + hdr->bhdr.type = RIO_CM_CHAN; + hdr->ch_op = CM_CONN_CLOSE; + hdr->dst_ch = htons(ch->rem_channel); + hdr->src_ch = htons(ch->id); + + /* ATTN: the function call below relies on the fact that underlying + * add_outb_message() routine copies TX data into its internal transfer + * buffer. Needs to be reviewed if switched to direct buffer mode. 
+ */ + ret = riocm_post_send(ch->cmdev, ch->rdev, hdr, sizeof(*hdr)); + + if (ret == -EBUSY && !riocm_queue_req(ch->cmdev, ch->rdev, + hdr, sizeof(*hdr))) + return 0; + kfree(hdr); + + if (ret) + riocm_error("ch(%d) send CLOSE failed (ret=%d)", ch->id, ret); + + return ret; +} + +/* + * riocm_ch_close - closes a channel object with specified ID (by local request) + * @ch: channel to be closed + */ +static int riocm_ch_close(struct rio_channel *ch) +{ + unsigned long tmo = msecs_to_jiffies(3000); + enum rio_cm_state state; + long wret; + int ret = 0; + + riocm_debug(CHOP, "ch_%d by %s(%d)", + ch->id, current->comm, task_pid_nr(current)); + + state = riocm_exch(ch, RIO_CM_DESTROYING); + if (state == RIO_CM_CONNECTED) + riocm_send_close(ch); + + complete_all(&ch->comp); + + riocm_put_channel(ch); + wret = wait_for_completion_interruptible_timeout(&ch->comp_close, tmo); + + riocm_debug(WAIT, "wait on %d returns %ld", ch->id, wret); + + if (wret == 0) { + /* Timeout on wait occurred */ + riocm_debug(CHOP, "%s(%d) timed out waiting for ch %d", + current->comm, task_pid_nr(current), ch->id); + ret = -ETIMEDOUT; + } else if (wret == -ERESTARTSYS) { + /* Wait_for_completion was interrupted by a signal */ + riocm_debug(CHOP, "%s(%d) wait for ch %d was interrupted", + current->comm, task_pid_nr(current), ch->id); + ret = -EINTR; + } + + if (!ret) { + riocm_debug(CHOP, "ch_%d resources released", ch->id); + kfree(ch); + } else { + riocm_debug(CHOP, "failed to release ch_%d resources", ch->id); + } + + return ret; +} + +/* + * riocm_cdev_open() - Open character device + */ +static int riocm_cdev_open(struct inode *inode, struct file *filp) +{ + riocm_debug(INIT, "by %s(%d) filp=%p ", + current->comm, task_pid_nr(current), filp); + + if (list_empty(&cm_dev_list)) + return -ENODEV; + + return 0; +} + +/* + * riocm_cdev_release() - Release character device + */ +static int riocm_cdev_release(struct inode *inode, struct file *filp) +{ + struct rio_channel *ch, *_c; + unsigned int i; + LIST_HEAD(list); + + riocm_debug(EXIT, "by %s(%d) filp=%p", + current->comm, task_pid_nr(current), filp); + + /* Check if there are channels associated with this file descriptor */ + spin_lock_bh(&idr_lock); + idr_for_each_entry(&ch_idr, ch, i) { + if (ch && ch->filp == filp) { + riocm_debug(EXIT, "ch_%d not released by %s(%d)", + ch->id, current->comm, + task_pid_nr(current)); + idr_remove(&ch_idr, ch->id); + list_add(&ch->ch_node, &list); + } + } + spin_unlock_bh(&idr_lock); + + if (!list_empty(&list)) { + list_for_each_entry_safe(ch, _c, &list, ch_node) { + list_del(&ch->ch_node); + riocm_ch_close(ch); + } + } + + return 0; +} + +/* + * cm_ep_get_list_size() - Reports number of endpoints in the network + */ +static int cm_ep_get_list_size(void __user *arg) +{ + u32 __user *p = arg; + u32 mport_id; + u32 count = 0; + struct cm_dev *cm; + + if (get_user(mport_id, p)) + return -EFAULT; + if (mport_id >= RIO_MAX_MPORTS) + return -EINVAL; + + /* Find a matching cm_dev object */ + down_read(&rdev_sem); + list_for_each_entry(cm, &cm_dev_list, list) { + if (cm->mport->id == mport_id) { + count = cm->npeers; + up_read(&rdev_sem); + if (copy_to_user(arg, &count, sizeof(u32))) + return -EFAULT; + return 0; + } + } + up_read(&rdev_sem); + + return -ENODEV; +} + +/* + * cm_ep_get_list() - Returns list of attached endpoints + */ +static int cm_ep_get_list(void __user *arg) +{ + struct cm_dev *cm; + struct cm_peer *peer; + u32 info[2]; + void *buf; + u32 nent; + u32 *entry_ptr; + u32 i = 0; + int ret = 0; + + if (copy_from_user(&info, 
arg, sizeof(info))) + return -EFAULT; + + if (info[1] >= RIO_MAX_MPORTS || info[0] > RIOCM_MAX_EP_COUNT) + return -EINVAL; + + /* Find a matching cm_dev object */ + down_read(&rdev_sem); + list_for_each_entry(cm, &cm_dev_list, list) + if (cm->mport->id == (u8)info[1]) + goto found; + + up_read(&rdev_sem); + return -ENODEV; + +found: + nent = min(info[0], cm->npeers); + buf = kcalloc(nent + 2, sizeof(u32), GFP_KERNEL); + if (!buf) { + up_read(&rdev_sem); + return -ENOMEM; + } + + entry_ptr = (u32 *)((uintptr_t)buf + 2*sizeof(u32)); + + list_for_each_entry(peer, &cm->peers, node) { + *entry_ptr = (u32)peer->rdev->destid; + entry_ptr++; + if (++i == nent) + break; + } + up_read(&rdev_sem); + + ((u32 *)buf)[0] = i; /* report an updated number of entries */ + ((u32 *)buf)[1] = info[1]; /* put back an mport ID */ + if (copy_to_user(arg, buf, sizeof(u32) * (info[0] + 2))) + ret = -EFAULT; + + kfree(buf); + return ret; +} + +/* + * cm_mport_get_list() - Returns list of available local mport devices + */ +static int cm_mport_get_list(void __user *arg) +{ + int ret = 0; + u32 entries; + void *buf; + struct cm_dev *cm; + u32 *entry_ptr; + int count = 0; + + if (copy_from_user(&entries, arg, sizeof(entries))) + return -EFAULT; + if (entries == 0 || entries > RIO_MAX_MPORTS) + return -EINVAL; + buf = kcalloc(entries + 1, sizeof(u32), GFP_KERNEL); + if (!buf) + return -ENOMEM; + + /* Scan all registered cm_dev objects */ + entry_ptr = (u32 *)((uintptr_t)buf + sizeof(u32)); + down_read(&rdev_sem); + list_for_each_entry(cm, &cm_dev_list, list) { + if (count++ < entries) { + *entry_ptr = (cm->mport->id << 16) | + cm->mport->host_deviceid; + entry_ptr++; + } + } + up_read(&rdev_sem); + + *((u32 *)buf) = count; /* report a real number of entries */ + if (copy_to_user(arg, buf, sizeof(u32) * (count + 1))) + ret = -EFAULT; + + kfree(buf); + return ret; +} + +/* + * cm_chan_create() - Create a message exchange channel + */ +static int cm_chan_create(struct file *filp, void __user *arg) +{ + u16 __user *p = arg; + u16 ch_num; + struct rio_channel *ch; + + if (get_user(ch_num, p)) + return -EFAULT; + + riocm_debug(CHOP, "ch_%d requested by %s(%d)", + ch_num, current->comm, task_pid_nr(current)); + ch = riocm_ch_create(&ch_num); + if (IS_ERR(ch)) + return PTR_ERR(ch); + + ch->filp = filp; + riocm_debug(CHOP, "ch_%d created by %s(%d)", + ch_num, current->comm, task_pid_nr(current)); + return put_user(ch_num, p); +} + +/* + * cm_chan_close() - Close channel + * @filp: Pointer to file object + * @arg: Channel to close + */ +static int cm_chan_close(struct file *filp, void __user *arg) +{ + u16 __user *p = arg; + u16 ch_num; + struct rio_channel *ch; + + if (get_user(ch_num, p)) + return -EFAULT; + + riocm_debug(CHOP, "ch_%d by %s(%d)", + ch_num, current->comm, task_pid_nr(current)); + + spin_lock_bh(&idr_lock); + ch = idr_find(&ch_idr, ch_num); + if (!ch) { + spin_unlock_bh(&idr_lock); + return 0; + } + if (ch->filp != filp) { + spin_unlock_bh(&idr_lock); + return -EINVAL; + } + idr_remove(&ch_idr, ch->id); + spin_unlock_bh(&idr_lock); + + return riocm_ch_close(ch); +} + +/* + * cm_chan_bind() - Bind channel + * @arg: Channel number + */ +static int cm_chan_bind(void __user *arg) +{ + struct rio_cm_channel chan; + + if (copy_from_user(&chan, arg, sizeof(chan))) + return -EFAULT; + if (chan.mport_id >= RIO_MAX_MPORTS) + return -EINVAL; + + return riocm_ch_bind(chan.id, chan.mport_id, NULL); +} + +/* + * cm_chan_listen() - Listen on channel + * @arg: Channel number + */ +static int cm_chan_listen(void __user *arg) +{ + 
u16 __user *p = arg; + u16 ch_num; + + if (get_user(ch_num, p)) + return -EFAULT; + + return riocm_ch_listen(ch_num); +} + +/* + * cm_chan_accept() - Accept incoming connection + * @filp: Pointer to file object + * @arg: Channel number + */ +static int cm_chan_accept(struct file *filp, void __user *arg) +{ + struct rio_cm_accept param; + long accept_to; + struct rio_channel *ch; + + if (copy_from_user(¶m, arg, sizeof(param))) + return -EFAULT; + + riocm_debug(CHOP, "on ch_%d by %s(%d)", + param.ch_num, current->comm, task_pid_nr(current)); + + accept_to = param.wait_to ? + msecs_to_jiffies(param.wait_to) : 0; + + ch = riocm_ch_accept(param.ch_num, ¶m.ch_num, accept_to); + if (IS_ERR(ch)) + return PTR_ERR(ch); + ch->filp = filp; + + riocm_debug(CHOP, "new ch_%d for %s(%d)", + ch->id, current->comm, task_pid_nr(current)); + + if (copy_to_user(arg, ¶m, sizeof(param))) + return -EFAULT; + return 0; +} + +/* + * cm_chan_connect() - Connect on channel + * @arg: Channel information + */ +static int cm_chan_connect(void __user *arg) +{ + struct rio_cm_channel chan; + struct cm_dev *cm; + struct cm_peer *peer; + int ret = -ENODEV; + + if (copy_from_user(&chan, arg, sizeof(chan))) + return -EFAULT; + if (chan.mport_id >= RIO_MAX_MPORTS) + return -EINVAL; + + down_read(&rdev_sem); + + /* Find matching cm_dev object */ + list_for_each_entry(cm, &cm_dev_list, list) { + if (cm->mport->id == chan.mport_id) { + ret = 0; + break; + } + } + + if (ret) + goto err_out; + + if (chan.remote_destid >= RIO_ANY_DESTID(cm->mport->sys_size)) { + ret = -EINVAL; + goto err_out; + } + + /* Find corresponding RapidIO endpoint device object */ + ret = -ENODEV; + + list_for_each_entry(peer, &cm->peers, node) { + if (peer->rdev->destid == chan.remote_destid) { + ret = 0; + break; + } + } + + if (ret) + goto err_out; + + up_read(&rdev_sem); + + return riocm_ch_connect(chan.id, cm, peer, chan.remote_channel); +err_out: + up_read(&rdev_sem); + return ret; +} + +/* + * cm_chan_msg_send() - Send a message through channel + * @arg: Outbound message information + */ +static int cm_chan_msg_send(void __user *arg) +{ + struct rio_cm_msg msg; + void *buf; + int ret = 0; + + if (copy_from_user(&msg, arg, sizeof(msg))) + return -EFAULT; + if (msg.size > RIO_MAX_MSG_SIZE) + return -EINVAL; + + buf = kmalloc(msg.size, GFP_KERNEL); + if (!buf) + return -ENOMEM; + + if (copy_from_user(buf, (void __user *)(uintptr_t)msg.msg, msg.size)) { + ret = -EFAULT; + goto out; + } + + ret = riocm_ch_send(msg.ch_num, buf, msg.size); +out: + kfree(buf); + return ret; +} + +/* + * cm_chan_msg_rcv() - Receive a message through channel + * @arg: Inbound message information + */ +static int cm_chan_msg_rcv(void __user *arg) +{ + struct rio_cm_msg msg; + struct rio_channel *ch; + void *buf; + long rxto; + int ret = 0, msg_size; + + if (copy_from_user(&msg, arg, sizeof(msg))) + return -EFAULT; + + if (msg.ch_num == 0 || msg.size == 0) + return -EINVAL; + + ch = riocm_get_channel(msg.ch_num); + if (!ch) + return -ENODEV; + + rxto = msg.rxto ? 
msecs_to_jiffies(msg.rxto) : MAX_SCHEDULE_TIMEOUT; + + ret = riocm_ch_receive(ch, &buf, rxto); + if (ret) + goto out; + + msg_size = min(msg.size, (u16)(RIO_MAX_MSG_SIZE)); + + if (copy_to_user((void __user *)(uintptr_t)msg.msg, buf, msg_size)) + ret = -EFAULT; + + riocm_ch_free_rxbuf(ch, buf); +out: + riocm_put_channel(ch); + return ret; +} + +/* + * riocm_cdev_ioctl() - IOCTL requests handler + */ +static long +riocm_cdev_ioctl(struct file *filp, unsigned int cmd, unsigned long arg) +{ + switch (cmd) { + case RIO_CM_EP_GET_LIST_SIZE: + return cm_ep_get_list_size((void __user *)arg); + case RIO_CM_EP_GET_LIST: + return cm_ep_get_list((void __user *)arg); + case RIO_CM_CHAN_CREATE: + return cm_chan_create(filp, (void __user *)arg); + case RIO_CM_CHAN_CLOSE: + return cm_chan_close(filp, (void __user *)arg); + case RIO_CM_CHAN_BIND: + return cm_chan_bind((void __user *)arg); + case RIO_CM_CHAN_LISTEN: + return cm_chan_listen((void __user *)arg); + case RIO_CM_CHAN_ACCEPT: + return cm_chan_accept(filp, (void __user *)arg); + case RIO_CM_CHAN_CONNECT: + return cm_chan_connect((void __user *)arg); + case RIO_CM_CHAN_SEND: + return cm_chan_msg_send((void __user *)arg); + case RIO_CM_CHAN_RECEIVE: + return cm_chan_msg_rcv((void __user *)arg); + case RIO_CM_MPORT_GET_LIST: + return cm_mport_get_list((void __user *)arg); + default: + break; + } + + return -EINVAL; +} + +static const struct file_operations riocm_cdev_fops = { + .owner = THIS_MODULE, + .open = riocm_cdev_open, + .release = riocm_cdev_release, + .unlocked_ioctl = riocm_cdev_ioctl, +}; + +/* + * riocm_add_dev - add new remote RapidIO device into channel management core + * @dev: device object associated with RapidIO device + * @sif: subsystem interface + * + * Adds the specified RapidIO device (if applicable) into peers list of + * the corresponding channel management device (cm_dev). + */ +static int riocm_add_dev(struct device *dev, struct subsys_interface *sif) +{ + struct cm_peer *peer; + struct rio_dev *rdev = to_rio_dev(dev); + struct cm_dev *cm; + + /* Check if the remote device has capabilities required to support CM */ + if (!dev_cm_capable(rdev)) + return 0; + + riocm_debug(RDEV, "(%s)", rio_name(rdev)); + + peer = kmalloc(sizeof(*peer), GFP_KERNEL); + if (!peer) + return -ENOMEM; + + /* Find a corresponding cm_dev object */ + down_write(&rdev_sem); + list_for_each_entry(cm, &cm_dev_list, list) { + if (cm->mport == rdev->net->hport) + goto found; + } + + up_write(&rdev_sem); + kfree(peer); + return -ENODEV; + +found: + peer->rdev = rdev; + list_add_tail(&peer->node, &cm->peers); + cm->npeers++; + + up_write(&rdev_sem); + return 0; +} + +/* + * riocm_remove_dev - remove remote RapidIO device from channel management core + * @dev: device object associated with RapidIO device + * @sif: subsystem interface + * + * Removes the specified RapidIO device (if applicable) from peers list of + * the corresponding channel management device (cm_dev). 
+ */ +static void riocm_remove_dev(struct device *dev, struct subsys_interface *sif) +{ + struct rio_dev *rdev = to_rio_dev(dev); + struct cm_dev *cm; + struct cm_peer *peer; + struct rio_channel *ch, *_c; + unsigned int i; + bool found = false; + LIST_HEAD(list); + + /* Check if the remote device has capabilities required to support CM */ + if (!dev_cm_capable(rdev)) + return; + + riocm_debug(RDEV, "(%s)", rio_name(rdev)); + + /* Find matching cm_dev object */ + down_write(&rdev_sem); + list_for_each_entry(cm, &cm_dev_list, list) { + if (cm->mport == rdev->net->hport) { + found = true; + break; + } + } + + if (!found) { + up_write(&rdev_sem); + return; + } + + /* Remove remote device from the list of peers */ + found = false; + list_for_each_entry(peer, &cm->peers, node) { + if (peer->rdev == rdev) { + riocm_debug(RDEV, "removing peer %s", rio_name(rdev)); + found = true; + list_del(&peer->node); + cm->npeers--; + kfree(peer); + break; + } + } + + up_write(&rdev_sem); + + if (!found) + return; + + /* + * Release channels associated with this peer + */ + + spin_lock_bh(&idr_lock); + idr_for_each_entry(&ch_idr, ch, i) { + if (ch && ch->rdev == rdev) { + if (atomic_read(&rdev->state) != RIO_DEVICE_SHUTDOWN) + riocm_exch(ch, RIO_CM_DISCONNECT); + idr_remove(&ch_idr, ch->id); + list_add(&ch->ch_node, &list); + } + } + spin_unlock_bh(&idr_lock); + + if (!list_empty(&list)) { + list_for_each_entry_safe(ch, _c, &list, ch_node) { + list_del(&ch->ch_node); + riocm_ch_close(ch); + } + } +} + +/* + * riocm_cdev_add() - Create rio_cm char device + * @devno: device number assigned to device (MAJ + MIN) + */ +static int riocm_cdev_add(dev_t devno) +{ + int ret; + + cdev_init(&riocm_cdev.cdev, &riocm_cdev_fops); + riocm_cdev.cdev.owner = THIS_MODULE; + ret = cdev_add(&riocm_cdev.cdev, devno, 1); + if (ret < 0) { + riocm_error("Cannot register a device with error %d", ret); + return ret; + } + + riocm_cdev.dev = device_create(dev_class, NULL, devno, NULL, DEV_NAME); + if (IS_ERR(riocm_cdev.dev)) { + cdev_del(&riocm_cdev.cdev); + return PTR_ERR(riocm_cdev.dev); + } + + riocm_debug(MPORT, "Added %s cdev(%d:%d)", + DEV_NAME, MAJOR(devno), MINOR(devno)); + + return 0; +} + +/* + * riocm_add_mport - add new local mport device into channel management core + * @dev: device object associated with mport + * @class_intf: class interface + * + * When a new mport device is added, CM immediately reserves inbound and + * outbound RapidIO mailboxes that will be used. 
+ */ +static int riocm_add_mport(struct device *dev, + struct class_interface *class_intf) +{ + int rc; + int i; + struct cm_dev *cm; + struct rio_mport *mport = to_rio_mport(dev); + + riocm_debug(MPORT, "add mport %s", mport->name); + + cm = kzalloc(sizeof(*cm), GFP_KERNEL); + if (!cm) + return -ENOMEM; + + cm->mport = mport; + + rc = rio_request_outb_mbox(mport, cm, cmbox, + RIOCM_TX_RING_SIZE, riocm_outb_msg_event); + if (rc) { + riocm_error("failed to allocate OBMBOX_%d on %s", + cmbox, mport->name); + kfree(cm); + return -ENODEV; + } + + rc = rio_request_inb_mbox(mport, cm, cmbox, + RIOCM_RX_RING_SIZE, riocm_inb_msg_event); + if (rc) { + riocm_error("failed to allocate IBMBOX_%d on %s", + cmbox, mport->name); + rio_release_outb_mbox(mport, cmbox); + kfree(cm); + return -ENODEV; + } + + /* + * Allocate and register inbound messaging buffers to be ready + * to receive channel and system management requests + */ + for (i = 0; i < RIOCM_RX_RING_SIZE; i++) + cm->rx_buf[i] = NULL; + + cm->rx_slots = RIOCM_RX_RING_SIZE; + mutex_init(&cm->rx_lock); + riocm_rx_fill(cm, RIOCM_RX_RING_SIZE); + cm->rx_wq = create_workqueue(DRV_NAME "/rxq"); + INIT_WORK(&cm->rx_work, rio_ibmsg_handler); + + cm->tx_slot = 0; + cm->tx_cnt = 0; + cm->tx_ack_slot = 0; + spin_lock_init(&cm->tx_lock); + + INIT_LIST_HEAD(&cm->peers); + cm->npeers = 0; + INIT_LIST_HEAD(&cm->tx_reqs); + + down_write(&rdev_sem); + list_add_tail(&cm->list, &cm_dev_list); + up_write(&rdev_sem); + + return 0; +} + +/* + * riocm_remove_mport - remove local mport device from channel management core + * @dev: device object associated with mport + * @class_intf: class interface + * + * Removes a local mport device from the list of registered devices that provide + * channel management services. Returns an error if the specified mport is not + * registered with the CM core. 
+ */ +static void riocm_remove_mport(struct device *dev, + struct class_interface *class_intf) +{ + struct rio_mport *mport = to_rio_mport(dev); + struct cm_dev *cm; + struct cm_peer *peer, *temp; + struct rio_channel *ch, *_c; + unsigned int i; + bool found = false; + LIST_HEAD(list); + + riocm_debug(MPORT, "%s", mport->name); + + /* Find a matching cm_dev object */ + down_write(&rdev_sem); + list_for_each_entry(cm, &cm_dev_list, list) { + if (cm->mport == mport) { + list_del(&cm->list); + found = true; + break; + } + } + up_write(&rdev_sem); + if (!found) + return; + + flush_workqueue(cm->rx_wq); + destroy_workqueue(cm->rx_wq); + + /* Release channels bound to this mport */ + spin_lock_bh(&idr_lock); + idr_for_each_entry(&ch_idr, ch, i) { + if (ch->cmdev == cm) { + riocm_debug(RDEV, "%s drop ch_%d", + mport->name, ch->id); + idr_remove(&ch_idr, ch->id); + list_add(&ch->ch_node, &list); + } + } + spin_unlock_bh(&idr_lock); + + if (!list_empty(&list)) { + list_for_each_entry_safe(ch, _c, &list, ch_node) { + list_del(&ch->ch_node); + riocm_ch_close(ch); + } + } + + rio_release_inb_mbox(mport, cmbox); + rio_release_outb_mbox(mport, cmbox); + + /* Remove and free peer entries */ + if (!list_empty(&cm->peers)) + riocm_debug(RDEV, "ATTN: peer list not empty"); + list_for_each_entry_safe(peer, temp, &cm->peers, node) { + riocm_debug(RDEV, "removing peer %s", rio_name(peer->rdev)); + list_del(&peer->node); + kfree(peer); + } + + riocm_rx_free(cm); + kfree(cm); + riocm_debug(MPORT, "%s done", mport->name); +} + +static int rio_cm_shutdown(struct notifier_block *nb, unsigned long code, + void *unused) +{ + struct rio_channel *ch; + unsigned int i; + + riocm_debug(EXIT, "."); + + spin_lock_bh(&idr_lock); + idr_for_each_entry(&ch_idr, ch, i) { + riocm_debug(EXIT, "close ch %d", ch->id); + if (ch->state == RIO_CM_CONNECTED) + riocm_send_close(ch); + } + spin_unlock_bh(&idr_lock); + + return NOTIFY_DONE; +} + +/* + * riocm_interface handles addition/removal of remote RapidIO devices + */ +static struct subsys_interface riocm_interface = { + .name = "rio_cm", + .subsys = &rio_bus_type, + .add_dev = riocm_add_dev, + .remove_dev = riocm_remove_dev, +}; + +/* + * rio_mport_interface handles addition/removal local mport devices + */ +static struct class_interface rio_mport_interface __refdata = { + .class = &rio_mport_class, + .add_dev = riocm_add_mport, + .remove_dev = riocm_remove_mport, +}; + +static struct notifier_block rio_cm_notifier = { + .notifier_call = rio_cm_shutdown, +}; + +static int __init riocm_init(void) +{ + int ret; + + /* Create device class needed by udev */ + dev_class = class_create(THIS_MODULE, DRV_NAME); + if (IS_ERR(dev_class)) { + riocm_error("Cannot create " DRV_NAME " class"); + return PTR_ERR(dev_class); + } + + ret = alloc_chrdev_region(&dev_number, 0, 1, DRV_NAME); + if (ret) { + class_destroy(dev_class); + return ret; + } + + dev_major = MAJOR(dev_number); + dev_minor_base = MINOR(dev_number); + riocm_debug(INIT, "Registered class with %d major", dev_major); + + /* + * Register as rapidio_port class interface to get notifications about + * mport additions and removals. + */ + ret = class_interface_register(&rio_mport_interface); + if (ret) { + riocm_error("class_interface_register error: %d", ret); + goto err_reg; + } + + /* + * Register as RapidIO bus interface to get notifications about + * addition/removal of remote RapidIO devices. 
+ */ + ret = subsys_interface_register(&riocm_interface); + if (ret) { + riocm_error("subsys_interface_register error: %d", ret); + goto err_cl; + } + + ret = register_reboot_notifier(&rio_cm_notifier); + if (ret) { + riocm_error("failed to register reboot notifier (err=%d)", ret); + goto err_sif; + } + + ret = riocm_cdev_add(dev_number); + if (ret) { + unregister_reboot_notifier(&rio_cm_notifier); + ret = -ENODEV; + goto err_sif; + } + + return 0; +err_sif: + subsys_interface_unregister(&riocm_interface); +err_cl: + class_interface_unregister(&rio_mport_interface); +err_reg: + unregister_chrdev_region(dev_number, 1); + class_destroy(dev_class); + return ret; +} + +static void __exit riocm_exit(void) +{ + riocm_debug(EXIT, "enter"); + unregister_reboot_notifier(&rio_cm_notifier); + subsys_interface_unregister(&riocm_interface); + class_interface_unregister(&rio_mport_interface); + idr_destroy(&ch_idr); + + device_unregister(riocm_cdev.dev); + cdev_del(&(riocm_cdev.cdev)); + + class_destroy(dev_class); + unregister_chrdev_region(dev_number, 1); +} + +late_initcall(riocm_init); +module_exit(riocm_exit); diff --git a/include/uapi/linux/Kbuild b/include/uapi/linux/Kbuild index 6d4e92ccdc91..c44747c0796a 100644 --- a/include/uapi/linux/Kbuild +++ b/include/uapi/linux/Kbuild @@ -357,6 +357,7 @@ header-y += reiserfs_fs.h header-y += reiserfs_xattr.h header-y += resource.h header-y += rfkill.h +header-y += rio_cm_cdev.h header-y += rio_mport_cdev.h header-y += romfs_fs.h header-y += rose.h diff --git a/include/uapi/linux/rio_cm_cdev.h b/include/uapi/linux/rio_cm_cdev.h new file mode 100644 index 000000000000..6edb900d318d --- /dev/null +++ b/include/uapi/linux/rio_cm_cdev.h @@ -0,0 +1,78 @@ +/* + * Copyright (c) 2015, Integrated Device Technology Inc. + * Copyright (c) 2015, Prodrive Technologies + * Copyright (c) 2015, RapidIO Trade Association + * All rights reserved. + * + * This software is available to you under a choice of one of two licenses. + * You may choose to be licensed under the terms of the GNU General Public + * License(GPL) Version 2, or the BSD-3 Clause license below: + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * 3. Neither the name of the copyright holder nor the names of its contributors + * may be used to endorse or promote products derived from this software without + * specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, + * THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR + * PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR + * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, + * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, + * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; + * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, + * WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR + * OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF + * ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +#ifndef _RIO_CM_CDEV_H_ +#define _RIO_CM_CDEV_H_ + +#include + +struct rio_cm_channel { + __u16 id; + __u16 remote_channel; + __u16 remote_destid; + __u8 mport_id; +}; + +struct rio_cm_msg { + __u16 ch_num; + __u16 size; + __u32 rxto; /* receive timeout in mSec. 0 = blocking */ + __u64 msg; +}; + +struct rio_cm_accept { + __u16 ch_num; + __u16 pad0; + __u32 wait_to; /* accept timeout in mSec. 0 = blocking */ +}; + +/* RapidIO Channelized Messaging Driver IOCTLs */ +#define RIO_CM_IOC_MAGIC 'c' + +#define RIO_CM_EP_GET_LIST_SIZE _IOWR(RIO_CM_IOC_MAGIC, 1, __u32) +#define RIO_CM_EP_GET_LIST _IOWR(RIO_CM_IOC_MAGIC, 2, __u32) +#define RIO_CM_CHAN_CREATE _IOWR(RIO_CM_IOC_MAGIC, 3, __u16) +#define RIO_CM_CHAN_CLOSE _IOW(RIO_CM_IOC_MAGIC, 4, __u16) +#define RIO_CM_CHAN_BIND _IOW(RIO_CM_IOC_MAGIC, 5, struct rio_cm_channel) +#define RIO_CM_CHAN_LISTEN _IOW(RIO_CM_IOC_MAGIC, 6, __u16) +#define RIO_CM_CHAN_ACCEPT _IOWR(RIO_CM_IOC_MAGIC, 7, struct rio_cm_accept) +#define RIO_CM_CHAN_CONNECT _IOW(RIO_CM_IOC_MAGIC, 8, struct rio_cm_channel) +#define RIO_CM_CHAN_SEND _IOW(RIO_CM_IOC_MAGIC, 9, struct rio_cm_msg) +#define RIO_CM_CHAN_RECEIVE _IOWR(RIO_CM_IOC_MAGIC, 10, struct rio_cm_msg) +#define RIO_CM_MPORT_GET_LIST _IOWR(RIO_CM_IOC_MAGIC, 11, __u32) + +#endif /* _RIO_CM_CDEV_H_ */ -- cgit v1.2.3-71-gd317 From 444d13ff10fb13bc3e64859c3cf9ce43dcfeb075 Mon Sep 17 00:00:00 2001 From: Jessica Yu Date: Wed, 27 Jul 2016 12:06:21 +0930 Subject: modules: add ro_after_init support Add ro_after_init support for modules by adding a new page-aligned section in the module layout (after rodata) for ro_after_init data and enabling RO protection for that section after module init runs. 
Signed-off-by: Jessica Yu Acked-by: Kees Cook Signed-off-by: Rusty Russell --- include/linux/module.h | 6 +++-- include/uapi/linux/elf.h | 1 + kernel/livepatch/core.c | 2 +- kernel/module.c | 66 +++++++++++++++++++++++++++++++++++++++--------- 4 files changed, 60 insertions(+), 15 deletions(-) (limited to 'include/uapi/linux') diff --git a/include/linux/module.h b/include/linux/module.h index f95ed243a4de..0c3207d26ac0 100644 --- a/include/linux/module.h +++ b/include/linux/module.h @@ -298,6 +298,8 @@ struct module_layout { unsigned int text_size; /* Size of RO section of the module (text+rodata) */ unsigned int ro_size; + /* Size of RO after init section */ + unsigned int ro_after_init_size; #ifdef CONFIG_MODULES_TREE_LOOKUP struct mod_tree_node mtn; @@ -765,12 +767,12 @@ extern int module_sysfs_initialized; #ifdef CONFIG_DEBUG_SET_MODULE_RONX extern void set_all_modules_text_rw(void); extern void set_all_modules_text_ro(void); -extern void module_enable_ro(const struct module *mod); +extern void module_enable_ro(const struct module *mod, bool after_init); extern void module_disable_ro(const struct module *mod); #else static inline void set_all_modules_text_rw(void) { } static inline void set_all_modules_text_ro(void) { } -static inline void module_enable_ro(const struct module *mod) { } +static inline void module_enable_ro(const struct module *mod, bool after_init) { } static inline void module_disable_ro(const struct module *mod) { } #endif diff --git a/include/uapi/linux/elf.h b/include/uapi/linux/elf.h index cb4a72f888d5..70b172ba41ce 100644 --- a/include/uapi/linux/elf.h +++ b/include/uapi/linux/elf.h @@ -286,6 +286,7 @@ typedef struct elf64_phdr { #define SHF_ALLOC 0x2 #define SHF_EXECINSTR 0x4 #define SHF_RELA_LIVEPATCH 0x00100000 +#define SHF_RO_AFTER_INIT 0x00200000 #define SHF_MASKPROC 0xf0000000 /* special section indexes */ diff --git a/kernel/livepatch/core.c b/kernel/livepatch/core.c index 5c2bc1052691..8bbe50704621 100644 --- a/kernel/livepatch/core.c +++ b/kernel/livepatch/core.c @@ -309,7 +309,7 @@ static int klp_write_object_relocations(struct module *pmod, break; } - module_enable_ro(pmod); + module_enable_ro(pmod, true); return ret; } diff --git a/kernel/module.c b/kernel/module.c index c91c2fdca2e6..205a71a97852 100644 --- a/kernel/module.c +++ b/kernel/module.c @@ -1857,10 +1857,11 @@ static void mod_sysfs_teardown(struct module *mod) * from modification and any data from execution. 
* * General layout of module is: - * [text] [read-only-data] [writable data] - * text_size -----^ ^ ^ - * ro_size ------------------------| | - * size -------------------------------------------| + * [text] [read-only-data] [ro-after-init] [writable data] + * text_size -----^ ^ ^ ^ + * ro_size ------------------------| | | + * ro_after_init_size -----------------------------| | + * size -----------------------------------------------------------| * * These values are always page-aligned (as is base) */ @@ -1883,14 +1884,24 @@ static void frob_rodata(const struct module_layout *layout, (layout->ro_size - layout->text_size) >> PAGE_SHIFT); } +static void frob_ro_after_init(const struct module_layout *layout, + int (*set_memory)(unsigned long start, int num_pages)) +{ + BUG_ON((unsigned long)layout->base & (PAGE_SIZE-1)); + BUG_ON((unsigned long)layout->ro_size & (PAGE_SIZE-1)); + BUG_ON((unsigned long)layout->ro_after_init_size & (PAGE_SIZE-1)); + set_memory((unsigned long)layout->base + layout->ro_size, + (layout->ro_after_init_size - layout->ro_size) >> PAGE_SHIFT); +} + static void frob_writable_data(const struct module_layout *layout, int (*set_memory)(unsigned long start, int num_pages)) { BUG_ON((unsigned long)layout->base & (PAGE_SIZE-1)); - BUG_ON((unsigned long)layout->ro_size & (PAGE_SIZE-1)); + BUG_ON((unsigned long)layout->ro_after_init_size & (PAGE_SIZE-1)); BUG_ON((unsigned long)layout->size & (PAGE_SIZE-1)); - set_memory((unsigned long)layout->base + layout->ro_size, - (layout->size - layout->ro_size) >> PAGE_SHIFT); + set_memory((unsigned long)layout->base + layout->ro_after_init_size, + (layout->size - layout->ro_after_init_size) >> PAGE_SHIFT); } /* livepatching wants to disable read-only so it can frob module. */ @@ -1898,21 +1909,26 @@ void module_disable_ro(const struct module *mod) { frob_text(&mod->core_layout, set_memory_rw); frob_rodata(&mod->core_layout, set_memory_rw); + frob_ro_after_init(&mod->core_layout, set_memory_rw); frob_text(&mod->init_layout, set_memory_rw); frob_rodata(&mod->init_layout, set_memory_rw); } -void module_enable_ro(const struct module *mod) +void module_enable_ro(const struct module *mod, bool after_init) { frob_text(&mod->core_layout, set_memory_ro); frob_rodata(&mod->core_layout, set_memory_ro); frob_text(&mod->init_layout, set_memory_ro); frob_rodata(&mod->init_layout, set_memory_ro); + + if (after_init) + frob_ro_after_init(&mod->core_layout, set_memory_ro); } static void module_enable_nx(const struct module *mod) { frob_rodata(&mod->core_layout, set_memory_nx); + frob_ro_after_init(&mod->core_layout, set_memory_nx); frob_writable_data(&mod->core_layout, set_memory_nx); frob_rodata(&mod->init_layout, set_memory_nx); frob_writable_data(&mod->init_layout, set_memory_nx); @@ -1921,6 +1937,7 @@ static void module_enable_nx(const struct module *mod) static void module_disable_nx(const struct module *mod) { frob_rodata(&mod->core_layout, set_memory_x); + frob_ro_after_init(&mod->core_layout, set_memory_x); frob_writable_data(&mod->core_layout, set_memory_x); frob_rodata(&mod->init_layout, set_memory_x); frob_writable_data(&mod->init_layout, set_memory_x); @@ -1963,6 +1980,8 @@ static void disable_ro_nx(const struct module_layout *layout) frob_text(layout, set_memory_rw); frob_rodata(layout, set_memory_rw); frob_rodata(layout, set_memory_x); + frob_ro_after_init(layout, set_memory_rw); + frob_ro_after_init(layout, set_memory_x); frob_writable_data(layout, set_memory_x); } @@ -2305,6 +2324,7 @@ static void layout_sections(struct module *mod, 
struct load_info *info) * finder in the two loops below */ { SHF_EXECINSTR | SHF_ALLOC, ARCH_SHF_SMALL }, { SHF_ALLOC, SHF_WRITE | ARCH_SHF_SMALL }, + { SHF_RO_AFTER_INIT | SHF_ALLOC, ARCH_SHF_SMALL }, { SHF_WRITE | SHF_ALLOC, ARCH_SHF_SMALL }, { ARCH_SHF_SMALL | SHF_ALLOC, 0 } }; @@ -2336,7 +2356,11 @@ static void layout_sections(struct module *mod, struct load_info *info) mod->core_layout.size = debug_align(mod->core_layout.size); mod->core_layout.ro_size = mod->core_layout.size; break; - case 3: /* whole core */ + case 2: /* RO after init */ + mod->core_layout.size = debug_align(mod->core_layout.size); + mod->core_layout.ro_after_init_size = mod->core_layout.size; + break; + case 4: /* whole core */ mod->core_layout.size = debug_align(mod->core_layout.size); break; } @@ -2366,7 +2390,14 @@ static void layout_sections(struct module *mod, struct load_info *info) mod->init_layout.size = debug_align(mod->init_layout.size); mod->init_layout.ro_size = mod->init_layout.size; break; - case 3: /* whole init */ + case 2: + /* + * RO after init doesn't apply to init_layout (only + * core_layout), so it just takes the value of ro_size. + */ + mod->init_layout.ro_after_init_size = mod->init_layout.ro_size; + break; + case 4: /* whole init */ mod->init_layout.size = debug_align(mod->init_layout.size); break; } @@ -3193,6 +3224,7 @@ static struct module *layout_and_allocate(struct load_info *info, int flags) { /* Module within temporary copy. */ struct module *mod; + unsigned int ndx; int err; mod = setup_load_info(info, flags); @@ -3215,6 +3247,15 @@ static struct module *layout_and_allocate(struct load_info *info, int flags) /* We will do a special allocation for per-cpu sections later. */ info->sechdrs[info->index.pcpu].sh_flags &= ~(unsigned long)SHF_ALLOC; + /* + * Mark ro_after_init section with SHF_RO_AFTER_INIT so that + * layout_sections() can put it in the right place. + * Note: ro_after_init sections also have SHF_{WRITE,ALLOC} set. + */ + ndx = find_sec(info, ".data..ro_after_init"); + if (ndx) + info->sechdrs[ndx].sh_flags |= SHF_RO_AFTER_INIT; + /* Determine total sizes, and put offsets in sh_entsize. For now this is done generically; there doesn't appear to be any special cases for the architectures. */ @@ -3381,12 +3422,14 @@ static noinline int do_init_module(struct module *mod) /* Switch to core kallsyms now init is done: kallsyms may be walking! */ rcu_assign_pointer(mod->kallsyms, &mod->core_kallsyms); #endif + module_enable_ro(mod, true); mod_tree_remove_init(mod); disable_ro_nx(&mod->init_layout); module_arch_freeing_init(mod); mod->init_layout.base = NULL; mod->init_layout.size = 0; mod->init_layout.ro_size = 0; + mod->init_layout.ro_after_init_size = 0; mod->init_layout.text_size = 0; /* * We want to free module_init, but be aware that kallsyms may be @@ -3478,8 +3521,7 @@ static int complete_formation(struct module *mod, struct load_info *info) /* This relies on module_mutex for list integrity. */ module_bug_finalize(info->hdr, info->sechdrs, mod); - /* Set RO and NX regions */ - module_enable_ro(mod); + module_enable_ro(mod, false); module_enable_nx(mod); /* Mark state as coming so strong_try_module_get() ignores us, -- cgit v1.2.3-71-gd317
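
[Editor's illustration, not part of the patch series.] The ro_after_init commit above only becomes visible to module authors through the __ro_after_init attribute (from <linux/cache.h>), which places a variable in the .data..ro_after_init section that layout_sections() and module_enable_ro() now handle. The following is a minimal sketch, assuming that attribute and a generic demo module name: the table is writable while the module's init function runs and is made read-only once do_init_module() calls module_enable_ro(mod, true).

/*
 * Sketch of a module using __ro_after_init. "demo" names are
 * hypothetical; error handling is omitted for brevity.
 */
#include <linux/module.h>
#include <linux/kernel.h>
#include <linux/init.h>
#include <linux/cache.h>

static int demo_table[4] __ro_after_init;

static int __init demo_init(void)
{
	int i;

	/* Still writable here: module init has not finished yet */
	for (i = 0; i < ARRAY_SIZE(demo_table); i++)
		demo_table[i] = i * i;

	pr_info("demo: table will be sealed after init\n");
	return 0;
}

static void __exit demo_exit(void)
{
	/* Writing demo_table here would fault: its page is now read-only */
	pr_info("demo: table[3]=%d\n", demo_table[3]);
}

module_init(demo_init);
module_exit(demo_exit);
MODULE_LICENSE("GPL");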
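
[Editor's illustration, not part of the patch series.] For the rio_cm character device interface added earlier in this series (rio_cm_cdev.h), a userspace client drives the channel lifecycle entirely through ioctls: create, bind to an mport, listen, accept, then send/receive. The sketch below is a server-side example under the assumptions that the driver's device node appears as /dev/rio_cm, that mport 0 exists, that channel 5 is free, and that the received payload is NUL-terminated; it is not a definitive usage pattern and abbreviates error handling.

/* Minimal rio_cm server sketch: bind channel 5 on mport 0, listen,
 * accept one connection and receive a single message.
 */
#include <stdio.h>
#include <string.h>
#include <stdint.h>
#include <fcntl.h>
#include <unistd.h>
#include <sys/ioctl.h>
#include <linux/rio_cm_cdev.h>

int main(void)
{
	struct rio_cm_channel chan;
	struct rio_cm_accept acc;
	struct rio_cm_msg msg;
	char buf[256];
	__u16 ch_num = 5;	/* 0 would request automatic ID assignment */
	int fd;

	fd = open("/dev/rio_cm", O_RDWR);
	if (fd < 0) {
		perror("open");
		return 1;
	}

	/* Create the channel; the driver writes back the assigned ID */
	if (ioctl(fd, RIO_CM_CHAN_CREATE, &ch_num))
		goto err;

	/* Bind it to local mport 0 and start listening */
	memset(&chan, 0, sizeof(chan));
	chan.id = ch_num;
	chan.mport_id = 0;
	if (ioctl(fd, RIO_CM_CHAN_BIND, &chan))
		goto err;
	if (ioctl(fd, RIO_CM_CHAN_LISTEN, &ch_num))
		goto err;

	/* Wait up to 10 s; acc.ch_num returns the accepted channel's ID */
	memset(&acc, 0, sizeof(acc));
	acc.ch_num = ch_num;
	acc.wait_to = 10000;
	if (ioctl(fd, RIO_CM_CHAN_ACCEPT, &acc))
		goto err;

	/* Blocking receive (rxto = 0) into our buffer */
	memset(&msg, 0, sizeof(msg));
	msg.ch_num = acc.ch_num;
	msg.size = sizeof(buf);
	msg.rxto = 0;
	msg.msg = (uintptr_t)buf;
	if (ioctl(fd, RIO_CM_CHAN_RECEIVE, &msg))
		goto err;

	printf("received: %s\n", buf);
	ioctl(fd, RIO_CM_CHAN_CLOSE, &acc.ch_num);
	ioctl(fd, RIO_CM_CHAN_CLOSE, &ch_num);
	close(fd);
	return 0;
err:
	perror("ioctl");
	close(fd);
	return 1;
}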