From cae9910e73446cac68a54e3a7b02aaa12b689026 Mon Sep 17 00:00:00 2001 From: Felipe Gasper Date: Mon, 20 May 2019 19:43:51 -0500 Subject: net: Add UNIX_DIAG_UID to Netlink UNIX socket diagnostics. This adds the ability for Netlink to report a socket's UID along with the other UNIX diagnostic information that is already available. This will allow diagnostic tools greater insight into which users control which socket. To test this, do the following as a non-root user: unshare -U -r bash nc -l -U user.socket.$$ & .. and verify from within that same session that Netlink UNIX socket diagnostics report the socket's UID as 0. Also verify that Netlink UNIX socket diagnostics report the socket's UID as the user's UID from an unprivileged process in a different session. Verify the same from a root process. Signed-off-by: Felipe Gasper Signed-off-by: David S. Miller --- include/uapi/linux/unix_diag.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include/uapi/linux') diff --git a/include/uapi/linux/unix_diag.h b/include/uapi/linux/unix_diag.h index 5c502fdf7a42..a1988576fa8a 100644 --- a/include/uapi/linux/unix_diag.h +++ b/include/uapi/linux/unix_diag.h @@ -20,6 +20,7 @@ struct unix_diag_req { #define UDIAG_SHOW_ICONS 0x00000008 /* show pending connections */ #define UDIAG_SHOW_RQLEN 0x00000010 /* show skb receive queue len */ #define UDIAG_SHOW_MEMINFO 0x00000020 /* show memory info of a socket */ +#define UDIAG_SHOW_UID 0x00000040 /* show socket's UID */ struct unix_diag_msg { __u8 udiag_family; @@ -40,6 +41,7 @@ enum { UNIX_DIAG_RQLEN, UNIX_DIAG_MEMINFO, UNIX_DIAG_SHUTDOWN, + UNIX_DIAG_UID, __UNIX_DIAG_MAX, }; -- cgit v1.2.3-71-gd317 From b2557764d0ebf387da7a11967fd955f3b226b172 Mon Sep 17 00:00:00 2001 From: Andrew Lunn Date: Wed, 22 May 2019 20:47:03 +0200 Subject: net: phy: Add support for 100BaseT1 and 1000BaseT1 Add link modes for 100Mbps and 1Gbps over a single pair. Signed-off-by: Andrew Lunn Signed-off-by: David S. 
Miller --- drivers/net/phy/phy-core.c | 4 +++- include/uapi/linux/ethtool.h | 2 ++ 2 files changed, 5 insertions(+), 1 deletion(-) (limited to 'include/uapi/linux') diff --git a/drivers/net/phy/phy-core.c b/drivers/net/phy/phy-core.c index 3daf0214a242..16667fbac8bf 100644 --- a/drivers/net/phy/phy-core.c +++ b/drivers/net/phy/phy-core.c @@ -8,7 +8,7 @@ const char *phy_speed_to_str(int speed) { - BUILD_BUG_ON_MSG(__ETHTOOL_LINK_MODE_MASK_NBITS != 67, + BUILD_BUG_ON_MSG(__ETHTOOL_LINK_MODE_MASK_NBITS != 69, "Enum ethtool_link_mode_bit_indices and phylib are out of sync. " "If a speed or mode has been added please update phy_speed_to_str " "and the PHY settings array.\n"); @@ -131,9 +131,11 @@ static const struct phy_setting settings[] = { PHY_SETTING( 1000, FULL, 1000baseKX_Full ), PHY_SETTING( 1000, FULL, 1000baseT_Full ), PHY_SETTING( 1000, HALF, 1000baseT_Half ), + PHY_SETTING( 1000, FULL, 1000baseT1_Full ), PHY_SETTING( 1000, FULL, 1000baseX_Full ), /* 100M */ PHY_SETTING( 100, FULL, 100baseT_Full ), + PHY_SETTING( 100, FULL, 100baseT1_Full ), PHY_SETTING( 100, HALF, 100baseT_Half ), /* 10M */ PHY_SETTING( 10, FULL, 10baseT_Full ), diff --git a/include/uapi/linux/ethtool.h b/include/uapi/linux/ethtool.h index 3534ce157ae9..dd06302aa93e 100644 --- a/include/uapi/linux/ethtool.h +++ b/include/uapi/linux/ethtool.h @@ -1483,6 +1483,8 @@ enum ethtool_link_mode_bit_indices { ETHTOOL_LINK_MODE_200000baseLR4_ER4_FR4_Full_BIT = 64, ETHTOOL_LINK_MODE_200000baseDR4_Full_BIT = 65, ETHTOOL_LINK_MODE_200000baseCR4_Full_BIT = 66, + ETHTOOL_LINK_MODE_100baseT1_Full_BIT = 67, + ETHTOOL_LINK_MODE_1000baseT1_Full_BIT = 68, /* must be last entry */ __ETHTOOL_LINK_MODE_MASK_NBITS -- cgit v1.2.3-71-gd317 From aa50accfda60468fd132573b8f83e158ff45cb3d Mon Sep 17 00:00:00 2001 From: Hans Verkuil Date: Tue, 23 Apr 2019 08:44:59 -0400 Subject: media: cec: add CEC_MSG_FL_RAW flag and msg_is_raw helper function This adds the userspace API to send raw unchecked CEC messages. 
This will require root permissions. Signed-off-by: Hans Verkuil Signed-off-by: Mauro Carvalho Chehab --- drivers/media/cec/cec-priv.h | 5 +++++ include/uapi/linux/cec.h | 1 + 2 files changed, 6 insertions(+) (limited to 'include/uapi/linux') diff --git a/drivers/media/cec/cec-priv.h b/drivers/media/cec/cec-priv.h index 804e38f849c7..7bdf855aaecd 100644 --- a/drivers/media/cec/cec-priv.h +++ b/drivers/media/cec/cec-priv.h @@ -20,6 +20,11 @@ /* devnode to cec_adapter */ #define to_cec_adapter(node) container_of(node, struct cec_adapter, devnode) +static inline bool msg_is_raw(const struct cec_msg *msg) +{ + return msg->flags & CEC_MSG_FL_RAW; +} + /* cec-core.c */ extern int cec_debug; int cec_get_device(struct cec_devnode *devnode); diff --git a/include/uapi/linux/cec.h b/include/uapi/linux/cec.h index 3094af68b6e7..5704fa0292b5 100644 --- a/include/uapi/linux/cec.h +++ b/include/uapi/linux/cec.h @@ -144,6 +144,7 @@ static inline void cec_msg_set_reply_to(struct cec_msg *msg, /* cec_msg flags field */ #define CEC_MSG_FL_REPLY_TO_FOLLOWERS (1 << 0) +#define CEC_MSG_FL_RAW (1 << 1) /* cec_msg tx/rx_status field */ #define CEC_TX_STATUS_OK (1 << 0) -- cgit v1.2.3-71-gd317 From bf361231c295d92a28ca283ea713f56e93e55796 Mon Sep 17 00:00:00 2001 From: Richard Guy Briggs Date: Thu, 9 May 2019 20:01:36 -0400 Subject: audit: add saddr_fam filter field Provide a method to filter out sockaddr and bind calls by network address family. Existing SOCKADDR records are listed for any network activity. Implement the AUDIT_SADDR_FAM field selector to be able to classify or limit records to specific network address families, such as AF_INET or AF_INET6. 
An example of a network record that is unlikely to be useful and would flood the logs: type=SOCKADDR msg=audit(07/27/2017 12:18:27.019:845) : saddr={ fam=local path=/var/run/nscd/socket } type=SYSCALL msg=audit(07/27/2017 12:18:27.019:845) : arch=x86_64 syscall=connect success=no exit=ENOENT(No such file or directory) a0=0x3 a1=0x7fff229c4980 a2=0x6e a3=0x6 items=1 ppid=3301 pid=6145 auid=sgrubb uid=sgrubb gid=sgrubb euid=sgrubb suid=sgrubb fsuid=sgrubb egid=sgrubb sgid=sgrubb fsgid=sgrubb tty=pts3 ses=4 comm=bash exe=/usr/bin/bash subj=unconfined_u:unconfined_r:unconfined_t:s0-s0:c0.c1023 key=network-test Please see the audit-testsuite PR at https://github.com/linux-audit/audit-testsuite/pull/87 Please see the github issue https://github.com/linux-audit/audit-kernel/issues/64 Please see the github issue for the accompanying userspace support https://github.com/linux-audit/audit-userspace/issues/93 Signed-off-by: Richard Guy Briggs [PM: merge fuzz in auditfilter.c] Signed-off-by: Paul Moore --- include/uapi/linux/audit.h | 1 + kernel/auditfilter.c | 5 +++++ kernel/auditsc.c | 5 +++++ 3 files changed, 11 insertions(+) (limited to 'include/uapi/linux') diff --git a/include/uapi/linux/audit.h b/include/uapi/linux/audit.h index a1280af20336..c89c6495983d 100644 --- a/include/uapi/linux/audit.h +++ b/include/uapi/linux/audit.h @@ -281,6 +281,7 @@ #define AUDIT_OBJ_GID 110 #define AUDIT_FIELD_COMPARE 111 #define AUDIT_EXE 112 +#define AUDIT_SADDR_FAM 113 #define AUDIT_ARG0 200 #define AUDIT_ARG1 (AUDIT_ARG0+1) diff --git a/kernel/auditfilter.c b/kernel/auditfilter.c index d5e54e944f72..e69d136eeaf6 100644 --- a/kernel/auditfilter.c +++ b/kernel/auditfilter.c @@ -391,6 +391,7 @@ static int audit_field_valid(struct audit_entry *entry, struct audit_field *f) case AUDIT_SUBJ_CLR: case AUDIT_OBJ_LEV_LOW: case AUDIT_OBJ_LEV_HIGH: + case AUDIT_SADDR_FAM: /* bit ops are only useful on syscall args */ if (f->op == Audit_bitmask || f->op == Audit_bittest) return -EINVAL; @@ -438,6
+439,10 @@ static int audit_field_valid(struct audit_entry *entry, struct audit_field *f) if (f->val > AUDIT_MAX_FIELD_COMPARE) return -EINVAL; break; + case AUDIT_SADDR_FAM: + if (f->val >= AF_MAX) + return -EINVAL; + break; default: break; } diff --git a/kernel/auditsc.c b/kernel/auditsc.c index 30aa07b0115f..9134fe11ff6c 100644 --- a/kernel/auditsc.c +++ b/kernel/auditsc.c @@ -615,6 +615,11 @@ static int audit_filter_rules(struct task_struct *tsk, case AUDIT_LOGINUID_SET: result = audit_comparator(audit_loginuid_set(tsk), f->op, f->val); break; + case AUDIT_SADDR_FAM: + if (ctx->sockaddr) + result = audit_comparator(ctx->sockaddr->ss_family, + f->op, f->val); + break; case AUDIT_SUBJ_USER: case AUDIT_SUBJ_ROLE: case AUDIT_SUBJ_TYPE: -- cgit v1.2.3-71-gd317 From 88807dc8d573c0f718d0d26f592f212c5a487cf0 Mon Sep 17 00:00:00 2001 From: Oak Zeng Date: Thu, 4 Apr 2019 15:47:34 -0500 Subject: drm/amdgpu: Remap hdp coherency registers Remap HDP_MEM_COHERENCY_FLUSH_CNTL and HDP_REG_COHERENCY_FLUSH_CNTL to an empty page in mmio space. We will later map this page to process space so that applications can flush HDP. This can't be done properly at those registers' original location because it would expose more registers than desired to process space.
v2: Use explicit register hole location v3: Moved remapped hdp registers into adev struct v4: Use more generic name for remapped page Expose register offset in kfd_ioctl.h v5: Move hdp register remap function to nbio ip function v6: Fixed operator precedence issue and other bugs Signed-off-by: Oak Zeng Reviewed-by: Felix Kuehling Signed-off-by: Alex Deucher --- drivers/gpu/drm/amd/amdgpu/amdgpu.h | 7 +++++++ drivers/gpu/drm/amd/amdgpu/nbio_v7_0.c | 15 ++++++++++++--- drivers/gpu/drm/amd/amdgpu/nbio_v7_4.c | 15 ++++++++++++--- drivers/gpu/drm/amd/amdgpu/soc15.c | 11 +++++++++++ include/uapi/linux/kfd_ioctl.h | 7 +++++++ 5 files changed, 49 insertions(+), 6 deletions(-) (limited to 'include/uapi/linux') diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu.h b/drivers/gpu/drm/amd/amdgpu/amdgpu.h index 14398f55f602..23c3375623d7 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu.h +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu.h @@ -639,6 +639,11 @@ struct nbio_hdp_flush_reg { u32 ref_and_mask_sdma1; }; +struct amdgpu_mmio_remap { + u32 reg_offset; + resource_size_t bus_addr; +}; + struct amdgpu_nbio_funcs { const struct nbio_hdp_flush_reg *hdp_flush_reg; u32 (*get_hdp_flush_req_offset)(struct amdgpu_device *adev); @@ -666,6 +671,7 @@ struct amdgpu_nbio_funcs { void (*ih_control)(struct amdgpu_device *adev); void (*init_registers)(struct amdgpu_device *adev); void (*detect_hw_virt)(struct amdgpu_device *adev); + void (*remap_hdp_registers)(struct amdgpu_device *adev); }; struct amdgpu_df_funcs { @@ -764,6 +770,7 @@ struct amdgpu_device { void __iomem *rmmio; /* protects concurrent MM_INDEX/DATA based register access */ spinlock_t mmio_idx_lock; + struct amdgpu_mmio_remap rmmio_remap; /* protects concurrent SMC based register access */ spinlock_t smc_idx_lock; amdgpu_rreg_t smc_rreg; diff --git a/drivers/gpu/drm/amd/amdgpu/nbio_v7_0.c b/drivers/gpu/drm/amd/amdgpu/nbio_v7_0.c index 1cdb98ad2db3..73419fa38159 100644 --- a/drivers/gpu/drm/amd/amdgpu/nbio_v7_0.c +++ 
b/drivers/gpu/drm/amd/amdgpu/nbio_v7_0.c @@ -29,9 +29,18 @@ #include "nbio/nbio_7_0_sh_mask.h" #include "nbio/nbio_7_0_smn.h" #include "vega10_enum.h" +#include #define smnNBIF_MGCG_CTRL_LCLK 0x1013a05c +static void nbio_v7_0_remap_hdp_registers(struct amdgpu_device *adev) +{ + WREG32_SOC15(NBIO, 0, mmREMAP_HDP_MEM_FLUSH_CNTL, + adev->rmmio_remap.reg_offset + KFD_MMIO_REMAP_HDP_MEM_FLUSH_CNTL); + WREG32_SOC15(NBIO, 0, mmREMAP_HDP_REG_FLUSH_CNTL, + adev->rmmio_remap.reg_offset + KFD_MMIO_REMAP_HDP_REG_FLUSH_CNTL); +} + static u32 nbio_v7_0_get_rev_id(struct amdgpu_device *adev) { u32 tmp = RREG32_SOC15(NBIO, 0, mmRCC_DEV0_EPF0_STRAP0); @@ -55,10 +64,9 @@ static void nbio_v7_0_hdp_flush(struct amdgpu_device *adev, struct amdgpu_ring *ring) { if (!ring || !ring->funcs->emit_wreg) - WREG32_SOC15_NO_KIQ(NBIO, 0, mmHDP_MEM_COHERENCY_FLUSH_CNTL, 0); + WREG32_NO_KIQ((adev->rmmio_remap.reg_offset + KFD_MMIO_REMAP_HDP_MEM_FLUSH_CNTL) >> 2, 0); else - amdgpu_ring_emit_wreg(ring, SOC15_REG_OFFSET( - NBIO, 0, mmHDP_MEM_COHERENCY_FLUSH_CNTL), 0); + amdgpu_ring_emit_wreg(ring, (adev->rmmio_remap.reg_offset + KFD_MMIO_REMAP_HDP_MEM_FLUSH_CNTL) >> 2, 0); } static u32 nbio_v7_0_get_memsize(struct amdgpu_device *adev) @@ -283,4 +291,5 @@ const struct amdgpu_nbio_funcs nbio_v7_0_funcs = { .ih_control = nbio_v7_0_ih_control, .init_registers = nbio_v7_0_init_registers, .detect_hw_virt = nbio_v7_0_detect_hw_virt, + .remap_hdp_registers = nbio_v7_0_remap_hdp_registers, }; diff --git a/drivers/gpu/drm/amd/amdgpu/nbio_v7_4.c b/drivers/gpu/drm/amd/amdgpu/nbio_v7_4.c index c69d51598cfe..bfaaa327ae3c 100644 --- a/drivers/gpu/drm/amd/amdgpu/nbio_v7_4.c +++ b/drivers/gpu/drm/amd/amdgpu/nbio_v7_4.c @@ -27,9 +27,18 @@ #include "nbio/nbio_7_4_offset.h" #include "nbio/nbio_7_4_sh_mask.h" #include "nbio/nbio_7_4_0_smn.h" +#include #define smnNBIF_MGCG_CTRL_LCLK 0x1013a21c +static void nbio_v7_4_remap_hdp_registers(struct amdgpu_device *adev) +{ + WREG32_SOC15(NBIO, 0, mmREMAP_HDP_MEM_FLUSH_CNTL, + 
adev->rmmio_remap.reg_offset + KFD_MMIO_REMAP_HDP_MEM_FLUSH_CNTL); + WREG32_SOC15(NBIO, 0, mmREMAP_HDP_REG_FLUSH_CNTL, + adev->rmmio_remap.reg_offset + KFD_MMIO_REMAP_HDP_REG_FLUSH_CNTL); +} + static u32 nbio_v7_4_get_rev_id(struct amdgpu_device *adev) { u32 tmp = RREG32_SOC15(NBIO, 0, mmRCC_DEV0_EPF0_STRAP0); @@ -53,10 +62,9 @@ static void nbio_v7_4_hdp_flush(struct amdgpu_device *adev, struct amdgpu_ring *ring) { if (!ring || !ring->funcs->emit_wreg) - WREG32_SOC15_NO_KIQ(NBIO, 0, mmHDP_MEM_COHERENCY_FLUSH_CNTL, 0); + WREG32_NO_KIQ((adev->rmmio_remap.reg_offset + KFD_MMIO_REMAP_HDP_MEM_FLUSH_CNTL) >> 2, 0); else - amdgpu_ring_emit_wreg(ring, SOC15_REG_OFFSET( - NBIO, 0, mmHDP_MEM_COHERENCY_FLUSH_CNTL), 0); + amdgpu_ring_emit_wreg(ring, (adev->rmmio_remap.reg_offset + KFD_MMIO_REMAP_HDP_MEM_FLUSH_CNTL) >> 2, 0); } static u32 nbio_v7_4_get_memsize(struct amdgpu_device *adev) @@ -262,4 +270,5 @@ const struct amdgpu_nbio_funcs nbio_v7_4_funcs = { .ih_control = nbio_v7_4_ih_control, .init_registers = nbio_v7_4_init_registers, .detect_hw_virt = nbio_v7_4_detect_hw_virt, + .remap_hdp_registers = nbio_v7_4_remap_hdp_registers, }; diff --git a/drivers/gpu/drm/amd/amdgpu/soc15.c b/drivers/gpu/drm/amd/amdgpu/soc15.c index 4900e4958dec..78bd00a0142f 100644 --- a/drivers/gpu/drm/amd/amdgpu/soc15.c +++ b/drivers/gpu/drm/amd/amdgpu/soc15.c @@ -44,6 +44,7 @@ #include "smuio/smuio_9_0_offset.h" #include "smuio/smuio_9_0_sh_mask.h" #include "nbio/nbio_7_0_default.h" +#include "nbio/nbio_7_0_offset.h" #include "nbio/nbio_7_0_sh_mask.h" #include "nbio/nbio_7_0_smn.h" #include "mp/mp_9_0_offset.h" @@ -64,6 +65,7 @@ #include "dce_virtual.h" #include "mxgpu_ai.h" #include "amdgpu_smu.h" +#include #define mmMP0_MISC_CGTT_CTRL0 0x01b9 #define mmMP0_MISC_CGTT_CTRL0_BASE_IDX 0 @@ -783,8 +785,11 @@ static const struct amdgpu_asic_funcs vega20_asic_funcs = static int soc15_common_early_init(void *handle) { +#define MMIO_REG_HOLE_OFFSET (0x80000 - PAGE_SIZE) struct amdgpu_device *adev = 
(struct amdgpu_device *)handle; + adev->rmmio_remap.reg_offset = MMIO_REG_HOLE_OFFSET; + adev->rmmio_remap.bus_addr = adev->rmmio_base + MMIO_REG_HOLE_OFFSET; adev->smc_rreg = NULL; adev->smc_wreg = NULL; adev->pcie_rreg = &soc15_pcie_rreg; @@ -1014,6 +1019,12 @@ static int soc15_common_hw_init(void *handle) soc15_program_aspm(adev); /* setup nbio registers */ adev->nbio_funcs->init_registers(adev); + /* remap HDP registers to a hole in mmio space, + * for the purpose of expose those registers + * to process space + */ + if (adev->nbio_funcs->remap_hdp_registers) + adev->nbio_funcs->remap_hdp_registers(adev); /* enable the doorbell aperture */ soc15_enable_doorbell_aperture(adev, true); /* HW doorbell routing policy: doorbell writing not diff --git a/include/uapi/linux/kfd_ioctl.h b/include/uapi/linux/kfd_ioctl.h index dc067ed0b72d..bb1b4280f53d 100644 --- a/include/uapi/linux/kfd_ioctl.h +++ b/include/uapi/linux/kfd_ioctl.h @@ -426,6 +426,13 @@ struct kfd_ioctl_import_dmabuf_args { __u32 dmabuf_fd; /* to KFD */ }; +/* Register offset inside the remapped mmio page + */ +enum kfd_mmio_remap { + KFD_MMIO_REMAP_HDP_MEM_FLUSH_CNTL = 0, + KFD_MMIO_REMAP_HDP_REG_FLUSH_CNTL = 4, +}; + #define AMDKFD_IOCTL_BASE 'K' #define AMDKFD_IO(nr) _IO(AMDKFD_IOCTL_BASE, nr) #define AMDKFD_IOR(nr, type) _IOR(AMDKFD_IOCTL_BASE, nr, type) -- cgit v1.2.3-71-gd317 From d8e408a82704c86ba87c3d58cfe69dcdb758aa07 Mon Sep 17 00:00:00 2001 From: Oak Zeng Date: Thu, 11 Apr 2019 14:43:39 -0500 Subject: drm/amdkfd: Expose HDP registers to user space Introduce a new memory type (KFD_IOC_ALLOC_MEM_FLAGS_MMIO_REMAP) and expose mmio page of HDP registers to user space through this new memory type. 
v2: moved remapped hdp regs to adev struct v3: rename the new memory type to ALLOC_MEM_FLAGS_MMIO_REMAP v4: use more generic function name v5: Fail remapped mmio allocation for asics before gfx9 Signed-off-by: Oak Zeng Reviewed-by: Felix Kuehling Signed-off-by: Alex Deucher --- drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.c | 7 +++++++ drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h | 1 + drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c | 7 ++++--- drivers/gpu/drm/amd/amdkfd/kfd_chardev.c | 6 ++++++ drivers/gpu/drm/amd/include/kgd_kfd_interface.h | 1 + include/uapi/linux/kfd_ioctl.h | 1 + 6 files changed, 20 insertions(+), 3 deletions(-) (limited to 'include/uapi/linux') diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.c index aeead072fa79..401edb605fdd 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.c @@ -519,6 +519,13 @@ uint64_t amdgpu_amdkfd_get_hive_id(struct kgd_dev *kgd) return adev->gmc.xgmi.hive_id; } +uint64_t amdgpu_amdkfd_get_mmio_remap_phys_addr(struct kgd_dev *kgd) +{ + struct amdgpu_device *adev = (struct amdgpu_device *)kgd; + + return adev->rmmio_remap.bus_addr; +} + int amdgpu_amdkfd_submit_ib(struct kgd_dev *kgd, enum kgd_engine_type engine, uint32_t vmid, uint64_t gpu_addr, uint32_t *ib_cmd, uint32_t ib_len) diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h index 4e37fa7e85b1..ea1f141db3ff 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h @@ -169,6 +169,7 @@ int amdgpu_amdkfd_get_dmabuf_info(struct kgd_dev *kgd, int dma_buf_fd, uint32_t *flags); uint64_t amdgpu_amdkfd_get_vram_usage(struct kgd_dev *kgd); uint64_t amdgpu_amdkfd_get_hive_id(struct kgd_dev *kgd); +uint64_t amdgpu_amdkfd_get_mmio_remap_phys_addr(struct kgd_dev *kgd); #define read_user_wptr(mmptr, wptr, dst) \ ({ \ diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c index a6e5184d436c..00e013581a70 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c @@ -1109,7 +1109,8 @@ int amdgpu_amdkfd_gpuvm_alloc_memory_of_gpu( if (!offset || !*offset) return -EINVAL; user_addr = *offset; - } else if (flags & ALLOC_MEM_FLAGS_DOORBELL) { + } else if (flags & (ALLOC_MEM_FLAGS_DOORBELL | + ALLOC_MEM_FLAGS_MMIO_REMAP)) { domain = AMDGPU_GEM_DOMAIN_GTT; alloc_domain = AMDGPU_GEM_DOMAIN_CPU; bo_type = ttm_bo_type_sg; @@ -1294,8 +1295,8 @@ int amdgpu_amdkfd_gpuvm_free_memory_of_gpu( /* Free the sync object */ amdgpu_sync_free(&mem->sync); - /* If the SG is not NULL, it's one we created for a doorbell - * BO. We need to free it. + /* If the SG is not NULL, it's one we created for a doorbell or mmio + * remap BO. We need to free it. */ if (mem->bo->tbo.sg) { sg_free_table(mem->bo->tbo.sg); diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c b/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c index 083bd8114db1..d795e5018270 100644 --- a/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c +++ b/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c @@ -1272,6 +1272,12 @@ static int kfd_ioctl_alloc_memory_of_gpu(struct file *filep, if (args->size != kfd_doorbell_process_slice(dev)) return -EINVAL; offset = kfd_get_process_doorbells(dev, p); + } else if (flags & KFD_IOC_ALLOC_MEM_FLAGS_MMIO_REMAP) { + if (args->size != PAGE_SIZE) + return -EINVAL; + offset = amdgpu_amdkfd_get_mmio_remap_phys_addr(dev->kgd); + if (!offset) + return -ENOMEM; } mutex_lock(&p->mutex); diff --git a/drivers/gpu/drm/amd/include/kgd_kfd_interface.h b/drivers/gpu/drm/amd/include/kgd_kfd_interface.h index b897aca9b4c9..98b9533e672b 100644 --- a/drivers/gpu/drm/amd/include/kgd_kfd_interface.h +++ b/drivers/gpu/drm/amd/include/kgd_kfd_interface.h @@ -174,6 +174,7 @@ struct tile_config { #define ALLOC_MEM_FLAGS_GTT (1 << 1) #define ALLOC_MEM_FLAGS_USERPTR (1 << 2) #define ALLOC_MEM_FLAGS_DOORBELL 
(1 << 3) +#define ALLOC_MEM_FLAGS_MMIO_REMAP (1 << 4) /* * Allocation flags attributes/access options. diff --git a/include/uapi/linux/kfd_ioctl.h b/include/uapi/linux/kfd_ioctl.h index bb1b4280f53d..1e7d5f3376b0 100644 --- a/include/uapi/linux/kfd_ioctl.h +++ b/include/uapi/linux/kfd_ioctl.h @@ -338,6 +338,7 @@ struct kfd_ioctl_acquire_vm_args { #define KFD_IOC_ALLOC_MEM_FLAGS_GTT (1 << 1) #define KFD_IOC_ALLOC_MEM_FLAGS_USERPTR (1 << 2) #define KFD_IOC_ALLOC_MEM_FLAGS_DOORBELL (1 << 3) +#define KFD_IOC_ALLOC_MEM_FLAGS_MMIO_REMAP (1 << 4) /* Allocation flags: attributes/access options */ #define KFD_IOC_ALLOC_MEM_FLAGS_WRITABLE (1 << 31) #define KFD_IOC_ALLOC_MEM_FLAGS_EXECUTABLE (1 << 30) -- cgit v1.2.3-71-gd317 From 1b4670f6983156526c286723465fdf805070b45d Mon Sep 17 00:00:00 2001 From: Oak Zeng Date: Thu, 7 Feb 2019 14:02:27 -0600 Subject: drm/amdkfd: Introduce XGMI SDMA queue type Existing QUEUE_TYPE_SDMA means PCIe optimized SDMA queues. Introduce a new QUEUE_TYPE_SDMA_XGMI, which is optimized for non-PCIe transfer such as XGMI. 
Signed-off-by: Oak Zeng Reviewed-by: Felix Kuehling Signed-off-by: Felix Kuehling Signed-off-by: Alex Deucher --- drivers/gpu/drm/amd/amdkfd/kfd_chardev.c | 2 + drivers/gpu/drm/amd/amdkfd/kfd_device.c | 15 +++ .../gpu/drm/amd/amdkfd/kfd_device_queue_manager.c | 123 +++++++++++++++------ .../gpu/drm/amd/amdkfd/kfd_device_queue_manager.h | 3 + drivers/gpu/drm/amd/amdkfd/kfd_kernel_queue_v9.c | 2 + drivers/gpu/drm/amd/amdkfd/kfd_kernel_queue_vi.c | 2 + drivers/gpu/drm/amd/amdkfd/kfd_packet_manager.c | 3 +- drivers/gpu/drm/amd/amdkfd/kfd_priv.h | 4 +- .../gpu/drm/amd/amdkfd/kfd_process_queue_manager.c | 10 +- include/uapi/linux/kfd_ioctl.h | 7 +- 10 files changed, 132 insertions(+), 39 deletions(-) (limited to 'include/uapi/linux') diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c b/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c index 3ccaa38779ea..38ae53fe8182 100644 --- a/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c +++ b/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c @@ -213,6 +213,8 @@ static int set_queue_properties_from_user(struct queue_properties *q_properties, q_properties->type = KFD_QUEUE_TYPE_COMPUTE; else if (args->queue_type == KFD_IOC_QUEUE_TYPE_SDMA) q_properties->type = KFD_QUEUE_TYPE_SDMA; + else if (args->queue_type == KFD_IOC_QUEUE_TYPE_SDMA_XGMI) + q_properties->type = KFD_QUEUE_TYPE_SDMA_XGMI; else return -ENOTSUPP; diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_device.c b/drivers/gpu/drm/amd/amdkfd/kfd_device.c index 8202a5db3a35..1368b41cb92b 100644 --- a/drivers/gpu/drm/amd/amdkfd/kfd_device.c +++ b/drivers/gpu/drm/amd/amdkfd/kfd_device.c @@ -54,6 +54,7 @@ static const struct kfd_device_info kaveri_device_info = { .needs_iommu_device = true, .needs_pci_atomics = false, .num_sdma_engines = 2, + .num_xgmi_sdma_engines = 0, .num_sdma_queues_per_engine = 2, }; @@ -71,6 +72,7 @@ static const struct kfd_device_info carrizo_device_info = { .needs_iommu_device = true, .needs_pci_atomics = false, .num_sdma_engines = 2, + .num_xgmi_sdma_engines = 0, 
.num_sdma_queues_per_engine = 2, }; @@ -87,6 +89,7 @@ static const struct kfd_device_info raven_device_info = { .needs_iommu_device = true, .needs_pci_atomics = true, .num_sdma_engines = 1, + .num_xgmi_sdma_engines = 0, .num_sdma_queues_per_engine = 2, }; #endif @@ -105,6 +108,7 @@ static const struct kfd_device_info hawaii_device_info = { .needs_iommu_device = false, .needs_pci_atomics = false, .num_sdma_engines = 2, + .num_xgmi_sdma_engines = 0, .num_sdma_queues_per_engine = 2, }; @@ -121,6 +125,7 @@ static const struct kfd_device_info tonga_device_info = { .needs_iommu_device = false, .needs_pci_atomics = true, .num_sdma_engines = 2, + .num_xgmi_sdma_engines = 0, .num_sdma_queues_per_engine = 2, }; @@ -137,6 +142,7 @@ static const struct kfd_device_info fiji_device_info = { .needs_iommu_device = false, .needs_pci_atomics = true, .num_sdma_engines = 2, + .num_xgmi_sdma_engines = 0, .num_sdma_queues_per_engine = 2, }; @@ -153,6 +159,7 @@ static const struct kfd_device_info fiji_vf_device_info = { .needs_iommu_device = false, .needs_pci_atomics = false, .num_sdma_engines = 2, + .num_xgmi_sdma_engines = 0, .num_sdma_queues_per_engine = 2, }; @@ -170,6 +177,7 @@ static const struct kfd_device_info polaris10_device_info = { .needs_iommu_device = false, .needs_pci_atomics = true, .num_sdma_engines = 2, + .num_xgmi_sdma_engines = 0, .num_sdma_queues_per_engine = 2, }; @@ -186,6 +194,7 @@ static const struct kfd_device_info polaris10_vf_device_info = { .needs_iommu_device = false, .needs_pci_atomics = false, .num_sdma_engines = 2, + .num_xgmi_sdma_engines = 0, .num_sdma_queues_per_engine = 2, }; @@ -202,6 +211,7 @@ static const struct kfd_device_info polaris11_device_info = { .needs_iommu_device = false, .needs_pci_atomics = true, .num_sdma_engines = 2, + .num_xgmi_sdma_engines = 0, .num_sdma_queues_per_engine = 2, }; @@ -218,6 +228,7 @@ static const struct kfd_device_info polaris12_device_info = { .needs_iommu_device = false, .needs_pci_atomics = true, .num_sdma_engines 
= 2, + .num_xgmi_sdma_engines = 0, .num_sdma_queues_per_engine = 2, }; @@ -234,6 +245,7 @@ static const struct kfd_device_info vega10_device_info = { .needs_iommu_device = false, .needs_pci_atomics = false, .num_sdma_engines = 2, + .num_xgmi_sdma_engines = 0, .num_sdma_queues_per_engine = 2, }; @@ -250,6 +262,7 @@ static const struct kfd_device_info vega10_vf_device_info = { .needs_iommu_device = false, .needs_pci_atomics = false, .num_sdma_engines = 2, + .num_xgmi_sdma_engines = 0, .num_sdma_queues_per_engine = 2, }; @@ -266,6 +279,7 @@ static const struct kfd_device_info vega12_device_info = { .needs_iommu_device = false, .needs_pci_atomics = false, .num_sdma_engines = 2, + .num_xgmi_sdma_engines = 0, .num_sdma_queues_per_engine = 2, }; @@ -282,6 +296,7 @@ static const struct kfd_device_info vega20_device_info = { .needs_iommu_device = false, .needs_pci_atomics = false, .num_sdma_engines = 2, + .num_xgmi_sdma_engines = 0, .num_sdma_queues_per_engine = 8, }; diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c b/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c index d41045d3fc3a..1562590d837e 100644 --- a/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c +++ b/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c @@ -60,14 +60,14 @@ static int create_sdma_queue_nocpsch(struct device_queue_manager *dqm, struct qcm_process_device *qpd); static void deallocate_sdma_queue(struct device_queue_manager *dqm, - unsigned int sdma_queue_id); + struct queue *q); static void kfd_process_hw_exception(struct work_struct *work); static inline enum KFD_MQD_TYPE get_mqd_type_from_queue_type(enum kfd_queue_type type) { - if (type == KFD_QUEUE_TYPE_SDMA) + if (type == KFD_QUEUE_TYPE_SDMA || type == KFD_QUEUE_TYPE_SDMA_XGMI) return KFD_MQD_TYPE_SDMA; return KFD_MQD_TYPE_CP; } @@ -107,12 +107,23 @@ static unsigned int get_num_sdma_engines(struct device_queue_manager *dqm) return dqm->dev->device_info->num_sdma_engines; } +static unsigned int 
get_num_xgmi_sdma_engines(struct device_queue_manager *dqm) +{ + return dqm->dev->device_info->num_xgmi_sdma_engines; +} + unsigned int get_num_sdma_queues(struct device_queue_manager *dqm) { return dqm->dev->device_info->num_sdma_engines * dqm->dev->device_info->num_sdma_queues_per_engine; } +unsigned int get_num_xgmi_sdma_queues(struct device_queue_manager *dqm) +{ + return dqm->dev->device_info->num_xgmi_sdma_engines + * dqm->dev->device_info->num_sdma_queues_per_engine; +} + void program_sh_mem_settings(struct device_queue_manager *dqm, struct qcm_process_device *qpd) { @@ -133,7 +144,8 @@ static int allocate_doorbell(struct qcm_process_device *qpd, struct queue *q) * preserve the user mode ABI. */ q->doorbell_id = q->properties.queue_id; - } else if (q->properties.type == KFD_QUEUE_TYPE_SDMA) { + } else if (q->properties.type == KFD_QUEUE_TYPE_SDMA || + q->properties.type == KFD_QUEUE_TYPE_SDMA_XGMI) { /* For SDMA queues on SOC15 with 8-byte doorbell, use static * doorbell assignments based on the engine and queue id. 
* The doobell index distance between RLC (2*i) and (2*i+1) @@ -174,7 +186,8 @@ static void deallocate_doorbell(struct qcm_process_device *qpd, struct kfd_dev *dev = qpd->dqm->dev; if (!KFD_IS_SOC15(dev->device_info->asic_family) || - q->properties.type == KFD_QUEUE_TYPE_SDMA) + q->properties.type == KFD_QUEUE_TYPE_SDMA || + q->properties.type == KFD_QUEUE_TYPE_SDMA_XGMI) return; old = test_and_clear_bit(q->doorbell_id, qpd->doorbell_bitmap); @@ -289,7 +302,8 @@ static int create_queue_nocpsch(struct device_queue_manager *dqm, if (q->properties.type == KFD_QUEUE_TYPE_COMPUTE) retval = create_compute_queue_nocpsch(dqm, q, qpd); - else if (q->properties.type == KFD_QUEUE_TYPE_SDMA) + else if (q->properties.type == KFD_QUEUE_TYPE_SDMA || + q->properties.type == KFD_QUEUE_TYPE_SDMA_XGMI) retval = create_sdma_queue_nocpsch(dqm, q, qpd); else retval = -EINVAL; @@ -307,6 +321,8 @@ static int create_queue_nocpsch(struct device_queue_manager *dqm, if (q->properties.type == KFD_QUEUE_TYPE_SDMA) dqm->sdma_queue_count++; + else if (q->properties.type == KFD_QUEUE_TYPE_SDMA_XGMI) + dqm->xgmi_sdma_queue_count++; /* * Unconditionally increment this counter, regardless of the queue's @@ -430,7 +446,10 @@ static int destroy_queue_nocpsch_locked(struct device_queue_manager *dqm, deallocate_hqd(dqm, q); } else if (q->properties.type == KFD_QUEUE_TYPE_SDMA) { dqm->sdma_queue_count--; - deallocate_sdma_queue(dqm, q->sdma_id); + deallocate_sdma_queue(dqm, q); + } else if (q->properties.type == KFD_QUEUE_TYPE_SDMA_XGMI) { + dqm->xgmi_sdma_queue_count--; + deallocate_sdma_queue(dqm, q); } else { pr_debug("q->properties.type %d is invalid\n", q->properties.type); @@ -521,7 +540,8 @@ static int update_queue(struct device_queue_manager *dqm, struct queue *q) } } else if (prev_active && (q->properties.type == KFD_QUEUE_TYPE_COMPUTE || - q->properties.type == KFD_QUEUE_TYPE_SDMA)) { + q->properties.type == KFD_QUEUE_TYPE_SDMA || + q->properties.type == KFD_QUEUE_TYPE_SDMA_XGMI)) { retval = 
mqd_mgr->destroy_mqd(mqd_mgr, q->mqd, KFD_PREEMPT_TYPE_WAVEFRONT_DRAIN, KFD_UNMAP_LATENCY_MS, q->pipe, q->queue); @@ -548,7 +568,8 @@ static int update_queue(struct device_queue_manager *dqm, struct queue *q) retval = map_queues_cpsch(dqm); else if (q->properties.is_active && (q->properties.type == KFD_QUEUE_TYPE_COMPUTE || - q->properties.type == KFD_QUEUE_TYPE_SDMA)) { + q->properties.type == KFD_QUEUE_TYPE_SDMA || + q->properties.type == KFD_QUEUE_TYPE_SDMA_XGMI)) { if (WARN(q->process->mm != current->mm, "should only run in user thread")) retval = -EFAULT; @@ -840,6 +861,7 @@ static int initialize_nocpsch(struct device_queue_manager *dqm) INIT_LIST_HEAD(&dqm->queues); dqm->queue_count = dqm->next_pipe_to_allocate = 0; dqm->sdma_queue_count = 0; + dqm->xgmi_sdma_queue_count = 0; for (pipe = 0; pipe < get_pipes_per_mec(dqm); pipe++) { int pipe_offset = pipe * get_queues_per_pipe(dqm); @@ -852,6 +874,7 @@ static int initialize_nocpsch(struct device_queue_manager *dqm) dqm->vmid_bitmap = (1 << dqm->dev->vm_info.vmid_num_kfd) - 1; dqm->sdma_bitmap = (1ULL << get_num_sdma_queues(dqm)) - 1; + dqm->xgmi_sdma_bitmap = (1ULL << get_num_xgmi_sdma_queues(dqm)) - 1; return 0; } @@ -886,17 +909,34 @@ static int allocate_sdma_queue(struct device_queue_manager *dqm, { int bit; - if (dqm->sdma_bitmap == 0) - return -ENOMEM; - - bit = __ffs64(dqm->sdma_bitmap); - dqm->sdma_bitmap &= ~(1ULL << bit); - q->sdma_id = bit; - - q->properties.sdma_engine_id = q->sdma_id % get_num_sdma_engines(dqm); - q->properties.sdma_queue_id = q->sdma_id / get_num_sdma_engines(dqm); + if (q->properties.type == KFD_QUEUE_TYPE_SDMA) { + if (dqm->sdma_bitmap == 0) + return -ENOMEM; + bit = __ffs64(dqm->sdma_bitmap); + dqm->sdma_bitmap &= ~(1ULL << bit); + q->sdma_id = bit; + q->properties.sdma_engine_id = q->sdma_id % + get_num_sdma_engines(dqm); + q->properties.sdma_queue_id = q->sdma_id / + get_num_sdma_engines(dqm); + } else if (q->properties.type == KFD_QUEUE_TYPE_SDMA_XGMI) { + if 
(dqm->xgmi_sdma_bitmap == 0) + return -ENOMEM; + bit = __ffs64(dqm->xgmi_sdma_bitmap); + dqm->xgmi_sdma_bitmap &= ~(1ULL << bit); + q->sdma_id = bit; + /* sdma_engine_id is sdma id including + * both PCIe-optimized SDMAs and XGMI- + * optimized SDMAs. The calculation below + * assumes the first N engines are always + * PCIe-optimized ones + */ + q->properties.sdma_engine_id = get_num_sdma_engines(dqm) + + q->sdma_id % get_num_xgmi_sdma_engines(dqm); + q->properties.sdma_queue_id = q->sdma_id / + get_num_xgmi_sdma_engines(dqm); + } - pr_debug("SDMA id is: %d\n", q->sdma_id); pr_debug("SDMA engine id: %d\n", q->properties.sdma_engine_id); pr_debug("SDMA queue id: %d\n", q->properties.sdma_queue_id); @@ -904,11 +944,17 @@ static int allocate_sdma_queue(struct device_queue_manager *dqm, } static void deallocate_sdma_queue(struct device_queue_manager *dqm, - unsigned int sdma_id) + struct queue *q) { - if (sdma_id >= get_num_sdma_queues(dqm)) - return; - dqm->sdma_bitmap |= (1ULL << sdma_id); + if (q->properties.type == KFD_QUEUE_TYPE_SDMA) { + if (q->sdma_id >= get_num_sdma_queues(dqm)) + return; + dqm->sdma_bitmap |= (1ULL << q->sdma_id); + } else if (q->properties.type == KFD_QUEUE_TYPE_SDMA_XGMI) { + if (q->sdma_id >= get_num_xgmi_sdma_queues(dqm)) + return; + dqm->xgmi_sdma_bitmap |= (1ULL << q->sdma_id); + } } static int create_sdma_queue_nocpsch(struct device_queue_manager *dqm, @@ -946,7 +992,7 @@ out_uninit_mqd: out_deallocate_doorbell: deallocate_doorbell(qpd, q); out_deallocate_sdma_queue: - deallocate_sdma_queue(dqm, q->sdma_id); + deallocate_sdma_queue(dqm, q); return retval; } @@ -1004,8 +1050,10 @@ static int initialize_cpsch(struct device_queue_manager *dqm) INIT_LIST_HEAD(&dqm->queues); dqm->queue_count = dqm->processes_count = 0; dqm->sdma_queue_count = 0; + dqm->xgmi_sdma_queue_count = 0; dqm->active_runlist = false; dqm->sdma_bitmap = (1ULL << get_num_sdma_queues(dqm)) - 1; + dqm->xgmi_sdma_bitmap = (1ULL << get_num_xgmi_sdma_queues(dqm)) - 1; 
INIT_WORK(&dqm->hw_exception_work, kfd_process_hw_exception); @@ -1127,7 +1175,8 @@ static int create_queue_cpsch(struct device_queue_manager *dqm, struct queue *q, goto out; } - if (q->properties.type == KFD_QUEUE_TYPE_SDMA) { + if (q->properties.type == KFD_QUEUE_TYPE_SDMA || + q->properties.type == KFD_QUEUE_TYPE_SDMA_XGMI) { retval = allocate_sdma_queue(dqm, q); if (retval) goto out; @@ -1167,6 +1216,8 @@ static int create_queue_cpsch(struct device_queue_manager *dqm, struct queue *q, if (q->properties.type == KFD_QUEUE_TYPE_SDMA) dqm->sdma_queue_count++; + else if (q->properties.type == KFD_QUEUE_TYPE_SDMA_XGMI) + dqm->xgmi_sdma_queue_count++; /* * Unconditionally increment this counter, regardless of the queue's * type or whether the queue is active. @@ -1182,8 +1233,9 @@ static int create_queue_cpsch(struct device_queue_manager *dqm, struct queue *q, out_deallocate_doorbell: deallocate_doorbell(qpd, q); out_deallocate_sdma_queue: - if (q->properties.type == KFD_QUEUE_TYPE_SDMA) - deallocate_sdma_queue(dqm, q->sdma_id); + if (q->properties.type == KFD_QUEUE_TYPE_SDMA || + q->properties.type == KFD_QUEUE_TYPE_SDMA_XGMI) + deallocate_sdma_queue(dqm, q); out: return retval; } @@ -1216,7 +1268,8 @@ static int unmap_sdma_queues(struct device_queue_manager *dqm) { int i, retval = 0; - for (i = 0; i < dqm->dev->device_info->num_sdma_engines; i++) { + for (i = 0; i < dqm->dev->device_info->num_sdma_engines + + dqm->dev->device_info->num_xgmi_sdma_engines; i++) { retval = pm_send_unmap_queue(&dqm->packets, KFD_QUEUE_TYPE_SDMA, KFD_UNMAP_QUEUES_FILTER_DYNAMIC_QUEUES, 0, false, i); if (retval) @@ -1258,10 +1311,10 @@ static int unmap_queues_cpsch(struct device_queue_manager *dqm, if (!dqm->active_runlist) return retval; - pr_debug("Before destroying queues, sdma queue count is : %u\n", - dqm->sdma_queue_count); + pr_debug("Before destroying queues, sdma queue count is : %u, xgmi sdma queue count is : %u\n", + dqm->sdma_queue_count, dqm->xgmi_sdma_queue_count); - if 
(dqm->sdma_queue_count > 0) + if (dqm->sdma_queue_count > 0 || dqm->xgmi_sdma_queue_count) unmap_sdma_queues(dqm); retval = pm_send_unmap_queue(&dqm->packets, KFD_QUEUE_TYPE_COMPUTE, @@ -1333,7 +1386,10 @@ static int destroy_queue_cpsch(struct device_queue_manager *dqm, if (q->properties.type == KFD_QUEUE_TYPE_SDMA) { dqm->sdma_queue_count--; - deallocate_sdma_queue(dqm, q->sdma_id); + deallocate_sdma_queue(dqm, q); + } else if (q->properties.type == KFD_QUEUE_TYPE_SDMA_XGMI) { + dqm->xgmi_sdma_queue_count--; + deallocate_sdma_queue(dqm, q); } list_del(&q->list); @@ -1550,7 +1606,10 @@ static int process_termination_cpsch(struct device_queue_manager *dqm, list_for_each_entry(q, &qpd->queues_list, list) { if (q->properties.type == KFD_QUEUE_TYPE_SDMA) { dqm->sdma_queue_count--; - deallocate_sdma_queue(dqm, q->sdma_id); + deallocate_sdma_queue(dqm, q); + } else if (q->properties.type == KFD_QUEUE_TYPE_SDMA_XGMI) { + dqm->xgmi_sdma_queue_count--; + deallocate_sdma_queue(dqm, q); } if (q->properties.is_active) diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.h b/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.h index 3742fd340ec3..88b4c007696e 100644 --- a/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.h +++ b/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.h @@ -181,10 +181,12 @@ struct device_queue_manager { unsigned int processes_count; unsigned int queue_count; unsigned int sdma_queue_count; + unsigned int xgmi_sdma_queue_count; unsigned int total_queue_count; unsigned int next_pipe_to_allocate; unsigned int *allocated_queues; uint64_t sdma_bitmap; + uint64_t xgmi_sdma_bitmap; unsigned int vmid_bitmap; uint64_t pipelines_addr; struct kfd_mem_obj *pipeline_mem; @@ -216,6 +218,7 @@ unsigned int get_queues_num(struct device_queue_manager *dqm); unsigned int get_queues_per_pipe(struct device_queue_manager *dqm); unsigned int get_pipes_per_mec(struct device_queue_manager *dqm); unsigned int get_num_sdma_queues(struct device_queue_manager 
*dqm); +unsigned int get_num_xgmi_sdma_queues(struct device_queue_manager *dqm); static inline unsigned int get_sh_mem_bases_32(struct kfd_process_device *pdd) { diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_kernel_queue_v9.c b/drivers/gpu/drm/amd/amdkfd/kfd_kernel_queue_v9.c index 33830b1a5a54..604570bea6bd 100644 --- a/drivers/gpu/drm/amd/amdkfd/kfd_kernel_queue_v9.c +++ b/drivers/gpu/drm/amd/amdkfd/kfd_kernel_queue_v9.c @@ -175,6 +175,7 @@ static int pm_map_queues_v9(struct packet_manager *pm, uint32_t *buffer, queue_type__mes_map_queues__debug_interface_queue_vi; break; case KFD_QUEUE_TYPE_SDMA: + case KFD_QUEUE_TYPE_SDMA_XGMI: packet->bitfields2.engine_sel = q->properties.sdma_engine_id + engine_sel__mes_map_queues__sdma0_vi; use_static = false; /* no static queues under SDMA */ @@ -221,6 +222,7 @@ static int pm_unmap_queues_v9(struct packet_manager *pm, uint32_t *buffer, engine_sel__mes_unmap_queues__compute; break; case KFD_QUEUE_TYPE_SDMA: + case KFD_QUEUE_TYPE_SDMA_XGMI: packet->bitfields2.engine_sel = engine_sel__mes_unmap_queues__sdma0 + sdma_engine; break; diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_kernel_queue_vi.c b/drivers/gpu/drm/amd/amdkfd/kfd_kernel_queue_vi.c index bf20c6d32ef3..3cdb19826927 100644 --- a/drivers/gpu/drm/amd/amdkfd/kfd_kernel_queue_vi.c +++ b/drivers/gpu/drm/amd/amdkfd/kfd_kernel_queue_vi.c @@ -212,6 +212,7 @@ static int pm_map_queues_vi(struct packet_manager *pm, uint32_t *buffer, queue_type__mes_map_queues__debug_interface_queue_vi; break; case KFD_QUEUE_TYPE_SDMA: + case KFD_QUEUE_TYPE_SDMA_XGMI: packet->bitfields2.engine_sel = q->properties.sdma_engine_id + engine_sel__mes_map_queues__sdma0_vi; use_static = false; /* no static queues under SDMA */ @@ -258,6 +259,7 @@ static int pm_unmap_queues_vi(struct packet_manager *pm, uint32_t *buffer, engine_sel__mes_unmap_queues__compute; break; case KFD_QUEUE_TYPE_SDMA: + case KFD_QUEUE_TYPE_SDMA_XGMI: packet->bitfields2.engine_sel = engine_sel__mes_unmap_queues__sdma0 + sdma_engine; 
break; diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_packet_manager.c b/drivers/gpu/drm/amd/amdkfd/kfd_packet_manager.c index 045a229436a0..077c47fd4fee 100644 --- a/drivers/gpu/drm/amd/amdkfd/kfd_packet_manager.c +++ b/drivers/gpu/drm/amd/amdkfd/kfd_packet_manager.c @@ -48,7 +48,8 @@ static void pm_calc_rlib_size(struct packet_manager *pm, process_count = pm->dqm->processes_count; queue_count = pm->dqm->queue_count; - compute_queue_count = queue_count - pm->dqm->sdma_queue_count; + compute_queue_count = queue_count - pm->dqm->sdma_queue_count - + pm->dqm->xgmi_sdma_queue_count; /* check if there is over subscription * Note: the arbitration between the number of VMIDs and diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_priv.h b/drivers/gpu/drm/amd/amdkfd/kfd_priv.h index 10bd1abe1646..8f02d7817162 100644 --- a/drivers/gpu/drm/amd/amdkfd/kfd_priv.h +++ b/drivers/gpu/drm/amd/amdkfd/kfd_priv.h @@ -188,6 +188,7 @@ struct kfd_device_info { bool needs_iommu_device; bool needs_pci_atomics; unsigned int num_sdma_engines; + unsigned int num_xgmi_sdma_engines; unsigned int num_sdma_queues_per_engine; }; @@ -329,7 +330,8 @@ enum kfd_queue_type { KFD_QUEUE_TYPE_COMPUTE, KFD_QUEUE_TYPE_SDMA, KFD_QUEUE_TYPE_HIQ, - KFD_QUEUE_TYPE_DIQ + KFD_QUEUE_TYPE_DIQ, + KFD_QUEUE_TYPE_SDMA_XGMI }; enum kfd_queue_format { diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_process_queue_manager.c b/drivers/gpu/drm/amd/amdkfd/kfd_process_queue_manager.c index f18d9cdf9aac..e652e25ede75 100644 --- a/drivers/gpu/drm/amd/amdkfd/kfd_process_queue_manager.c +++ b/drivers/gpu/drm/amd/amdkfd/kfd_process_queue_manager.c @@ -186,8 +186,13 @@ int pqm_create_queue(struct process_queue_manager *pqm, switch (type) { case KFD_QUEUE_TYPE_SDMA: - if (dev->dqm->queue_count >= get_num_sdma_queues(dev->dqm)) { - pr_err("Over-subscription is not allowed for SDMA.\n"); + case KFD_QUEUE_TYPE_SDMA_XGMI: + if ((type == KFD_QUEUE_TYPE_SDMA && dev->dqm->sdma_queue_count + >= get_num_sdma_queues(dev->dqm)) || + (type == 
KFD_QUEUE_TYPE_SDMA_XGMI && + dev->dqm->xgmi_sdma_queue_count + >= get_num_xgmi_sdma_queues(dev->dqm))) { + pr_debug("Over-subscription is not allowed for SDMA.\n"); retval = -EPERM; goto err_create_queue; } @@ -446,6 +451,7 @@ int pqm_debugfs_mqds(struct seq_file *m, void *data) q = pqn->q; switch (q->properties.type) { case KFD_QUEUE_TYPE_SDMA: + case KFD_QUEUE_TYPE_SDMA_XGMI: seq_printf(m, " SDMA queue on device %x\n", q->device->id); mqd_type = KFD_MQD_TYPE_SDMA; diff --git a/include/uapi/linux/kfd_ioctl.h b/include/uapi/linux/kfd_ioctl.h index 1e7d5f3376b0..20917c59f39c 100644 --- a/include/uapi/linux/kfd_ioctl.h +++ b/include/uapi/linux/kfd_ioctl.h @@ -35,9 +35,10 @@ struct kfd_ioctl_get_version_args { }; /* For kfd_ioctl_create_queue_args.queue_type. */ -#define KFD_IOC_QUEUE_TYPE_COMPUTE 0 -#define KFD_IOC_QUEUE_TYPE_SDMA 1 -#define KFD_IOC_QUEUE_TYPE_COMPUTE_AQL 2 +#define KFD_IOC_QUEUE_TYPE_COMPUTE 0x0 +#define KFD_IOC_QUEUE_TYPE_SDMA 0x1 +#define KFD_IOC_QUEUE_TYPE_COMPUTE_AQL 0x2 +#define KFD_IOC_QUEUE_TYPE_SDMA_XGMI 0x3 #define KFD_MAX_QUEUE_PERCENTAGE 100 #define KFD_MAX_QUEUE_PRIORITY 15 -- cgit v1.2.3-71-gd317 From 8b401f9ed2441ad9e219953927a842d24ed051fc Mon Sep 17 00:00:00 2001 From: Yonghong Song Date: Thu, 23 May 2019 14:47:45 -0700 Subject: bpf: implement bpf_send_signal() helper This patch tries to solve the following specific use case. Currently, bpf program can already collect stack traces through kernel function get_perf_callchain() when certain events happens (e.g., cache miss counter or cpu clock counter overflows). But such stack traces are not enough for jitted programs, e.g., hhvm (jited php). To get real stack trace, jit engine internal data structures need to be traversed in order to get the real user functions. bpf program itself may not be the best place to traverse the jit engine as the traversing logic could be complex and it is not a stable interface either. Instead, hhvm implements a signal handler, e.g. 
for SIGALRM, and a set of program locations at which it can dump stack traces. When it receives a signal, it will dump the stack at the next such program location. Such a mechanism can be implemented in the following way: . a perf ring buffer is created between bpf program and tracing app. . once a particular event happens, bpf program writes to the ring buffer and the tracing app gets notified. . the tracing app sends a signal SIGALRM to the hhvm. But this method could have large delays, causing skewed profiling results. This patch implements bpf_send_signal() helper to send a signal to hhvm in real time, resulting in intended stack traces. Acked-by: Andrii Nakryiko Signed-off-by: Yonghong Song Signed-off-by: Daniel Borkmann --- include/uapi/linux/bpf.h | 17 +++++++++++- kernel/trace/bpf_trace.c | 72 ++++++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 88 insertions(+), 1 deletion(-) (limited to 'include/uapi/linux') diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index 63e0cf66f01a..68d4470523a0 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -2672,6 +2672,20 @@ union bpf_attr { * 0 on success. * * **-ENOENT** if the bpf-local-storage cannot be found. + * + * int bpf_send_signal(u32 sig) + * Description + * Send signal *sig* to the current task. + * Return + * 0 on success or successfully queued. + * + * **-EBUSY** if work queue under nmi is full. + * + * **-EINVAL** if *sig* is invalid. + * + * **-EPERM** if no permission to send the *sig*. + * + * **-EAGAIN** if bpf program can try again.
*/ #define __BPF_FUNC_MAPPER(FN) \ FN(unspec), \ @@ -2782,7 +2796,8 @@ union bpf_attr { FN(strtol), \ FN(strtoul), \ FN(sk_storage_get), \ - FN(sk_storage_delete), + FN(sk_storage_delete), \ + FN(send_signal), /* integer value in 'imm' field of BPF_CALL instruction selects which helper * function eBPF program intends to call diff --git a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c index f92d6ad5e080..70029eafc71f 100644 --- a/kernel/trace/bpf_trace.c +++ b/kernel/trace/bpf_trace.c @@ -567,6 +567,63 @@ static const struct bpf_func_proto bpf_probe_read_str_proto = { .arg3_type = ARG_ANYTHING, }; +struct send_signal_irq_work { + struct irq_work irq_work; + struct task_struct *task; + u32 sig; +}; + +static DEFINE_PER_CPU(struct send_signal_irq_work, send_signal_work); + +static void do_bpf_send_signal(struct irq_work *entry) +{ + struct send_signal_irq_work *work; + + work = container_of(entry, struct send_signal_irq_work, irq_work); + group_send_sig_info(work->sig, SEND_SIG_PRIV, work->task, PIDTYPE_TGID); +} + +BPF_CALL_1(bpf_send_signal, u32, sig) +{ + struct send_signal_irq_work *work = NULL; + + /* Similar to bpf_probe_write_user, task needs to be + * in a sound condition and kernel memory access be + * permitted in order to send signal to the current + * task. + */ + if (unlikely(current->flags & (PF_KTHREAD | PF_EXITING))) + return -EPERM; + if (unlikely(uaccess_kernel())) + return -EPERM; + if (unlikely(!nmi_uaccess_okay())) + return -EPERM; + + if (in_nmi()) { + work = this_cpu_ptr(&send_signal_work); + if (work->irq_work.flags & IRQ_WORK_BUSY) + return -EBUSY; + + /* Add the current task, which is the target of sending signal, + * to the irq_work. The current task may change when queued + * irq works get executed. 
+ */ + work->task = current; + work->sig = sig; + irq_work_queue(&work->irq_work); + return 0; + } + + return group_send_sig_info(sig, SEND_SIG_PRIV, current, PIDTYPE_TGID); +} + +static const struct bpf_func_proto bpf_send_signal_proto = { + .func = bpf_send_signal, + .gpl_only = false, + .ret_type = RET_INTEGER, + .arg1_type = ARG_ANYTHING, +}; + static const struct bpf_func_proto * tracing_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) { @@ -617,6 +674,8 @@ tracing_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) case BPF_FUNC_get_current_cgroup_id: return &bpf_get_current_cgroup_id_proto; #endif + case BPF_FUNC_send_signal: + return &bpf_send_signal_proto; default: return NULL; } @@ -1343,5 +1402,18 @@ static int __init bpf_event_init(void) return 0; } +static int __init send_signal_irq_work_init(void) +{ + int cpu; + struct send_signal_irq_work *work; + + for_each_possible_cpu(cpu) { + work = per_cpu_ptr(&send_signal_work, cpu); + init_irq_work(&work->irq_work, do_bpf_send_signal); + } + return 0; +} + fs_initcall(bpf_event_init); +subsys_initcall(send_signal_irq_work_init); #endif /* CONFIG_MODULES */ -- cgit v1.2.3-71-gd317 From c240eff63a1cf1c4edc768e0cfc374811c02f069 Mon Sep 17 00:00:00 2001 From: Jiong Wang Date: Fri, 24 May 2019 23:25:16 +0100 Subject: bpf: introduce new bpf prog load flags "BPF_F_TEST_RND_HI32" x86_64 and AArch64 are perhaps two arches that run the bpf testsuite frequently; however, the zero extension insertion pass is not enabled for them because of their hardware support. It is critical to guarantee the correctness of the pass, as it is supposed to be enabled by default for a couple of other arches, for example PowerPC, SPARC, arm, NFP etc. Therefore, it would be very useful if there were a way to test this pass on, for example, x86_64. The test methodology employed by this set is "poisoning" useless bits. The high 32 bits of a definition are randomized if they are identified as not used by any later insn.
Such randomization is only enabled under testing mode which is gated by the new bpf prog load flag "BPF_F_TEST_RND_HI32". Suggested-by: Alexei Starovoitov Signed-off-by: Jiong Wang Signed-off-by: Alexei Starovoitov --- include/uapi/linux/bpf.h | 18 ++++++++++++++++++ kernel/bpf/syscall.c | 4 +++- 2 files changed, 21 insertions(+), 1 deletion(-) (limited to 'include/uapi/linux') diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index 68d4470523a0..7c6aef253173 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -260,6 +260,24 @@ enum bpf_attach_type { */ #define BPF_F_ANY_ALIGNMENT (1U << 1) +/* BPF_F_TEST_RND_HI32 is used in BPF_PROG_LOAD command for testing purposes. + * Verifier does sub-register def/use analysis and identifies instructions whose + * def only matters for low 32-bit, high 32-bit is never referenced later + * through implicit zero extension. Therefore verifier notifies JIT back-ends + * that it is safe to ignore clearing high 32-bit for these instructions. This + * saves some back-ends a lot of code-gen. However such optimization is not + * necessary on some arches, for example x86_64, arm64 etc, whose JIT back-ends + * hence haven't used verifier's analysis result. But, we really want to have a + * way to be able to verify the correctness of the described optimization on + * x86_64 on which testsuites are frequently exercised. + * + * So, this flag is introduced. Once it is set, verifier will randomize high + * 32-bit for those instructions which have been identified as safe to ignore. + * Then, if verifier is not doing correct analysis, such randomization will + * regress tests to expose bugs.
+ */ +#define BPF_F_TEST_RND_HI32 (1U << 2) + /* When BPF ldimm64's insn[0].src_reg != 0 then this can have * two extensions: * diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c index cb5440b02e82..3d546b6f4646 100644 --- a/kernel/bpf/syscall.c +++ b/kernel/bpf/syscall.c @@ -1604,7 +1604,9 @@ static int bpf_prog_load(union bpf_attr *attr, union bpf_attr __user *uattr) if (CHECK_ATTR(BPF_PROG_LOAD)) return -EINVAL; - if (attr->prog_flags & ~(BPF_F_STRICT_ALIGNMENT | BPF_F_ANY_ALIGNMENT)) + if (attr->prog_flags & ~(BPF_F_STRICT_ALIGNMENT | + BPF_F_ANY_ALIGNMENT | + BPF_F_TEST_RND_HI32)) return -EINVAL; if (!IS_ENABLED(CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS) && -- cgit v1.2.3-71-gd317 From ea8157ab2ae5e914dd427e5cfab533b6da3819cd Mon Sep 17 00:00:00 2001 From: David Howells Date: Tue, 21 May 2019 07:55:45 +0100 Subject: zsfold: Convert zsfold to use the new mount API Convert the zsfold filesystem to the new internal mount API as the old one will be obsoleted and removed. This allows greater flexibility in communication of mount parameters between userspace, the VFS and the filesystem. See Documentation/filesystems/mount_api.txt for more information. 
Signed-off-by: David Howells --- include/uapi/linux/magic.h | 1 + mm/z3fold.c | 10 +++++----- 2 files changed, 6 insertions(+), 5 deletions(-) (limited to 'include/uapi/linux') diff --git a/include/uapi/linux/magic.h b/include/uapi/linux/magic.h index f8c00045d537..85c1119d0b0b 100644 --- a/include/uapi/linux/magic.h +++ b/include/uapi/linux/magic.h @@ -91,5 +91,6 @@ #define UDF_SUPER_MAGIC 0x15013346 #define BALLOON_KVM_MAGIC 0x13661366 #define ZSMALLOC_MAGIC 0x58295829 +#define Z3FOLD_MAGIC 0x33 #endif /* __LINUX_MAGIC_H__ */ diff --git a/mm/z3fold.c b/mm/z3fold.c index abeb5bcbea57..a43e8bfcaaea 100644 --- a/mm/z3fold.c +++ b/mm/z3fold.c @@ -25,7 +25,6 @@ #include #include #include -#include #include #include #include @@ -35,12 +34,14 @@ #include #include #include +#include #include #include #include #include #include #include +#include /* * NCHUNKS_ORDER determines the internal allocation granularity, effectively @@ -239,15 +240,14 @@ static inline void free_handle(unsigned long handle) } } -static struct dentry *z3fold_do_mount(struct file_system_type *fs_type, - int flags, const char *dev_name, void *data) +static int z3fold_init_fs_context(struct fs_context *fc) { - return mount_pseudo(fs_type, NULL, NULL, 0x33); + return init_pseudo(fc, Z3FOLD_MAGIC) ? 0 : -ENOMEM; } static struct file_system_type z3fold_fs = { .name = "z3fold", - .mount = z3fold_do_mount, + .init_fs_context = z3fold_init_fs_context, .kill_sb = kill_anon_super, }; -- cgit v1.2.3-71-gd317 From 4914425e28fb90c39fa986016373845de5453e97 Mon Sep 17 00:00:00 2001 From: Hans Verkuil Date: Wed, 24 Apr 2019 05:37:49 -0400 Subject: media: coda/venus/s5p_mfc: fix control typo These two slice modes used by the V4L2_CID_MPEG_VIDEO_MULTI_SLICE_MODE control had a silly typo: V4L2_MPEG_VIDEO_MULTI_SICE_MODE_MAX_MB V4L2_MPEG_VIDEO_MULTI_SICE_MODE_MAX_BYTES SICE should be SLICE. Rename these enum values, keeping the old ones (under #ifndef __KERNEL__) for backwards compatibility reasons. 
Signed-off-by: Hans Verkuil Signed-off-by: Mauro Carvalho Chehab --- drivers/media/platform/coda/coda-bit.c | 4 ++-- drivers/media/platform/coda/coda-common.c | 2 +- drivers/media/platform/qcom/venus/venc_ctrls.c | 2 +- drivers/media/platform/s5p-mfc/s5p_mfc_enc.c | 2 +- drivers/media/platform/s5p-mfc/s5p_mfc_opr_v5.c | 4 ++-- drivers/media/platform/s5p-mfc/s5p_mfc_opr_v6.c | 8 ++++---- include/uapi/linux/v4l2-controls.h | 5 +++++ 7 files changed, 16 insertions(+), 11 deletions(-) (limited to 'include/uapi/linux') diff --git a/drivers/media/platform/coda/coda-bit.c b/drivers/media/platform/coda/coda-bit.c index d774a5aaa422..a25f3742ecde 100644 --- a/drivers/media/platform/coda/coda-bit.c +++ b/drivers/media/platform/coda/coda-bit.c @@ -1043,7 +1043,7 @@ static int coda_start_encoding(struct coda_ctx *ctx) case V4L2_MPEG_VIDEO_MULTI_SLICE_MODE_SINGLE: value = 0; break; - case V4L2_MPEG_VIDEO_MULTI_SICE_MODE_MAX_MB: + case V4L2_MPEG_VIDEO_MULTI_SLICE_MODE_MAX_MB: value = (ctx->params.slice_max_mb & CODA_SLICING_SIZE_MASK) << CODA_SLICING_SIZE_OFFSET; @@ -1051,7 +1051,7 @@ static int coda_start_encoding(struct coda_ctx *ctx) << CODA_SLICING_UNIT_OFFSET; value |= 1 & CODA_SLICING_MODE_MASK; break; - case V4L2_MPEG_VIDEO_MULTI_SICE_MODE_MAX_BYTES: + case V4L2_MPEG_VIDEO_MULTI_SLICE_MODE_MAX_BYTES: value = (ctx->params.slice_max_bits & CODA_SLICING_SIZE_MASK) << CODA_SLICING_SIZE_OFFSET; diff --git a/drivers/media/platform/coda/coda-common.c b/drivers/media/platform/coda/coda-common.c index 1856b782fdde..614943e8a7a2 100644 --- a/drivers/media/platform/coda/coda-common.c +++ b/drivers/media/platform/coda/coda-common.c @@ -2061,7 +2061,7 @@ static void coda_encode_ctrls(struct coda_ctx *ctx) } v4l2_ctrl_new_std_menu(&ctx->ctrls, &coda_ctrl_ops, V4L2_CID_MPEG_VIDEO_MULTI_SLICE_MODE, - V4L2_MPEG_VIDEO_MULTI_SICE_MODE_MAX_BYTES, 0x0, + V4L2_MPEG_VIDEO_MULTI_SLICE_MODE_MAX_BYTES, 0x0, V4L2_MPEG_VIDEO_MULTI_SLICE_MODE_SINGLE); v4l2_ctrl_new_std(&ctx->ctrls, &coda_ctrl_ops, 
V4L2_CID_MPEG_VIDEO_MULTI_SLICE_MAX_MB, 1, 0x3fffffff, 1, 1); diff --git a/drivers/media/platform/qcom/venus/venc_ctrls.c b/drivers/media/platform/qcom/venus/venc_ctrls.c index bd4538accf13..7b7186ef6dd2 100644 --- a/drivers/media/platform/qcom/venus/venc_ctrls.c +++ b/drivers/media/platform/qcom/venus/venc_ctrls.c @@ -293,7 +293,7 @@ int venc_ctrl_init(struct venus_inst *inst) v4l2_ctrl_new_std_menu(&inst->ctrl_handler, &venc_ctrl_ops, V4L2_CID_MPEG_VIDEO_MULTI_SLICE_MODE, - V4L2_MPEG_VIDEO_MULTI_SICE_MODE_MAX_BYTES, + V4L2_MPEG_VIDEO_MULTI_SLICE_MODE_MAX_BYTES, 0, V4L2_MPEG_VIDEO_MULTI_SLICE_MODE_SINGLE); v4l2_ctrl_new_std_menu(&inst->ctrl_handler, &venc_ctrl_ops, diff --git a/drivers/media/platform/s5p-mfc/s5p_mfc_enc.c b/drivers/media/platform/s5p-mfc/s5p_mfc_enc.c index 8fcf627dedfb..5505e4fc2090 100644 --- a/drivers/media/platform/s5p-mfc/s5p_mfc_enc.c +++ b/drivers/media/platform/s5p-mfc/s5p_mfc_enc.c @@ -134,7 +134,7 @@ static struct mfc_control controls[] = { .id = V4L2_CID_MPEG_VIDEO_MULTI_SLICE_MODE, .type = V4L2_CTRL_TYPE_MENU, .minimum = V4L2_MPEG_VIDEO_MULTI_SLICE_MODE_SINGLE, - .maximum = V4L2_MPEG_VIDEO_MULTI_SICE_MODE_MAX_BYTES, + .maximum = V4L2_MPEG_VIDEO_MULTI_SLICE_MODE_MAX_BYTES, .default_value = V4L2_MPEG_VIDEO_MULTI_SLICE_MODE_SINGLE, .menu_skip_mask = 0, }, diff --git a/drivers/media/platform/s5p-mfc/s5p_mfc_opr_v5.c b/drivers/media/platform/s5p-mfc/s5p_mfc_opr_v5.c index 6144e95f6425..e83ede3efca7 100644 --- a/drivers/media/platform/s5p-mfc/s5p_mfc_opr_v5.c +++ b/drivers/media/platform/s5p-mfc/s5p_mfc_opr_v5.c @@ -695,9 +695,9 @@ static int s5p_mfc_set_enc_params(struct s5p_mfc_ctx *ctx) /* multi-slice control */ /* multi-slice MB number or bit size */ mfc_write(dev, p->slice_mode, S5P_FIMV_ENC_MSLICE_CTRL); - if (p->slice_mode == V4L2_MPEG_VIDEO_MULTI_SICE_MODE_MAX_MB) { + if (p->slice_mode == V4L2_MPEG_VIDEO_MULTI_SLICE_MODE_MAX_MB) { mfc_write(dev, p->slice_mb, S5P_FIMV_ENC_MSLICE_MB); - } else if (p->slice_mode == 
V4L2_MPEG_VIDEO_MULTI_SICE_MODE_MAX_BYTES) { + } else if (p->slice_mode == V4L2_MPEG_VIDEO_MULTI_SLICE_MODE_MAX_BYTES) { mfc_write(dev, p->slice_bit, S5P_FIMV_ENC_MSLICE_BIT); } else { mfc_write(dev, 0, S5P_FIMV_ENC_MSLICE_MB); diff --git a/drivers/media/platform/s5p-mfc/s5p_mfc_opr_v6.c b/drivers/media/platform/s5p-mfc/s5p_mfc_opr_v6.c index 281699ab7fe1..d75511190e47 100644 --- a/drivers/media/platform/s5p-mfc/s5p_mfc_opr_v6.c +++ b/drivers/media/platform/s5p-mfc/s5p_mfc_opr_v6.c @@ -736,10 +736,10 @@ static int s5p_mfc_set_slice_mode(struct s5p_mfc_ctx *ctx) /* multi-slice control */ /* multi-slice MB number or bit size */ writel(ctx->slice_mode, mfc_regs->e_mslice_mode); - if (ctx->slice_mode == V4L2_MPEG_VIDEO_MULTI_SICE_MODE_MAX_MB) { + if (ctx->slice_mode == V4L2_MPEG_VIDEO_MULTI_SLICE_MODE_MAX_MB) { writel(ctx->slice_size.mb, mfc_regs->e_mslice_size_mb); } else if (ctx->slice_mode == - V4L2_MPEG_VIDEO_MULTI_SICE_MODE_MAX_BYTES) { + V4L2_MPEG_VIDEO_MULTI_SLICE_MODE_MAX_BYTES) { writel(ctx->slice_size.bits, mfc_regs->e_mslice_size_bits); } else { writel(0x0, mfc_regs->e_mslice_size_mb); @@ -779,11 +779,11 @@ static int s5p_mfc_set_enc_params(struct s5p_mfc_ctx *ctx) /* multi-slice MB number or bit size */ ctx->slice_mode = p->slice_mode; reg = 0; - if (p->slice_mode == V4L2_MPEG_VIDEO_MULTI_SICE_MODE_MAX_MB) { + if (p->slice_mode == V4L2_MPEG_VIDEO_MULTI_SLICE_MODE_MAX_MB) { reg |= (0x1 << 3); writel(reg, mfc_regs->e_enc_options); ctx->slice_size.mb = p->slice_mb; - } else if (p->slice_mode == V4L2_MPEG_VIDEO_MULTI_SICE_MODE_MAX_BYTES) { + } else if (p->slice_mode == V4L2_MPEG_VIDEO_MULTI_SLICE_MODE_MAX_BYTES) { reg |= (0x1 << 3); writel(reg, mfc_regs->e_enc_options); ctx->slice_size.bits = p->slice_bit; diff --git a/include/uapi/linux/v4l2-controls.h b/include/uapi/linux/v4l2-controls.h index 37807f23231e..9cad9fd969e3 100644 --- a/include/uapi/linux/v4l2-controls.h +++ b/include/uapi/linux/v4l2-controls.h @@ -392,8 +392,13 @@ enum 
v4l2_mpeg_video_header_mode { #define V4L2_CID_MPEG_VIDEO_MULTI_SLICE_MODE (V4L2_CID_MPEG_BASE+221) enum v4l2_mpeg_video_multi_slice_mode { V4L2_MPEG_VIDEO_MULTI_SLICE_MODE_SINGLE = 0, + V4L2_MPEG_VIDEO_MULTI_SLICE_MODE_MAX_MB = 1, + V4L2_MPEG_VIDEO_MULTI_SLICE_MODE_MAX_BYTES = 2, +#ifndef __KERNEL__ + /* Kept for backwards compatibility reasons. Stupid typo... */ V4L2_MPEG_VIDEO_MULTI_SICE_MODE_MAX_MB = 1, V4L2_MPEG_VIDEO_MULTI_SICE_MODE_MAX_BYTES = 2, +#endif }; #define V4L2_CID_MPEG_VIDEO_VBV_SIZE (V4L2_CID_MPEG_BASE+222) #define V4L2_CID_MPEG_VIDEO_DEC_PTS (V4L2_CID_MPEG_BASE+223) -- cgit v1.2.3-71-gd317 From 1a058c3376765ee31d65e28cbbb9d4ff15120056 Mon Sep 17 00:00:00 2001 From: Oak Zeng Date: Mon, 6 May 2019 22:11:14 -0500 Subject: drm/amdkfd: New IOCTL to allocate queue GWS Add a new kfd ioctl to allocate queue GWS. Queue GWS is released on queue destroy. Signed-off-by: Oak Zeng Reviewed-by: Felix Kuehling Signed-off-by: Alex Deucher --- drivers/gpu/drm/amd/amdkfd/kfd_chardev.c | 27 +++++++++++++++++++++++++++ include/uapi/linux/kfd_ioctl.h | 20 +++++++++++++++++++- 2 files changed, 46 insertions(+), 1 deletion(-) (limited to 'include/uapi/linux') diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c b/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c index c92e931ceb27..aab2aa6c1dee 100644 --- a/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c +++ b/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c @@ -1567,6 +1567,31 @@ copy_from_user_failed: return err; } +static int kfd_ioctl_alloc_queue_gws(struct file *filep, + struct kfd_process *p, void *data) +{ + int retval; + struct kfd_ioctl_alloc_queue_gws_args *args = data; + struct kfd_dev *dev = NULL; + + if (!hws_gws_support || + dev->dqm->sched_policy == KFD_SCHED_POLICY_NO_HWS) + return -EINVAL; + + dev = kfd_device_by_id(args->gpu_id); + if (!dev) { + pr_debug("Could not find gpu id 0x%x\n", args->gpu_id); + return -EINVAL; + } + + mutex_lock(&p->mutex); + retval = pqm_set_gws(&p->pqm, args->queue_id, args->num_gws ? 
dev->gws : NULL); + mutex_unlock(&p->mutex); + + args->first_gws = 0; + return retval; +} + static int kfd_ioctl_get_dmabuf_info(struct file *filep, struct kfd_process *p, void *data) { @@ -1769,6 +1794,8 @@ static const struct amdkfd_ioctl_desc amdkfd_ioctls[] = { AMDKFD_IOCTL_DEF(AMDKFD_IOC_IMPORT_DMABUF, kfd_ioctl_import_dmabuf, 0), + AMDKFD_IOCTL_DEF(AMDKFD_IOC_ALLOC_QUEUE_GWS, + kfd_ioctl_alloc_queue_gws, 0), }; #define AMDKFD_CORE_IOCTL_COUNT ARRAY_SIZE(amdkfd_ioctls) diff --git a/include/uapi/linux/kfd_ioctl.h b/include/uapi/linux/kfd_ioctl.h index 20917c59f39c..070d1bc7e725 100644 --- a/include/uapi/linux/kfd_ioctl.h +++ b/include/uapi/linux/kfd_ioctl.h @@ -410,6 +410,21 @@ struct kfd_ioctl_unmap_memory_from_gpu_args { __u32 n_success; /* to/from KFD */ }; +/* Allocate GWS for specific queue + * + * @gpu_id: device identifier + * @queue_id: queue's id that GWS is allocated for + * @num_gws: how many GWS to allocate + * @first_gws: index of the first GWS allocated. + * only support contiguous GWS allocation + */ +struct kfd_ioctl_alloc_queue_gws_args { + __u32 gpu_id; /* to KFD */ + __u32 queue_id; /* to KFD */ + __u32 num_gws; /* to KFD */ + __u32 first_gws; /* from KFD */ +}; + struct kfd_ioctl_get_dmabuf_info_args { __u64 size; /* from KFD */ __u64 metadata_ptr; /* to KFD */ @@ -529,7 +544,10 @@ enum kfd_mmio_remap { #define AMDKFD_IOC_IMPORT_DMABUF \ AMDKFD_IOWR(0x1D, struct kfd_ioctl_import_dmabuf_args) +#define AMDKFD_IOC_ALLOC_QUEUE_GWS \ + AMDKFD_IOWR(0x1E, struct kfd_ioctl_alloc_queue_gws_args) + #define AMDKFD_COMMAND_START 0x01 -#define AMDKFD_COMMAND_END 0x1E +#define AMDKFD_COMMAND_END 0x1F #endif -- cgit v1.2.3-71-gd317 From 65ee00a9409f751188a8cdc0988167858eb4a536 Mon Sep 17 00:00:00 2001 From: David Ahern Date: Fri, 24 May 2019 14:43:03 -0700 Subject: net: nexthop uapi New UAPI for nexthops as standalone objects: - defines netlink ancillary header, struct nhmsg - RTM commands for nexthop objects, RTM_*NEXTHOP, - RTNLGRP for nexthop 
notifications, RTNLGRP_NEXTHOP, - Attributes for creating nexthops, NHA_* - Attribute for route specs to specify a nexthop by id, RTA_NH_ID. The nexthop attributes and semantics follow the route and RTA ones for device, gateway and lwt encap. Unique to nexthop objects are a blackhole and a group which contains references to other nexthop objects. With the exception of blackhole and group, nexthop objects MUST contain a device. Gateway and encap are optional. Nexthop groups can only reference other pre-existing nexthops by id. If the NHA_ID attribute is present that id is used for the nexthop. If not specified, one is auto assigned. Dump requests can include attributes: - NHA_GROUPS to return only nexthop groups, - NHA_MASTER to limit dumps to nexthops with devices enslaved to the given master (e.g., VRF) - NHA_OIF to limit dumps to nexthops using given device nlmsg_route_perms in selinux code is updated for the new RTM commands. Signed-off-by: David Ahern Signed-off-by: David S. Miller --- include/uapi/linux/nexthop.h | 56 ++++++++++++++++++++++++++++++++++++++++++ include/uapi/linux/rtnetlink.h | 10 ++++++++ security/selinux/nlmsgtab.c | 5 +++- 3 files changed, 70 insertions(+), 1 deletion(-) create mode 100644 include/uapi/linux/nexthop.h (limited to 'include/uapi/linux') diff --git a/include/uapi/linux/nexthop.h b/include/uapi/linux/nexthop.h new file mode 100644 index 000000000000..7b61867e9848 --- /dev/null +++ b/include/uapi/linux/nexthop.h @@ -0,0 +1,56 @@ +/* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note */ +#ifndef _UAPI_LINUX_NEXTHOP_H +#define _UAPI_LINUX_NEXTHOP_H + +#include + +struct nhmsg { + unsigned char nh_family; + unsigned char nh_scope; /* return only */ + unsigned char nh_protocol; /* Routing protocol that installed nh */ + unsigned char resvd; + unsigned int nh_flags; /* RTNH_F flags */ +}; + +/* entry in a nexthop group */ +struct nexthop_grp { + __u32 id; /* nexthop id - must exist */ + __u8 weight; /* weight of this nexthop */ + 
__u8 resvd1; + __u16 resvd2; +}; + +enum { + NEXTHOP_GRP_TYPE_MPATH, /* default type if not specified */ + __NEXTHOP_GRP_TYPE_MAX, +}; + +#define NEXTHOP_GRP_TYPE_MAX (__NEXTHOP_GRP_TYPE_MAX - 1) + +enum { + NHA_UNSPEC, + NHA_ID, /* u32; id for nexthop. id == 0 means auto-assign */ + + NHA_GROUP, /* array of nexthop_grp */ + NHA_GROUP_TYPE, /* u16 one of NEXTHOP_GRP_TYPE */ + /* if NHA_GROUP attribute is added, no other attributes can be set */ + + NHA_BLACKHOLE, /* flag; nexthop used to blackhole packets */ + /* if NHA_BLACKHOLE is added, OIF, GATEWAY, ENCAP can not be set */ + + NHA_OIF, /* u32; nexthop device */ + NHA_GATEWAY, /* be32 (IPv4) or in6_addr (IPv6) gw address */ + NHA_ENCAP_TYPE, /* u16; lwt encap type */ + NHA_ENCAP, /* lwt encap data */ + + /* NHA_OIF can be appended to dump request to return only + * nexthops using given device + */ + NHA_GROUPS, /* flag; only return nexthop groups in dump */ + NHA_MASTER, /* u32; only return nexthops with given master dev */ + + __NHA_MAX, +}; + +#define NHA_MAX (__NHA_MAX - 1) +#endif diff --git a/include/uapi/linux/rtnetlink.h b/include/uapi/linux/rtnetlink.h index 46399367627f..ce2a623abb75 100644 --- a/include/uapi/linux/rtnetlink.h +++ b/include/uapi/linux/rtnetlink.h @@ -157,6 +157,13 @@ enum { RTM_GETCHAIN, #define RTM_GETCHAIN RTM_GETCHAIN + RTM_NEWNEXTHOP = 104, +#define RTM_NEWNEXTHOP RTM_NEWNEXTHOP + RTM_DELNEXTHOP, +#define RTM_DELNEXTHOP RTM_DELNEXTHOP + RTM_GETNEXTHOP, +#define RTM_GETNEXTHOP RTM_GETNEXTHOP + __RTM_MAX, #define RTM_MAX (((__RTM_MAX + 3) & ~3) - 1) }; @@ -342,6 +349,7 @@ enum rtattr_type_t { RTA_IP_PROTO, RTA_SPORT, RTA_DPORT, + RTA_NH_ID, __RTA_MAX }; @@ -704,6 +712,8 @@ enum rtnetlink_groups { #define RTNLGRP_IPV4_MROUTE_R RTNLGRP_IPV4_MROUTE_R RTNLGRP_IPV6_MROUTE_R, #define RTNLGRP_IPV6_MROUTE_R RTNLGRP_IPV6_MROUTE_R + RTNLGRP_NEXTHOP, +#define RTNLGRP_NEXTHOP RTNLGRP_NEXTHOP __RTNLGRP_MAX }; #define RTNLGRP_MAX (__RTNLGRP_MAX - 1) diff --git a/security/selinux/nlmsgtab.c 
b/security/selinux/nlmsgtab.c index 9cec81209617..2c75d823d8e2 100644 --- a/security/selinux/nlmsgtab.c +++ b/security/selinux/nlmsgtab.c @@ -83,6 +83,9 @@ static const struct nlmsg_perm nlmsg_route_perms[] = { RTM_NEWCHAIN, NETLINK_ROUTE_SOCKET__NLMSG_WRITE }, { RTM_DELCHAIN, NETLINK_ROUTE_SOCKET__NLMSG_WRITE }, { RTM_GETCHAIN, NETLINK_ROUTE_SOCKET__NLMSG_READ }, + { RTM_NEWNEXTHOP, NETLINK_ROUTE_SOCKET__NLMSG_WRITE }, + { RTM_DELNEXTHOP, NETLINK_ROUTE_SOCKET__NLMSG_WRITE }, + { RTM_GETNEXTHOP, NETLINK_ROUTE_SOCKET__NLMSG_READ }, }; static const struct nlmsg_perm nlmsg_tcpdiag_perms[] = @@ -166,7 +169,7 @@ int selinux_nlmsg_lookup(u16 sclass, u16 nlmsg_type, u32 *perm) * structures at the top of this file with the new mappings * before updating the BUILD_BUG_ON() macro! */ - BUILD_BUG_ON(RTM_MAX != (RTM_NEWCHAIN + 3)); + BUILD_BUG_ON(RTM_MAX != (RTM_NEWNEXTHOP + 3)); err = nlmsg_perm(nlmsg_type, perm, nlmsg_route_perms, sizeof(nlmsg_route_perms)); break; -- cgit v1.2.3-71-gd317 From 5902bca94ae05316ec7feab9b84cb07ffa5c1175 Mon Sep 17 00:00:00 2001 From: Philipp Zabel Date: Wed, 24 Apr 2019 06:43:47 -0400 Subject: media: v4l2-ctrl: add MPEG-2 profile and level controls Add MPEG-2 CID definitions for profiles and levels defined in ITU-T Rec. H.262. Signed-off-by: Philipp Zabel Signed-off-by: Hans Verkuil Signed-off-by: Mauro Carvalho Chehab --- Documentation/media/uapi/v4l/ext-ctrls-codec.rst | 56 ++++++++++++++++++++++++ drivers/media/v4l2-core/v4l2-ctrls.c | 23 ++++++++++ include/uapi/linux/v4l2-controls.h | 18 ++++++++ 3 files changed, 97 insertions(+) (limited to 'include/uapi/linux') diff --git a/Documentation/media/uapi/v4l/ext-ctrls-codec.rst b/Documentation/media/uapi/v4l/ext-ctrls-codec.rst index 4a8446203085..843c93e8e7bc 100644 --- a/Documentation/media/uapi/v4l/ext-ctrls-codec.rst +++ b/Documentation/media/uapi/v4l/ext-ctrls-codec.rst @@ -759,6 +759,32 @@ enum v4l2_mpeg_video_h264_level - +.. 
_v4l2-mpeg-video-mpeg2-level: + +``V4L2_CID_MPEG_VIDEO_MPEG2_LEVEL`` + (enum) + +enum v4l2_mpeg_video_mpeg2_level - + The level information for the MPEG2 elementary stream. Applicable to + MPEG2 codecs. Possible values are: + + + +.. flat-table:: + :header-rows: 0 + :stub-columns: 0 + + * - ``V4L2_MPEG_VIDEO_MPEG2_LEVEL_LOW`` + - Low Level (LL) + * - ``V4L2_MPEG_VIDEO_MPEG2_LEVEL_MAIN`` + - Main Level (ML) + * - ``V4L2_MPEG_VIDEO_MPEG2_LEVEL_HIGH_1440`` + - High-1440 Level (H-14) + * - ``V4L2_MPEG_VIDEO_MPEG2_LEVEL_HIGH`` + - High Level (HL) + + + .. _v4l2-mpeg-video-mpeg4-level: ``V4L2_CID_MPEG_VIDEO_MPEG4_LEVEL`` @@ -845,6 +871,36 @@ enum v4l2_mpeg_video_h264_profile - +.. _v4l2-mpeg-video-mpeg2-profile: + +``V4L2_CID_MPEG_VIDEO_MPEG2_PROFILE`` + (enum) + +enum v4l2_mpeg_video_mpeg2_profile - + The profile information for MPEG2. Applicable to MPEG2 codecs. + Possible values are: + + + +.. flat-table:: + :header-rows: 0 + :stub-columns: 0 + + * - ``V4L2_MPEG_VIDEO_MPEG2_PROFILE_SIMPLE`` + - Simple profile (SP) + * - ``V4L2_MPEG_VIDEO_MPEG2_PROFILE_MAIN`` + - Main profile (MP) + * - ``V4L2_MPEG_VIDEO_MPEG2_PROFILE_SNR_SCALABLE`` + - SNR Scalable profile (SNR) + * - ``V4L2_MPEG_VIDEO_MPEG2_PROFILE_SPATIALLY_SCALABLE`` + - Spatially Scalable profile (Spt) + * - ``V4L2_MPEG_VIDEO_MPEG2_PROFILE_HIGH`` + - High profile (HP) + * - ``V4L2_MPEG_VIDEO_MPEG2_PROFILE_MULTIVIEW`` + - Multi-view profile (MVP) + + + .. 
_v4l2-mpeg-video-mpeg4-profile: ``V4L2_CID_MPEG_VIDEO_MPEG4_PROFILE`` diff --git a/drivers/media/v4l2-core/v4l2-ctrls.c b/drivers/media/v4l2-core/v4l2-ctrls.c index 2ffffd923265..38e80fb36d1a 100644 --- a/drivers/media/v4l2-core/v4l2-ctrls.c +++ b/drivers/media/v4l2-core/v4l2-ctrls.c @@ -406,6 +406,21 @@ const char * const *v4l2_ctrl_get_menu(u32 id) "Explicit", NULL, }; + static const char * const mpeg_mpeg2_level[] = { + "Low", + "Main", + "High 1440", + "High", + NULL, + }; + static const char * const mpeg2_profile[] = { + "Simple", + "Main", + "SNR Scalable", + "Spatially Scalable", + "High", + NULL, + }; static const char * const mpeg_mpeg4_level[] = { "0", "0b", @@ -622,6 +637,10 @@ const char * const *v4l2_ctrl_get_menu(u32 id) return h264_fp_arrangement_type; case V4L2_CID_MPEG_VIDEO_H264_FMO_MAP_TYPE: return h264_fmo_map_type; + case V4L2_CID_MPEG_VIDEO_MPEG2_LEVEL: + return mpeg_mpeg2_level; + case V4L2_CID_MPEG_VIDEO_MPEG2_PROFILE: + return mpeg2_profile; case V4L2_CID_MPEG_VIDEO_MPEG4_LEVEL: return mpeg_mpeg4_level; case V4L2_CID_MPEG_VIDEO_MPEG4_PROFILE: @@ -832,6 +851,8 @@ const char *v4l2_ctrl_get_name(u32 id) case V4L2_CID_MPEG_VIDEO_H264_I_FRAME_MAX_QP: return "H264 I-Frame Maximum QP Value"; case V4L2_CID_MPEG_VIDEO_H264_P_FRAME_MIN_QP: return "H264 P-Frame Minimum QP Value"; case V4L2_CID_MPEG_VIDEO_H264_P_FRAME_MAX_QP: return "H264 P-Frame Maximum QP Value"; + case V4L2_CID_MPEG_VIDEO_MPEG2_LEVEL: return "MPEG2 Level"; + case V4L2_CID_MPEG_VIDEO_MPEG2_PROFILE: return "MPEG2 Profile"; case V4L2_CID_MPEG_VIDEO_MPEG4_I_FRAME_QP: return "MPEG4 I-Frame QP Value"; case V4L2_CID_MPEG_VIDEO_MPEG4_P_FRAME_QP: return "MPEG4 P-Frame QP Value"; case V4L2_CID_MPEG_VIDEO_MPEG4_B_FRAME_QP: return "MPEG4 B-Frame QP Value"; @@ -1197,6 +1218,8 @@ void v4l2_ctrl_fill(u32 id, const char **name, enum v4l2_ctrl_type *type, case V4L2_CID_MPEG_VIDEO_H264_VUI_SAR_IDC: case V4L2_CID_MPEG_VIDEO_H264_SEI_FP_ARRANGEMENT_TYPE: case V4L2_CID_MPEG_VIDEO_H264_FMO_MAP_TYPE: + 
case V4L2_CID_MPEG_VIDEO_MPEG2_LEVEL: + case V4L2_CID_MPEG_VIDEO_MPEG2_PROFILE: case V4L2_CID_MPEG_VIDEO_MPEG4_LEVEL: case V4L2_CID_MPEG_VIDEO_MPEG4_PROFILE: case V4L2_CID_JPEG_CHROMA_SUBSAMPLING: diff --git a/include/uapi/linux/v4l2-controls.h b/include/uapi/linux/v4l2-controls.h index 9cad9fd969e3..a2669b79b294 100644 --- a/include/uapi/linux/v4l2-controls.h +++ b/include/uapi/linux/v4l2-controls.h @@ -409,6 +409,24 @@ enum v4l2_mpeg_video_multi_slice_mode { #define V4L2_CID_MPEG_VIDEO_MV_V_SEARCH_RANGE (V4L2_CID_MPEG_BASE+228) #define V4L2_CID_MPEG_VIDEO_FORCE_KEY_FRAME (V4L2_CID_MPEG_BASE+229) +/* CIDs for the MPEG-2 Part 2 (H.262) codec */ +#define V4L2_CID_MPEG_VIDEO_MPEG2_LEVEL (V4L2_CID_MPEG_BASE+270) +enum v4l2_mpeg_video_mpeg2_level { + V4L2_MPEG_VIDEO_MPEG2_LEVEL_LOW = 0, + V4L2_MPEG_VIDEO_MPEG2_LEVEL_MAIN = 1, + V4L2_MPEG_VIDEO_MPEG2_LEVEL_HIGH_1440 = 2, + V4L2_MPEG_VIDEO_MPEG2_LEVEL_HIGH = 3, +}; +#define V4L2_CID_MPEG_VIDEO_MPEG2_PROFILE (V4L2_CID_MPEG_BASE+271) +enum v4l2_mpeg_video_mpeg2_profile { + V4L2_MPEG_VIDEO_MPEG2_PROFILE_SIMPLE = 0, + V4L2_MPEG_VIDEO_MPEG2_PROFILE_MAIN = 1, + V4L2_MPEG_VIDEO_MPEG2_PROFILE_SNR_SCALABLE = 2, + V4L2_MPEG_VIDEO_MPEG2_PROFILE_SPATIALLY_SCALABLE = 3, + V4L2_MPEG_VIDEO_MPEG2_PROFILE_HIGH = 4, + V4L2_MPEG_VIDEO_MPEG2_PROFILE_MULTIVIEW = 5, +}; + /* CIDs for the FWHT codec as used by the vicodec driver. */ #define V4L2_CID_FWHT_I_FRAME_QP (V4L2_CID_MPEG_BASE + 290) #define V4L2_CID_FWHT_P_FRAME_QP (V4L2_CID_MPEG_BASE + 291) -- cgit v1.2.3-71-gd317 From 24ec483cec981618f8a4782a36d1e3f319d42cad Mon Sep 17 00:00:00 2001 From: Kevin 'ldir' Darbyshire-Bryant Date: Tue, 28 May 2019 17:03:50 +0000 Subject: net: sched: Introduce act_ctinfo action MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit ctinfo is a new tc filter action module. It is designed to restore information contained in firewall conntrack marks to other packet fields and is typically used on packet ingress paths. 
At present it has two independent sub-functions or operating modes, DSCP restoration mode & skb mark restoration mode. The DSCP restore mode: This mode copies DSCP values that have been placed in the firewall conntrack mark back into the IPv4/v6 diffserv fields of relevant packets. The DSCP restoration is intended for use and has been found useful for restoring ingress classifications based on egress classifications across links that bleach or otherwise change DSCP, typically home ISP Internet links. Restoring DSCP on ingress on the WAN link allows qdiscs such as but by no means limited to CAKE to shape inbound packets according to policies that are easier to set & mark on egress. Ingress classification is traditionally a challenging task since iptables rules haven't yet run and tc filter/eBPF programs are pre-NAT lookups, hence are unable to see internal IPv4 addresses as used on the typical home masquerading gateway. Thus marking the connection in some manner on egress for later restoration of classification on ingress is easier to implement. Parameters related to DSCP restore mode: dscpmask - a 32 bit mask of 6 contiguous bits indicating which bits of the conntrack mark field contain the DSCP value to be restored. statemask - a 32 bit mask of (usually) 1 bit length, outside the area specified by dscpmask. This represents a conditional operation flag whereby the DSCP is only restored if the flag is set. This is useful to implement a 'one shot' iptables based classification where the 'complicated' iptables rules are only run once to classify the connection on initial (egress) packet and subsequent packets are all marked/restored with the same DSCP. A mask of zero disables the conditional behaviour ie. the conntrack mark DSCP bits are always restored to the ip diffserv field (assuming the conntrack entry is found & the skb is an ipv4/ipv6 type) e.g. 
dscpmask 0xfc000000 statemask 0x01000000 |----0xFC----conntrack mark----000000---| | Bits 31-26 | bit 25 | bit24 |~~~ Bit 0| | DSCP | unused | flag |unused | |-----------------------0x01---000000---| | | | | ---| Conditional flag v only restore if set |-ip diffserv-| | 6 bits | |-------------| The skb mark restore mode (cpmark): This mode copies the firewall conntrack mark to the skb's mark field. It is completely the functional equivalent of the existing act_connmark action with the additional feature of being able to apply a mask to the restored value. Parameters related to skb mark restore mode: mask - a 32 bit mask applied to the firewall conntrack mark to mask out bits unwanted for restoration. This can be useful where the conntrack mark is being used for different purposes by different applications. If not specified and by default the whole mark field is copied (i.e. default mask of 0xffffffff) e.g. mask 0x00ffffff to mask out the top 8 bits being used by the aforementioned DSCP restore mode. |----0x00----conntrack mark----ffffff---| | Bits 31-24 | | | DSCP & flag| some value here | |---------------------------------------| | | v |------------skb mark-------------------| | | | | zeroed | | |---------------------------------------| Overall parameters: zone - conntrack zone control - action related control (reclassify | pipe | drop | continue | ok | goto chain ) Signed-off-by: Kevin Darbyshire-Bryant Reviewed-by: Toke Høiland-Jørgensen Acked-by: Cong Wang Signed-off-by: David S. 
Miller --- include/net/tc_act/tc_ctinfo.h | 28 +++ include/uapi/linux/pkt_cls.h | 1 + include/uapi/linux/tc_act/tc_ctinfo.h | 34 +++ net/sched/Kconfig | 17 ++ net/sched/Makefile | 1 + net/sched/act_ctinfo.c | 396 ++++++++++++++++++++++++++++++ tools/testing/selftests/tc-testing/config | 1 + 7 files changed, 478 insertions(+) create mode 100644 include/net/tc_act/tc_ctinfo.h create mode 100644 include/uapi/linux/tc_act/tc_ctinfo.h create mode 100644 net/sched/act_ctinfo.c (limited to 'include/uapi/linux') diff --git a/include/net/tc_act/tc_ctinfo.h b/include/net/tc_act/tc_ctinfo.h new file mode 100644 index 000000000000..d6a688571672 --- /dev/null +++ b/include/net/tc_act/tc_ctinfo.h @@ -0,0 +1,28 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef __NET_TC_CTINFO_H +#define __NET_TC_CTINFO_H + +#include + +struct tcf_ctinfo_params { + struct rcu_head rcu; + struct net *net; + u32 dscpmask; + u32 dscpstatemask; + u32 cpmarkmask; + u16 zone; + u8 mode; + u8 dscpmaskshift; +}; + +struct tcf_ctinfo { + struct tc_action common; + struct tcf_ctinfo_params __rcu *params; + u64 stats_dscp_set; + u64 stats_dscp_error; + u64 stats_cpmark_set; +}; + +#define to_ctinfo(a) ((struct tcf_ctinfo *)a) + +#endif /* __NET_TC_CTINFO_H */ diff --git a/include/uapi/linux/pkt_cls.h b/include/uapi/linux/pkt_cls.h index 51a0496f78ea..a93680fc4bfa 100644 --- a/include/uapi/linux/pkt_cls.h +++ b/include/uapi/linux/pkt_cls.h @@ -105,6 +105,7 @@ enum tca_id { TCA_ID_IFE = TCA_ACT_IFE, TCA_ID_SAMPLE = TCA_ACT_SAMPLE, /* other actions go here */ + TCA_ID_CTINFO, __TCA_ID_MAX = 255 }; diff --git a/include/uapi/linux/tc_act/tc_ctinfo.h b/include/uapi/linux/tc_act/tc_ctinfo.h new file mode 100644 index 000000000000..da803e05a89b --- /dev/null +++ b/include/uapi/linux/tc_act/tc_ctinfo.h @@ -0,0 +1,34 @@ +/* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note */ +#ifndef __UAPI_TC_CTINFO_H +#define __UAPI_TC_CTINFO_H + +#include +#include + +struct tc_ctinfo { + tc_gen; +}; + +enum { + 
TCA_CTINFO_UNSPEC, + TCA_CTINFO_PAD, + TCA_CTINFO_TM, + TCA_CTINFO_ACT, + TCA_CTINFO_ZONE, + TCA_CTINFO_PARMS_DSCP_MASK, + TCA_CTINFO_PARMS_DSCP_STATEMASK, + TCA_CTINFO_PARMS_CPMARK_MASK, + TCA_CTINFO_STATS_DSCP_SET, + TCA_CTINFO_STATS_DSCP_ERROR, + TCA_CTINFO_STATS_CPMARK_SET, + __TCA_CTINFO_MAX +}; + +#define TCA_CTINFO_MAX (__TCA_CTINFO_MAX - 1) + +enum { + CTINFO_MODE_DSCP = BIT(0), + CTINFO_MODE_CPMARK = BIT(1) +}; + +#endif diff --git a/net/sched/Kconfig b/net/sched/Kconfig index 2c72d95c3050..d104f7ee26c7 100644 --- a/net/sched/Kconfig +++ b/net/sched/Kconfig @@ -877,6 +877,23 @@ config NET_ACT_CONNMARK To compile this code as a module, choose M here: the module will be called act_connmark. +config NET_ACT_CTINFO + tristate "Netfilter Connection Mark Actions" + depends on NET_CLS_ACT && NETFILTER && IP_NF_IPTABLES + depends on NF_CONNTRACK && NF_CONNTRACK_MARK + help + Say Y here to allow transfer of a connmark stored information. + Current actions transfer connmark stored DSCP into + ipv4/v6 diffserv and/or to transfer connmark to packet + mark. Both are useful for restoring egress based marks + back onto ingress connections for qdisc priority mapping + purposes. + + If unsure, say N. + + To compile this code as a module, choose M here: the + module will be called act_ctinfo. 
+ config NET_ACT_SKBMOD tristate "skb data modification action" depends on NET_CLS_ACT diff --git a/net/sched/Makefile b/net/sched/Makefile index 8a40431d7b5c..d54bfcbd7981 100644 --- a/net/sched/Makefile +++ b/net/sched/Makefile @@ -21,6 +21,7 @@ obj-$(CONFIG_NET_ACT_CSUM) += act_csum.o obj-$(CONFIG_NET_ACT_VLAN) += act_vlan.o obj-$(CONFIG_NET_ACT_BPF) += act_bpf.o obj-$(CONFIG_NET_ACT_CONNMARK) += act_connmark.o +obj-$(CONFIG_NET_ACT_CTINFO) += act_ctinfo.o obj-$(CONFIG_NET_ACT_SKBMOD) += act_skbmod.o obj-$(CONFIG_NET_ACT_IFE) += act_ife.o obj-$(CONFIG_NET_IFE_SKBMARK) += act_meta_mark.o diff --git a/net/sched/act_ctinfo.c b/net/sched/act_ctinfo.c new file mode 100644 index 000000000000..926109139a81 --- /dev/null +++ b/net/sched/act_ctinfo.c @@ -0,0 +1,396 @@ +// SPDX-License-Identifier: GPL-2.0+ +/* net/sched/act_ctinfo.c netfilter ctinfo connmark actions + * + * Copyright (c) 2019 Kevin Darbyshire-Bryant + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include + +static struct tc_action_ops act_ctinfo_ops; +static unsigned int ctinfo_net_id; + +static void tcf_ctinfo_dscp_set(struct nf_conn *ct, struct tcf_ctinfo *ca, + struct tcf_ctinfo_params *cp, + struct sk_buff *skb, int wlen, int proto) +{ + u8 dscp, newdscp; + + newdscp = (((ct->mark & cp->dscpmask) >> cp->dscpmaskshift) << 2) & + ~INET_ECN_MASK; + + switch (proto) { + case NFPROTO_IPV4: + dscp = ipv4_get_dsfield(ip_hdr(skb)) & ~INET_ECN_MASK; + if (dscp != newdscp) { + if (likely(!skb_try_make_writable(skb, wlen))) { + ipv4_change_dsfield(ip_hdr(skb), + INET_ECN_MASK, + newdscp); + ca->stats_dscp_set++; + } else { + ca->stats_dscp_error++; + } + } + break; + case NFPROTO_IPV6: + dscp = ipv6_get_dsfield(ipv6_hdr(skb)) & ~INET_ECN_MASK; + if (dscp != newdscp) { + if (likely(!skb_try_make_writable(skb, wlen))) { + ipv6_change_dsfield(ipv6_hdr(skb), + INET_ECN_MASK, + 
newdscp); + ca->stats_dscp_set++; + } else { + ca->stats_dscp_error++; + } + } + break; + default: + break; + } +} + +static void tcf_ctinfo_cpmark_set(struct nf_conn *ct, struct tcf_ctinfo *ca, + struct tcf_ctinfo_params *cp, + struct sk_buff *skb) +{ + ca->stats_cpmark_set++; + skb->mark = ct->mark & cp->cpmarkmask; +} + +static int tcf_ctinfo_act(struct sk_buff *skb, const struct tc_action *a, + struct tcf_result *res) +{ + const struct nf_conntrack_tuple_hash *thash = NULL; + struct tcf_ctinfo *ca = to_ctinfo(a); + struct nf_conntrack_tuple tuple; + struct nf_conntrack_zone zone; + enum ip_conntrack_info ctinfo; + struct tcf_ctinfo_params *cp; + struct nf_conn *ct; + int proto, wlen; + int action; + + cp = rcu_dereference_bh(ca->params); + + tcf_lastuse_update(&ca->tcf_tm); + bstats_update(&ca->tcf_bstats, skb); + action = READ_ONCE(ca->tcf_action); + + wlen = skb_network_offset(skb); + if (tc_skb_protocol(skb) == htons(ETH_P_IP)) { + wlen += sizeof(struct iphdr); + if (!pskb_may_pull(skb, wlen)) + goto out; + + proto = NFPROTO_IPV4; + } else if (tc_skb_protocol(skb) == htons(ETH_P_IPV6)) { + wlen += sizeof(struct ipv6hdr); + if (!pskb_may_pull(skb, wlen)) + goto out; + + proto = NFPROTO_IPV6; + } else { + goto out; + } + + ct = nf_ct_get(skb, &ctinfo); + if (!ct) { /* look harder, usually ingress */ + if (!nf_ct_get_tuplepr(skb, skb_network_offset(skb), + proto, cp->net, &tuple)) + goto out; + zone.id = cp->zone; + zone.dir = NF_CT_DEFAULT_ZONE_DIR; + + thash = nf_conntrack_find_get(cp->net, &zone, &tuple); + if (!thash) + goto out; + + ct = nf_ct_tuplehash_to_ctrack(thash); + } + + if (cp->mode & CTINFO_MODE_DSCP) + if (!cp->dscpstatemask || (ct->mark & cp->dscpstatemask)) + tcf_ctinfo_dscp_set(ct, ca, cp, skb, wlen, proto); + + if (cp->mode & CTINFO_MODE_CPMARK) + tcf_ctinfo_cpmark_set(ct, ca, cp, skb); + + if (thash) + nf_ct_put(ct); +out: + return action; +} + +static const struct nla_policy ctinfo_policy[TCA_CTINFO_MAX + 1] = { + [TCA_CTINFO_ACT] = { .len 
= sizeof(struct + tc_ctinfo) }, + [TCA_CTINFO_ZONE] = { .type = NLA_U16 }, + [TCA_CTINFO_PARMS_DSCP_MASK] = { .type = NLA_U32 }, + [TCA_CTINFO_PARMS_DSCP_STATEMASK] = { .type = NLA_U32 }, + [TCA_CTINFO_PARMS_CPMARK_MASK] = { .type = NLA_U32 }, +}; + +static int tcf_ctinfo_init(struct net *net, struct nlattr *nla, + struct nlattr *est, struct tc_action **a, + int ovr, int bind, bool rtnl_held, + struct tcf_proto *tp, + struct netlink_ext_ack *extack) +{ + struct tc_action_net *tn = net_generic(net, ctinfo_net_id); + struct nlattr *tb[TCA_CTINFO_MAX + 1]; + struct tcf_ctinfo_params *cp_new; + struct tcf_chain *goto_ch = NULL; + u32 dscpmask = 0, dscpstatemask; + struct tc_ctinfo *actparm; + struct tcf_ctinfo *ci; + u8 dscpmaskshift; + int ret = 0, err; + + if (!nla) + return -EINVAL; + + err = nla_parse_nested(tb, TCA_CTINFO_MAX, nla, ctinfo_policy, NULL); + if (err < 0) + return err; + + if (!tb[TCA_CTINFO_ACT]) + return -EINVAL; + actparm = nla_data(tb[TCA_CTINFO_ACT]); + + /* do some basic validation here before dynamically allocating things */ + /* that we would otherwise have to clean up. */ + if (tb[TCA_CTINFO_PARMS_DSCP_MASK]) { + dscpmask = nla_get_u32(tb[TCA_CTINFO_PARMS_DSCP_MASK]); + /* need contiguous 6 bit mask */ + dscpmaskshift = dscpmask ? __ffs(dscpmask) : 0; + if ((~0 & (dscpmask >> dscpmaskshift)) != 0x3f) + return -EINVAL; + dscpstatemask = tb[TCA_CTINFO_PARMS_DSCP_STATEMASK] ? 
+ nla_get_u32(tb[TCA_CTINFO_PARMS_DSCP_STATEMASK]) : 0; + /* mask & statemask must not overlap */ + if (dscpmask & dscpstatemask) + return -EINVAL; + } + + /* done the validation:now to the actual action allocation */ + err = tcf_idr_check_alloc(tn, &actparm->index, a, bind); + if (!err) { + ret = tcf_idr_create(tn, actparm->index, est, a, + &act_ctinfo_ops, bind, false); + if (ret) { + tcf_idr_cleanup(tn, actparm->index); + return ret; + } + } else if (err > 0) { + if (bind) /* don't override defaults */ + return 0; + if (!ovr) { + tcf_idr_release(*a, bind); + return -EEXIST; + } + } else { + return err; + } + + err = tcf_action_check_ctrlact(actparm->action, tp, &goto_ch, extack); + if (err < 0) + goto release_idr; + + ci = to_ctinfo(*a); + + cp_new = kzalloc(sizeof(*cp_new), GFP_KERNEL); + if (unlikely(!cp_new)) { + err = -ENOMEM; + goto put_chain; + } + + cp_new->net = net; + cp_new->zone = tb[TCA_CTINFO_ZONE] ? + nla_get_u16(tb[TCA_CTINFO_ZONE]) : 0; + if (dscpmask) { + cp_new->dscpmask = dscpmask; + cp_new->dscpmaskshift = dscpmaskshift; + cp_new->dscpstatemask = dscpstatemask; + cp_new->mode |= CTINFO_MODE_DSCP; + } else { + cp_new->mode &= ~CTINFO_MODE_DSCP; + } + + if (tb[TCA_CTINFO_PARMS_CPMARK_MASK]) { + cp_new->cpmarkmask = + nla_get_u32(tb[TCA_CTINFO_PARMS_CPMARK_MASK]); + cp_new->mode |= CTINFO_MODE_CPMARK; + } else { + cp_new->mode &= ~CTINFO_MODE_CPMARK; + } + + spin_lock_bh(&ci->tcf_lock); + goto_ch = tcf_action_set_ctrlact(*a, actparm->action, goto_ch); + rcu_swap_protected(ci->params, cp_new, + lockdep_is_held(&ci->tcf_lock)); + spin_unlock_bh(&ci->tcf_lock); + + if (goto_ch) + tcf_chain_put_by_act(goto_ch); + if (cp_new) + kfree_rcu(cp_new, rcu); + + if (ret == ACT_P_CREATED) + tcf_idr_insert(tn, *a); + + return ret; + +put_chain: + if (goto_ch) + tcf_chain_put_by_act(goto_ch); +release_idr: + tcf_idr_release(*a, bind); + return err; +} + +static int tcf_ctinfo_dump(struct sk_buff *skb, struct tc_action *a, + int bind, int ref) +{ + struct 
tcf_ctinfo *ci = to_ctinfo(a); + struct tc_ctinfo opt = { + .index = ci->tcf_index, + .refcnt = refcount_read(&ci->tcf_refcnt) - ref, + .bindcnt = atomic_read(&ci->tcf_bindcnt) - bind, + }; + unsigned char *b = skb_tail_pointer(skb); + struct tcf_ctinfo_params *cp; + struct tcf_t t; + + spin_lock_bh(&ci->tcf_lock); + cp = rcu_dereference_protected(ci->params, + lockdep_is_held(&ci->tcf_lock)); + + tcf_tm_dump(&t, &ci->tcf_tm); + if (nla_put_64bit(skb, TCA_CTINFO_TM, sizeof(t), &t, TCA_CTINFO_PAD)) + goto nla_put_failure; + + opt.action = ci->tcf_action; + if (nla_put(skb, TCA_CTINFO_ACT, sizeof(opt), &opt)) + goto nla_put_failure; + + if (nla_put_u16(skb, TCA_CTINFO_ZONE, cp->zone)) + goto nla_put_failure; + + if (cp->mode & CTINFO_MODE_DSCP) { + if (nla_put_u32(skb, TCA_CTINFO_PARMS_DSCP_MASK, + cp->dscpmask)) + goto nla_put_failure; + if (nla_put_u32(skb, TCA_CTINFO_PARMS_DSCP_STATEMASK, + cp->dscpstatemask)) + goto nla_put_failure; + } + + if (cp->mode & CTINFO_MODE_CPMARK) { + if (nla_put_u32(skb, TCA_CTINFO_PARMS_CPMARK_MASK, + cp->cpmarkmask)) + goto nla_put_failure; + } + + if (nla_put_u64_64bit(skb, TCA_CTINFO_STATS_DSCP_SET, + ci->stats_dscp_set, TCA_CTINFO_PAD)) + goto nla_put_failure; + + if (nla_put_u64_64bit(skb, TCA_CTINFO_STATS_DSCP_ERROR, + ci->stats_dscp_error, TCA_CTINFO_PAD)) + goto nla_put_failure; + + if (nla_put_u64_64bit(skb, TCA_CTINFO_STATS_CPMARK_SET, + ci->stats_cpmark_set, TCA_CTINFO_PAD)) + goto nla_put_failure; + + spin_unlock_bh(&ci->tcf_lock); + return skb->len; + +nla_put_failure: + spin_unlock_bh(&ci->tcf_lock); + nlmsg_trim(skb, b); + return -1; +} + +static int tcf_ctinfo_walker(struct net *net, struct sk_buff *skb, + struct netlink_callback *cb, int type, + const struct tc_action_ops *ops, + struct netlink_ext_ack *extack) +{ + struct tc_action_net *tn = net_generic(net, ctinfo_net_id); + + return tcf_generic_walker(tn, skb, cb, type, ops, extack); +} + +static int tcf_ctinfo_search(struct net *net, struct tc_action **a, u32 
index) +{ + struct tc_action_net *tn = net_generic(net, ctinfo_net_id); + + return tcf_idr_search(tn, a, index); +} + +static struct tc_action_ops act_ctinfo_ops = { + .kind = "ctinfo", + .id = TCA_ID_CTINFO, + .owner = THIS_MODULE, + .act = tcf_ctinfo_act, + .dump = tcf_ctinfo_dump, + .init = tcf_ctinfo_init, + .walk = tcf_ctinfo_walker, + .lookup = tcf_ctinfo_search, + .size = sizeof(struct tcf_ctinfo), +}; + +static __net_init int ctinfo_init_net(struct net *net) +{ + struct tc_action_net *tn = net_generic(net, ctinfo_net_id); + + return tc_action_net_init(tn, &act_ctinfo_ops); +} + +static void __net_exit ctinfo_exit_net(struct list_head *net_list) +{ + tc_action_net_exit(net_list, ctinfo_net_id); +} + +static struct pernet_operations ctinfo_net_ops = { + .init = ctinfo_init_net, + .exit_batch = ctinfo_exit_net, + .id = &ctinfo_net_id, + .size = sizeof(struct tc_action_net), +}; + +static int __init ctinfo_init_module(void) +{ + return tcf_register_action(&act_ctinfo_ops, &ctinfo_net_ops); +} + +static void __exit ctinfo_cleanup_module(void) +{ + tcf_unregister_action(&act_ctinfo_ops, &ctinfo_net_ops); +} + +module_init(ctinfo_init_module); +module_exit(ctinfo_cleanup_module); +MODULE_AUTHOR("Kevin Darbyshire-Bryant "); +MODULE_DESCRIPTION("Connection tracking mark actions"); +MODULE_LICENSE("GPL"); diff --git a/tools/testing/selftests/tc-testing/config b/tools/testing/selftests/tc-testing/config index 203302065458..b235efd55367 100644 --- a/tools/testing/selftests/tc-testing/config +++ b/tools/testing/selftests/tc-testing/config @@ -38,6 +38,7 @@ CONFIG_NET_ACT_CSUM=m CONFIG_NET_ACT_VLAN=m CONFIG_NET_ACT_BPF=m CONFIG_NET_ACT_CONNMARK=m +CONFIG_NET_ACT_CTINFO=m CONFIG_NET_ACT_SKBMOD=m CONFIG_NET_ACT_IFE=m CONFIG_NET_ACT_TUNNEL_KEY=m -- cgit v1.2.3-71-gd317 From 9092a76d3cf8638467b09bbb4f409094349b2b53 Mon Sep 17 00:00:00 2001 From: Jason Baron Date: Wed, 29 May 2019 12:33:57 -0400 Subject: tcp: add backup TFO key infrastructure We would like to be able to 
rotate TFO keys while minimizing the number of client cookies that are rejected. Currently, we have only one key which can be used to generate and validate cookies, thus if we simply replace this key clients can easily have cookies rejected upon rotation. We propose having the ability to have both a primary key and a backup key. The primary key is used to generate as well as to validate cookies. The backup is only used to validate cookies. Thus, keys can be rotated as: 1) generate new key 2) add new key as the backup key 3) swap the primary and backup key, thus setting the new key as the primary We don't simply set the new key as the primary key and move the old key to the backup slot because the ip may be behind a load balancer and we further allow for the fact that all machines behind the load balancer will not be updated simultaneously. We make use of this infrastructure in subsequent patches. Suggested-by: Igor Lubashev Signed-off-by: Jason Baron Signed-off-by: Christoph Paasch Acked-by: Yuchung Cheng Signed-off-by: David S. 
Miller --- include/net/tcp.h | 41 ++++++++++- include/uapi/linux/snmp.h | 1 + net/ipv4/proc.c | 1 + net/ipv4/sysctl_net_ipv4.c | 2 +- net/ipv4/tcp.c | 3 +- net/ipv4/tcp_fastopen.c | 172 +++++++++++++++++++++++++++++++-------------- 6 files changed, 162 insertions(+), 58 deletions(-) (limited to 'include/uapi/linux') diff --git a/include/net/tcp.h b/include/net/tcp.h index 985aa5db570c..0083a14fb64f 100644 --- a/include/net/tcp.h +++ b/include/net/tcp.h @@ -1614,7 +1614,8 @@ void tcp_free_fastopen_req(struct tcp_sock *tp); void tcp_fastopen_destroy_cipher(struct sock *sk); void tcp_fastopen_ctx_destroy(struct net *net); int tcp_fastopen_reset_cipher(struct net *net, struct sock *sk, - void *key, unsigned int len); + void *primary_key, void *backup_key, + unsigned int len); void tcp_fastopen_add_skb(struct sock *sk, struct sk_buff *skb); struct sock *tcp_try_fastopen(struct sock *sk, struct sk_buff *skb, struct request_sock *req, @@ -1625,11 +1626,14 @@ bool tcp_fastopen_cookie_check(struct sock *sk, u16 *mss, struct tcp_fastopen_cookie *cookie); bool tcp_fastopen_defer_connect(struct sock *sk, int *err); #define TCP_FASTOPEN_KEY_LENGTH 16 +#define TCP_FASTOPEN_KEY_MAX 2 +#define TCP_FASTOPEN_KEY_BUF_LENGTH \ + (TCP_FASTOPEN_KEY_LENGTH * TCP_FASTOPEN_KEY_MAX) /* Fastopen key context */ struct tcp_fastopen_context { - struct crypto_cipher *tfm; - __u8 key[TCP_FASTOPEN_KEY_LENGTH]; + struct crypto_cipher *tfm[TCP_FASTOPEN_KEY_MAX]; + __u8 key[TCP_FASTOPEN_KEY_BUF_LENGTH]; struct rcu_head rcu; }; @@ -1639,6 +1643,37 @@ bool tcp_fastopen_active_should_disable(struct sock *sk); void tcp_fastopen_active_disable_ofo_check(struct sock *sk); void tcp_fastopen_active_detect_blackhole(struct sock *sk, bool expired); +/* Caller needs to wrap with rcu_read_(un)lock() */ +static inline +struct tcp_fastopen_context *tcp_fastopen_get_ctx(const struct sock *sk) +{ + struct tcp_fastopen_context *ctx; + + ctx = rcu_dereference(inet_csk(sk)->icsk_accept_queue.fastopenq.ctx); + if (!ctx) 
+ ctx = rcu_dereference(sock_net(sk)->ipv4.tcp_fastopen_ctx); + return ctx; +} + +static inline +bool tcp_fastopen_cookie_match(const struct tcp_fastopen_cookie *foc, + const struct tcp_fastopen_cookie *orig) +{ + if (orig->len == TCP_FASTOPEN_COOKIE_SIZE && + orig->len == foc->len && + !memcmp(orig->val, foc->val, foc->len)) + return true; + return false; +} + +static inline +int tcp_fastopen_context_len(const struct tcp_fastopen_context *ctx) +{ + if (ctx->tfm[1]) + return 2; + return 1; +} + /* Latencies incurred by various limits for a sender. They are * chronograph-like stats that are mutually exclusive. */ diff --git a/include/uapi/linux/snmp.h b/include/uapi/linux/snmp.h index 86dc24a96c90..74904e9d1b72 100644 --- a/include/uapi/linux/snmp.h +++ b/include/uapi/linux/snmp.h @@ -283,6 +283,7 @@ enum LINUX_MIB_TCPACKCOMPRESSED, /* TCPAckCompressed */ LINUX_MIB_TCPZEROWINDOWDROP, /* TCPZeroWindowDrop */ LINUX_MIB_TCPRCVQDROP, /* TCPRcvQDrop */ + LINUX_MIB_TCPFASTOPENPASSIVEALTKEY, /* TCPFastOpenPassiveAltKey */ __LINUX_MIB_MAX }; diff --git a/net/ipv4/proc.c b/net/ipv4/proc.c index b613572c6616..4746f963c439 100644 --- a/net/ipv4/proc.c +++ b/net/ipv4/proc.c @@ -291,6 +291,7 @@ static const struct snmp_mib snmp4_net_list[] = { SNMP_MIB_ITEM("TCPAckCompressed", LINUX_MIB_TCPACKCOMPRESSED), SNMP_MIB_ITEM("TCPZeroWindowDrop", LINUX_MIB_TCPZEROWINDOWDROP), SNMP_MIB_ITEM("TCPRcvQDrop", LINUX_MIB_TCPRCVQDROP), + SNMP_MIB_ITEM("TCPFastOpenPassiveAltKey", LINUX_MIB_TCPFASTOPENPASSIVEALTKEY), SNMP_MIB_SENTINEL }; diff --git a/net/ipv4/sysctl_net_ipv4.c b/net/ipv4/sysctl_net_ipv4.c index 875867b64d6a..72dc8ca98d43 100644 --- a/net/ipv4/sysctl_net_ipv4.c +++ b/net/ipv4/sysctl_net_ipv4.c @@ -318,7 +318,7 @@ static int proc_tcp_fastopen_key(struct ctl_table *table, int write, for (i = 0; i < ARRAY_SIZE(user_key); i++) key[i] = cpu_to_le32(user_key[i]); - tcp_fastopen_reset_cipher(net, NULL, key, + tcp_fastopen_reset_cipher(net, NULL, key, NULL, TCP_FASTOPEN_KEY_LENGTH); } 
diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c index 53d61ca3ac4b..bca51a351b0e 100644 --- a/net/ipv4/tcp.c +++ b/net/ipv4/tcp.c @@ -2798,7 +2798,8 @@ static int do_tcp_setsockopt(struct sock *sk, int level, if (copy_from_user(key, optval, optlen)) return -EFAULT; - return tcp_fastopen_reset_cipher(net, sk, key, sizeof(key)); + return tcp_fastopen_reset_cipher(net, sk, key, NULL, + sizeof(key)); } default: /* fallthru */ diff --git a/net/ipv4/tcp_fastopen.c b/net/ipv4/tcp_fastopen.c index 3889ad28dd06..8e1580485c9e 100644 --- a/net/ipv4/tcp_fastopen.c +++ b/net/ipv4/tcp_fastopen.c @@ -30,14 +30,20 @@ void tcp_fastopen_init_key_once(struct net *net) * for a valid cookie, so this is an acceptable risk. */ get_random_bytes(key, sizeof(key)); - tcp_fastopen_reset_cipher(net, NULL, key, sizeof(key)); + tcp_fastopen_reset_cipher(net, NULL, key, NULL, sizeof(key)); } static void tcp_fastopen_ctx_free(struct rcu_head *head) { struct tcp_fastopen_context *ctx = container_of(head, struct tcp_fastopen_context, rcu); - crypto_free_cipher(ctx->tfm); + int i; + + /* We own ctx, thus no need to hold the Fastopen-lock */ + for (i = 0; i < TCP_FASTOPEN_KEY_MAX; i++) { + if (ctx->tfm[i]) + crypto_free_cipher(ctx->tfm[i]); + } kfree(ctx); } @@ -66,33 +72,54 @@ void tcp_fastopen_ctx_destroy(struct net *net) call_rcu(&ctxt->rcu, tcp_fastopen_ctx_free); } +struct tcp_fastopen_context *tcp_fastopen_alloc_ctx(void *primary_key, + void *backup_key, + unsigned int len) +{ + struct tcp_fastopen_context *new_ctx; + void *key = primary_key; + int err, i; + + new_ctx = kmalloc(sizeof(*new_ctx), GFP_KERNEL); + if (!new_ctx) + return ERR_PTR(-ENOMEM); + for (i = 0; i < TCP_FASTOPEN_KEY_MAX; i++) + new_ctx->tfm[i] = NULL; + for (i = 0; i < (backup_key ? 
2 : 1); i++) { + new_ctx->tfm[i] = crypto_alloc_cipher("aes", 0, 0); + if (IS_ERR(new_ctx->tfm[i])) { + err = PTR_ERR(new_ctx->tfm[i]); + new_ctx->tfm[i] = NULL; + pr_err("TCP: TFO aes cipher alloc error: %d\n", err); + goto out; + } + err = crypto_cipher_setkey(new_ctx->tfm[i], key, len); + if (err) { + pr_err("TCP: TFO cipher key error: %d\n", err); + goto out; + } + memcpy(&new_ctx->key[i * TCP_FASTOPEN_KEY_LENGTH], key, len); + key = backup_key; + } + return new_ctx; +out: + tcp_fastopen_ctx_free(&new_ctx->rcu); + return ERR_PTR(err); +} + int tcp_fastopen_reset_cipher(struct net *net, struct sock *sk, - void *key, unsigned int len) + void *primary_key, void *backup_key, + unsigned int len) { struct tcp_fastopen_context *ctx, *octx; struct fastopen_queue *q; - int err; + int err = 0; - ctx = kmalloc(sizeof(*ctx), GFP_KERNEL); - if (!ctx) - return -ENOMEM; - ctx->tfm = crypto_alloc_cipher("aes", 0, 0); - - if (IS_ERR(ctx->tfm)) { - err = PTR_ERR(ctx->tfm); -error: kfree(ctx); - pr_err("TCP: TFO aes cipher alloc error: %d\n", err); - return err; - } - err = crypto_cipher_setkey(ctx->tfm, key, len); - if (err) { - pr_err("TCP: TFO cipher key error: %d\n", err); - crypto_free_cipher(ctx->tfm); - goto error; + ctx = tcp_fastopen_alloc_ctx(primary_key, backup_key, len); + if (IS_ERR(ctx)) { + err = PTR_ERR(ctx); + goto out; } - memcpy(ctx->key, key, len); - - spin_lock(&net->ipv4.tcp_fastopen_ctx_lock); if (sk) { q = &inet_csk(sk)->icsk_accept_queue.fastopenq; @@ -108,6 +135,7 @@ error: kfree(ctx); if (octx) call_rcu(&octx->rcu, tcp_fastopen_ctx_free); +out: return err; } @@ -151,25 +179,20 @@ static bool __tcp_fastopen_cookie_gen_cipher(struct request_sock *req, * * XXX (TFO) - refactor when TCP_FASTOPEN_COOKIE_SIZE != AES_BLOCK_SIZE. 
*/ -static bool tcp_fastopen_cookie_gen(struct sock *sk, +static void tcp_fastopen_cookie_gen(struct sock *sk, struct request_sock *req, struct sk_buff *syn, struct tcp_fastopen_cookie *foc) { struct tcp_fastopen_context *ctx; - bool ok = false; rcu_read_lock(); - ctx = rcu_dereference(inet_csk(sk)->icsk_accept_queue.fastopenq.ctx); - if (!ctx) - ctx = rcu_dereference(sock_net(sk)->ipv4.tcp_fastopen_ctx); + ctx = tcp_fastopen_get_ctx(sk); if (ctx) - ok = __tcp_fastopen_cookie_gen_cipher(req, syn, ctx->tfm, foc); + __tcp_fastopen_cookie_gen_cipher(req, syn, ctx->tfm[0], foc); rcu_read_unlock(); - return ok; } - /* If an incoming SYN or SYNACK frame contains a payload and/or FIN, * queue this additional data / FIN. */ @@ -213,6 +236,35 @@ void tcp_fastopen_add_skb(struct sock *sk, struct sk_buff *skb) tcp_fin(sk); } +/* returns 0 - no key match, 1 for primary, 2 for backup */ +static int tcp_fastopen_cookie_gen_check(struct sock *sk, + struct request_sock *req, + struct sk_buff *syn, + struct tcp_fastopen_cookie *orig, + struct tcp_fastopen_cookie *valid_foc) +{ + struct tcp_fastopen_cookie search_foc = { .len = -1 }; + struct tcp_fastopen_cookie *foc = valid_foc; + struct tcp_fastopen_context *ctx; + int i, ret = 0; + + rcu_read_lock(); + ctx = tcp_fastopen_get_ctx(sk); + if (!ctx) + goto out; + for (i = 0; i < tcp_fastopen_context_len(ctx); i++) { + __tcp_fastopen_cookie_gen_cipher(req, syn, ctx->tfm[i], foc); + if (tcp_fastopen_cookie_match(foc, orig)) { + ret = i + 1; + goto out; + } + foc = &search_foc; + } +out: + rcu_read_unlock(); + return ret; +} + static struct sock *tcp_fastopen_create_child(struct sock *sk, struct sk_buff *skb, struct request_sock *req) @@ -332,6 +384,7 @@ struct sock *tcp_try_fastopen(struct sock *sk, struct sk_buff *skb, int tcp_fastopen = sock_net(sk)->ipv4.sysctl_tcp_fastopen; struct tcp_fastopen_cookie valid_foc = { .len = -1 }; struct sock *child; + int ret = 0; if (foc->len == 0) /* Client requests a cookie */ 
NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPFASTOPENCOOKIEREQD); @@ -347,31 +400,44 @@ struct sock *tcp_try_fastopen(struct sock *sk, struct sk_buff *skb, tcp_fastopen_no_cookie(sk, dst, TFO_SERVER_COOKIE_NOT_REQD)) goto fastopen; - if (foc->len >= 0 && /* Client presents or requests a cookie */ - tcp_fastopen_cookie_gen(sk, req, skb, &valid_foc) && - foc->len == TCP_FASTOPEN_COOKIE_SIZE && - foc->len == valid_foc.len && - !memcmp(foc->val, valid_foc.val, foc->len)) { - /* Cookie is valid. Create a (full) child socket to accept - * the data in SYN before returning a SYN-ACK to ack the - * data. If we fail to create the socket, fall back and - * ack the ISN only but includes the same cookie. - * - * Note: Data-less SYN with valid cookie is allowed to send - * data in SYN_RECV state. - */ + if (foc->len == 0) { + /* Client requests a cookie. */ + tcp_fastopen_cookie_gen(sk, req, skb, &valid_foc); + } else if (foc->len > 0) { + ret = tcp_fastopen_cookie_gen_check(sk, req, skb, foc, + &valid_foc); + if (!ret) { + NET_INC_STATS(sock_net(sk), + LINUX_MIB_TCPFASTOPENPASSIVEFAIL); + } else { + /* Cookie is valid. Create a (full) child socket to + * accept the data in SYN before returning a SYN-ACK to + * ack the data. If we fail to create the socket, fall + * back and ack the ISN only but includes the same + * cookie. + * + * Note: Data-less SYN with valid cookie is allowed to + * send data in SYN_RECV state. 
+ */ fastopen: - child = tcp_fastopen_create_child(sk, skb, req); - if (child) { - foc->len = -1; + child = tcp_fastopen_create_child(sk, skb, req); + if (child) { + if (ret == 2) { + valid_foc.exp = foc->exp; + *foc = valid_foc; + NET_INC_STATS(sock_net(sk), + LINUX_MIB_TCPFASTOPENPASSIVEALTKEY); + } else { + foc->len = -1; + } + NET_INC_STATS(sock_net(sk), + LINUX_MIB_TCPFASTOPENPASSIVE); + return child; + } NET_INC_STATS(sock_net(sk), - LINUX_MIB_TCPFASTOPENPASSIVE); - return child; + LINUX_MIB_TCPFASTOPENPASSIVEFAIL); } - NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPFASTOPENPASSIVEFAIL); - } else if (foc->len > 0) /* Client presents an invalid cookie */ - NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPFASTOPENPASSIVEFAIL); - + } valid_foc.exp = foc->exp; *foc = valid_foc; return NULL; -- cgit v1.2.3-71-gd317 From ed0ac5c7ec3763e3261c48e3c5d4b7528b60fd85 Mon Sep 17 00:00:00 2001 From: David Howells Date: Mon, 20 May 2019 21:51:50 +0100 Subject: keys: Add a keyctl to move a key between keyrings Add a keyctl to atomically move a link to a key from one keyring to another. The key must exist in "from" keyring and a flag can be given to cause the operation to fail if there's a matching key already in the "to" keyring. This can be done with: keyctl(KEYCTL_MOVE, key_serial_t key, key_serial_t from_keyring, key_serial_t to_keyring, unsigned int flags); The key being moved must grant Link permission and both keyrings must grant Write permission. flags should be 0 or KEYCTL_MOVE_EXCL, with the latter preventing displacement of a matching key from the "to" keyring. 
Signed-off-by: David Howells --- Documentation/security/keys/core.rst | 21 +++++++ include/linux/key.h | 5 ++ include/uapi/linux/keyctl.h | 3 + security/keys/compat.c | 3 + security/keys/internal.h | 3 + security/keys/keyctl.c | 52 +++++++++++++++++ security/keys/keyring.c | 108 +++++++++++++++++++++++++++++++++++ 7 files changed, 195 insertions(+) (limited to 'include/uapi/linux') diff --git a/Documentation/security/keys/core.rst b/Documentation/security/keys/core.rst index 9521c4207f01..823d29bf44f7 100644 --- a/Documentation/security/keys/core.rst +++ b/Documentation/security/keys/core.rst @@ -577,6 +577,27 @@ The keyctl syscall functions are: added. + * Move a key from one keyring to another:: + + long keyctl(KEYCTL_MOVE, + key_serial_t id, + key_serial_t from_ring_id, + key_serial_t to_ring_id, + unsigned int flags); + + Move the key specified by "id" from the keyring specified by + "from_ring_id" to the keyring specified by "to_ring_id". If the two + keyrings are the same, nothing is done. + + "flags" can have KEYCTL_MOVE_EXCL set in it to cause the operation to fail + with EEXIST if a matching key exists in the destination keyring, otherwise + such a key will be replaced. + + A process must have link permission on the key for this function to be + successful and write permission on both keyrings. Any errors that can + occur from KEYCTL_LINK also apply on the destination keyring here. 
+ + * Unlink a key or keyring from another keyring:: long keyctl(KEYCTL_UNLINK, key_serial_t keyring, key_serial_t key); diff --git a/include/linux/key.h b/include/linux/key.h index 1f09aad1c98c..612e1cf84049 100644 --- a/include/linux/key.h +++ b/include/linux/key.h @@ -310,6 +310,11 @@ extern int key_update(key_ref_t key, extern int key_link(struct key *keyring, struct key *key); +extern int key_move(struct key *key, + struct key *from_keyring, + struct key *to_keyring, + unsigned int flags); + extern int key_unlink(struct key *keyring, struct key *key); diff --git a/include/uapi/linux/keyctl.h b/include/uapi/linux/keyctl.h index f45ee0f69c0c..fd9fb11b312b 100644 --- a/include/uapi/linux/keyctl.h +++ b/include/uapi/linux/keyctl.h @@ -67,6 +67,7 @@ #define KEYCTL_PKEY_SIGN 27 /* Create a public key signature */ #define KEYCTL_PKEY_VERIFY 28 /* Verify a public key signature */ #define KEYCTL_RESTRICT_KEYRING 29 /* Restrict keys allowed to link to a keyring */ +#define KEYCTL_MOVE 30 /* Move keys between keyrings */ /* keyctl structures */ struct keyctl_dh_params { @@ -112,4 +113,6 @@ struct keyctl_pkey_params { __u32 __spare[7]; }; +#define KEYCTL_MOVE_EXCL 0x00000001 /* Do not displace from the to-keyring */ + #endif /* _LINUX_KEYCTL_H */ diff --git a/security/keys/compat.c b/security/keys/compat.c index 9482df601dc3..b326bc4f84d7 100644 --- a/security/keys/compat.c +++ b/security/keys/compat.c @@ -159,6 +159,9 @@ COMPAT_SYSCALL_DEFINE5(keyctl, u32, option, return keyctl_pkey_verify(compat_ptr(arg2), compat_ptr(arg3), compat_ptr(arg4), compat_ptr(arg5)); + case KEYCTL_MOVE: + return keyctl_keyring_move(arg2, arg3, arg4, arg5); + default: return -EOPNOTSUPP; } diff --git a/security/keys/internal.h b/security/keys/internal.h index 25cdd0cbdc06..b54a58c025ae 100644 --- a/security/keys/internal.h +++ b/security/keys/internal.h @@ -95,6 +95,8 @@ extern void key_type_put(struct key_type *ktype); extern int __key_link_lock(struct key *keyring, const struct 
keyring_index_key *index_key); +extern int __key_move_lock(struct key *l_keyring, struct key *u_keyring, + const struct keyring_index_key *index_key); extern int __key_link_begin(struct key *keyring, const struct keyring_index_key *index_key, struct assoc_array_edit **_edit); @@ -217,6 +219,7 @@ extern long keyctl_update_key(key_serial_t, const void __user *, size_t); extern long keyctl_revoke_key(key_serial_t); extern long keyctl_keyring_clear(key_serial_t); extern long keyctl_keyring_link(key_serial_t, key_serial_t); +extern long keyctl_keyring_move(key_serial_t, key_serial_t, key_serial_t, unsigned int); extern long keyctl_keyring_unlink(key_serial_t, key_serial_t); extern long keyctl_describe_key(key_serial_t, char __user *, size_t); extern long keyctl_keyring_search(key_serial_t, const char __user *, diff --git a/security/keys/keyctl.c b/security/keys/keyctl.c index 0f947bcbad46..bbfe7d92d41c 100644 --- a/security/keys/keyctl.c +++ b/security/keys/keyctl.c @@ -572,6 +572,52 @@ error: return ret; } +/* + * Move a link to a key from one keyring to another, displacing any matching + * key from the destination keyring. + * + * The key must grant the caller Link permission and both keyrings must grant + * the caller Write permission. There must also be a link in the from keyring + * to the key. If both keyrings are the same, nothing is done. + * + * If successful, 0 will be returned. 
+ */ +long keyctl_keyring_move(key_serial_t id, key_serial_t from_ringid, + key_serial_t to_ringid, unsigned int flags) +{ + key_ref_t key_ref, from_ref, to_ref; + long ret; + + if (flags & ~KEYCTL_MOVE_EXCL) + return -EINVAL; + + key_ref = lookup_user_key(id, KEY_LOOKUP_CREATE, KEY_NEED_LINK); + if (IS_ERR(key_ref)) + return PTR_ERR(key_ref); + + from_ref = lookup_user_key(from_ringid, 0, KEY_NEED_WRITE); + if (IS_ERR(from_ref)) { + ret = PTR_ERR(from_ref); + goto error2; + } + + to_ref = lookup_user_key(to_ringid, KEY_LOOKUP_CREATE, KEY_NEED_WRITE); + if (IS_ERR(to_ref)) { + ret = PTR_ERR(to_ref); + goto error3; + } + + ret = key_move(key_ref_to_ptr(key_ref), key_ref_to_ptr(from_ref), + key_ref_to_ptr(to_ref), flags); + + key_ref_put(to_ref); +error3: + key_ref_put(from_ref); +error2: + key_ref_put(key_ref); + return ret; +} + /* * Return a description of a key to userspace. * @@ -1772,6 +1818,12 @@ SYSCALL_DEFINE5(keyctl, int, option, unsigned long, arg2, unsigned long, arg3, (const void __user *)arg4, (const void __user *)arg5); + case KEYCTL_MOVE: + return keyctl_keyring_move((key_serial_t)arg2, + (key_serial_t)arg3, + (key_serial_t)arg4, + (unsigned int)arg5); + default: return -EOPNOTSUPP; } diff --git a/security/keys/keyring.c b/security/keys/keyring.c index 12acad3db6cf..67066bb58b83 100644 --- a/security/keys/keyring.c +++ b/security/keys/keyring.c @@ -1221,6 +1221,40 @@ int __key_link_lock(struct key *keyring, return 0; } +/* + * Lock keyrings for move (link/unlink combination). 
+ */ +int __key_move_lock(struct key *l_keyring, struct key *u_keyring, + const struct keyring_index_key *index_key) + __acquires(&l_keyring->sem) + __acquires(&u_keyring->sem) + __acquires(&keyring_serialise_link_lock) +{ + if (l_keyring->type != &key_type_keyring || + u_keyring->type != &key_type_keyring) + return -ENOTDIR; + + /* We have to be very careful here to take the keyring locks in the + * right order, lest we open ourselves to deadlocking against another + * move operation. + */ + if (l_keyring < u_keyring) { + down_write(&l_keyring->sem); + down_write_nested(&u_keyring->sem, 1); + } else { + down_write(&u_keyring->sem); + down_write_nested(&l_keyring->sem, 1); + } + + /* Serialise link/link calls to prevent parallel calls causing a cycle + * when linking two keyring in opposite orders. + */ + if (index_key->type == &key_type_keyring) + mutex_lock(&keyring_serialise_link_lock); + + return 0; +} + /* * Preallocate memory so that a key can be linked into to a keyring. */ @@ -1494,6 +1528,80 @@ int key_unlink(struct key *keyring, struct key *key) } EXPORT_SYMBOL(key_unlink); +/** + * key_move - Move a key from one keyring to another + * @key: The key to move + * @from_keyring: The keyring to remove the link from. + * @to_keyring: The keyring to make the link in. + * @flags: Qualifying flags, such as KEYCTL_MOVE_EXCL. + * + * Make a link in @to_keyring to a key, such that the keyring holds a reference + * on that key and the key can potentially be found by searching that keyring + * whilst simultaneously removing a link to the key from @from_keyring. + * + * This function will write-lock both keyring's semaphores and will consume + * some of the user's key data quota to hold the link on @to_keyring. 
+ * + * Returns 0 if successful, -ENOTDIR if either keyring isn't a keyring, + * -EKEYREVOKED if either keyring has been revoked, -ENFILE if the second + * keyring is full, -EDQUOT if there is insufficient key data quota remaining + * to add another link or -ENOMEM if there's insufficient memory. If + * KEYCTL_MOVE_EXCL is set, then -EEXIST will be returned if there's already a + * matching key in @to_keyring. + * + * It is assumed that the caller has checked that it is permitted for a link to + * be made (the keyring should have Write permission and the key Link + * permission). + */ +int key_move(struct key *key, + struct key *from_keyring, + struct key *to_keyring, + unsigned int flags) +{ + struct assoc_array_edit *from_edit = NULL, *to_edit = NULL; + int ret; + + kenter("%d,%d,%d", key->serial, from_keyring->serial, to_keyring->serial); + + if (from_keyring == to_keyring) + return 0; + + key_check(key); + key_check(from_keyring); + key_check(to_keyring); + + ret = __key_move_lock(from_keyring, to_keyring, &key->index_key); + if (ret < 0) + goto out; + ret = __key_unlink_begin(from_keyring, key, &from_edit); + if (ret < 0) + goto error; + ret = __key_link_begin(to_keyring, &key->index_key, &to_edit); + if (ret < 0) + goto error; + + ret = -EEXIST; + if (to_edit->dead_leaf && (flags & KEYCTL_MOVE_EXCL)) + goto error; + + ret = __key_link_check_restriction(to_keyring, key); + if (ret < 0) + goto error; + ret = __key_link_check_live_key(to_keyring, key); + if (ret < 0) + goto error; + + __key_unlink(from_keyring, key, &from_edit); + __key_link(key, &to_edit); +error: + __key_link_end(to_keyring, &key->index_key, to_edit); + __key_unlink_end(from_keyring, key, from_edit); +out: + kleave(" = %d", ret); + return ret; +} +EXPORT_SYMBOL(key_move); + /** * keyring_clear - Clear a keyring * @keyring: The keyring to clear. 
-- cgit v1.2.3-71-gd317 From 9c3c0c2048149d946d7f3ebdcbe70e2946750bfb Mon Sep 17 00:00:00 2001 From: Arnd Bergmann Date: Thu, 18 Apr 2019 22:43:36 +0200 Subject: isdn: remove isdn4linux With all isdn4linux hardware drivers gone, this is only a wrapper around CAPI to support old user space. However, from looking at the mailing list, it seems that the last time anyone asked about it was in 2014, when the upgrade from a linux-2.4 installation failed, and mISDN was suggested as a replacement. The largest public ISDN network (Deutsche Telekom) was supposed to be shut down 2018, which must have drastically reduced the number of legacy installations. When we last discussed removing i4l in 2016, Karsten Keil suggested revisiting this in 2018. I guess this is overdue. Link: http://listserv.isdn4linux.de/pipermail/isdn4linux/2014-October/006165.html Link: https://patchwork.kernel.org/patch/8484861/#17900371 Link: https://listserv.isdn4linux.de/pipermail/isdn4linux/2019-April/thread.html Signed-off-by: Arnd Bergmann --- Documentation/isdn/INTERFACE | 759 ------- Documentation/isdn/INTERFACE.fax | 163 -- Documentation/isdn/README | 599 ------ Documentation/isdn/README.FAQ | 26 - Documentation/isdn/README.audio | 138 -- Documentation/isdn/README.concap | 259 --- Documentation/isdn/README.diversion | 127 -- Documentation/isdn/README.fax | 45 - Documentation/isdn/README.hfc-pci | 41 - Documentation/isdn/README.syncppp | 58 - Documentation/isdn/README.x25 | 184 -- Documentation/isdn/syncPPP.FAQ | 224 --- Documentation/process/changes.rst | 16 +- MAINTAINERS | 2 - drivers/isdn/Kconfig | 26 - drivers/isdn/Makefile | 2 - drivers/isdn/capi/Kconfig | 9 - drivers/isdn/capi/capidrv.c | 2525 ----------------------- drivers/isdn/capi/capidrv.h | 140 -- drivers/isdn/divert/Makefile | 10 - drivers/isdn/divert/divert_init.c | 82 - drivers/isdn/divert/divert_procfs.c | 336 ---- drivers/isdn/divert/isdn_divert.c | 846 -------- drivers/isdn/divert/isdn_divert.h | 132 -- drivers/isdn/i4l/Kconfig 
| 127 -- drivers/isdn/i4l/Makefile | 14 - drivers/isdn/i4l/isdn_audio.c | 711 ------- drivers/isdn/i4l/isdn_audio.h | 44 - drivers/isdn/i4l/isdn_bsdcomp.c | 930 --------- drivers/isdn/i4l/isdn_common.c | 2368 ---------------------- drivers/isdn/i4l/isdn_common.h | 47 - drivers/isdn/i4l/isdn_concap.c | 99 - drivers/isdn/i4l/isdn_concap.h | 11 - drivers/isdn/i4l/isdn_net.c | 3198 ----------------------------- drivers/isdn/i4l/isdn_net.h | 151 -- drivers/isdn/i4l/isdn_ppp.c | 3046 ---------------------------- drivers/isdn/i4l/isdn_ppp.h | 41 - drivers/isdn/i4l/isdn_tty.c | 3756 ----------------------------------- drivers/isdn/i4l/isdn_tty.h | 120 -- drivers/isdn/i4l/isdn_ttyfax.c | 1123 ----------- drivers/isdn/i4l/isdn_ttyfax.h | 17 - drivers/isdn/i4l/isdn_v110.c | 625 ------ drivers/isdn/i4l/isdn_v110.h | 29 - drivers/isdn/i4l/isdn_x25iface.c | 332 ---- drivers/isdn/i4l/isdn_x25iface.h | 30 - drivers/isdn/isdnloop/Makefile | 6 - drivers/isdn/isdnloop/isdnloop.c | 1528 -------------- drivers/isdn/isdnloop/isdnloop.h | 112 -- include/linux/concap.h | 112 -- include/linux/isdn.h | 473 ----- include/linux/isdn_divertif.h | 35 - include/linux/isdn_ppp.h | 194 -- include/linux/isdnif.h | 505 ----- include/linux/wanrouter.h | 11 - include/uapi/linux/isdn.h | 144 -- include/uapi/linux/isdn_divertif.h | 31 - include/uapi/linux/isdn_ppp.h | 68 - include/uapi/linux/isdnif.h | 57 - include/uapi/linux/wanrouter.h | 18 - 59 files changed, 2 insertions(+), 26860 deletions(-) delete mode 100644 Documentation/isdn/INTERFACE delete mode 100644 Documentation/isdn/INTERFACE.fax delete mode 100644 Documentation/isdn/README delete mode 100644 Documentation/isdn/README.FAQ delete mode 100644 Documentation/isdn/README.audio delete mode 100644 Documentation/isdn/README.concap delete mode 100644 Documentation/isdn/README.diversion delete mode 100644 Documentation/isdn/README.fax delete mode 100644 Documentation/isdn/README.hfc-pci delete mode 100644 Documentation/isdn/README.syncppp delete 
mode 100644 Documentation/isdn/README.x25 delete mode 100644 Documentation/isdn/syncPPP.FAQ delete mode 100644 drivers/isdn/capi/capidrv.c delete mode 100644 drivers/isdn/capi/capidrv.h delete mode 100644 drivers/isdn/divert/Makefile delete mode 100644 drivers/isdn/divert/divert_init.c delete mode 100644 drivers/isdn/divert/divert_procfs.c delete mode 100644 drivers/isdn/divert/isdn_divert.c delete mode 100644 drivers/isdn/divert/isdn_divert.h delete mode 100644 drivers/isdn/i4l/Kconfig delete mode 100644 drivers/isdn/i4l/isdn_audio.c delete mode 100644 drivers/isdn/i4l/isdn_audio.h delete mode 100644 drivers/isdn/i4l/isdn_bsdcomp.c delete mode 100644 drivers/isdn/i4l/isdn_common.c delete mode 100644 drivers/isdn/i4l/isdn_common.h delete mode 100644 drivers/isdn/i4l/isdn_concap.c delete mode 100644 drivers/isdn/i4l/isdn_concap.h delete mode 100644 drivers/isdn/i4l/isdn_net.c delete mode 100644 drivers/isdn/i4l/isdn_net.h delete mode 100644 drivers/isdn/i4l/isdn_ppp.c delete mode 100644 drivers/isdn/i4l/isdn_ppp.h delete mode 100644 drivers/isdn/i4l/isdn_tty.c delete mode 100644 drivers/isdn/i4l/isdn_tty.h delete mode 100644 drivers/isdn/i4l/isdn_ttyfax.c delete mode 100644 drivers/isdn/i4l/isdn_ttyfax.h delete mode 100644 drivers/isdn/i4l/isdn_v110.c delete mode 100644 drivers/isdn/i4l/isdn_v110.h delete mode 100644 drivers/isdn/i4l/isdn_x25iface.c delete mode 100644 drivers/isdn/i4l/isdn_x25iface.h delete mode 100644 drivers/isdn/isdnloop/Makefile delete mode 100644 drivers/isdn/isdnloop/isdnloop.c delete mode 100644 drivers/isdn/isdnloop/isdnloop.h delete mode 100644 include/linux/concap.h delete mode 100644 include/linux/isdn.h delete mode 100644 include/linux/isdn_divertif.h delete mode 100644 include/linux/isdn_ppp.h delete mode 100644 include/linux/isdnif.h delete mode 100644 include/linux/wanrouter.h delete mode 100644 include/uapi/linux/isdn.h delete mode 100644 include/uapi/linux/isdn_divertif.h delete mode 100644 include/uapi/linux/isdn_ppp.h delete mode 
100644 include/uapi/linux/isdnif.h delete mode 100644 include/uapi/linux/wanrouter.h (limited to 'include/uapi/linux') diff --git a/Documentation/isdn/INTERFACE b/Documentation/isdn/INTERFACE deleted file mode 100644 index 5df17e5b25c8..000000000000 --- a/Documentation/isdn/INTERFACE +++ /dev/null @@ -1,759 +0,0 @@ -$Id: INTERFACE,v 1.15.8.2 2001/03/13 16:17:07 kai Exp $ - -Description of the Interface between Linklevel and Hardwarelevel - of isdn4linux: - - - The Communication between Linklevel (LL) and Hardwarelevel (HL) - is based on the struct isdn_if (defined in isdnif.h). - - An HL-driver can register itself at LL by calling the function - register_isdn() with a pointer to that struct. Prior to that, it has - to preset some of the fields of isdn_if. The LL sets the rest of - the fields. All further communication is done via callbacks using - the function-pointers defined in isdn_if. - - Changes/Version numbering: - - During development of the ISDN subsystem, several changes have been - made to the interface. Before it went into kernel, the package - had a unique version number. The last version, distributed separately - was 0.7.4. When the subsystem went into kernel, every functional unit - got a separate version number. These numbers are shown at initialization, - separated by slashes: - - c.c/t.t/n.n/p.p/a.a/v.v - - where - - c.c is the revision of the common code. - t.t is the revision of the tty related code. - n.n is the revision of the network related code. - p.p is the revision of the ppp related code. - a.a is the revision of the audio related code. - v.v is the revision of the V.110 related code. - - Changes in this document are marked with '***CHANGEx' where x representing - the version number. If that number starts with 0, it refers to the old, - separately distributed package. If it starts with one of the letters - above, it refers to the revision of the corresponding module. - ***CHANGEIx refers to the revision number of the isdnif.h - -1. 
Description of the fields of isdn_if: - - int channels; - - This field has to be set by the HL-driver to the number of channels - supported prior to calling register_isdn(). Upon return of the call, - the LL puts an id there, which has to be used by the HL-driver when - invoking the other callbacks. - - int maxbufsize; - - ***CHANGE0.6: New since this version. - - Also to be preset by the HL-driver. With this value the HL-driver - tells the LL the maximum size of a data-packet it will accept. - - unsigned long features; - - To be preset by the HL-driver. Using this field, the HL-driver - announces the features supported. At the moment this is limited to - report the supported layer2 and layer3-protocols. For setting this - field the constants ISDN_FEATURE..., declared in isdnif.h have to be - used. - - ***CHANGE0.7.1: The line type (1TR6, EDSS1) has to be set. - - unsigned short hl_hdrlen; - - ***CHANGE0.7.4: New field. - - To be preset by the HL-driver, if it supports sk_buff's. The driver - should put here the amount of additional space needed in sk_buff's for - its internal purposes. Drivers not supporting sk_buff's should - initialize this field to 0. - - void (*rcvcallb_skb)(int, int, struct sk_buff *) - - ***CHANGE0.7.4: New field. - - This field will be set by LL. The HL-driver delivers received data- - packets by calling this function. Upon calling, the HL-driver must - already have its private data pulled off the head of the sk_buff. - - Parameter: - int driver-Id - int Channel-number locally to the driver. (starting with 0) - struct sk_buff * Pointer to sk_buff, containing received data. - - int (*statcallb)(isdn_ctrl*); - - This field will be set by LL. This function has to be called by the - HL-driver for signaling status-changes or other events to the LL. - - Parameter: - isdn_ctrl* - - The struct isdn_ctrl also defined in isdn_if. The exact meanings of its - fields are described together with the descriptions of the possible - events. 
Here is only a short description of the fields: - - driver = driver Id. - command = event-type. (one of the constants ISDN_STAT_...) - arg = depends on event-type. - num = depends on event-type. - - Returnvalue: - 0 on success, else -1 - - int (*command)(isdn_ctrl*); - - This field has to be preset by the HL-driver. It points to a function, - to be called by LL to perform functions like dialing, B-channel - setup, etc. The exact meaning of the parameters is described with the - descriptions of the possible commands. - - Parameter: - isdn_ctrl* - driver = driver-Id - command = command to perform. (one of the constants ISDN_CMD_...) - arg = depends on command. - num = depends on command. - - Returnvalue: - >=0 on success, else error-code (-ENODEV etc.) - - int (*writebuf_skb)(int, int, int, struct sk_buff *) - - ***CHANGE0.7.4: New field. - ***CHANGEI.1.21: New field. - - This field has to be preset by the HL-driver. The given function will - be called by the LL for delivering data to be sent via B-Channel. - - - Parameter: - int driver-Id ***CHANGE0.7.4: New parameter. - int channel-number locally to the HL-driver. (starts with 0) - int ack ***ChangeI1.21: New parameter - If this is !0, the driver has to signal the delivery - by sending an ISDN_STAT_BSENT. If this is 0, the driver - MUST NOT send an ISDN_STAT_BSENT. - struct sk_buff * Pointer to sk_buff containing data to be sent via - B-channel. - - Returnvalue: - Length of data accepted on success, else error-code (-EINVAL on - oversized packets etc.) - - int (*writecmd)(u_char*, int, int, int, int); - - This field has to be preset by the HL-driver. The given function will be - called to perform write-requests on /dev/isdnctrl (i.e. sending commands - to the card) The data-format is hardware-specific. This function is - intended for debugging only. It is not necessary for normal operation - and never will be called by the tty-emulation- or network-code.
If - this function is not supported, the driver has to set NULL here. - - Parameter: - u_char* pointer to data. - int length of data. - int flag: 0 = call from within kernel-space. (HL-driver must use - memcpy, may NOT use schedule()) - 1 = call from user-space. (HL-driver must use - memcpy_fromfs, use of schedule() allowed) - int driver-Id. - int channel-number locally to the HL-driver. (starts with 0) - -***CHANGEI1.14: The driver-Id and channel-number are new since this revision. - - Returnvalue: - Length of data accepted on success, else error-code (-EINVAL etc.) - - int (*readstat)(u_char*, int, int, int, int); - - This field has to be preset by the HL-driver. The given function will be - called to perform read-requests on /dev/isdnctrl (i.e. reading replies - from the card) The data-format is hardware-specific. This function is - intended for debugging only. It is not necessary for normal operation - and never will be called by the tty-emulation- or network-code. If - this function is not supported, the driver has to set NULL here. - - Parameter: - u_char* pointer to data. - int length of data. - int flag: 0 = call from within kernel-space. (HL-driver must use - memcpy, may NOT use schedule()) - 1 = call from user-space. (HL-driver must use - memcpy_fromfs, use of schedule() allowed) - int driver-Id. - int channel-number locally to the HL-driver. (starts with 0) - -***CHANGEI1.14: The driver-Id and channel-number are new since this revision. - - Returnvalue: - Length of data on success, else error-code (-EINVAL etc.) - - char id[20]; - ***CHANGE0.7: New since this version. - - This string has to be preset by the HL-driver. Its purpose is for - identification of the driver by the user. Eg.: it is shown in the - status-info of /dev/isdninfo. Furthermore it is used as Id for binding - net-interfaces to a specific channel. If a string of length zero is - given, upon return, isdn4linux will replace it by a generic name. (line0, - line1 etc.) 
It is recommended to make this string configurable during - module-load-time. (copy a global variable to this string.) For doing that, - modules 1.2.8 or newer are necessary. - -2. Description of the commands, a HL-driver has to support: - - All commands will be performed by calling the function command() described - above from within the LL. The field command of the struct-parameter will - contain the desired command, the field driver is always set to the - appropriate driver-Id. - - Until now, the following commands are defined: - -***CHANGEI1.34: The parameter "num" has been replaced by a union "parm" containing - the old "num" and a new setup_type struct used for ISDN_CMD_DIAL - and ISDN_STAT_ICALL callback. - - ISDN_CMD_IOCTL: - - This command is intended for performing ioctl-calls for configuring - hardware or similar purposes (setting port-addresses, loading firmware - etc.) For this purpose, in the LL all ioctl-calls with an argument - >= IIOCDRVCTL (0x100) will be handed transparently to this - function after subtracting 0x100 and placing the result in arg. - Example: - If a userlevel-program calls ioctl(0x101,...) the function gets - called with the field arg set to 1. - - Parameter: - driver = driver-Id. - command = ISDN_CMD_IOCTL - arg = Original ioctl-cmd - IIOCDRVCTL - parm.num = first bytes filled with (unsigned long)arg - - Returnvalue: - Depending on driver. - - - ISDN_CMD_DIAL: - - This command is used to tell the HL-driver it should dial a given - number. - - Parameter: - driver = driver-Id. - command = ISDN_CMD_DIAL - arg = channel-number locally to the driver. (starting with 0) - - parm.setup.phone = An ASCII-String containing the number to dial. - parm.setup.eazmsn = An ASCII-String containing the own EAZ or MSN. - parm.setup.si1 = The Service-Indicator. - parm.setup.si2 = Additional Service-Indicator. - - If the Line has been designed as SPV (a special german - feature, meaning semi-leased-line) the phone has to - start with an "S".
- ***CHANGE0.6: In previous versions the EAZ has been given in the - highbyte of arg. - ***CHANGE0.7.1: New since this version: ServiceIndicator and AddInfo. - - ISDN_CMD_ACCEPTD: - - With this command, the HL-driver is told to accept a D-Channel-setup. - (Response to an incoming call) - - Parameter: - driver = driver-Id. - command = ISDN_CMD_ACCEPTD - arg = channel-number locally to the driver. (starting with 0) - parm = unused. - - ISDN_CMD_ACCEPTB: - - With this command, the HL-driver is told to perform a B-Channel-setup. - (after establishing D-Channel-Connection) - - Parameter: - driver = driver-Id. - command = ISDN_CMD_ACCEPTB - arg = channel-number locally to the driver. (starting with 0) - parm = unused. - - ISDN_CMD_HANGUP: - - With this command, the HL-driver is told to hangup (B-Channel if - established first, then D-Channel). This command is also used for - actively rejecting an incoming call. - - Parameter: - driver = driver-Id. - command = ISDN_CMD_HANGUP - arg = channel-number locally to the driver. (starting with 0) - parm = unused. - - ISDN_CMD_CLREAZ: - - With this command, the HL-driver is told not to signal incoming - calls to the LL. - - Parameter: - driver = driver-Id. - command = ISDN_CMD_CLREAZ - arg = channel-number locally to the driver. (starting with 0) - parm = unused. - - ISDN_CMD_SETEAZ: - - With this command, the HL-driver is told to signal incoming calls for - the given EAZs/MSNs to the LL. - - Parameter: - driver = driver-Id. - command = ISDN_CMD_SETEAZ - arg = channel-number locally to the driver. (starting with 0) - parm.num = ASCII-String, containing the desired EAZ's/MSN's - (comma-separated). If an empty String is given, the - HL-driver should respond to ALL incoming calls, - regardless of the destination-address. - ***CHANGE0.6: New since this version the "empty-string"-feature. - - ISDN_CMD_GETEAZ: (currently unused) - - With this command, the HL-driver is told to report the current setting - given with ISDN_CMD_SETEAZ. 
- - Parameter: - driver = driver-Id. - command = ISDN_CMD_GETEAZ - arg = channel-number locally to the driver. (starting with 0) - parm.num = ASCII-String, containing the current EAZ's/MSN's - - ISDN_CMD_SETSIL: (currently unused) - - With this command, the HL-driver is told to signal only incoming - calls with the given Service-Indicators. - - Parameter: - driver = driver-Id. - command = ISDN_CMD_SETSIL - arg = channel-number locally to the driver. (starting with 0) - parm.num = ASCII-String, containing the desired Service-Indicators. - - ISDN_CMD_GETSIL: (currently unused) - - With this command, the HL-driver is told to return the current - Service-Indicators it will respond to. - - Parameter: - driver = driver-Id. - command = ISDN_CMD_GETSIL - arg = channel-number locally to the driver. (starting with 0) - parm.num = ASCII-String, containing the current Service-Indicators. - - ISDN_CMD_SETL2: - - With this command, the HL-driver is told to select the given Layer-2- - protocol. This command is issued by the LL prior to ISDN_CMD_DIAL or - ISDN_CMD_ACCEPTD. - - - Parameter: - driver = driver-Id. - command = ISDN_CMD_SETL2 - arg = channel-number locally to the driver. (starting with 0) - logical or'ed with (protocol-Id << 8) - protocol-Id is one of the constants ISDN_PROTO_L2... - parm = unused. - - ISDN_CMD_GETL2: (currently unused) - - With this command, the HL-driver is told to return the current - setting of the Layer-2-protocol. - - Parameter: - driver = driver-Id. - command = ISDN_CMD_GETL2 - arg = channel-number locally to the driver. (starting with 0) - parm = unused. - Returnvalue: - current protocol-Id (one of the constants ISDN_PROTO_L2...) - - ISDN_CMD_SETL3: - - With this command, the HL-driver is told to select the given Layer-3- - protocol. This command is issued by the LL prior to ISDN_CMD_DIAL or - ISDN_CMD_ACCEPTD. - - - Parameter: - driver = driver-Id. - command = ISDN_CMD_SETL3 - arg = channel-number locally to the driver.
(starting with 0) - logical or'ed with (protocol-Id << 8) - protocol-Id is one of the constants ISDN_PROTO_L3... - parm.fax = Pointer to T30_s fax struct. (fax usage only) - - ISDN_CMD_GETL3: (currently unused) - - With this command, the HL-driver is told to return the current - setting of the Layer-3-protocol. - - Parameter: - driver = driver-Id. - command = ISDN_CMD_GETL3 - arg = channel-number locally to the driver. (starting with 0) - parm = unused. - Returnvalue: - current protocol-Id (one of the constants ISDN_PROTO_L3...) - - ISDN_CMD_PROCEED: - - With this command, the HL-driver is told to proceed with an incoming call. - - Parameter: - driver = driver-Id. - command = ISDN_CMD_PROCEED - arg = channel-number locally to the driver. (starting with 0) - setup.eazmsn= empty string or string sent as uus1 in DSS1 with - PROCEED message - - ISDN_CMD_ALERT: - - With this command, the HL-driver is told to alert a proceeding call. - - Parameter: - driver = driver-Id. - command = ISDN_CMD_ALERT - arg = channel-number locally to the driver. (starting with 0) - setup.eazmsn= empty string or string sent as uus1 in DSS1 with - ALERT message - - ISDN_CMD_REDIR: - - With this command, the HL-driver is told to redirect a call in proceeding - or alerting state. - - Parameter: - driver = driver-Id. - command = ISDN_CMD_REDIR - arg = channel-number locally to the driver. (starting with 0) - setup.eazmsn= empty string or string sent as uus1 in DSS1 protocol - setup.screen= screening indicator - setup.phone = redirected to party number - - ISDN_CMD_PROT_IO: - - With this call, the LL-driver invokes protocol specific features through - the LL. - The call is not implicitly bound to a connection. - - Parameter: - driver = driver-Id - command = ISDN_CMD_PROT_IO - arg = The lower 8 Bits define the addressed protocol as defined - in ISDN_PTYPE..., the upper bits are used to differentiate - the protocol specific CMD. - - para = protocol and function specific. See isdnif.h for detail.
- - - ISDN_CMD_FAXCMD: - - With this command the HL-driver receives a fax sub-command. - For details refer to INTERFACE.fax - - Parameter: - driver = driver-Id. - command = ISDN_CMD_FAXCMD - arg = channel-number locally to the driver. (starting with 0) - parm = unused. - - -3. Description of the events to be signaled by the HL-driver to the LL. - - All status-changes are signaled via calling the previously described - function statcallb(). The field command of the struct isdn_ctrl has - to be set by the HL-driver with the appropriate Status-Id (event-number). - The field arg has to be set to the channel-number (locally to the driver, - starting with 0) to which this event applies. (Exception: STAVAIL-event) - - Until now, the following Status-Ids are defined: - - ISDN_STAT_STAVAIL: - - With this call, the HL-driver signals the availability of new data - for readstat(). Used only for debugging-purposes, see description - of readstat(). - - Parameter: - driver = driver-Id - command = ISDN_STAT_STAVAIL - arg = length of available data. - parm = unused. - - ISDN_STAT_ICALL: - ISDN_STAT_ICALLW: - - With this call, the HL-driver signals an incoming call to the LL. - If ICALLW is signalled the incoming call is a waiting call without - an available B-chan. - - Parameter: - driver = driver-Id - command = ISDN_STAT_ICALL - arg = channel-number, locally to the driver. (starting with 0) - para.setup.phone = Callernumber. - para.setup.eazmsn = CalledNumber. - para.setup.si1 = Service Indicator. - para.setup.si2 = Additional Service Indicator. - para.setup.plan = octet 3 from Calling party number Information Element. - para.setup.screen = octet 3a from Calling party number Information Element. - - Return: - 0 = No device matching this call. - 1 = At least one device matching this call (RING on ttyI). - HL-driver may send ALERTING on the D-channel in this case. - 2 = Call will be rejected. - 3 = Incoming called party number is currently incomplete. - Additional digits are required.
- Used for signalling with PtP connections. - 4 = Call will be held in a proceeding state - (HL driver sends PROCEEDING) - Used when a user space prog needs time to interpret a call - para.setup.eazmsn may be filled with an uus1 message of - 30 octets maximum. Empty string if no uus. - 5 = Call will be actively deflected to another party - Only available in DSS1/EURO protocol - para.setup.phone must be set to destination party number - para.setup.eazmsn may be filled with an uus1 message of - 30 octets maximum. Empty string if no uus. - -1 = An error happened. (Invalid parameters for example.) - The keypad support now is included in the dial command. - - - ISDN_STAT_RUN: - - With this call, the HL-driver signals availability of the ISDN-card. - (after initializing, loading firmware) - - Parameter: - driver = driver-Id - command = ISDN_STAT_RUN - arg = unused. - parm = unused. - - ISDN_STAT_STOP: - - With this call, the HL-driver signals unavailability of the ISDN-card. - (before unloading, while resetting/reconfiguring the card) - - Parameter: - driver = driver-Id - command = ISDN_STAT_STOP - arg = unused. - parm = unused. - - ISDN_STAT_DCONN: - - With this call, the HL-driver signals the successful establishment of - a D-Channel-connection. (Response to ISDN_CMD_ACCEPTD or ISDN_CMD_DIAL) - - Parameter: - driver = driver-Id - command = ISDN_STAT_DCONN - arg = channel-number, locally to the driver. (starting with 0) - parm = unused. - - ISDN_STAT_BCONN: - - With this call, the HL-driver signals the successful establishment of - a B-Channel-connection. (Response to ISDN_CMD_ACCEPTB or because the - remote-station has initiated establishment) - - The HL driver should call this when the logical l2/l3 protocol - connection on top of the physical B-channel is established. - - Parameter: - driver = driver-Id - command = ISDN_STAT_BCONN - arg = channel-number, locally to the driver. 
(starting with 0) - parm.num = ASCII-String, containing type of connection (for analog - modem only). This will be appended to the CONNECT message - e.g. 14400/V.32bis - - ISDN_STAT_DHUP: - - With this call, the HL-driver signals the shutdown of a - D-Channel-connection. This could be a response to a prior ISDN_CMD_HANGUP, - or caused by a remote-hangup or if the remote-station has actively - rejected a call. - - Parameter: - driver = driver-Id - command = ISDN_STAT_DHUP - arg = channel-number, locally to the driver. (starting with 0) - parm = unused. - - ISDN_STAT_BHUP: - - With this call, the HL-driver signals the shutdown of a - B-Channel-connection. This could be a response to a prior ISDN_CMD_HANGUP, - or caused by a remote-hangup. - - The HL driver should call this as soon as the logical l2/l3 protocol - connection on top of the physical B-channel is released. - - Parameter: - driver = driver-Id - command = ISDN_STAT_BHUP - arg = channel-number, locally to the driver. (starting with 0) - parm = unused. - - ISDN_STAT_CINF: - - With this call, the HL-driver delivers charge-unit information to the - LL. - - Parameter: - driver = driver-Id - command = ISDN_STAT_CINF - arg = channel-number, locally to the driver. (starting with 0) - parm.num = ASCII string containing charge-units (digits only). - - ISDN_STAT_LOAD: (currently unused) - - ISDN_STAT_UNLOAD: - - With this call, the HL-driver signals that it will be unloaded now. This - tells the LL to release all corresponding data-structures. - - Parameter: - driver = driver-Id - command = ISDN_STAT_UNLOAD - arg = unused. - parm = unused. - - ISDN_STAT_BSENT: - - With this call the HL-driver signals the delivery of a data-packet. - This callback is used by the network-interfaces only, tty-Emulation - does not need this call. - - Parameter: - driver = driver-Id - command = ISDN_STAT_BSENT - arg = channel-number, locally to the driver. (starting with 0) - parm.length = ***CHANGEI.1.21: New field. 
- the driver has to set this to the original length - of the skb at the time of receiving it from the linklevel. - - ISDN_STAT_NODCH: - - With this call, the driver has to respond to a prior ISDN_CMD_DIAL, if - no D-Channel is available. - - Parameter: - driver = driver-Id - command = ISDN_STAT_NODCH - arg = channel-number, locally to the driver. (starting with 0) - parm = unused. - - ISDN_STAT_ADDCH: - - This call is for HL-drivers, which are unable to check card-type - or numbers of supported channels before they have loaded any firmware - using ioctl. Those HL-drivers simply set the channel-parameter to a - minimum channel-number when registering, and later if they know - the real amount, perform this call, allocating additional channels. - - Parameter: - driver = driver-Id - command = ISDN_STAT_ADDCH - arg = number of channels to be added. - parm = unused. - - ISDN_STAT_CAUSE: - - With this call, the HL-driver delivers CAUSE-messages to the LL. - Currently the LL does not use these messages. Their content is simply - logged via kernel-messages. Therefore, currently the format of the - messages is completely free. However they should be printable. - - Parameter: - driver = driver-Id - command = ISDN_STAT_CAUSE - arg = channel-number, locally to the driver. (starting with 0) - parm.num = ASCII string containing CAUSE-message. - - ISDN_STAT_DISPLAY: - - With this call, the HL-driver delivers DISPLAY-messages to the LL. - Currently the LL does not use these messages. - - Parameter: - driver = driver-Id - command = ISDN_STAT_DISPLAY - arg = channel-number, locally to the driver. (starting with 0) - para.display= string containing DISPLAY-message. - - ISDN_STAT_PROT: - - With this call, the HL-driver delivers protocol specific infos to the LL. - The call is not implicitly bound to a connection.
- - Parameter: - driver = driver-Id - command = ISDN_STAT_PROT - arg = The lower 8 Bits define the addressed protocol as defined - in ISDN_PTYPE..., the upper bits are used to differentiate - the protocol specific STAT. - - para = protocol and function specific. See isdnif.h for detail. - - ISDN_STAT_DISCH: - - With this call, the HL-driver signals the LL to disable or enable the - use of supplied channel and driver. - The call may be used to reduce the available number of B-channels after - loading the driver. The LL has to ignore a disabled channel when searching - for free channels. The HL driver itself never delivers STAT callbacks for - disabled channels. - The LL returns a nonzero code if the operation was not successful or the - selected channel is actually regarded as busy. - - Parameter: - driver = driver-Id - command = ISDN_STAT_DISCH - arg = channel-number, locally to the driver. (starting with 0) - parm.num[0] = 0 if channel shall be disabled, else enabled. - - ISDN_STAT_L1ERR: - - ***CHANGEI1.21 new status message. - A signal can be sent to the linklevel if an Layer1-error results in - packet-loss on receive or send. The field errcode of the cmd.parm - union describes the error more precisely. - - Parameter: - driver = driver-Id - command = ISDN_STAT_L1ERR - arg = channel-number, locally to the driver. (starting with 0) - parm.errcode= ISDN_STAT_L1ERR_SEND: Packet lost while sending. - ISDN_STAT_L1ERR_RECV: Packet lost while receiving. - ISDN_STAT_FAXIND: - - With this call the HL-driver signals a fax sub-command to the LL. - For details refer to INTERFACE.fax - - Parameter: - driver = driver-Id. - command = ISDN_STAT_FAXIND - arg = channel-number, locally to the driver. (starting with 0) - parm = unused. 
- diff --git a/Documentation/isdn/INTERFACE.fax b/Documentation/isdn/INTERFACE.fax deleted file mode 100644 index 9c8c6d914ec7..000000000000 --- a/Documentation/isdn/INTERFACE.fax +++ /dev/null @@ -1,163 +0,0 @@ -$Id: INTERFACE.fax,v 1.2 2000/08/06 09:22:50 armin Exp $ - - -Description of the fax-subinterface between linklevel and hardwarelevel of - isdn4linux. - - The communication between linklevel (LL) and hardwarelevel (HL) for fax - is based on the struct T30_s (defined in isdnif.h). - This struct is allocated in the LL. - In order to use fax, the LL provides the pointer to this struct with the - command ISDN_CMD_SETL3 (parm.fax). This pointer expires in case of hangup - and when a new channel to a new connection is assigned. - - -Data handling: - In send-mode the HL-driver has to handle the codes and the bit-order - conversion by itself. - In receive-mode the LL-driver takes care of the bit-order conversion - (specified by +FBOR) - -Structure T30_s description: - - This structure stores the values (set by AT-commands), the remote- - capability-values and the command-codes between LL and HL. - - If the HL-driver receives ISDN_CMD_FAXCMD, all needed information - is in this struct set by the LL. - To signal information to the LL, the HL-driver has to set the - parameters and use ISDN_STAT_FAXIND. - (Please refer to INTERFACE) - -Structure T30_s: - - All members are 8-bit unsigned (__u8) - - - resolution - - rate - - width - - length - - compression - - ecm - - binary - - scantime - - id[] - Local faxmachine's parameters, set by +FDIS, +FDCS, +FLID, ... - - - r_resolution - - r_rate - - r_width - - r_length - - r_compression - - r_ecm - - r_binary - - r_scantime - - r_id[] - Remote faxmachine's parameters. To be set by HL-driver. - - - phase - Defines the actual state of fax connection. Set by HL or LL - depending on progress and type of connection. - If the phase changes because of an AT command, the LL driver - changes this value. 
Otherwise the HL-driver takes care of it, but - only necessary on call establishment (from IDLE to PHASE_A). - (one of the constants ISDN_FAX_PHASE_[IDLE,A,B,C,D,E]) - - - direction - Defines outgoing/send or incoming/receive connection. - (ISDN_TTY_FAX_CONN_[IN,OUT]) - - - code - Commands from LL to HL; possible constants : - ISDN_TTY_FAX_DR signals +FDR command to HL - - ISDN_TTY_FAX_DT signals +FDT command to HL - - ISDN_TTY_FAX_ET signals +FET command to HL - - - Other than that the "code" is set with the hangup-code value at - the end of connection for the +FHNG message. - - - r_code - Commands from HL to LL; possible constants : - ISDN_TTY_FAX_CFR output of +FCFR message. - - ISDN_TTY_FAX_RID output of remote ID set in r_id[] - (+FCSI/+FTSI on send/receive) - - ISDN_TTY_FAX_DCS output of +FDCS and CONNECT message, - switching to phase C. - - ISDN_TTY_FAX_ET signals end of data, - switching to phase D. - - ISDN_TTY_FAX_FCON signals the established, outgoing connection, - switching to phase B. - - ISDN_TTY_FAX_FCON_I signals the established, incoming connection, - switching to phase B. - - ISDN_TTY_FAX_DIS output of +FDIS message and values. - - ISDN_TTY_FAX_SENT signals that all data has been sent - and is acknowledged, - OK message will be sent. - - ISDN_TTY_FAX_PTS signals a msg-confirmation (page sent successful), - depending on fet value: - 0: output OK message (more pages follow) - 1: switching to phase B (next document) - - ISDN_TTY_FAX_TRAIN_OK output of +FDCS and OK message (for receive mode). - - ISDN_TTY_FAX_EOP signals end of data in receive mode, - switching to phase D. - - ISDN_TTY_FAX_HNG output of the +FHNG and value set by code and - OK message, switching to phase E. - - - - badlin - Value of +FBADLIN - - - badmul - Value of +FBADMUL - - - bor - Value of +FBOR - - - fet - Value of +FET command in send-mode. - Set by HL in receive-mode for +FET message. 
- - - pollid[] - ID-string, set by +FCIG - - - cq - Value of +FCQ - - - cr - Value of +FCR - - - ctcrty - Value of +FCTCRTY - - - minsp - Value of +FMINSP - - - phcto - Value of +FPHCTO - - - rel - Value of +FREL - - - nbc - Value of +FNBC (0,1) - (+FNBC is not a known class 2 fax command, I added this to change the - automatic "best capabilities" connection in the eicon HL-driver) - - -Armin -mac@melware.de - diff --git a/Documentation/isdn/README b/Documentation/isdn/README deleted file mode 100644 index 74bd2bdb455b..000000000000 --- a/Documentation/isdn/README +++ /dev/null @@ -1,599 +0,0 @@ -README for the ISDN-subsystem - -1. Preface - - 1.1 Introduction - - This README describes how to set up and how to use the different parts - of the ISDN-subsystem. - - For using the ISDN-subsystem, some additional userlevel programs are - necessary. Those programs and some contributed utilities are available - at - - ftp.isdn4linux.de - - /pub/isdn4linux/isdn4k-utils-.tar.gz - - - We also have set up a mailing-list: - - The isdn4linux-project originates in Germany, and therefore by historical - reasons, the mailing-list's primary language is german. However mails - written in english have been welcome all the time. - - to subscribe: write a email to majordomo@listserv.isdn4linux.de, - Subject irrelevant, in the message body: - subscribe isdn4linux - - To write to the mailing-list, write to isdn4linux@listserv.isdn4linux.de - - This mailinglist is bidirectionally gated to the newsgroup - - de.alt.comm.isdn4linux - - There is also a well maintained FAQ in English available at - https://www.mhessler.de/i4lfaq/ - It can be viewed online, or downloaded in sgml/text/html format. - The FAQ can also be viewed online at - https://www.isdn4linux.de/faq/i4lfaq.html - or downloaded from - ftp://ftp.isdn4linux.de/pub/isdn4linux/FAQ/ - - 1.1 Technical details - - In the following Text, the terms MSN and EAZ are used. 
- - MSN is the abbreviation for (M)ultiple(S)ubscriber(N)umber, and applies - to Euro(EDSS1)-type lines. Usually it is simply the phone number. - - EAZ is the abbreviation of (E)ndgeraete(A)uswahl(Z)iffer and - applies to German 1TR6-type lines. This is a one-digit string, - simply appended to the base phone number - - The internal handling is nearly identical, so replace the appropriate - term to that one, which applies to your local ISDN-environment. - - When the link-level-module isdn.o is loaded, it supports up to 16 - low-level-modules with up to 64 channels. (The number 64 is arbitrarily - chosen and can be configured at compile-time --ISDN_MAX in isdn.h). - A low-level-driver can register itself through an interface (which is - defined in isdnif.h) and gets assigned a slot. - The following char-devices are made available for each channel: - - A raw-control-device with the following functions: - write: raw D-channel-messages (format: depends on driver). - read: raw D-channel-messages (format: depends on driver). - ioctl: depends on driver, i.e. for the ICN-driver, the base-address of - the ports and the shared memory on the card can be set and read - also the boot-code and the protocol software can be loaded into - the card. - - O N L Y !!! for debugging (no locking against other devices): - One raw-data-device with the following functions: - write: data to B-channel. - read: data from B-channel. - - In addition the following devices are made available: - - 128 tty-devices (64 cuix and 64 ttyIx) with integrated modem-emulator: - The functionality is almost the same as that of a serial device - (the line-discs are handled by the kernel), which lets you run - SLIP, CSLIP and asynchronous PPP through the devices. We have tested - Seyon, minicom, CSLIP (uri-dip) PPP, mgetty, XCept and Hylafax. - - The modem-emulation supports the following: - 1.3.1 Commands: - - ATA Answer incoming call. 
- ATD Dial, the number may contain: - [0-9] and [,#.*WPT-S] - the latter are ignored until 'S'. - The 'S' must precede the number, if - the line is a SPV (German 1TR6). - ATE0 Echo off. - ATE1 Echo on (default). - ATH Hang-up. - ATH1 Off hook (ignored). - ATH0 Hang-up. - ATI Return "ISDN for Linux...". - ATI0 " - ATI1 " - ATI2 Report of last connection. - ATO On line (data mode). - ATQ0 Enable result codes (default). - ATQ1 Disable result codes. - ATSx=y Set register x to y. - ATSx? Show contents of register x. - ATV0 Numeric responses. - ATV1 English responses (default). - ATZ Load registers and EAZ/MSN from Profile. - AT&Bx Set Send-Packet-size to x (max. 4000) - The real packet-size may be limited by the - low-level-driver used. e.g. the HiSax-Module- - limit is 2000. You will get NO Error-Message, - if you set it to higher values, because at the - time of giving this command the corresponding - driver may not be selected (see "Automatic - Assignment") however the size of outgoing packets - will be limited correctly. - AT&D0 Ignore DTR - AT&D2 DTR-low-edge: Hang up and return to - command mode (default). - AT&D3 Same as AT&D2 but also resets all registers. - AT&Ex Set the EAZ/MSN for this channel to x. - AT&F Reset all registers and profile to "factory-defaults" - AT&Lx Set list of phone numbers to listen on. x is a - list of wildcard patterns separated by semicolon. - If this is set, it has precedence over the MSN set - by AT&E. - AT&Rx Select V.110 bitrate adaption. - This command enables V.110 protocol with 9600 baud - (x=9600), 19200 baud (x=19200) or 38400 baud - (x=38400). A value of x=0 disables V.110 switching - back to default X.75.
This command sets the following - Registers: - Reg 14 (Layer-2 protocol): - x = 0: 0 - x = 9600: 7 - x = 19200: 8 - x = 38400: 9 - Reg 18.2 = 1 - Reg 19 (Additional Service Indicator): - x = 0: 0 - x = 9600: 197 - x = 19200: 199 - x = 38400: 198 - Note on value in Reg 19: - There is _NO_ common convention for 38400 baud. - The value 198 is chosen arbitrarily. Users - _MUST_ negotiate this value before establishing - a connection. - AT&Sx Set window-size (x = 1..8) (not yet implemented) - AT&V Show all settings. - AT&W0 Write registers and EAZ/MSN to profile. See also - iprofd (5.c in this README). - AT&X0 BTX-mode and T.70-mode off (default) - AT&X1 BTX-mode on. (S13.1=1, S13.5=0 S14=0, S16=7, S18=7, S19=0) - AT&X2 T.70-mode on. (S13.1=1, S13.5=1, S14=0, S16=7, S18=7, S19=0) - AT+Rx Resume a suspended call with CallID x (x = 1,2,3...) - AT+Sx Suspend a call with CallID x (x = 1,2,3...) - - For voice-mode commands refer to README.audio - - 1.3.2 Escape sequence: - During a connection, the emulation reacts just like - a normal modem to the escape sequence +++. - (The escape character - default '+' - can be set in the - register 2). - The DELAY must at least be 1.5 seconds long and delay - between the escape characters must not exceed 0.5 seconds. - - 1.3.3 Registers: - - Nr. Default Description - 0 0 Answer on ring number. - (no auto-answer if S0=0). - 1 0 Count of rings. - 2 43 Escape character. - (a value >= 128 disables the escape sequence). - 3 13 Carriage return character (ASCII). - 4 10 Line feed character (ASCII). - 5 8 Backspace character (ASCII). - 6 3 Delay in seconds before dialing. - 7 60 Wait for carrier. - 8 2 Pause time for comma (ignored) - 9 6 Carrier detect time (ignored) - 10 7 Carrier loss to disconnect time (ignored). - 11 70 Touch tone timing (ignored). - 12 69 Bit coded register: - Bit 0: 0 = Suppress response messages. - 1 = Show response messages. - Bit 1: 0 = English response messages. - 1 = Numeric response messages. - Bit 2: 0 = Echo off. 
- 1 = Echo on. - Bit 3 0 = DCD always on. - 1 = DCD follows carrier. - Bit 4 0 = CTS follows RTS - 1 = Ignore RTS, CTS always on. - Bit 5 0 = return to command mode on DTR low. - 1 = Same as 0 but also resets all - registers. - See also register 13, bit 2 - Bit 6 0 = DSR always on. - 1 = DSR only on if channel is available. - Bit 7 0 = Cisco-PPP-flag-hack off (default). - 1 = Cisco-PPP-flag-hack on. - 13 0 Bit coded register: - Bit 0: 0 = Use delayed tty-send-algorithm - 1 = Direct tty-send. - Bit 1: 0 = T.70 protocol (Only for BTX!) off - 1 = T.70 protocol (Only for BTX!) on - Bit 2: 0 = Don't hangup on DTR low. - 1 = Hangup on DTR low. - Bit 3: 0 = Standard response messages - 1 = Extended response messages - Bit 4: 0 = CALLER NUMBER before every RING. - 1 = CALLER NUMBER after first RING. - Bit 5: 0 = T.70 extended protocol off - 1 = T.70 extended protocol on - Bit 6: 0 = Special RUNG Message off - 1 = Special RUNG Message on - "RUNG" is delivered on a ttyI, if - an incoming call happened (RING) and - the remote party hung up before any - local ATA was given. - Bit 7: 0 = Don't show display messages from net - 1 = Show display messages from net - (S12 Bit 1 must be 0 too) - 14 0 Layer-2 protocol: - 0 = X75/LAPB with I-frames - 1 = X75/LAPB with UI-frames - 2 = X75/LAPB with BUI-frames - 3 = HDLC - 4 = Transparent (audio) - 7 = V.110, 9600 baud - 8 = V.110, 19200 baud - 9 = V.110, 38400 baud - 10 = Analog Modem (only if hardware supports this) - 11 = Fax G3 (only if hardware supports this) - 15 0 Layer-3 protocol: - 0 = transparent - 1 = transparent with audio features (e.g. DSP) - 2 = Fax G3 Class 2 commands (S14 has to be set to 11) - 3 = Fax G3 Class 1 commands (S14 has to be set to 11) - 16 250 Send-Packet-size/16 - 17 8 Window-size (not yet implemented) - 18 4 Bit coded register, Service-Octet-1 to accept, - or to be used on dialout: - Bit 0: Service 1 (audio) when set. - Bit 1: Service 5 (BTX) when set. - Bit 2: Service 7 (data) when set. 
- Note: It is possible to set more than one - bit. In this case, on incoming calls - the selected services are accepted, - and if the service is "audio", the - Layer-2-protocol is automatically - changed to 4 regardless of the setting - of register 14. On outgoing calls, - the most significant 1-bit is chosen to - select the outgoing service octet. - 19 0 Service-Octet-2 - 20 0 Bit coded register (readonly) - Service-Octet-1 of last call. - Bit mapping is the same as register 18 - 21 0 Bit coded register (readonly) - Set on incoming call (during RING) to - octet 3 of calling party number IE (Numbering plan) - See section 4.5.10 of ITU Q.931 - 22 0 Bit coded register (readonly) - Set on incoming call (during RING) to - octet 3a of calling party number IE (Screening info) - See section 4.5.10 of ITU Q.931 - 23 0 Bit coded register: - Bit 0: 0 = Add CPN to RING message off - 1 = Add CPN to RING message on - Bit 1: 0 = Add CPN to FCON message off - 1 = Add CPN to FCON message on - Bit 2: 0 = Add CDN to RING/FCON message off - 1 = Add CDN to RING/FCON message on - - Last but not least a (at the moment fairly primitive) device to request - the line-status (/dev/isdninfo) is made available. - - Automatic assignment of devices to lines: - - All inactive physical lines are listening to all EAZs for incoming - calls and are NOT assigned to a specific tty or network interface. - When an incoming call is detected, the driver looks first for a network - interface and then for an opened tty which: - - 1. is configured for the same EAZ. - 2. has the same protocol settings for the B-channel. - 3. (only for network interfaces if the security flag is set) - contains the caller number in its access list. - 4. Either the channel is not bound exclusively to another Net-interface, or - it is bound AND the other checks apply to exactly this interface. 
- (For usage of the bind-features, refer to the isdnctrl-man-page) - - Only when a matching interface or tty is found is the call accepted - and the "connection" between the low-level-layer and the link-level-layer - is established and kept until the end of the connection. - In all other cases no connection is established. Isdn4linux can be - configured to either do NOTHING in this case (which is useful, if - other, external devices with the same EAZ/MSN are connected to the bus) - or to reject the call actively. (isdnctrl busreject ...) - - For an outgoing call, the inactive physical lines are searched. - The call is placed on the first physical line, which supports the - requested protocols for the B-channel. If a net-interface, however - is pre-bound to a channel, this channel is used directly. - - This makes it possible to configure several network interfaces and ttys - for one EAZ, if the network interfaces are set to secure operation. - If an incoming call matches one network interface, it gets connected to it. - If another incoming call for the same EAZ arrives, which does not match - a network interface, the first tty gets a "RING" and so on. - -2 System prerequisites: - - ATTENTION! - - Always use the latest module utilities. The current version is - named in Documentation/Changes. Some old versions of insmod - are not capable of setting the driver-Ids correctly. - -3. Lowlevel-driver configuration. - - Configuration depends on how the drivers are built. See the - README. for information on driver-specific setup. - -4. Device-inodes - - The major and minor numbers and their names are described in - Documentation/admin-guide/devices.rst. The major numbers are: - - 43 for the ISDN-tty's. - 44 for the ISDN-callout-tty's. - 45 for control/info/debug devices. - -5. Application - - a) For some card-types, firmware has to be loaded into the cards, before - proceeding with device-independent setup. See README. - for how to do that. 
- - b) If you only intend to use ttys, you are nearly ready now. - - c) If you want to have really permanent "Modem"-settings on disk, you - can start the daemon iprofd. Give it a path to a file at the command- - line. It will store the profile-settings in this file every time - an AT&W0 is performed on any ISDN-tty. If the file already exists, - all profiles are initialized from this file. If you want to unload - any of the modules, kill iprofd first. - - d) For networking, continue: Create an interface: - isdnctrl addif isdn0 - - e) Set the EAZ (or MSN for Euro-ISDN): - isdnctrl eaz isdn0 2 - - (For 1TR6 a single digit is allowed, for Euro-ISDN the number is your - real MSN e.g.: Phone-Number) - - f) Set the number for outgoing calls on the interface: - isdnctrl addphone isdn0 out 1234567 - ... (this can be executed more than once, all assigned numbers are - tried in order) - and the number(s) for incoming calls: - isdnctrl addphone isdn0 in 1234567 - - g) Set the timeout for hang-up: - isdnctrl huptimeout isdn0 - - h) additionally you may activate charge-hang-up (= Hang up before - next charge-info, this only works, if your isdn-provider transmits - the charge-info during and after the connection): - isdnctrl chargehup isdn0 on - - i) Set the dial mode of the interface: - isdnctrl dialmode isdn0 auto - "off" means that you (or the system) cannot make any connection - (neither incoming nor outgoing connections are possible). Use - this if you want to be sure that no connections will be made. - "auto" means that the interface is in auto-dial mode, and will - attempt to make a connection whenever a network data packet needs - the interface's link. Note that this can cause unexpected dialouts, - and lead to a high phone bill! Some daemons or other pc's that use - this interface can cause this. - Incoming connections are also possible. - "manual" is a dial mode created to prevent the unexpected dialouts.
- In this mode, the interface will never make any connections on its - own. You must explicitly initiate a connection with "isdnctrl dial - isdn0". However, after an idle time of no traffic as configured for - the huptimeout value with isdnctrl, the connection _will_ be ended. - If you don't want any automatic hangup, set the huptimeout value to 0. - "manual" is the default. - - j) Setup the interface with ifconfig as usual, and set a route to it. - - k) (optional) If you run X11 and have Tcl/Tk-wish version 4.0, you can use - the script tools/tcltk/isdnmon. You can add actions for line-status - changes. See the comments at the beginning of the script for how to - do that. There are other tty-based tools in the tools-subdirectory - contributed by Michael Knigge (imon), Volker Götz (imontty) and - Andreas Kool (isdnmon). - - l) For initial testing, you can set the verbose-level to 2 (default: 0). - Then all incoming calls are logged, even if they are not addressed - to one of the configured net-interfaces: - isdnctrl verbose 2 - - Now you are ready! A ping to the set address should now result in an - automatic dial-out (look at syslog kernel-messages). - The phone numbers and EAZs can be assigned at any time with isdnctrl. - You can add as many interfaces as you like with addif following the - directions above. Of course, there may be some limitations. But we have - tested as many as 20 interfaces without any problem. However, if you - don't give an interface name to addif, the kernel will assign a name - which starts with "eth". The number of "eth"-interfaces is limited by - the kernel. - -5. Additional options for isdnctrl: - - "isdnctrl secure on" - Only incoming calls, for which the caller-id is listed in the access - list of the interface are accepted. You can add caller-id's with the - command "isdnctrl addphone in " - Euro-ISDN does not transmit the leading '0' of the caller-id for an - incoming call, therefore you should configure it accordingly.
- If the real number for the dialout e.g. is "09311234567" the number - to configure here is "9311234567". The pattern-match function - works similar to the shell mechanism. - - ? one arbitrary digit - * zero or arbitrary many digits - [123] one of the digits in the list - [1-5] one digit between '1' and '5' - a '^' as the first character in a list inverts the list - - - "isdnctrl secure off" - Switch off secure operation (default). - - "isdnctrl ihup [on|off]" - Switch the hang-up-timer for incoming calls on or off. - - "isdnctrl eaz " - Returns the EAZ of an interface. - - "isdnctrl delphone in|out " - Deletes a number from one of the access-lists of the interface. - - "isdnctrl delif " - Removes the interface (and possible slaves) from the kernel. - (You have to unregister it with "ifconfig down" before). - - "isdnctrl callback [on|off]" - Switches an interface to callback-mode. In this mode, an incoming call - will be rejected and after this the remote-station will be called. If - you test this feature by using ping, some routers will re-dial very - quickly, so that the callback from isdn4linux may not be recognized. - In this case use ping with the option -i to increase the interval - between echo-packets. - - "isdnctrl cbdelay [seconds]" - Sets the delay (default 5 sec) between an incoming call and start of - dialing when callback is enabled. - - "isdnctrl cbhup [on|off]" - This enables (default) or disables an active hangup (reject) when getting an - incoming call for an interface which is configured for callback. - - "isdnctrl encap " - Selects the type of packet-encapsulation. The encapsulation can be changed - only while an interface is down. - - At the moment the following values are supported: - - rawip (Default) Selects raw-IP-encapsulation. This means, MAC-headers - are stripped off. - ip IP with type-field. Same as IP but the type-field of the MAC-header - is preserved. 
- x25iface X.25 interface encapsulation (first byte semantics as defined in - ../networking/x25-iface.txt). Use this for running the linux - X.25 network protocol stack (AF_X25 sockets) on top of isdn. - cisco-h A special-mode for communicating with a Cisco, which is configured - to do "hdlc" - ethernet No stripping. Packets are sent with full MAC-header. - The Ethernet-address of the interface is faked, from its - IP-address: fc:fc:i1:i2:i3:i4, where i1-4 are the IP-addr.-values. - syncppp Synchronous PPP - - uihdlc HDLC with UI-frame-header (for use with DOS ISPA, option -h1) - - - NOTE: x25iface encapsulation is currently experimental. Please - read README.x25 for further details - - - Watching packets, using standard-tcpdump will fail for all encapsulations - except ethernet because tcpdump does not know how to handle packets - without MAC-header. A patch for tcpdump is included in the utility-package - mentioned above. - - "isdnctrl l2_prot " - Selects a layer-2-protocol. - (With the ICN-driver and the HiSax-driver, "x75i" and "hdlc" is available. - With other drivers, "x75ui", "x75bui", "x25dte", "x25dce" may be - possible too. See README.x25 for x25 related l2 protocols.) - - isdnctrl l3_prot - The same for layer-3. (At the moment only "trans" is allowed) - - "isdnctrl list " - Shows all parameters of an interface and the charge-info. - Try "all" as the interface name. - - "isdnctrl hangup " - Forces hangup of an interface. - - "isdnctrl bind , [exclusive]" - If you are using more than one ISDN card, it is sometimes necessary to - dial out using a specific card or even preserve a specific channel for - dialout of a specific net-interface. This can be done with the above - command. Replace by whatever you assigned while loading the - module. The is counted from zero. The upper limit - depends on the card used. At the moment no card supports more than - 2 channels, so the upper limit is one. - - "isdnctrl unbind " - unbinds a previously bound interface. 
- - "isdnctrl busreject on|off" - If switched on, isdn4linux replies a REJECT to incoming calls, it - cannot match to any configured interface. - If switched off, nothing happens in this case. - You normally should NOT enable this feature, if the ISDN adapter is not - the only device connected to the S0-bus. Otherwise it could happen that - isdn4linux rejects an incoming call, which belongs to another device on - the bus. - - "isdnctrl addslave - Creates a slave interface for channel-bundling. Slave interfaces are - not seen by the kernel, but their ISDN-part can be configured with - isdnctrl as usual. (Phone numbers, EAZ/MSN, timeouts etc.) If more - than two channels are to be bundled, feel free to create as many as you - want. InterfaceName must be a real interface, NOT a slave. Slave interfaces - start dialing, if the master interface resp. the previous slave interface - has a load of more than 7000 cps. They hangup if the load goes under 7000 - cps, according to their "huptimeout"-parameter. - - "isdnctrl sdelay secs." - This sets the minimum time an Interface has to be fully loaded, until - it sends a dial-request to its slave. - - "isdnctrl dial " - Forces an interface to start dialing even if no packets are to be - transferred. - - "isdnctrl mapping MSN0,MSN1,MSN2,...MSN9" - This installs a mapping table for EAZ<->MSN-mapping for a single line. - Missing MSN's have to be given as "-" or can be omitted, if at the end - of the commandline. - With this command, it's now possible to have an interface listening to - mixed 1TR6- and Euro-Type lines. In this case, the interface has to be - configured to a 1TR6-type EAZ (one digit). The mapping is also valid - for tty-emulation. Seen from the interface/tty-level the mapping - CAN be used, however it's possible to use single tty's/interfaces with - real MSN's (more digits) also, in which case the mapping will be ignored. - Here is an example: - - You have a 1TR6-type line with base-nr. 
1234567 and a Euro-line with - MSN's 987654, 987655 and 987656. The DriverId for the Euro-line is "EURO". - - isdnctrl mapping EURO -,987654,987655,987656,-,987655 - ... - isdnctrl eaz isdn0 1 # listen on 12345671(1tr6) and 987654(euro) - ... - isdnctrl eaz isdn1 4 # listen on 12345674(1tr6) only. - ... - isdnctrl eaz isdn2 987654 # listen on 987654(euro) only. - - Same scheme is used with AT&E... at the tty's. - -6. If you want to write a new low-level-driver, you are welcome. - The interface to the link-level-module is described in the file INTERFACE. - If the interface should be expanded for any reason, don't do it - on your own, send me a mail containing the proposed changes and - some reasoning about them. - If other drivers will not be affected, I will include the changes - in the next release. - For developers only, there is a second mailing-list. Write to me - (fritz@isdn4linux.de), if you want to join that list. - -Have fun! - - -Fritz - diff --git a/Documentation/isdn/README.FAQ b/Documentation/isdn/README.FAQ deleted file mode 100644 index e5dd1addacdd..000000000000 --- a/Documentation/isdn/README.FAQ +++ /dev/null @@ -1,26 +0,0 @@ - -The FAQ for isdn4linux -====================== - -Please note that there is a big FAQ available in the isdn4k-utils. -You find it in: - isdn4k-utils/FAQ/i4lfaq.sgml - -In case you just want to see the FAQ online, or download the newest version, -you can have a look at my website: -https://www.mhessler.de/i4lfaq/ (view + download) -or: -https://www.isdn4linux.de/faq/4lfaq.html (view) - -As the extension tells, the FAQ is in SGML format, and you can convert it -into text/html/... format by using the sgml2txt/sgml2html/... tools. -Alternatively, you can also do a 'configure; make all' in the FAQ directory. - - -Please have a look at the FAQ before posting anything in the Mailinglist, -or the newsgroup! 
- - -Matthias Hessler -hessler@isdn4linux.de - diff --git a/Documentation/isdn/README.audio b/Documentation/isdn/README.audio deleted file mode 100644 index 8ebca19290d9..000000000000 --- a/Documentation/isdn/README.audio +++ /dev/null @@ -1,138 +0,0 @@ -$Id: README.audio,v 1.8 1999/07/11 17:17:29 armin Exp $ - -ISDN subsystem for Linux. - Description of audio mode. - -When enabled during kernel configuration, the tty emulator of the ISDN -subsystem is capable of a reduced set of commands to support audio. -This document describes the commands supported and the format of -audio data. - -Commands for enabling/disabling audio mode: - - AT+FCLASS=8 Enable audio mode. - This affects the following registers: - S18: Bits 0 and 2 are set. - S16: Set to 48 and any further change to - larger values is blocked. - AT+FCLASS=0 Disable audio mode. - Register 18 is set to 4. - AT+FCLASS=? Show possible modes. - AT+FCLASS? Report current mode (0 or 8). - -Commands supported in audio mode: - -All audio mode commands have one of the following forms: - - AT+Vxx? Show current setting. - AT+Vxx=? Show possible settings. - AT+Vxx=v Set simple parameter. - AT+Vxx=v,v ... Set complex parameter. - -where xx is a two-character code and v are alphanumerical parameters. -The following commands are supported: - - AT+VNH=x Auto hangup setting. NO EFFECT, supported - for compatibility only. - AT+VNH? Always reporting "1" - AT+VNH=? Always reporting "1" - - AT+VIP Reset all audio parameters. - - AT+VLS=x Line select. x is one of the following: - 0 = No device. - 2 = Phone line. - AT+VLS=? Always reporting "0,2" - AT+VLS? Show current line. - - AT+VRX Start recording. Emulator responds with - CONNECT and starts sending audio data to - the application. See below for data format - - AT+VSD=x,y Set silence-detection parameters. - Possible parameters: - x = 0 ... 31 sensitivity threshold level. - (default 0 , deactivated) - y = 0 ... 255 range of interval in units - of 0.1 second. 
(default 70) - AT+VSD=? Report possible parameters. - AT+VSD? Show current parameters. - - AT+VDD=x,y Set DTMF-detection parameters. - Only possible if online and during this connection. - Possible parameters: - x = 0 ... 15 sensitivity threshold level. - (default 0 , I4L soft-decode) - (1-15 soft-decode off, hardware on) - y = 0 ... 255 tone duration in units of 5ms. - Not for I4L soft decode (default 8, 40ms) - AT+VDD=? Report possible parameters. - AT+VDD? Show current parameters. - - AT+VSM=x Select audio data format. - Possible parameters: - 2 = ADPCM-2 - 3 = ADPCM-3 - 4 = ADPCM-4 - 5 = aLAW - 6 = uLAW - AT+VSM=? Show possible audio formats. - - AT+VTX Start audio playback. Emulator responds - with CONNECT and starts sending audio data - received from the application via phone line. -General behavior and description of data formats/protocol. - when a connection is made: - - On incoming calls, if the application responds to a RING - with ATA, depending on the calling service, the emulator - responds with either CONNECT (data call) or VCON (voice call). - - On outgoing voice calls, the emulator responds with VCON - upon connection setup. - - Audio recording. - - When receiving audio data, a kind of bisync protocol is used. - Upon AT+VRX command, the emulator responds with CONNECT, and - starts sending audio data to the application. There are several - escape sequences defined, all using DLE (0x10) as Escape char: - - End of audio data. (i.e. caused by a - hangup of the remote side) Emulator stops - recording, responding with VCON. - Abort recording, (send by appl.) Emulator - stops recording, sends DLE,ETX. - Escape sequence for DLE in data stream. - 0 Touchtone "0" received. - ... - 9 Touchtone "9" received. - # Touchtone "#" received. - * Touchtone "*" received. - A Touchtone "A" received. - B Touchtone "B" received. - C Touchtone "C" received. - D Touchtone "D" received. - - q quiet. Silence detected after non-silence. - s silence. 
Silence detected from the - start of recording. - - Currently unsupported DLE sequences: - - c FAX calling tone received. - b busy tone received. - - Audio playback. - - When sending audio data, upon AT+VTX command, emulator responds with - CONNECT, and starts transferring data from application to the phone line. - The same DLE sequences apply to this mode. - - Full-Duplex-Audio: - - When _both_ commands for recording and playback are given in _one_ - AT-command-line (i.e.: "AT+VTX+VRX"), full-duplex-mode is selected. - In this mode, the only way to stop recording is sending <DLE><DC4> and the - only way to stop playback is to send <DLE><ETX>. - diff --git a/Documentation/isdn/README.concap b/Documentation/isdn/README.concap deleted file mode 100644 index a76d74845a4c..000000000000 --- a/Documentation/isdn/README.concap +++ /dev/null @@ -1,259 +0,0 @@ -Description of the "concap" encapsulation protocol interface -============================================================ - -The "concap" interface is intended to be used by network device -drivers that need to process an encapsulation protocol. -It is assumed that the protocol interacts with a linux network device by -- data transmission -- connection control (establish, release) -Thus, the mnemonic: "CONnection CONtrolling eNCAPsulation Protocol". - -This is currently only used inside the isdn subsystem. But it might -also be useful to other kinds of network devices. Thus, if you want -to suggest changes that improve usability or performance of the -interface, please let me know. I'm willing to include them in future -releases (even if I needed to adapt the current isdn code to the -changed interface). - - -Why is this useful? -=================== - -The encapsulation protocol used on top of WAN connections or permanent -point-to-point links are frequently chosen upon bilateral agreement. -Thus, a device driver for a certain type of hardware must support -several different encapsulation protocols at once.
- -The isdn device driver did already support several different -encapsulation protocols. The encapsulation protocol is configured by a -user space utility (isdnctrl). The isdn network interface code then -uses several case statements which select appropriate actions -depending on the currently configured encapsulation protocol. - -In contrast, LAN network interfaces always used a single encapsulation -protocol which is unique to the hardware type of the interface. The LAN -encapsulation is usually done by just sticking a header on the data. Thus, -traditional linux network device drivers used to process the -encapsulation protocol directly (usually by just providing a hard_header() -method in the device structure) using some hardware type specific support -functions. This is simple, direct and efficient. But it doesn't fit all -the requirements for complex WAN encapsulations. - - - The configurability of the encapsulation protocol to be used - makes isdn network interfaces more flexible, but also much more - complex than traditional lan network interfaces. - - -Many Encapsulation protocols used on top of WAN connections will not just -stick a header on the data. They also might need to set up or release -the WAN connection. They also might want to send other data for their -private purpose over the wire, e.g. ppp does a lot of link level -negotiation before the first piece of user data can be transmitted. -Such encapsulation protocols for WAN devices are typically more complex -than encapsulation protocols for lan devices. Thus, network interface -code for typical WAN devices also tends to be more complex. - - -In order to support Linux' x25 PLP implementation on top of -isdn network interfaces I could have introduced yet another branch to -the various case statements inside drivers/isdn/isdn_net.c. -This eventually made isdn_net.c even more complex. In addition, it made -isdn_net.c harder to maintain. 
Thus, by identifying an abstract -interface between the network interface code and the encapsulation -protocol, complexity could be reduced and maintainability could be -increased. - - -Likewise, a similar encapsulation protocol will frequently be needed by -several different interfaces of even different hardware type, e.g. the -synchronous ppp implementation used by the isdn driver and the -asynchronous ppp implementation used by the ppp driver have a lot of -similar code in them. By cleanly separating the encapsulation protocol -from the hardware specific interface stuff such code could be shared -better in future. - - -When operating over dial-up-connections (e.g. telephone lines via modem, -non-permanent virtual circuits of wide area networks, ISDN) many -encapsulation protocols will need to control the connection. Therefore, -some basic connection control primitives are supported. The type and -semantics of the connection (i.e the ISO layer where connection service -is provided) is outside our scope and might be different depending on -the encapsulation protocol used, e.g. for a ppp module using our service -on top of a modem connection a connect_request will result in dialing -a (somewhere else configured) remote phone number. For an X25-interface -module (LAPB semantics, as defined in Documentation/networking/x25-iface.txt) -a connect_request will ask for establishing a reliable lapb -datalink connection. - - -The encapsulation protocol currently provides the following -service primitives to the network device. - -- create a new encapsulation protocol instance -- delete encapsulation protocol instance and free all its resources -- initialize (open) the encapsulation protocol instance for use. -- deactivate (close) an encapsulation protocol instance. 
-- process (xmit) data handed down by upper protocol layer -- receive data from lower (hardware) layer -- process connect indication from lower (hardware) layer -- process disconnect indication from lower (hardware) layer - - -The network interface driver accesses those primitives via callbacks -provided by the encapsulation protocol instance within a -struct concap_proto_ops. - -struct concap_proto_ops{ - - /* create a new encapsulation protocol instance of same type */ - struct concap_proto * (*proto_new) (void); - - /* delete encapsulation protocol instance and free all its resources. - cprot may no longer be referenced after calling this */ - void (*proto_del)(struct concap_proto *cprot); - - /* initialize the protocol's data. To be called at interface startup - or when the device driver resets the interface. All services of the - encapsulation protocol may be used after this*/ - int (*restart)(struct concap_proto *cprot, - struct net_device *ndev, - struct concap_device_ops *dops); - - /* deactivate an encapsulation protocol instance. The encapsulation - protocol may not call any *dops methods after this. */ - int (*close)(struct concap_proto *cprot); - - /* process a frame handed down to us by upper layer */ - int (*encap_and_xmit)(struct concap_proto *cprot, struct sk_buff *skb); - - /* to be called for each data entity received from lower layer*/ - int (*data_ind)(struct concap_proto *cprot, struct sk_buff *skb); - - /* to be called when a connection was set up/down. - Protocols that don't process these primitives might fill in - dummy methods here */ - int (*connect_ind)(struct concap_proto *cprot); - int (*disconn_ind)(struct concap_proto *cprot); -}; - - -The data structures are defined in the header file include/linux/concap.h. 
- - -A Network interface using encapsulation protocols must also provide -some service primitives to the encapsulation protocol: - -- request data being submitted by lower layer (device hardware) -- request a connection being set up by lower layer -- request a connection being released by lower layer - -The encapsulation protocol accesses those primitives via callbacks -provided by the network interface within a struct concap_device_ops. - -struct concap_device_ops{ - - /* to request data be submitted by device */ - int (*data_req)(struct concap_proto *, struct sk_buff *); - - /* Control methods must be set to NULL by devices which do not - support connection control. */ - /* to request a connection be set up */ - int (*connect_req)(struct concap_proto *); - - /* to request a connection be released */ - int (*disconn_req)(struct concap_proto *); -}; - -The network interface does not explicitly provide a receive service -because the encapsulation protocol directly calls netif_rx(). - - - - -An encapsulation protocol itself is actually the -struct concap_proto{ - struct net_device *net_dev; /* net device using our service */ - struct concap_device_ops *dops; /* callbacks provided by device */ - struct concap_proto_ops *pops; /* callbacks provided by us */ - int flags; - void *proto_data; /* protocol specific private data, to - be accessed via *pops methods only*/ - /* - : - whatever - : - */ -}; - -Most of this is filled in when the device requests the protocol to -be reset (opened). The network interface must provide the net_dev and -dops pointers. Other concap_proto members should be considered private -data that are only accessed by the pops callback functions. Likewise, -a concap proto should access the network device's private data -only by means of the callbacks referred to by the dops pointer.
- - -A possible extended device structure which uses the connection controlling -encapsulation services could look like this: - -struct concap_device{ - struct net_device net_dev; - struct my_priv /* device->local stuff */ - /* the my_priv struct might contain a - struct concap_device_ops *dops; - to provide the device specific callbacks - */ - struct concap_proto *cprot; /* callbacks provided by protocol */ -}; - - - -Misc Thoughts -============= - -The concept of the concap proto might help to reuse protocol code and -reduce the complexity of certain network interface implementations. -The trade off is that it introduces yet another procedure call layer -when processing the protocol. This has of course some impact on -performance. However, typically the concap interface will be used by -devices attached to slow lines (like telephone, isdn, leased synchronous -lines). For such slow lines, the overhead is probably negligible. -This might no longer hold for certain high speed WAN links (like -ATM). - - -If general linux network interfaces explicitly supported concap -protocols (e.g. by a member struct concap_proto* in struct net_device) -then the interface of the service function could be changed -by passing a pointer of type (struct net_device*) instead of -type (struct concap_proto*). Doing so would make many of the service -functions compatible to network device support functions. - -e.g. instead of the concap protocol's service function - - int (*encap_and_xmit)(struct concap_proto *cprot, struct sk_buff *skb); - -we could have - - int (*encap_and_xmit)(struct net_device *ndev, struct sk_buff *skb); - -As this is compatible to the dev->hard_start_xmit() method, the device -driver could directly register the concap protocol's encap_and_xmit() -function as its hard_start_xmit() method. This would eliminate one -procedure call layer. 
- - -The device's data request function could also be defined as - - int (*data_req)(struct net_device *ndev, struct sk_buff *skb); - -This might even allow for some protocol stacking. And the network -interface might even register the same data_req() function directly -as its hard_start_xmit() method when a zero layer encapsulation -protocol is configured. Thus, eliminating the performance penalty -of the concap interface when a trivial concap protocol is used. -Nevertheless, the device remains able to support encapsulation -protocol configuration. - diff --git a/Documentation/isdn/README.diversion b/Documentation/isdn/README.diversion deleted file mode 100644 index bddcd5fb86ff..000000000000 --- a/Documentation/isdn/README.diversion +++ /dev/null @@ -1,127 +0,0 @@ -The isdn diversion services are a supporting module working together with -the isdn4linux and the HiSax module for passive cards. -Active cards, TAs and cards using a own or other driver than the HiSax -module need to be adapted to the HL<->LL interface described in a separate -document. The diversion services may be used with all cards supported by -the HiSax driver. -The diversion kernel interface and controlling tool divertctrl were written -by Werner Cornelius (werner@isdn4linux.de or werner@titro.de) under the -GNU General Public License. - - This program is free software; you can redistribute it and/or modify - it under the terms of the GNU General Public License as published by - the Free Software Foundation; either version 2 of the License, or - (at your option) any later version. - - This program is distributed in the hope that it will be useful, - but WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - GNU General Public License for more details. 
- - You should have received a copy of the GNU General Public License - along with this program; if not, write to the Free Software - Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. - -Table of contents -================= - -1. Features of the i4l diversion services - (Or what can the i4l diversion services do for me) - -2. Required hard- and software - -3. Compiling, installing and loading/unloading the module - Tracing calling and diversion information - -4. Tracing calling and diversion information - -5. Format of the divert device ASCII output - - -1. Features of the i4l diversion services - (Or what can the i4l diversion services do for me) - - The i4l diversion services offers call forwarding and logging normally - only supported by isdn phones. Incoming calls may be diverted - unconditionally (CFU), when not reachable (CFNR) or on busy condition - (CFB). - The diversions may be invoked statically in the providers exchange - as normally done by isdn phones. In this case all incoming calls - with a special (or all) service identifiers are forwarded if the - forwarding reason is met. Activated static services may also be - interrogated (queried). - The i4l diversion services additionally offers a dynamic version of - call forwarding which is not preprogrammed inside the providers exchange - but dynamically activated by i4l. - In this case all incoming calls are checked by rules that may be - compared to the mechanism of ipfwadm or ipchains. If a given rule matches - the checking process is finished and the rule matching will be applied - to the call. - The rules include primary and secondary service identifiers, called - number and subaddress, callers number and subaddress and whether the rule - matches to all filtered calls or only those when all B-channel resources - are exhausted. - Actions that may be invoked by a rule are ignore, proceed, reject, - direct divert or delayed divert of a call. 
- All incoming calls matching a rule except the ignore rule a reported and - logged as ASCII via the proc filesystem (/proc/net/isdn/divert). If proceed - is selected the call will be held in a proceeding state (without ringing) - for a certain amount of time to let an external program or client decide - how to handle the call. - - -2. Required hard- and software - - For using the i4l diversion services the isdn line must be of a EURO/DSS1 - type. Additionally the i4l services only work together with the HiSax - driver for passive isdn cards. All HiSax supported cards may be used for - the diversion purposes. - The static diversion services require the provider having static services - CFU, CFNR, CFB activated on an MSN-line. The static services may not be - used on a point-to-point connection. Further the static services are only - available in some countries (for example germany). Countries requiring the - keypad protocol for activating static diversions (like the netherlands) are - not supported but may use the tty devices for this purpose. - The dynamic diversion services may be used in all countries if the provider - enables the feature CF (call forwarding). This should work on both MSN- and - point-to-point lines. - To add and delete rules the additional divertctrl program is needed. This - program is part of the isdn4kutils package. - -3. Compiling, installing and loading/unloading the module - Tracing calling and diversion information - - - To compile the i4l code with diversion support you need to say yes to the - DSS1 diversion services when selecting the i4l options in the kernel - config (menuconfig or config). - After having properly activated a make modules and make modules_install all - required modules will be correctly installed in the needed modules dirs. - As the diversion services are currently not included in the scripts of most - standard distributions you will have to add a "insmod dss1_divert" after - having loaded the global isdn module. 
- The module can be loaded without any command line parameters. - If the module is actually loaded and active may be checked with a - "cat /proc/modules" or "ls /proc/net/isdn/divert". The divert file is - dynamically created by the diversion module and removed when the module is - unloaded. - - -4. Tracing calling and diversion information - - You also may put a "cat /proc/net/isdn/divert" in the background with the - output redirected to a file. Then all actions of the module are logged. - The divert file in the proc system may be opened more than once, so in - conjunction with inetd and a small remote client on other machines inside - your network incoming calls and reactions by the module may be shown on - every listening machine. - If a call is reported as proceeding an external program or client may - specify during a certain amount of time (normally 4 to 10 seconds) what - to do with that call. - To unload the module all open files to the device in the proc system must - be closed. Otherwise the module (and isdn.o) may not be unloaded. - -5. Format of the divert device ASCII output - - To be done later - diff --git a/Documentation/isdn/README.fax b/Documentation/isdn/README.fax deleted file mode 100644 index 5314958a8a6e..000000000000 --- a/Documentation/isdn/README.fax +++ /dev/null @@ -1,45 +0,0 @@ - -Fax with isdn4linux -=================== - -When enabled during kernel configuration, the tty emulator -of the ISDN subsystem is capable of the Fax Class 2 commands. - -This only makes sense under the following conditions : - -- You need the commands as dummy, because you are using - hylafax (with patch) for AVM capi. -- You want to use the fax capabilities of your isdn-card. - (supported cards are listed below) - - -NOTE: This implementation does *not* support fax with passive - ISDN-cards (known as softfax). The low-level driver of - the ISDN-card and/or the card itself must support this. 
- - -Supported ISDN-Cards --------------------- - -Eicon DIVA Server BRI/PCI - - full support with both B-channels. - -Eicon DIVA Server 4BRI/PCI - - full support with all B-channels. - -Eicon DIVA Server PRI/PCI - - full support on amount of B-channels - depending on DSPs on board. - - - -The command set is known as Class 2 (not Class 2.0) and -can be activated by AT+FCLASS=2 - - -The interface between the link-level-module and the hardware-level driver -is described in the files INTERFACE.fax and INTERFACE. - -Armin -mac@melware.de - diff --git a/Documentation/isdn/README.hfc-pci b/Documentation/isdn/README.hfc-pci deleted file mode 100644 index e8a4ef0226e8..000000000000 --- a/Documentation/isdn/README.hfc-pci +++ /dev/null @@ -1,41 +0,0 @@ -The driver for the HFC-PCI and HFC-PCI-A chips from CCD may be used -for many OEM cards using this chips. -Additionally the driver has a special feature which makes it possible -to read the echo-channel of the isdn bus. So all frames in both directions -may be logged. -When the echo logging feature is used the number of available B-channels -for a HFC-PCI card is reduced to 1. Of course this is only relevant to -the card, not to the isdn line. -To activate the echo mode the following ioctls must be entered: - -hisaxctrl 10 1 - -This reduces the available channels to 1. There must not be open connections -through this card when entering the command. -And then: - -hisaxctrl 12 1 - -This enables the echo mode. If Hex logging is activated the isdnctrlx -devices show a output with a line beginning of HEX: for the providers -exchange and ECHO: for isdn devices sending to the provider. - -If more than one HFC-PCI cards are installed, a specific card may be selected -at the hisax module load command line. Supply the load command with the desired -IO-address of the desired card. 
-Example: -There tree cards installed in your machine at IO-base addresses 0xd000, 0xd400 -and 0xdc00 -If you want to use the card at 0xd400 standalone you should supply the insmod -or depmod with type=35 io=0xd400. -If you want to use all three cards, but the order needs to be at 0xdc00,0xd400, -0xd000 you may give the parameters type=35,35,35 io=0xdc00,0xd400,0xd00 -Then the desired card will be the initialised in the desired order. -If the io parameter is used the io addresses of all used cards should be -supplied else the parameter is assumed 0 and a auto search for a free card is -invoked which may not give the wanted result. - -Comments and reports to werner@isdn4linux.de or werner@isdn-development.de - - - diff --git a/Documentation/isdn/README.syncppp b/Documentation/isdn/README.syncppp deleted file mode 100644 index 27d260095cce..000000000000 --- a/Documentation/isdn/README.syncppp +++ /dev/null @@ -1,58 +0,0 @@ -Some additional information for setting up a syncPPP -connection using network interfaces. ---------------------------------------------------------------- - -You need one thing beside the isdn4linux package: - - a patched pppd .. (I called it ipppd to show the difference) - -Compiling isdn4linux with sync PPP: ------------------------------------ -To compile isdn4linux with the sync PPP part, you have -to answer the appropriate question when doing a "make config" -Don't forget to load the slhc.o -module before the isdn.o module, if VJ-compression support -is not compiled into your kernel. (e.g if you have no PPP or -CSLIP in the kernel) - -Using isdn4linux with sync PPP: -------------------------------- -Sync PPP is just another encapsulation for isdn4linux. The -name to enable sync PPP encapsulation is 'syncppp' .. e.g: - - /sbin/isdnctrl encap ippp0 syncppp - -The name of the interface is here 'ippp0'. You need -one interface with the name 'ippp0' to saturate the -ipppd, which checks the ppp version via this interface. 
-Currently, all devices must have the name ipppX where -'X' is a decimal value. - -To set up a PPP connection you need the ipppd .. You must start -the ipppd once after installing the modules. The ipppd -communicates with the isdn4linux link-level driver using the -/dev/ippp0 to /dev/ippp15 devices. One ipppd can handle -all devices at once. If you want to use two PPP connections -at the same time, you have to connect the ipppd to two -devices .. and so on. -I've implemented one additional option for the ipppd: - 'useifip' will get (if set to not 0.0.0.0) the IP address - for the negotiation from the attached network-interface. -(also: ipppd will try to negotiate pointopoint IP as remote IP) -You must disable BSD-compression, this implementation can't -handle compressed packets. - -Check the etc/rc.isdn.syncppp in the isdn4kernel-util package -for an example setup script. - -To use the MPPP stuff, you must configure a slave device -with isdn4linux. Now call the ipppd with the '+mp' option. -To increase the number of links, you must use the -'addlink' option of the isdnctrl tool. (rc.isdn.syncppp.MPPP is -an example script) - -enjoy it, - michael - - - diff --git a/Documentation/isdn/README.x25 b/Documentation/isdn/README.x25 deleted file mode 100644 index e561a77c4e22..000000000000 --- a/Documentation/isdn/README.x25 +++ /dev/null @@ -1,184 +0,0 @@ - -X.25 support within isdn4linux -============================== - -This is alpha/beta test code. Use it completely at your own risk. -As new versions appear, the stuff described here might suddenly change -or become invalid without notice. - -Keep in mind: - -You are using several new parts of the 2.2.x kernel series which -have not been tested in a large scale. Therefore, you might encounter -more bugs as usual. - -- If you connect to an X.25 neighbour not operated by yourself, ASK the - other side first. Be prepared that bugs in the protocol implementation - might result in problems. 
- -- This implementation has never wiped out my whole hard disk yet. But as - this is experimental code, don't blame me if that happened to you. - Backing up important data will never harm. - -- Monitor your isdn connections while using this software. This should - prevent you from undesired phone bills in case of driver problems. - - - - -How to configure the kernel -=========================== - -The ITU-T (former CCITT) X.25 network protocol layer has been implemented -in the Linux source tree since version 2.1.16. The isdn subsystem might be -useful to run X.25 on top of ISDN. If you want to try it, select - - "CCITT X.25 Packet Layer" - -from the networking options as well as - - "ISDN Support" and "X.25 PLP on Top of ISDN" - -from the ISDN subsystem options when you configure your kernel for -compilation. You currently also need to enable -"Prompt for development and/or incomplete code/drivers" from the -"Code maturity level options" menu. For the x25trace utility to work -you also need to enable "Packet socket". - -For local testing it is also recommended to enable the isdnloop driver -from the isdn subsystem's configuration menu. - -For testing, it is recommended that all isdn drivers and the X.25 PLP -protocol are compiled as loadable modules. Like this, you can recover -from certain errors by simply unloading and reloading the modules. - - - -What's it for? How to use it? -============================= - -X.25 on top of isdn might be useful with two different scenarios: - -- You might want to access a public X.25 data network from your Linux box. - You can use i4l if you were physically connected to the X.25 switch - by an ISDN B-channel (leased line as well as dial up connection should - work). - - This corresponds to ITU-T recommendation X.31 Case A (circuit-mode - access to PSPDN [packet switched public data network]). - - NOTE: X.31 also covers a Case B (access to PSPDN via virtual - circuit / packet mode service). 
The latter mode (which in theory - also allows using the D-channel) is not supported by isdn4linux. - It should however be possible to establish such packet mode connections - with certain active isdn cards provided that the firmware supports X.31 - and the driver exports this functionality to the user. Currently, - the AVM B1 driver is the only driver which does so. (It should be - possible to access D-channel X.31 with active AVM cards using the - CAPI interface of the AVM-B1 driver). - -- Or you might want to operate certain ISDN teleservices on your linux - box. A lot of those teleservices run on top of the ISO-8208 - (DTE-DTE mode) network layer protocol. ISO-8208 is essentially the - same as ITU-T X.25. - - Popular candidates of such teleservices are EUROfile transfer or any - teleservice applying ITU-T recommendation T.90. - -To use the X.25 protocol on top of isdn, just create an isdn network -interface as usual, configure your own and/or peer's ISDN numbers, -and choose x25iface encapsulation by - - isdnctrl encap x25iface. - -Once encap is set like this, the device can be used by the X.25 packet layer. - -All the stuff needed for X.25 is implemented inside the isdn link -level (mainly isdn_net.c and some new source files). Thus, it should -work with every existing HL driver. I was able to successfully open X.25 -connections on top of the isdnloop driver and the hisax driver. -"x25iface"-encapsulation bypasses demand dialing. Dialing will be -initiated when the upper (X.25 packet) layer requests the lapb datalink to -be established. But hangup timeout is still active. Whenever a hangup -occurs, all existing X.25 connections on that link will be cleared -It is recommended to use sufficiently large hangup-timeouts for the -isdn interfaces. 
- - -In order to set up a conforming protocol stack you also need to -specify the proper l2_prot parameter: - -To operate in ISO-8208 X.25 DTE-DTE mode, use - - isdnctrl l2_prot x75i - -To access an X.25 network switch via isdn (your linux box is the DTE), use - - isdnctrl l2_prot x25dte - -To mimic an X.25 network switch (DCE side of the connection), use - - isdnctrl l2_prot x25dce - -However, x25dte or x25dce is currently not supported by any real HL -level driver. The main difference between x75i and x25dte/dce is that -x25d[tc]e uses fixed lap_b addresses. With x75i, the side which -initiates the isdn connection uses the DTE's lap_b address while the -called side used the DCE's lap_b address. Thus, l2_prot x75i might -probably work if you access a public X.25 network as long as the -corresponding isdn connection is set up by you. At least one test -was successful to connect via isdn4linux to an X.25 switch using this -trick. At the switch side, a terminal adapter X.21 was used to connect -it to the isdn. - - -How to set up a test installation? -================================== - -To test X.25 on top of isdn, you need to get - -- a recent version of the "isdnctrl" program that supports setting the new - X.25 specific parameters. - -- the x25-utils-2.X package from - ftp://ftp.hes.iki.fi/pub/ham/linux/ax25/x25utils-* - (don't confuse the x25-utils with the ax25-utils) - -- an application program that uses linux PF_X25 sockets (some are - contained in the x25-util package). - -Before compiling the user level utilities make sure that the compiler/ -preprocessor will fetch the proper kernel header files of this kernel -source tree. Either make /usr/include/linux a symbolic link pointing to -this kernel's include/linux directory or set the appropriate compiler flags. - -When all drivers and interfaces are loaded and configured you need to -ifconfig the network interfaces up and add X.25-routes to them. Use -the usual ifconfig tool. 
- -ifconfig up - -But a special x25route tool (distributed with the x25-util package) -is needed to set up X.25 routes. I.e. - -x25route add 01 - -will cause all x.25 connections to the destination X.25-address -"01" to be routed to your created isdn network interface. - -There are currently no real X.25 applications available. However, for -tests, the x25-utils package contains a modified version of telnet -and telnetd that uses X.25 sockets instead of tcp/ip sockets. You can -use those for your first tests. Furthermore, you might check -ftp://ftp.hamburg.pop.de/pub/LOCAL/linux/i4l-eft/ which contains some -alpha-test implementation ("eftp4linux") of the EUROfile transfer -protocol. - -The scripts distributed with the eftp4linux test releases might also -provide useful examples for setting up X.25 on top of isdn. - -The x25-utility package also contains an x25trace tool that can be -used to monitor X.25 packets received by the network interfaces. -The /proc/net/x25* files also contain useful information. - -- Henner diff --git a/Documentation/isdn/syncPPP.FAQ b/Documentation/isdn/syncPPP.FAQ deleted file mode 100644 index 3257a4bc0786..000000000000 --- a/Documentation/isdn/syncPPP.FAQ +++ /dev/null @@ -1,224 +0,0 @@ -simple isdn4linux PPP FAQ .. to be continued .. not 'debugged' -------------------------------------------------------------------- - -Q01: what's pppd, ipppd, syncPPP, asyncPPP ?? -Q02: error message "this system lacks PPP support" -Q03: strange information using 'ifconfig' -Q04: MPPP?? What's that and how can I use it ... -Q05: I tried MPPP but it doesn't work -Q06: can I use asynchronous PPP encapsulation with network devices -Q07: A SunISDN machine can't connect to my i4l system -Q08: I wanna talk to several machines, which need different configs -Q09: Starting the ipppd, I get only error messages from i4l -Q10: I wanna use dynamic IP address assignment -Q11: I can't connect. How can I check where the problem is. 
-Q12: How can I reduce login delay? - -------------------------------------------------------------------- - -Q01: pppd, ipppd, syncPPP, asyncPPP .. what is that ? - what should I use? -A: The pppd is for asynchronous PPP .. asynchronous means - here, the framing is character based. (e.g when - using ttyI* or tty* devices) - - The ipppd handles PPP packets coming in HDLC - frames (bit based protocol) ... The PPP driver - in isdn4linux pushes all IP packets direct - to the network layer and all PPP protocol - frames to the /dev/ippp* device. - So, the ipppd is a simple external network - protocol handler. - - If you login into a remote machine using the - /dev/ttyI* devices and then enable PPP on the - remote terminal server -> use the 'old' pppd - - If your remote side immediately starts to send - frames ... you probably connect to a - syncPPP machine .. use the network device part - of isdn4linux with the 'syncppp' encapsulation - and make sure, that the ipppd is running and - connected to at least one /dev/ippp*. Check the - isdn4linux manual on how to configure a network device. - --- - -Q02: when I start the ipppd .. I only get the - error message "this system lacks PPP support" -A: check that at least the device 'ippp0' exists. - (you can check this e.g with the program 'ifconfig') - The ipppd NEEDS this device under THIS name .. - If this device doesn't exists, use: - isdnctrl addif ippp0 - isdnctrl encap ippp0 syncppp - ... (see isdn4linux doc for more) ... -A: Maybe you have compiled the ipppd with another - kernel source tree than the kernel you currently - run ... - --- - -Q03: when I list the netdevices with ifconfig I see, that - my ISDN interface has a HWaddr and IRQ=0 and Base - address = 0 -A: The device is a fake ethernet device .. ignore IRQ and baseaddr - You need the HWaddr only for ethernet encapsulation. - --- - -Q04: MPPP?? What's that and how can I use it ... 
- -A: MPPP or MP or MPP (Warning: MP is also an - acronym for 'Multi Processor') stands for - Multi Point to Point and means bundling - of several channels to one logical stream. - To enable MPPP negotiation you must call the - ipppd with the '+mp' option. - You must also configure a slave device for - every additional channel. (see the i4l manual - for more) - To use channel bundling you must first activate - the 'master' or initial call. Now you can add - the slave channels with the command: - isdnctrl addlink - e.g: - isdnctrl addlink ippp0 - This is different from other encapsulations of - isdn4linux! With syncPPP, there is no automatic - activation of slave devices. - --- - -Q05: I tried MPPP but it doesn't work .. the ipppd - writes in the debug log something like: - .. rcvd [0][proto=0x3d] c0 00 00 00 80 fd 01 01 00 0a ... - .. sent [0][LCP ProtRej id=0x2 00 3d c0 00 00 00 80 fd 01 ... - -A: you forgot to compile MPPP/RFC1717 support into the - ISDN Subsystem. Recompile with this option enabled. - --- - -Q06: can I use asynchronous PPP encapsulation - over the network interface of isdn4linux .. - -A: No .. that's not possible .. Use the standard - PPP package over the /dev/ttyI* devices. You - must not use the ipppd for this. - --- - -Q07: A SunISDN machine tries to connect my i4l system, - which doesn't work. - Checking the debug log I just saw garbage like: -!![ ... fill in the line ... ]!! - -A: The Sun tries to talk asynchronous PPP ... i4l - can't understand this ... try to use the ttyI* - devices with the standard PPP/pppd package - -A: (from Alexanter Strauss: ) -!![ ... fill in mail ]!! - --- - -Q08: I wanna talk to remote machines, which need - a different configuration. The only way - I found to do this is to kill the ipppd and - start a new one with another config to connect - to the second machine. - -A: you must bind a network interface explicitly to - an ippp device, where you can connect a (for this - interface) individually configured ipppd. 
- --- - -Q09: When I start the ipppd I only get error messages - from the i4l driver .. - -A: When starting, the ipppd calls functions which may - trigger a network packet. (e.g gethostbyname()). - Without the ipppd (at this moment, it is not - fully started) we can't handle this network request. - Try to configure hostnames necessary for the ipppd - in your local /etc/hosts file or in a way, that - your system can resolve it without using an - isdn/ippp network-interface. - --- - -Q10: I wanna use dynamic IP address assignment ... How - must I configure the network device. - -A: At least you must have a route which forwards - a packet to the ippp network-interface to trigger - the dial-on-demand. - A default route to the ippp-interface will work. - Now you must choose a dummy IP address for your - interface. - If for some reason you can't set the default - route to the ippp interface, you may take any - address of the subnet from which you expect your - dynamic IP number and set a 'network route' for - this subnet to the ippp interface. - To allow overriding of the dummy address you - must call the ipppd with the 'ipcp-accept-local' option. - -A: You must know, how the ipppd gets the addresses it wanna - configure. If you don't give any option, the ipppd - tries to negotiate the local host address! - With the option 'noipdefault' it requests an address - from the remote machine. With 'useifip' it gets the - addresses from the net interface. Or you set the address - on the option line with the option. - Note: the IP address of the remote machine must be configured - locally or the remote machine must send it in an IPCP request. - If your side doesn't know the IP address after negotiation, it - closes the connection! - You must allow overriding of address with the 'ipcp-accept-*' - options, if you have set your own or the remote address - explicitly. - -A: Maybe you try these options .. 
e.g: - - /sbin/ipppd :$REMOTE noipdefault /dev/ippp0 - - where REMOTE must be the address of the remote machine (the - machine, which gives you your address) - --- - -Q11: I can't connect. How can I check where the problem is. - -A: A good help log is the debug output from the ipppd... - Check whether you can find there: - - only a few LCP-conf-req SENT messages (less then 10) - and then a Term-REQ: - -> check whether your ISDN card is well configured - it seems, that your machine doesn't dial - (IRQ,IO,Proto, etc problems) - Configure your ISDN card to print debug messages and - check the /dev/isdnctrl output next time. There - you can see, whether there is activity on the card/line. - - there are at least a few RECV messages in the log: - -> fine: your card is dialing and your remote machine - tries to talk with you. Maybe only a missing - authentication. Check your ipppd configuration again. - - the ipppd exits for some reason: - -> not good ... check /var/adm/syslog and /var/adm/daemon. - Could be a bug in the ipppd. - --- - -Q12: How can I reduce login delay? - -A: Log a login session ('debug' log) and check which options - your remote side rejects. Next time configure your ipppd - to not negotiate these options. Another 'side effect' is, that - this increases redundancy. (e.g your remote side is buggy and - rejects options in a wrong way). - - - diff --git a/Documentation/process/changes.rst b/Documentation/process/changes.rst index 18735dc460a0..111636ad1bad 100644 --- a/Documentation/process/changes.rst +++ b/Documentation/process/changes.rst @@ -23,8 +23,8 @@ running, the suggested command should tell you. Again, keep in mind that this list assumes you are already functionally running a Linux kernel. Also, not all tools are necessary on all -systems; obviously, if you don't have any ISDN hardware, for example, -you probably needn't concern yourself with isdn4k-utils. 
+systems; obviously, if you don't have any PC Card hardware, for example, +you probably needn't concern yourself with pcmciautils. ====================== =============== ======================================== Program Minimal version Command to check the version @@ -45,7 +45,6 @@ btrfs-progs 0.18 btrfsck pcmciautils 004 pccardctl -V quota-tools 3.09 quota -V PPP 2.4.0 pppd --version -isdn4k-utils 3.1pre1 isdnctrl 2>&1|grep version nfs-utils 1.0.5 showmount --version procps 3.2.0 ps --version oprofile 0.9 oprofiled --version @@ -279,12 +278,6 @@ which can be made by:: as root. -Isdn4k-utils ------------- - -Due to changes in the length of the phone number field, isdn4k-utils -needs to be recompiled or (preferably) upgraded. - NFS-utils --------- @@ -448,11 +441,6 @@ PPP - -Isdn4k-utils ------------- - -- - NFS-utils --------- diff --git a/MAINTAINERS b/MAINTAINERS index 0c55b0fedbe2..3a761e680296 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -8371,9 +8371,7 @@ T: git git://git.kernel.org/pub/scm/linux/kernel/git/kkeil/isdn-2.6.git S: Maintained F: Documentation/isdn/ F: drivers/isdn/ -F: include/linux/isdn.h F: include/linux/isdn/ -F: include/uapi/linux/isdn.h F: include/uapi/linux/isdn/ IT87 HARDWARE MONITORING DRIVER diff --git a/drivers/isdn/Kconfig b/drivers/isdn/Kconfig index 1ca4d70d198a..6e3bf833c67e 100644 --- a/drivers/isdn/Kconfig +++ b/drivers/isdn/Kconfig @@ -21,27 +21,6 @@ menuconfig ISDN if ISDN -menuconfig ISDN_I4L - tristate "Old ISDN4Linux (deprecated)" - depends on TTY - ---help--- - This driver allows you to use an ISDN adapter for networking - connections and as dialin/out device. The isdn-tty's have a built - in AT-compatible modem emulator. Network devices support autodial, - channel-bundling, callback and caller-authentication without having - a daemon running. A reduced T.70 protocol is supported with tty's - suitable for German BTX. On D-Channel, the protocols EDSS1 - (Euro-ISDN) and 1TR6 (German style) are supported. 
See - for more information. - - ISDN support in the linux kernel is moving towards a new API, - called CAPI (Common ISDN Application Programming Interface). - Therefore the old ISDN4Linux layer will eventually become obsolete. - It is still available, though, for use with adapters that are not - supported by the new CAPI subsystem yet. - -source "drivers/isdn/i4l/Kconfig" - menuconfig ISDN_CAPI tristate "CAPI 2.0 subsystem" help @@ -71,9 +50,4 @@ source "drivers/isdn/hysdn/Kconfig" source "drivers/isdn/mISDN/Kconfig" -config ISDN_HDLC - tristate - select CRC_CCITT - select BITREVERSE - endif # ISDN diff --git a/drivers/isdn/Makefile b/drivers/isdn/Makefile index 7487f0bbe855..379b4a03c321 100644 --- a/drivers/isdn/Makefile +++ b/drivers/isdn/Makefile @@ -7,7 +7,5 @@ obj-$(CONFIG_ISDN_I4L) += i4l/ obj-$(CONFIG_ISDN_CAPI) += capi/ obj-$(CONFIG_MISDN) += mISDN/ obj-$(CONFIG_ISDN) += hardware/ -obj-$(CONFIG_ISDN_DIVERSION) += divert/ -obj-$(CONFIG_ISDN_DRV_LOOP) += isdnloop/ obj-$(CONFIG_HYSDN) += hysdn/ obj-$(CONFIG_ISDN_DRV_GIGASET) += gigaset/ diff --git a/drivers/isdn/capi/Kconfig b/drivers/isdn/capi/Kconfig index abaadce376c5..089dbee18f36 100644 --- a/drivers/isdn/capi/Kconfig +++ b/drivers/isdn/capi/Kconfig @@ -27,15 +27,6 @@ config ISDN_CAPI_MIDDLEWARE device. If you want to use pppd with pppdcapiplugin to dial up to your ISP, say Y here. -config ISDN_CAPI_CAPIDRV - tristate "CAPI2.0 capidrv interface support" - depends on ISDN_I4L - help - This option provides the glue code to hook up CAPI driven cards to - the legacy isdn4linux link layer. If you have a card which is - supported by a CAPI driver, but still want to use old features like - ippp interfaces or ttyI emulation, say Y/M here. 
- config ISDN_CAPI_CAPIDRV_VERBOSE bool "Verbose reason code reporting" depends on ISDN_CAPI_CAPIDRV diff --git a/drivers/isdn/capi/capidrv.c b/drivers/isdn/capi/capidrv.c deleted file mode 100644 index e8949f3dcae1..000000000000 --- a/drivers/isdn/capi/capidrv.c +++ /dev/null @@ -1,2525 +0,0 @@ -/* $Id: capidrv.c,v 1.1.2.2 2004/01/12 23:17:24 keil Exp $ - * - * ISDN4Linux Driver, using capi20 interface (kernelcapi) - * - * Copyright 1997 by Carsten Paeth - * - * This software may be used and distributed according to the terms - * of the GNU General Public License, incorporated herein by reference. - * - */ - -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -#include -#include -#include "capidrv.h" - -static int debugmode = 0; - -MODULE_DESCRIPTION("CAPI4Linux: Interface to ISDN4Linux"); -MODULE_AUTHOR("Carsten Paeth"); -MODULE_LICENSE("GPL"); -module_param(debugmode, uint, S_IRUGO | S_IWUSR); - -/* -------- type definitions ----------------------------------------- */ - - -struct capidrv_contr { - - struct capidrv_contr *next; - struct module *owner; - u32 contrnr; - char name[20]; - - /* - * for isdn4linux - */ - isdn_if interface; - int myid; - - /* - * LISTEN state - */ - int state; - u32 cipmask; - u32 cipmask2; - struct timer_list listentimer; - - /* - * ID of capi message sent - */ - u16 msgid; - - /* - * B-Channels - */ - int nbchan; - struct capidrv_bchan { - struct capidrv_contr *contr; - u8 msn[ISDN_MSNLEN]; - int l2; - int l3; - u8 num[ISDN_MSNLEN]; - u8 mynum[ISDN_MSNLEN]; - int si1; - int si2; - int incoming; - int disconnecting; - struct capidrv_plci { - struct capidrv_plci *next; - u32 plci; - u32 ncci; /* ncci for CONNECT_ACTIVE_IND */ - u16 msgid; /* to identfy CONNECT_CONF */ - int chan; - int state; - int leasedline; - struct capidrv_ncci { - struct capidrv_ncci *next; - 
struct capidrv_plci *plcip; - u32 ncci; - u16 msgid; /* to identfy CONNECT_B3_CONF */ - int chan; - int state; - int oldstate; - /* */ - u16 datahandle; - struct ncci_datahandle_queue { - struct ncci_datahandle_queue *next; - u16 datahandle; - int len; - } *ackqueue; - } *ncci_list; - } *plcip; - struct capidrv_ncci *nccip; - } *bchans; - - struct capidrv_plci *plci_list; - - /* for q931 data */ - u8 q931_buf[4096]; - u8 *q931_read; - u8 *q931_write; - u8 *q931_end; -}; - - -struct capidrv_data { - struct capi20_appl ap; - int ncontr; - struct capidrv_contr *contr_list; -}; - -typedef struct capidrv_plci capidrv_plci; -typedef struct capidrv_ncci capidrv_ncci; -typedef struct capidrv_contr capidrv_contr; -typedef struct capidrv_data capidrv_data; -typedef struct capidrv_bchan capidrv_bchan; - -/* -------- data definitions ----------------------------------------- */ - -static capidrv_data global; -static DEFINE_SPINLOCK(global_lock); - -static void handle_dtrace_data(capidrv_contr *card, - int send, int level2, u8 *data, u16 len); - -/* -------- convert functions ---------------------------------------- */ - -static inline u32 b1prot(int l2, int l3) -{ - switch (l2) { - case ISDN_PROTO_L2_X75I: - case ISDN_PROTO_L2_X75UI: - case ISDN_PROTO_L2_X75BUI: - return 0; - case ISDN_PROTO_L2_HDLC: - default: - return 0; - case ISDN_PROTO_L2_TRANS: - return 1; - case ISDN_PROTO_L2_V11096: - case ISDN_PROTO_L2_V11019: - case ISDN_PROTO_L2_V11038: - return 2; - case ISDN_PROTO_L2_FAX: - return 4; - case ISDN_PROTO_L2_MODEM: - return 8; - } -} - -static inline u32 b2prot(int l2, int l3) -{ - switch (l2) { - case ISDN_PROTO_L2_X75I: - case ISDN_PROTO_L2_X75UI: - case ISDN_PROTO_L2_X75BUI: - default: - return 0; - case ISDN_PROTO_L2_HDLC: - case ISDN_PROTO_L2_TRANS: - case ISDN_PROTO_L2_V11096: - case ISDN_PROTO_L2_V11019: - case ISDN_PROTO_L2_V11038: - case ISDN_PROTO_L2_MODEM: - return 1; - case ISDN_PROTO_L2_FAX: - return 4; - } -} - -static inline u32 b3prot(int l2, int l3) 
-{ - switch (l2) { - case ISDN_PROTO_L2_X75I: - case ISDN_PROTO_L2_X75UI: - case ISDN_PROTO_L2_X75BUI: - case ISDN_PROTO_L2_HDLC: - case ISDN_PROTO_L2_TRANS: - case ISDN_PROTO_L2_V11096: - case ISDN_PROTO_L2_V11019: - case ISDN_PROTO_L2_V11038: - case ISDN_PROTO_L2_MODEM: - default: - return 0; - case ISDN_PROTO_L2_FAX: - return 4; - } -} - -static _cstruct b1config_async_v110(u16 rate) -{ - /* CAPI-Spec "B1 Configuration" */ - static unsigned char buf[9]; - buf[0] = 8; /* len */ - /* maximum bitrate */ - buf[1] = rate & 0xff; buf[2] = (rate >> 8) & 0xff; - buf[3] = 8; buf[4] = 0; /* 8 bits per character */ - buf[5] = 0; buf[6] = 0; /* parity none */ - buf[7] = 0; buf[8] = 0; /* 1 stop bit */ - return buf; -} - -static _cstruct b1config(int l2, int l3) -{ - switch (l2) { - case ISDN_PROTO_L2_X75I: - case ISDN_PROTO_L2_X75UI: - case ISDN_PROTO_L2_X75BUI: - case ISDN_PROTO_L2_HDLC: - case ISDN_PROTO_L2_TRANS: - default: - return NULL; - case ISDN_PROTO_L2_V11096: - return b1config_async_v110(9600); - case ISDN_PROTO_L2_V11019: - return b1config_async_v110(19200); - case ISDN_PROTO_L2_V11038: - return b1config_async_v110(38400); - } -} - -static inline u16 si2cip(u8 si1, u8 si2) -{ - static const u8 cip[17][5] = - { - /* 0 1 2 3 4 */ - {0, 0, 0, 0, 0}, /*0 */ - {16, 16, 4, 26, 16}, /*1 */ - {17, 17, 17, 4, 4}, /*2 */ - {2, 2, 2, 2, 2}, /*3 */ - {18, 18, 18, 18, 18}, /*4 */ - {2, 2, 2, 2, 2}, /*5 */ - {0, 0, 0, 0, 0}, /*6 */ - {2, 2, 2, 2, 2}, /*7 */ - {2, 2, 2, 2, 2}, /*8 */ - {21, 21, 21, 21, 21}, /*9 */ - {19, 19, 19, 19, 19}, /*10 */ - {0, 0, 0, 0, 0}, /*11 */ - {0, 0, 0, 0, 0}, /*12 */ - {0, 0, 0, 0, 0}, /*13 */ - {0, 0, 0, 0, 0}, /*14 */ - {22, 22, 22, 22, 22}, /*15 */ - {27, 27, 27, 28, 27} /*16 */ - }; - if (si1 > 16) - si1 = 0; - if (si2 > 4) - si2 = 0; - - return (u16) cip[si1][si2]; -} - -static inline u8 cip2si1(u16 cipval) -{ - static const u8 si[32] = - {7, 1, 7, 7, 1, 1, 7, 7, /*0-7 */ - 7, 1, 0, 0, 0, 0, 0, 0, /*8-15 */ - 1, 2, 4, 10, 9, 9, 15, 7, 
/*16-23 */ - 7, 7, 1, 16, 16, 0, 0, 0}; /*24-31 */ - - if (cipval > 31) - cipval = 0; /* .... */ - return si[cipval]; -} - -static inline u8 cip2si2(u16 cipval) -{ - static const u8 si[32] = - {0, 0, 0, 0, 2, 3, 0, 0, /*0-7 */ - 0, 3, 0, 0, 0, 0, 0, 0, /*8-15 */ - 1, 2, 0, 0, 9, 0, 0, 0, /*16-23 */ - 0, 0, 3, 2, 3, 0, 0, 0}; /*24-31 */ - - if (cipval > 31) - cipval = 0; /* .... */ - return si[cipval]; -} - - -/* -------- controller management ------------------------------------- */ - -static inline capidrv_contr *findcontrbydriverid(int driverid) -{ - unsigned long flags; - capidrv_contr *p; - - spin_lock_irqsave(&global_lock, flags); - for (p = global.contr_list; p; p = p->next) - if (p->myid == driverid) - break; - spin_unlock_irqrestore(&global_lock, flags); - return p; -} - -static capidrv_contr *findcontrbynumber(u32 contr) -{ - unsigned long flags; - capidrv_contr *p = global.contr_list; - - spin_lock_irqsave(&global_lock, flags); - for (p = global.contr_list; p; p = p->next) - if (p->contrnr == contr) - break; - spin_unlock_irqrestore(&global_lock, flags); - return p; -} - - -/* -------- plci management ------------------------------------------ */ - -static capidrv_plci *new_plci(capidrv_contr *card, int chan) -{ - capidrv_plci *plcip; - - plcip = kzalloc(sizeof(capidrv_plci), GFP_ATOMIC); - - if (plcip == NULL) - return NULL; - - plcip->state = ST_PLCI_NONE; - plcip->plci = 0; - plcip->msgid = 0; - plcip->chan = chan; - plcip->next = card->plci_list; - card->plci_list = plcip; - card->bchans[chan].plcip = plcip; - - return plcip; -} - -static capidrv_plci *find_plci_by_plci(capidrv_contr *card, u32 plci) -{ - capidrv_plci *p; - for (p = card->plci_list; p; p = p->next) - if (p->plci == plci) - return p; - return NULL; -} - -static capidrv_plci *find_plci_by_msgid(capidrv_contr *card, u16 msgid) -{ - capidrv_plci *p; - for (p = card->plci_list; p; p = p->next) - if (p->msgid == msgid) - return p; - return NULL; -} - -static capidrv_plci 
*find_plci_by_ncci(capidrv_contr *card, u32 ncci) -{ - capidrv_plci *p; - for (p = card->plci_list; p; p = p->next) - if (p->plci == (ncci & 0xffff)) - return p; - return NULL; -} - -static void free_plci(capidrv_contr *card, capidrv_plci *plcip) -{ - capidrv_plci **pp; - - for (pp = &card->plci_list; *pp; pp = &(*pp)->next) { - if (*pp == plcip) { - *pp = (*pp)->next; - card->bchans[plcip->chan].plcip = NULL; - card->bchans[plcip->chan].disconnecting = 0; - card->bchans[plcip->chan].incoming = 0; - kfree(plcip); - return; - } - } - printk(KERN_ERR "capidrv-%d: free_plci %p (0x%x) not found, Huh?\n", - card->contrnr, plcip, plcip->plci); -} - -/* -------- ncci management ------------------------------------------ */ - -static inline capidrv_ncci *new_ncci(capidrv_contr *card, - capidrv_plci *plcip, - u32 ncci) -{ - capidrv_ncci *nccip; - - nccip = kzalloc(sizeof(capidrv_ncci), GFP_ATOMIC); - - if (nccip == NULL) - return NULL; - - nccip->ncci = ncci; - nccip->state = ST_NCCI_NONE; - nccip->plcip = plcip; - nccip->chan = plcip->chan; - nccip->datahandle = 0; - - nccip->next = plcip->ncci_list; - plcip->ncci_list = nccip; - - card->bchans[plcip->chan].nccip = nccip; - - return nccip; -} - -static inline capidrv_ncci *find_ncci(capidrv_contr *card, u32 ncci) -{ - capidrv_plci *plcip; - capidrv_ncci *p; - - if ((plcip = find_plci_by_ncci(card, ncci)) == NULL) - return NULL; - - for (p = plcip->ncci_list; p; p = p->next) - if (p->ncci == ncci) - return p; - return NULL; -} - -static inline capidrv_ncci *find_ncci_by_msgid(capidrv_contr *card, - u32 ncci, u16 msgid) -{ - capidrv_plci *plcip; - capidrv_ncci *p; - - if ((plcip = find_plci_by_ncci(card, ncci)) == NULL) - return NULL; - - for (p = plcip->ncci_list; p; p = p->next) - if (p->msgid == msgid) - return p; - return NULL; -} - -static void free_ncci(capidrv_contr *card, struct capidrv_ncci *nccip) -{ - struct capidrv_ncci **pp; - - for (pp = &(nccip->plcip->ncci_list); *pp; pp = &(*pp)->next) { - if (*pp == nccip) 
{ - *pp = (*pp)->next; - break; - } - } - card->bchans[nccip->chan].nccip = NULL; - kfree(nccip); -} - -static int capidrv_add_ack(struct capidrv_ncci *nccip, - u16 datahandle, int len) -{ - struct ncci_datahandle_queue *n, **pp; - - n = kmalloc(sizeof(struct ncci_datahandle_queue), GFP_ATOMIC); - if (!n) { - printk(KERN_ERR "capidrv: kmalloc ncci_datahandle failed\n"); - return -1; - } - n->next = NULL; - n->datahandle = datahandle; - n->len = len; - for (pp = &nccip->ackqueue; *pp; pp = &(*pp)->next); - *pp = n; - return 0; -} - -static int capidrv_del_ack(struct capidrv_ncci *nccip, u16 datahandle) -{ - struct ncci_datahandle_queue **pp, *p; - int len; - - for (pp = &nccip->ackqueue; *pp; pp = &(*pp)->next) { - if ((*pp)->datahandle == datahandle) { - p = *pp; - len = p->len; - *pp = (*pp)->next; - kfree(p); - return len; - } - } - return -1; -} - -/* -------- convert and send capi message ---------------------------- */ - -static void send_message(capidrv_contr *card, _cmsg *cmsg) -{ - struct sk_buff *skb; - size_t len; - - if (capi_cmsg2message(cmsg, cmsg->buf)) { - printk(KERN_ERR "capidrv::send_message: parser failure\n"); - return; - } - len = CAPIMSG_LEN(cmsg->buf); - skb = alloc_skb(len, GFP_ATOMIC); - if (!skb) { - printk(KERN_ERR "capidrv::send_message: can't allocate mem\n"); - return; - } - skb_put_data(skb, cmsg->buf, len); - if (capi20_put_message(&global.ap, skb) != CAPI_NOERROR) - kfree_skb(skb); -} - -/* -------- state machine -------------------------------------------- */ - -struct listenstatechange { - int actstate; - int nextstate; - int event; -}; - -static struct listenstatechange listentable[] = -{ - {ST_LISTEN_NONE, ST_LISTEN_WAIT_CONF, EV_LISTEN_REQ}, - {ST_LISTEN_ACTIVE, ST_LISTEN_ACTIVE_WAIT_CONF, EV_LISTEN_REQ}, - {ST_LISTEN_WAIT_CONF, ST_LISTEN_NONE, EV_LISTEN_CONF_ERROR}, - {ST_LISTEN_ACTIVE_WAIT_CONF, ST_LISTEN_ACTIVE, EV_LISTEN_CONF_ERROR}, - {ST_LISTEN_WAIT_CONF, ST_LISTEN_NONE, EV_LISTEN_CONF_EMPTY}, - 
{ST_LISTEN_ACTIVE_WAIT_CONF, ST_LISTEN_NONE, EV_LISTEN_CONF_EMPTY}, - {ST_LISTEN_WAIT_CONF, ST_LISTEN_ACTIVE, EV_LISTEN_CONF_OK}, - {ST_LISTEN_ACTIVE_WAIT_CONF, ST_LISTEN_ACTIVE, EV_LISTEN_CONF_OK}, - {}, -}; - -static void listen_change_state(capidrv_contr *card, int event) -{ - struct listenstatechange *p = listentable; - while (p->event) { - if (card->state == p->actstate && p->event == event) { - if (debugmode) - printk(KERN_DEBUG "capidrv-%d: listen_change_state %d -> %d\n", - card->contrnr, card->state, p->nextstate); - card->state = p->nextstate; - return; - } - p++; - } - printk(KERN_ERR "capidrv-%d: listen_change_state state=%d event=%d ????\n", - card->contrnr, card->state, event); - -} - -/* ------------------------------------------------------------------ */ - -static void p0(capidrv_contr *card, capidrv_plci *plci) -{ - isdn_ctrl cmd; - - card->bchans[plci->chan].contr = NULL; - cmd.command = ISDN_STAT_DHUP; - cmd.driver = card->myid; - cmd.arg = plci->chan; - card->interface.statcallb(&cmd); - free_plci(card, plci); -} - -/* ------------------------------------------------------------------ */ - -struct plcistatechange { - int actstate; - int nextstate; - int event; - void (*changefunc)(capidrv_contr *card, capidrv_plci *plci); -}; - -static struct plcistatechange plcitable[] = -{ - /* P-0 */ - {ST_PLCI_NONE, ST_PLCI_OUTGOING, EV_PLCI_CONNECT_REQ, NULL}, - {ST_PLCI_NONE, ST_PLCI_ALLOCATED, EV_PLCI_FACILITY_IND_UP, NULL}, - {ST_PLCI_NONE, ST_PLCI_INCOMING, EV_PLCI_CONNECT_IND, NULL}, - {ST_PLCI_NONE, ST_PLCI_RESUMEING, EV_PLCI_RESUME_REQ, NULL}, - /* P-0.1 */ - {ST_PLCI_OUTGOING, ST_PLCI_NONE, EV_PLCI_CONNECT_CONF_ERROR, p0}, - {ST_PLCI_OUTGOING, ST_PLCI_ALLOCATED, EV_PLCI_CONNECT_CONF_OK, NULL}, - /* P-1 */ - {ST_PLCI_ALLOCATED, ST_PLCI_ACTIVE, EV_PLCI_CONNECT_ACTIVE_IND, NULL}, - {ST_PLCI_ALLOCATED, ST_PLCI_DISCONNECTING, EV_PLCI_DISCONNECT_REQ, NULL}, - {ST_PLCI_ALLOCATED, ST_PLCI_DISCONNECTING, EV_PLCI_FACILITY_IND_DOWN, NULL}, - 
{ST_PLCI_ALLOCATED, ST_PLCI_DISCONNECTED, EV_PLCI_DISCONNECT_IND, NULL}, - /* P-ACT */ - {ST_PLCI_ACTIVE, ST_PLCI_DISCONNECTING, EV_PLCI_DISCONNECT_REQ, NULL}, - {ST_PLCI_ACTIVE, ST_PLCI_DISCONNECTING, EV_PLCI_FACILITY_IND_DOWN, NULL}, - {ST_PLCI_ACTIVE, ST_PLCI_DISCONNECTED, EV_PLCI_DISCONNECT_IND, NULL}, - {ST_PLCI_ACTIVE, ST_PLCI_HELD, EV_PLCI_HOLD_IND, NULL}, - {ST_PLCI_ACTIVE, ST_PLCI_DISCONNECTING, EV_PLCI_SUSPEND_IND, NULL}, - /* P-2 */ - {ST_PLCI_INCOMING, ST_PLCI_DISCONNECTING, EV_PLCI_CONNECT_REJECT, NULL}, - {ST_PLCI_INCOMING, ST_PLCI_FACILITY_IND, EV_PLCI_FACILITY_IND_UP, NULL}, - {ST_PLCI_INCOMING, ST_PLCI_ACCEPTING, EV_PLCI_CONNECT_RESP, NULL}, - {ST_PLCI_INCOMING, ST_PLCI_DISCONNECTING, EV_PLCI_DISCONNECT_REQ, NULL}, - {ST_PLCI_INCOMING, ST_PLCI_DISCONNECTING, EV_PLCI_FACILITY_IND_DOWN, NULL}, - {ST_PLCI_INCOMING, ST_PLCI_DISCONNECTED, EV_PLCI_DISCONNECT_IND, NULL}, - {ST_PLCI_INCOMING, ST_PLCI_DISCONNECTING, EV_PLCI_CD_IND, NULL}, - /* P-3 */ - {ST_PLCI_FACILITY_IND, ST_PLCI_DISCONNECTING, EV_PLCI_CONNECT_REJECT, NULL}, - {ST_PLCI_FACILITY_IND, ST_PLCI_ACCEPTING, EV_PLCI_CONNECT_ACTIVE_IND, NULL}, - {ST_PLCI_FACILITY_IND, ST_PLCI_DISCONNECTING, EV_PLCI_DISCONNECT_REQ, NULL}, - {ST_PLCI_FACILITY_IND, ST_PLCI_DISCONNECTING, EV_PLCI_FACILITY_IND_DOWN, NULL}, - {ST_PLCI_FACILITY_IND, ST_PLCI_DISCONNECTED, EV_PLCI_DISCONNECT_IND, NULL}, - /* P-4 */ - {ST_PLCI_ACCEPTING, ST_PLCI_ACTIVE, EV_PLCI_CONNECT_ACTIVE_IND, NULL}, - {ST_PLCI_ACCEPTING, ST_PLCI_DISCONNECTING, EV_PLCI_DISCONNECT_REQ, NULL}, - {ST_PLCI_ACCEPTING, ST_PLCI_DISCONNECTING, EV_PLCI_FACILITY_IND_DOWN, NULL}, - {ST_PLCI_ACCEPTING, ST_PLCI_DISCONNECTED, EV_PLCI_DISCONNECT_IND, NULL}, - /* P-5 */ - {ST_PLCI_DISCONNECTING, ST_PLCI_DISCONNECTED, EV_PLCI_DISCONNECT_IND, NULL}, - /* P-6 */ - {ST_PLCI_DISCONNECTED, ST_PLCI_NONE, EV_PLCI_DISCONNECT_RESP, p0}, - /* P-0.Res */ - {ST_PLCI_RESUMEING, ST_PLCI_NONE, EV_PLCI_RESUME_CONF_ERROR, p0}, - {ST_PLCI_RESUMEING, ST_PLCI_RESUME, 
EV_PLCI_RESUME_CONF_OK, NULL}, - /* P-RES */ - {ST_PLCI_RESUME, ST_PLCI_ACTIVE, EV_PLCI_RESUME_IND, NULL}, - /* P-HELD */ - {ST_PLCI_HELD, ST_PLCI_ACTIVE, EV_PLCI_RETRIEVE_IND, NULL}, - {}, -}; - -static void plci_change_state(capidrv_contr *card, capidrv_plci *plci, int event) -{ - struct plcistatechange *p = plcitable; - while (p->event) { - if (plci->state == p->actstate && p->event == event) { - if (debugmode) - printk(KERN_DEBUG "capidrv-%d: plci_change_state:0x%x %d -> %d\n", - card->contrnr, plci->plci, plci->state, p->nextstate); - plci->state = p->nextstate; - if (p->changefunc) - p->changefunc(card, plci); - return; - } - p++; - } - printk(KERN_ERR "capidrv-%d: plci_change_state:0x%x state=%d event=%d ????\n", - card->contrnr, plci->plci, plci->state, event); -} - -/* ------------------------------------------------------------------ */ - -static _cmsg cmsg; - -static void n0(capidrv_contr *card, capidrv_ncci *ncci) -{ - isdn_ctrl cmd; - - capi_fill_DISCONNECT_REQ(&cmsg, - global.ap.applid, - card->msgid++, - ncci->plcip->plci, - NULL, /* BChannelinformation */ - NULL, /* Keypadfacility */ - NULL, /* Useruserdata */ /* $$$$ */ - NULL /* Facilitydataarray */ - ); - plci_change_state(card, ncci->plcip, EV_PLCI_DISCONNECT_REQ); - send_message(card, &cmsg); - - cmd.command = ISDN_STAT_BHUP; - cmd.driver = card->myid; - cmd.arg = ncci->chan; - card->interface.statcallb(&cmd); - free_ncci(card, ncci); -} - -/* ------------------------------------------------------------------ */ - -struct nccistatechange { - int actstate; - int nextstate; - int event; - void (*changefunc)(capidrv_contr *card, capidrv_ncci *ncci); -}; - -static struct nccistatechange nccitable[] = -{ - /* N-0 */ - {ST_NCCI_NONE, ST_NCCI_OUTGOING, EV_NCCI_CONNECT_B3_REQ, NULL}, - {ST_NCCI_NONE, ST_NCCI_INCOMING, EV_NCCI_CONNECT_B3_IND, NULL}, - /* N-0.1 */ - {ST_NCCI_OUTGOING, ST_NCCI_ALLOCATED, EV_NCCI_CONNECT_B3_CONF_OK, NULL}, - {ST_NCCI_OUTGOING, ST_NCCI_NONE, EV_NCCI_CONNECT_B3_CONF_ERROR, 
n0}, - /* N-1 */ - {ST_NCCI_INCOMING, ST_NCCI_DISCONNECTING, EV_NCCI_CONNECT_B3_REJECT, NULL}, - {ST_NCCI_INCOMING, ST_NCCI_ALLOCATED, EV_NCCI_CONNECT_B3_RESP, NULL}, - {ST_NCCI_INCOMING, ST_NCCI_DISCONNECTED, EV_NCCI_DISCONNECT_B3_IND, NULL}, - {ST_NCCI_INCOMING, ST_NCCI_DISCONNECTING, EV_NCCI_DISCONNECT_B3_REQ, NULL}, - /* N-2 */ - {ST_NCCI_ALLOCATED, ST_NCCI_ACTIVE, EV_NCCI_CONNECT_B3_ACTIVE_IND, NULL}, - {ST_NCCI_ALLOCATED, ST_NCCI_DISCONNECTED, EV_NCCI_DISCONNECT_B3_IND, NULL}, - {ST_NCCI_ALLOCATED, ST_NCCI_DISCONNECTING, EV_NCCI_DISCONNECT_B3_REQ, NULL}, - /* N-ACT */ - {ST_NCCI_ACTIVE, ST_NCCI_ACTIVE, EV_NCCI_RESET_B3_IND, NULL}, - {ST_NCCI_ACTIVE, ST_NCCI_RESETING, EV_NCCI_RESET_B3_REQ, NULL}, - {ST_NCCI_ACTIVE, ST_NCCI_DISCONNECTED, EV_NCCI_DISCONNECT_B3_IND, NULL}, - {ST_NCCI_ACTIVE, ST_NCCI_DISCONNECTING, EV_NCCI_DISCONNECT_B3_REQ, NULL}, - /* N-3 */ - {ST_NCCI_RESETING, ST_NCCI_ACTIVE, EV_NCCI_RESET_B3_IND, NULL}, - {ST_NCCI_RESETING, ST_NCCI_DISCONNECTED, EV_NCCI_DISCONNECT_B3_IND, NULL}, - {ST_NCCI_RESETING, ST_NCCI_DISCONNECTING, EV_NCCI_DISCONNECT_B3_REQ, NULL}, - /* N-4 */ - {ST_NCCI_DISCONNECTING, ST_NCCI_DISCONNECTED, EV_NCCI_DISCONNECT_B3_IND, NULL}, - {ST_NCCI_DISCONNECTING, ST_NCCI_PREVIOUS, EV_NCCI_DISCONNECT_B3_CONF_ERROR, NULL}, - /* N-5 */ - {ST_NCCI_DISCONNECTED, ST_NCCI_NONE, EV_NCCI_DISCONNECT_B3_RESP, n0}, - {}, -}; - -static void ncci_change_state(capidrv_contr *card, capidrv_ncci *ncci, int event) -{ - struct nccistatechange *p = nccitable; - while (p->event) { - if (ncci->state == p->actstate && p->event == event) { - if (debugmode) - printk(KERN_DEBUG "capidrv-%d: ncci_change_state:0x%x %d -> %d\n", - card->contrnr, ncci->ncci, ncci->state, p->nextstate); - if (p->nextstate == ST_NCCI_PREVIOUS) { - ncci->state = ncci->oldstate; - ncci->oldstate = p->actstate; - } else { - ncci->oldstate = p->actstate; - ncci->state = p->nextstate; - } - if (p->changefunc) - p->changefunc(card, ncci); - return; - } - p++; - } - printk(KERN_ERR 
"capidrv-%d: ncci_change_state:0x%x state=%d event=%d ????\n", - card->contrnr, ncci->ncci, ncci->state, event); -} - -/* ------------------------------------------------------------------- */ - -static inline int new_bchan(capidrv_contr *card) -{ - int i; - for (i = 0; i < card->nbchan; i++) { - if (card->bchans[i].plcip == NULL) { - card->bchans[i].disconnecting = 0; - return i; - } - } - return -1; -} - -/* ------------------------------------------------------------------- */ -static char *capi_info2str(u16 reason) -{ -#ifndef CONFIG_ISDN_CAPI_CAPIDRV_VERBOSE - return ".."; -#else - switch (reason) { - -/*-- informative values (corresponding message was processed) -----*/ - case 0x0001: - return "NCPI not supported by current protocol, NCPI ignored"; - case 0x0002: - return "Flags not supported by current protocol, flags ignored"; - case 0x0003: - return "Alert already sent by another application"; - -/*-- error information concerning CAPI_REGISTER -----*/ - case 0x1001: - return "Too many applications"; - case 0x1002: - return "Logical block size too small, must be at least 128 Bytes"; - case 0x1003: - return "Buffer exceeds 64 kByte"; - case 0x1004: - return "Message buffer size too small, must be at least 1024 Bytes"; - case 0x1005: - return "Max. 
number of logical connections not supported"; - case 0x1006: - return "Reserved"; - case 0x1007: - return "The message could not be accepted because of an internal busy condition"; - case 0x1008: - return "OS resource error (no memory ?)"; - case 0x1009: - return "CAPI not installed"; - case 0x100A: - return "Controller does not support external equipment"; - case 0x100B: - return "Controller does only support external equipment"; - -/*-- error information concerning message exchange functions -----*/ - case 0x1101: - return "Illegal application number"; - case 0x1102: - return "Illegal command or subcommand or message length less than 12 bytes"; - case 0x1103: - return "The message could not be accepted because of a queue full condition !! The error code does not imply that CAPI cannot receive messages directed to another controller, PLCI or NCCI"; - case 0x1104: - return "Queue is empty"; - case 0x1105: - return "Queue overflow, a message was lost !! This indicates a configuration error. 
The only recovery from this error is to perform a CAPI_RELEASE"; - case 0x1106: - return "Unknown notification parameter"; - case 0x1107: - return "The Message could not be accepted because of an internal busy condition"; - case 0x1108: - return "OS Resource error (no memory ?)"; - case 0x1109: - return "CAPI not installed"; - case 0x110A: - return "Controller does not support external equipment"; - case 0x110B: - return "Controller does only support external equipment"; - -/*-- error information concerning resource / coding problems -----*/ - case 0x2001: - return "Message not supported in current state"; - case 0x2002: - return "Illegal Controller / PLCI / NCCI"; - case 0x2003: - return "Out of PLCI"; - case 0x2004: - return "Out of NCCI"; - case 0x2005: - return "Out of LISTEN"; - case 0x2006: - return "Out of FAX resources (protocol T.30)"; - case 0x2007: - return "Illegal message parameter coding"; - -/*-- error information concerning requested services -----*/ - case 0x3001: - return "B1 protocol not supported"; - case 0x3002: - return "B2 protocol not supported"; - case 0x3003: - return "B3 protocol not supported"; - case 0x3004: - return "B1 protocol parameter not supported"; - case 0x3005: - return "B2 protocol parameter not supported"; - case 0x3006: - return "B3 protocol parameter not supported"; - case 0x3007: - return "B protocol combination not supported"; - case 0x3008: - return "NCPI not supported"; - case 0x3009: - return "CIP Value unknown"; - case 0x300A: - return "Flags not supported (reserved bits)"; - case 0x300B: - return "Facility not supported"; - case 0x300C: - return "Data length not supported by current protocol"; - case 0x300D: - return "Reset procedure not supported by current protocol"; - -/*-- informations about the clearing of a physical connection -----*/ - case 0x3301: - return "Protocol error layer 1 (broken line or B-channel removed by signalling protocol)"; - case 0x3302: - return "Protocol error layer 2"; - case 0x3303: - 
return "Protocol error layer 3"; - case 0x3304: - return "Another application got that call"; -/*-- T.30 specific reasons -----*/ - case 0x3311: - return "Connecting not successful (remote station is no FAX G3 machine)"; - case 0x3312: - return "Connecting not successful (training error)"; - case 0x3313: - return "Disconnected before transfer (remote station does not support transfer mode, e.g. resolution)"; - case 0x3314: - return "Disconnected during transfer (remote abort)"; - case 0x3315: - return "Disconnected during transfer (remote procedure error, e.g. unsuccessful repetition of T.30 commands)"; - case 0x3316: - return "Disconnected during transfer (local tx data underrun)"; - case 0x3317: - return "Disconnected during transfer (local rx data overflow)"; - case 0x3318: - return "Disconnected during transfer (local abort)"; - case 0x3319: - return "Illegal parameter coding (e.g. SFF coding error)"; - -/*-- disconnect causes from the network according to ETS 300 102-1/Q.931 -----*/ - case 0x3481: return "Unallocated (unassigned) number"; - case 0x3482: return "No route to specified transit network"; - case 0x3483: return "No route to destination"; - case 0x3486: return "Channel unacceptable"; - case 0x3487: - return "Call awarded and being delivered in an established channel"; - case 0x3490: return "Normal call clearing"; - case 0x3491: return "User busy"; - case 0x3492: return "No user responding"; - case 0x3493: return "No answer from user (user alerted)"; - case 0x3495: return "Call rejected"; - case 0x3496: return "Number changed"; - case 0x349A: return "Non-selected user clearing"; - case 0x349B: return "Destination out of order"; - case 0x349C: return "Invalid number format"; - case 0x349D: return "Facility rejected"; - case 0x349E: return "Response to STATUS ENQUIRY"; - case 0x349F: return "Normal, unspecified"; - case 0x34A2: return "No circuit / channel available"; - case 0x34A6: return "Network out of order"; - case 0x34A9: return "Temporary 
failure"; - case 0x34AA: return "Switching equipment congestion"; - case 0x34AB: return "Access information discarded"; - case 0x34AC: return "Requested circuit / channel not available"; - case 0x34AF: return "Resources unavailable, unspecified"; - case 0x34B1: return "Quality of service unavailable"; - case 0x34B2: return "Requested facility not subscribed"; - case 0x34B9: return "Bearer capability not authorized"; - case 0x34BA: return "Bearer capability not presently available"; - case 0x34BF: return "Service or option not available, unspecified"; - case 0x34C1: return "Bearer capability not implemented"; - case 0x34C2: return "Channel type not implemented"; - case 0x34C5: return "Requested facility not implemented"; - case 0x34C6: return "Only restricted digital information bearer capability is available"; - case 0x34CF: return "Service or option not implemented, unspecified"; - case 0x34D1: return "Invalid call reference value"; - case 0x34D2: return "Identified channel does not exist"; - case 0x34D3: return "A suspended call exists, but this call identity does not"; - case 0x34D4: return "Call identity in use"; - case 0x34D5: return "No call suspended"; - case 0x34D6: return "Call having the requested call identity has been cleared"; - case 0x34D8: return "Incompatible destination"; - case 0x34DB: return "Invalid transit network selection"; - case 0x34DF: return "Invalid message, unspecified"; - case 0x34E0: return "Mandatory information element is missing"; - case 0x34E1: return "Message type non-existent or not implemented"; - case 0x34E2: return "Message not compatible with call state or message type non-existent or not implemented"; - case 0x34E3: return "Information element non-existent or not implemented"; - case 0x34E4: return "Invalid information element contents"; - case 0x34E5: return "Message not compatible with call state"; - case 0x34E6: return "Recovery on timer expiry"; - case 0x34EF: return "Protocol error, unspecified"; - case 0x34FF: return 
"Interworking, unspecified"; - - default: return "No additional information"; - } -#endif -} - -static void handle_controller(_cmsg *cmsg) -{ - capidrv_contr *card = findcontrbynumber(cmsg->adr.adrController & 0x7f); - - if (!card) { - printk(KERN_ERR "capidrv: %s from unknown controller 0x%x\n", - capi_cmd2str(cmsg->Command, cmsg->Subcommand), - cmsg->adr.adrController & 0x7f); - return; - } - switch (CAPICMD(cmsg->Command, cmsg->Subcommand)) { - - case CAPI_LISTEN_CONF: /* Controller */ - if (debugmode) - printk(KERN_DEBUG "capidrv-%d: listenconf Info=0x%4x (%s) cipmask=0x%x\n", - card->contrnr, cmsg->Info, capi_info2str(cmsg->Info), card->cipmask); - if (cmsg->Info) { - listen_change_state(card, EV_LISTEN_CONF_ERROR); - } else if (card->cipmask == 0) { - listen_change_state(card, EV_LISTEN_CONF_EMPTY); - } else { - listen_change_state(card, EV_LISTEN_CONF_OK); - } - break; - - case CAPI_MANUFACTURER_IND: /* Controller */ - if (cmsg->ManuID == 0x214D5641 - && cmsg->Class == 0 - && cmsg->Function == 1) { - u8 *data = cmsg->ManuData + 3; - u16 len = cmsg->ManuData[0]; - u16 layer; - int direction; - if (len == 255) { - len = (cmsg->ManuData[1] | (cmsg->ManuData[2] << 8)); - data += 2; - } - len -= 2; - layer = ((*(data - 1)) << 8) | *(data - 2); - if (layer & 0x300) - direction = (layer & 0x200) ? 0 : 1; - else direction = (layer & 0x800) ? 
0 : 1; - if (layer & 0x0C00) { - if ((layer & 0xff) == 0x80) { - handle_dtrace_data(card, direction, 1, data, len); - break; - } - } else if ((layer & 0xff) < 0x80) { - handle_dtrace_data(card, direction, 0, data, len); - break; - } - printk(KERN_INFO "capidrv-%d: %s from controller 0x%x layer 0x%x, ignored\n", - card->contrnr, - capi_cmd2str(cmsg->Command, cmsg->Subcommand), - cmsg->adr.adrController, layer); - break; - } - goto ignored; - case CAPI_MANUFACTURER_CONF: /* Controller */ - if (cmsg->ManuID == 0x214D5641) { - char *s = NULL; - switch (cmsg->Class) { - case 0: break; - case 1: s = "unknown class"; break; - case 2: s = "unknown function"; break; - default: s = "unknown error"; break; - } - if (s) - printk(KERN_INFO "capidrv-%d: %s from controller 0x%x function %d: %s\n", - card->contrnr, - capi_cmd2str(cmsg->Command, cmsg->Subcommand), - cmsg->adr.adrController, - cmsg->Function, s); - break; - } - goto ignored; - case CAPI_FACILITY_IND: /* Controller/plci/ncci */ - goto ignored; - case CAPI_FACILITY_CONF: /* Controller/plci/ncci */ - goto ignored; - case CAPI_INFO_IND: /* Controller/plci */ - goto ignored; - case CAPI_INFO_CONF: /* Controller/plci */ - goto ignored; - - default: - printk(KERN_ERR "capidrv-%d: got %s from controller 0x%x ???", - card->contrnr, - capi_cmd2str(cmsg->Command, cmsg->Subcommand), - cmsg->adr.adrController); - } - return; - -ignored: - printk(KERN_INFO "capidrv-%d: %s from controller 0x%x ignored\n", - card->contrnr, - capi_cmd2str(cmsg->Command, cmsg->Subcommand), - cmsg->adr.adrController); -} - -static void handle_incoming_call(capidrv_contr *card, _cmsg *cmsg) -{ - capidrv_plci *plcip; - capidrv_bchan *bchan; - isdn_ctrl cmd; - int chan; - - if ((chan = new_bchan(card)) == -1) { - printk(KERN_ERR "capidrv-%d: incoming call on not existing bchan ?\n", card->contrnr); - return; - } - bchan = &card->bchans[chan]; - if ((plcip = new_plci(card, chan)) == NULL) { - printk(KERN_ERR "capidrv-%d: incoming call: no memory, 
sorry.\n", card->contrnr); - return; - } - bchan->incoming = 1; - plcip->plci = cmsg->adr.adrPLCI; - plci_change_state(card, plcip, EV_PLCI_CONNECT_IND); - - cmd.command = ISDN_STAT_ICALL; - cmd.driver = card->myid; - cmd.arg = chan; - memset(&cmd.parm.setup, 0, sizeof(cmd.parm.setup)); - strncpy(cmd.parm.setup.phone, - cmsg->CallingPartyNumber + 3, - cmsg->CallingPartyNumber[0] - 2); - strncpy(cmd.parm.setup.eazmsn, - cmsg->CalledPartyNumber + 2, - cmsg->CalledPartyNumber[0] - 1); - cmd.parm.setup.si1 = cip2si1(cmsg->CIPValue); - cmd.parm.setup.si2 = cip2si2(cmsg->CIPValue); - cmd.parm.setup.plan = cmsg->CallingPartyNumber[1]; - cmd.parm.setup.screen = cmsg->CallingPartyNumber[2]; - - printk(KERN_INFO "capidrv-%d: incoming call %s,%d,%d,%s\n", - card->contrnr, - cmd.parm.setup.phone, - cmd.parm.setup.si1, - cmd.parm.setup.si2, - cmd.parm.setup.eazmsn); - - if (cmd.parm.setup.si1 == 1 && cmd.parm.setup.si2 != 0) { - printk(KERN_INFO "capidrv-%d: patching si2=%d to 0 for VBOX\n", - card->contrnr, - cmd.parm.setup.si2); - cmd.parm.setup.si2 = 0; - } - - switch (card->interface.statcallb(&cmd)) { - case 0: - case 3: - /* No device matching this call. - * and isdn_common.c has send a HANGUP command - * which is ignored in state ST_PLCI_INCOMING, - * so we send RESP to ignore the call - */ - capi_cmsg_answer(cmsg); - cmsg->Reject = 1; /* ignore */ - plci_change_state(card, plcip, EV_PLCI_CONNECT_REJECT); - send_message(card, cmsg); - printk(KERN_INFO "capidrv-%d: incoming call %s,%d,%d,%s ignored\n", - card->contrnr, - cmd.parm.setup.phone, - cmd.parm.setup.si1, - cmd.parm.setup.si2, - cmd.parm.setup.eazmsn); - break; - case 1: - /* At least one device matching this call (RING on ttyI) - * HL-driver may send ALERTING on the D-channel in this - * case. - * really means: RING on ttyI or a net interface - * accepted this call already. - * - * If the call was accepted, state has already changed, - * and CONNECT_RESP already sent. 
- */ - if (plcip->state == ST_PLCI_INCOMING) { - printk(KERN_INFO "capidrv-%d: incoming call %s,%d,%d,%s tty alerting\n", - card->contrnr, - cmd.parm.setup.phone, - cmd.parm.setup.si1, - cmd.parm.setup.si2, - cmd.parm.setup.eazmsn); - capi_fill_ALERT_REQ(cmsg, - global.ap.applid, - card->msgid++, - plcip->plci, /* adr */ - NULL,/* BChannelinformation */ - NULL,/* Keypadfacility */ - NULL,/* Useruserdata */ - NULL /* Facilitydataarray */ - ); - plcip->msgid = cmsg->Messagenumber; - send_message(card, cmsg); - } else { - printk(KERN_INFO "capidrv-%d: incoming call %s,%d,%d,%s on netdev\n", - card->contrnr, - cmd.parm.setup.phone, - cmd.parm.setup.si1, - cmd.parm.setup.si2, - cmd.parm.setup.eazmsn); - } - break; - - case 2: /* Call will be rejected. */ - capi_cmsg_answer(cmsg); - cmsg->Reject = 2; /* reject call, normal call clearing */ - plci_change_state(card, plcip, EV_PLCI_CONNECT_REJECT); - send_message(card, cmsg); - break; - - default: - /* An error happened. (Invalid parameters for example.) 
*/ - capi_cmsg_answer(cmsg); - cmsg->Reject = 8; /* reject call, - destination out of order */ - plci_change_state(card, plcip, EV_PLCI_CONNECT_REJECT); - send_message(card, cmsg); - break; - } - return; -} - -static void handle_plci(_cmsg *cmsg) -{ - capidrv_contr *card = findcontrbynumber(cmsg->adr.adrController & 0x7f); - capidrv_plci *plcip; - isdn_ctrl cmd; - _cdebbuf *cdb; - - if (!card) { - printk(KERN_ERR "capidrv: %s from unknown controller 0x%x\n", - capi_cmd2str(cmsg->Command, cmsg->Subcommand), - cmsg->adr.adrController & 0x7f); - return; - } - switch (CAPICMD(cmsg->Command, cmsg->Subcommand)) { - - case CAPI_DISCONNECT_IND: /* plci */ - if (cmsg->Reason) { - printk(KERN_INFO "capidrv-%d: %s reason 0x%x (%s) for plci 0x%x\n", - card->contrnr, - capi_cmd2str(cmsg->Command, cmsg->Subcommand), - cmsg->Reason, capi_info2str(cmsg->Reason), cmsg->adr.adrPLCI); - } - if (!(plcip = find_plci_by_plci(card, cmsg->adr.adrPLCI))) { - capi_cmsg_answer(cmsg); - send_message(card, cmsg); - goto notfound; - } - card->bchans[plcip->chan].disconnecting = 1; - plci_change_state(card, plcip, EV_PLCI_DISCONNECT_IND); - capi_cmsg_answer(cmsg); - plci_change_state(card, plcip, EV_PLCI_DISCONNECT_RESP); - send_message(card, cmsg); - break; - - case CAPI_DISCONNECT_CONF: /* plci */ - if (cmsg->Info) { - printk(KERN_INFO "capidrv-%d: %s info 0x%x (%s) for plci 0x%x\n", - card->contrnr, - capi_cmd2str(cmsg->Command, cmsg->Subcommand), - cmsg->Info, capi_info2str(cmsg->Info), - cmsg->adr.adrPLCI); - } - if (!(plcip = find_plci_by_plci(card, cmsg->adr.adrPLCI))) - goto notfound; - - card->bchans[plcip->chan].disconnecting = 1; - break; - - case CAPI_ALERT_CONF: /* plci */ - if (cmsg->Info) { - printk(KERN_INFO "capidrv-%d: %s info 0x%x (%s) for plci 0x%x\n", - card->contrnr, - capi_cmd2str(cmsg->Command, cmsg->Subcommand), - cmsg->Info, capi_info2str(cmsg->Info), - cmsg->adr.adrPLCI); - } - break; - - case CAPI_CONNECT_IND: /* plci */ - handle_incoming_call(card, cmsg); - break; - 
- case CAPI_CONNECT_CONF: /* plci */ - if (cmsg->Info) { - printk(KERN_INFO "capidrv-%d: %s info 0x%x (%s) for plci 0x%x\n", - card->contrnr, - capi_cmd2str(cmsg->Command, cmsg->Subcommand), - cmsg->Info, capi_info2str(cmsg->Info), - cmsg->adr.adrPLCI); - } - if (!(plcip = find_plci_by_msgid(card, cmsg->Messagenumber))) - goto notfound; - - plcip->plci = cmsg->adr.adrPLCI; - if (cmsg->Info) { - plci_change_state(card, plcip, EV_PLCI_CONNECT_CONF_ERROR); - } else { - plci_change_state(card, plcip, EV_PLCI_CONNECT_CONF_OK); - } - break; - - case CAPI_CONNECT_ACTIVE_IND: /* plci */ - - if (!(plcip = find_plci_by_plci(card, cmsg->adr.adrPLCI))) - goto notfound; - - if (card->bchans[plcip->chan].incoming) { - capi_cmsg_answer(cmsg); - plci_change_state(card, plcip, EV_PLCI_CONNECT_ACTIVE_IND); - send_message(card, cmsg); - } else { - capidrv_ncci *nccip; - capi_cmsg_answer(cmsg); - send_message(card, cmsg); - - nccip = new_ncci(card, plcip, cmsg->adr.adrPLCI); - - if (!nccip) { - printk(KERN_ERR "capidrv-%d: no mem for ncci, sorry\n", card->contrnr); - break; /* $$$$ */ - } - capi_fill_CONNECT_B3_REQ(cmsg, - global.ap.applid, - card->msgid++, - plcip->plci, /* adr */ - NULL /* NCPI */ - ); - nccip->msgid = cmsg->Messagenumber; - plci_change_state(card, plcip, - EV_PLCI_CONNECT_ACTIVE_IND); - ncci_change_state(card, nccip, EV_NCCI_CONNECT_B3_REQ); - send_message(card, cmsg); - cmd.command = ISDN_STAT_DCONN; - cmd.driver = card->myid; - cmd.arg = plcip->chan; - card->interface.statcallb(&cmd); - } - break; - - case CAPI_INFO_IND: /* Controller/plci */ - - if (!(plcip = find_plci_by_plci(card, cmsg->adr.adrPLCI))) - goto notfound; - - if (cmsg->InfoNumber == 0x4000) { - if (cmsg->InfoElement[0] == 4) { - cmd.command = ISDN_STAT_CINF; - cmd.driver = card->myid; - cmd.arg = plcip->chan; - sprintf(cmd.parm.num, "%lu", - (unsigned long) - ((u32) cmsg->InfoElement[1] - | ((u32) (cmsg->InfoElement[2]) << 8) - | ((u32) (cmsg->InfoElement[3]) << 16) - | ((u32) 
(cmsg->InfoElement[4]) << 24))); - card->interface.statcallb(&cmd); - break; - } - } - cdb = capi_cmsg2str(cmsg); - if (cdb) { - printk(KERN_WARNING "capidrv-%d: %s\n", - card->contrnr, cdb->buf); - cdebbuf_free(cdb); - } else - printk(KERN_WARNING "capidrv-%d: CAPI_INFO_IND InfoNumber %x not handled\n", - card->contrnr, cmsg->InfoNumber); - - break; - - case CAPI_CONNECT_ACTIVE_CONF: /* plci */ - goto ignored; - case CAPI_SELECT_B_PROTOCOL_CONF: /* plci */ - goto ignored; - case CAPI_FACILITY_IND: /* Controller/plci/ncci */ - goto ignored; - case CAPI_FACILITY_CONF: /* Controller/plci/ncci */ - goto ignored; - - case CAPI_INFO_CONF: /* Controller/plci */ - goto ignored; - - default: - printk(KERN_ERR "capidrv-%d: got %s for plci 0x%x ???", - card->contrnr, - capi_cmd2str(cmsg->Command, cmsg->Subcommand), - cmsg->adr.adrPLCI); - } - return; -ignored: - printk(KERN_INFO "capidrv-%d: %s for plci 0x%x ignored\n", - card->contrnr, - capi_cmd2str(cmsg->Command, cmsg->Subcommand), - cmsg->adr.adrPLCI); - return; -notfound: - printk(KERN_ERR "capidrv-%d: %s: plci 0x%x not found\n", - card->contrnr, - capi_cmd2str(cmsg->Command, cmsg->Subcommand), - cmsg->adr.adrPLCI); - return; -} - -static void handle_ncci(_cmsg *cmsg) -{ - capidrv_contr *card = findcontrbynumber(cmsg->adr.adrController & 0x7f); - capidrv_plci *plcip; - capidrv_ncci *nccip; - isdn_ctrl cmd; - int len; - - if (!card) { - printk(KERN_ERR "capidrv: %s from unknown controller 0x%x\n", - capi_cmd2str(cmsg->Command, cmsg->Subcommand), - cmsg->adr.adrController & 0x7f); - return; - } - switch (CAPICMD(cmsg->Command, cmsg->Subcommand)) { - - case CAPI_CONNECT_B3_ACTIVE_IND: /* ncci */ - if (!(nccip = find_ncci(card, cmsg->adr.adrNCCI))) - goto notfound; - - capi_cmsg_answer(cmsg); - ncci_change_state(card, nccip, EV_NCCI_CONNECT_B3_ACTIVE_IND); - send_message(card, cmsg); - - cmd.command = ISDN_STAT_BCONN; - cmd.driver = card->myid; - cmd.arg = nccip->chan; - card->interface.statcallb(&cmd); - - printk(KERN_INFO 
"capidrv-%d: chan %d up with ncci 0x%x\n", - card->contrnr, nccip->chan, nccip->ncci); - break; - - case CAPI_CONNECT_B3_ACTIVE_CONF: /* ncci */ - goto ignored; - - case CAPI_CONNECT_B3_IND: /* ncci */ - - plcip = find_plci_by_ncci(card, cmsg->adr.adrNCCI); - if (plcip) { - nccip = new_ncci(card, plcip, cmsg->adr.adrNCCI); - if (nccip) { - ncci_change_state(card, nccip, EV_NCCI_CONNECT_B3_IND); - capi_fill_CONNECT_B3_RESP(cmsg, - global.ap.applid, - card->msgid++, - nccip->ncci, /* adr */ - 0, /* Reject */ - NULL /* NCPI */ - ); - ncci_change_state(card, nccip, EV_NCCI_CONNECT_B3_RESP); - send_message(card, cmsg); - break; - } - printk(KERN_ERR "capidrv-%d: no mem for ncci, sorry\n", card->contrnr); - } else { - printk(KERN_ERR "capidrv-%d: %s: plci for ncci 0x%x not found\n", - card->contrnr, - capi_cmd2str(cmsg->Command, cmsg->Subcommand), - cmsg->adr.adrNCCI); - } - capi_fill_CONNECT_B3_RESP(cmsg, - global.ap.applid, - card->msgid++, - cmsg->adr.adrNCCI, - 2, /* Reject */ - NULL /* NCPI */ - ); - send_message(card, cmsg); - break; - - case CAPI_CONNECT_B3_CONF: /* ncci */ - - if (!(nccip = find_ncci_by_msgid(card, - cmsg->adr.adrNCCI, - cmsg->Messagenumber))) - goto notfound; - - nccip->ncci = cmsg->adr.adrNCCI; - if (cmsg->Info) { - printk(KERN_INFO "capidrv-%d: %s info 0x%x (%s) for ncci 0x%x\n", - card->contrnr, - capi_cmd2str(cmsg->Command, cmsg->Subcommand), - cmsg->Info, capi_info2str(cmsg->Info), - cmsg->adr.adrNCCI); - } - - if (cmsg->Info) - ncci_change_state(card, nccip, EV_NCCI_CONNECT_B3_CONF_ERROR); - else - ncci_change_state(card, nccip, EV_NCCI_CONNECT_B3_CONF_OK); - break; - - case CAPI_CONNECT_B3_T90_ACTIVE_IND: /* ncci */ - capi_cmsg_answer(cmsg); - send_message(card, cmsg); - break; - - case CAPI_DATA_B3_IND: /* ncci */ - /* handled in handle_data() */ - goto ignored; - - case CAPI_DATA_B3_CONF: /* ncci */ - if (cmsg->Info) { - printk(KERN_WARNING "CAPI_DATA_B3_CONF: Info %x - %s\n", - cmsg->Info, capi_info2str(cmsg->Info)); - } - if (!(nccip 
= find_ncci(card, cmsg->adr.adrNCCI))) - goto notfound; - - len = capidrv_del_ack(nccip, cmsg->DataHandle); - if (len < 0) - break; - cmd.command = ISDN_STAT_BSENT; - cmd.driver = card->myid; - cmd.arg = nccip->chan; - cmd.parm.length = len; - card->interface.statcallb(&cmd); - break; - - case CAPI_DISCONNECT_B3_IND: /* ncci */ - if (!(nccip = find_ncci(card, cmsg->adr.adrNCCI))) - goto notfound; - - card->bchans[nccip->chan].disconnecting = 1; - ncci_change_state(card, nccip, EV_NCCI_DISCONNECT_B3_IND); - capi_cmsg_answer(cmsg); - ncci_change_state(card, nccip, EV_NCCI_DISCONNECT_B3_RESP); - send_message(card, cmsg); - break; - - case CAPI_DISCONNECT_B3_CONF: /* ncci */ - if (!(nccip = find_ncci(card, cmsg->adr.adrNCCI))) - goto notfound; - if (cmsg->Info) { - printk(KERN_INFO "capidrv-%d: %s info 0x%x (%s) for ncci 0x%x\n", - card->contrnr, - capi_cmd2str(cmsg->Command, cmsg->Subcommand), - cmsg->Info, capi_info2str(cmsg->Info), - cmsg->adr.adrNCCI); - ncci_change_state(card, nccip, EV_NCCI_DISCONNECT_B3_CONF_ERROR); - } - break; - - case CAPI_RESET_B3_IND: /* ncci */ - if (!(nccip = find_ncci(card, cmsg->adr.adrNCCI))) - goto notfound; - ncci_change_state(card, nccip, EV_NCCI_RESET_B3_IND); - capi_cmsg_answer(cmsg); - send_message(card, cmsg); - break; - - case CAPI_RESET_B3_CONF: /* ncci */ - goto ignored; /* $$$$ */ - - case CAPI_FACILITY_IND: /* Controller/plci/ncci */ - goto ignored; - case CAPI_FACILITY_CONF: /* Controller/plci/ncci */ - goto ignored; - - default: - printk(KERN_ERR "capidrv-%d: got %s for ncci 0x%x ???", - card->contrnr, - capi_cmd2str(cmsg->Command, cmsg->Subcommand), - cmsg->adr.adrNCCI); - } - return; -ignored: - printk(KERN_INFO "capidrv-%d: %s for ncci 0x%x ignored\n", - card->contrnr, - capi_cmd2str(cmsg->Command, cmsg->Subcommand), - cmsg->adr.adrNCCI); - return; -notfound: - printk(KERN_ERR "capidrv-%d: %s: ncci 0x%x not found\n", - card->contrnr, - capi_cmd2str(cmsg->Command, cmsg->Subcommand), - cmsg->adr.adrNCCI); -} - - -static 
void handle_data(_cmsg *cmsg, struct sk_buff *skb) -{ - capidrv_contr *card = findcontrbynumber(cmsg->adr.adrController & 0x7f); - capidrv_ncci *nccip; - - if (!card) { - printk(KERN_ERR "capidrv: %s from unknown controller 0x%x\n", - capi_cmd2str(cmsg->Command, cmsg->Subcommand), - cmsg->adr.adrController & 0x7f); - kfree_skb(skb); - return; - } - if (!(nccip = find_ncci(card, cmsg->adr.adrNCCI))) { - printk(KERN_ERR "capidrv-%d: %s: ncci 0x%x not found\n", - card->contrnr, - capi_cmd2str(cmsg->Command, cmsg->Subcommand), - cmsg->adr.adrNCCI); - kfree_skb(skb); - return; - } - (void) skb_pull(skb, CAPIMSG_LEN(skb->data)); - card->interface.rcvcallb_skb(card->myid, nccip->chan, skb); - capi_cmsg_answer(cmsg); - send_message(card, cmsg); -} - -static _cmsg s_cmsg; - -static void capidrv_recv_message(struct capi20_appl *ap, struct sk_buff *skb) -{ - if (capi_message2cmsg(&s_cmsg, skb->data)) { - printk(KERN_ERR "capidrv: applid=%d: received invalid message\n", - ap->applid); - kfree_skb(skb); - return; - } - if (debugmode > 3) { - _cdebbuf *cdb = capi_cmsg2str(&s_cmsg); - - if (cdb) { - printk(KERN_DEBUG "%s: applid=%d %s\n", __func__, - ap->applid, cdb->buf); - cdebbuf_free(cdb); - } else - printk(KERN_DEBUG "%s: applid=%d %s not traced\n", - __func__, ap->applid, - capi_cmd2str(s_cmsg.Command, s_cmsg.Subcommand)); - } - if (s_cmsg.Command == CAPI_DATA_B3 - && s_cmsg.Subcommand == CAPI_IND) { - handle_data(&s_cmsg, skb); - return; - } - if ((s_cmsg.adr.adrController & 0xffffff00) == 0) - handle_controller(&s_cmsg); - else if ((s_cmsg.adr.adrPLCI & 0xffff0000) == 0) - handle_plci(&s_cmsg); - else - handle_ncci(&s_cmsg); - /* - * data of skb used in s_cmsg, - * free data when s_cmsg is not used again - * thanks to Lars Heete - */ - kfree_skb(skb); -} - -/* ------------------------------------------------------------------- */ - -#define PUTBYTE_TO_STATUS(card, byte) \ - do { \ - *(card)->q931_write++ = (byte); \ - if ((card)->q931_write > (card)->q931_end) \ - 
(card)->q931_write = (card)->q931_buf; \ - } while (0) - -static void handle_dtrace_data(capidrv_contr *card, - int send, int level2, u8 *data, u16 len) -{ - u8 *p, *end; - isdn_ctrl cmd; - - if (!len) { - printk(KERN_DEBUG "capidrv-%d: avmb1_q931_data: len == %d\n", - card->contrnr, len); - return; - } - - if (level2) { - PUTBYTE_TO_STATUS(card, 'D'); - PUTBYTE_TO_STATUS(card, '2'); - PUTBYTE_TO_STATUS(card, send ? '>' : '<'); - PUTBYTE_TO_STATUS(card, ':'); - } else { - PUTBYTE_TO_STATUS(card, 'D'); - PUTBYTE_TO_STATUS(card, '3'); - PUTBYTE_TO_STATUS(card, send ? '>' : '<'); - PUTBYTE_TO_STATUS(card, ':'); - } - - for (p = data, end = data + len; p < end; p++) { - PUTBYTE_TO_STATUS(card, ' '); - PUTBYTE_TO_STATUS(card, hex_asc_hi(*p)); - PUTBYTE_TO_STATUS(card, hex_asc_lo(*p)); - } - PUTBYTE_TO_STATUS(card, '\n'); - - cmd.command = ISDN_STAT_STAVAIL; - cmd.driver = card->myid; - cmd.arg = len * 3 + 5; - card->interface.statcallb(&cmd); -} - -/* ------------------------------------------------------------------- */ - -static _cmsg cmdcmsg; - -static int capidrv_ioctl(isdn_ctrl *c, capidrv_contr *card) -{ - switch (c->arg) { - case 1: - debugmode = (int)(*((unsigned int *)c->parm.num)); - printk(KERN_DEBUG "capidrv-%d: debugmode=%d\n", - card->contrnr, debugmode); - return 0; - default: - printk(KERN_DEBUG "capidrv-%d: capidrv_ioctl(%ld) called ??\n", - card->contrnr, c->arg); - return -EINVAL; - } - return -EINVAL; -} - -/* - * Handle leased lines (CAPI-Bundling) - */ - -struct internal_bchannelinfo { - unsigned short channelalloc; - unsigned short operation; - unsigned char cmask[31]; -}; - -static int decodeFVteln(char *teln, unsigned long *bmaskp, int *activep) -{ - unsigned long bmask = 0; - int active = !0; - char *s; - int i; - - if (strncmp(teln, "FV:", 3) != 0) - return 1; - s = teln + 3; - while (*s && *s == ' ') s++; - if (!*s) return -2; - if (*s == 'p' || *s == 'P') { - active = 0; - s++; - } - if (*s == 'a' || *s == 'A') { - active = !0; - s++; - } - 
while (*s) { - int digit1 = 0; - int digit2 = 0; - char *endp; - - digit1 = simple_strtoul(s, &endp, 10); - if (s == endp) - return -3; - s = endp; - - if (digit1 <= 0 || digit1 > 30) return -4; - if (*s == 0 || *s == ',' || *s == ' ') { - bmask |= (1 << digit1); - digit1 = 0; - if (*s) s++; - continue; - } - if (*s != '-') return -5; - s++; - - digit2 = simple_strtoul(s, &endp, 10); - if (s == endp) - return -3; - s = endp; - - if (digit2 <= 0 || digit2 > 30) return -4; - if (*s == 0 || *s == ',' || *s == ' ') { - if (digit1 > digit2) - for (i = digit2; i <= digit1; i++) - bmask |= (1 << i); - else - for (i = digit1; i <= digit2; i++) - bmask |= (1 << i); - digit1 = digit2 = 0; - if (*s) s++; - continue; - } - return -6; - } - if (activep) *activep = active; - if (bmaskp) *bmaskp = bmask; - return 0; -} - -static int FVteln2capi20(char *teln, u8 AdditionalInfo[1 + 2 + 2 + 31]) -{ - unsigned long bmask; - int active; - int rc, i; - - rc = decodeFVteln(teln, &bmask, &active); - if (rc) return rc; - /* Length */ - AdditionalInfo[0] = 2 + 2 + 31; - /* Channel: 3 => use channel allocation */ - AdditionalInfo[1] = 3; AdditionalInfo[2] = 0; - /* Operation: 0 => DTE mode, 1 => DCE mode */ - if (active) { - AdditionalInfo[3] = 0; AdditionalInfo[4] = 0; - } else { - AdditionalInfo[3] = 1; AdditionalInfo[4] = 0; - } - /* Channel mask array */ - AdditionalInfo[5] = 0; /* no D-Channel */ - for (i = 1; i <= 30; i++) - AdditionalInfo[5 + i] = (bmask & (1 << i)) ? 
0xff : 0; - return 0; -} - -static int capidrv_command(isdn_ctrl *c, capidrv_contr *card) -{ - isdn_ctrl cmd; - struct capidrv_bchan *bchan; - struct capidrv_plci *plcip; - u8 AdditionalInfo[1 + 2 + 2 + 31]; - int rc, isleasedline = 0; - - if (c->command == ISDN_CMD_IOCTL) - return capidrv_ioctl(c, card); - - switch (c->command) { - case ISDN_CMD_DIAL: { - u8 calling[ISDN_MSNLEN + 3]; - u8 called[ISDN_MSNLEN + 2]; - - if (debugmode) - printk(KERN_DEBUG "capidrv-%d: ISDN_CMD_DIAL(ch=%ld,\"%s,%d,%d,%s\")\n", - card->contrnr, - c->arg, - c->parm.setup.phone, - c->parm.setup.si1, - c->parm.setup.si2, - c->parm.setup.eazmsn); - - bchan = &card->bchans[c->arg % card->nbchan]; - - if (bchan->plcip) { - printk(KERN_ERR "capidrv-%d: dail ch=%ld,\"%s,%d,%d,%s\" in use (plci=0x%x)\n", - card->contrnr, - c->arg, - c->parm.setup.phone, - c->parm.setup.si1, - c->parm.setup.si2, - c->parm.setup.eazmsn, - bchan->plcip->plci); - return 0; - } - bchan->si1 = c->parm.setup.si1; - bchan->si2 = c->parm.setup.si2; - - strncpy(bchan->num, c->parm.setup.phone, sizeof(bchan->num)); - strncpy(bchan->mynum, c->parm.setup.eazmsn, sizeof(bchan->mynum)); - rc = FVteln2capi20(bchan->num, AdditionalInfo); - isleasedline = (rc == 0); - if (rc < 0) - printk(KERN_ERR "capidrv-%d: WARNING: invalid leased linedefinition \"%s\"\n", card->contrnr, bchan->num); - - if (isleasedline) { - calling[0] = 0; - called[0] = 0; - if (debugmode) - printk(KERN_DEBUG "capidrv-%d: connecting leased line\n", card->contrnr); - } else { - calling[0] = strlen(bchan->mynum) + 2; - calling[1] = 0; - calling[2] = 0x80; - strncpy(calling + 3, bchan->mynum, ISDN_MSNLEN); - called[0] = strlen(bchan->num) + 1; - called[1] = 0x80; - strncpy(called + 2, bchan->num, ISDN_MSNLEN); - } - - capi_fill_CONNECT_REQ(&cmdcmsg, - global.ap.applid, - card->msgid++, - card->contrnr, /* adr */ - si2cip(bchan->si1, bchan->si2), /* cipvalue */ - called, /* CalledPartyNumber */ - calling, /* CallingPartyNumber */ - NULL, /* CalledPartySubaddress 
*/ - NULL, /* CallingPartySubaddress */ - b1prot(bchan->l2, bchan->l3), /* B1protocol */ - b2prot(bchan->l2, bchan->l3), /* B2protocol */ - b3prot(bchan->l2, bchan->l3), /* B3protocol */ - b1config(bchan->l2, bchan->l3), /* B1configuration */ - NULL, /* B2configuration */ - NULL, /* B3configuration */ - NULL, /* BC */ - NULL, /* LLC */ - NULL, /* HLC */ - /* BChannelinformation */ - isleasedline ? AdditionalInfo : NULL, - NULL, /* Keypadfacility */ - NULL, /* Useruserdata */ - NULL /* Facilitydataarray */ - ); - if ((plcip = new_plci(card, (c->arg % card->nbchan))) == NULL) { - cmd.command = ISDN_STAT_DHUP; - cmd.driver = card->myid; - cmd.arg = (c->arg % card->nbchan); - card->interface.statcallb(&cmd); - return -1; - } - plcip->msgid = cmdcmsg.Messagenumber; - plcip->leasedline = isleasedline; - plci_change_state(card, plcip, EV_PLCI_CONNECT_REQ); - send_message(card, &cmdcmsg); - return 0; - } - - case ISDN_CMD_ACCEPTD: - - bchan = &card->bchans[c->arg % card->nbchan]; - if (debugmode) - printk(KERN_DEBUG "capidrv-%d: ISDN_CMD_ACCEPTD(ch=%ld) l2=%d l3=%d\n", - card->contrnr, - c->arg, bchan->l2, bchan->l3); - - capi_fill_CONNECT_RESP(&cmdcmsg, - global.ap.applid, - card->msgid++, - bchan->plcip->plci, /* adr */ - 0, /* Reject */ - b1prot(bchan->l2, bchan->l3), /* B1protocol */ - b2prot(bchan->l2, bchan->l3), /* B2protocol */ - b3prot(bchan->l2, bchan->l3), /* B3protocol */ - b1config(bchan->l2, bchan->l3), /* B1configuration */ - NULL, /* B2configuration */ - NULL, /* B3configuration */ - NULL, /* ConnectedNumber */ - NULL, /* ConnectedSubaddress */ - NULL, /* LLC */ - NULL, /* BChannelinformation */ - NULL, /* Keypadfacility */ - NULL, /* Useruserdata */ - NULL /* Facilitydataarray */ - ); - if (capi_cmsg2message(&cmdcmsg, cmdcmsg.buf)) { - printk(KERN_ERR "capidrv-%d: capidrv_command: parser failure\n", - card->contrnr); - return -EINVAL; - } - plci_change_state(card, bchan->plcip, EV_PLCI_CONNECT_RESP); - send_message(card, &cmdcmsg); - return 0; - - case 
ISDN_CMD_ACCEPTB: - if (debugmode) - printk(KERN_DEBUG "capidrv-%d: ISDN_CMD_ACCEPTB(ch=%ld)\n", - card->contrnr, - c->arg); - return -ENOSYS; - - case ISDN_CMD_HANGUP: - if (debugmode) - printk(KERN_DEBUG "capidrv-%d: ISDN_CMD_HANGUP(ch=%ld)\n", - card->contrnr, - c->arg); - bchan = &card->bchans[c->arg % card->nbchan]; - - if (bchan->disconnecting) { - if (debugmode) - printk(KERN_DEBUG "capidrv-%d: chan %ld already disconnecting ...\n", - card->contrnr, - c->arg); - return 0; - } - if (bchan->nccip) { - bchan->disconnecting = 1; - capi_fill_DISCONNECT_B3_REQ(&cmdcmsg, - global.ap.applid, - card->msgid++, - bchan->nccip->ncci, - NULL /* NCPI */ - ); - ncci_change_state(card, bchan->nccip, EV_NCCI_DISCONNECT_B3_REQ); - send_message(card, &cmdcmsg); - return 0; - } else if (bchan->plcip) { - if (bchan->plcip->state == ST_PLCI_INCOMING) { - /* - * just ignore, we a called from - * isdn_status_callback(), - * which will return 0 or 2, this is handled - * by the CONNECT_IND handler - */ - bchan->disconnecting = 1; - return 0; - } else if (bchan->plcip->plci) { - bchan->disconnecting = 1; - capi_fill_DISCONNECT_REQ(&cmdcmsg, - global.ap.applid, - card->msgid++, - bchan->plcip->plci, - NULL, /* BChannelinformation */ - NULL, /* Keypadfacility */ - NULL, /* Useruserdata */ - NULL /* Facilitydataarray */ - ); - plci_change_state(card, bchan->plcip, EV_PLCI_DISCONNECT_REQ); - send_message(card, &cmdcmsg); - return 0; - } else { - printk(KERN_ERR "capidrv-%d: chan %ld disconnect request while waiting for CONNECT_CONF\n", - card->contrnr, - c->arg); - return -EINVAL; - } - } - printk(KERN_ERR "capidrv-%d: chan %ld disconnect request on free channel\n", - card->contrnr, - c->arg); - return -EINVAL; -/* ready */ - - case ISDN_CMD_SETL2: - if (debugmode) - printk(KERN_DEBUG "capidrv-%d: set L2 on chan %ld to %ld\n", - card->contrnr, - (c->arg & 0xff), (c->arg >> 8)); - bchan = &card->bchans[(c->arg & 0xff) % card->nbchan]; - bchan->l2 = (c->arg >> 8); - return 0; - - case 
ISDN_CMD_SETL3: - if (debugmode) - printk(KERN_DEBUG "capidrv-%d: set L3 on chan %ld to %ld\n", - card->contrnr, - (c->arg & 0xff), (c->arg >> 8)); - bchan = &card->bchans[(c->arg & 0xff) % card->nbchan]; - bchan->l3 = (c->arg >> 8); - return 0; - - case ISDN_CMD_SETEAZ: - if (debugmode) - printk(KERN_DEBUG "capidrv-%d: set EAZ \"%s\" on chan %ld\n", - card->contrnr, - c->parm.num, c->arg); - bchan = &card->bchans[c->arg % card->nbchan]; - strncpy(bchan->msn, c->parm.num, ISDN_MSNLEN); - return 0; - - case ISDN_CMD_CLREAZ: - if (debugmode) - printk(KERN_DEBUG "capidrv-%d: clearing EAZ on chan %ld\n", - card->contrnr, c->arg); - bchan = &card->bchans[c->arg % card->nbchan]; - bchan->msn[0] = 0; - return 0; - - default: - printk(KERN_ERR "capidrv-%d: ISDN_CMD_%d, Huh?\n", - card->contrnr, c->command); - return -EINVAL; - } - return 0; -} - -static int if_command(isdn_ctrl *c) -{ - capidrv_contr *card = findcontrbydriverid(c->driver); - - if (card) - return capidrv_command(c, card); - - printk(KERN_ERR - "capidrv: if_command %d called with invalid driverId %d!\n", - c->command, c->driver); - return -ENODEV; -} - -static _cmsg sendcmsg; - -static int if_sendbuf(int id, int channel, int doack, struct sk_buff *skb) -{ - capidrv_contr *card = findcontrbydriverid(id); - capidrv_bchan *bchan; - capidrv_ncci *nccip; - int len = skb->len; - int msglen; - u16 errcode; - u16 datahandle; - u32 data; - - if (!card) { - printk(KERN_ERR "capidrv: if_sendbuf called with invalid driverId %d!\n", - id); - return 0; - } - if (debugmode > 4) - printk(KERN_DEBUG "capidrv-%d: sendbuf len=%d skb=%p doack=%d\n", - card->contrnr, len, skb, doack); - bchan = &card->bchans[channel % card->nbchan]; - nccip = bchan->nccip; - if (!nccip || nccip->state != ST_NCCI_ACTIVE) { - printk(KERN_ERR "capidrv-%d: if_sendbuf: %s:%d: chan not up!\n", - card->contrnr, card->name, channel); - return 0; - } - datahandle = nccip->datahandle; - - /* - * Here we copy pointer skb->data into the 32-bit 'Data' field. 
- * The 'Data' field is not used in practice in linux kernel - * (neither in 32 or 64 bit), but should have some value, - * since a CAPI message trace will display it. - * - * The correct value in the 32 bit case is the address of the - * data, in 64 bit it makes no sense, we use 0 there. - */ - -#ifdef CONFIG_64BIT - data = 0; -#else - data = (unsigned long) skb->data; -#endif - - capi_fill_DATA_B3_REQ(&sendcmsg, global.ap.applid, card->msgid++, - nccip->ncci, /* adr */ - data, /* Data */ - skb->len, /* DataLength */ - datahandle, /* DataHandle */ - 0 /* Flags */ - ); - - if (capidrv_add_ack(nccip, datahandle, doack ? (int)skb->len : -1) < 0) - return 0; - - if (capi_cmsg2message(&sendcmsg, sendcmsg.buf)) { - printk(KERN_ERR "capidrv-%d: if_sendbuf: parser failure\n", - card->contrnr); - return -EINVAL; - } - msglen = CAPIMSG_LEN(sendcmsg.buf); - if (skb_headroom(skb) < msglen) { - struct sk_buff *nskb = skb_realloc_headroom(skb, msglen); - if (!nskb) { - printk(KERN_ERR "capidrv-%d: if_sendbuf: no memory\n", - card->contrnr); - (void)capidrv_del_ack(nccip, datahandle); - return 0; - } - printk(KERN_DEBUG "capidrv-%d: only %d bytes headroom, need %d\n", - card->contrnr, skb_headroom(skb), msglen); - memcpy(skb_push(nskb, msglen), sendcmsg.buf, msglen); - errcode = capi20_put_message(&global.ap, nskb); - if (errcode == CAPI_NOERROR) { - dev_kfree_skb(skb); - nccip->datahandle++; - return len; - } - if (debugmode > 3) - printk(KERN_DEBUG "capidrv-%d: sendbuf putmsg ret(%x) - %s\n", - card->contrnr, errcode, capi_info2str(errcode)); - (void)capidrv_del_ack(nccip, datahandle); - dev_kfree_skb(nskb); - return errcode == CAPI_SENDQUEUEFULL ? 
0 : -1; - } else { - memcpy(skb_push(skb, msglen), sendcmsg.buf, msglen); - errcode = capi20_put_message(&global.ap, skb); - if (errcode == CAPI_NOERROR) { - nccip->datahandle++; - return len; - } - if (debugmode > 3) - printk(KERN_DEBUG "capidrv-%d: sendbuf putmsg ret(%x) - %s\n", - card->contrnr, errcode, capi_info2str(errcode)); - skb_pull(skb, msglen); - (void)capidrv_del_ack(nccip, datahandle); - return errcode == CAPI_SENDQUEUEFULL ? 0 : -1; - } -} - -static int if_readstat(u8 __user *buf, int len, int id, int channel) -{ - capidrv_contr *card = findcontrbydriverid(id); - int count; - u8 __user *p; - - if (!card) { - printk(KERN_ERR "capidrv: if_readstat called with invalid driverId %d!\n", - id); - return -ENODEV; - } - - for (p = buf, count = 0; count < len; p++, count++) { - if (put_user(*card->q931_read++, p)) - return -EFAULT; - if (card->q931_read > card->q931_end) - card->q931_read = card->q931_buf; - } - return count; - -} - -static void enable_dchannel_trace(capidrv_contr *card) -{ - u8 manufacturer[CAPI_MANUFACTURER_LEN]; - capi_version version; - u16 contr = card->contrnr; - u16 errcode; - u16 avmversion[3]; - - errcode = capi20_get_manufacturer(contr, manufacturer); - if (errcode != CAPI_NOERROR) { - printk(KERN_ERR "%s: can't get manufacturer (0x%x)\n", - card->name, errcode); - return; - } - if (strstr(manufacturer, "AVM") == NULL) { - printk(KERN_ERR "%s: not from AVM, no d-channel trace possible (%s)\n", - card->name, manufacturer); - return; - } - errcode = capi20_get_version(contr, &version); - if (errcode != CAPI_NOERROR) { - printk(KERN_ERR "%s: can't get version (0x%x)\n", - card->name, errcode); - return; - } - avmversion[0] = (version.majormanuversion >> 4) & 0x0f; - avmversion[1] = (version.majormanuversion << 4) & 0xf0; - avmversion[1] |= (version.minormanuversion >> 4) & 0x0f; - avmversion[2] |= version.minormanuversion & 0x0f; - - if (avmversion[0] > 3 || (avmversion[0] == 3 && avmversion[1] > 5)) { - printk(KERN_INFO "%s: D2 trace 
enabled\n", card->name); - capi_fill_MANUFACTURER_REQ(&cmdcmsg, global.ap.applid, - card->msgid++, - contr, - 0x214D5641, /* ManuID */ - 0, /* Class */ - 1, /* Function */ - (_cstruct)"\004\200\014\000\000"); - } else { - printk(KERN_INFO "%s: D3 trace enabled\n", card->name); - capi_fill_MANUFACTURER_REQ(&cmdcmsg, global.ap.applid, - card->msgid++, - contr, - 0x214D5641, /* ManuID */ - 0, /* Class */ - 1, /* Function */ - (_cstruct)"\004\002\003\000\000"); - } - send_message(card, &cmdcmsg); -} - - -static void send_listen(capidrv_contr *card) -{ - capi_fill_LISTEN_REQ(&cmdcmsg, global.ap.applid, - card->msgid++, - card->contrnr, /* controller */ - 1 << 6, /* Infomask */ - card->cipmask, - card->cipmask2, - NULL, NULL); - listen_change_state(card, EV_LISTEN_REQ); - send_message(card, &cmdcmsg); -} - -static void listentimerfunc(struct timer_list *t) -{ - capidrv_contr *card = from_timer(card, t, listentimer); - if (card->state != ST_LISTEN_NONE && card->state != ST_LISTEN_ACTIVE) - printk(KERN_ERR "%s: controller dead ??\n", card->name); - send_listen(card); - mod_timer(&card->listentimer, jiffies + 60 * HZ); -} - - -static int capidrv_addcontr(u16 contr, struct capi_profile *profp) -{ - capidrv_contr *card; - unsigned long flags; - isdn_ctrl cmd; - char id[20]; - int i; - - sprintf(id, "capidrv-%d", contr); - if (!try_module_get(THIS_MODULE)) { - printk(KERN_WARNING "capidrv: (%s) Could not reserve module\n", id); - return -1; - } - if (!(card = kzalloc(sizeof(capidrv_contr), GFP_ATOMIC))) { - printk(KERN_WARNING - "capidrv: (%s) Could not allocate contr-struct.\n", id); - return -1; - } - card->owner = THIS_MODULE; - timer_setup(&card->listentimer, listentimerfunc, 0); - strcpy(card->name, id); - card->contrnr = contr; - card->nbchan = profp->nbchannel; - card->bchans = kmalloc_array(card->nbchan, sizeof(capidrv_bchan), - GFP_ATOMIC); - if (!card->bchans) { - printk(KERN_WARNING - "capidrv: (%s) Could not allocate bchan-structs.\n", id); - 
module_put(card->owner); - kfree(card); - return -1; - } - card->interface.channels = profp->nbchannel; - card->interface.maxbufsize = 2048; - card->interface.command = if_command; - card->interface.writebuf_skb = if_sendbuf; - card->interface.writecmd = NULL; - card->interface.readstat = if_readstat; - card->interface.features = - ISDN_FEATURE_L2_HDLC | - ISDN_FEATURE_L2_TRANS | - ISDN_FEATURE_L3_TRANS | - ISDN_FEATURE_P_UNKNOWN | - ISDN_FEATURE_L2_X75I | - ISDN_FEATURE_L2_X75UI | - ISDN_FEATURE_L2_X75BUI; - if (profp->support1 & (1 << 2)) - card->interface.features |= - ISDN_FEATURE_L2_V11096 | - ISDN_FEATURE_L2_V11019 | - ISDN_FEATURE_L2_V11038; - if (profp->support1 & (1 << 8)) - card->interface.features |= ISDN_FEATURE_L2_MODEM; - card->interface.hl_hdrlen = 22; /* len of DATA_B3_REQ */ - strncpy(card->interface.id, id, sizeof(card->interface.id) - 1); - - - card->q931_read = card->q931_buf; - card->q931_write = card->q931_buf; - card->q931_end = card->q931_buf + sizeof(card->q931_buf) - 1; - - if (!register_isdn(&card->interface)) { - printk(KERN_ERR "capidrv: Unable to register contr %s\n", id); - kfree(card->bchans); - module_put(card->owner); - kfree(card); - return -1; - } - card->myid = card->interface.channels; - memset(card->bchans, 0, sizeof(capidrv_bchan) * card->nbchan); - for (i = 0; i < card->nbchan; i++) { - card->bchans[i].contr = card; - } - - spin_lock_irqsave(&global_lock, flags); - card->next = global.contr_list; - global.contr_list = card; - global.ncontr++; - spin_unlock_irqrestore(&global_lock, flags); - - cmd.command = ISDN_STAT_RUN; - cmd.driver = card->myid; - card->interface.statcallb(&cmd); - - card->cipmask = 0x1FFF03FF; /* any */ - card->cipmask2 = 0; - - send_listen(card); - mod_timer(&card->listentimer, jiffies + 60 * HZ); - - printk(KERN_INFO "%s: now up (%d B channels)\n", - card->name, card->nbchan); - - enable_dchannel_trace(card); - - return 0; -} - -static int capidrv_delcontr(u16 contr) -{ - capidrv_contr **pp, *card; - 
unsigned long flags; - isdn_ctrl cmd; - - spin_lock_irqsave(&global_lock, flags); - for (card = global.contr_list; card; card = card->next) { - if (card->contrnr == contr) - break; - } - if (!card) { - spin_unlock_irqrestore(&global_lock, flags); - printk(KERN_ERR "capidrv: delcontr: no contr %u\n", contr); - return -1; - } - - /* FIXME: maybe a race condition the card should be removed - * here from global list /kkeil - */ - spin_unlock_irqrestore(&global_lock, flags); - - del_timer(&card->listentimer); - - if (debugmode) - printk(KERN_DEBUG "capidrv-%d: id=%d unloading\n", - card->contrnr, card->myid); - - cmd.command = ISDN_STAT_STOP; - cmd.driver = card->myid; - card->interface.statcallb(&cmd); - - while (card->nbchan) { - - cmd.command = ISDN_STAT_DISCH; - cmd.driver = card->myid; - cmd.arg = card->nbchan - 1; - cmd.parm.num[0] = 0; - if (debugmode) - printk(KERN_DEBUG "capidrv-%d: id=%d disable chan=%ld\n", - card->contrnr, card->myid, cmd.arg); - card->interface.statcallb(&cmd); - - if (card->bchans[card->nbchan - 1].nccip) - free_ncci(card, card->bchans[card->nbchan - 1].nccip); - if (card->bchans[card->nbchan - 1].plcip) - free_plci(card, card->bchans[card->nbchan - 1].plcip); - if (card->plci_list) - printk(KERN_ERR "capidrv: bug in free_plci()\n"); - card->nbchan--; - } - kfree(card->bchans); - card->bchans = NULL; - - if (debugmode) - printk(KERN_DEBUG "capidrv-%d: id=%d isdn unload\n", - card->contrnr, card->myid); - - cmd.command = ISDN_STAT_UNLOAD; - cmd.driver = card->myid; - card->interface.statcallb(&cmd); - - if (debugmode) - printk(KERN_DEBUG "capidrv-%d: id=%d remove contr from list\n", - card->contrnr, card->myid); - - spin_lock_irqsave(&global_lock, flags); - for (pp = &global.contr_list; *pp; pp = &(*pp)->next) { - if (*pp == card) { - *pp = (*pp)->next; - card->next = NULL; - global.ncontr--; - break; - } - } - spin_unlock_irqrestore(&global_lock, flags); - - module_put(card->owner); - printk(KERN_INFO "%s: now down.\n", card->name); - 
kfree(card); - return 0; -} - - -static int -lower_callback(struct notifier_block *nb, unsigned long val, void *v) -{ - capi_profile profile; - u32 contr = (long)v; - - switch (val) { - case CAPICTR_UP: - printk(KERN_INFO "capidrv: controller %hu up\n", contr); - if (capi20_get_profile(contr, &profile) == CAPI_NOERROR) - (void) capidrv_addcontr(contr, &profile); - break; - case CAPICTR_DOWN: - printk(KERN_INFO "capidrv: controller %hu down\n", contr); - (void) capidrv_delcontr(contr); - break; - } - return NOTIFY_OK; -} - -/* - * /proc/capi/capidrv: - * nrecvctlpkt nrecvdatapkt nsendctlpkt nsenddatapkt - */ -static int __maybe_unused capidrv_proc_show(struct seq_file *m, void *v) -{ - seq_printf(m, "%lu %lu %lu %lu\n", - global.ap.nrecvctlpkt, - global.ap.nrecvdatapkt, - global.ap.nsentctlpkt, - global.ap.nsentdatapkt); - return 0; -} - -static void __init proc_init(void) -{ - proc_create_single("capi/capidrv", 0, NULL, capidrv_proc_show); -} - -static void __exit proc_exit(void) -{ - remove_proc_entry("capi/capidrv", NULL); -} - -static struct notifier_block capictr_nb = { - .notifier_call = lower_callback, -}; - -static int __init capidrv_init(void) -{ - capi_profile profile; - u32 ncontr, contr; - u16 errcode; - - global.ap.rparam.level3cnt = -2; /* number of bchannels twice */ - global.ap.rparam.datablkcnt = 16; - global.ap.rparam.datablklen = 2048; - - global.ap.recv_message = capidrv_recv_message; - errcode = capi20_register(&global.ap); - if (errcode) { - return -EIO; - } - - register_capictr_notifier(&capictr_nb); - - errcode = capi20_get_profile(0, &profile); - if (errcode != CAPI_NOERROR) { - unregister_capictr_notifier(&capictr_nb); - capi20_release(&global.ap); - return -EIO; - } - - ncontr = profile.ncontroller; - for (contr = 1; contr <= ncontr; contr++) { - errcode = capi20_get_profile(contr, &profile); - if (errcode != CAPI_NOERROR) - continue; - (void) capidrv_addcontr(contr, &profile); - } - proc_init(); - - return 0; -} - -static void __exit 
capidrv_exit(void) -{ - unregister_capictr_notifier(&capictr_nb); - capi20_release(&global.ap); - - proc_exit(); -} - -module_init(capidrv_init); -module_exit(capidrv_exit); diff --git a/drivers/isdn/capi/capidrv.h b/drivers/isdn/capi/capidrv.h deleted file mode 100644 index 4466b2e0176d..000000000000 --- a/drivers/isdn/capi/capidrv.h +++ /dev/null @@ -1,140 +0,0 @@ -/* $Id: capidrv.h,v 1.2.8.2 2001/09/23 22:24:33 kai Exp $ - * - * ISDN4Linux Driver, using capi20 interface (kernelcapi) - * - * Copyright 1997 by Carsten Paeth - * - * This software may be used and distributed according to the terms - * of the GNU General Public License, incorporated herein by reference. - * - */ - -#ifndef __CAPIDRV_H__ -#define __CAPIDRV_H__ - -/* - * LISTEN state machine - */ -#define ST_LISTEN_NONE 0 /* L-0 */ -#define ST_LISTEN_WAIT_CONF 1 /* L-0.1 */ -#define ST_LISTEN_ACTIVE 2 /* L-1 */ -#define ST_LISTEN_ACTIVE_WAIT_CONF 3 /* L-1.1 */ - - -#define EV_LISTEN_REQ 1 /* L-0 -> L-0.1 - L-1 -> L-1.1 */ -#define EV_LISTEN_CONF_ERROR 2 /* L-0.1 -> L-0 - L-1.1 -> L-1 */ -#define EV_LISTEN_CONF_EMPTY 3 /* L-0.1 -> L-0 - L-1.1 -> L-0 */ -#define EV_LISTEN_CONF_OK 4 /* L-0.1 -> L-1 - L-1.1 -> L.1 */ - -/* - * per plci state machine - */ -#define ST_PLCI_NONE 0 /* P-0 */ -#define ST_PLCI_OUTGOING 1 /* P-0.1 */ -#define ST_PLCI_ALLOCATED 2 /* P-1 */ -#define ST_PLCI_ACTIVE 3 /* P-ACT */ -#define ST_PLCI_INCOMING 4 /* P-2 */ -#define ST_PLCI_FACILITY_IND 5 /* P-3 */ -#define ST_PLCI_ACCEPTING 6 /* P-4 */ -#define ST_PLCI_DISCONNECTING 7 /* P-5 */ -#define ST_PLCI_DISCONNECTED 8 /* P-6 */ -#define ST_PLCI_RESUMEING 9 /* P-0.Res */ -#define ST_PLCI_RESUME 10 /* P-Res */ -#define ST_PLCI_HELD 11 /* P-HELD */ - -#define EV_PLCI_CONNECT_REQ 1 /* P-0 -> P-0.1 - */ -#define EV_PLCI_CONNECT_CONF_ERROR 2 /* P-0.1 -> P-0 - */ -#define EV_PLCI_CONNECT_CONF_OK 3 /* P-0.1 -> P-1 - */ -#define EV_PLCI_FACILITY_IND_UP 4 /* P-0 -> P-1 - */ -#define EV_PLCI_CONNECT_IND 5 /* P-0 -> P-2 - */ -#define 
EV_PLCI_CONNECT_ACTIVE_IND 6 /* P-1 -> P-ACT - */ -#define EV_PLCI_CONNECT_REJECT 7 /* P-2 -> P-5 - P-3 -> P-5 - */ -#define EV_PLCI_DISCONNECT_REQ 8 /* P-1 -> P-5 - P-2 -> P-5 - P-3 -> P-5 - P-4 -> P-5 - P-ACT -> P-5 - P-Res -> P-5 (*) - P-HELD -> P-5 (*) - */ -#define EV_PLCI_DISCONNECT_IND 9 /* P-1 -> P-6 - P-2 -> P-6 - P-3 -> P-6 - P-4 -> P-6 - P-5 -> P-6 - P-ACT -> P-6 - P-Res -> P-6 (*) - P-HELD -> P-6 (*) - */ -#define EV_PLCI_FACILITY_IND_DOWN 10 /* P-0.1 -> P-5 - P-1 -> P-5 - P-ACT -> P-5 - P-2 -> P-5 - P-3 -> P-5 - P-4 -> P-5 - */ -#define EV_PLCI_DISCONNECT_RESP 11 /* P-6 -> P-0 - */ -#define EV_PLCI_CONNECT_RESP 12 /* P-6 -> P-0 - */ - -#define EV_PLCI_RESUME_REQ 13 /* P-0 -> P-0.Res - */ -#define EV_PLCI_RESUME_CONF_OK 14 /* P-0.Res -> P-Res - */ -#define EV_PLCI_RESUME_CONF_ERROR 15 /* P-0.Res -> P-0 - */ -#define EV_PLCI_RESUME_IND 16 /* P-Res -> P-ACT - */ -#define EV_PLCI_HOLD_IND 17 /* P-ACT -> P-HELD - */ -#define EV_PLCI_RETRIEVE_IND 18 /* P-HELD -> P-ACT - */ -#define EV_PLCI_SUSPEND_IND 19 /* P-ACT -> P-5 - */ -#define EV_PLCI_CD_IND 20 /* P-2 -> P-5 - */ - -/* - * per ncci state machine - */ -#define ST_NCCI_PREVIOUS -1 -#define ST_NCCI_NONE 0 /* N-0 */ -#define ST_NCCI_OUTGOING 1 /* N-0.1 */ -#define ST_NCCI_INCOMING 2 /* N-1 */ -#define ST_NCCI_ALLOCATED 3 /* N-2 */ -#define ST_NCCI_ACTIVE 4 /* N-ACT */ -#define ST_NCCI_RESETING 5 /* N-3 */ -#define ST_NCCI_DISCONNECTING 6 /* N-4 */ -#define ST_NCCI_DISCONNECTED 7 /* N-5 */ - -#define EV_NCCI_CONNECT_B3_REQ 1 /* N-0 -> N-0.1 */ -#define EV_NCCI_CONNECT_B3_IND 2 /* N-0 -> N.1 */ -#define EV_NCCI_CONNECT_B3_CONF_OK 3 /* N-0.1 -> N.2 */ -#define EV_NCCI_CONNECT_B3_CONF_ERROR 4 /* N-0.1 -> N.0 */ -#define EV_NCCI_CONNECT_B3_REJECT 5 /* N-1 -> N-4 */ -#define EV_NCCI_CONNECT_B3_RESP 6 /* N-1 -> N-2 */ -#define EV_NCCI_CONNECT_B3_ACTIVE_IND 7 /* N-2 -> N-ACT */ -#define EV_NCCI_RESET_B3_REQ 8 /* N-ACT -> N-3 */ -#define EV_NCCI_RESET_B3_IND 9 /* N-3 -> N-ACT */ -#define EV_NCCI_DISCONNECT_B3_IND 
10 /* N-4 -> N.5 */ -#define EV_NCCI_DISCONNECT_B3_CONF_ERROR 11 /* N-4 -> previous */ -#define EV_NCCI_DISCONNECT_B3_REQ 12 /* N-1 -> N-4 - N-2 -> N-4 - N-3 -> N-4 - N-ACT -> N-4 */ -#define EV_NCCI_DISCONNECT_B3_RESP 13 /* N-5 -> N-0 */ - -#endif /* __CAPIDRV_H__ */ diff --git a/drivers/isdn/divert/Makefile b/drivers/isdn/divert/Makefile deleted file mode 100644 index 07684fe53537..000000000000 --- a/drivers/isdn/divert/Makefile +++ /dev/null @@ -1,10 +0,0 @@ -# SPDX-License-Identifier: GPL-2.0-only -# Makefile for the dss1_divert ISDN module - -# Each configuration option enables a list of files. - -obj-$(CONFIG_ISDN_DIVERSION) += dss1_divert.o - -# Multipart objects. - -dss1_divert-y := isdn_divert.o divert_procfs.o divert_init.o diff --git a/drivers/isdn/divert/divert_init.c b/drivers/isdn/divert/divert_init.c deleted file mode 100644 index 267dede13bfd..000000000000 --- a/drivers/isdn/divert/divert_init.c +++ /dev/null @@ -1,82 +0,0 @@ -/* $Id divert_init.c,v 1.5.6.2 2001/01/24 22:18:17 kai Exp $ - * - * Module init for DSS1 diversion services for i4l. - * - * Copyright 1999 by Werner Cornelius (werner@isdn4linux.de) - * - * This software may be used and distributed according to the terms - * of the GNU General Public License, incorporated herein by reference. 
- * - */ - -#include -#include -#include - -#include "isdn_divert.h" - -MODULE_DESCRIPTION("ISDN4Linux: Call diversion support"); -MODULE_AUTHOR("Werner Cornelius"); -MODULE_LICENSE("GPL"); - -/****************************************/ -/* structure containing interface to hl */ -/****************************************/ -isdn_divert_if divert_if = { - DIVERT_IF_MAGIC, /* magic value */ - DIVERT_CMD_REG, /* register cmd */ - ll_callback, /* callback routine from ll */ - NULL, /* command still not specified */ - NULL, /* drv_to_name */ - NULL, /* name_to_drv */ -}; - -/*************************/ -/* Module interface code */ -/* no cmd line parms */ -/*************************/ -static int __init divert_init(void) -{ - int i; - - if (divert_dev_init()) { - printk(KERN_WARNING "dss1_divert: cannot install device, not loaded\n"); - return (-EIO); - } - if ((i = DIVERT_REG_NAME(&divert_if)) != DIVERT_NO_ERR) { - divert_dev_deinit(); - printk(KERN_WARNING "dss1_divert: error %d registering module, not loaded\n", i); - return (-EIO); - } - printk(KERN_INFO "dss1_divert module successfully installed\n"); - return (0); -} - -/**********************/ -/* Module deinit code */ -/**********************/ -static void __exit divert_exit(void) -{ - unsigned long flags; - int i; - - spin_lock_irqsave(&divert_lock, flags); - divert_if.cmd = DIVERT_CMD_REL; /* release */ - if ((i = DIVERT_REG_NAME(&divert_if)) != DIVERT_NO_ERR) { - printk(KERN_WARNING "dss1_divert: error %d releasing module\n", i); - spin_unlock_irqrestore(&divert_lock, flags); - return; - } - if (divert_dev_deinit()) { - printk(KERN_WARNING "dss1_divert: device busy, remove cancelled\n"); - spin_unlock_irqrestore(&divert_lock, flags); - return; - } - spin_unlock_irqrestore(&divert_lock, flags); - deleterule(-1); /* delete all rules and free mem */ - deleteprocs(); - printk(KERN_INFO "dss1_divert module successfully removed \n"); -} - -module_init(divert_init); -module_exit(divert_exit); diff --git 
a/drivers/isdn/divert/divert_procfs.c b/drivers/isdn/divert/divert_procfs.c deleted file mode 100644 index 342585e04fd3..000000000000 --- a/drivers/isdn/divert/divert_procfs.c +++ /dev/null @@ -1,336 +0,0 @@ -/* $Id: divert_procfs.c,v 1.11.6.2 2001/09/23 22:24:36 kai Exp $ - * - * Filesystem handling for the diversion supplementary services. - * - * Copyright 1998 by Werner Cornelius (werner@isdn4linux.de) - * - * This software may be used and distributed according to the terms - * of the GNU General Public License, incorporated herein by reference. - * - */ - -#include -#include -#include -#ifdef CONFIG_PROC_FS -#include -#else -#include -#endif -#include -#include -#include -#include -#include "isdn_divert.h" - - -/*********************************/ -/* Variables for interface queue */ -/*********************************/ -ulong if_used = 0; /* number of interface users */ -static DEFINE_MUTEX(isdn_divert_mutex); -static struct divert_info *divert_info_head = NULL; /* head of queue */ -static struct divert_info *divert_info_tail = NULL; /* pointer to last entry */ -static DEFINE_SPINLOCK(divert_info_lock);/* lock for queue */ -static wait_queue_head_t rd_queue; - -/*********************************/ -/* put an info buffer into queue */ -/*********************************/ -void -put_info_buffer(char *cp) -{ - struct divert_info *ib; - unsigned long flags; - - if (if_used <= 0) - return; - if (!cp) - return; - if (!*cp) - return; - if (!(ib = kmalloc(sizeof(struct divert_info) + strlen(cp), GFP_ATOMIC))) - return; /* no memory */ - strcpy(ib->info_start, cp); /* set output string */ - ib->next = NULL; - spin_lock_irqsave(&divert_info_lock, flags); - ib->usage_cnt = if_used; - if (!divert_info_head) - divert_info_head = ib; /* new head */ - else - divert_info_tail->next = ib; /* follows existing messages */ - divert_info_tail = ib; /* new tail */ - - /* delete old entrys */ - while (divert_info_head->next) { - if ((divert_info_head->usage_cnt <= 0) && - 
(divert_info_head->next->usage_cnt <= 0)) { - ib = divert_info_head; - divert_info_head = divert_info_head->next; - kfree(ib); - } else - break; - } /* divert_info_head->next */ - spin_unlock_irqrestore(&divert_info_lock, flags); - wake_up_interruptible(&(rd_queue)); -} /* put_info_buffer */ - -#ifdef CONFIG_PROC_FS - -/**********************************/ -/* deflection device read routine */ -/**********************************/ -static ssize_t -isdn_divert_read(struct file *file, char __user *buf, size_t count, loff_t *off) -{ - struct divert_info *inf; - int len; - - if (!(inf = *((struct divert_info **) file->private_data))) { - if (file->f_flags & O_NONBLOCK) - return -EAGAIN; - wait_event_interruptible(rd_queue, (inf = - *((struct divert_info **) file->private_data))); - } - if (!inf) - return (0); - - inf->usage_cnt--; /* new usage count */ - file->private_data = &inf->next; /* next structure */ - if ((len = strlen(inf->info_start)) <= count) { - if (copy_to_user(buf, inf->info_start, len)) - return -EFAULT; - *off += len; - return (len); - } - return (0); -} /* isdn_divert_read */ - -/**********************************/ -/* deflection device write routine */ -/**********************************/ -static ssize_t -isdn_divert_write(struct file *file, const char __user *buf, size_t count, loff_t *off) -{ - return (-ENODEV); -} /* isdn_divert_write */ - - -/***************************************/ -/* select routines for various kernels */ -/***************************************/ -static __poll_t -isdn_divert_poll(struct file *file, poll_table *wait) -{ - __poll_t mask = 0; - - poll_wait(file, &(rd_queue), wait); - /* mask = EPOLLOUT | EPOLLWRNORM; */ - if (*((struct divert_info **) file->private_data)) { - mask |= EPOLLIN | EPOLLRDNORM; - } - return mask; -} /* isdn_divert_poll */ - -/****************/ -/* Open routine */ -/****************/ -static int -isdn_divert_open(struct inode *ino, struct file *filep) -{ - unsigned long flags; - - 
spin_lock_irqsave(&divert_info_lock, flags); - if_used++; - if (divert_info_head) - filep->private_data = &(divert_info_tail->next); - else - filep->private_data = &divert_info_head; - spin_unlock_irqrestore(&divert_info_lock, flags); - /* start_divert(); */ - return nonseekable_open(ino, filep); -} /* isdn_divert_open */ - -/*******************/ -/* close routine */ -/*******************/ -static int -isdn_divert_close(struct inode *ino, struct file *filep) -{ - struct divert_info *inf; - unsigned long flags; - - spin_lock_irqsave(&divert_info_lock, flags); - if_used--; - inf = *((struct divert_info **) filep->private_data); - while (inf) { - inf->usage_cnt--; - inf = inf->next; - } - if (if_used <= 0) - while (divert_info_head) { - inf = divert_info_head; - divert_info_head = divert_info_head->next; - kfree(inf); - } - spin_unlock_irqrestore(&divert_info_lock, flags); - return (0); -} /* isdn_divert_close */ - -/*********/ -/* IOCTL */ -/*********/ -static int isdn_divert_ioctl_unlocked(struct file *file, uint cmd, ulong arg) -{ - divert_ioctl dioctl; - int i; - unsigned long flags; - divert_rule *rulep; - char *cp; - - if (copy_from_user(&dioctl, (void __user *) arg, sizeof(dioctl))) - return -EFAULT; - - switch (cmd) { - case IIOCGETVER: - dioctl.drv_version = DIVERT_IIOC_VERSION; /* set version */ - break; - - case IIOCGETDRV: - if ((dioctl.getid.drvid = divert_if.name_to_drv(dioctl.getid.drvnam)) < 0) - return (-EINVAL); - break; - - case IIOCGETNAM: - cp = divert_if.drv_to_name(dioctl.getid.drvid); - if (!cp) - return (-EINVAL); - if (!*cp) - return (-EINVAL); - strcpy(dioctl.getid.drvnam, cp); - break; - - case IIOCGETRULE: - if (!(rulep = getruleptr(dioctl.getsetrule.ruleidx))) - return (-EINVAL); - dioctl.getsetrule.rule = *rulep; /* copy data */ - break; - - case IIOCMODRULE: - if (!(rulep = getruleptr(dioctl.getsetrule.ruleidx))) - return (-EINVAL); - spin_lock_irqsave(&divert_lock, flags); - *rulep = dioctl.getsetrule.rule; /* copy data */ - 
spin_unlock_irqrestore(&divert_lock, flags); - return (0); /* no copy required */ - break; - - case IIOCINSRULE: - return (insertrule(dioctl.getsetrule.ruleidx, &dioctl.getsetrule.rule)); - break; - - case IIOCDELRULE: - return (deleterule(dioctl.getsetrule.ruleidx)); - break; - - case IIOCDODFACT: - return (deflect_extern_action(dioctl.fwd_ctrl.subcmd, - dioctl.fwd_ctrl.callid, - dioctl.fwd_ctrl.to_nr)); - - case IIOCDOCFACT: - case IIOCDOCFDIS: - case IIOCDOCFINT: - if (!divert_if.drv_to_name(dioctl.cf_ctrl.drvid)) - return (-EINVAL); /* invalid driver */ - if (strnlen(dioctl.cf_ctrl.msn, sizeof(dioctl.cf_ctrl.msn)) == - sizeof(dioctl.cf_ctrl.msn)) - return -EINVAL; - if (strnlen(dioctl.cf_ctrl.fwd_nr, sizeof(dioctl.cf_ctrl.fwd_nr)) == - sizeof(dioctl.cf_ctrl.fwd_nr)) - return -EINVAL; - if ((i = cf_command(dioctl.cf_ctrl.drvid, - (cmd == IIOCDOCFACT) ? 1 : (cmd == IIOCDOCFDIS) ? 0 : 2, - dioctl.cf_ctrl.cfproc, - dioctl.cf_ctrl.msn, - dioctl.cf_ctrl.service, - dioctl.cf_ctrl.fwd_nr, - &dioctl.cf_ctrl.procid))) - return (i); - break; - - default: - return (-EINVAL); - } /* switch cmd */ - return copy_to_user((void __user *)arg, &dioctl, sizeof(dioctl)) ? 
-EFAULT : 0; -} /* isdn_divert_ioctl */ - -static long isdn_divert_ioctl(struct file *file, uint cmd, ulong arg) -{ - long ret; - - mutex_lock(&isdn_divert_mutex); - ret = isdn_divert_ioctl_unlocked(file, cmd, arg); - mutex_unlock(&isdn_divert_mutex); - - return ret; -} - -static const struct file_operations isdn_fops = -{ - .owner = THIS_MODULE, - .llseek = no_llseek, - .read = isdn_divert_read, - .write = isdn_divert_write, - .poll = isdn_divert_poll, - .unlocked_ioctl = isdn_divert_ioctl, - .open = isdn_divert_open, - .release = isdn_divert_close, -}; - -/****************************/ -/* isdn subdir in /proc/net */ -/****************************/ -static struct proc_dir_entry *isdn_proc_entry = NULL; -static struct proc_dir_entry *isdn_divert_entry = NULL; -#endif /* CONFIG_PROC_FS */ - -/***************************************************************************/ -/* divert_dev_init must be called before the proc filesystem may be used */ -/***************************************************************************/ -int -divert_dev_init(void) -{ - - init_waitqueue_head(&rd_queue); - -#ifdef CONFIG_PROC_FS - isdn_proc_entry = proc_mkdir("isdn", init_net.proc_net); - if (!isdn_proc_entry) - return (-1); - isdn_divert_entry = proc_create("divert", S_IFREG | S_IRUGO, - isdn_proc_entry, &isdn_fops); - if (!isdn_divert_entry) { - remove_proc_entry("isdn", init_net.proc_net); - return (-1); - } -#endif /* CONFIG_PROC_FS */ - - return (0); -} /* divert_dev_init */ - -/***************************************************************************/ -/* divert_dev_deinit must be called before leaving isdn when included as */ -/* a module. 
*/ -/***************************************************************************/ -int -divert_dev_deinit(void) -{ - -#ifdef CONFIG_PROC_FS - remove_proc_entry("divert", isdn_proc_entry); - remove_proc_entry("isdn", init_net.proc_net); -#endif /* CONFIG_PROC_FS */ - - return (0); -} /* divert_dev_deinit */ diff --git a/drivers/isdn/divert/isdn_divert.c b/drivers/isdn/divert/isdn_divert.c deleted file mode 100644 index 5620fd2c6009..000000000000 --- a/drivers/isdn/divert/isdn_divert.c +++ /dev/null @@ -1,846 +0,0 @@ -/* $Id: isdn_divert.c,v 1.6.6.3 2001/09/23 22:24:36 kai Exp $ - * - * DSS1 main diversion supplementary handling for i4l. - * - * Copyright 1999 by Werner Cornelius (werner@isdn4linux.de) - * - * This software may be used and distributed according to the terms - * of the GNU General Public License, incorporated herein by reference. - * - */ - -#include -#include -#include -#include - -#include "isdn_divert.h" - -/**********************************/ -/* structure keeping calling info */ -/**********************************/ -struct call_struc { - isdn_ctrl ics; /* delivered setup + driver parameters */ - ulong divert_id; /* Id delivered to user */ - unsigned char akt_state; /* actual state */ - char deflect_dest[35]; /* deflection destination */ - struct timer_list timer; /* timer control structure */ - char info[90]; /* device info output */ - struct call_struc *next; /* pointer to next entry */ - struct call_struc *prev; -}; - - -/********************************************/ -/* structure keeping deflection table entry */ -/********************************************/ -struct deflect_struc { - struct deflect_struc *next, *prev; - divert_rule rule; /* used rule */ -}; - - -/*****************************************/ -/* variables for main diversion services */ -/*****************************************/ -/* diversion/deflection processes */ -static struct call_struc *divert_head = NULL; /* head of remembered entrys */ -static ulong next_id = 1; /* 
next info id */ -static struct deflect_struc *table_head = NULL; -static struct deflect_struc *table_tail = NULL; -static unsigned char extern_wait_max = 4; /* maximum wait in s for external process */ - -DEFINE_SPINLOCK(divert_lock); - -/***************************/ -/* timer callback function */ -/***************************/ -static void deflect_timer_expire(struct timer_list *t) -{ - unsigned long flags; - struct call_struc *cs = from_timer(cs, t, timer); - - spin_lock_irqsave(&divert_lock, flags); - del_timer(&cs->timer); /* delete active timer */ - spin_unlock_irqrestore(&divert_lock, flags); - - switch (cs->akt_state) { - case DEFLECT_PROCEED: - cs->ics.command = ISDN_CMD_HANGUP; /* cancel action */ - divert_if.ll_cmd(&cs->ics); - spin_lock_irqsave(&divert_lock, flags); - cs->akt_state = DEFLECT_AUTODEL; /* delete after timeout */ - cs->timer.expires = jiffies + (HZ * AUTODEL_TIME); - add_timer(&cs->timer); - spin_unlock_irqrestore(&divert_lock, flags); - break; - - case DEFLECT_ALERT: - cs->ics.command = ISDN_CMD_REDIR; /* protocol */ - strlcpy(cs->ics.parm.setup.phone, cs->deflect_dest, sizeof(cs->ics.parm.setup.phone)); - strcpy(cs->ics.parm.setup.eazmsn, "Testtext delayed"); - divert_if.ll_cmd(&cs->ics); - spin_lock_irqsave(&divert_lock, flags); - cs->akt_state = DEFLECT_AUTODEL; /* delete after timeout */ - cs->timer.expires = jiffies + (HZ * AUTODEL_TIME); - add_timer(&cs->timer); - spin_unlock_irqrestore(&divert_lock, flags); - break; - - case DEFLECT_AUTODEL: - default: - spin_lock_irqsave(&divert_lock, flags); - if (cs->prev) - cs->prev->next = cs->next; /* forward link */ - else - divert_head = cs->next; - if (cs->next) - cs->next->prev = cs->prev; /* back link */ - spin_unlock_irqrestore(&divert_lock, flags); - kfree(cs); - return; - - } /* switch */ -} /* deflect_timer_func */ - - -/*****************************************/ -/* handle call forwarding de/activations */ -/* 0 = deact, 1 = act, 2 = interrogate */ 
-/*****************************************/ -int cf_command(int drvid, int mode, - u_char proc, char *msn, - u_char service, char *fwd_nr, ulong *procid) -{ - unsigned long flags; - int retval, msnlen; - int fwd_len; - char *p, *ielenp, tmp[60]; - struct call_struc *cs; - - if (strchr(msn, '.')) return (-EINVAL); /* subaddress not allowed in msn */ - if ((proc & 0x7F) > 2) return (-EINVAL); - proc &= 3; - p = tmp; - *p++ = 0x30; /* enumeration */ - ielenp = p++; /* remember total length position */ - *p++ = 0xa; /* proc tag */ - *p++ = 1; /* length */ - *p++ = proc & 0x7F; /* procedure to de/activate/interrogate */ - *p++ = 0xa; /* service tag */ - *p++ = 1; /* length */ - *p++ = service; /* service to handle */ - - if (mode == 1) { - if (!*fwd_nr) return (-EINVAL); /* destination missing */ - if (strchr(fwd_nr, '.')) return (-EINVAL); /* subaddress not allowed */ - fwd_len = strlen(fwd_nr); - *p++ = 0x30; /* number enumeration */ - *p++ = fwd_len + 2; /* complete forward to len */ - *p++ = 0x80; /* fwd to nr */ - *p++ = fwd_len; /* length of number */ - strcpy(p, fwd_nr); /* copy number */ - p += fwd_len; /* pointer beyond fwd */ - } /* activate */ - - msnlen = strlen(msn); - *p++ = 0x80; /* msn number */ - if (msnlen > 1) { - *p++ = msnlen; /* length */ - strcpy(p, msn); - p += msnlen; - } else - *p++ = 0; - - *ielenp = p - ielenp - 1; /* set total IE length */ - - /* allocate mem for information struct */ - if (!(cs = kmalloc(sizeof(struct call_struc), GFP_ATOMIC))) - return (-ENOMEM); /* no memory */ - timer_setup(&cs->timer, deflect_timer_expire, 0); - cs->info[0] = '\0'; - cs->ics.driver = drvid; - cs->ics.command = ISDN_CMD_PROT_IO; /* protocol specific io */ - cs->ics.arg = DSS1_CMD_INVOKE; /* invoke supplementary service */ - cs->ics.parm.dss1_io.proc = (mode == 1) ? 7 : (mode == 2) ? 
11 : 8; /* operation */ - cs->ics.parm.dss1_io.timeout = 4000; /* from ETS 300 207-1 */ - cs->ics.parm.dss1_io.datalen = p - tmp; /* total len */ - cs->ics.parm.dss1_io.data = tmp; /* start of buffer */ - - spin_lock_irqsave(&divert_lock, flags); - cs->ics.parm.dss1_io.ll_id = next_id++; /* id for callback */ - spin_unlock_irqrestore(&divert_lock, flags); - *procid = cs->ics.parm.dss1_io.ll_id; - - sprintf(cs->info, "%d 0x%lx %s%s 0 %s %02x %d%s%s\n", - (!mode) ? DIVERT_DEACTIVATE : (mode == 1) ? DIVERT_ACTIVATE : DIVERT_REPORT, - cs->ics.parm.dss1_io.ll_id, - (mode != 2) ? "" : "0 ", - divert_if.drv_to_name(cs->ics.driver), - msn, - service & 0xFF, - proc, - (mode != 1) ? "" : " 0 ", - (mode != 1) ? "" : fwd_nr); - - retval = divert_if.ll_cmd(&cs->ics); /* execute command */ - - if (!retval) { - cs->prev = NULL; - spin_lock_irqsave(&divert_lock, flags); - cs->next = divert_head; - divert_head = cs; - spin_unlock_irqrestore(&divert_lock, flags); - } else - kfree(cs); - return (retval); -} /* cf_command */ - - -/****************************************/ -/* handle a external deflection command */ -/****************************************/ -int deflect_extern_action(u_char cmd, ulong callid, char *to_nr) -{ - struct call_struc *cs; - isdn_ctrl ic; - unsigned long flags; - int i; - - if ((cmd & 0x7F) > 2) return (-EINVAL); /* invalid command */ - cs = divert_head; /* start of parameter list */ - while (cs) { - if (cs->divert_id == callid) break; /* found */ - cs = cs->next; - } /* search entry */ - if (!cs) return (-EINVAL); /* invalid callid */ - - ic.driver = cs->ics.driver; - ic.arg = cs->ics.arg; - i = -EINVAL; - if (cs->akt_state == DEFLECT_AUTODEL) return (i); /* no valid call */ - switch (cmd & 0x7F) { - case 0: /* hangup */ - del_timer(&cs->timer); - ic.command = ISDN_CMD_HANGUP; - i = divert_if.ll_cmd(&ic); - spin_lock_irqsave(&divert_lock, flags); - cs->akt_state = DEFLECT_AUTODEL; /* delete after timeout */ - cs->timer.expires = jiffies + (HZ * 
AUTODEL_TIME); - add_timer(&cs->timer); - spin_unlock_irqrestore(&divert_lock, flags); - break; - - case 1: /* alert */ - if (cs->akt_state == DEFLECT_ALERT) return (0); - cmd &= 0x7F; /* never wait */ - del_timer(&cs->timer); - ic.command = ISDN_CMD_ALERT; - if ((i = divert_if.ll_cmd(&ic))) { - spin_lock_irqsave(&divert_lock, flags); - cs->akt_state = DEFLECT_AUTODEL; /* delete after timeout */ - cs->timer.expires = jiffies + (HZ * AUTODEL_TIME); - add_timer(&cs->timer); - spin_unlock_irqrestore(&divert_lock, flags); - } else - cs->akt_state = DEFLECT_ALERT; - break; - - case 2: /* redir */ - del_timer(&cs->timer); - strlcpy(cs->ics.parm.setup.phone, to_nr, sizeof(cs->ics.parm.setup.phone)); - strcpy(cs->ics.parm.setup.eazmsn, "Testtext manual"); - ic.command = ISDN_CMD_REDIR; - if ((i = divert_if.ll_cmd(&ic))) { - spin_lock_irqsave(&divert_lock, flags); - cs->akt_state = DEFLECT_AUTODEL; /* delete after timeout */ - cs->timer.expires = jiffies + (HZ * AUTODEL_TIME); - add_timer(&cs->timer); - spin_unlock_irqrestore(&divert_lock, flags); - } else - cs->akt_state = DEFLECT_ALERT; - break; - - } /* switch */ - return (i); -} /* deflect_extern_action */ - -/********************************/ -/* insert a new rule before idx */ -/********************************/ -int insertrule(int idx, divert_rule *newrule) -{ - struct deflect_struc *ds, *ds1 = NULL; - unsigned long flags; - - if (!(ds = kmalloc(sizeof(struct deflect_struc), GFP_KERNEL))) - return (-ENOMEM); /* no memory */ - - ds->rule = *newrule; /* set rule */ - - spin_lock_irqsave(&divert_lock, flags); - - if (idx >= 0) { - ds1 = table_head; - while ((ds1) && (idx > 0)) - { idx--; - ds1 = ds1->next; - } - if (!ds1) idx = -1; - } - - if (idx < 0) { - ds->prev = table_tail; /* previous entry */ - ds->next = NULL; /* end of chain */ - if (ds->prev) - ds->prev->next = ds; /* last forward */ - else - table_head = ds; /* is first entry */ - table_tail = ds; /* end of queue */ - } else { - ds->next = ds1; /* next entry 
*/ - ds->prev = ds1->prev; /* prev entry */ - ds1->prev = ds; /* backward chain old element */ - if (!ds->prev) - table_head = ds; /* first element */ - } - - spin_unlock_irqrestore(&divert_lock, flags); - return (0); -} /* insertrule */ - -/***********************************/ -/* delete the rule at position idx */ -/***********************************/ -int deleterule(int idx) -{ - struct deflect_struc *ds, *ds1; - unsigned long flags; - - if (idx < 0) { - spin_lock_irqsave(&divert_lock, flags); - ds = table_head; - table_head = NULL; - table_tail = NULL; - spin_unlock_irqrestore(&divert_lock, flags); - while (ds) { - ds1 = ds; - ds = ds->next; - kfree(ds1); - } - return (0); - } - - spin_lock_irqsave(&divert_lock, flags); - ds = table_head; - - while ((ds) && (idx > 0)) { - idx--; - ds = ds->next; - } - - if (!ds) { - spin_unlock_irqrestore(&divert_lock, flags); - return (-EINVAL); - } - - if (ds->next) - ds->next->prev = ds->prev; /* backward chain */ - else - table_tail = ds->prev; /* end of chain */ - - if (ds->prev) - ds->prev->next = ds->next; /* forward chain */ - else - table_head = ds->next; /* start of chain */ - - spin_unlock_irqrestore(&divert_lock, flags); - kfree(ds); - return (0); -} /* deleterule */ - -/*******************************************/ -/* get a pointer to a specific rule number */ -/*******************************************/ -divert_rule *getruleptr(int idx) -{ - struct deflect_struc *ds = table_head; - - if (idx < 0) return (NULL); - while ((ds) && (idx >= 0)) { - if (!(idx--)) { - return (&ds->rule); - break; - } - ds = ds->next; - } - return (NULL); -} /* getruleptr */ - -/*************************************************/ -/* called from common module on an incoming call */ -/*************************************************/ -static int isdn_divert_icall(isdn_ctrl *ic) -{ - int retval = 0; - unsigned long flags; - struct call_struc *cs = NULL; - struct deflect_struc *dv; - char *p, *p1; - u_char accept; - - /* first check the 
internal deflection table */ - for (dv = table_head; dv; dv = dv->next) { - /* scan table */ - if (((dv->rule.callopt == 1) && (ic->command == ISDN_STAT_ICALLW)) || - ((dv->rule.callopt == 2) && (ic->command == ISDN_STAT_ICALL))) - continue; /* call option check */ - if (!(dv->rule.drvid & (1L << ic->driver))) - continue; /* driver not matching */ - if ((dv->rule.si1) && (dv->rule.si1 != ic->parm.setup.si1)) - continue; /* si1 not matching */ - if ((dv->rule.si2) && (dv->rule.si2 != ic->parm.setup.si2)) - continue; /* si2 not matching */ - - p = dv->rule.my_msn; - p1 = ic->parm.setup.eazmsn; - accept = 0; - while (*p) { - /* complete compare */ - if (*p == '-') { - accept = 1; /* call accepted */ - break; - } - if (*p++ != *p1++) - break; /* not accepted */ - if ((!*p) && (!*p1)) - accept = 1; - } /* complete compare */ - if (!accept) continue; /* not accepted */ - - if ((strcmp(dv->rule.caller, "0")) || - (ic->parm.setup.phone[0])) { - p = dv->rule.caller; - p1 = ic->parm.setup.phone; - accept = 0; - while (*p) { - /* complete compare */ - if (*p == '-') { - accept = 1; /* call accepted */ - break; - } - if (*p++ != *p1++) - break; /* not accepted */ - if ((!*p) && (!*p1)) - accept = 1; - } /* complete compare */ - if (!accept) continue; /* not accepted */ - } - - switch (dv->rule.action) { - case DEFLECT_IGNORE: - return 0; - - case DEFLECT_ALERT: - case DEFLECT_PROCEED: - case DEFLECT_REPORT: - case DEFLECT_REJECT: - if (dv->rule.action == DEFLECT_PROCEED) - if ((!if_used) || ((!extern_wait_max) && (!dv->rule.waittime))) - return (0); /* no external deflection needed */ - if (!(cs = kmalloc(sizeof(struct call_struc), GFP_ATOMIC))) - return (0); /* no memory */ - timer_setup(&cs->timer, deflect_timer_expire, 0); - cs->info[0] = '\0'; - - cs->ics = *ic; /* copy incoming data */ - if (!cs->ics.parm.setup.phone[0]) strcpy(cs->ics.parm.setup.phone, "0"); - if (!cs->ics.parm.setup.eazmsn[0]) strcpy(cs->ics.parm.setup.eazmsn, "0"); - cs->ics.parm.setup.screen = 
dv->rule.screen; - if (dv->rule.waittime) - cs->timer.expires = jiffies + (HZ * dv->rule.waittime); - else if (dv->rule.action == DEFLECT_PROCEED) - cs->timer.expires = jiffies + (HZ * extern_wait_max); - else - cs->timer.expires = 0; - cs->akt_state = dv->rule.action; - spin_lock_irqsave(&divert_lock, flags); - cs->divert_id = next_id++; /* new sequence number */ - spin_unlock_irqrestore(&divert_lock, flags); - cs->prev = NULL; - if (cs->akt_state == DEFLECT_ALERT) { - strcpy(cs->deflect_dest, dv->rule.to_nr); - if (!cs->timer.expires) { - strcpy(ic->parm.setup.eazmsn, - "Testtext direct"); - ic->parm.setup.screen = dv->rule.screen; - strlcpy(ic->parm.setup.phone, dv->rule.to_nr, sizeof(ic->parm.setup.phone)); - cs->akt_state = DEFLECT_AUTODEL; /* delete after timeout */ - cs->timer.expires = jiffies + (HZ * AUTODEL_TIME); - retval = 5; - } else - retval = 1; /* alerting */ - } else { - cs->deflect_dest[0] = '\0'; - retval = 4; /* only proceed */ - } - snprintf(cs->info, sizeof(cs->info), - "%d 0x%lx %s %s %s %s 0x%x 0x%x %d %d %s\n", - cs->akt_state, - cs->divert_id, - divert_if.drv_to_name(cs->ics.driver), - (ic->command == ISDN_STAT_ICALLW) ? "1" : "0", - cs->ics.parm.setup.phone, - cs->ics.parm.setup.eazmsn, - cs->ics.parm.setup.si1, - cs->ics.parm.setup.si2, - cs->ics.parm.setup.screen, - dv->rule.waittime, - cs->deflect_dest); - if ((dv->rule.action == DEFLECT_REPORT) || - (dv->rule.action == DEFLECT_REJECT)) { - put_info_buffer(cs->info); - kfree(cs); /* remove */ - return ((dv->rule.action == DEFLECT_REPORT) ? 
0 : 2); /* nothing to do */ - } - break; - - default: - return 0; /* ignore call */ - } /* switch action */ - break; /* will break the 'for' looping */ - } /* scan_table */ - - if (cs) { - cs->prev = NULL; - spin_lock_irqsave(&divert_lock, flags); - cs->next = divert_head; - divert_head = cs; - if (cs->timer.expires) add_timer(&cs->timer); - spin_unlock_irqrestore(&divert_lock, flags); - - put_info_buffer(cs->info); - return (retval); - } else - return (0); -} /* isdn_divert_icall */ - - -void deleteprocs(void) -{ - struct call_struc *cs, *cs1; - unsigned long flags; - - spin_lock_irqsave(&divert_lock, flags); - cs = divert_head; - divert_head = NULL; - while (cs) { - del_timer(&cs->timer); - cs1 = cs; - cs = cs->next; - kfree(cs1); - } - spin_unlock_irqrestore(&divert_lock, flags); -} /* deleteprocs */ - -/****************************************************/ -/* put a address including address type into buffer */ -/****************************************************/ -static int put_address(char *st, u_char *p, int len) -{ - u_char retval = 0; - u_char adr_typ = 0; /* network standard */ - - if (len < 2) return (retval); - if (*p == 0xA1) { - retval = *(++p) + 2; /* total length */ - if (retval > len) return (0); /* too short */ - len = retval - 2; /* remaining length */ - if (len < 3) return (0); - if ((*(++p) != 0x0A) || (*(++p) != 1)) return (0); - adr_typ = *(++p); - len -= 3; - p++; - if (len < 2) return (0); - if (*p++ != 0x12) return (0); - if (*p > len) return (0); /* check number length */ - len = *p++; - } else if (*p == 0x80) { - retval = *(++p) + 2; /* total length */ - if (retval > len) return (0); - len = retval - 2; - p++; - } else - return (0); /* invalid address information */ - - sprintf(st, "%d ", adr_typ); - st += strlen(st); - if (!len) - *st++ = '-'; - else - while (len--) - *st++ = *p++; - *st = '\0'; - return (retval); -} /* put_address */ - -/*************************************/ -/* report a successful interrogation */ 
-/*************************************/ -static int interrogate_success(isdn_ctrl *ic, struct call_struc *cs) -{ - char *src = ic->parm.dss1_io.data; - int restlen = ic->parm.dss1_io.datalen; - int cnt = 1; - u_char n, n1; - char st[90], *p, *stp; - - if (restlen < 2) return (-100); /* frame too short */ - if (*src++ != 0x30) return (-101); - if ((n = *src++) > 0x81) return (-102); /* invalid length field */ - restlen -= 2; /* remaining bytes */ - if (n == 0x80) { - if (restlen < 2) return (-103); - if ((*(src + restlen - 1)) || (*(src + restlen - 2))) return (-104); - restlen -= 2; - } else if (n == 0x81) { - n = *src++; - restlen--; - if (n > restlen) return (-105); - restlen = n; - } else if (n > restlen) - return (-106); - else - restlen = n; /* standard format */ - if (restlen < 3) return (-107); /* no procedure */ - if ((*src++ != 2) || (*src++ != 1) || (*src++ != 0x0B)) return (-108); - restlen -= 3; - if (restlen < 2) return (-109); /* list missing */ - if (*src == 0x31) { - src++; - if ((n = *src++) > 0x81) return (-110); /* invalid length field */ - restlen -= 2; /* remaining bytes */ - if (n == 0x80) { - if (restlen < 2) return (-111); - if ((*(src + restlen - 1)) || (*(src + restlen - 2))) return (-112); - restlen -= 2; - } else if (n == 0x81) { - n = *src++; - restlen--; - if (n > restlen) return (-113); - restlen = n; - } else if (n > restlen) - return (-114); - else - restlen = n; /* standard format */ - } /* result list header */ - - while (restlen >= 2) { - stp = st; - sprintf(stp, "%d 0x%lx %d %s ", DIVERT_REPORT, ic->parm.dss1_io.ll_id, - cnt++, divert_if.drv_to_name(ic->driver)); - stp += strlen(stp); - if (*src++ != 0x30) return (-115); /* invalid enum */ - n = *src++; - restlen -= 2; - if (n > restlen) return (-116); /* enum length wrong */ - restlen -= n; - p = src; /* one entry */ - src += n; - if (!(n1 = put_address(stp, p, n & 0xFF))) continue; - stp += strlen(stp); - p += n1; - n -= n1; - if (n < 6) continue; /* no service and proc */ - 
if ((*p++ != 0x0A) || (*p++ != 1)) continue; - sprintf(stp, " 0x%02x ", (*p++) & 0xFF); - stp += strlen(stp); - if ((*p++ != 0x0A) || (*p++ != 1)) continue; - sprintf(stp, "%d ", (*p++) & 0xFF); - stp += strlen(stp); - n -= 6; - if (n > 2) { - if (*p++ != 0x30) continue; - if (*p > (n - 2)) continue; - n = *p++; - if (!(n1 = put_address(stp, p, n & 0xFF))) continue; - stp += strlen(stp); - } - sprintf(stp, "\n"); - put_info_buffer(st); - } /* while restlen */ - if (restlen) return (-117); - return (0); -} /* interrogate_success */ - -/*********************************************/ -/* callback for protocol specific extensions */ -/*********************************************/ -static int prot_stat_callback(isdn_ctrl *ic) -{ - struct call_struc *cs, *cs1; - int i; - unsigned long flags; - - cs = divert_head; /* start of list */ - cs1 = NULL; - while (cs) { - if (ic->driver == cs->ics.driver) { - switch (cs->ics.arg) { - case DSS1_CMD_INVOKE: - if ((cs->ics.parm.dss1_io.ll_id == ic->parm.dss1_io.ll_id) && - (cs->ics.parm.dss1_io.hl_id == ic->parm.dss1_io.hl_id)) { - switch (ic->arg) { - case DSS1_STAT_INVOKE_ERR: - sprintf(cs->info, "128 0x%lx 0x%x\n", - ic->parm.dss1_io.ll_id, - ic->parm.dss1_io.timeout); - put_info_buffer(cs->info); - break; - - case DSS1_STAT_INVOKE_RES: - switch (cs->ics.parm.dss1_io.proc) { - case 7: - case 8: - put_info_buffer(cs->info); - break; - - case 11: - i = interrogate_success(ic, cs); - if (i) - sprintf(cs->info, "%d 0x%lx %d\n", DIVERT_REPORT, - ic->parm.dss1_io.ll_id, i); - put_info_buffer(cs->info); - break; - - default: - printk(KERN_WARNING "dss1_divert: unknown proc %d\n", cs->ics.parm.dss1_io.proc); - break; - } - - break; - - default: - printk(KERN_WARNING "dss1_divert unknown invoke answer %lx\n", ic->arg); - break; - } - cs1 = cs; /* remember structure */ - cs = NULL; - continue; /* abort search */ - } /* id found */ - break; - - case DSS1_CMD_INVOKE_ABORT: - printk(KERN_WARNING "dss1_divert unhandled invoke abort\n"); - 
break; - - default: - printk(KERN_WARNING "dss1_divert unknown cmd 0x%lx\n", cs->ics.arg); - break; - } /* switch ics.arg */ - cs = cs->next; - } /* driver ok */ - } - - if (!cs1) { - printk(KERN_WARNING "dss1_divert unhandled process\n"); - return (0); - } - - if (cs1->ics.driver == -1) { - spin_lock_irqsave(&divert_lock, flags); - del_timer(&cs1->timer); - if (cs1->prev) - cs1->prev->next = cs1->next; /* forward link */ - else - divert_head = cs1->next; - if (cs1->next) - cs1->next->prev = cs1->prev; /* back link */ - spin_unlock_irqrestore(&divert_lock, flags); - kfree(cs1); - } - - return (0); -} /* prot_stat_callback */ - - -/***************************/ -/* status callback from HL */ -/***************************/ -static int isdn_divert_stat_callback(isdn_ctrl *ic) -{ - struct call_struc *cs, *cs1; - unsigned long flags; - int retval; - - retval = -1; - cs = divert_head; /* start of list */ - while (cs) { - if ((ic->driver == cs->ics.driver) && - (ic->arg == cs->ics.arg)) { - switch (ic->command) { - case ISDN_STAT_DHUP: - sprintf(cs->info, "129 0x%lx\n", cs->divert_id); - del_timer(&cs->timer); - cs->ics.driver = -1; - break; - - case ISDN_STAT_CAUSE: - sprintf(cs->info, "130 0x%lx %s\n", cs->divert_id, ic->parm.num); - break; - - case ISDN_STAT_REDIR: - sprintf(cs->info, "131 0x%lx\n", cs->divert_id); - del_timer(&cs->timer); - cs->ics.driver = -1; - break; - - default: - sprintf(cs->info, "999 0x%lx 0x%x\n", cs->divert_id, (int)(ic->command)); - break; - } - put_info_buffer(cs->info); - retval = 0; - } - cs1 = cs; - cs = cs->next; - if (cs1->ics.driver == -1) { - spin_lock_irqsave(&divert_lock, flags); - if (cs1->prev) - cs1->prev->next = cs1->next; /* forward link */ - else - divert_head = cs1->next; - if (cs1->next) - cs1->next->prev = cs1->prev; /* back link */ - spin_unlock_irqrestore(&divert_lock, flags); - kfree(cs1); - } - } - return (retval); /* not found */ -} /* isdn_divert_stat_callback */ - - -/********************/ -/* callback from ll */ 
-/********************/ -int ll_callback(isdn_ctrl *ic) -{ - switch (ic->command) { - case ISDN_STAT_ICALL: - case ISDN_STAT_ICALLW: - return (isdn_divert_icall(ic)); - break; - - case ISDN_STAT_PROT: - if ((ic->arg & 0xFF) == ISDN_PTYPE_EURO) { - if (ic->arg != DSS1_STAT_INVOKE_BRD) - return (prot_stat_callback(ic)); - else - return (0); /* DSS1 invoke broadcast */ - } else - return (-1); /* protocol not euro */ - - default: - return (isdn_divert_stat_callback(ic)); - } -} /* ll_callback */ diff --git a/drivers/isdn/divert/isdn_divert.h b/drivers/isdn/divert/isdn_divert.h deleted file mode 100644 index 55033dd872c0..000000000000 --- a/drivers/isdn/divert/isdn_divert.h +++ /dev/null @@ -1,132 +0,0 @@ -/* $Id: isdn_divert.h,v 1.5.6.1 2001/09/23 22:24:36 kai Exp $ - * - * Header for the diversion supplementary ioctl interface. - * - * Copyright 1998 by Werner Cornelius (werner@ikt.de) - * - * This software may be used and distributed according to the terms - * of the GNU General Public License, incorporated herein by reference. 
- * - */ - -#include -#include - -/******************************************/ -/* IOCTL codes for interface to user prog */ -/******************************************/ -#define DIVERT_IIOC_VERSION 0x01 /* actual version */ -#define IIOCGETVER _IO('I', 1) /* get version of interface */ -#define IIOCGETDRV _IO('I', 2) /* get driver number */ -#define IIOCGETNAM _IO('I', 3) /* get driver name */ -#define IIOCGETRULE _IO('I', 4) /* read one rule */ -#define IIOCMODRULE _IO('I', 5) /* modify/replace a rule */ -#define IIOCINSRULE _IO('I', 6) /* insert/append one rule */ -#define IIOCDELRULE _IO('I', 7) /* delete a rule */ -#define IIOCDODFACT _IO('I', 8) /* hangup/reject/alert/immediately deflect a call */ -#define IIOCDOCFACT _IO('I', 9) /* activate control forwarding in PBX */ -#define IIOCDOCFDIS _IO('I', 10) /* deactivate control forwarding in PBX */ -#define IIOCDOCFINT _IO('I', 11) /* interrogate control forwarding in PBX */ - -/*************************************/ -/* states reported through interface */ -/*************************************/ -#define DEFLECT_IGNORE 0 /* ignore incoming call */ -#define DEFLECT_REPORT 1 /* only report */ -#define DEFLECT_PROCEED 2 /* deflect when externally triggered */ -#define DEFLECT_ALERT 3 /* alert and deflect after delay */ -#define DEFLECT_REJECT 4 /* reject immediately */ -#define DIVERT_ACTIVATE 5 /* diversion activate */ -#define DIVERT_DEACTIVATE 6 /* diversion deactivate */ -#define DIVERT_REPORT 7 /* interrogation result */ -#define DEFLECT_AUTODEL 255 /* only for internal use */ - -#define DEFLECT_ALL_IDS 0xFFFFFFFF /* all drivers selected */ - -typedef struct { - ulong drvid; /* driver ids, bit mapped */ - char my_msn[35]; /* desired msn, subaddr allowed */ - char caller[35]; /* caller id, partial string with * + subaddr allowed */ - char to_nr[35]; /* deflected to number incl. 
subaddress */ - u_char si1, si2; /* service indicators, si1=bitmask, si1+2 0 = all */ - u_char screen; /* screening: 0 = no info, 1 = info, 2 = nfo with nr */ - u_char callopt; /* option for call handling: - 0 = all calls - 1 = only non waiting calls - 2 = only waiting calls */ - u_char action; /* desired action: - 0 = don't report call -> ignore - 1 = report call, do not allow/proceed for deflection - 2 = report call, send proceed, wait max waittime secs - 3 = report call, alert and deflect after waittime - 4 = report call, reject immediately - actions 1-2 only take place if interface is opened - */ - u_char waittime; /* maximum wait time for proceeding */ -} divert_rule; - -typedef union { - int drv_version; /* return of driver version */ - struct { - int drvid; /* id of driver */ - char drvnam[30]; /* name of driver */ - } getid; - struct { - int ruleidx; /* index of rule */ - divert_rule rule; /* rule parms */ - } getsetrule; - struct { - u_char subcmd; /* 0 = hangup/reject, - 1 = alert, - 2 = deflect */ - ulong callid; /* id of call delivered by ascii output */ - char to_nr[35]; /* destination when deflect, - else uus1 string (maxlen 31), - data from rule used if empty */ - } fwd_ctrl; - struct { - int drvid; /* id of driver */ - u_char cfproc; /* cfu = 0, cfb = 1, cfnr = 2 */ - ulong procid; /* process id returned when no error */ - u_char service; /* basically coded service, 0 = all */ - char msn[25]; /* desired msn, empty = all */ - char fwd_nr[35];/* forwarded to number + subaddress */ - } cf_ctrl; -} divert_ioctl; - -#ifdef __KERNEL__ - -#include -#include - -#define AUTODEL_TIME 30 /* timeout in s to delete internal entries */ - -/**************************************************/ -/* structure keeping ascii info for device output */ -/**************************************************/ -struct divert_info { - struct divert_info *next; - ulong usage_cnt; /* number of files still to work */ - char info_start[2]; /* info string start */ -}; - - 
-/**************/ -/* Prototypes */ -/**************/ -extern spinlock_t divert_lock; - -extern ulong if_used; /* number of interface users */ -extern int divert_dev_deinit(void); -extern int divert_dev_init(void); -extern void put_info_buffer(char *); -extern int ll_callback(isdn_ctrl *); -extern isdn_divert_if divert_if; -extern divert_rule *getruleptr(int); -extern int insertrule(int, divert_rule *); -extern int deleterule(int); -extern void deleteprocs(void); -extern int deflect_extern_action(u_char, ulong, char *); -extern int cf_command(int, int, u_char, char *, u_char, char *, ulong *); - -#endif /* __KERNEL__ */ diff --git a/drivers/isdn/i4l/Kconfig b/drivers/isdn/i4l/Kconfig deleted file mode 100644 index cacde8de38a3..000000000000 --- a/drivers/isdn/i4l/Kconfig +++ /dev/null @@ -1,127 +0,0 @@ -# SPDX-License-Identifier: GPL-2.0-only -# -# Old ISDN4Linux config -# - -if ISDN_I4L - -config ISDN_PPP - bool "Support synchronous PPP" - depends on INET - select SLHC - help - Over digital connections such as ISDN, there is no need to - synchronize sender and recipient's clocks with start and stop bits - as is done over analog telephone lines. Instead, one can use - "synchronous PPP". Saying Y here will include this protocol. This - protocol is used by Cisco and Sun for example. So you want to say Y - here if the other end of your ISDN connection supports it. You will - need a special version of pppd (called ipppd) for using this - feature. See and - for more information. - -config ISDN_PPP_VJ - bool "Use VJ-compression with synchronous PPP" - depends on ISDN_PPP - help - This enables Van Jacobson header compression for synchronous PPP. - Say Y if the other end of the connection supports it. - -config ISDN_MPP - bool "Support generic MP (RFC 1717)" - depends on ISDN_PPP - help - With synchronous PPP enabled, it is possible to increase throughput - by bundling several ISDN-connections, using this protocol. See - for more information. 
- -config IPPP_FILTER - bool "Filtering for synchronous PPP" - depends on ISDN_PPP - help - Say Y here if you want to be able to filter the packets passing over - IPPP interfaces. This allows you to control which packets count as - activity (i.e. which packets will reset the idle timer or bring up - a demand-dialled link) and which packets are to be dropped entirely. - You need to say Y here if you wish to use the pass-filter and - active-filter options to ipppd. - -config ISDN_PPP_BSDCOMP - tristate "Support BSD compression" - depends on ISDN_PPP - help - Support for the BSD-Compress compression method for PPP, which uses - the LZW compression method to compress each PPP packet before it is - sent over the wire. The machine at the other end of the PPP link - (usually your ISP) has to support the BSD-Compress compression - method as well for this to be useful. Even if they don't support it, - it is safe to say Y here. - -config ISDN_AUDIO - bool "Support audio via ISDN" - help - If you say Y here, the modem-emulator will support a subset of the - EIA Class 8 Voice commands. Using a getty with voice-support - (mgetty+sendfax by with an extension, available - with the ISDN utility package for example), you will be able to use - your Linux box as an ISDN-answering machine. Of course, this must be - supported by the lowlevel driver also. Currently, the HiSax driver - is the only voice-supporting driver. See - for more information. - -config ISDN_TTY_FAX - bool "Support AT-Fax Class 1 and 2 commands" - depends on ISDN_AUDIO - help - If you say Y here, the modem-emulator will support a subset of the - Fax Class 1 and 2 commands. Using a getty with fax-support - (mgetty+sendfax, hylafax), you will be able to use your Linux box as - an ISDN-fax-machine. This must be supported by the lowlevel driver - also. See for more information. - -config ISDN_X25 - bool "X.25 PLP on top of ISDN" - depends on X25 - help - This feature provides the X.25 protocol over ISDN connections. 
- See for more information - if you are thinking about using this. - - -menu "ISDN feature submodules" - -config ISDN_DRV_LOOP - tristate "isdnloop support" - depends on BROKEN_ON_SMP - help - This driver provides a virtual ISDN card. Its primary purpose is - testing of linklevel features or configuration without getting - charged by your service-provider for lots of phone calls. - You need will need the loopctrl utility from the latest isdn4k-utils - package to set up this driver. - -config ISDN_DIVERSION - tristate "Support isdn diversion services" - help - This option allows you to use some supplementary diversion - services in conjunction with the HiSax driver on an EURO/DSS1 - line. - - Supported options are CD (call deflection), CFU (Call forward - unconditional), CFB (Call forward when busy) and CFNR (call forward - not reachable). Additionally the actual CFU, CFB and CFNR state may - be interrogated. - - The use of CFU, CFB, CFNR and interrogation may be limited to some - countries. The keypad protocol is still not implemented. CD should - work in all countries if the service has been subscribed to. - - Please read the file . - -endmenu - -comment "ISDN4Linux hardware drivers" - -# end ISDN_I4L -endif - diff --git a/drivers/isdn/i4l/Makefile b/drivers/isdn/i4l/Makefile index be77500c9e86..11fe697739d5 100644 --- a/drivers/isdn/i4l/Makefile +++ b/drivers/isdn/i4l/Makefile @@ -3,18 +3,4 @@ # Each configuration option enables a list of files. -obj-$(CONFIG_ISDN_I4L) += isdn.o -obj-$(CONFIG_ISDN_PPP_BSDCOMP) += isdn_bsdcomp.o obj-$(CONFIG_ISDN_HDLC) += isdnhdlc.o - -# Multipart objects. - -isdn-y := isdn_net.o isdn_tty.o isdn_v110.o isdn_common.o - -# Optional parts of multipart objects. 
- -isdn-$(CONFIG_ISDN_PPP) += isdn_ppp.o -isdn-$(CONFIG_ISDN_X25) += isdn_concap.o isdn_x25iface.o -isdn-$(CONFIG_ISDN_AUDIO) += isdn_audio.o -isdn-$(CONFIG_ISDN_TTY_FAX) += isdn_ttyfax.o - diff --git a/drivers/isdn/i4l/isdn_audio.c b/drivers/isdn/i4l/isdn_audio.c deleted file mode 100644 index b6bcd1eca128..000000000000 --- a/drivers/isdn/i4l/isdn_audio.c +++ /dev/null @@ -1,711 +0,0 @@ -/* $Id: isdn_audio.c,v 1.1.2.2 2004/01/12 22:37:18 keil Exp $ - * - * Linux ISDN subsystem, audio conversion and compression (linklevel). - * - * Copyright 1994-1999 by Fritz Elfert (fritz@isdn4linux.de) - * DTMF code (c) 1996 by Christian Mock (cm@kukuruz.ping.at) - * Silence detection (c) 1998 by Armin Schindler (mac@gismo.telekom.de) - * - * This software may be used and distributed according to the terms - * of the GNU General Public License, incorporated herein by reference. - * - */ - -#include -#include -#include "isdn_audio.h" -#include "isdn_common.h" - -char *isdn_audio_revision = "$Revision: 1.1.2.2 $"; - -/* - * Misc. lookup-tables. 
- */ - -/* ulaw -> signed 16-bit */ -static short isdn_audio_ulaw_to_s16[] = -{ - 0x8284, 0x8684, 0x8a84, 0x8e84, 0x9284, 0x9684, 0x9a84, 0x9e84, - 0xa284, 0xa684, 0xaa84, 0xae84, 0xb284, 0xb684, 0xba84, 0xbe84, - 0xc184, 0xc384, 0xc584, 0xc784, 0xc984, 0xcb84, 0xcd84, 0xcf84, - 0xd184, 0xd384, 0xd584, 0xd784, 0xd984, 0xdb84, 0xdd84, 0xdf84, - 0xe104, 0xe204, 0xe304, 0xe404, 0xe504, 0xe604, 0xe704, 0xe804, - 0xe904, 0xea04, 0xeb04, 0xec04, 0xed04, 0xee04, 0xef04, 0xf004, - 0xf0c4, 0xf144, 0xf1c4, 0xf244, 0xf2c4, 0xf344, 0xf3c4, 0xf444, - 0xf4c4, 0xf544, 0xf5c4, 0xf644, 0xf6c4, 0xf744, 0xf7c4, 0xf844, - 0xf8a4, 0xf8e4, 0xf924, 0xf964, 0xf9a4, 0xf9e4, 0xfa24, 0xfa64, - 0xfaa4, 0xfae4, 0xfb24, 0xfb64, 0xfba4, 0xfbe4, 0xfc24, 0xfc64, - 0xfc94, 0xfcb4, 0xfcd4, 0xfcf4, 0xfd14, 0xfd34, 0xfd54, 0xfd74, - 0xfd94, 0xfdb4, 0xfdd4, 0xfdf4, 0xfe14, 0xfe34, 0xfe54, 0xfe74, - 0xfe8c, 0xfe9c, 0xfeac, 0xfebc, 0xfecc, 0xfedc, 0xfeec, 0xfefc, - 0xff0c, 0xff1c, 0xff2c, 0xff3c, 0xff4c, 0xff5c, 0xff6c, 0xff7c, - 0xff88, 0xff90, 0xff98, 0xffa0, 0xffa8, 0xffb0, 0xffb8, 0xffc0, - 0xffc8, 0xffd0, 0xffd8, 0xffe0, 0xffe8, 0xfff0, 0xfff8, 0x0000, - 0x7d7c, 0x797c, 0x757c, 0x717c, 0x6d7c, 0x697c, 0x657c, 0x617c, - 0x5d7c, 0x597c, 0x557c, 0x517c, 0x4d7c, 0x497c, 0x457c, 0x417c, - 0x3e7c, 0x3c7c, 0x3a7c, 0x387c, 0x367c, 0x347c, 0x327c, 0x307c, - 0x2e7c, 0x2c7c, 0x2a7c, 0x287c, 0x267c, 0x247c, 0x227c, 0x207c, - 0x1efc, 0x1dfc, 0x1cfc, 0x1bfc, 0x1afc, 0x19fc, 0x18fc, 0x17fc, - 0x16fc, 0x15fc, 0x14fc, 0x13fc, 0x12fc, 0x11fc, 0x10fc, 0x0ffc, - 0x0f3c, 0x0ebc, 0x0e3c, 0x0dbc, 0x0d3c, 0x0cbc, 0x0c3c, 0x0bbc, - 0x0b3c, 0x0abc, 0x0a3c, 0x09bc, 0x093c, 0x08bc, 0x083c, 0x07bc, - 0x075c, 0x071c, 0x06dc, 0x069c, 0x065c, 0x061c, 0x05dc, 0x059c, - 0x055c, 0x051c, 0x04dc, 0x049c, 0x045c, 0x041c, 0x03dc, 0x039c, - 0x036c, 0x034c, 0x032c, 0x030c, 0x02ec, 0x02cc, 0x02ac, 0x028c, - 0x026c, 0x024c, 0x022c, 0x020c, 0x01ec, 0x01cc, 0x01ac, 0x018c, - 0x0174, 0x0164, 0x0154, 0x0144, 0x0134, 0x0124, 0x0114, 0x0104, - 
0x00f4, 0x00e4, 0x00d4, 0x00c4, 0x00b4, 0x00a4, 0x0094, 0x0084, - 0x0078, 0x0070, 0x0068, 0x0060, 0x0058, 0x0050, 0x0048, 0x0040, - 0x0038, 0x0030, 0x0028, 0x0020, 0x0018, 0x0010, 0x0008, 0x0000 -}; - -/* alaw -> signed 16-bit */ -static short isdn_audio_alaw_to_s16[] = -{ - 0x13fc, 0xec04, 0x0144, 0xfebc, 0x517c, 0xae84, 0x051c, 0xfae4, - 0x0a3c, 0xf5c4, 0x0048, 0xffb8, 0x287c, 0xd784, 0x028c, 0xfd74, - 0x1bfc, 0xe404, 0x01cc, 0xfe34, 0x717c, 0x8e84, 0x071c, 0xf8e4, - 0x0e3c, 0xf1c4, 0x00c4, 0xff3c, 0x387c, 0xc784, 0x039c, 0xfc64, - 0x0ffc, 0xf004, 0x0104, 0xfefc, 0x417c, 0xbe84, 0x041c, 0xfbe4, - 0x083c, 0xf7c4, 0x0008, 0xfff8, 0x207c, 0xdf84, 0x020c, 0xfdf4, - 0x17fc, 0xe804, 0x018c, 0xfe74, 0x617c, 0x9e84, 0x061c, 0xf9e4, - 0x0c3c, 0xf3c4, 0x0084, 0xff7c, 0x307c, 0xcf84, 0x030c, 0xfcf4, - 0x15fc, 0xea04, 0x0164, 0xfe9c, 0x597c, 0xa684, 0x059c, 0xfa64, - 0x0b3c, 0xf4c4, 0x0068, 0xff98, 0x2c7c, 0xd384, 0x02cc, 0xfd34, - 0x1dfc, 0xe204, 0x01ec, 0xfe14, 0x797c, 0x8684, 0x07bc, 0xf844, - 0x0f3c, 0xf0c4, 0x00e4, 0xff1c, 0x3c7c, 0xc384, 0x03dc, 0xfc24, - 0x11fc, 0xee04, 0x0124, 0xfedc, 0x497c, 0xb684, 0x049c, 0xfb64, - 0x093c, 0xf6c4, 0x0028, 0xffd8, 0x247c, 0xdb84, 0x024c, 0xfdb4, - 0x19fc, 0xe604, 0x01ac, 0xfe54, 0x697c, 0x9684, 0x069c, 0xf964, - 0x0d3c, 0xf2c4, 0x00a4, 0xff5c, 0x347c, 0xcb84, 0x034c, 0xfcb4, - 0x12fc, 0xed04, 0x0134, 0xfecc, 0x4d7c, 0xb284, 0x04dc, 0xfb24, - 0x09bc, 0xf644, 0x0038, 0xffc8, 0x267c, 0xd984, 0x026c, 0xfd94, - 0x1afc, 0xe504, 0x01ac, 0xfe54, 0x6d7c, 0x9284, 0x06dc, 0xf924, - 0x0dbc, 0xf244, 0x00b4, 0xff4c, 0x367c, 0xc984, 0x036c, 0xfc94, - 0x0f3c, 0xf0c4, 0x00f4, 0xff0c, 0x3e7c, 0xc184, 0x03dc, 0xfc24, - 0x07bc, 0xf844, 0x0008, 0xfff8, 0x1efc, 0xe104, 0x01ec, 0xfe14, - 0x16fc, 0xe904, 0x0174, 0xfe8c, 0x5d7c, 0xa284, 0x05dc, 0xfa24, - 0x0bbc, 0xf444, 0x0078, 0xff88, 0x2e7c, 0xd184, 0x02ec, 0xfd14, - 0x14fc, 0xeb04, 0x0154, 0xfeac, 0x557c, 0xaa84, 0x055c, 0xfaa4, - 0x0abc, 0xf544, 0x0058, 0xffa8, 0x2a7c, 0xd584, 0x02ac, 0xfd54, - 0x1cfc, 
0xe304, 0x01cc, 0xfe34, 0x757c, 0x8a84, 0x075c, 0xf8a4, - 0x0ebc, 0xf144, 0x00d4, 0xff2c, 0x3a7c, 0xc584, 0x039c, 0xfc64, - 0x10fc, 0xef04, 0x0114, 0xfeec, 0x457c, 0xba84, 0x045c, 0xfba4, - 0x08bc, 0xf744, 0x0018, 0xffe8, 0x227c, 0xdd84, 0x022c, 0xfdd4, - 0x18fc, 0xe704, 0x018c, 0xfe74, 0x657c, 0x9a84, 0x065c, 0xf9a4, - 0x0cbc, 0xf344, 0x0094, 0xff6c, 0x327c, 0xcd84, 0x032c, 0xfcd4 -}; - -/* alaw -> ulaw */ -static char isdn_audio_alaw_to_ulaw[] = -{ - 0xab, 0x2b, 0xe3, 0x63, 0x8b, 0x0b, 0xc9, 0x49, - 0xba, 0x3a, 0xf6, 0x76, 0x9b, 0x1b, 0xd7, 0x57, - 0xa3, 0x23, 0xdd, 0x5d, 0x83, 0x03, 0xc1, 0x41, - 0xb2, 0x32, 0xeb, 0x6b, 0x93, 0x13, 0xcf, 0x4f, - 0xaf, 0x2f, 0xe7, 0x67, 0x8f, 0x0f, 0xcd, 0x4d, - 0xbe, 0x3e, 0xfe, 0x7e, 0x9f, 0x1f, 0xdb, 0x5b, - 0xa7, 0x27, 0xdf, 0x5f, 0x87, 0x07, 0xc5, 0x45, - 0xb6, 0x36, 0xef, 0x6f, 0x97, 0x17, 0xd3, 0x53, - 0xa9, 0x29, 0xe1, 0x61, 0x89, 0x09, 0xc7, 0x47, - 0xb8, 0x38, 0xf2, 0x72, 0x99, 0x19, 0xd5, 0x55, - 0xa1, 0x21, 0xdc, 0x5c, 0x81, 0x01, 0xbf, 0x3f, - 0xb0, 0x30, 0xe9, 0x69, 0x91, 0x11, 0xce, 0x4e, - 0xad, 0x2d, 0xe5, 0x65, 0x8d, 0x0d, 0xcb, 0x4b, - 0xbc, 0x3c, 0xfa, 0x7a, 0x9d, 0x1d, 0xd9, 0x59, - 0xa5, 0x25, 0xde, 0x5e, 0x85, 0x05, 0xc3, 0x43, - 0xb4, 0x34, 0xed, 0x6d, 0x95, 0x15, 0xd1, 0x51, - 0xac, 0x2c, 0xe4, 0x64, 0x8c, 0x0c, 0xca, 0x4a, - 0xbb, 0x3b, 0xf8, 0x78, 0x9c, 0x1c, 0xd8, 0x58, - 0xa4, 0x24, 0xde, 0x5e, 0x84, 0x04, 0xc2, 0x42, - 0xb3, 0x33, 0xec, 0x6c, 0x94, 0x14, 0xd0, 0x50, - 0xb0, 0x30, 0xe8, 0x68, 0x90, 0x10, 0xce, 0x4e, - 0xbf, 0x3f, 0xfe, 0x7e, 0xa0, 0x20, 0xdc, 0x5c, - 0xa8, 0x28, 0xe0, 0x60, 0x88, 0x08, 0xc6, 0x46, - 0xb7, 0x37, 0xf0, 0x70, 0x98, 0x18, 0xd4, 0x54, - 0xaa, 0x2a, 0xe2, 0x62, 0x8a, 0x0a, 0xc8, 0x48, - 0xb9, 0x39, 0xf4, 0x74, 0x9a, 0x1a, 0xd6, 0x56, - 0xa2, 0x22, 0xdd, 0x5d, 0x82, 0x02, 0xc0, 0x40, - 0xb1, 0x31, 0xea, 0x6a, 0x92, 0x12, 0xcf, 0x4f, - 0xae, 0x2e, 0xe6, 0x66, 0x8e, 0x0e, 0xcc, 0x4c, - 0xbd, 0x3d, 0xfc, 0x7c, 0x9e, 0x1e, 0xda, 0x5a, - 0xa6, 0x26, 0xdf, 0x5f, 0x86, 0x06, 0xc4, 
0x44, - 0xb5, 0x35, 0xee, 0x6e, 0x96, 0x16, 0xd2, 0x52 -}; - -/* ulaw -> alaw */ -static char isdn_audio_ulaw_to_alaw[] = -{ - 0xab, 0x55, 0xd5, 0x15, 0x95, 0x75, 0xf5, 0x35, - 0xb5, 0x45, 0xc5, 0x05, 0x85, 0x65, 0xe5, 0x25, - 0xa5, 0x5d, 0xdd, 0x1d, 0x9d, 0x7d, 0xfd, 0x3d, - 0xbd, 0x4d, 0xcd, 0x0d, 0x8d, 0x6d, 0xed, 0x2d, - 0xad, 0x51, 0xd1, 0x11, 0x91, 0x71, 0xf1, 0x31, - 0xb1, 0x41, 0xc1, 0x01, 0x81, 0x61, 0xe1, 0x21, - 0x59, 0xd9, 0x19, 0x99, 0x79, 0xf9, 0x39, 0xb9, - 0x49, 0xc9, 0x09, 0x89, 0x69, 0xe9, 0x29, 0xa9, - 0xd7, 0x17, 0x97, 0x77, 0xf7, 0x37, 0xb7, 0x47, - 0xc7, 0x07, 0x87, 0x67, 0xe7, 0x27, 0xa7, 0xdf, - 0x9f, 0x7f, 0xff, 0x3f, 0xbf, 0x4f, 0xcf, 0x0f, - 0x8f, 0x6f, 0xef, 0x2f, 0x53, 0x13, 0x73, 0x33, - 0xb3, 0x43, 0xc3, 0x03, 0x83, 0x63, 0xe3, 0x23, - 0xa3, 0x5b, 0xdb, 0x1b, 0x9b, 0x7b, 0xfb, 0x3b, - 0xbb, 0xbb, 0x4b, 0x4b, 0xcb, 0xcb, 0x0b, 0x0b, - 0x8b, 0x8b, 0x6b, 0x6b, 0xeb, 0xeb, 0x2b, 0x2b, - 0xab, 0x54, 0xd4, 0x14, 0x94, 0x74, 0xf4, 0x34, - 0xb4, 0x44, 0xc4, 0x04, 0x84, 0x64, 0xe4, 0x24, - 0xa4, 0x5c, 0xdc, 0x1c, 0x9c, 0x7c, 0xfc, 0x3c, - 0xbc, 0x4c, 0xcc, 0x0c, 0x8c, 0x6c, 0xec, 0x2c, - 0xac, 0x50, 0xd0, 0x10, 0x90, 0x70, 0xf0, 0x30, - 0xb0, 0x40, 0xc0, 0x00, 0x80, 0x60, 0xe0, 0x20, - 0x58, 0xd8, 0x18, 0x98, 0x78, 0xf8, 0x38, 0xb8, - 0x48, 0xc8, 0x08, 0x88, 0x68, 0xe8, 0x28, 0xa8, - 0xd6, 0x16, 0x96, 0x76, 0xf6, 0x36, 0xb6, 0x46, - 0xc6, 0x06, 0x86, 0x66, 0xe6, 0x26, 0xa6, 0xde, - 0x9e, 0x7e, 0xfe, 0x3e, 0xbe, 0x4e, 0xce, 0x0e, - 0x8e, 0x6e, 0xee, 0x2e, 0x52, 0x12, 0x72, 0x32, - 0xb2, 0x42, 0xc2, 0x02, 0x82, 0x62, 0xe2, 0x22, - 0xa2, 0x5a, 0xda, 0x1a, 0x9a, 0x7a, 0xfa, 0x3a, - 0xba, 0xba, 0x4a, 0x4a, 0xca, 0xca, 0x0a, 0x0a, - 0x8a, 0x8a, 0x6a, 0x6a, 0xea, 0xea, 0x2a, 0x2a -}; - -#define NCOEFF 8 /* number of frequencies to be analyzed */ -#define DTMF_TRESH 4000 /* above this is dtmf */ -#define SILENCE_TRESH 200 /* below this is silence */ -#define AMP_BITS 9 /* bits per sample, reduced to avoid overflow */ -#define LOGRP 0 -#define HIGRP 1 
- -/* For DTMF recognition: - * 2 * cos(2 * PI * k / N) precalculated for all k - */ -static int cos2pik[NCOEFF] = -{ - 55813, 53604, 51193, 48591, 38114, 33057, 25889, 18332 -}; - -static char dtmf_matrix[4][4] = -{ - {'1', '2', '3', 'A'}, - {'4', '5', '6', 'B'}, - {'7', '8', '9', 'C'}, - {'*', '0', '#', 'D'} -}; - -static inline void -isdn_audio_tlookup(const u_char *table, u_char *buff, unsigned long n) -{ -#ifdef __i386__ - unsigned long d0, d1, d2, d3; - __asm__ __volatile__( - "cld\n" - "1:\tlodsb\n\t" - "xlatb\n\t" - "stosb\n\t" - "loop 1b\n\t" - : "=&b"(d0), "=&c"(d1), "=&D"(d2), "=&S"(d3) - : "0"((long) table), "1"(n), "2"((long) buff), "3"((long) buff) - : "memory", "ax"); -#else - while (n--) - *buff = table[*(unsigned char *)buff], buff++; -#endif -} - -void -isdn_audio_ulaw2alaw(unsigned char *buff, unsigned long len) -{ - isdn_audio_tlookup(isdn_audio_ulaw_to_alaw, buff, len); -} - -void -isdn_audio_alaw2ulaw(unsigned char *buff, unsigned long len) -{ - isdn_audio_tlookup(isdn_audio_alaw_to_ulaw, buff, len); -} - -/* - * linear <-> adpcm conversion stuff - * Most parts from the mgetty-package. 
- * (C) by Gert Doering and Klaus Weidner - * Used by permission of Gert Doering - */ - - -#define ZEROTRAP /* turn on the trap as per the MIL-STD */ -#undef ZEROTRAP -#define BIAS 0x84 /* define the add-in bias for 16 bit samples */ -#define CLIP 32635 - -static unsigned char -isdn_audio_linear2ulaw(int sample) -{ - static int exp_lut[256] = - { - 0, 0, 1, 1, 2, 2, 2, 2, 3, 3, 3, 3, 3, 3, 3, 3, - 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, - 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, - 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, - 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, - 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, - 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, - 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, - 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, - 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, - 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, - 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, - 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, - 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, - 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, - 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7 - }; - int sign, - exponent, - mantissa; - unsigned char ulawbyte; - - /* Get the sample into sign-magnitude. */ - sign = (sample >> 8) & 0x80; /* set aside the sign */ - if (sign != 0) - sample = -sample; /* get magnitude */ - if (sample > CLIP) - sample = CLIP; /* clip the magnitude */ - - /* Convert from 16 bit linear to ulaw. 
*/ - sample = sample + BIAS; - exponent = exp_lut[(sample >> 7) & 0xFF]; - mantissa = (sample >> (exponent + 3)) & 0x0F; - ulawbyte = ~(sign | (exponent << 4) | mantissa); -#ifdef ZEROTRAP - /* optional CCITT trap */ - if (ulawbyte == 0) - ulawbyte = 0x02; -#endif - return (ulawbyte); -} - - -static int Mx[3][8] = -{ - {0x3800, 0x5600, 0, 0, 0, 0, 0, 0}, - {0x399a, 0x3a9f, 0x4d14, 0x6607, 0, 0, 0, 0}, - {0x3556, 0x3556, 0x399A, 0x3A9F, 0x4200, 0x4D14, 0x6607, 0x6607}, -}; - -static int bitmask[9] = -{ - 0, 0x01, 0x03, 0x07, 0x0f, 0x1f, 0x3f, 0x7f, 0xff -}; - -static int -isdn_audio_get_bits(adpcm_state *s, unsigned char **in, int *len) -{ - while (s->nleft < s->nbits) { - int d = *((*in)++); - (*len)--; - s->word = (s->word << 8) | d; - s->nleft += 8; - } - s->nleft -= s->nbits; - return (s->word >> s->nleft) & bitmask[s->nbits]; -} - -static void -isdn_audio_put_bits(int data, int nbits, adpcm_state *s, - unsigned char **out, int *len) -{ - s->word = (s->word << nbits) | (data & bitmask[nbits]); - s->nleft += nbits; - while (s->nleft >= 8) { - int d = (s->word >> (s->nleft - 8)); - *(out[0]++) = d & 255; - (*len)++; - s->nleft -= 8; - } -} - -adpcm_state * -isdn_audio_adpcm_init(adpcm_state *s, int nbits) -{ - if (!s) - s = kmalloc(sizeof(adpcm_state), GFP_ATOMIC); - if (s) { - s->a = 0; - s->d = 5; - s->word = 0; - s->nleft = 0; - s->nbits = nbits; - } - return s; -} - -dtmf_state * -isdn_audio_dtmf_init(dtmf_state *s) -{ - if (!s) - s = kmalloc(sizeof(dtmf_state), GFP_ATOMIC); - if (s) { - s->idx = 0; - s->last = ' '; - } - return s; -} - -/* - * Decompression of adpcm data to a/u-law - * - */ - -int -isdn_audio_adpcm2xlaw(adpcm_state *s, int fmt, unsigned char *in, - unsigned char *out, int len) -{ - int a = s->a; - int d = s->d; - int nbits = s->nbits; - int olen = 0; - - while (len) { - int e = isdn_audio_get_bits(s, &in, &len); - int sign; - - if (nbits == 4 && e == 0) - d = 4; - sign = (e >> (nbits - 1)) ? 
-1 : 1; - e &= bitmask[nbits - 1]; - a += sign * ((e << 1) + 1) * d >> 1; - if (d & 1) - a++; - if (fmt) - *out++ = isdn_audio_ulaw_to_alaw[ - isdn_audio_linear2ulaw(a << 2)]; - else - *out++ = isdn_audio_linear2ulaw(a << 2); - olen++; - d = (d * Mx[nbits - 2][e] + 0x2000) >> 14; - if (d < 5) - d = 5; - } - s->a = a; - s->d = d; - return olen; -} - -int -isdn_audio_xlaw2adpcm(adpcm_state *s, int fmt, unsigned char *in, - unsigned char *out, int len) -{ - int a = s->a; - int d = s->d; - int nbits = s->nbits; - int olen = 0; - - while (len--) { - int e = 0, - nmax = 1 << (nbits - 1); - int sign, - delta; - - if (fmt) - delta = (isdn_audio_alaw_to_s16[*in++] >> 2) - a; - else - delta = (isdn_audio_ulaw_to_s16[*in++] >> 2) - a; - if (delta < 0) { - e = nmax; - delta = -delta; - } - while (--nmax && delta > d) { - delta -= d; - e++; - } - if (nbits == 4 && ((e & 0x0f) == 0)) - e = 8; - isdn_audio_put_bits(e, nbits, s, &out, &olen); - sign = (e >> (nbits - 1)) ? -1 : 1; - e &= bitmask[nbits - 1]; - - a += sign * ((e << 1) + 1) * d >> 1; - if (d & 1) - a++; - d = (d * Mx[nbits - 2][e] + 0x2000) >> 14; - if (d < 5) - d = 5; - } - s->a = a; - s->d = d; - return olen; -} - -/* - * Goertzel algorithm. - * See http://ptolemy.eecs.berkeley.edu/papers/96/dtmf_ict/ - * for more info. - * Result is stored into an sk_buff and queued up for later - * evaluation. 
- */ -static void -isdn_audio_goertzel(int *sample, modem_info *info) -{ - int sk, - sk1, - sk2; - int k, - n; - struct sk_buff *skb; - int *result; - - skb = dev_alloc_skb(sizeof(int) * NCOEFF); - if (!skb) { - printk(KERN_WARNING - "isdn_audio: Could not alloc DTMF result for ttyI%d\n", - info->line); - return; - } - result = skb_put(skb, sizeof(int) * NCOEFF); - for (k = 0; k < NCOEFF; k++) { - sk = sk1 = sk2 = 0; - for (n = 0; n < DTMF_NPOINTS; n++) { - sk = sample[n] + ((cos2pik[k] * sk1) >> 15) - sk2; - sk2 = sk1; - sk1 = sk; - } - /* Avoid overflows */ - sk >>= 1; - sk2 >>= 1; - /* compute |X(k)|**2 */ - /* report overflows. This should not happen. */ - /* Comment this out if desired */ - if (sk < -32768 || sk > 32767) - printk(KERN_DEBUG - "isdn_audio: dtmf goertzel overflow, sk=%d\n", sk); - if (sk2 < -32768 || sk2 > 32767) - printk(KERN_DEBUG - "isdn_audio: dtmf goertzel overflow, sk2=%d\n", sk2); - result[k] = - ((sk * sk) >> AMP_BITS) - - ((((cos2pik[k] * sk) >> 15) * sk2) >> AMP_BITS) + - ((sk2 * sk2) >> AMP_BITS); - } - skb_queue_tail(&info->dtmf_queue, skb); - isdn_timer_ctrl(ISDN_TIMER_MODEMREAD, 1); -} - -void -isdn_audio_eval_dtmf(modem_info *info) -{ - struct sk_buff *skb; - int *result; - dtmf_state *s; - int silence; - int i; - int di; - int ch; - int grp[2]; - char what; - char *p; - int thresh; - - while ((skb = skb_dequeue(&info->dtmf_queue))) { - result = (int *) skb->data; - s = info->dtmf_state; - grp[LOGRP] = grp[HIGRP] = -1; - silence = 0; - thresh = 0; - for (i = 0; i < NCOEFF; i++) { - if (result[i] > DTMF_TRESH) { - if (result[i] > thresh) - thresh = result[i]; - } - else if (result[i] < SILENCE_TRESH) - silence++; - } - if (silence == NCOEFF) - what = ' '; - else { - if (thresh > 0) { - thresh = thresh >> 4; /* touchtones must match within 12 dB */ - for (i = 0; i < NCOEFF; i++) { - if (result[i] < thresh) - continue; /* ignore */ - /* good level found. 
This is allowed only one time per group */ - if (i < NCOEFF / 2) { - /* lowgroup*/ - if (grp[LOGRP] >= 0) { - // Bad. Another tone found. */ - grp[LOGRP] = -1; - break; - } - else - grp[LOGRP] = i; - } - else { /* higroup */ - if (grp[HIGRP] >= 0) { // Bad. Another tone found. */ - grp[HIGRP] = -1; - break; - } - else - grp[HIGRP] = i - NCOEFF/2; - } - } - if ((grp[LOGRP] >= 0) && (grp[HIGRP] >= 0)) { - what = dtmf_matrix[grp[LOGRP]][grp[HIGRP]]; - if (s->last != ' ' && s->last != '.') - s->last = what; /* min. 1 non-DTMF between DTMF */ - } else - what = '.'; - } - else - what = '.'; - } - if ((what != s->last) && (what != ' ') && (what != '.')) { - printk(KERN_DEBUG "dtmf: tt='%c'\n", what); - p = skb->data; - *p++ = 0x10; - *p = what; - skb_trim(skb, 2); - ISDN_AUDIO_SKB_DLECOUNT(skb) = 0; - ISDN_AUDIO_SKB_LOCK(skb) = 0; - di = info->isdn_driver; - ch = info->isdn_channel; - __skb_queue_tail(&dev->drv[di]->rpqueue[ch], skb); - dev->drv[di]->rcvcount[ch] += 2; - /* Schedule dequeuing */ - if ((dev->modempoll) && (info->rcvsched)) - isdn_timer_ctrl(ISDN_TIMER_MODEMREAD, 1); - wake_up_interruptible(&dev->drv[di]->rcv_waitq[ch]); - } else - kfree_skb(skb); - s->last = what; - } -} - -/* - * Decode DTMF tones, queue result in separate sk_buf for - * later examination. - * Parameters: - * s = pointer to state-struct. - * buf = input audio data - * len = size of audio data. 
- * fmt = audio data format (0 = ulaw, 1 = alaw) - */ -void -isdn_audio_calc_dtmf(modem_info *info, unsigned char *buf, int len, int fmt) -{ - dtmf_state *s = info->dtmf_state; - int i; - int c; - - while (len) { - c = DTMF_NPOINTS - s->idx; - if (c > len) - c = len; - if (c <= 0) - break; - for (i = 0; i < c; i++) { - if (fmt) - s->buf[s->idx++] = - isdn_audio_alaw_to_s16[*buf++] >> (15 - AMP_BITS); - else - s->buf[s->idx++] = - isdn_audio_ulaw_to_s16[*buf++] >> (15 - AMP_BITS); - } - if (s->idx == DTMF_NPOINTS) { - isdn_audio_goertzel(s->buf, info); - s->idx = 0; - } - len -= c; - } -} - -silence_state * -isdn_audio_silence_init(silence_state *s) -{ - if (!s) - s = kmalloc(sizeof(silence_state), GFP_ATOMIC); - if (s) { - s->idx = 0; - s->state = 0; - } - return s; -} - -void -isdn_audio_calc_silence(modem_info *info, unsigned char *buf, int len, int fmt) -{ - silence_state *s = info->silence_state; - int i; - signed char c; - - if (!info->emu.vpar[1]) return; - - for (i = 0; i < len; i++) { - if (fmt) - c = isdn_audio_alaw_to_ulaw[*buf++]; - else - c = *buf++; - - if (c > 0) c -= 128; - c = abs(c); - - if (c > (info->emu.vpar[1] * 4)) { - s->idx = 0; - s->state = 1; - } else { - if (s->idx < 210000) s->idx++; - } - } -} - -void -isdn_audio_put_dle_code(modem_info *info, u_char code) -{ - struct sk_buff *skb; - int di; - int ch; - char *p; - - skb = dev_alloc_skb(2); - if (!skb) { - printk(KERN_WARNING - "isdn_audio: Could not alloc skb for ttyI%d\n", - info->line); - return; - } - p = skb_put(skb, 2); - p[0] = 0x10; - p[1] = code; - ISDN_AUDIO_SKB_DLECOUNT(skb) = 0; - ISDN_AUDIO_SKB_LOCK(skb) = 0; - di = info->isdn_driver; - ch = info->isdn_channel; - __skb_queue_tail(&dev->drv[di]->rpqueue[ch], skb); - dev->drv[di]->rcvcount[ch] += 2; - /* Schedule dequeuing */ - if ((dev->modempoll) && (info->rcvsched)) - isdn_timer_ctrl(ISDN_TIMER_MODEMREAD, 1); - wake_up_interruptible(&dev->drv[di]->rcv_waitq[ch]); -} - -void -isdn_audio_eval_silence(modem_info *info) -{ - 
silence_state *s = info->silence_state; - char what; - - what = ' '; - - if (s->idx > (info->emu.vpar[2] * 800)) { - s->idx = 0; - if (!s->state) { /* silence from beginning of rec */ - what = 's'; - } else { - what = 'q'; - } - } - if ((what == 's') || (what == 'q')) { - printk(KERN_DEBUG "ttyI%d: %s\n", info->line, - (what == 's') ? "silence" : "quiet"); - isdn_audio_put_dle_code(info, what); - } -} diff --git a/drivers/isdn/i4l/isdn_audio.h b/drivers/isdn/i4l/isdn_audio.h deleted file mode 100644 index 013c3582e0d1..000000000000 --- a/drivers/isdn/i4l/isdn_audio.h +++ /dev/null @@ -1,44 +0,0 @@ -/* $Id: isdn_audio.h,v 1.1.2.2 2004/01/12 22:37:18 keil Exp $ - * - * Linux ISDN subsystem, audio conversion and compression (linklevel). - * - * Copyright 1994-1999 by Fritz Elfert (fritz@isdn4linux.de) - * - * This software may be used and distributed according to the terms - * of the GNU General Public License, incorporated herein by reference. - * - */ - -#define DTMF_NPOINTS 205 /* Number of samples for DTMF recognition */ -typedef struct adpcm_state { - int a; - int d; - int word; - int nleft; - int nbits; -} adpcm_state; - -typedef struct dtmf_state { - char last; - char llast; - int idx; - int buf[DTMF_NPOINTS]; -} dtmf_state; - -typedef struct silence_state { - int state; - unsigned int idx; -} silence_state; - -extern void isdn_audio_ulaw2alaw(unsigned char *, unsigned long); -extern void isdn_audio_alaw2ulaw(unsigned char *, unsigned long); -extern adpcm_state *isdn_audio_adpcm_init(adpcm_state *, int); -extern int isdn_audio_adpcm2xlaw(adpcm_state *, int, unsigned char *, unsigned char *, int); -extern int isdn_audio_xlaw2adpcm(adpcm_state *, int, unsigned char *, unsigned char *, int); -extern void isdn_audio_calc_dtmf(modem_info *, unsigned char *, int, int); -extern void isdn_audio_eval_dtmf(modem_info *); -dtmf_state *isdn_audio_dtmf_init(dtmf_state *); -extern void isdn_audio_calc_silence(modem_info *, unsigned char *, int, int); -extern void 
isdn_audio_eval_silence(modem_info *); -silence_state *isdn_audio_silence_init(silence_state *); -extern void isdn_audio_put_dle_code(modem_info *, u_char); diff --git a/drivers/isdn/i4l/isdn_bsdcomp.c b/drivers/isdn/i4l/isdn_bsdcomp.c deleted file mode 100644 index 7f28b967ed19..000000000000 --- a/drivers/isdn/i4l/isdn_bsdcomp.c +++ /dev/null @@ -1,930 +0,0 @@ -/* - * BSD compression module - * - * Patched version for ISDN syncPPP written 1997/1998 by Michael Hipp - * The whole module is now SKB based. - * - */ - -/* - * Update: The Berkeley copyright was changed, and the change - * is retroactive to all "true" BSD software (ie everything - * from UCB as opposed to other peoples code that just carried - * the same license). The new copyright doesn't clash with the - * GPL, so the module-only restriction has been removed.. - */ - -/* - * Original copyright notice: - * - * Copyright (c) 1985, 1986 The Regents of the University of California. - * All rights reserved. - * - * This code is derived from software contributed to Berkeley by - * James A. Woods, derived from original work by Spencer Thomas - * and Joseph Orost. - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions - * are met: - * 1. Redistributions of source code must retain the above copyright - * notice, this list of conditions and the following disclaimer. - * 2. Redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimer in the - * documentation and/or other materials provided with the distribution. - * 3. All advertising materials mentioning features or use of this software - * must display the following acknowledgement: - * This product includes software developed by the University of - * California, Berkeley and its contributors. - * 4. 
Neither the name of the University nor the names of its contributors - * may be used to endorse or promote products derived from this software - * without specific prior written permission. - * - * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND - * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE - * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE - * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE - * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL - * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS - * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) - * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT - * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY - * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF - * SUCH DAMAGE. - */ - -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include /* used in new tty drivers */ -#include /* used in new tty drivers */ -#include - -#include -#include - -#include - -#include -#include -#include -#include -#include -#include - -#include - -#include -#include -#include -#include -#include -#include - -#include "isdn_ppp.h" - -MODULE_DESCRIPTION("ISDN4Linux: BSD Compression for PPP over ISDN"); -MODULE_LICENSE("Dual BSD/GPL"); - -#define BSD_VERSION(x) ((x) >> 5) -#define BSD_NBITS(x) ((x) & 0x1F) - -#define BSD_CURRENT_VERSION 1 - -#define DEBUG 1 - -/* - * A dictionary for doing BSD compress. 
- */ - -struct bsd_dict { - u32 fcode; - u16 codem1; /* output of hash table -1 */ - u16 cptr; /* map code to hash table entry */ -}; - -struct bsd_db { - int totlen; /* length of this structure */ - unsigned int hsize; /* size of the hash table */ - unsigned char hshift; /* used in hash function */ - unsigned char n_bits; /* current bits/code */ - unsigned char maxbits; /* maximum bits/code */ - unsigned char debug; /* non-zero if debug desired */ - unsigned char unit; /* ppp unit number */ - u16 seqno; /* sequence # of next packet */ - unsigned int mru; /* size of receive (decompress) bufr */ - unsigned int maxmaxcode; /* largest valid code */ - unsigned int max_ent; /* largest code in use */ - unsigned int in_count; /* uncompressed bytes, aged */ - unsigned int bytes_out; /* compressed bytes, aged */ - unsigned int ratio; /* recent compression ratio */ - unsigned int checkpoint; /* when to next check the ratio */ - unsigned int clear_count; /* times dictionary cleared */ - unsigned int incomp_count; /* incompressible packets */ - unsigned int incomp_bytes; /* incompressible bytes */ - unsigned int uncomp_count; /* uncompressed packets */ - unsigned int uncomp_bytes; /* uncompressed bytes */ - unsigned int comp_count; /* compressed packets */ - unsigned int comp_bytes; /* compressed bytes */ - unsigned short *lens; /* array of lengths of codes */ - struct bsd_dict *dict; /* dictionary */ - int xmit; -}; - -#define BSD_OVHD 2 /* BSD compress overhead/packet */ -#define MIN_BSD_BITS 9 -#define BSD_INIT_BITS MIN_BSD_BITS -#define MAX_BSD_BITS 15 - -/* - * the next two codes should not be changed lightly, as they must not - * lie within the contiguous general code space. 
- */ -#define CLEAR 256 /* table clear output code */ -#define FIRST 257 /* first free entry */ -#define LAST 255 - -#define MAXCODE(b) ((1 << (b)) - 1) -#define BADCODEM1 MAXCODE(MAX_BSD_BITS) - -#define BSD_HASH(prefix, suffix, hshift) ((((unsigned long)(suffix)) << (hshift)) \ - ^ (unsigned long)(prefix)) -#define BSD_KEY(prefix, suffix) ((((unsigned long)(suffix)) << 16) \ - + (unsigned long)(prefix)) - -#define CHECK_GAP 10000 /* Ratio check interval */ - -#define RATIO_SCALE_LOG 8 -#define RATIO_SCALE (1 << RATIO_SCALE_LOG) -#define RATIO_MAX (0x7fffffff >> RATIO_SCALE_LOG) - -/* - * clear the dictionary - */ - -static void bsd_clear(struct bsd_db *db) -{ - db->clear_count++; - db->max_ent = FIRST - 1; - db->n_bits = BSD_INIT_BITS; - db->bytes_out = 0; - db->in_count = 0; - db->incomp_count = 0; - db->ratio = 0; - db->checkpoint = CHECK_GAP; -} - -/* - * If the dictionary is full, then see if it is time to reset it. - * - * Compute the compression ratio using fixed-point arithmetic - * with 8 fractional bits. - * - * Since we have an infinite stream instead of a single file, - * watch only the local compression ratio. - * - * Since both peers must reset the dictionary at the same time even in - * the absence of CLEAR codes (while packets are incompressible), they - * must compute the same ratio. - */ -static int bsd_check(struct bsd_db *db) /* 1=output CLEAR */ -{ - unsigned int new_ratio; - - if (db->in_count >= db->checkpoint) - { - /* age the ratio by limiting the size of the counts */ - if (db->in_count >= RATIO_MAX || db->bytes_out >= RATIO_MAX) - { - db->in_count -= (db->in_count >> 2); - db->bytes_out -= (db->bytes_out >> 2); - } - - db->checkpoint = db->in_count + CHECK_GAP; - - if (db->max_ent >= db->maxmaxcode) - { - /* Reset the dictionary only if the ratio is worse, - * or if it looks as if it has been poisoned - * by incompressible data. - * - * This does not overflow, because - * db->in_count <= RATIO_MAX. 
- */ - - new_ratio = db->in_count << RATIO_SCALE_LOG; - if (db->bytes_out != 0) - { - new_ratio /= db->bytes_out; - } - - if (new_ratio < db->ratio || new_ratio < 1 * RATIO_SCALE) - { - bsd_clear(db); - return 1; - } - db->ratio = new_ratio; - } - } - return 0; -} - -/* - * Return statistics. - */ - -static void bsd_stats(void *state, struct compstat *stats) -{ - struct bsd_db *db = (struct bsd_db *) state; - - stats->unc_bytes = db->uncomp_bytes; - stats->unc_packets = db->uncomp_count; - stats->comp_bytes = db->comp_bytes; - stats->comp_packets = db->comp_count; - stats->inc_bytes = db->incomp_bytes; - stats->inc_packets = db->incomp_count; - stats->in_count = db->in_count; - stats->bytes_out = db->bytes_out; -} - -/* - * Reset state, as on a CCP ResetReq. - */ -static void bsd_reset(void *state, unsigned char code, unsigned char id, - unsigned char *data, unsigned len, - struct isdn_ppp_resetparams *rsparm) -{ - struct bsd_db *db = (struct bsd_db *) state; - - bsd_clear(db); - db->seqno = 0; - db->clear_count = 0; -} - -/* - * Release the compression structure - */ -static void bsd_free(void *state) -{ - struct bsd_db *db = (struct bsd_db *) state; - - if (db) { - /* - * Release the dictionary - */ - vfree(db->dict); - db->dict = NULL; - - /* - * Release the string buffer - */ - vfree(db->lens); - db->lens = NULL; - - /* - * Finally release the structure itself. - */ - kfree(db); - } -} - - -/* - * Allocate space for a (de) compressor. 
- */ -static void *bsd_alloc(struct isdn_ppp_comp_data *data) -{ - int bits; - unsigned int hsize, hshift, maxmaxcode; - struct bsd_db *db; - int decomp; - - static unsigned int htab[][2] = { - { 5003 , 4 } , { 5003 , 4 } , { 5003 , 4 } , { 5003 , 4 } , - { 9001 , 5 } , { 18013 , 6 } , { 35023 , 7 } , { 69001 , 8 } - }; - - if (data->optlen != 1 || data->num != CI_BSD_COMPRESS - || BSD_VERSION(data->options[0]) != BSD_CURRENT_VERSION) - return NULL; - - bits = BSD_NBITS(data->options[0]); - - if (bits < 9 || bits > 15) - return NULL; - - hsize = htab[bits - 9][0]; - hshift = htab[bits - 9][1]; - - /* - * Allocate the main control structure for this instance. - */ - maxmaxcode = MAXCODE(bits); - db = kzalloc(sizeof(struct bsd_db), GFP_KERNEL); - if (!db) - return NULL; - - db->xmit = data->flags & IPPP_COMP_FLAG_XMIT; - decomp = db->xmit ? 0 : 1; - - /* - * Allocate space for the dictionary. This may be more than one page in - * length. - */ - db->dict = vmalloc(array_size(hsize, sizeof(struct bsd_dict))); - if (!db->dict) { - bsd_free(db); - return NULL; - } - - /* - * If this is the compression buffer then there is no length data. - * For decompression, the length information is needed as well. - */ - if (!decomp) - db->lens = NULL; - else { - db->lens = vmalloc(array_size(sizeof(db->lens[0]), - maxmaxcode + 1)); - if (!db->lens) { - bsd_free(db); - return (NULL); - } - } - - /* - * Initialize the data information for the compression code - */ - db->totlen = sizeof(struct bsd_db) + (sizeof(struct bsd_dict) * hsize); - db->hsize = hsize; - db->hshift = hshift; - db->maxmaxcode = maxmaxcode; - db->maxbits = bits; - - return (void *)db; -} - -/* - * Initialize the database. 
- */ -static int bsd_init(void *state, struct isdn_ppp_comp_data *data, int unit, int debug) -{ - struct bsd_db *db = state; - int indx; - int decomp; - - if (!state || !data) { - printk(KERN_ERR "isdn_bsd_init: [%d] ERR, state %lx data %lx\n", unit, (long)state, (long)data); - return 0; - } - - decomp = db->xmit ? 0 : 1; - - if (data->optlen != 1 || data->num != CI_BSD_COMPRESS - || (BSD_VERSION(data->options[0]) != BSD_CURRENT_VERSION) - || (BSD_NBITS(data->options[0]) != db->maxbits) - || (decomp && db->lens == NULL)) { - printk(KERN_ERR "isdn_bsd: %d %d %d %d %lx\n", data->optlen, data->num, data->options[0], decomp, (unsigned long)db->lens); - return 0; - } - - if (decomp) - for (indx = LAST; indx >= 0; indx--) - db->lens[indx] = 1; - - indx = db->hsize; - while (indx-- != 0) { - db->dict[indx].codem1 = BADCODEM1; - db->dict[indx].cptr = 0; - } - - db->unit = unit; - db->mru = 0; - - db->debug = 1; - - bsd_reset(db, 0, 0, NULL, 0, NULL); - - return 1; -} - -/* - * Obtain pointers to the various structures in the compression tables - */ - -#define dict_ptrx(p, idx) &(p->dict[idx]) -#define lens_ptrx(p, idx) &(p->lens[idx]) - -#ifdef DEBUG -static unsigned short *lens_ptr(struct bsd_db *db, int idx) -{ - if ((unsigned int) idx > (unsigned int) db->maxmaxcode) { - printk(KERN_DEBUG "<9>ppp: lens_ptr(%d) > max\n", idx); - idx = 0; - } - return lens_ptrx(db, idx); -} - -static struct bsd_dict *dict_ptr(struct bsd_db *db, int idx) -{ - if ((unsigned int) idx >= (unsigned int) db->hsize) { - printk(KERN_DEBUG "<9>ppp: dict_ptr(%d) > max\n", idx); - idx = 0; - } - return dict_ptrx(db, idx); -} - -#else -#define lens_ptr(db, idx) lens_ptrx(db, idx) -#define dict_ptr(db, idx) dict_ptrx(db, idx) -#endif - -/* - * compress a packet - */ -static int bsd_compress(void *state, struct sk_buff *skb_in, struct sk_buff *skb_out, int proto) -{ - struct bsd_db *db; - int hshift; - unsigned int max_ent; - unsigned int n_bits; - unsigned int bitno; - unsigned long accm; - int ent; - 
unsigned long fcode; - struct bsd_dict *dictp; - unsigned char c; - int hval, disp, ilen, mxcode; - unsigned char *rptr = skb_in->data; - int isize = skb_in->len; - -#define OUTPUT(ent) \ - { \ - bitno -= n_bits; \ - accm |= ((ent) << bitno); \ - do { \ - if (skb_out && skb_tailroom(skb_out) > 0) \ - skb_put_u8(skb_out, (u8)(accm >> 24)); \ - accm <<= 8; \ - bitno += 8; \ - } while (bitno <= 24); \ - } - - /* - * If the protocol is not in the range we're interested in, - * just return without compressing the packet. If it is, - * the protocol becomes the first byte to compress. - */ - printk(KERN_DEBUG "bsd_compress called with %x\n", proto); - - ent = proto; - if (proto < 0x21 || proto > 0xf9 || !(proto & 0x1)) - return 0; - - db = (struct bsd_db *) state; - hshift = db->hshift; - max_ent = db->max_ent; - n_bits = db->n_bits; - bitno = 32; - accm = 0; - mxcode = MAXCODE(n_bits); - - /* This is the PPP header information */ - if (skb_out && skb_tailroom(skb_out) >= 2) { - char *v = skb_put(skb_out, 2); - /* we only push our own data on the header, - AC,PC and protos is pushed by caller */ - v[0] = db->seqno >> 8; - v[1] = db->seqno; - } - - ilen = ++isize; /* This is off by one, but that is what is in draft! */ - - while (--ilen > 0) { - c = *rptr++; - fcode = BSD_KEY(ent, c); - hval = BSD_HASH(ent, c, hshift); - dictp = dict_ptr(db, hval); - - /* Validate and then check the entry. */ - if (dictp->codem1 >= max_ent) - goto nomatch; - - if (dictp->fcode == fcode) { - ent = dictp->codem1 + 1; - continue; /* found (prefix,suffix) */ - } - - /* continue probing until a match or invalid entry */ - disp = (hval == 0) ? 
1 : hval; - - do { - hval += disp; - if (hval >= db->hsize) - hval -= db->hsize; - dictp = dict_ptr(db, hval); - if (dictp->codem1 >= max_ent) - goto nomatch; - } while (dictp->fcode != fcode); - - ent = dictp->codem1 + 1; /* finally found (prefix,suffix) */ - continue; - - nomatch: - OUTPUT(ent); /* output the prefix */ - - /* code -> hashtable */ - if (max_ent < db->maxmaxcode) { - struct bsd_dict *dictp2; - struct bsd_dict *dictp3; - int indx; - - /* expand code size if needed */ - if (max_ent >= mxcode) { - db->n_bits = ++n_bits; - mxcode = MAXCODE(n_bits); - } - - /* - * Invalidate old hash table entry using - * this code, and then take it over. - */ - dictp2 = dict_ptr(db, max_ent + 1); - indx = dictp2->cptr; - dictp3 = dict_ptr(db, indx); - - if (dictp3->codem1 == max_ent) - dictp3->codem1 = BADCODEM1; - - dictp2->cptr = hval; - dictp->codem1 = max_ent; - dictp->fcode = fcode; - db->max_ent = ++max_ent; - - if (db->lens) { - unsigned short *len1 = lens_ptr(db, max_ent); - unsigned short *len2 = lens_ptr(db, ent); - *len1 = *len2 + 1; - } - } - ent = c; - } - - OUTPUT(ent); /* output the last code */ - - if (skb_out) - db->bytes_out += skb_out->len; /* Do not count bytes from here */ - db->uncomp_bytes += isize; - db->in_count += isize; - ++db->uncomp_count; - ++db->seqno; - - if (bitno < 32) - ++db->bytes_out; /* must be set before calling bsd_check */ - - /* - * Generate the clear command if needed - */ - - if (bsd_check(db)) - OUTPUT(CLEAR); - - /* - * Pad dribble bits of last code with ones. - * Do not emit a completely useless byte of ones. - */ - if (bitno < 32 && skb_out && skb_tailroom(skb_out) > 0) - skb_put_u8(skb_out, - (unsigned char)((accm | (0xff << (bitno - 8))) >> 24)); - - /* - * Increase code size if we would have without the packet - * boundary because the decompressor will do so. - */ - if (max_ent >= mxcode && max_ent < db->maxmaxcode) - db->n_bits++; - - /* If output length is too large then this is an incompressible frame. 
*/ - if (!skb_out || skb_out->len >= skb_in->len) { - ++db->incomp_count; - db->incomp_bytes += isize; - return 0; - } - - /* Count the number of compressed frames */ - ++db->comp_count; - db->comp_bytes += skb_out->len; - return skb_out->len; - -#undef OUTPUT -} - -/* - * Update the "BSD Compress" dictionary on the receiver for - * incompressible data by pretending to compress the incoming data. - */ -static void bsd_incomp(void *state, struct sk_buff *skb_in, int proto) -{ - bsd_compress(state, skb_in, NULL, proto); -} - -/* - * Decompress "BSD Compress". - */ -static int bsd_decompress(void *state, struct sk_buff *skb_in, struct sk_buff *skb_out, - struct isdn_ppp_resetparams *rsparm) -{ - struct bsd_db *db; - unsigned int max_ent; - unsigned long accm; - unsigned int bitno; /* 1st valid bit in accm */ - unsigned int n_bits; - unsigned int tgtbitno; /* bitno when we have a code */ - struct bsd_dict *dictp; - int seq; - unsigned int incode; - unsigned int oldcode; - unsigned int finchar; - unsigned char *p, *ibuf; - int ilen; - int codelen; - int extra; - - db = (struct bsd_db *) state; - max_ent = db->max_ent; - accm = 0; - bitno = 32; /* 1st valid bit in accm */ - n_bits = db->n_bits; - tgtbitno = 32 - n_bits; /* bitno when we have a code */ - - printk(KERN_DEBUG "bsd_decompress called\n"); - - if (!skb_in || !skb_out) { - printk(KERN_ERR "bsd_decompress called with NULL parameter\n"); - return DECOMP_ERROR; - } - - /* - * Get the sequence number. - */ - if ((p = skb_pull(skb_in, 2)) == NULL) { - return DECOMP_ERROR; - } - p -= 2; - seq = (p[0] << 8) + p[1]; - ilen = skb_in->len; - ibuf = skb_in->data; - - /* - * Check the sequence number and give up if it differs from - * the value we're expecting. 
- */ - if (seq != db->seqno) { - if (db->debug) { - printk(KERN_DEBUG "bsd_decomp%d: bad sequence # %d, expected %d\n", - db->unit, seq, db->seqno - 1); - } - return DECOMP_ERROR; - } - - ++db->seqno; - db->bytes_out += ilen; - - if (skb_tailroom(skb_out) > 0) - skb_put_u8(skb_out, 0); - else - return DECOMP_ERR_NOMEM; - - oldcode = CLEAR; - - /* - * Keep the checkpoint correctly so that incompressible packets - * clear the dictionary at the proper times. - */ - - for (;;) { - if (ilen-- <= 0) { - db->in_count += (skb_out->len - 1); /* don't count the header */ - break; - } - - /* - * Accumulate bytes until we have a complete code. - * Then get the next code, relying on the 32-bit, - * unsigned accm to mask the result. - */ - - bitno -= 8; - accm |= *ibuf++ << bitno; - if (tgtbitno < bitno) - continue; - - incode = accm >> tgtbitno; - accm <<= n_bits; - bitno += n_bits; - - /* - * The dictionary must only be cleared at the end of a packet. - */ - - if (incode == CLEAR) { - if (ilen > 0) { - if (db->debug) - printk(KERN_DEBUG "bsd_decomp%d: bad CLEAR\n", db->unit); - return DECOMP_FATALERROR; /* probably a bug */ - } - bsd_clear(db); - break; - } - - if ((incode > max_ent + 2) || (incode > db->maxmaxcode) - || (incode > max_ent && oldcode == CLEAR)) { - if (db->debug) { - printk(KERN_DEBUG "bsd_decomp%d: bad code 0x%x oldcode=0x%x ", - db->unit, incode, oldcode); - printk(KERN_DEBUG "max_ent=0x%x skb->Len=%d seqno=%d\n", - max_ent, skb_out->len, db->seqno); - } - return DECOMP_FATALERROR; /* probably a bug */ - } - - /* Special case for KwKwK string. 
*/ - if (incode > max_ent) { - finchar = oldcode; - extra = 1; - } else { - finchar = incode; - extra = 0; - } - - codelen = *(lens_ptr(db, finchar)); - if (skb_tailroom(skb_out) < codelen + extra) { - if (db->debug) { - printk(KERN_DEBUG "bsd_decomp%d: ran out of mru\n", db->unit); -#ifdef DEBUG - printk(KERN_DEBUG " len=%d, finchar=0x%x, codelen=%d,skblen=%d\n", - ilen, finchar, codelen, skb_out->len); -#endif - } - return DECOMP_FATALERROR; - } - - /* - * Decode this code and install it in the decompressed buffer. - */ - - p = skb_put(skb_out, codelen); - p += codelen; - while (finchar > LAST) { - struct bsd_dict *dictp2 = dict_ptr(db, finchar); - - dictp = dict_ptr(db, dictp2->cptr); - -#ifdef DEBUG - if (--codelen <= 0 || dictp->codem1 != finchar - 1) { - if (codelen <= 0) { - printk(KERN_ERR "bsd_decomp%d: fell off end of chain ", db->unit); - printk(KERN_ERR "0x%x at 0x%x by 0x%x, max_ent=0x%x\n", incode, finchar, dictp2->cptr, max_ent); - } else { - if (dictp->codem1 != finchar - 1) { - printk(KERN_ERR "bsd_decomp%d: bad code chain 0x%x finchar=0x%x ", db->unit, incode, finchar); - printk(KERN_ERR "oldcode=0x%x cptr=0x%x codem1=0x%x\n", oldcode, dictp2->cptr, dictp->codem1); - } - } - return DECOMP_FATALERROR; - } -#endif - - { - u32 fcode = dictp->fcode; - *--p = (fcode >> 16) & 0xff; - finchar = fcode & 0xffff; - } - } - *--p = finchar; - -#ifdef DEBUG - if (--codelen != 0) - printk(KERN_ERR "bsd_decomp%d: short by %d after code 0x%x, max_ent=0x%x\n", db->unit, codelen, incode, max_ent); -#endif - - if (extra) /* the KwKwK case again */ - skb_put_u8(skb_out, finchar); - - /* - * If not first code in a packet, and - * if not out of code space, then allocate a new code. - * - * Keep the hash table correct so it can be used - * with uncompressed packets. 
- */ - if (oldcode != CLEAR && max_ent < db->maxmaxcode) { - struct bsd_dict *dictp2, *dictp3; - u16 *lens1, *lens2; - unsigned long fcode; - int hval, disp, indx; - - fcode = BSD_KEY(oldcode, finchar); - hval = BSD_HASH(oldcode, finchar, db->hshift); - dictp = dict_ptr(db, hval); - - /* look for a free hash table entry */ - if (dictp->codem1 < max_ent) { - disp = (hval == 0) ? 1 : hval; - do { - hval += disp; - if (hval >= db->hsize) - hval -= db->hsize; - dictp = dict_ptr(db, hval); - } while (dictp->codem1 < max_ent); - } - - /* - * Invalidate previous hash table entry - * assigned this code, and then take it over - */ - - dictp2 = dict_ptr(db, max_ent + 1); - indx = dictp2->cptr; - dictp3 = dict_ptr(db, indx); - - if (dictp3->codem1 == max_ent) - dictp3->codem1 = BADCODEM1; - - dictp2->cptr = hval; - dictp->codem1 = max_ent; - dictp->fcode = fcode; - db->max_ent = ++max_ent; - - /* Update the length of this string. */ - lens1 = lens_ptr(db, max_ent); - lens2 = lens_ptr(db, oldcode); - *lens1 = *lens2 + 1; - - /* Expand code size if needed. 
*/ - if (max_ent >= MAXCODE(n_bits) && max_ent < db->maxmaxcode) { - db->n_bits = ++n_bits; - tgtbitno = 32-n_bits; - } - } - oldcode = incode; - } - - ++db->comp_count; - ++db->uncomp_count; - db->comp_bytes += skb_in->len - BSD_OVHD; - db->uncomp_bytes += skb_out->len; - - if (bsd_check(db)) { - if (db->debug) - printk(KERN_DEBUG "bsd_decomp%d: peer should have cleared dictionary on %d\n", - db->unit, db->seqno - 1); - } - return skb_out->len; -} - -/************************************************************* - * Table of addresses for the BSD compression module - *************************************************************/ - -static struct isdn_ppp_compressor ippp_bsd_compress = { - .owner = THIS_MODULE, - .num = CI_BSD_COMPRESS, - .alloc = bsd_alloc, - .free = bsd_free, - .init = bsd_init, - .reset = bsd_reset, - .compress = bsd_compress, - .decompress = bsd_decompress, - .incomp = bsd_incomp, - .stat = bsd_stats, -}; - -/************************************************************* - * Module support routines - *************************************************************/ - -static int __init isdn_bsdcomp_init(void) -{ - int answer = isdn_ppp_register_compressor(&ippp_bsd_compress); - if (answer == 0) - printk(KERN_INFO "PPP BSD Compression module registered\n"); - return answer; -} - -static void __exit isdn_bsdcomp_exit(void) -{ - isdn_ppp_unregister_compressor(&ippp_bsd_compress); -} - -module_init(isdn_bsdcomp_init); -module_exit(isdn_bsdcomp_exit); diff --git a/drivers/isdn/i4l/isdn_common.c b/drivers/isdn/i4l/isdn_common.c deleted file mode 100644 index 74ee00f5b310..000000000000 --- a/drivers/isdn/i4l/isdn_common.c +++ /dev/null @@ -1,2368 +0,0 @@ -/* $Id: isdn_common.c,v 1.1.2.3 2004/02/10 01:07:13 keil Exp $ - * - * Linux ISDN subsystem, common used functions (linklevel). 
- * - * Copyright 1994-1999 by Fritz Elfert (fritz@isdn4linux.de) - * Copyright 1995,96 Thinking Objects Software GmbH Wuerzburg - * Copyright 1995,96 by Michael Hipp (Michael.Hipp@student.uni-tuebingen.de) - * - * This software may be used and distributed according to the terms - * of the GNU General Public License, incorporated herein by reference. - * - */ - -#include -#include -#include -#include -#include -#include -#include -#include "isdn_common.h" -#include "isdn_tty.h" -#include "isdn_net.h" -#include "isdn_ppp.h" -#ifdef CONFIG_ISDN_AUDIO -#include "isdn_audio.h" -#endif -#ifdef CONFIG_ISDN_DIVERSION_MODULE -#define CONFIG_ISDN_DIVERSION -#endif -#ifdef CONFIG_ISDN_DIVERSION -#include -#endif /* CONFIG_ISDN_DIVERSION */ -#include "isdn_v110.h" - -/* Debugflags */ -#undef ISDN_DEBUG_STATCALLB - -MODULE_DESCRIPTION("ISDN4Linux: link layer"); -MODULE_AUTHOR("Fritz Elfert"); -MODULE_LICENSE("GPL"); - -isdn_dev *dev; - -static DEFINE_MUTEX(isdn_mutex); -static char *isdn_revision = "$Revision: 1.1.2.3 $"; - -extern char *isdn_net_revision; -#ifdef CONFIG_ISDN_PPP -extern char *isdn_ppp_revision; -#else -static char *isdn_ppp_revision = ": none $"; -#endif -#ifdef CONFIG_ISDN_AUDIO -extern char *isdn_audio_revision; -#else -static char *isdn_audio_revision = ": none $"; -#endif -extern char *isdn_v110_revision; - -#ifdef CONFIG_ISDN_DIVERSION -static isdn_divert_if *divert_if; /* = NULL */ -#endif /* CONFIG_ISDN_DIVERSION */ - - -static int isdn_writebuf_stub(int, int, const u_char __user *, int); -static void set_global_features(void); -static int isdn_wildmat(char *s, char *p); -static int isdn_add_channels(isdn_driver_t *d, int drvidx, int n, int adding); - -static inline void -isdn_lock_driver(isdn_driver_t *drv) -{ - try_module_get(drv->interface->owner); - drv->locks++; -} - -void -isdn_lock_drivers(void) -{ - int i; - - for (i = 0; i < ISDN_MAX_DRIVERS; i++) { - if (!dev->drv[i]) - continue; - isdn_lock_driver(dev->drv[i]); - } -} - -static inline void 
-isdn_unlock_driver(isdn_driver_t *drv) -{ - if (drv->locks > 0) { - drv->locks--; - module_put(drv->interface->owner); - } -} - -void -isdn_unlock_drivers(void) -{ - int i; - - for (i = 0; i < ISDN_MAX_DRIVERS; i++) { - if (!dev->drv[i]) - continue; - isdn_unlock_driver(dev->drv[i]); - } -} - -#if defined(ISDN_DEBUG_NET_DUMP) || defined(ISDN_DEBUG_MODEM_DUMP) -void -isdn_dumppkt(char *s, u_char *p, int len, int dumplen) -{ - int dumpc; - - printk(KERN_DEBUG "%s(%d) ", s, len); - for (dumpc = 0; (dumpc < dumplen) && (len); len--, dumpc++) - printk(" %02x", *p++); - printk("\n"); -} -#endif - -/* - * I picked the pattern-matching-functions from an old GNU-tar version (1.10) - * It was originally written and put to PD by rs@mirror.TMC.COM (Rich Salz) - */ -static int -isdn_star(char *s, char *p) -{ - while (isdn_wildmat(s, p)) { - if (*++s == '\0') - return (2); - } - return (0); -} - -/* - * Shell-type Pattern-matching for incoming caller-Ids - * This function gets a string in s and checks, if it matches the pattern - * given in p. - * - * Return: - * 0 = match. - * 1 = no match. - * 2 = no match. Would eventually match, if s would be longer. - * - * Possible Patterns: - * - * '?' matches one character - * '*' matches zero or more characters - * [xyz] matches the set of characters in brackets. - * [^xyz] matches any single character not in the set of characters - */ - -static int -isdn_wildmat(char *s, char *p) -{ - register int last; - register int matched; - register int reverse; - register int nostar = 1; - - if (!(*s) && !(*p)) - return (1); - for (; *p; s++, p++) - switch (*p) { - case '\\': - /* Literal match with following character. */ - p++; - /* fall through */ - default: - if (*s != *p) - return (*s == '\0') ? 2 : 1; - continue; - case '?': - /* Match anything. */ - if (*s == '\0') - return (2); - continue; - case '*': - nostar = 0; - /* Trailing star matches everything. */ - return (*++p ? isdn_star(s, p) : 0); - case '[': - /* [^....] 
means inverse character class. */ - if ((reverse = (p[1] == '^'))) - p++; - for (last = 0, matched = 0; *++p && (*p != ']'); last = *p) - /* This next line requires a good C compiler. */ - if (*p == '-' ? *s <= *++p && *s >= last : *s == *p) - matched = 1; - if (matched == reverse) - return (1); - continue; - } - return (*s == '\0') ? 0 : nostar; -} - -int isdn_msncmp(const char *msn1, const char *msn2) -{ - char TmpMsn1[ISDN_MSNLEN]; - char TmpMsn2[ISDN_MSNLEN]; - char *p; - - for (p = TmpMsn1; *msn1 && *msn1 != ':';) // Strip off a SPID - *p++ = *msn1++; - *p = '\0'; - - for (p = TmpMsn2; *msn2 && *msn2 != ':';) // Strip off a SPID - *p++ = *msn2++; - *p = '\0'; - - return isdn_wildmat(TmpMsn1, TmpMsn2); -} - -int -isdn_dc2minor(int di, int ch) -{ - int i; - for (i = 0; i < ISDN_MAX_CHANNELS; i++) - if (dev->chanmap[i] == ch && dev->drvmap[i] == di) - return i; - return -1; -} - -static int isdn_timer_cnt1 = 0; -static int isdn_timer_cnt2 = 0; -static int isdn_timer_cnt3 = 0; - -static void -isdn_timer_funct(struct timer_list *unused) -{ - int tf = dev->tflags; - if (tf & ISDN_TIMER_FAST) { - if (tf & ISDN_TIMER_MODEMREAD) - isdn_tty_readmodem(); - if (tf & ISDN_TIMER_MODEMPLUS) - isdn_tty_modem_escape(); - if (tf & ISDN_TIMER_MODEMXMIT) - isdn_tty_modem_xmit(); - } - if (tf & ISDN_TIMER_SLOW) { - if (++isdn_timer_cnt1 >= ISDN_TIMER_02SEC) { - isdn_timer_cnt1 = 0; - if (tf & ISDN_TIMER_NETDIAL) - isdn_net_dial(); - } - if (++isdn_timer_cnt2 >= ISDN_TIMER_1SEC) { - isdn_timer_cnt2 = 0; - if (tf & ISDN_TIMER_NETHANGUP) - isdn_net_autohup(); - if (++isdn_timer_cnt3 >= ISDN_TIMER_RINGING) { - isdn_timer_cnt3 = 0; - if (tf & ISDN_TIMER_MODEMRING) - isdn_tty_modem_ring(); - } - if (tf & ISDN_TIMER_CARRIER) - isdn_tty_carrier_timeout(); - } - } - if (tf) - mod_timer(&dev->timer, jiffies + ISDN_TIMER_RES); -} - -void -isdn_timer_ctrl(int tf, int onoff) -{ - unsigned long flags; - int old_tflags; - - spin_lock_irqsave(&dev->timerlock, flags); - if ((tf & ISDN_TIMER_SLOW) 
&& (!(dev->tflags & ISDN_TIMER_SLOW))) { - /* If the slow-timer wasn't activated until now */ - isdn_timer_cnt1 = 0; - isdn_timer_cnt2 = 0; - } - old_tflags = dev->tflags; - if (onoff) - dev->tflags |= tf; - else - dev->tflags &= ~tf; - if (dev->tflags && !old_tflags) - mod_timer(&dev->timer, jiffies + ISDN_TIMER_RES); - spin_unlock_irqrestore(&dev->timerlock, flags); -} - -/* - * Receive a packet from B-Channel. (Called from low-level-module) - */ -static void -isdn_receive_skb_callback(int di, int channel, struct sk_buff *skb) -{ - int i; - - if ((i = isdn_dc2minor(di, channel)) == -1) { - dev_kfree_skb(skb); - return; - } - /* Update statistics */ - dev->ibytes[i] += skb->len; - - /* First, try to deliver data to network-device */ - if (isdn_net_rcv_skb(i, skb)) - return; - - /* V.110 handling - * makes sense for async streams only, so it is - * called after possible net-device delivery. - */ - if (dev->v110[i]) { - atomic_inc(&dev->v110use[i]); - skb = isdn_v110_decode(dev->v110[i], skb); - atomic_dec(&dev->v110use[i]); - if (!skb) - return; - } - - /* No network-device found, deliver to tty or raw-channel */ - if (skb->len) { - if (isdn_tty_rcv_skb(i, di, channel, skb)) - return; - wake_up_interruptible(&dev->drv[di]->rcv_waitq[channel]); - } else - dev_kfree_skb(skb); -} - -/* - * Intercept command from Linklevel to Lowlevel. - * If layer 2 protocol is V.110 and this is not supported by current - * lowlevel-driver, use driver's transparent mode and handle V.110 in - * linklevel instead. 
- */ -int -isdn_command(isdn_ctrl *cmd) -{ - if (cmd->driver == -1) { - printk(KERN_WARNING "isdn_command command(%x) driver -1\n", cmd->command); - return (1); - } - if (!dev->drv[cmd->driver]) { - printk(KERN_WARNING "isdn_command command(%x) dev->drv[%d] NULL\n", - cmd->command, cmd->driver); - return (1); - } - if (!dev->drv[cmd->driver]->interface) { - printk(KERN_WARNING "isdn_command command(%x) dev->drv[%d]->interface NULL\n", - cmd->command, cmd->driver); - return (1); - } - if (cmd->command == ISDN_CMD_SETL2) { - int idx = isdn_dc2minor(cmd->driver, cmd->arg & 255); - unsigned long l2prot = (cmd->arg >> 8) & 255; - unsigned long features = (dev->drv[cmd->driver]->interface->features - >> ISDN_FEATURE_L2_SHIFT) & - ISDN_FEATURE_L2_MASK; - unsigned long l2_feature = (1 << l2prot); - - switch (l2prot) { - case ISDN_PROTO_L2_V11096: - case ISDN_PROTO_L2_V11019: - case ISDN_PROTO_L2_V11038: - /* If V.110 requested, but not supported by - * HL-driver, set emulator-flag and change - * Layer-2 to transparent - */ - if (!(features & l2_feature)) { - dev->v110emu[idx] = l2prot; - cmd->arg = (cmd->arg & 255) | - (ISDN_PROTO_L2_TRANS << 8); - } else - dev->v110emu[idx] = 0; - } - } - return dev->drv[cmd->driver]->interface->command(cmd); -} - -void -isdn_all_eaz(int di, int ch) -{ - isdn_ctrl cmd; - - if (di < 0) - return; - cmd.driver = di; - cmd.arg = ch; - cmd.command = ISDN_CMD_SETEAZ; - cmd.parm.num[0] = '\0'; - isdn_command(&cmd); -} - -/* - * Begin of a CAPI like LL<->HL interface, currently used only for - * supplementary service (CAPI 2.0 part III) - */ -#include - -static int -isdn_capi_rec_hl_msg(capi_msg *cm) -{ - switch (cm->Command) { - case CAPI_FACILITY: - /* in the moment only handled in tty */ - return (isdn_tty_capi_facility(cm)); - default: - return (-1); - } -} - -static int -isdn_status_callback(isdn_ctrl *c) -{ - int di; - u_long flags; - int i; - int r; - int retval = 0; - isdn_ctrl cmd; - isdn_net_dev *p; - - di = c->driver; - i = 
isdn_dc2minor(di, c->arg); - switch (c->command) { - case ISDN_STAT_BSENT: - if (i < 0) - return -1; - if (dev->global_flags & ISDN_GLOBAL_STOPPED) - return 0; - if (isdn_net_stat_callback(i, c)) - return 0; - if (isdn_v110_stat_callback(i, c)) - return 0; - if (isdn_tty_stat_callback(i, c)) - return 0; - wake_up_interruptible(&dev->drv[di]->snd_waitq[c->arg]); - break; - case ISDN_STAT_STAVAIL: - dev->drv[di]->stavail += c->arg; - wake_up_interruptible(&dev->drv[di]->st_waitq); - break; - case ISDN_STAT_RUN: - dev->drv[di]->flags |= DRV_FLAG_RUNNING; - for (i = 0; i < ISDN_MAX_CHANNELS; i++) - if (dev->drvmap[i] == di) - isdn_all_eaz(di, dev->chanmap[i]); - set_global_features(); - break; - case ISDN_STAT_STOP: - dev->drv[di]->flags &= ~DRV_FLAG_RUNNING; - break; - case ISDN_STAT_ICALL: - if (i < 0) - return -1; -#ifdef ISDN_DEBUG_STATCALLB - printk(KERN_DEBUG "ICALL (net): %d %ld %s\n", di, c->arg, c->parm.num); -#endif - if (dev->global_flags & ISDN_GLOBAL_STOPPED) { - cmd.driver = di; - cmd.arg = c->arg; - cmd.command = ISDN_CMD_HANGUP; - isdn_command(&cmd); - return 0; - } - /* Try to find a network-interface which will accept incoming call */ - r = ((c->command == ISDN_STAT_ICALLW) ? 0 : isdn_net_find_icall(di, c->arg, i, &c->parm.setup)); - switch (r) { - case 0: - /* No network-device replies. - * Try ttyI's. - * These return 0 on no match, 1 on match and - * 3 on eventually match, if CID is longer. 
- */ - if (c->command == ISDN_STAT_ICALL) - if ((retval = isdn_tty_find_icall(di, c->arg, &c->parm.setup))) return (retval); -#ifdef CONFIG_ISDN_DIVERSION - if (divert_if) - if ((retval = divert_if->stat_callback(c))) - return (retval); /* processed */ -#endif /* CONFIG_ISDN_DIVERSION */ - if ((!retval) && (dev->drv[di]->flags & DRV_FLAG_REJBUS)) { - /* No tty responding */ - cmd.driver = di; - cmd.arg = c->arg; - cmd.command = ISDN_CMD_HANGUP; - isdn_command(&cmd); - retval = 2; - } - break; - case 1: - /* Schedule connection-setup */ - isdn_net_dial(); - cmd.driver = di; - cmd.arg = c->arg; - cmd.command = ISDN_CMD_ACCEPTD; - for (p = dev->netdev; p; p = p->next) - if (p->local->isdn_channel == cmd.arg) - { - strcpy(cmd.parm.setup.eazmsn, p->local->msn); - isdn_command(&cmd); - retval = 1; - break; - } - break; - - case 2: /* For calling back, first reject incoming call ... */ - case 3: /* Interface found, but down, reject call actively */ - retval = 2; - printk(KERN_INFO "isdn: Rejecting Call\n"); - cmd.driver = di; - cmd.arg = c->arg; - cmd.command = ISDN_CMD_HANGUP; - isdn_command(&cmd); - if (r == 3) - break; - /* Fall through */ - case 4: - /* ... then start callback. 
*/ - isdn_net_dial(); - break; - case 5: - /* Number would eventually match, if longer */ - retval = 3; - break; - } -#ifdef ISDN_DEBUG_STATCALLB - printk(KERN_DEBUG "ICALL: ret=%d\n", retval); -#endif - return retval; - break; - case ISDN_STAT_CINF: - if (i < 0) - return -1; -#ifdef ISDN_DEBUG_STATCALLB - printk(KERN_DEBUG "CINF: %ld %s\n", c->arg, c->parm.num); -#endif - if (dev->global_flags & ISDN_GLOBAL_STOPPED) - return 0; - if (strcmp(c->parm.num, "0")) - isdn_net_stat_callback(i, c); - isdn_tty_stat_callback(i, c); - break; - case ISDN_STAT_CAUSE: -#ifdef ISDN_DEBUG_STATCALLB - printk(KERN_DEBUG "CAUSE: %ld %s\n", c->arg, c->parm.num); -#endif - printk(KERN_INFO "isdn: %s,ch%ld cause: %s\n", - dev->drvid[di], c->arg, c->parm.num); - isdn_tty_stat_callback(i, c); -#ifdef CONFIG_ISDN_DIVERSION - if (divert_if) - divert_if->stat_callback(c); -#endif /* CONFIG_ISDN_DIVERSION */ - break; - case ISDN_STAT_DISPLAY: -#ifdef ISDN_DEBUG_STATCALLB - printk(KERN_DEBUG "DISPLAY: %ld %s\n", c->arg, c->parm.display); -#endif - isdn_tty_stat_callback(i, c); -#ifdef CONFIG_ISDN_DIVERSION - if (divert_if) - divert_if->stat_callback(c); -#endif /* CONFIG_ISDN_DIVERSION */ - break; - case ISDN_STAT_DCONN: - if (i < 0) - return -1; -#ifdef ISDN_DEBUG_STATCALLB - printk(KERN_DEBUG "DCONN: %ld\n", c->arg); -#endif - if (dev->global_flags & ISDN_GLOBAL_STOPPED) - return 0; - /* Find any net-device, waiting for D-channel setup */ - if (isdn_net_stat_callback(i, c)) - break; - isdn_v110_stat_callback(i, c); - /* Find any ttyI, waiting for D-channel setup */ - if (isdn_tty_stat_callback(i, c)) { - cmd.driver = di; - cmd.arg = c->arg; - cmd.command = ISDN_CMD_ACCEPTB; - isdn_command(&cmd); - break; - } - break; - case ISDN_STAT_DHUP: - if (i < 0) - return -1; -#ifdef ISDN_DEBUG_STATCALLB - printk(KERN_DEBUG "DHUP: %ld\n", c->arg); -#endif - if (dev->global_flags & ISDN_GLOBAL_STOPPED) - return 0; - dev->drv[di]->online &= ~(1 << (c->arg)); - isdn_info_update(); - /* Signal hangup to 
network-devices */ - if (isdn_net_stat_callback(i, c)) - break; - isdn_v110_stat_callback(i, c); - if (isdn_tty_stat_callback(i, c)) - break; -#ifdef CONFIG_ISDN_DIVERSION - if (divert_if) - divert_if->stat_callback(c); -#endif /* CONFIG_ISDN_DIVERSION */ - break; - break; - case ISDN_STAT_BCONN: - if (i < 0) - return -1; -#ifdef ISDN_DEBUG_STATCALLB - printk(KERN_DEBUG "BCONN: %ld\n", c->arg); -#endif - /* Signal B-channel-connect to network-devices */ - if (dev->global_flags & ISDN_GLOBAL_STOPPED) - return 0; - dev->drv[di]->online |= (1 << (c->arg)); - isdn_info_update(); - if (isdn_net_stat_callback(i, c)) - break; - isdn_v110_stat_callback(i, c); - if (isdn_tty_stat_callback(i, c)) - break; - break; - case ISDN_STAT_BHUP: - if (i < 0) - return -1; -#ifdef ISDN_DEBUG_STATCALLB - printk(KERN_DEBUG "BHUP: %ld\n", c->arg); -#endif - if (dev->global_flags & ISDN_GLOBAL_STOPPED) - return 0; - dev->drv[di]->online &= ~(1 << (c->arg)); - isdn_info_update(); -#ifdef CONFIG_ISDN_X25 - /* Signal hangup to network-devices */ - if (isdn_net_stat_callback(i, c)) - break; -#endif - isdn_v110_stat_callback(i, c); - if (isdn_tty_stat_callback(i, c)) - break; - break; - case ISDN_STAT_NODCH: - if (i < 0) - return -1; -#ifdef ISDN_DEBUG_STATCALLB - printk(KERN_DEBUG "NODCH: %ld\n", c->arg); -#endif - if (dev->global_flags & ISDN_GLOBAL_STOPPED) - return 0; - if (isdn_net_stat_callback(i, c)) - break; - if (isdn_tty_stat_callback(i, c)) - break; - break; - case ISDN_STAT_ADDCH: - spin_lock_irqsave(&dev->lock, flags); - if (isdn_add_channels(dev->drv[di], di, c->arg, 1)) { - spin_unlock_irqrestore(&dev->lock, flags); - return -1; - } - spin_unlock_irqrestore(&dev->lock, flags); - isdn_info_update(); - break; - case ISDN_STAT_DISCH: - spin_lock_irqsave(&dev->lock, flags); - for (i = 0; i < ISDN_MAX_CHANNELS; i++) - if ((dev->drvmap[i] == di) && - (dev->chanmap[i] == c->arg)) { - if (c->parm.num[0]) - dev->usage[i] &= ~ISDN_USAGE_DISABLED; - else - if (USG_NONE(dev->usage[i])) { - 
dev->usage[i] |= ISDN_USAGE_DISABLED; - } - else - retval = -1; - break; - } - spin_unlock_irqrestore(&dev->lock, flags); - isdn_info_update(); - break; - case ISDN_STAT_UNLOAD: - while (dev->drv[di]->locks > 0) { - isdn_unlock_driver(dev->drv[di]); - } - spin_lock_irqsave(&dev->lock, flags); - isdn_tty_stat_callback(i, c); - for (i = 0; i < ISDN_MAX_CHANNELS; i++) - if (dev->drvmap[i] == di) { - dev->drvmap[i] = -1; - dev->chanmap[i] = -1; - dev->usage[i] &= ~ISDN_USAGE_DISABLED; - } - dev->drivers--; - dev->channels -= dev->drv[di]->channels; - kfree(dev->drv[di]->rcverr); - kfree(dev->drv[di]->rcvcount); - for (i = 0; i < dev->drv[di]->channels; i++) - skb_queue_purge(&dev->drv[di]->rpqueue[i]); - kfree(dev->drv[di]->rpqueue); - kfree(dev->drv[di]->rcv_waitq); - kfree(dev->drv[di]); - dev->drv[di] = NULL; - dev->drvid[di][0] = '\0'; - isdn_info_update(); - set_global_features(); - spin_unlock_irqrestore(&dev->lock, flags); - return 0; - case ISDN_STAT_L1ERR: - break; - case CAPI_PUT_MESSAGE: - return (isdn_capi_rec_hl_msg(&c->parm.cmsg)); -#ifdef CONFIG_ISDN_TTY_FAX - case ISDN_STAT_FAXIND: - isdn_tty_stat_callback(i, c); - break; -#endif -#ifdef CONFIG_ISDN_AUDIO - case ISDN_STAT_AUDIO: - isdn_tty_stat_callback(i, c); - break; -#endif -#ifdef CONFIG_ISDN_DIVERSION - case ISDN_STAT_PROT: - case ISDN_STAT_REDIR: - if (divert_if) - return (divert_if->stat_callback(c)); -#endif /* CONFIG_ISDN_DIVERSION */ - /* fall through */ - default: - return -1; - } - return 0; -} - -/* - * Get integer from char-pointer, set pointer to end of number - */ -int -isdn_getnum(char **p) -{ - int v = -1; - - while (*p[0] >= '0' && *p[0] <= '9') - v = ((v < 0) ? 0 : (v * 10)) + (int) ((*p[0]++) - '0'); - return v; -} - -#define DLE 0x10 - -/* - * isdn_readbchan() tries to get data from the read-queue. - * It MUST be called with interrupts off. - * - * Be aware that this is not an atomic operation when sleep != 0, even though - * interrupts are turned off! 
Well, like that we are currently only called - * on behalf of a read system call on raw device files (which are documented - * to be dangerous and for debugging purpose only). The inode semaphore - * takes care that this is not called for the same minor device number while - * we are sleeping, but access is not serialized against simultaneous read() - * from the corresponding ttyI device. Can other ugly events, like changes - * of the mapping (di,ch)<->minor, happen during the sleep? --he - */ -int -isdn_readbchan(int di, int channel, u_char *buf, u_char *fp, int len, wait_queue_head_t *sleep) -{ - int count; - int count_pull; - int count_put; - int dflag; - struct sk_buff *skb; - u_char *cp; - - if (!dev->drv[di]) - return 0; - if (skb_queue_empty(&dev->drv[di]->rpqueue[channel])) { - if (sleep) - wait_event_interruptible(*sleep, - !skb_queue_empty(&dev->drv[di]->rpqueue[channel])); - else - return 0; - } - if (len > dev->drv[di]->rcvcount[channel]) - len = dev->drv[di]->rcvcount[channel]; - cp = buf; - count = 0; - while (len) { - if (!(skb = skb_peek(&dev->drv[di]->rpqueue[channel]))) - break; -#ifdef CONFIG_ISDN_AUDIO - if (ISDN_AUDIO_SKB_LOCK(skb)) - break; - ISDN_AUDIO_SKB_LOCK(skb) = 1; - if ((ISDN_AUDIO_SKB_DLECOUNT(skb)) || (dev->drv[di]->DLEflag & (1 << channel))) { - char *p = skb->data; - unsigned long DLEmask = (1 << channel); - - dflag = 0; - count_pull = count_put = 0; - while ((count_pull < skb->len) && (len > 0)) { - len--; - if (dev->drv[di]->DLEflag & DLEmask) { - *cp++ = DLE; - dev->drv[di]->DLEflag &= ~DLEmask; - } else { - *cp++ = *p; - if (*p == DLE) { - dev->drv[di]->DLEflag |= DLEmask; - (ISDN_AUDIO_SKB_DLECOUNT(skb))--; - } - p++; - count_pull++; - } - count_put++; - } - if (count_pull >= skb->len) - dflag = 1; - } else { -#endif - /* No DLE's in buff, so simply copy it */ - dflag = 1; - if ((count_pull = skb->len) > len) { - count_pull = len; - dflag = 0; - } - count_put = count_pull; - skb_copy_from_linear_data(skb, cp, count_put); - cp 
+= count_put; - len -= count_put; -#ifdef CONFIG_ISDN_AUDIO - } -#endif - count += count_put; - if (fp) { - memset(fp, 0, count_put); - fp += count_put; - } - if (dflag) { - /* We got all the data in this buff. - * Now we can dequeue it. - */ - if (fp) - *(fp - 1) = 0xff; -#ifdef CONFIG_ISDN_AUDIO - ISDN_AUDIO_SKB_LOCK(skb) = 0; -#endif - skb = skb_dequeue(&dev->drv[di]->rpqueue[channel]); - dev_kfree_skb(skb); - } else { - /* Not yet emptied this buff, so it - * must stay in the queue, for further calls - * but we pull off the data we got until now. - */ - skb_pull(skb, count_pull); -#ifdef CONFIG_ISDN_AUDIO - ISDN_AUDIO_SKB_LOCK(skb) = 0; -#endif - } - dev->drv[di]->rcvcount[channel] -= count_put; - } - return count; -} - -/* - * isdn_readbchan_tty() tries to get data from the read-queue. - * It MUST be called with interrupts off. - * - * Be aware that this is not an atomic operation when sleep != 0, even though - * interrupts are turned off! Well, like that we are currently only called - * on behalf of a read system call on raw device files (which are documented - * to be dangerous and for debugging purpose only). The inode semaphore - * takes care that this is not called for the same minor device number while - * we are sleeping, but access is not serialized against simultaneous read() - * from the corresponding ttyI device. Can other ugly events, like changes - * of the mapping (di,ch)<->minor, happen during the sleep? 
--he - */ -int -isdn_readbchan_tty(int di, int channel, struct tty_port *port, int cisco_hack) -{ - int count; - int count_pull; - int count_put; - int dflag; - struct sk_buff *skb; - char last = 0; - int len; - - if (!dev->drv[di]) - return 0; - if (skb_queue_empty(&dev->drv[di]->rpqueue[channel])) - return 0; - - len = tty_buffer_request_room(port, dev->drv[di]->rcvcount[channel]); - if (len == 0) - return len; - - count = 0; - while (len) { - if (!(skb = skb_peek(&dev->drv[di]->rpqueue[channel]))) - break; -#ifdef CONFIG_ISDN_AUDIO - if (ISDN_AUDIO_SKB_LOCK(skb)) - break; - ISDN_AUDIO_SKB_LOCK(skb) = 1; - if ((ISDN_AUDIO_SKB_DLECOUNT(skb)) || (dev->drv[di]->DLEflag & (1 << channel))) { - char *p = skb->data; - unsigned long DLEmask = (1 << channel); - - dflag = 0; - count_pull = count_put = 0; - while ((count_pull < skb->len) && (len > 0)) { - /* push every character but the last to the tty buffer directly */ - if (count_put) - tty_insert_flip_char(port, last, TTY_NORMAL); - len--; - if (dev->drv[di]->DLEflag & DLEmask) { - last = DLE; - dev->drv[di]->DLEflag &= ~DLEmask; - } else { - last = *p; - if (last == DLE) { - dev->drv[di]->DLEflag |= DLEmask; - (ISDN_AUDIO_SKB_DLECOUNT(skb))--; - } - p++; - count_pull++; - } - count_put++; - } - if (count_pull >= skb->len) - dflag = 1; - } else { -#endif - /* No DLE's in buff, so simply copy it */ - dflag = 1; - if ((count_pull = skb->len) > len) { - count_pull = len; - dflag = 0; - } - count_put = count_pull; - if (count_put > 1) - tty_insert_flip_string(port, skb->data, count_put - 1); - last = skb->data[count_put - 1]; - len -= count_put; -#ifdef CONFIG_ISDN_AUDIO - } -#endif - count += count_put; - if (dflag) { - /* We got all the data in this buff. - * Now we can dequeue it. 
- */ - if (cisco_hack) - tty_insert_flip_char(port, last, 0xFF); - else - tty_insert_flip_char(port, last, TTY_NORMAL); -#ifdef CONFIG_ISDN_AUDIO - ISDN_AUDIO_SKB_LOCK(skb) = 0; -#endif - skb = skb_dequeue(&dev->drv[di]->rpqueue[channel]); - dev_kfree_skb(skb); - } else { - tty_insert_flip_char(port, last, TTY_NORMAL); - /* Not yet emptied this buff, so it - * must stay in the queue, for further calls - * but we pull off the data we got until now. - */ - skb_pull(skb, count_pull); -#ifdef CONFIG_ISDN_AUDIO - ISDN_AUDIO_SKB_LOCK(skb) = 0; -#endif - } - dev->drv[di]->rcvcount[channel] -= count_put; - } - return count; -} - - -static inline int -isdn_minor2drv(int minor) -{ - return (dev->drvmap[minor]); -} - -static inline int -isdn_minor2chan(int minor) -{ - return (dev->chanmap[minor]); -} - -static char * -isdn_statstr(void) -{ - static char istatbuf[2048]; - char *p; - int i; - - sprintf(istatbuf, "idmap:\t"); - p = istatbuf + strlen(istatbuf); - for (i = 0; i < ISDN_MAX_CHANNELS; i++) { - sprintf(p, "%s ", (dev->drvmap[i] < 0) ? "-" : dev->drvid[dev->drvmap[i]]); - p = istatbuf + strlen(istatbuf); - } - sprintf(p, "\nchmap:\t"); - p = istatbuf + strlen(istatbuf); - for (i = 0; i < ISDN_MAX_CHANNELS; i++) { - sprintf(p, "%d ", dev->chanmap[i]); - p = istatbuf + strlen(istatbuf); - } - sprintf(p, "\ndrmap:\t"); - p = istatbuf + strlen(istatbuf); - for (i = 0; i < ISDN_MAX_CHANNELS; i++) { - sprintf(p, "%d ", dev->drvmap[i]); - p = istatbuf + strlen(istatbuf); - } - sprintf(p, "\nusage:\t"); - p = istatbuf + strlen(istatbuf); - for (i = 0; i < ISDN_MAX_CHANNELS; i++) { - sprintf(p, "%d ", dev->usage[i]); - p = istatbuf + strlen(istatbuf); - } - sprintf(p, "\nflags:\t"); - p = istatbuf + strlen(istatbuf); - for (i = 0; i < ISDN_MAX_DRIVERS; i++) { - if (dev->drv[i]) { - sprintf(p, "%ld ", dev->drv[i]->online); - p = istatbuf + strlen(istatbuf); - } else { - sprintf(p, "? 
"); - p = istatbuf + strlen(istatbuf); - } - } - sprintf(p, "\nphone:\t"); - p = istatbuf + strlen(istatbuf); - for (i = 0; i < ISDN_MAX_CHANNELS; i++) { - sprintf(p, "%s ", dev->num[i]); - p = istatbuf + strlen(istatbuf); - } - sprintf(p, "\n"); - return istatbuf; -} - -/* Module interface-code */ - -void -isdn_info_update(void) -{ - infostruct *p = dev->infochain; - - while (p) { - *(p->private) = 1; - p = (infostruct *) p->next; - } - wake_up_interruptible(&(dev->info_waitq)); -} - -static ssize_t -isdn_read(struct file *file, char __user *buf, size_t count, loff_t *off) -{ - uint minor = iminor(file_inode(file)); - int len = 0; - int drvidx; - int chidx; - int retval; - char *p; - - mutex_lock(&isdn_mutex); - if (minor == ISDN_MINOR_STATUS) { - if (!file->private_data) { - if (file->f_flags & O_NONBLOCK) { - retval = -EAGAIN; - goto out; - } - wait_event_interruptible(dev->info_waitq, - file->private_data); - } - p = isdn_statstr(); - file->private_data = NULL; - if ((len = strlen(p)) <= count) { - if (copy_to_user(buf, p, len)) { - retval = -EFAULT; - goto out; - } - *off += len; - retval = len; - goto out; - } - retval = 0; - goto out; - } - if (!dev->drivers) { - retval = -ENODEV; - goto out; - } - if (minor <= ISDN_MINOR_BMAX) { - printk(KERN_WARNING "isdn_read minor %d obsolete!\n", minor); - drvidx = isdn_minor2drv(minor); - if (drvidx < 0) { - retval = -ENODEV; - goto out; - } - if (!(dev->drv[drvidx]->flags & DRV_FLAG_RUNNING)) { - retval = -ENODEV; - goto out; - } - chidx = isdn_minor2chan(minor); - if (!(p = kmalloc(count, GFP_KERNEL))) { - retval = -ENOMEM; - goto out; - } - len = isdn_readbchan(drvidx, chidx, p, NULL, count, - &dev->drv[drvidx]->rcv_waitq[chidx]); - *off += len; - if (copy_to_user(buf, p, len)) - len = -EFAULT; - kfree(p); - retval = len; - goto out; - } - if (minor <= ISDN_MINOR_CTRLMAX) { - drvidx = isdn_minor2drv(minor - ISDN_MINOR_CTRL); - if (drvidx < 0) { - retval = -ENODEV; - goto out; - } - if (!dev->drv[drvidx]->stavail) { 
- if (file->f_flags & O_NONBLOCK) { - retval = -EAGAIN; - goto out; - } - wait_event_interruptible(dev->drv[drvidx]->st_waitq, - dev->drv[drvidx]->stavail); - } - if (dev->drv[drvidx]->interface->readstat) { - if (count > dev->drv[drvidx]->stavail) - count = dev->drv[drvidx]->stavail; - len = dev->drv[drvidx]->interface->readstat(buf, count, - drvidx, isdn_minor2chan(minor - ISDN_MINOR_CTRL)); - if (len < 0) { - retval = len; - goto out; - } - } else { - len = 0; - } - if (len) - dev->drv[drvidx]->stavail -= len; - else - dev->drv[drvidx]->stavail = 0; - *off += len; - retval = len; - goto out; - } -#ifdef CONFIG_ISDN_PPP - if (minor <= ISDN_MINOR_PPPMAX) { - retval = isdn_ppp_read(minor - ISDN_MINOR_PPP, file, buf, count); - goto out; - } -#endif - retval = -ENODEV; -out: - mutex_unlock(&isdn_mutex); - return retval; -} - -static ssize_t -isdn_write(struct file *file, const char __user *buf, size_t count, loff_t *off) -{ - uint minor = iminor(file_inode(file)); - int drvidx; - int chidx; - int retval; - - if (minor == ISDN_MINOR_STATUS) - return -EPERM; - if (!dev->drivers) - return -ENODEV; - - mutex_lock(&isdn_mutex); - if (minor <= ISDN_MINOR_BMAX) { - printk(KERN_WARNING "isdn_write minor %d obsolete!\n", minor); - drvidx = isdn_minor2drv(minor); - if (drvidx < 0) { - retval = -ENODEV; - goto out; - } - if (!(dev->drv[drvidx]->flags & DRV_FLAG_RUNNING)) { - retval = -ENODEV; - goto out; - } - chidx = isdn_minor2chan(minor); - wait_event_interruptible(dev->drv[drvidx]->snd_waitq[chidx], - (retval = isdn_writebuf_stub(drvidx, chidx, buf, count))); - goto out; - } - if (minor <= ISDN_MINOR_CTRLMAX) { - drvidx = isdn_minor2drv(minor - ISDN_MINOR_CTRL); - if (drvidx < 0) { - retval = -ENODEV; - goto out; - } - /* - * We want to use the isdnctrl device to load the firmware - * - if (!(dev->drv[drvidx]->flags & DRV_FLAG_RUNNING)) - return -ENODEV; - */ - if (dev->drv[drvidx]->interface->writecmd) - retval = dev->drv[drvidx]->interface-> - writecmd(buf, count, drvidx, 
- isdn_minor2chan(minor - ISDN_MINOR_CTRL)); - else - retval = count; - goto out; - } -#ifdef CONFIG_ISDN_PPP - if (minor <= ISDN_MINOR_PPPMAX) { - retval = isdn_ppp_write(minor - ISDN_MINOR_PPP, file, buf, count); - goto out; - } -#endif - retval = -ENODEV; -out: - mutex_unlock(&isdn_mutex); - return retval; -} - -static __poll_t -isdn_poll(struct file *file, poll_table *wait) -{ - __poll_t mask = 0; - unsigned int minor = iminor(file_inode(file)); - int drvidx = isdn_minor2drv(minor - ISDN_MINOR_CTRL); - - mutex_lock(&isdn_mutex); - if (minor == ISDN_MINOR_STATUS) { - poll_wait(file, &(dev->info_waitq), wait); - /* mask = EPOLLOUT | EPOLLWRNORM; */ - if (file->private_data) { - mask |= EPOLLIN | EPOLLRDNORM; - } - goto out; - } - if (minor >= ISDN_MINOR_CTRL && minor <= ISDN_MINOR_CTRLMAX) { - if (drvidx < 0) { - /* driver deregistered while file open */ - mask = EPOLLHUP; - goto out; - } - poll_wait(file, &(dev->drv[drvidx]->st_waitq), wait); - mask = EPOLLOUT | EPOLLWRNORM; - if (dev->drv[drvidx]->stavail) { - mask |= EPOLLIN | EPOLLRDNORM; - } - goto out; - } -#ifdef CONFIG_ISDN_PPP - if (minor <= ISDN_MINOR_PPPMAX) { - mask = isdn_ppp_poll(file, wait); - goto out; - } -#endif - mask = EPOLLERR; -out: - mutex_unlock(&isdn_mutex); - return mask; -} - - -static int -isdn_ioctl(struct file *file, uint cmd, ulong arg) -{ - uint minor = iminor(file_inode(file)); - isdn_ctrl c; - int drvidx; - int ret; - int i; - char __user *p; - char *s; - union iocpar { - char name[10]; - char bname[22]; - isdn_ioctl_struct iocts; - isdn_net_ioctl_phone phone; - isdn_net_ioctl_cfg cfg; - } iocpar; - void __user *argp = (void __user *)arg; - -#define name iocpar.name -#define bname iocpar.bname -#define iocts iocpar.iocts -#define phone iocpar.phone -#define cfg iocpar.cfg - - if (minor == ISDN_MINOR_STATUS) { - switch (cmd) { - case IIOCGETDVR: - return (TTY_DV + - (NET_DV << 8) + - (INF_DV << 16)); - case IIOCGETCPS: - if (arg) { - ulong __user *p = argp; - int i; - for (i = 0; 
i < ISDN_MAX_CHANNELS; i++) { - put_user(dev->ibytes[i], p++); - put_user(dev->obytes[i], p++); - } - return 0; - } else - return -EINVAL; - break; - case IIOCNETGPN: - /* Get peer phone number of a connected - * isdn network interface */ - if (arg) { - if (copy_from_user(&phone, argp, sizeof(phone))) - return -EFAULT; - return isdn_net_getpeer(&phone, argp); - } else - return -EINVAL; - default: - return -EINVAL; - } - } - if (!dev->drivers) - return -ENODEV; - if (minor <= ISDN_MINOR_BMAX) { - drvidx = isdn_minor2drv(minor); - if (drvidx < 0) - return -ENODEV; - if (!(dev->drv[drvidx]->flags & DRV_FLAG_RUNNING)) - return -ENODEV; - return 0; - } - if (minor <= ISDN_MINOR_CTRLMAX) { -/* - * isdn net devices manage lots of configuration variables as linked lists. - * Those lists must only be manipulated from user space. Some of the ioctl's - * service routines access user space and are not atomic. Therefore, ioctl's - * manipulating the lists and ioctl's sleeping while accessing the lists - * are serialized by means of a semaphore. 
- */ - switch (cmd) { - case IIOCNETDWRSET: - printk(KERN_INFO "INFO: ISDN_DW_ABC_EXTENSION not enabled\n"); - return (-EINVAL); - case IIOCNETLCR: - printk(KERN_INFO "INFO: ISDN_ABC_LCR_SUPPORT not enabled\n"); - return -ENODEV; - case IIOCNETAIF: - /* Add a network-interface */ - if (arg) { - if (copy_from_user(name, argp, sizeof(name))) - return -EFAULT; - s = name; - } else { - s = NULL; - } - ret = mutex_lock_interruptible(&dev->mtx); - if (ret) return ret; - if ((s = isdn_net_new(s, NULL))) { - if (copy_to_user(argp, s, strlen(s) + 1)) { - ret = -EFAULT; - } else { - ret = 0; - } - } else - ret = -ENODEV; - mutex_unlock(&dev->mtx); - return ret; - case IIOCNETASL: - /* Add a slave to a network-interface */ - if (arg) { - if (copy_from_user(bname, argp, sizeof(bname) - 1)) - return -EFAULT; - bname[sizeof(bname)-1] = 0; - } else - return -EINVAL; - ret = mutex_lock_interruptible(&dev->mtx); - if (ret) return ret; - if ((s = isdn_net_newslave(bname))) { - if (copy_to_user(argp, s, strlen(s) + 1)) { - ret = -EFAULT; - } else { - ret = 0; - } - } else - ret = -ENODEV; - mutex_unlock(&dev->mtx); - return ret; - case IIOCNETDIF: - /* Delete a network-interface */ - if (arg) { - if (copy_from_user(name, argp, sizeof(name))) - return -EFAULT; - ret = mutex_lock_interruptible(&dev->mtx); - if (ret) return ret; - ret = isdn_net_rm(name); - mutex_unlock(&dev->mtx); - return ret; - } else - return -EINVAL; - case IIOCNETSCF: - /* Set configurable parameters of a network-interface */ - if (arg) { - if (copy_from_user(&cfg, argp, sizeof(cfg))) - return -EFAULT; - return isdn_net_setcfg(&cfg); - } else - return -EINVAL; - case IIOCNETGCF: - /* Get configurable parameters of a network-interface */ - if (arg) { - if (copy_from_user(&cfg, argp, sizeof(cfg))) - return -EFAULT; - if (!(ret = isdn_net_getcfg(&cfg))) { - if (copy_to_user(argp, &cfg, sizeof(cfg))) - return -EFAULT; - } - return ret; - } else - return -EINVAL; - case IIOCNETANM: - /* Add a phone-number to a 
network-interface */ - if (arg) { - if (copy_from_user(&phone, argp, sizeof(phone))) - return -EFAULT; - ret = mutex_lock_interruptible(&dev->mtx); - if (ret) return ret; - ret = isdn_net_addphone(&phone); - mutex_unlock(&dev->mtx); - return ret; - } else - return -EINVAL; - case IIOCNETGNM: - /* Get list of phone-numbers of a network-interface */ - if (arg) { - if (copy_from_user(&phone, argp, sizeof(phone))) - return -EFAULT; - ret = mutex_lock_interruptible(&dev->mtx); - if (ret) return ret; - ret = isdn_net_getphones(&phone, argp); - mutex_unlock(&dev->mtx); - return ret; - } else - return -EINVAL; - case IIOCNETDNM: - /* Delete a phone-number of a network-interface */ - if (arg) { - if (copy_from_user(&phone, argp, sizeof(phone))) - return -EFAULT; - ret = mutex_lock_interruptible(&dev->mtx); - if (ret) return ret; - ret = isdn_net_delphone(&phone); - mutex_unlock(&dev->mtx); - return ret; - } else - return -EINVAL; - case IIOCNETDIL: - /* Force dialing of a network-interface */ - if (arg) { - if (copy_from_user(name, argp, sizeof(name))) - return -EFAULT; - return isdn_net_force_dial(name); - } else - return -EINVAL; -#ifdef CONFIG_ISDN_PPP - case IIOCNETALN: - if (!arg) - return -EINVAL; - if (copy_from_user(name, argp, sizeof(name))) - return -EFAULT; - return isdn_ppp_dial_slave(name); - case IIOCNETDLN: - if (!arg) - return -EINVAL; - if (copy_from_user(name, argp, sizeof(name))) - return -EFAULT; - return isdn_ppp_hangup_slave(name); -#endif - case IIOCNETHUP: - /* Force hangup of a network-interface */ - if (!arg) - return -EINVAL; - if (copy_from_user(name, argp, sizeof(name))) - return -EFAULT; - return isdn_net_force_hangup(name); - break; - case IIOCSETVER: - dev->net_verbose = arg; - printk(KERN_INFO "isdn: Verbose-Level is %d\n", dev->net_verbose); - return 0; - case IIOCSETGST: - if (arg) - dev->global_flags |= ISDN_GLOBAL_STOPPED; - else - dev->global_flags &= ~ISDN_GLOBAL_STOPPED; - printk(KERN_INFO "isdn: Global Mode %s\n", - 
(dev->global_flags & ISDN_GLOBAL_STOPPED) ? "stopped" : "running"); - return 0; - case IIOCSETBRJ: - drvidx = -1; - if (arg) { - int i; - char *p; - if (copy_from_user(&iocts, argp, - sizeof(isdn_ioctl_struct))) - return -EFAULT; - iocts.drvid[sizeof(iocts.drvid) - 1] = 0; - if (strlen(iocts.drvid)) { - if ((p = strchr(iocts.drvid, ','))) - *p = 0; - drvidx = -1; - for (i = 0; i < ISDN_MAX_DRIVERS; i++) - if (!(strcmp(dev->drvid[i], iocts.drvid))) { - drvidx = i; - break; - } - } - } - if (drvidx == -1) - return -ENODEV; - if (iocts.arg) - dev->drv[drvidx]->flags |= DRV_FLAG_REJBUS; - else - dev->drv[drvidx]->flags &= ~DRV_FLAG_REJBUS; - return 0; - case IIOCSIGPRF: - dev->profd = current; - return 0; - break; - case IIOCGETPRF: - /* Get all Modem-Profiles */ - if (arg) { - char __user *p = argp; - int i; - - for (i = 0; i < ISDN_MAX_CHANNELS; i++) { - if (copy_to_user(p, dev->mdm.info[i].emu.profile, - ISDN_MODEM_NUMREG)) - return -EFAULT; - p += ISDN_MODEM_NUMREG; - if (copy_to_user(p, dev->mdm.info[i].emu.pmsn, ISDN_MSNLEN)) - return -EFAULT; - p += ISDN_MSNLEN; - if (copy_to_user(p, dev->mdm.info[i].emu.plmsn, ISDN_LMSNLEN)) - return -EFAULT; - p += ISDN_LMSNLEN; - } - return (ISDN_MODEM_NUMREG + ISDN_MSNLEN + ISDN_LMSNLEN) * ISDN_MAX_CHANNELS; - } else - return -EINVAL; - break; - case IIOCSETPRF: - /* Set all Modem-Profiles */ - if (arg) { - char __user *p = argp; - int i; - - for (i = 0; i < ISDN_MAX_CHANNELS; i++) { - if (copy_from_user(dev->mdm.info[i].emu.profile, p, - ISDN_MODEM_NUMREG)) - return -EFAULT; - p += ISDN_MODEM_NUMREG; - if (copy_from_user(dev->mdm.info[i].emu.plmsn, p, ISDN_LMSNLEN)) - return -EFAULT; - p += ISDN_LMSNLEN; - if (copy_from_user(dev->mdm.info[i].emu.pmsn, p, ISDN_MSNLEN)) - return -EFAULT; - p += ISDN_MSNLEN; - } - return 0; - } else - return -EINVAL; - break; - case IIOCSETMAP: - case IIOCGETMAP: - /* Set/Get MSN->EAZ-Mapping for a driver */ - if (arg) { - - if (copy_from_user(&iocts, argp, - sizeof(isdn_ioctl_struct))) - 
return -EFAULT; - iocts.drvid[sizeof(iocts.drvid) - 1] = 0; - if (strlen(iocts.drvid)) { - drvidx = -1; - for (i = 0; i < ISDN_MAX_DRIVERS; i++) - if (!(strcmp(dev->drvid[i], iocts.drvid))) { - drvidx = i; - break; - } - } else - drvidx = 0; - if (drvidx == -1) - return -ENODEV; - if (cmd == IIOCSETMAP) { - int loop = 1; - - p = (char __user *) iocts.arg; - i = 0; - while (loop) { - int j = 0; - - while (1) { - get_user(bname[j], p++); - switch (bname[j]) { - case '\0': - loop = 0; - /* Fall through */ - case ',': - bname[j] = '\0'; - strcpy(dev->drv[drvidx]->msn2eaz[i], bname); - j = ISDN_MSNLEN; - break; - default: - j++; - } - if (j >= ISDN_MSNLEN) - break; - } - if (++i > 9) - break; - } - } else { - p = (char __user *) iocts.arg; - for (i = 0; i < 10; i++) { - snprintf(bname, sizeof(bname), "%s%s", - strlen(dev->drv[drvidx]->msn2eaz[i]) ? - dev->drv[drvidx]->msn2eaz[i] : "_", - (i < 9) ? "," : "\0"); - if (copy_to_user(p, bname, strlen(bname) + 1)) - return -EFAULT; - p += strlen(bname); - } - } - return 0; - } else - return -EINVAL; - case IIOCDBGVAR: - return -EINVAL; - default: - if ((cmd & IIOCDRVCTL) == IIOCDRVCTL) - cmd = ((cmd >> _IOC_NRSHIFT) & _IOC_NRMASK) & ISDN_DRVIOCTL_MASK; - else - return -EINVAL; - if (arg) { - int i; - char *p; - if (copy_from_user(&iocts, argp, sizeof(isdn_ioctl_struct))) - return -EFAULT; - iocts.drvid[sizeof(iocts.drvid) - 1] = 0; - if (strlen(iocts.drvid)) { - if ((p = strchr(iocts.drvid, ','))) - *p = 0; - drvidx = -1; - for (i = 0; i < ISDN_MAX_DRIVERS; i++) - if (!(strcmp(dev->drvid[i], iocts.drvid))) { - drvidx = i; - break; - } - } else - drvidx = 0; - if (drvidx == -1) - return -ENODEV; - c.driver = drvidx; - c.command = ISDN_CMD_IOCTL; - c.arg = cmd; - memcpy(c.parm.num, &iocts.arg, sizeof(ulong)); - ret = isdn_command(&c); - memcpy(&iocts.arg, c.parm.num, sizeof(ulong)); - if (copy_to_user(argp, &iocts, sizeof(isdn_ioctl_struct))) - return -EFAULT; - return ret; - } else - return -EINVAL; - } - } -#ifdef 
CONFIG_ISDN_PPP - if (minor <= ISDN_MINOR_PPPMAX) - return (isdn_ppp_ioctl(minor - ISDN_MINOR_PPP, file, cmd, arg)); -#endif - return -ENODEV; - -#undef name -#undef bname -#undef iocts -#undef phone -#undef cfg -} - -static long -isdn_unlocked_ioctl(struct file *file, unsigned int cmd, unsigned long arg) -{ - int ret; - - mutex_lock(&isdn_mutex); - ret = isdn_ioctl(file, cmd, arg); - mutex_unlock(&isdn_mutex); - - return ret; -} - -/* - * Open the device code. - */ -static int -isdn_open(struct inode *ino, struct file *filep) -{ - uint minor = iminor(ino); - int drvidx; - int chidx; - int retval = -ENODEV; - - mutex_lock(&isdn_mutex); - if (minor == ISDN_MINOR_STATUS) { - infostruct *p; - - if ((p = kmalloc(sizeof(infostruct), GFP_KERNEL))) { - p->next = (char *) dev->infochain; - p->private = (char *) &(filep->private_data); - dev->infochain = p; - /* At opening we allow a single update */ - filep->private_data = (char *) 1; - retval = 0; - goto out; - } else { - retval = -ENOMEM; - goto out; - } - } - if (!dev->channels) - goto out; - if (minor <= ISDN_MINOR_BMAX) { - printk(KERN_WARNING "isdn_open minor %d obsolete!\n", minor); - drvidx = isdn_minor2drv(minor); - if (drvidx < 0) - goto out; - chidx = isdn_minor2chan(minor); - if (!(dev->drv[drvidx]->flags & DRV_FLAG_RUNNING)) - goto out; - if (!(dev->drv[drvidx]->online & (1 << chidx))) - goto out; - isdn_lock_drivers(); - retval = 0; - goto out; - } - if (minor <= ISDN_MINOR_CTRLMAX) { - drvidx = isdn_minor2drv(minor - ISDN_MINOR_CTRL); - if (drvidx < 0) - goto out; - isdn_lock_drivers(); - retval = 0; - goto out; - } -#ifdef CONFIG_ISDN_PPP - if (minor <= ISDN_MINOR_PPPMAX) { - retval = isdn_ppp_open(minor - ISDN_MINOR_PPP, filep); - if (retval == 0) - isdn_lock_drivers(); - goto out; - } -#endif -out: - nonseekable_open(ino, filep); - mutex_unlock(&isdn_mutex); - return retval; -} - -static int -isdn_close(struct inode *ino, struct file *filep) -{ - uint minor = iminor(ino); - - mutex_lock(&isdn_mutex); - if 
(minor == ISDN_MINOR_STATUS) { - infostruct *p = dev->infochain; - infostruct *q = NULL; - - while (p) { - if (p->private == (char *) &(filep->private_data)) { - if (q) - q->next = p->next; - else - dev->infochain = (infostruct *) (p->next); - kfree(p); - goto out; - } - q = p; - p = (infostruct *) (p->next); - } - printk(KERN_WARNING "isdn: No private data while closing isdnctrl\n"); - goto out; - } - isdn_unlock_drivers(); - if (minor <= ISDN_MINOR_BMAX) - goto out; - if (minor <= ISDN_MINOR_CTRLMAX) { - if (dev->profd == current) - dev->profd = NULL; - goto out; - } -#ifdef CONFIG_ISDN_PPP - if (minor <= ISDN_MINOR_PPPMAX) - isdn_ppp_release(minor - ISDN_MINOR_PPP, filep); -#endif - -out: - mutex_unlock(&isdn_mutex); - return 0; -} - -static const struct file_operations isdn_fops = -{ - .owner = THIS_MODULE, - .llseek = no_llseek, - .read = isdn_read, - .write = isdn_write, - .poll = isdn_poll, - .unlocked_ioctl = isdn_unlocked_ioctl, - .open = isdn_open, - .release = isdn_close, -}; - -char * -isdn_map_eaz2msn(char *msn, int di) -{ - isdn_driver_t *this = dev->drv[di]; - int i; - - if (strlen(msn) == 1) { - i = msn[0] - '0'; - if ((i >= 0) && (i <= 9)) - if (strlen(this->msn2eaz[i])) - return (this->msn2eaz[i]); - } - return (msn); -} - -/* - * Find an unused ISDN-channel, whose feature-flags match the - * given L2- and L3-protocols. - */ -#define L2V (~(ISDN_FEATURE_L2_V11096 | ISDN_FEATURE_L2_V11019 | ISDN_FEATURE_L2_V11038)) - -/* - * This function must be called with holding the dev->lock. 
- */ -int -isdn_get_free_channel(int usage, int l2_proto, int l3_proto, int pre_dev - , int pre_chan, char *msn) -{ - int i; - ulong features; - ulong vfeatures; - - features = ((1 << l2_proto) | (0x10000 << l3_proto)); - vfeatures = (((1 << l2_proto) | (0x10000 << l3_proto)) & - ~(ISDN_FEATURE_L2_V11096 | ISDN_FEATURE_L2_V11019 | ISDN_FEATURE_L2_V11038)); - /* If Layer-2 protocol is V.110, accept drivers with - * transparent feature even if these don't support V.110 - * because we can emulate this in linklevel. - */ - for (i = 0; i < ISDN_MAX_CHANNELS; i++) - if (USG_NONE(dev->usage[i]) && - (dev->drvmap[i] != -1)) { - int d = dev->drvmap[i]; - if ((dev->usage[i] & ISDN_USAGE_EXCLUSIVE) && - ((pre_dev != d) || (pre_chan != dev->chanmap[i]))) - continue; - if (!strcmp(isdn_map_eaz2msn(msn, d), "-")) - continue; - if (dev->usage[i] & ISDN_USAGE_DISABLED) - continue; /* usage not allowed */ - if (dev->drv[d]->flags & DRV_FLAG_RUNNING) { - if (((dev->drv[d]->interface->features & features) == features) || - (((dev->drv[d]->interface->features & vfeatures) == vfeatures) && - (dev->drv[d]->interface->features & ISDN_FEATURE_L2_TRANS))) { - if ((pre_dev < 0) || (pre_chan < 0)) { - dev->usage[i] &= ISDN_USAGE_EXCLUSIVE; - dev->usage[i] |= usage; - isdn_info_update(); - return i; - } else { - if ((pre_dev == d) && (pre_chan == dev->chanmap[i])) { - dev->usage[i] &= ISDN_USAGE_EXCLUSIVE; - dev->usage[i] |= usage; - isdn_info_update(); - return i; - } - } - } - } - } - return -1; -} - -/* - * Set state of ISDN-channel to 'unused' - */ -void -isdn_free_channel(int di, int ch, int usage) -{ - int i; - - if ((di < 0) || (ch < 0)) { - printk(KERN_WARNING "%s: called with invalid drv(%d) or channel(%d)\n", - __func__, di, ch); - return; - } - for (i = 0; i < ISDN_MAX_CHANNELS; i++) - if (((!usage) || ((dev->usage[i] & ISDN_USAGE_MASK) == usage)) && - (dev->drvmap[i] == di) && - (dev->chanmap[i] == ch)) { - dev->usage[i] &= (ISDN_USAGE_NONE | ISDN_USAGE_EXCLUSIVE); - 
strcpy(dev->num[i], "???"); - dev->ibytes[i] = 0; - dev->obytes[i] = 0; -// 20.10.99 JIM, try to reinitialize v110 ! - dev->v110emu[i] = 0; - atomic_set(&(dev->v110use[i]), 0); - isdn_v110_close(dev->v110[i]); - dev->v110[i] = NULL; -// 20.10.99 JIM, try to reinitialize v110 ! - isdn_info_update(); - if (dev->drv[di]) - skb_queue_purge(&dev->drv[di]->rpqueue[ch]); - } -} - -/* - * Cancel Exclusive-Flag for ISDN-channel - */ -void -isdn_unexclusive_channel(int di, int ch) -{ - int i; - - for (i = 0; i < ISDN_MAX_CHANNELS; i++) - if ((dev->drvmap[i] == di) && - (dev->chanmap[i] == ch)) { - dev->usage[i] &= ~ISDN_USAGE_EXCLUSIVE; - isdn_info_update(); - return; - } -} - -/* - * writebuf replacement for SKB_ABLE drivers - */ -static int -isdn_writebuf_stub(int drvidx, int chan, const u_char __user *buf, int len) -{ - int ret; - int hl = dev->drv[drvidx]->interface->hl_hdrlen; - struct sk_buff *skb = alloc_skb(hl + len, GFP_ATOMIC); - - if (!skb) - return -ENOMEM; - skb_reserve(skb, hl); - if (copy_from_user(skb_put(skb, len), buf, len)) { - dev_kfree_skb(skb); - return -EFAULT; - } - ret = dev->drv[drvidx]->interface->writebuf_skb(drvidx, chan, 1, skb); - if (ret <= 0) - dev_kfree_skb(skb); - if (ret > 0) - dev->obytes[isdn_dc2minor(drvidx, chan)] += ret; - return ret; -} - -/* - * Return: length of data on success, -ERRcode on failure. 
- */ -int -isdn_writebuf_skb_stub(int drvidx, int chan, int ack, struct sk_buff *skb) -{ - int ret; - struct sk_buff *nskb = NULL; - int v110_ret = skb->len; - int idx = isdn_dc2minor(drvidx, chan); - - if (dev->v110[idx]) { - atomic_inc(&dev->v110use[idx]); - nskb = isdn_v110_encode(dev->v110[idx], skb); - atomic_dec(&dev->v110use[idx]); - if (!nskb) - return 0; - v110_ret = *((int *)nskb->data); - skb_pull(nskb, sizeof(int)); - if (!nskb->len) { - dev_kfree_skb(nskb); - return v110_ret; - } - /* V.110 must always be acknowledged */ - ack = 1; - ret = dev->drv[drvidx]->interface->writebuf_skb(drvidx, chan, ack, nskb); - } else { - int hl = dev->drv[drvidx]->interface->hl_hdrlen; - - if (skb_headroom(skb) < hl) { - /* - * This should only occur when new HL driver with - * increased hl_hdrlen was loaded after netdevice - * was created and connected to the new driver. - * - * The V.110 branch (re-allocates on its own) does - * not need this - */ - struct sk_buff *skb_tmp; - - skb_tmp = skb_realloc_headroom(skb, hl); - printk(KERN_DEBUG "isdn_writebuf_skb_stub: reallocating headroom%s\n", skb_tmp ? "" : " failed"); - if (!skb_tmp) return -ENOMEM; /* 0 better? 
*/ - ret = dev->drv[drvidx]->interface->writebuf_skb(drvidx, chan, ack, skb_tmp); - if (ret > 0) { - dev_kfree_skb(skb); - } else { - dev_kfree_skb(skb_tmp); - } - } else { - ret = dev->drv[drvidx]->interface->writebuf_skb(drvidx, chan, ack, skb); - } - } - if (ret > 0) { - dev->obytes[idx] += ret; - if (dev->v110[idx]) { - atomic_inc(&dev->v110use[idx]); - dev->v110[idx]->skbuser++; - atomic_dec(&dev->v110use[idx]); - /* For V.110 return unencoded data length */ - ret = v110_ret; - /* if the complete frame was send we free the skb; - if not upper function will requeue the skb */ - if (ret == skb->len) - dev_kfree_skb(skb); - } - } else - if (dev->v110[idx]) - dev_kfree_skb(nskb); - return ret; -} - -static int -isdn_add_channels(isdn_driver_t *d, int drvidx, int n, int adding) -{ - int j, k, m; - - init_waitqueue_head(&d->st_waitq); - if (d->flags & DRV_FLAG_RUNNING) - return -1; - if (n < 1) return 0; - - m = (adding) ? d->channels + n : n; - - if (dev->channels + n > ISDN_MAX_CHANNELS) { - printk(KERN_WARNING "register_isdn: Max. 
%d channels supported\n", - ISDN_MAX_CHANNELS); - return -1; - } - - if ((adding) && (d->rcverr)) - kfree(d->rcverr); - if (!(d->rcverr = kcalloc(m, sizeof(int), GFP_ATOMIC))) { - printk(KERN_WARNING "register_isdn: Could not alloc rcverr\n"); - return -1; - } - - if ((adding) && (d->rcvcount)) - kfree(d->rcvcount); - if (!(d->rcvcount = kcalloc(m, sizeof(int), GFP_ATOMIC))) { - printk(KERN_WARNING "register_isdn: Could not alloc rcvcount\n"); - if (!adding) - kfree(d->rcverr); - return -1; - } - - if ((adding) && (d->rpqueue)) { - for (j = 0; j < d->channels; j++) - skb_queue_purge(&d->rpqueue[j]); - kfree(d->rpqueue); - } - d->rpqueue = kmalloc_array(m, sizeof(struct sk_buff_head), GFP_ATOMIC); - if (!d->rpqueue) { - printk(KERN_WARNING "register_isdn: Could not alloc rpqueue\n"); - if (!adding) { - kfree(d->rcvcount); - kfree(d->rcverr); - } - return -1; - } - for (j = 0; j < m; j++) { - skb_queue_head_init(&d->rpqueue[j]); - } - - if ((adding) && (d->rcv_waitq)) - kfree(d->rcv_waitq); - d->rcv_waitq = kmalloc(array3_size(sizeof(wait_queue_head_t), 2, m), - GFP_ATOMIC); - if (!d->rcv_waitq) { - printk(KERN_WARNING "register_isdn: Could not alloc rcv_waitq\n"); - if (!adding) { - kfree(d->rpqueue); - kfree(d->rcvcount); - kfree(d->rcverr); - } - return -1; - } - d->snd_waitq = d->rcv_waitq + m; - for (j = 0; j < m; j++) { - init_waitqueue_head(&d->rcv_waitq[j]); - init_waitqueue_head(&d->snd_waitq[j]); - } - - dev->channels += n; - for (j = d->channels; j < m; j++) - for (k = 0; k < ISDN_MAX_CHANNELS; k++) - if (dev->chanmap[k] < 0) { - dev->chanmap[k] = j; - dev->drvmap[k] = drvidx; - break; - } - d->channels = m; - return 0; -} - -/* - * Low-level-driver registration - */ - -static void -set_global_features(void) -{ - int drvidx; - - dev->global_features = 0; - for (drvidx = 0; drvidx < ISDN_MAX_DRIVERS; drvidx++) { - if (!dev->drv[drvidx]) - continue; - if (dev->drv[drvidx]->interface) - dev->global_features |= dev->drv[drvidx]->interface->features; - } -} - 
-#ifdef CONFIG_ISDN_DIVERSION - -static char *map_drvname(int di) -{ - if ((di < 0) || (di >= ISDN_MAX_DRIVERS)) - return (NULL); - return (dev->drvid[di]); /* driver name */ -} /* map_drvname */ - -static int map_namedrv(char *id) -{ int i; - - for (i = 0; i < ISDN_MAX_DRIVERS; i++) - { if (!strcmp(dev->drvid[i], id)) - return (i); - } - return (-1); -} /* map_namedrv */ - -int DIVERT_REG_NAME(isdn_divert_if *i_div) -{ - if (i_div->if_magic != DIVERT_IF_MAGIC) - return (DIVERT_VER_ERR); - switch (i_div->cmd) - { - case DIVERT_CMD_REL: - if (divert_if != i_div) - return (DIVERT_REL_ERR); - divert_if = NULL; /* free interface */ - return (DIVERT_NO_ERR); - - case DIVERT_CMD_REG: - if (divert_if) - return (DIVERT_REG_ERR); - i_div->ll_cmd = isdn_command; /* set command function */ - i_div->drv_to_name = map_drvname; - i_div->name_to_drv = map_namedrv; - divert_if = i_div; /* remember interface */ - return (DIVERT_NO_ERR); - - default: - return (DIVERT_CMD_ERR); - } -} /* DIVERT_REG_NAME */ - -EXPORT_SYMBOL(DIVERT_REG_NAME); - -#endif /* CONFIG_ISDN_DIVERSION */ - - -EXPORT_SYMBOL(register_isdn); -#ifdef CONFIG_ISDN_PPP -EXPORT_SYMBOL(isdn_ppp_register_compressor); -EXPORT_SYMBOL(isdn_ppp_unregister_compressor); -#endif - -int -register_isdn(isdn_if *i) -{ - isdn_driver_t *d; - int j; - ulong flags; - int drvidx; - - if (dev->drivers >= ISDN_MAX_DRIVERS) { - printk(KERN_WARNING "register_isdn: Max. 
%d drivers supported\n", - ISDN_MAX_DRIVERS); - return 0; - } - if (!i->writebuf_skb) { - printk(KERN_WARNING "register_isdn: No write routine given.\n"); - return 0; - } - if (!(d = kzalloc(sizeof(isdn_driver_t), GFP_KERNEL))) { - printk(KERN_WARNING "register_isdn: Could not alloc driver-struct\n"); - return 0; - } - - d->maxbufsize = i->maxbufsize; - d->pktcount = 0; - d->stavail = 0; - d->flags = DRV_FLAG_LOADED; - d->online = 0; - d->interface = i; - d->channels = 0; - spin_lock_irqsave(&dev->lock, flags); - for (drvidx = 0; drvidx < ISDN_MAX_DRIVERS; drvidx++) - if (!dev->drv[drvidx]) - break; - if (isdn_add_channels(d, drvidx, i->channels, 0)) { - spin_unlock_irqrestore(&dev->lock, flags); - kfree(d); - return 0; - } - i->channels = drvidx; - i->rcvcallb_skb = isdn_receive_skb_callback; - i->statcallb = isdn_status_callback; - if (!strlen(i->id)) - sprintf(i->id, "line%d", drvidx); - for (j = 0; j < drvidx; j++) - if (!strcmp(i->id, dev->drvid[j])) - sprintf(i->id, "line%d", drvidx); - dev->drv[drvidx] = d; - strcpy(dev->drvid[drvidx], i->id); - isdn_info_update(); - dev->drivers++; - set_global_features(); - spin_unlock_irqrestore(&dev->lock, flags); - return 1; -} - -/* -***************************************************************************** -* And now the modules code. 
-***************************************************************************** -*/ - -static char * -isdn_getrev(const char *revision) -{ - char *rev; - char *p; - - if ((p = strchr(revision, ':'))) { - rev = p + 2; - p = strchr(rev, '$'); - *--p = 0; - } else - rev = "???"; - return rev; -} - -/* - * Allocate and initialize all data, register modem-devices - */ -static int __init isdn_init(void) -{ - int i; - char tmprev[50]; - - dev = vzalloc(sizeof(isdn_dev)); - if (!dev) { - printk(KERN_WARNING "isdn: Could not allocate device-struct.\n"); - return -EIO; - } - timer_setup(&dev->timer, isdn_timer_funct, 0); - spin_lock_init(&dev->lock); - spin_lock_init(&dev->timerlock); -#ifdef MODULE - dev->owner = THIS_MODULE; -#endif - mutex_init(&dev->mtx); - init_waitqueue_head(&dev->info_waitq); - for (i = 0; i < ISDN_MAX_CHANNELS; i++) { - dev->drvmap[i] = -1; - dev->chanmap[i] = -1; - dev->m_idx[i] = -1; - strcpy(dev->num[i], "???"); - } - if (register_chrdev(ISDN_MAJOR, "isdn", &isdn_fops)) { - printk(KERN_WARNING "isdn: Could not register control devices\n"); - vfree(dev); - return -EIO; - } - if ((isdn_tty_modem_init()) < 0) { - printk(KERN_WARNING "isdn: Could not register tty devices\n"); - vfree(dev); - unregister_chrdev(ISDN_MAJOR, "isdn"); - return -EIO; - } -#ifdef CONFIG_ISDN_PPP - if (isdn_ppp_init() < 0) { - printk(KERN_WARNING "isdn: Could not create PPP-device-structs\n"); - isdn_tty_exit(); - unregister_chrdev(ISDN_MAJOR, "isdn"); - vfree(dev); - return -EIO; - } -#endif /* CONFIG_ISDN_PPP */ - - strcpy(tmprev, isdn_revision); - printk(KERN_NOTICE "ISDN subsystem Rev: %s/", isdn_getrev(tmprev)); - strcpy(tmprev, isdn_net_revision); - printk("%s/", isdn_getrev(tmprev)); - strcpy(tmprev, isdn_ppp_revision); - printk("%s/", isdn_getrev(tmprev)); - strcpy(tmprev, isdn_audio_revision); - printk("%s/", isdn_getrev(tmprev)); - strcpy(tmprev, isdn_v110_revision); - printk("%s", isdn_getrev(tmprev)); - -#ifdef MODULE - printk(" loaded\n"); -#else - printk("\n"); 
-#endif - isdn_info_update(); - return 0; -} - -/* - * Unload module - */ -static void __exit isdn_exit(void) -{ -#ifdef CONFIG_ISDN_PPP - isdn_ppp_cleanup(); -#endif - if (isdn_net_rmall() < 0) { - printk(KERN_WARNING "isdn: net-device busy, remove cancelled\n"); - return; - } - isdn_tty_exit(); - unregister_chrdev(ISDN_MAJOR, "isdn"); - del_timer_sync(&dev->timer); - /* call vfree with interrupts enabled, else it will hang */ - vfree(dev); - printk(KERN_NOTICE "ISDN-subsystem unloaded\n"); -} - -module_init(isdn_init); -module_exit(isdn_exit); diff --git a/drivers/isdn/i4l/isdn_common.h b/drivers/isdn/i4l/isdn_common.h deleted file mode 100644 index 2260ef07ab9c..000000000000 --- a/drivers/isdn/i4l/isdn_common.h +++ /dev/null @@ -1,47 +0,0 @@ -/* $Id: isdn_common.h,v 1.1.2.2 2004/01/12 22:37:19 keil Exp $ - * - * header for Linux ISDN subsystem - * common used functions and debugging-switches (linklevel). - * - * Copyright 1994-1999 by Fritz Elfert (fritz@isdn4linux.de) - * Copyright 1995,96 by Thinking Objects Software GmbH Wuerzburg - * Copyright 1995,96 by Michael Hipp (Michael.Hipp@student.uni-tuebingen.de) - * - * This software may be used and distributed according to the terms - * of the GNU General Public License, incorporated herein by reference. 
- * - */ - -#undef ISDN_DEBUG_MODEM_OPEN -#undef ISDN_DEBUG_MODEM_IOCTL -#undef ISDN_DEBUG_MODEM_WAITSENT -#undef ISDN_DEBUG_MODEM_HUP -#undef ISDN_DEBUG_MODEM_ICALL -#undef ISDN_DEBUG_MODEM_DUMP -#undef ISDN_DEBUG_MODEM_VOICE -#undef ISDN_DEBUG_AT -#undef ISDN_DEBUG_NET_DUMP -#undef ISDN_DEBUG_NET_DIAL -#undef ISDN_DEBUG_NET_ICALL - -/* Prototypes */ -extern void isdn_lock_drivers(void); -extern void isdn_unlock_drivers(void); -extern void isdn_free_channel(int di, int ch, int usage); -extern void isdn_all_eaz(int di, int ch); -extern int isdn_command(isdn_ctrl *); -extern int isdn_dc2minor(int di, int ch); -extern void isdn_info_update(void); -extern char *isdn_map_eaz2msn(char *msn, int di); -extern void isdn_timer_ctrl(int tf, int onoff); -extern void isdn_unexclusive_channel(int di, int ch); -extern int isdn_getnum(char **); -extern int isdn_readbchan(int, int, u_char *, u_char *, int, wait_queue_head_t *); -extern int isdn_readbchan_tty(int, int, struct tty_port *, int); -extern int isdn_get_free_channel(int, int, int, int, int, char *); -extern int isdn_writebuf_skb_stub(int, int, int, struct sk_buff *); -extern int register_isdn(isdn_if *i); -extern int isdn_msncmp(const char *, const char *); -#if defined(ISDN_DEBUG_NET_DUMP) || defined(ISDN_DEBUG_MODEM_DUMP) -extern void isdn_dumppkt(char *, u_char *, int, int); -#endif diff --git a/drivers/isdn/i4l/isdn_concap.c b/drivers/isdn/i4l/isdn_concap.c deleted file mode 100644 index 336523ec077c..000000000000 --- a/drivers/isdn/i4l/isdn_concap.c +++ /dev/null @@ -1,99 +0,0 @@ -/* $Id: isdn_concap.c,v 1.1.2.2 2004/01/12 22:37:19 keil Exp $ - * - * Linux ISDN subsystem, protocol encapsulation - * - * This software may be used and distributed according to the terms - * of the GNU General Public License, incorporated herein by reference. - * - */ - -/* Stuff to support the concap_proto by isdn4linux. isdn4linux - specific - * stuff goes here. 
Stuff that depends only on the concap protocol goes to - * another -- protocol specific -- source file. - * - */ - - -#include -#include "isdn_x25iface.h" -#include "isdn_net.h" -#include -#include "isdn_concap.h" - - -/* The following set of device service operations are for encapsulation - protocols that require for reliable datalink semantics. That means: - - - before any data is to be submitted the connection must explicitly - be set up. - - after the successful set up of the connection is signalled the - connection is considered to be reliably up. - - Auto-dialing ist not compatible with this requirements. Thus, auto-dialing - is completely bypassed. - - It might be possible to implement a (non standardized) datalink protocol - that provides a reliable data link service while using some auto dialing - mechanism. Such a protocol would need an auxiliary channel (i.e. user-user- - signaling on the D-channel) while the B-channel is down. -*/ - - -static int isdn_concap_dl_data_req(struct concap_proto *concap, struct sk_buff *skb) -{ - struct net_device *ndev = concap->net_dev; - isdn_net_dev *nd = ((isdn_net_local *) netdev_priv(ndev))->netdev; - isdn_net_local *lp = isdn_net_get_locked_lp(nd); - - IX25DEBUG("isdn_concap_dl_data_req: %s \n", concap->net_dev->name); - if (!lp) { - IX25DEBUG("isdn_concap_dl_data_req: %s : isdn_net_send_skb returned %d\n", concap->net_dev->name, 1); - return 1; - } - lp->huptimer = 0; - isdn_net_writebuf_skb(lp, skb); - spin_unlock_bh(&lp->xmit_lock); - IX25DEBUG("isdn_concap_dl_data_req: %s : isdn_net_send_skb returned %d\n", concap->net_dev->name, 0); - return 0; -} - - -static int isdn_concap_dl_connect_req(struct concap_proto *concap) -{ - struct net_device *ndev = concap->net_dev; - isdn_net_local *lp = netdev_priv(ndev); - int ret; - IX25DEBUG("isdn_concap_dl_connect_req: %s \n", ndev->name); - - /* dial ... 
*/ - ret = isdn_net_dial_req(lp); - if (ret) IX25DEBUG("dialing failed\n"); - return ret; -} - -static int isdn_concap_dl_disconn_req(struct concap_proto *concap) -{ - IX25DEBUG("isdn_concap_dl_disconn_req: %s \n", concap->net_dev->name); - - isdn_net_hangup(concap->net_dev); - return 0; -} - -struct concap_device_ops isdn_concap_reliable_dl_dops = { - .data_req = &isdn_concap_dl_data_req, - .connect_req = &isdn_concap_dl_connect_req, - .disconn_req = &isdn_concap_dl_disconn_req -}; - -/* The following should better go into a dedicated source file such that - this sourcefile does not need to include any protocol specific header - files. For now: -*/ -struct concap_proto *isdn_concap_new(int encap) -{ - switch (encap) { - case ISDN_NET_ENCAP_X25IFACE: - return isdn_x25iface_proto_new(); - } - return NULL; -} diff --git a/drivers/isdn/i4l/isdn_concap.h b/drivers/isdn/i4l/isdn_concap.h deleted file mode 100644 index cd7e3ba74e25..000000000000 --- a/drivers/isdn/i4l/isdn_concap.h +++ /dev/null @@ -1,11 +0,0 @@ -/* $Id: isdn_concap.h,v 1.1.2.2 2004/01/12 22:37:19 keil Exp $ - * - * Linux ISDN subsystem, protocol encapsulation - * - * This software may be used and distributed according to the terms - * of the GNU General Public License, incorporated herein by reference. - * - */ - -extern struct concap_device_ops isdn_concap_reliable_dl_dops; -extern struct concap_proto *isdn_concap_new(int); diff --git a/drivers/isdn/i4l/isdn_net.c b/drivers/isdn/i4l/isdn_net.c deleted file mode 100644 index c138f66f2659..000000000000 --- a/drivers/isdn/i4l/isdn_net.c +++ /dev/null @@ -1,3198 +0,0 @@ -/* $Id: isdn_net.c,v 1.1.2.2 2004/01/12 22:37:19 keil Exp $ - * - * Linux ISDN subsystem, network interfaces and related functions (linklevel). 
- * - * Copyright 1994-1998 by Fritz Elfert (fritz@isdn4linux.de) - * Copyright 1995,96 by Thinking Objects Software GmbH Wuerzburg - * Copyright 1995,96 by Michael Hipp (Michael.Hipp@student.uni-tuebingen.de) - * - * This software may be used and distributed according to the terms - * of the GNU General Public License, incorporated herein by reference. - * - * Data Over Voice (DOV) support added - Guy Ellis 23-Mar-02 - * guy@traverse.com.au - * Outgoing calls - looks for a 'V' in first char of dialed number - * Incoming calls - checks first character of eaz as follows: - * Numeric - accept DATA only - original functionality - * 'V' - accept VOICE (DOV) only - * 'B' - accept BOTH DATA and DOV types - * - * Jan 2001: fix CISCO HDLC Bjoern A. Zeeb - * for info on the protocol, see - * http://i4l.zabbadoz.net/i4l/cisco-hdlc.txt - */ - -#include -#include -#include -#include -#include -#include -#include "isdn_common.h" -#include "isdn_net.h" -#ifdef CONFIG_ISDN_PPP -#include "isdn_ppp.h" -#endif -#ifdef CONFIG_ISDN_X25 -#include -#include "isdn_concap.h" -#endif - - -/* - * Outline of new tbusy handling: - * - * Old method, roughly spoken, consisted of setting tbusy when entering - * isdn_net_start_xmit() and at several other locations and clearing - * it from isdn_net_start_xmit() thread when sending was successful. - * - * With 2.3.x multithreaded network core, to prevent problems, tbusy should - * only be set by the isdn_net_start_xmit() thread and only when a tx-busy - * condition is detected. Other threads (in particular isdn_net_stat_callb()) - * are only allowed to clear tbusy. - * - * -HE - */ - -/* - * About SOFTNET: - * Most of the changes were pretty obvious and basically done by HE already. - * - * One problem of the isdn net device code is that it uses struct net_device - * for masters and slaves. However, only master interface are registered to - * the network layer, and therefore, it only makes sense to call netif_* - * functions on them. 
- * - * --KG - */ - -/* - * Find out if the netdevice has been ifup-ed yet. - * For slaves, look at the corresponding master. - */ -static __inline__ int isdn_net_device_started(isdn_net_dev *n) -{ - isdn_net_local *lp = n->local; - struct net_device *dev; - - if (lp->master) - dev = lp->master; - else - dev = n->dev; - return netif_running(dev); -} - -/* - * wake up the network -> net_device queue. - * For slaves, wake the corresponding master interface. - */ -static __inline__ void isdn_net_device_wake_queue(isdn_net_local *lp) -{ - if (lp->master) - netif_wake_queue(lp->master); - else - netif_wake_queue(lp->netdev->dev); -} - -/* - * stop the network -> net_device queue. - * For slaves, stop the corresponding master interface. - */ -static __inline__ void isdn_net_device_stop_queue(isdn_net_local *lp) -{ - if (lp->master) - netif_stop_queue(lp->master); - else - netif_stop_queue(lp->netdev->dev); -} - -/* - * find out if the net_device which this lp belongs to (lp can be - * master or slave) is busy. 
It's busy iff all (master and slave) - * queues are busy - */ -static __inline__ int isdn_net_device_busy(isdn_net_local *lp) -{ - isdn_net_local *nlp; - isdn_net_dev *nd; - unsigned long flags; - - if (!isdn_net_lp_busy(lp)) - return 0; - - if (lp->master) - nd = ISDN_MASTER_PRIV(lp)->netdev; - else - nd = lp->netdev; - - spin_lock_irqsave(&nd->queue_lock, flags); - nlp = lp->next; - while (nlp != lp) { - if (!isdn_net_lp_busy(nlp)) { - spin_unlock_irqrestore(&nd->queue_lock, flags); - return 0; - } - nlp = nlp->next; - } - spin_unlock_irqrestore(&nd->queue_lock, flags); - return 1; -} - -static __inline__ void isdn_net_inc_frame_cnt(isdn_net_local *lp) -{ - atomic_inc(&lp->frame_cnt); - if (isdn_net_device_busy(lp)) - isdn_net_device_stop_queue(lp); -} - -static __inline__ void isdn_net_dec_frame_cnt(isdn_net_local *lp) -{ - atomic_dec(&lp->frame_cnt); - - if (!(isdn_net_device_busy(lp))) { - if (!skb_queue_empty(&lp->super_tx_queue)) { - schedule_work(&lp->tqueue); - } else { - isdn_net_device_wake_queue(lp); - } - } -} - -static __inline__ void isdn_net_zero_frame_cnt(isdn_net_local *lp) -{ - atomic_set(&lp->frame_cnt, 0); -} - -/* For 2.2.x we leave the transmitter busy timeout at 2 secs, just - * to be safe. - * For 2.3.x we push it up to 20 secs, because call establishment - * (in particular callback) may take such a long time, and we - * don't want confusing messages in the log. However, there is a slight - * possibility that this large timeout will break other things like MPPP, - * which might rely on the tx timeout. If so, we'll find out this way... 
- */ - -#define ISDN_NET_TX_TIMEOUT (20 * HZ) - -/* Prototypes */ - -static int isdn_net_force_dial_lp(isdn_net_local *); -static netdev_tx_t isdn_net_start_xmit(struct sk_buff *, - struct net_device *); - -static void isdn_net_ciscohdlck_connected(isdn_net_local *lp); -static void isdn_net_ciscohdlck_disconnected(isdn_net_local *lp); - -char *isdn_net_revision = "$Revision: 1.1.2.2 $"; - -/* - * Code for raw-networking over ISDN - */ - -static void -isdn_net_unreachable(struct net_device *dev, struct sk_buff *skb, char *reason) -{ - if (skb) { - - u_short proto = ntohs(skb->protocol); - - printk(KERN_DEBUG "isdn_net: %s: %s, signalling dst_link_failure %s\n", - dev->name, - (reason != NULL) ? reason : "unknown", - (proto != ETH_P_IP) ? "Protocol != ETH_P_IP" : ""); - - dst_link_failure(skb); - } - else { /* dial not triggered by rawIP packet */ - printk(KERN_DEBUG "isdn_net: %s: %s\n", - dev->name, - (reason != NULL) ? reason : "reason unknown"); - } -} - -static void -isdn_net_reset(struct net_device *dev) -{ -#ifdef CONFIG_ISDN_X25 - struct concap_device_ops *dops = - ((isdn_net_local *)netdev_priv(dev))->dops; - struct concap_proto *cprot = - ((isdn_net_local *)netdev_priv(dev))->netdev->cprot; -#endif -#ifdef CONFIG_ISDN_X25 - if (cprot && cprot->pops && dops) - cprot->pops->restart(cprot, dev, dops); -#endif -} - -/* Open/initialize the board. */ -static int -isdn_net_open(struct net_device *dev) -{ - int i; - struct net_device *p; - struct in_device *in_dev; - - /* moved here from isdn_net_reset, because only the master has an - interface associated which is supposed to be started. BTW: - we need to call netif_start_queue, not netif_wake_queue here */ - netif_start_queue(dev); - - isdn_net_reset(dev); - /* Fill in the MAC-level header (not needed, but for compatibility... 
*/ - for (i = 0; i < ETH_ALEN - sizeof(u32); i++) - dev->dev_addr[i] = 0xfc; - if ((in_dev = dev->ip_ptr) != NULL) { - /* - * Any address will do - we take the first - */ - struct in_ifaddr *ifa = in_dev->ifa_list; - if (ifa != NULL) - memcpy(dev->dev_addr + 2, &ifa->ifa_local, 4); - } - - /* If this interface has slaves, start them also */ - p = MASTER_TO_SLAVE(dev); - if (p) { - while (p) { - isdn_net_reset(p); - p = MASTER_TO_SLAVE(p); - } - } - isdn_lock_drivers(); - return 0; -} - -/* - * Assign an ISDN-channel to a net-interface - */ -static void -isdn_net_bind_channel(isdn_net_local *lp, int idx) -{ - lp->flags |= ISDN_NET_CONNECTED; - lp->isdn_device = dev->drvmap[idx]; - lp->isdn_channel = dev->chanmap[idx]; - dev->rx_netdev[idx] = lp->netdev; - dev->st_netdev[idx] = lp->netdev; -} - -/* - * unbind a net-interface (resets interface after an error) - */ -static void -isdn_net_unbind_channel(isdn_net_local *lp) -{ - skb_queue_purge(&lp->super_tx_queue); - - if (!lp->master) { /* reset only master device */ - /* Moral equivalent of dev_purge_queues(): - BEWARE! This chunk of code cannot be called from hardware - interrupt handler. I hope it is true. --ANK - */ - qdisc_reset_all_tx(lp->netdev->dev); - } - lp->dialstate = 0; - dev->rx_netdev[isdn_dc2minor(lp->isdn_device, lp->isdn_channel)] = NULL; - dev->st_netdev[isdn_dc2minor(lp->isdn_device, lp->isdn_channel)] = NULL; - if (lp->isdn_device != -1 && lp->isdn_channel != -1) - isdn_free_channel(lp->isdn_device, lp->isdn_channel, - ISDN_USAGE_NET); - lp->flags &= ~ISDN_NET_CONNECTED; - lp->isdn_device = -1; - lp->isdn_channel = -1; -} - -/* - * Perform auto-hangup and cps-calculation for net-interfaces. - * - * auto-hangup: - * Increment idle-counter (this counter is reset on any incoming or - * outgoing packet), if counter exceeds configured limit either do a - * hangup immediately or - if configured - wait until just before the next - * charge-info. 
- * - * cps-calculation (needed for dynamic channel-bundling): - * Since this function is called every second, simply reset the - * byte-counter of the interface after copying it to the cps-variable. - */ -static unsigned long last_jiffies = -HZ; - -void -isdn_net_autohup(void) -{ - isdn_net_dev *p = dev->netdev; - int anymore; - - anymore = 0; - while (p) { - isdn_net_local *l = p->local; - if (jiffies == last_jiffies) - l->cps = l->transcount; - else - l->cps = (l->transcount * HZ) / (jiffies - last_jiffies); - l->transcount = 0; - if (dev->net_verbose > 3) - printk(KERN_DEBUG "%s: %d bogocps\n", p->dev->name, l->cps); - if ((l->flags & ISDN_NET_CONNECTED) && (!l->dialstate)) { - anymore = 1; - l->huptimer++; - /* - * if there is some dialmode where timeout-hangup - * should _not_ be done, check for that here - */ - if ((l->onhtime) && - (l->huptimer > l->onhtime)) - { - if (l->hupflags & ISDN_MANCHARGE && - l->hupflags & ISDN_CHARGEHUP) { - while (time_after(jiffies, l->chargetime + l->chargeint)) - l->chargetime += l->chargeint; - if (time_after(jiffies, l->chargetime + l->chargeint - 2 * HZ)) - if (l->outgoing || l->hupflags & ISDN_INHUP) - isdn_net_hangup(p->dev); - } else if (l->outgoing) { - if (l->hupflags & ISDN_CHARGEHUP) { - if (l->hupflags & ISDN_WAITCHARGE) { - printk(KERN_DEBUG "isdn_net: Hupflags of %s are %X\n", - p->dev->name, l->hupflags); - isdn_net_hangup(p->dev); - } else if (time_after(jiffies, l->chargetime + l->chargeint)) { - printk(KERN_DEBUG - "isdn_net: %s: chtime = %lu, chint = %d\n", - p->dev->name, l->chargetime, l->chargeint); - isdn_net_hangup(p->dev); - } - } else - isdn_net_hangup(p->dev); - } else if (l->hupflags & ISDN_INHUP) - isdn_net_hangup(p->dev); - } - - if (dev->global_flags & ISDN_GLOBAL_STOPPED || (ISDN_NET_DIALMODE(*l) == ISDN_NET_DM_OFF)) { - isdn_net_hangup(p->dev); - break; - } - } - p = (isdn_net_dev *) p->next; - } - last_jiffies = jiffies; - isdn_timer_ctrl(ISDN_TIMER_NETHANGUP, anymore); -} - -static void 
isdn_net_lp_disconnected(isdn_net_local *lp) -{ - isdn_net_rm_from_bundle(lp); -} - -/* - * Handle status-messages from ISDN-interfacecard. - * This function is called from within the main-status-dispatcher - * isdn_status_callback, which itself is called from the low-level driver. - * Return: 1 = Event handled, 0 = not for us or unknown Event. - */ -int -isdn_net_stat_callback(int idx, isdn_ctrl *c) -{ - isdn_net_dev *p = dev->st_netdev[idx]; - int cmd = c->command; - - if (p) { - isdn_net_local *lp = p->local; -#ifdef CONFIG_ISDN_X25 - struct concap_proto *cprot = lp->netdev->cprot; - struct concap_proto_ops *pops = cprot ? cprot->pops : NULL; -#endif - switch (cmd) { - case ISDN_STAT_BSENT: - /* A packet has successfully been sent out */ - if ((lp->flags & ISDN_NET_CONNECTED) && - (!lp->dialstate)) { - isdn_net_dec_frame_cnt(lp); - lp->stats.tx_packets++; - lp->stats.tx_bytes += c->parm.length; - } - return 1; - case ISDN_STAT_DCONN: - /* D-Channel is up */ - switch (lp->dialstate) { - case 4: - case 7: - case 8: - lp->dialstate++; - return 1; - case 12: - lp->dialstate = 5; - return 1; - } - break; - case ISDN_STAT_DHUP: - /* Either D-Channel-hangup or error during dialout */ -#ifdef CONFIG_ISDN_X25 - /* If we are not connencted then dialing had - failed. 
If there are generic encap protocol - receiver routines signal the closure of - the link*/ - - if (!(lp->flags & ISDN_NET_CONNECTED) - && pops && pops->disconn_ind) - pops->disconn_ind(cprot); -#endif /* CONFIG_ISDN_X25 */ - if ((!lp->dialstate) && (lp->flags & ISDN_NET_CONNECTED)) { - if (lp->p_encap == ISDN_NET_ENCAP_CISCOHDLCK) - isdn_net_ciscohdlck_disconnected(lp); -#ifdef CONFIG_ISDN_PPP - if (lp->p_encap == ISDN_NET_ENCAP_SYNCPPP) - isdn_ppp_free(lp); -#endif - isdn_net_lp_disconnected(lp); - isdn_all_eaz(lp->isdn_device, lp->isdn_channel); - printk(KERN_INFO "%s: remote hangup\n", p->dev->name); - printk(KERN_INFO "%s: Chargesum is %d\n", p->dev->name, - lp->charge); - isdn_net_unbind_channel(lp); - return 1; - } - break; -#ifdef CONFIG_ISDN_X25 - case ISDN_STAT_BHUP: - /* B-Channel-hangup */ - /* try if there are generic encap protocol - receiver routines and signal the closure of - the link */ - if (pops && pops->disconn_ind) { - pops->disconn_ind(cprot); - return 1; - } - break; -#endif /* CONFIG_ISDN_X25 */ - case ISDN_STAT_BCONN: - /* B-Channel is up */ - isdn_net_zero_frame_cnt(lp); - switch (lp->dialstate) { - case 5: - case 6: - case 7: - case 8: - case 9: - case 10: - case 12: - if (lp->dialstate <= 6) { - dev->usage[idx] |= ISDN_USAGE_OUTGOING; - isdn_info_update(); - } else - dev->rx_netdev[idx] = p; - lp->dialstate = 0; - isdn_timer_ctrl(ISDN_TIMER_NETHANGUP, 1); - if (lp->p_encap == ISDN_NET_ENCAP_CISCOHDLCK) - isdn_net_ciscohdlck_connected(lp); - if (lp->p_encap != ISDN_NET_ENCAP_SYNCPPP) { - if (lp->master) { /* is lp a slave? */ - isdn_net_dev *nd = ISDN_MASTER_PRIV(lp)->netdev; - isdn_net_add_to_bundle(nd, lp); - } - } - printk(KERN_INFO "isdn_net: %s connected\n", p->dev->name); - /* If first Chargeinfo comes before B-Channel connect, - * we correct the timestamp here. 
- */ - lp->chargetime = jiffies; - - /* reset dial-timeout */ - lp->dialstarted = 0; - lp->dialwait_timer = 0; - -#ifdef CONFIG_ISDN_PPP - if (lp->p_encap == ISDN_NET_ENCAP_SYNCPPP) - isdn_ppp_wakeup_daemon(lp); -#endif -#ifdef CONFIG_ISDN_X25 - /* try if there are generic concap receiver routines */ - if (pops) - if (pops->connect_ind) - pops->connect_ind(cprot); -#endif /* CONFIG_ISDN_X25 */ - /* ppp needs to do negotiations first */ - if (lp->p_encap != ISDN_NET_ENCAP_SYNCPPP) - isdn_net_device_wake_queue(lp); - return 1; - } - break; - case ISDN_STAT_NODCH: - /* No D-Channel avail. */ - if (lp->dialstate == 4) { - lp->dialstate--; - return 1; - } - break; - case ISDN_STAT_CINF: - /* Charge-info from TelCo. Calculate interval between - * charge-infos and set timestamp for last info for - * usage by isdn_net_autohup() - */ - lp->charge++; - if (lp->hupflags & ISDN_HAVECHARGE) { - lp->hupflags &= ~ISDN_WAITCHARGE; - lp->chargeint = jiffies - lp->chargetime - (2 * HZ); - } - if (lp->hupflags & ISDN_WAITCHARGE) - lp->hupflags |= ISDN_HAVECHARGE; - lp->chargetime = jiffies; - printk(KERN_DEBUG "isdn_net: Got CINF chargetime of %s now %lu\n", - p->dev->name, lp->chargetime); - return 1; - } - } - return 0; -} - -/* - * Perform dialout for net-interfaces and timeout-handling for - * D-Channel-up and B-Channel-up Messages. - * This function is initially called from within isdn_net_start_xmit() or - * or isdn_net_find_icall() after initializing the dialstate for an - * interface. If further calls are needed, the function schedules itself - * for a timer-callback via isdn_timer_function(). - * The dialstate is also affected by incoming status-messages from - * the ISDN-Channel which are handled in isdn_net_stat_callback() above. 
- */ -void -isdn_net_dial(void) -{ - isdn_net_dev *p = dev->netdev; - int anymore = 0; - int i; - isdn_ctrl cmd; - u_char *phone_number; - - while (p) { - isdn_net_local *lp = p->local; - -#ifdef ISDN_DEBUG_NET_DIAL - if (lp->dialstate) - printk(KERN_DEBUG "%s: dialstate=%d\n", p->dev->name, lp->dialstate); -#endif - switch (lp->dialstate) { - case 0: - /* Nothing to do for this interface */ - break; - case 1: - /* Initiate dialout. Set phone-number-pointer to first number - * of interface. - */ - lp->dial = lp->phone[1]; - if (!lp->dial) { - printk(KERN_WARNING "%s: phone number deleted?\n", - p->dev->name); - isdn_net_hangup(p->dev); - break; - } - anymore = 1; - - if (lp->dialtimeout > 0) - if (lp->dialstarted == 0 || time_after(jiffies, lp->dialstarted + lp->dialtimeout + lp->dialwait)) { - lp->dialstarted = jiffies; - lp->dialwait_timer = 0; - } - - lp->dialstate++; - /* Fall through */ - case 2: - /* Prepare dialing. Clear EAZ, then set EAZ. */ - cmd.driver = lp->isdn_device; - cmd.arg = lp->isdn_channel; - cmd.command = ISDN_CMD_CLREAZ; - isdn_command(&cmd); - sprintf(cmd.parm.num, "%s", isdn_map_eaz2msn(lp->msn, cmd.driver)); - cmd.command = ISDN_CMD_SETEAZ; - isdn_command(&cmd); - lp->dialretry = 0; - anymore = 1; - lp->dialstate++; - /* Fall through */ - case 3: - /* Setup interface, dial current phone-number, switch to next number. - * If list of phone-numbers is exhausted, increment - * retry-counter. 
- */ - if (dev->global_flags & ISDN_GLOBAL_STOPPED || (ISDN_NET_DIALMODE(*lp) == ISDN_NET_DM_OFF)) { - char *s; - if (dev->global_flags & ISDN_GLOBAL_STOPPED) - s = "dial suppressed: isdn system stopped"; - else - s = "dial suppressed: dialmode `off'"; - isdn_net_unreachable(p->dev, NULL, s); - isdn_net_hangup(p->dev); - break; - } - cmd.driver = lp->isdn_device; - cmd.command = ISDN_CMD_SETL2; - cmd.arg = lp->isdn_channel + (lp->l2_proto << 8); - isdn_command(&cmd); - cmd.driver = lp->isdn_device; - cmd.command = ISDN_CMD_SETL3; - cmd.arg = lp->isdn_channel + (lp->l3_proto << 8); - isdn_command(&cmd); - cmd.driver = lp->isdn_device; - cmd.arg = lp->isdn_channel; - if (!lp->dial) { - printk(KERN_WARNING "%s: phone number deleted?\n", - p->dev->name); - isdn_net_hangup(p->dev); - break; - } - if (!strncmp(lp->dial->num, "LEASED", strlen("LEASED"))) { - lp->dialstate = 4; - printk(KERN_INFO "%s: Open leased line ...\n", p->dev->name); - } else { - if (lp->dialtimeout > 0) - if (time_after(jiffies, lp->dialstarted + lp->dialtimeout)) { - lp->dialwait_timer = jiffies + lp->dialwait; - lp->dialstarted = 0; - isdn_net_unreachable(p->dev, NULL, "dial: timed out"); - isdn_net_hangup(p->dev); - break; - } - - cmd.driver = lp->isdn_device; - cmd.command = ISDN_CMD_DIAL; - cmd.parm.setup.si2 = 0; - - /* check for DOV */ - phone_number = lp->dial->num; - if ((*phone_number == 'v') || - (*phone_number == 'V')) { /* DOV call */ - cmd.parm.setup.si1 = 1; - } else { /* DATA call */ - cmd.parm.setup.si1 = 7; - } - - strcpy(cmd.parm.setup.phone, phone_number); - /* - * Switch to next number or back to start if at end of list. 
- */ - if (!(lp->dial = (isdn_net_phone *) lp->dial->next)) { - lp->dial = lp->phone[1]; - lp->dialretry++; - - if (lp->dialretry > lp->dialmax) { - if (lp->dialtimeout == 0) { - lp->dialwait_timer = jiffies + lp->dialwait; - lp->dialstarted = 0; - isdn_net_unreachable(p->dev, NULL, "dial: tried all numbers dialmax times"); - } - isdn_net_hangup(p->dev); - break; - } - } - sprintf(cmd.parm.setup.eazmsn, "%s", - isdn_map_eaz2msn(lp->msn, cmd.driver)); - i = isdn_dc2minor(lp->isdn_device, lp->isdn_channel); - if (i >= 0) { - strcpy(dev->num[i], cmd.parm.setup.phone); - dev->usage[i] |= ISDN_USAGE_OUTGOING; - isdn_info_update(); - } - printk(KERN_INFO "%s: dialing %d %s... %s\n", p->dev->name, - lp->dialretry, cmd.parm.setup.phone, - (cmd.parm.setup.si1 == 1) ? "DOV" : ""); - lp->dtimer = 0; -#ifdef ISDN_DEBUG_NET_DIAL - printk(KERN_DEBUG "dial: d=%d c=%d\n", lp->isdn_device, - lp->isdn_channel); -#endif - isdn_command(&cmd); - } - lp->huptimer = 0; - lp->outgoing = 1; - if (lp->chargeint) { - lp->hupflags |= ISDN_HAVECHARGE; - lp->hupflags &= ~ISDN_WAITCHARGE; - } else { - lp->hupflags |= ISDN_WAITCHARGE; - lp->hupflags &= ~ISDN_HAVECHARGE; - } - anymore = 1; - lp->dialstate = - (lp->cbdelay && - (lp->flags & ISDN_NET_CBOUT)) ? 12 : 4; - break; - case 4: - /* Wait for D-Channel-connect. - * If timeout, switch back to state 3. - * Dialmax-handling moved to state 3. - */ - if (lp->dtimer++ > ISDN_TIMER_DTIMEOUT10) - lp->dialstate = 3; - anymore = 1; - break; - case 5: - /* Got D-Channel-Connect, send B-Channel-request */ - cmd.driver = lp->isdn_device; - cmd.arg = lp->isdn_channel; - cmd.command = ISDN_CMD_ACCEPTB; - anymore = 1; - lp->dtimer = 0; - lp->dialstate++; - isdn_command(&cmd); - break; - case 6: - /* Wait for B- or D-Channel-connect. If timeout, - * switch back to state 3. 
- */ -#ifdef ISDN_DEBUG_NET_DIAL - printk(KERN_DEBUG "dialtimer2: %d\n", lp->dtimer); -#endif - if (lp->dtimer++ > ISDN_TIMER_DTIMEOUT10) - lp->dialstate = 3; - anymore = 1; - break; - case 7: - /* Got incoming Call, setup L2 and L3 protocols, - * then wait for D-Channel-connect - */ -#ifdef ISDN_DEBUG_NET_DIAL - printk(KERN_DEBUG "dialtimer4: %d\n", lp->dtimer); -#endif - cmd.driver = lp->isdn_device; - cmd.command = ISDN_CMD_SETL2; - cmd.arg = lp->isdn_channel + (lp->l2_proto << 8); - isdn_command(&cmd); - cmd.driver = lp->isdn_device; - cmd.command = ISDN_CMD_SETL3; - cmd.arg = lp->isdn_channel + (lp->l3_proto << 8); - isdn_command(&cmd); - if (lp->dtimer++ > ISDN_TIMER_DTIMEOUT15) - isdn_net_hangup(p->dev); - else { - anymore = 1; - lp->dialstate++; - } - break; - case 9: - /* Got incoming D-Channel-Connect, send B-Channel-request */ - cmd.driver = lp->isdn_device; - cmd.arg = lp->isdn_channel; - cmd.command = ISDN_CMD_ACCEPTB; - isdn_command(&cmd); - anymore = 1; - lp->dtimer = 0; - lp->dialstate++; - break; - case 8: - case 10: - /* Wait for B- or D-channel-connect */ -#ifdef ISDN_DEBUG_NET_DIAL - printk(KERN_DEBUG "dialtimer4: %d\n", lp->dtimer); -#endif - if (lp->dtimer++ > ISDN_TIMER_DTIMEOUT10) - isdn_net_hangup(p->dev); - else - anymore = 1; - break; - case 11: - /* Callback Delay */ - if (lp->dtimer++ > lp->cbdelay) - lp->dialstate = 1; - anymore = 1; - break; - case 12: - /* Remote does callback. Hangup after cbdelay, then wait for incoming - * call (in state 4). 
- */ - if (lp->dtimer++ > lp->cbdelay) - { - printk(KERN_INFO "%s: hangup waiting for callback ...\n", p->dev->name); - lp->dtimer = 0; - lp->dialstate = 4; - cmd.driver = lp->isdn_device; - cmd.command = ISDN_CMD_HANGUP; - cmd.arg = lp->isdn_channel; - isdn_command(&cmd); - isdn_all_eaz(lp->isdn_device, lp->isdn_channel); - } - anymore = 1; - break; - default: - printk(KERN_WARNING "isdn_net: Illegal dialstate %d for device %s\n", - lp->dialstate, p->dev->name); - } - p = (isdn_net_dev *) p->next; - } - isdn_timer_ctrl(ISDN_TIMER_NETDIAL, anymore); -} - -/* - * Perform hangup for a net-interface. - */ -void -isdn_net_hangup(struct net_device *d) -{ - isdn_net_local *lp = netdev_priv(d); - isdn_ctrl cmd; -#ifdef CONFIG_ISDN_X25 - struct concap_proto *cprot = lp->netdev->cprot; - struct concap_proto_ops *pops = cprot ? cprot->pops : NULL; -#endif - - if (lp->flags & ISDN_NET_CONNECTED) { - if (lp->slave != NULL) { - isdn_net_local *slp = ISDN_SLAVE_PRIV(lp); - if (slp->flags & ISDN_NET_CONNECTED) { - printk(KERN_INFO - "isdn_net: hang up slave %s before %s\n", - lp->slave->name, d->name); - isdn_net_hangup(lp->slave); - } - } - printk(KERN_INFO "isdn_net: local hangup %s\n", d->name); -#ifdef CONFIG_ISDN_PPP - if (lp->p_encap == ISDN_NET_ENCAP_SYNCPPP) - isdn_ppp_free(lp); -#endif - isdn_net_lp_disconnected(lp); -#ifdef CONFIG_ISDN_X25 - /* try if there are generic encap protocol - receiver routines and signal the closure of - the link */ - if (pops && pops->disconn_ind) - pops->disconn_ind(cprot); -#endif /* CONFIG_ISDN_X25 */ - - cmd.driver = lp->isdn_device; - cmd.command = ISDN_CMD_HANGUP; - cmd.arg = lp->isdn_channel; - isdn_command(&cmd); - printk(KERN_INFO "%s: Chargesum is %d\n", d->name, lp->charge); - isdn_all_eaz(lp->isdn_device, lp->isdn_channel); - } - isdn_net_unbind_channel(lp); -} - -typedef struct { - __be16 source; - __be16 dest; -} ip_ports; - -static void -isdn_net_log_skb(struct sk_buff *skb, isdn_net_local *lp) -{ - /* hopefully, this was set 
correctly */ - const u_char *p = skb_network_header(skb); - unsigned short proto = ntohs(skb->protocol); - int data_ofs; - ip_ports *ipp; - char addinfo[100]; - - addinfo[0] = '\0'; - /* This check stolen from 2.1.72 dev_queue_xmit_nit() */ - if (p < skb->data || skb_network_header(skb) >= skb_tail_pointer(skb)) { - /* fall back to old isdn_net_log_packet method() */ - char *buf = skb->data; - - printk(KERN_DEBUG "isdn_net: protocol %04x is buggy, dev %s\n", skb->protocol, lp->netdev->dev->name); - p = buf; - proto = ETH_P_IP; - switch (lp->p_encap) { - case ISDN_NET_ENCAP_IPTYP: - proto = ntohs(*(__be16 *)&buf[0]); - p = &buf[2]; - break; - case ISDN_NET_ENCAP_ETHER: - proto = ntohs(*(__be16 *)&buf[12]); - p = &buf[14]; - break; - case ISDN_NET_ENCAP_CISCOHDLC: - proto = ntohs(*(__be16 *)&buf[2]); - p = &buf[4]; - break; -#ifdef CONFIG_ISDN_PPP - case ISDN_NET_ENCAP_SYNCPPP: - proto = ntohs(skb->protocol); - p = &buf[IPPP_MAX_HEADER]; - break; -#endif - } - } - data_ofs = ((p[0] & 15) * 4); - switch (proto) { - case ETH_P_IP: - switch (p[9]) { - case 1: - strcpy(addinfo, " ICMP"); - break; - case 2: - strcpy(addinfo, " IGMP"); - break; - case 4: - strcpy(addinfo, " IPIP"); - break; - case 6: - ipp = (ip_ports *) (&p[data_ofs]); - sprintf(addinfo, " TCP, port: %d -> %d", ntohs(ipp->source), - ntohs(ipp->dest)); - break; - case 8: - strcpy(addinfo, " EGP"); - break; - case 12: - strcpy(addinfo, " PUP"); - break; - case 17: - ipp = (ip_ports *) (&p[data_ofs]); - sprintf(addinfo, " UDP, port: %d -> %d", ntohs(ipp->source), - ntohs(ipp->dest)); - break; - case 22: - strcpy(addinfo, " IDP"); - break; - } - printk(KERN_INFO "OPEN: %pI4 -> %pI4%s\n", - p + 12, p + 16, addinfo); - break; - case ETH_P_ARP: - printk(KERN_INFO "OPEN: ARP %pI4 -> *.*.*.* ?%pI4\n", - p + 14, p + 24); - break; - } -} - -/* - * this function is used to send supervisory data, i.e. data which was - * not received from the network layer, but e.g. frames from ipppd, CCP - * reset frames etc. 
- */ -void isdn_net_write_super(isdn_net_local *lp, struct sk_buff *skb) -{ - if (in_irq()) { - // we can't grab the lock from irq context, - // so we just queue the packet - skb_queue_tail(&lp->super_tx_queue, skb); - schedule_work(&lp->tqueue); - return; - } - - spin_lock_bh(&lp->xmit_lock); - if (!isdn_net_lp_busy(lp)) { - isdn_net_writebuf_skb(lp, skb); - } else { - skb_queue_tail(&lp->super_tx_queue, skb); - } - spin_unlock_bh(&lp->xmit_lock); -} - -/* - * called from tq_immediate - */ -static void isdn_net_softint(struct work_struct *work) -{ - isdn_net_local *lp = container_of(work, isdn_net_local, tqueue); - struct sk_buff *skb; - - spin_lock_bh(&lp->xmit_lock); - while (!isdn_net_lp_busy(lp)) { - skb = skb_dequeue(&lp->super_tx_queue); - if (!skb) - break; - isdn_net_writebuf_skb(lp, skb); - } - spin_unlock_bh(&lp->xmit_lock); -} - -/* - * all frames sent from the (net) LL to a HL driver should go via this function - * it's serialized by the caller holding the lp->xmit_lock spinlock - */ -void isdn_net_writebuf_skb(isdn_net_local *lp, struct sk_buff *skb) -{ - int ret; - int len = skb->len; /* save len */ - - /* before obtaining the lock the caller should have checked that - the lp isn't busy */ - if (isdn_net_lp_busy(lp)) { - printk("isdn BUG at %s:%d!\n", __FILE__, __LINE__); - goto error; - } - - if (!(lp->flags & ISDN_NET_CONNECTED)) { - printk("isdn BUG at %s:%d!\n", __FILE__, __LINE__); - goto error; - } - ret = isdn_writebuf_skb_stub(lp->isdn_device, lp->isdn_channel, 1, skb); - if (ret != len) { - /* we should never get here */ - printk(KERN_WARNING "%s: HL driver queue full\n", lp->netdev->dev->name); - goto error; - } - - lp->transcount += len; - isdn_net_inc_frame_cnt(lp); - return; - -error: - dev_kfree_skb(skb); - lp->stats.tx_errors++; - -} - - -/* - * Helper function for isdn_net_start_xmit. - * When called, the connection is already established. - * Based on cps-calculation, check if device is overloaded. 
- * If so, and if a slave exists, trigger dialing for it. - * If any slave is online, deliver packets using a simple round robin - * scheme. - * - * Return: 0 on success, !0 on failure. - */ - -static int -isdn_net_xmit(struct net_device *ndev, struct sk_buff *skb) -{ - isdn_net_dev *nd; - isdn_net_local *slp; - isdn_net_local *lp = netdev_priv(ndev); - int retv = NETDEV_TX_OK; - - if (((isdn_net_local *) netdev_priv(ndev))->master) { - printk("isdn BUG at %s:%d!\n", __FILE__, __LINE__); - dev_kfree_skb(skb); - return NETDEV_TX_OK; - } - - /* For the other encaps the header has already been built */ -#ifdef CONFIG_ISDN_PPP - if (lp->p_encap == ISDN_NET_ENCAP_SYNCPPP) { - return isdn_ppp_xmit(skb, ndev); - } -#endif - nd = ((isdn_net_local *) netdev_priv(ndev))->netdev; - lp = isdn_net_get_locked_lp(nd); - if (!lp) { - printk(KERN_WARNING "%s: all channels busy - requeuing!\n", ndev->name); - return NETDEV_TX_BUSY; - } - /* we have our lp locked from now on */ - - /* Reset hangup-timeout */ - lp->huptimer = 0; // FIXME? - isdn_net_writebuf_skb(lp, skb); - spin_unlock_bh(&lp->xmit_lock); - - /* the following stuff is here for backwards compatibility. 
- * in future, start-up and hangup of slaves (based on current load) - * should move to userspace and get based on an overall cps - * calculation - */ - if (lp->cps > lp->triggercps) { - if (lp->slave) { - if (!lp->sqfull) { - /* First time overload: set timestamp only */ - lp->sqfull = 1; - lp->sqfull_stamp = jiffies; - } else { - /* subsequent overload: if slavedelay exceeded, start dialing */ - if (time_after(jiffies, lp->sqfull_stamp + lp->slavedelay)) { - slp = ISDN_SLAVE_PRIV(lp); - if (!(slp->flags & ISDN_NET_CONNECTED)) { - isdn_net_force_dial_lp(ISDN_SLAVE_PRIV(lp)); - } - } - } - } - } else { - if (lp->sqfull && time_after(jiffies, lp->sqfull_stamp + lp->slavedelay + (10 * HZ))) { - lp->sqfull = 0; - } - /* this is a hack to allow auto-hangup for slaves on moderate loads */ - nd->queue = nd->local; - } - - return retv; - -} - -static void -isdn_net_adjust_hdr(struct sk_buff *skb, struct net_device *dev) -{ - isdn_net_local *lp = netdev_priv(dev); - if (!skb) - return; - if (lp->p_encap == ISDN_NET_ENCAP_ETHER) { - const int pullsize = skb_network_offset(skb) - ETH_HLEN; - if (pullsize > 0) { - printk(KERN_DEBUG "isdn_net: Pull junk %d\n", pullsize); - skb_pull(skb, pullsize); - } - } -} - - -static void isdn_net_tx_timeout(struct net_device *ndev) -{ - isdn_net_local *lp = netdev_priv(ndev); - - printk(KERN_WARNING "isdn_tx_timeout dev %s dialstate %d\n", ndev->name, lp->dialstate); - if (!lp->dialstate) { - lp->stats.tx_errors++; - /* - * There is a certain probability that this currently - * works at all because if we always wake up the interface, - * then upper layer will try to send the next packet - * immediately. And then, the old clean_up logic in the - * driver will hopefully continue to work as it used to do. - * - * This is rather primitive right know, we better should - * clean internal queues here, in particular for multilink and - * ppp, and reset HL driver's channel, too. 
--HE - * - * actually, this may not matter at all, because ISDN hardware - * should not see transmitter hangs at all IMO - * changed KERN_DEBUG to KERN_WARNING to find out if this is - * ever called --KG - */ - } - netif_trans_update(ndev); - netif_wake_queue(ndev); -} - -/* - * Try sending a packet. - * If this interface isn't connected to a ISDN-Channel, find a free channel, - * and start dialing. - */ -static netdev_tx_t -isdn_net_start_xmit(struct sk_buff *skb, struct net_device *ndev) -{ - isdn_net_local *lp = netdev_priv(ndev); -#ifdef CONFIG_ISDN_X25 - struct concap_proto *cprot = lp->netdev->cprot; -/* At this point hard_start_xmit() passes control to the encapsulation - protocol (if present). - For X.25 auto-dialing is completly bypassed because: - - It does not conform with the semantics of a reliable datalink - service as needed by X.25 PLP. - - I don't want that the interface starts dialing when the network layer - sends a message which requests to disconnect the lapb link (or if it - sends any other message not resulting in data transmission). - Instead, dialing will be initiated by the encapsulation protocol entity - when a dl_establish request is received from the upper layer. 
-*/ - if (cprot && cprot->pops) { - int ret = cprot->pops->encap_and_xmit(cprot, skb); - - if (ret) - netif_stop_queue(ndev); - return ret; - } else -#endif - /* auto-dialing xmit function */ - { -#ifdef ISDN_DEBUG_NET_DUMP - u_char *buf; -#endif - isdn_net_adjust_hdr(skb, ndev); -#ifdef ISDN_DEBUG_NET_DUMP - buf = skb->data; - isdn_dumppkt("S:", buf, skb->len, 40); -#endif - - if (!(lp->flags & ISDN_NET_CONNECTED)) { - int chi; - /* only do autodial if allowed by config */ - if (!(ISDN_NET_DIALMODE(*lp) == ISDN_NET_DM_AUTO)) { - isdn_net_unreachable(ndev, skb, "dial rejected: interface not in dialmode `auto'"); - dev_kfree_skb(skb); - return NETDEV_TX_OK; - } - if (lp->phone[1]) { - ulong flags; - - if (lp->dialwait_timer <= 0) - if (lp->dialstarted > 0 && lp->dialtimeout > 0 && time_before(jiffies, lp->dialstarted + lp->dialtimeout + lp->dialwait)) - lp->dialwait_timer = lp->dialstarted + lp->dialtimeout + lp->dialwait; - - if (lp->dialwait_timer > 0) { - if (time_before(jiffies, lp->dialwait_timer)) { - isdn_net_unreachable(ndev, skb, "dial rejected: retry-time not reached"); - dev_kfree_skb(skb); - return NETDEV_TX_OK; - } else - lp->dialwait_timer = 0; - } - /* Grab a free ISDN-Channel */ - spin_lock_irqsave(&dev->lock, flags); - if (((chi = - isdn_get_free_channel( - ISDN_USAGE_NET, - lp->l2_proto, - lp->l3_proto, - lp->pre_device, - lp->pre_channel, - lp->msn) - ) < 0) && - ((chi = - isdn_get_free_channel( - ISDN_USAGE_NET, - lp->l2_proto, - lp->l3_proto, - lp->pre_device, - lp->pre_channel^1, - lp->msn) - ) < 0)) { - spin_unlock_irqrestore(&dev->lock, flags); - isdn_net_unreachable(ndev, skb, - "No channel"); - dev_kfree_skb(skb); - return NETDEV_TX_OK; - } - /* Log packet, which triggered dialing */ - if (dev->net_verbose) - isdn_net_log_skb(skb, lp); - lp->dialstate = 1; - /* Connect interface with channel */ - isdn_net_bind_channel(lp, chi); -#ifdef CONFIG_ISDN_PPP - if (lp->p_encap == ISDN_NET_ENCAP_SYNCPPP) { - /* no 'first_skb' handling for syncPPP */ 
- if (isdn_ppp_bind(lp) < 0) { - dev_kfree_skb(skb); - isdn_net_unbind_channel(lp); - spin_unlock_irqrestore(&dev->lock, flags); - return NETDEV_TX_OK; /* STN (skb to nirvana) ;) */ - } -#ifdef CONFIG_IPPP_FILTER - if (isdn_ppp_autodial_filter(skb, lp)) { - isdn_ppp_free(lp); - isdn_net_unbind_channel(lp); - spin_unlock_irqrestore(&dev->lock, flags); - isdn_net_unreachable(ndev, skb, "dial rejected: packet filtered"); - dev_kfree_skb(skb); - return NETDEV_TX_OK; - } -#endif - spin_unlock_irqrestore(&dev->lock, flags); - isdn_net_dial(); /* Initiate dialing */ - netif_stop_queue(ndev); - return NETDEV_TX_BUSY; /* let upper layer requeue skb packet */ - } -#endif - /* Initiate dialing */ - spin_unlock_irqrestore(&dev->lock, flags); - isdn_net_dial(); - isdn_net_device_stop_queue(lp); - return NETDEV_TX_BUSY; - } else { - isdn_net_unreachable(ndev, skb, - "No phone number"); - dev_kfree_skb(skb); - return NETDEV_TX_OK; - } - } else { - /* Device is connected to an ISDN channel */ - netif_trans_update(ndev); - if (!lp->dialstate) { - /* ISDN connection is established, try sending */ - int ret; - ret = (isdn_net_xmit(ndev, skb)); - if (ret) netif_stop_queue(ndev); - return ret; - } else - netif_stop_queue(ndev); - } - } - return NETDEV_TX_BUSY; -} - -/* - * Shutdown a net-interface. 
- */ -static int -isdn_net_close(struct net_device *dev) -{ - struct net_device *p; -#ifdef CONFIG_ISDN_X25 - struct concap_proto *cprot = - ((isdn_net_local *)netdev_priv(dev))->netdev->cprot; - /* printk(KERN_DEBUG "isdn_net_close %s\n" , dev-> name); */ -#endif - -#ifdef CONFIG_ISDN_X25 - if (cprot && cprot->pops) cprot->pops->close(cprot); -#endif - netif_stop_queue(dev); - p = MASTER_TO_SLAVE(dev); - if (p) { - /* If this interface has slaves, stop them also */ - while (p) { -#ifdef CONFIG_ISDN_X25 - cprot = ((isdn_net_local *)netdev_priv(p)) - ->netdev->cprot; - if (cprot && cprot->pops) - cprot->pops->close(cprot); -#endif - isdn_net_hangup(p); - p = MASTER_TO_SLAVE(p); - } - } - isdn_net_hangup(dev); - isdn_unlock_drivers(); - return 0; -} - -/* - * Get statistics - */ -static struct net_device_stats * -isdn_net_get_stats(struct net_device *dev) -{ - isdn_net_local *lp = netdev_priv(dev); - return &lp->stats; -} - -/* This is simply a copy from std. eth.c EXCEPT we pull ETH_HLEN - * instead of dev->hard_header_len off. This is done because the - * lowlevel-driver has already pulled off its stuff when we get - * here and this routine only gets called with p_encap == ETHER. - * Determine the packet's protocol ID. The rule here is that we - * assume 802.3 if the type field is short enough to be a length. - * This is normal practice and works for any 'now in use' protocol. - */ - -static __be16 -isdn_net_type_trans(struct sk_buff *skb, struct net_device *dev) -{ - struct ethhdr *eth; - unsigned char *rawp; - - skb_reset_mac_header(skb); - skb_pull(skb, ETH_HLEN); - eth = eth_hdr(skb); - - if (*eth->h_dest & 1) { - if (ether_addr_equal(eth->h_dest, dev->broadcast)) - skb->pkt_type = PACKET_BROADCAST; - else - skb->pkt_type = PACKET_MULTICAST; - } - /* - * This ALLMULTI check should be redundant by 1.4 - * so don't forget to remove it. 
- */ - - else if (dev->flags & (IFF_PROMISC /*| IFF_ALLMULTI*/)) { - if (!ether_addr_equal(eth->h_dest, dev->dev_addr)) - skb->pkt_type = PACKET_OTHERHOST; - } - if (ntohs(eth->h_proto) >= ETH_P_802_3_MIN) - return eth->h_proto; - - rawp = skb->data; - - /* - * This is a magic hack to spot IPX packets. Older Novell breaks - * the protocol design and runs IPX over 802.3 without an 802.2 LLC - * layer. We look for FFFF which isn't a used 802.2 SSAP/DSAP. This - * won't work for fault tolerant netware but does for the rest. - */ - if (*(unsigned short *) rawp == 0xFFFF) - return htons(ETH_P_802_3); - /* - * Real 802.2 LLC - */ - return htons(ETH_P_802_2); -} - - -/* - * CISCO HDLC keepalive specific stuff - */ -static struct sk_buff* -isdn_net_ciscohdlck_alloc_skb(isdn_net_local *lp, int len) -{ - unsigned short hl = dev->drv[lp->isdn_device]->interface->hl_hdrlen; - struct sk_buff *skb; - - skb = alloc_skb(hl + len, GFP_ATOMIC); - if (skb) - skb_reserve(skb, hl); - else - printk("isdn out of mem at %s:%d!\n", __FILE__, __LINE__); - return skb; -} - -/* cisco hdlck device private ioctls */ -static int -isdn_ciscohdlck_dev_ioctl(struct net_device *dev, struct ifreq *ifr, int cmd) -{ - isdn_net_local *lp = netdev_priv(dev); - unsigned long len = 0; - unsigned long expires = 0; - int tmp = 0; - int period = lp->cisco_keepalive_period; - s8 debserint = lp->cisco_debserint; - int rc = 0; - - if (lp->p_encap != ISDN_NET_ENCAP_CISCOHDLCK) - return -EINVAL; - - switch (cmd) { - /* get/set keepalive period */ - case SIOCGKEEPPERIOD: - len = (unsigned long)sizeof(lp->cisco_keepalive_period); - if (copy_to_user(ifr->ifr_data, - &lp->cisco_keepalive_period, len)) - rc = -EFAULT; - break; - case SIOCSKEEPPERIOD: - tmp = lp->cisco_keepalive_period; - len = (unsigned long)sizeof(lp->cisco_keepalive_period); - if (copy_from_user(&period, ifr->ifr_data, len)) - rc = -EFAULT; - if ((period > 0) && (period <= 32767)) - lp->cisco_keepalive_period = period; - else - rc = -EINVAL; - if 
(!rc && (tmp != lp->cisco_keepalive_period)) { - expires = (unsigned long)(jiffies + - lp->cisco_keepalive_period * HZ); - mod_timer(&lp->cisco_timer, expires); - printk(KERN_INFO "%s: Keepalive period set " - "to %d seconds.\n", - dev->name, lp->cisco_keepalive_period); - } - break; - - /* get/set debugging */ - case SIOCGDEBSERINT: - len = (unsigned long)sizeof(lp->cisco_debserint); - if (copy_to_user(ifr->ifr_data, - &lp->cisco_debserint, len)) - rc = -EFAULT; - break; - case SIOCSDEBSERINT: - len = (unsigned long)sizeof(lp->cisco_debserint); - if (copy_from_user(&debserint, - ifr->ifr_data, len)) - rc = -EFAULT; - if ((debserint >= 0) && (debserint <= 64)) - lp->cisco_debserint = debserint; - else - rc = -EINVAL; - break; - - default: - rc = -EINVAL; - break; - } - return (rc); -} - - -static int isdn_net_ioctl(struct net_device *dev, - struct ifreq *ifr, int cmd) -{ - isdn_net_local *lp = netdev_priv(dev); - - switch (lp->p_encap) { -#ifdef CONFIG_ISDN_PPP - case ISDN_NET_ENCAP_SYNCPPP: - return isdn_ppp_dev_ioctl(dev, ifr, cmd); -#endif - case ISDN_NET_ENCAP_CISCOHDLCK: - return isdn_ciscohdlck_dev_ioctl(dev, ifr, cmd); - default: - return -EINVAL; - } -} - -/* called via cisco_timer.function */ -static void -isdn_net_ciscohdlck_slarp_send_keepalive(struct timer_list *t) -{ - isdn_net_local *lp = from_timer(lp, t, cisco_timer); - struct sk_buff *skb; - unsigned char *p; - unsigned long last_cisco_myseq = lp->cisco_myseq; - int myseq_diff = 0; - - if (!(lp->flags & ISDN_NET_CONNECTED) || lp->dialstate) { - printk("isdn BUG at %s:%d!\n", __FILE__, __LINE__); - return; - } - lp->cisco_myseq++; - - myseq_diff = (lp->cisco_myseq - lp->cisco_mineseen); - if ((lp->cisco_line_state) && ((myseq_diff >= 3) || (myseq_diff <= -3))) { - /* line up -> down */ - lp->cisco_line_state = 0; - printk(KERN_WARNING - "UPDOWN: Line protocol on Interface %s," - " changed state to down\n", lp->netdev->dev->name); - /* should stop routing higher-level data across */ - } else if 
((!lp->cisco_line_state) && - (myseq_diff >= 0) && (myseq_diff <= 2)) { - /* line down -> up */ - lp->cisco_line_state = 1; - printk(KERN_WARNING - "UPDOWN: Line protocol on Interface %s," - " changed state to up\n", lp->netdev->dev->name); - /* restart routing higher-level data across */ - } - - if (lp->cisco_debserint) - printk(KERN_DEBUG "%s: HDLC " - "myseq %lu, mineseen %lu%c, yourseen %lu, %s\n", - lp->netdev->dev->name, last_cisco_myseq, lp->cisco_mineseen, - ((last_cisco_myseq == lp->cisco_mineseen) ? '*' : 040), - lp->cisco_yourseq, - ((lp->cisco_line_state) ? "line up" : "line down")); - - skb = isdn_net_ciscohdlck_alloc_skb(lp, 4 + 14); - if (!skb) - return; - - p = skb_put(skb, 4 + 14); - - /* cisco header */ - *(u8 *)(p + 0) = CISCO_ADDR_UNICAST; - *(u8 *)(p + 1) = CISCO_CTRL; - *(__be16 *)(p + 2) = cpu_to_be16(CISCO_TYPE_SLARP); - - /* slarp keepalive */ - *(__be32 *)(p + 4) = cpu_to_be32(CISCO_SLARP_KEEPALIVE); - *(__be32 *)(p + 8) = cpu_to_be32(lp->cisco_myseq); - *(__be32 *)(p + 12) = cpu_to_be32(lp->cisco_yourseq); - *(__be16 *)(p + 16) = cpu_to_be16(0xffff); // reliability, always 0xffff - p += 18; - - isdn_net_write_super(lp, skb); - - lp->cisco_timer.expires = jiffies + lp->cisco_keepalive_period * HZ; - - add_timer(&lp->cisco_timer); -} - -static void -isdn_net_ciscohdlck_slarp_send_request(isdn_net_local *lp) -{ - struct sk_buff *skb; - unsigned char *p; - - skb = isdn_net_ciscohdlck_alloc_skb(lp, 4 + 14); - if (!skb) - return; - - p = skb_put(skb, 4 + 14); - - /* cisco header */ - *(u8 *)(p + 0) = CISCO_ADDR_UNICAST; - *(u8 *)(p + 1) = CISCO_CTRL; - *(__be16 *)(p + 2) = cpu_to_be16(CISCO_TYPE_SLARP); - - /* slarp request */ - *(__be32 *)(p + 4) = cpu_to_be32(CISCO_SLARP_REQUEST); - *(__be32 *)(p + 8) = cpu_to_be32(0); // address - *(__be32 *)(p + 12) = cpu_to_be32(0); // netmask - *(__be16 *)(p + 16) = cpu_to_be16(0); // unused - p += 18; - - isdn_net_write_super(lp, skb); -} - -static void -isdn_net_ciscohdlck_connected(isdn_net_local *lp) 
-{ - lp->cisco_myseq = 0; - lp->cisco_mineseen = 0; - lp->cisco_yourseq = 0; - lp->cisco_keepalive_period = ISDN_TIMER_KEEPINT; - lp->cisco_last_slarp_in = 0; - lp->cisco_line_state = 0; - lp->cisco_debserint = 0; - - /* send slarp request because interface/seq.no.s reset */ - isdn_net_ciscohdlck_slarp_send_request(lp); - - timer_setup(&lp->cisco_timer, - isdn_net_ciscohdlck_slarp_send_keepalive, 0); - lp->cisco_timer.expires = jiffies + lp->cisco_keepalive_period * HZ; - add_timer(&lp->cisco_timer); -} - -static void -isdn_net_ciscohdlck_disconnected(isdn_net_local *lp) -{ - del_timer(&lp->cisco_timer); -} - -static void -isdn_net_ciscohdlck_slarp_send_reply(isdn_net_local *lp) -{ - struct sk_buff *skb; - unsigned char *p; - struct in_device *in_dev = NULL; - __be32 addr = 0; /* local ipv4 address */ - __be32 mask = 0; /* local netmask */ - - if ((in_dev = lp->netdev->dev->ip_ptr) != NULL) { - /* take primary(first) address of interface */ - struct in_ifaddr *ifa = in_dev->ifa_list; - if (ifa != NULL) { - addr = ifa->ifa_local; - mask = ifa->ifa_mask; - } - } - - skb = isdn_net_ciscohdlck_alloc_skb(lp, 4 + 14); - if (!skb) - return; - - p = skb_put(skb, 4 + 14); - - /* cisco header */ - *(u8 *)(p + 0) = CISCO_ADDR_UNICAST; - *(u8 *)(p + 1) = CISCO_CTRL; - *(__be16 *)(p + 2) = cpu_to_be16(CISCO_TYPE_SLARP); - - /* slarp reply, send own ip/netmask; if values are nonsense remote - * should think we are unable to provide it with an address via SLARP */ - *(__be32 *)(p + 4) = cpu_to_be32(CISCO_SLARP_REPLY); - *(__be32 *)(p + 8) = addr; // address - *(__be32 *)(p + 12) = mask; // netmask - *(__be16 *)(p + 16) = cpu_to_be16(0); // unused - p += 18; - - isdn_net_write_super(lp, skb); -} - -static void -isdn_net_ciscohdlck_slarp_in(isdn_net_local *lp, struct sk_buff *skb) -{ - unsigned char *p; - int period; - u32 code; - u32 my_seq; - u32 your_seq; - __be32 local; - __be32 *addr, *mask; - - if (skb->len < 14) - return; - - p = skb->data; - code = be32_to_cpup((__be32 
*)p); - p += 4; - - switch (code) { - case CISCO_SLARP_REQUEST: - lp->cisco_yourseq = 0; - isdn_net_ciscohdlck_slarp_send_reply(lp); - break; - case CISCO_SLARP_REPLY: - addr = (__be32 *)p; - mask = (__be32 *)(p + 4); - if (*mask != cpu_to_be32(0xfffffffc)) - goto slarp_reply_out; - if ((*addr & cpu_to_be32(3)) == cpu_to_be32(0) || - (*addr & cpu_to_be32(3)) == cpu_to_be32(3)) - goto slarp_reply_out; - local = *addr ^ cpu_to_be32(3); - printk(KERN_INFO "%s: got slarp reply: remote ip: %pI4, local ip: %pI4 mask: %pI4\n", - lp->netdev->dev->name, addr, &local, mask); - break; - slarp_reply_out: - printk(KERN_INFO "%s: got invalid slarp reply (%pI4/%pI4) - ignored\n", - lp->netdev->dev->name, addr, mask); - break; - case CISCO_SLARP_KEEPALIVE: - period = (int)((jiffies - lp->cisco_last_slarp_in - + HZ / 2 - 1) / HZ); - if (lp->cisco_debserint && - (period != lp->cisco_keepalive_period) && - lp->cisco_last_slarp_in) { - printk(KERN_DEBUG "%s: Keepalive period mismatch - " - "is %d but should be %d.\n", - lp->netdev->dev->name, period, - lp->cisco_keepalive_period); - } - lp->cisco_last_slarp_in = jiffies; - my_seq = be32_to_cpup((__be32 *)(p + 0)); - your_seq = be32_to_cpup((__be32 *)(p + 4)); - p += 10; - lp->cisco_yourseq = my_seq; - lp->cisco_mineseen = your_seq; - break; - } -} - -static void -isdn_net_ciscohdlck_receive(isdn_net_local *lp, struct sk_buff *skb) -{ - unsigned char *p; - u8 addr; - u8 ctrl; - u16 type; - - if (skb->len < 4) - goto out_free; - - p = skb->data; - addr = *(u8 *)(p + 0); - ctrl = *(u8 *)(p + 1); - type = be16_to_cpup((__be16 *)(p + 2)); - p += 4; - skb_pull(skb, 4); - - if (addr != CISCO_ADDR_UNICAST && addr != CISCO_ADDR_BROADCAST) { - printk(KERN_WARNING "%s: Unknown Cisco addr 0x%02x\n", - lp->netdev->dev->name, addr); - goto out_free; - } - if (ctrl != CISCO_CTRL) { - printk(KERN_WARNING "%s: Unknown Cisco ctrl 0x%02x\n", - lp->netdev->dev->name, ctrl); - goto out_free; - } - - switch (type) { - case CISCO_TYPE_SLARP: - 
isdn_net_ciscohdlck_slarp_in(lp, skb); - goto out_free; - case CISCO_TYPE_CDP: - if (lp->cisco_debserint) - printk(KERN_DEBUG "%s: Received CDP packet. use " - "\"no cdp enable\" on cisco.\n", - lp->netdev->dev->name); - goto out_free; - default: - /* no special cisco protocol */ - skb->protocol = htons(type); - netif_rx(skb); - return; - } - -out_free: - kfree_skb(skb); -} - -/* - * Got a packet from ISDN-Channel. - */ -static void -isdn_net_receive(struct net_device *ndev, struct sk_buff *skb) -{ - isdn_net_local *lp = netdev_priv(ndev); - isdn_net_local *olp = lp; /* original 'lp' */ -#ifdef CONFIG_ISDN_X25 - struct concap_proto *cprot = lp->netdev->cprot; -#endif - lp->transcount += skb->len; - - lp->stats.rx_packets++; - lp->stats.rx_bytes += skb->len; - if (lp->master) { - /* Bundling: If device is a slave-device, deliver to master, also - * handle master's statistics and hangup-timeout - */ - ndev = lp->master; - lp = netdev_priv(ndev); - lp->stats.rx_packets++; - lp->stats.rx_bytes += skb->len; - } - skb->dev = ndev; - skb->pkt_type = PACKET_HOST; - skb_reset_mac_header(skb); -#ifdef ISDN_DEBUG_NET_DUMP - isdn_dumppkt("R:", skb->data, skb->len, 40); -#endif - switch (lp->p_encap) { - case ISDN_NET_ENCAP_ETHER: - /* Ethernet over ISDN */ - olp->huptimer = 0; - lp->huptimer = 0; - skb->protocol = isdn_net_type_trans(skb, ndev); - break; - case ISDN_NET_ENCAP_UIHDLC: - /* HDLC with UI-frame (for ispa with -h1 option) */ - olp->huptimer = 0; - lp->huptimer = 0; - skb_pull(skb, 2); - /* Fall through */ - case ISDN_NET_ENCAP_RAWIP: - /* RAW-IP without MAC-Header */ - olp->huptimer = 0; - lp->huptimer = 0; - skb->protocol = htons(ETH_P_IP); - break; - case ISDN_NET_ENCAP_CISCOHDLCK: - isdn_net_ciscohdlck_receive(lp, skb); - return; - case ISDN_NET_ENCAP_CISCOHDLC: - /* CISCO-HDLC IP with type field and fake I-frame-header */ - skb_pull(skb, 2); - /* Fall through */ - case ISDN_NET_ENCAP_IPTYP: - /* IP with type field */ - olp->huptimer = 0; - lp->huptimer = 0; - 
skb->protocol = *(__be16 *)&(skb->data[0]); - skb_pull(skb, 2); - if (*(unsigned short *) skb->data == 0xFFFF) - skb->protocol = htons(ETH_P_802_3); - break; -#ifdef CONFIG_ISDN_PPP - case ISDN_NET_ENCAP_SYNCPPP: - /* huptimer is done in isdn_ppp_push_higher */ - isdn_ppp_receive(lp->netdev, olp, skb); - return; -#endif - - default: -#ifdef CONFIG_ISDN_X25 - /* try if there are generic sync_device receiver routines */ - if (cprot) if (cprot->pops) - if (cprot->pops->data_ind) { - cprot->pops->data_ind(cprot, skb); - return; - }; -#endif /* CONFIG_ISDN_X25 */ - printk(KERN_WARNING "%s: unknown encapsulation, dropping\n", - lp->netdev->dev->name); - kfree_skb(skb); - return; - } - - netif_rx(skb); - return; -} - -/* - * A packet arrived via ISDN. Search interface-chain for a corresponding - * interface. If found, deliver packet to receiver-function and return 1, - * else return 0. - */ -int -isdn_net_rcv_skb(int idx, struct sk_buff *skb) -{ - isdn_net_dev *p = dev->rx_netdev[idx]; - - if (p) { - isdn_net_local *lp = p->local; - if ((lp->flags & ISDN_NET_CONNECTED) && - (!lp->dialstate)) { - isdn_net_receive(p->dev, skb); - return 1; - } - } - return 0; -} - -/* - * build an header - * depends on encaps that is being used. - */ - -static int isdn_net_header(struct sk_buff *skb, struct net_device *dev, - unsigned short type, - const void *daddr, const void *saddr, unsigned plen) -{ - isdn_net_local *lp = netdev_priv(dev); - unsigned char *p; - int len = 0; - - switch (lp->p_encap) { - case ISDN_NET_ENCAP_ETHER: - len = eth_header(skb, dev, type, daddr, saddr, plen); - break; -#ifdef CONFIG_ISDN_PPP - case ISDN_NET_ENCAP_SYNCPPP: - /* stick on a fake header to keep fragmentation code happy. 
*/ - len = IPPP_MAX_HEADER; - skb_push(skb, len); - break; -#endif - case ISDN_NET_ENCAP_RAWIP: - printk(KERN_WARNING "isdn_net_header called with RAW_IP!\n"); - len = 0; - break; - case ISDN_NET_ENCAP_IPTYP: - /* ethernet type field */ - *((__be16 *)skb_push(skb, 2)) = htons(type); - len = 2; - break; - case ISDN_NET_ENCAP_UIHDLC: - /* HDLC with UI-Frames (for ispa with -h1 option) */ - *((__be16 *)skb_push(skb, 2)) = htons(0x0103); - len = 2; - break; - case ISDN_NET_ENCAP_CISCOHDLC: - case ISDN_NET_ENCAP_CISCOHDLCK: - p = skb_push(skb, 4); - *(u8 *)(p + 0) = CISCO_ADDR_UNICAST; - *(u8 *)(p + 1) = CISCO_CTRL; - *(__be16 *)(p + 2) = cpu_to_be16(type); - p += 4; - len = 4; - break; -#ifdef CONFIG_ISDN_X25 - default: - /* try if there are generic concap protocol routines */ - if (lp->netdev->cprot) { - printk(KERN_WARNING "isdn_net_header called with concap_proto!\n"); - len = 0; - break; - } - break; -#endif /* CONFIG_ISDN_X25 */ - } - return len; -} - -static int isdn_header_cache(const struct neighbour *neigh, struct hh_cache *hh, - __be16 type) -{ - const struct net_device *dev = neigh->dev; - isdn_net_local *lp = netdev_priv(dev); - - if (lp->p_encap == ISDN_NET_ENCAP_ETHER) - return eth_header_cache(neigh, hh, type); - return -1; -} - -static void isdn_header_cache_update(struct hh_cache *hh, - const struct net_device *dev, - const unsigned char *haddr) -{ - isdn_net_local *lp = netdev_priv(dev); - if (lp->p_encap == ISDN_NET_ENCAP_ETHER) - eth_header_cache_update(hh, dev, haddr); -} - -static const struct header_ops isdn_header_ops = { - .create = isdn_net_header, - .cache = isdn_header_cache, - .cache_update = isdn_header_cache_update, -}; - -/* - * Interface-setup. 
(just after registering a new interface) - */ -static int -isdn_net_init(struct net_device *ndev) -{ - ushort max_hlhdr_len = 0; - int drvidx; - - /* - * up till binding we ask the protocol layer to reserve as much - * as we might need for HL layer - */ - - for (drvidx = 0; drvidx < ISDN_MAX_DRIVERS; drvidx++) - if (dev->drv[drvidx]) - if (max_hlhdr_len < dev->drv[drvidx]->interface->hl_hdrlen) - max_hlhdr_len = dev->drv[drvidx]->interface->hl_hdrlen; - - ndev->hard_header_len = ETH_HLEN + max_hlhdr_len; - return 0; -} - -static void -isdn_net_swapbind(int drvidx) -{ - isdn_net_dev *p; - -#ifdef ISDN_DEBUG_NET_ICALL - printk(KERN_DEBUG "n_fi: swapping ch of %d\n", drvidx); -#endif - p = dev->netdev; - while (p) { - if (p->local->pre_device == drvidx) - switch (p->local->pre_channel) { - case 0: - p->local->pre_channel = 1; - break; - case 1: - p->local->pre_channel = 0; - break; - } - p = (isdn_net_dev *) p->next; - } -} - -static void -isdn_net_swap_usage(int i1, int i2) -{ - int u1 = dev->usage[i1] & ISDN_USAGE_EXCLUSIVE; - int u2 = dev->usage[i2] & ISDN_USAGE_EXCLUSIVE; - -#ifdef ISDN_DEBUG_NET_ICALL - printk(KERN_DEBUG "n_fi: usage of %d and %d\n", i1, i2); -#endif - dev->usage[i1] &= ~ISDN_USAGE_EXCLUSIVE; - dev->usage[i1] |= u2; - dev->usage[i2] &= ~ISDN_USAGE_EXCLUSIVE; - dev->usage[i2] |= u1; - isdn_info_update(); -} - -/* - * An incoming call-request has arrived. - * Search the interface-chain for an appropriate interface. - * If found, connect the interface to the ISDN-channel and initiate - * D- and B-Channel-setup. If secure-flag is set, accept only - * configured phone-numbers. If callback-flag is set, initiate - * callback-dialing. - * - * Return-Value: 0 = No appropriate interface for this call. - * 1 = Call accepted - * 2 = Reject call, wait cbdelay, then call back - * 3 = Reject call - * 4 = Wait cbdelay, then call back - * 5 = No appropriate interface for this call, - * would eventually match if CID was longer. 
- */ - -int -isdn_net_find_icall(int di, int ch, int idx, setup_parm *setup) -{ - char *eaz; - int si1; - int si2; - int ematch; - int wret; - int swapped; - int sidx = 0; - u_long flags; - isdn_net_dev *p; - isdn_net_phone *n; - char nr[ISDN_MSNLEN]; - char *my_eaz; - - /* Search name in netdev-chain */ - if (!setup->phone[0]) { - nr[0] = '0'; - nr[1] = '\0'; - printk(KERN_INFO "isdn_net: Incoming call without OAD, assuming '0'\n"); - } else - strlcpy(nr, setup->phone, ISDN_MSNLEN); - si1 = (int) setup->si1; - si2 = (int) setup->si2; - if (!setup->eazmsn[0]) { - printk(KERN_WARNING "isdn_net: Incoming call without CPN, assuming '0'\n"); - eaz = "0"; - } else - eaz = setup->eazmsn; - if (dev->net_verbose > 1) - printk(KERN_INFO "isdn_net: call from %s,%d,%d -> %s\n", nr, si1, si2, eaz); - /* Accept DATA and VOICE calls at this stage - * local eaz is checked later for allowed call types - */ - if ((si1 != 7) && (si1 != 1)) { - if (dev->net_verbose > 1) - printk(KERN_INFO "isdn_net: Service-Indicator not 1 or 7, ignored\n"); - return 0; - } - n = (isdn_net_phone *) 0; - p = dev->netdev; - ematch = wret = swapped = 0; -#ifdef ISDN_DEBUG_NET_ICALL - printk(KERN_DEBUG "n_fi: di=%d ch=%d idx=%d usg=%d\n", di, ch, idx, - dev->usage[idx]); -#endif - while (p) { - int matchret; - isdn_net_local *lp = p->local; - - /* If last check has triggered as binding-swap, revert it */ - switch (swapped) { - case 2: - isdn_net_swap_usage(idx, sidx); - /* fall through */ - case 1: - isdn_net_swapbind(di); - break; - } - swapped = 0; - /* check acceptable call types for DOV */ - my_eaz = isdn_map_eaz2msn(lp->msn, di); - if (si1 == 1) { /* it's a DOV call, check if we allow it */ - if (*my_eaz == 'v' || *my_eaz == 'V' || - *my_eaz == 'b' || *my_eaz == 'B') - my_eaz++; /* skip to allow a match */ - else - my_eaz = NULL; /* force non match */ - } else { /* it's a DATA call, check if we allow it */ - if (*my_eaz == 'b' || *my_eaz == 'B') - my_eaz++; /* skip to allow a match */ - } - if 
(my_eaz) - matchret = isdn_msncmp(eaz, my_eaz); - else - matchret = 1; - if (!matchret) - ematch = 1; - - /* Remember if more numbers eventually can match */ - if (matchret > wret) - wret = matchret; -#ifdef ISDN_DEBUG_NET_ICALL - printk(KERN_DEBUG "n_fi: if='%s', l.msn=%s, l.flags=%d, l.dstate=%d\n", - p->dev->name, lp->msn, lp->flags, lp->dialstate); -#endif - if ((!matchret) && /* EAZ is matching */ - (((!(lp->flags & ISDN_NET_CONNECTED)) && /* but not connected */ - (USG_NONE(dev->usage[idx]))) || /* and ch. unused or */ - ((((lp->dialstate == 4) || (lp->dialstate == 12)) && /* if dialing */ - (!(lp->flags & ISDN_NET_CALLBACK))) /* but no callback */ - ))) - { -#ifdef ISDN_DEBUG_NET_ICALL - printk(KERN_DEBUG "n_fi: match1, pdev=%d pch=%d\n", - lp->pre_device, lp->pre_channel); -#endif - if (dev->usage[idx] & ISDN_USAGE_EXCLUSIVE) { - if ((lp->pre_channel != ch) || - (lp->pre_device != di)) { - /* Here we got a problem: - * If using an ICN-Card, an incoming call is always signaled on - * on the first channel of the card, if both channels are - * down. However this channel may be bound exclusive. If the - * second channel is free, this call should be accepted. - * The solution is horribly but it runs, so what: - * We exchange the exclusive bindings of the two channels, the - * corresponding variables in the interface-structs. - */ - if (ch == 0) { - sidx = isdn_dc2minor(di, 1); -#ifdef ISDN_DEBUG_NET_ICALL - printk(KERN_DEBUG "n_fi: ch is 0\n"); -#endif - if (USG_NONE(dev->usage[sidx])) { - /* Second Channel is free, now see if it is bound - * exclusive too. */ - if (dev->usage[sidx] & ISDN_USAGE_EXCLUSIVE) { -#ifdef ISDN_DEBUG_NET_ICALL - printk(KERN_DEBUG "n_fi: 2nd channel is down and bound\n"); -#endif - /* Yes, swap bindings only, if the original - * binding is bound to channel 1 of this driver */ - if ((lp->pre_device == di) && - (lp->pre_channel == 1)) { - isdn_net_swapbind(di); - swapped = 1; - } else { - /* ... 
else iterate next device */ - p = (isdn_net_dev *) p->next; - continue; - } - } else { -#ifdef ISDN_DEBUG_NET_ICALL - printk(KERN_DEBUG "n_fi: 2nd channel is down and unbound\n"); -#endif - /* No, swap always and swap excl-usage also */ - isdn_net_swap_usage(idx, sidx); - isdn_net_swapbind(di); - swapped = 2; - } - /* Now check for exclusive binding again */ -#ifdef ISDN_DEBUG_NET_ICALL - printk(KERN_DEBUG "n_fi: final check\n"); -#endif - if ((dev->usage[idx] & ISDN_USAGE_EXCLUSIVE) && - ((lp->pre_channel != ch) || - (lp->pre_device != di))) { -#ifdef ISDN_DEBUG_NET_ICALL - printk(KERN_DEBUG "n_fi: final check failed\n"); -#endif - p = (isdn_net_dev *) p->next; - continue; - } - } - } else { - /* We are already on the second channel, so nothing to do */ -#ifdef ISDN_DEBUG_NET_ICALL - printk(KERN_DEBUG "n_fi: already on 2nd channel\n"); -#endif - } - } - } -#ifdef ISDN_DEBUG_NET_ICALL - printk(KERN_DEBUG "n_fi: match2\n"); -#endif - n = lp->phone[0]; - if (lp->flags & ISDN_NET_SECURE) { - while (n) { - if (!isdn_msncmp(nr, n->num)) - break; - n = (isdn_net_phone *) n->next; - } - } - if (n || (!(lp->flags & ISDN_NET_SECURE))) { -#ifdef ISDN_DEBUG_NET_ICALL - printk(KERN_DEBUG "n_fi: match3\n"); -#endif - /* matching interface found */ - - /* - * Is the state STOPPED? - * If so, no dialin is allowed, - * so reject actively. - * */ - if (ISDN_NET_DIALMODE(*lp) == ISDN_NET_DM_OFF) { - printk(KERN_INFO "incoming call, interface %s `stopped' -> rejected\n", - p->dev->name); - return 3; - } - /* - * Is the interface up? - * If not, reject the call actively. - */ - if (!isdn_net_device_started(p)) { - printk(KERN_INFO "%s: incoming call, interface down -> rejected\n", - p->dev->name); - return 3; - } - /* Interface is up, now see if it's a slave. If so, see if - * it's master and parent slave is online. If not, reject the call. 
- */ - if (lp->master) { - isdn_net_local *mlp = ISDN_MASTER_PRIV(lp); - printk(KERN_DEBUG "ICALLslv: %s\n", p->dev->name); - printk(KERN_DEBUG "master=%s\n", lp->master->name); - if (mlp->flags & ISDN_NET_CONNECTED) { - printk(KERN_DEBUG "master online\n"); - /* Master is online, find parent-slave (master if first slave) */ - while (mlp->slave) { - if (ISDN_SLAVE_PRIV(mlp) == lp) - break; - mlp = ISDN_SLAVE_PRIV(mlp); - } - } else - printk(KERN_DEBUG "master offline\n"); - /* Found parent, if it's offline iterate next device */ - printk(KERN_DEBUG "mlpf: %d\n", mlp->flags & ISDN_NET_CONNECTED); - if (!(mlp->flags & ISDN_NET_CONNECTED)) { - p = (isdn_net_dev *) p->next; - continue; - } - } - if (lp->flags & ISDN_NET_CALLBACK) { - int chi; - /* - * Is the state MANUAL? - * If so, no callback can be made, - * so reject actively. - * */ - if (ISDN_NET_DIALMODE(*lp) == ISDN_NET_DM_OFF) { - printk(KERN_INFO "incoming call for callback, interface %s `off' -> rejected\n", - p->dev->name); - return 3; - } - printk(KERN_DEBUG "%s: call from %s -> %s, start callback\n", - p->dev->name, nr, eaz); - if (lp->phone[1]) { - /* Grab a free ISDN-Channel */ - spin_lock_irqsave(&dev->lock, flags); - if ((chi = - isdn_get_free_channel( - ISDN_USAGE_NET, - lp->l2_proto, - lp->l3_proto, - lp->pre_device, - lp->pre_channel, - lp->msn) - ) < 0) { - - printk(KERN_WARNING "isdn_net_find_icall: No channel for %s\n", - p->dev->name); - spin_unlock_irqrestore(&dev->lock, flags); - return 0; - } - /* Setup dialstate. */ - lp->dtimer = 0; - lp->dialstate = 11; - /* Connect interface with channel */ - isdn_net_bind_channel(lp, chi); -#ifdef CONFIG_ISDN_PPP - if (lp->p_encap == ISDN_NET_ENCAP_SYNCPPP) - if (isdn_ppp_bind(lp) < 0) { - spin_unlock_irqrestore(&dev->lock, flags); - isdn_net_unbind_channel(lp); - return 0; - } -#endif - spin_unlock_irqrestore(&dev->lock, flags); - /* Initiate dialing by returning 2 or 4 */ - return (lp->flags & ISDN_NET_CBHUP) ? 
2 : 4; - } else - printk(KERN_WARNING "isdn_net: %s: No phone number\n", - p->dev->name); - return 0; - } else { - printk(KERN_DEBUG "%s: call from %s -> %s accepted\n", - p->dev->name, nr, eaz); - /* if this interface is dialing, it does it probably on a different - device, so free this device */ - if ((lp->dialstate == 4) || (lp->dialstate == 12)) { -#ifdef CONFIG_ISDN_PPP - if (lp->p_encap == ISDN_NET_ENCAP_SYNCPPP) - isdn_ppp_free(lp); -#endif - isdn_net_lp_disconnected(lp); - isdn_free_channel(lp->isdn_device, lp->isdn_channel, - ISDN_USAGE_NET); - } - spin_lock_irqsave(&dev->lock, flags); - dev->usage[idx] &= ISDN_USAGE_EXCLUSIVE; - dev->usage[idx] |= ISDN_USAGE_NET; - strcpy(dev->num[idx], nr); - isdn_info_update(); - dev->st_netdev[idx] = lp->netdev; - lp->isdn_device = di; - lp->isdn_channel = ch; - lp->ppp_slot = -1; - lp->flags |= ISDN_NET_CONNECTED; - lp->dialstate = 7; - lp->dtimer = 0; - lp->outgoing = 0; - lp->huptimer = 0; - lp->hupflags |= ISDN_WAITCHARGE; - lp->hupflags &= ~ISDN_HAVECHARGE; -#ifdef CONFIG_ISDN_PPP - if (lp->p_encap == ISDN_NET_ENCAP_SYNCPPP) { - if (isdn_ppp_bind(lp) < 0) { - isdn_net_unbind_channel(lp); - spin_unlock_irqrestore(&dev->lock, flags); - return 0; - } - } -#endif - spin_unlock_irqrestore(&dev->lock, flags); - return 1; - } - } - } - p = (isdn_net_dev *) p->next; - } - /* If none of configured EAZ/MSN matched and not verbose, be silent */ - if (!ematch || dev->net_verbose) - printk(KERN_INFO "isdn_net: call from %s -> %d %s ignored\n", nr, di, eaz); - return (wret == 2) ? 5 : 0; -} - -/* - * Search list of net-interfaces for an interface with given name. - */ -isdn_net_dev * -isdn_net_findif(char *name) -{ - isdn_net_dev *p = dev->netdev; - - while (p) { - if (!strcmp(p->dev->name, name)) - return p; - p = (isdn_net_dev *) p->next; - } - return (isdn_net_dev *) NULL; -} - -/* - * Force a net-interface to dial out. - * This is called from the userlevel-routine below or - * from isdn_net_start_xmit(). 
- */ -static int -isdn_net_force_dial_lp(isdn_net_local *lp) -{ - if ((!(lp->flags & ISDN_NET_CONNECTED)) && !lp->dialstate) { - int chi; - if (lp->phone[1]) { - ulong flags; - - /* Grab a free ISDN-Channel */ - spin_lock_irqsave(&dev->lock, flags); - if ((chi = isdn_get_free_channel( - ISDN_USAGE_NET, - lp->l2_proto, - lp->l3_proto, - lp->pre_device, - lp->pre_channel, - lp->msn)) < 0) { - printk(KERN_WARNING "isdn_net_force_dial: No channel for %s\n", - lp->netdev->dev->name); - spin_unlock_irqrestore(&dev->lock, flags); - return -EAGAIN; - } - lp->dialstate = 1; - /* Connect interface with channel */ - isdn_net_bind_channel(lp, chi); -#ifdef CONFIG_ISDN_PPP - if (lp->p_encap == ISDN_NET_ENCAP_SYNCPPP) - if (isdn_ppp_bind(lp) < 0) { - isdn_net_unbind_channel(lp); - spin_unlock_irqrestore(&dev->lock, flags); - return -EAGAIN; - } -#endif - /* Initiate dialing */ - spin_unlock_irqrestore(&dev->lock, flags); - isdn_net_dial(); - return 0; - } else - return -EINVAL; - } else - return -EBUSY; -} - -/* - * This is called from certain upper protocol layers (multilink ppp - * and x25iface encapsulation module) that want to initiate dialing - * themselves. - */ -int -isdn_net_dial_req(isdn_net_local *lp) -{ - /* is there a better error code? */ - if (!(ISDN_NET_DIALMODE(*lp) == ISDN_NET_DM_AUTO)) return -EBUSY; - - return isdn_net_force_dial_lp(lp); -} - -/* - * Force a net-interface to dial out. - * This is always called from within userspace (ISDN_IOCTL_NET_DIAL). - */ -int -isdn_net_force_dial(char *name) -{ - isdn_net_dev *p = isdn_net_findif(name); - - if (!p) - return -ENODEV; - return (isdn_net_force_dial_lp(p->local)); -} - -/* The ISDN-specific entries in the device structure. 
*/ -static const struct net_device_ops isdn_netdev_ops = { - .ndo_init = isdn_net_init, - .ndo_open = isdn_net_open, - .ndo_stop = isdn_net_close, - .ndo_do_ioctl = isdn_net_ioctl, - - .ndo_start_xmit = isdn_net_start_xmit, - .ndo_get_stats = isdn_net_get_stats, - .ndo_tx_timeout = isdn_net_tx_timeout, -}; - -/* - * Helper for alloc_netdev() - */ -static void _isdn_setup(struct net_device *dev) -{ - isdn_net_local *lp = netdev_priv(dev); - - ether_setup(dev); - - /* Setup the generic properties */ - dev->flags = IFF_NOARP | IFF_POINTOPOINT; - - /* isdn prepends a header in the tx path, can't share skbs */ - dev->priv_flags &= ~IFF_TX_SKB_SHARING; - dev->header_ops = NULL; - dev->netdev_ops = &isdn_netdev_ops; - - /* for clients with MPPP maybe higher values better */ - dev->tx_queue_len = 30; - - lp->p_encap = ISDN_NET_ENCAP_RAWIP; - lp->magic = ISDN_NET_MAGIC; - lp->last = lp; - lp->next = lp; - lp->isdn_device = -1; - lp->isdn_channel = -1; - lp->pre_device = -1; - lp->pre_channel = -1; - lp->exclusive = -1; - lp->ppp_slot = -1; - lp->pppbind = -1; - skb_queue_head_init(&lp->super_tx_queue); - lp->l2_proto = ISDN_PROTO_L2_X75I; - lp->l3_proto = ISDN_PROTO_L3_TRANS; - lp->triggercps = 6000; - lp->slavedelay = 10 * HZ; - lp->hupflags = ISDN_INHUP; /* Do hangup even on incoming calls */ - lp->onhtime = 10; /* Default hangup-time for saving costs */ - lp->dialmax = 1; - /* Hangup before Callback, manual dial */ - lp->flags = ISDN_NET_CBHUP | ISDN_NET_DM_MANUAL; - lp->cbdelay = 25; /* Wait 5 secs before Callback */ - lp->dialtimeout = -1; /* Infinite Dial-Timeout */ - lp->dialwait = 5 * HZ; /* Wait 5 sec. after failed dial */ - lp->dialstarted = 0; /* Jiffies of last dial-start */ - lp->dialwait_timer = 0; /* Jiffies of earliest next dial-start */ -} - -/* - * Allocate a new network-interface and initialize its data structures. 
- */ -char * -isdn_net_new(char *name, struct net_device *master) -{ - isdn_net_dev *netdev; - - /* Avoid creating an existing interface */ - if (isdn_net_findif(name)) { - printk(KERN_WARNING "isdn_net: interface %s already exists\n", name); - return NULL; - } - if (name == NULL) - return NULL; - if (!(netdev = kzalloc(sizeof(isdn_net_dev), GFP_KERNEL))) { - printk(KERN_WARNING "isdn_net: Could not allocate net-device\n"); - return NULL; - } - netdev->dev = alloc_netdev(sizeof(isdn_net_local), name, - NET_NAME_UNKNOWN, _isdn_setup); - if (!netdev->dev) { - printk(KERN_WARNING "isdn_net: Could not allocate network device\n"); - kfree(netdev); - return NULL; - } - netdev->local = netdev_priv(netdev->dev); - - if (master) { - /* Device shall be a slave */ - struct net_device *p = MASTER_TO_SLAVE(master); - struct net_device *q = master; - - netdev->local->master = master; - /* Put device at end of slave-chain */ - while (p) { - q = p; - p = MASTER_TO_SLAVE(p); - } - MASTER_TO_SLAVE(q) = netdev->dev; - } else { - /* Device shall be a master */ - /* - * Watchdog timer (currently) for master only. 
- */ - netdev->dev->watchdog_timeo = ISDN_NET_TX_TIMEOUT; - if (register_netdev(netdev->dev) != 0) { - printk(KERN_WARNING "isdn_net: Could not register net-device\n"); - free_netdev(netdev->dev); - kfree(netdev); - return NULL; - } - } - netdev->queue = netdev->local; - spin_lock_init(&netdev->queue_lock); - - netdev->local->netdev = netdev; - - INIT_WORK(&netdev->local->tqueue, isdn_net_softint); - spin_lock_init(&netdev->local->xmit_lock); - - /* Put into to netdev-chain */ - netdev->next = (void *) dev->netdev; - dev->netdev = netdev; - return netdev->dev->name; -} - -char * -isdn_net_newslave(char *parm) -{ - char *p = strchr(parm, ','); - isdn_net_dev *n; - char newname[10]; - - if (p) { - /* Slave-Name MUST not be empty or overflow 'newname' */ - if (strscpy(newname, p + 1, sizeof(newname)) <= 0) - return NULL; - *p = 0; - /* Master must already exist */ - if (!(n = isdn_net_findif(parm))) - return NULL; - /* Master must be a real interface, not a slave */ - if (n->local->master) - return NULL; - /* Master must not be started yet */ - if (isdn_net_device_started(n)) - return NULL; - return (isdn_net_new(newname, n->dev)); - } - return NULL; -} - -/* - * Set interface-parameters. - * Always set all parameters, so the user-level application is responsible - * for not overwriting existing setups. It has to get the current - * setup first, if only selected parameters are to be changed. 
- */ -int -isdn_net_setcfg(isdn_net_ioctl_cfg *cfg) -{ - isdn_net_dev *p = isdn_net_findif(cfg->name); - ulong features; - int i; - int drvidx; - int chidx; - char drvid[25]; - - if (p) { - isdn_net_local *lp = p->local; - - /* See if any registered driver supports the features we want */ - features = ((1 << cfg->l2_proto) << ISDN_FEATURE_L2_SHIFT) | - ((1 << cfg->l3_proto) << ISDN_FEATURE_L3_SHIFT); - for (i = 0; i < ISDN_MAX_DRIVERS; i++) - if (dev->drv[i]) - if ((dev->drv[i]->interface->features & features) == features) - break; - if (i == ISDN_MAX_DRIVERS) { - printk(KERN_WARNING "isdn_net: No driver with selected features\n"); - return -ENODEV; - } - if (lp->p_encap != cfg->p_encap) { -#ifdef CONFIG_ISDN_X25 - struct concap_proto *cprot = p->cprot; -#endif - if (isdn_net_device_started(p)) { - printk(KERN_WARNING "%s: cannot change encap when if is up\n", - p->dev->name); - return -EBUSY; - } -#ifdef CONFIG_ISDN_X25 - if (cprot && cprot->pops) - cprot->pops->proto_del(cprot); - p->cprot = NULL; - lp->dops = NULL; - /* ... , prepare for configuration of new one ... */ - switch (cfg->p_encap) { - case ISDN_NET_ENCAP_X25IFACE: - lp->dops = &isdn_concap_reliable_dl_dops; - } - /* ... and allocate new one ... 
*/ - p->cprot = isdn_concap_new(cfg->p_encap); - /* p -> cprot == NULL now if p_encap is not supported - by means of the concap_proto mechanism */ - /* the protocol is not configured yet; this will - happen later when isdn_net_reset() is called */ -#endif - } - switch (cfg->p_encap) { - case ISDN_NET_ENCAP_SYNCPPP: -#ifndef CONFIG_ISDN_PPP - printk(KERN_WARNING "%s: SyncPPP support not configured\n", - p->dev->name); - return -EINVAL; -#else - p->dev->type = ARPHRD_PPP; /* change ARP type */ - p->dev->addr_len = 0; -#endif - break; - case ISDN_NET_ENCAP_X25IFACE: -#ifndef CONFIG_ISDN_X25 - printk(KERN_WARNING "%s: isdn-x25 support not configured\n", - p->dev->name); - return -EINVAL; -#else - p->dev->type = ARPHRD_X25; /* change ARP type */ - p->dev->addr_len = 0; -#endif - break; - case ISDN_NET_ENCAP_CISCOHDLCK: - break; - default: - if (cfg->p_encap >= 0 && - cfg->p_encap <= ISDN_NET_ENCAP_MAX_ENCAP) - break; - printk(KERN_WARNING - "%s: encapsulation protocol %d not supported\n", - p->dev->name, cfg->p_encap); - return -EINVAL; - } - if (strlen(cfg->drvid)) { - /* A bind has been requested ... 
*/ - char *c, - *e; - - if (strnlen(cfg->drvid, sizeof(cfg->drvid)) == - sizeof(cfg->drvid)) - return -EINVAL; - drvidx = -1; - chidx = -1; - strcpy(drvid, cfg->drvid); - if ((c = strchr(drvid, ','))) { - /* The channel-number is appended to the driver-Id with a comma */ - chidx = (int) simple_strtoul(c + 1, &e, 10); - if (e == c) - chidx = -1; - *c = '\0'; - } - for (i = 0; i < ISDN_MAX_DRIVERS; i++) - /* Lookup driver-Id in array */ - if (!(strcmp(dev->drvid[i], drvid))) { - drvidx = i; - break; - } - if ((drvidx == -1) || (chidx == -1)) - /* Either driver-Id or channel-number invalid */ - return -ENODEV; - } else { - /* Parameters are valid, so get them */ - drvidx = lp->pre_device; - chidx = lp->pre_channel; - } - if (cfg->exclusive > 0) { - unsigned long flags; - - /* If binding is exclusive, try to grab the channel */ - spin_lock_irqsave(&dev->lock, flags); - if ((i = isdn_get_free_channel(ISDN_USAGE_NET, - lp->l2_proto, lp->l3_proto, drvidx, - chidx, lp->msn)) < 0) { - /* Grab failed, because desired channel is in use */ - lp->exclusive = -1; - spin_unlock_irqrestore(&dev->lock, flags); - return -EBUSY; - } - /* All went ok, so update isdninfo */ - dev->usage[i] = ISDN_USAGE_EXCLUSIVE; - isdn_info_update(); - spin_unlock_irqrestore(&dev->lock, flags); - lp->exclusive = i; - } else { - /* Non-exclusive binding or unbind. 
*/ - lp->exclusive = -1; - if ((lp->pre_device != -1) && (cfg->exclusive == -1)) { - isdn_unexclusive_channel(lp->pre_device, lp->pre_channel); - isdn_free_channel(lp->pre_device, lp->pre_channel, ISDN_USAGE_NET); - drvidx = -1; - chidx = -1; - } - } - strlcpy(lp->msn, cfg->eaz, sizeof(lp->msn)); - lp->pre_device = drvidx; - lp->pre_channel = chidx; - lp->onhtime = cfg->onhtime; - lp->charge = cfg->charge; - lp->l2_proto = cfg->l2_proto; - lp->l3_proto = cfg->l3_proto; - lp->cbdelay = cfg->cbdelay; - lp->dialmax = cfg->dialmax; - lp->triggercps = cfg->triggercps; - lp->slavedelay = cfg->slavedelay * HZ; - lp->pppbind = cfg->pppbind; - lp->dialtimeout = cfg->dialtimeout >= 0 ? cfg->dialtimeout * HZ : -1; - lp->dialwait = cfg->dialwait * HZ; - if (cfg->secure) - lp->flags |= ISDN_NET_SECURE; - else - lp->flags &= ~ISDN_NET_SECURE; - if (cfg->cbhup) - lp->flags |= ISDN_NET_CBHUP; - else - lp->flags &= ~ISDN_NET_CBHUP; - switch (cfg->callback) { - case 0: - lp->flags &= ~(ISDN_NET_CALLBACK | ISDN_NET_CBOUT); - break; - case 1: - lp->flags |= ISDN_NET_CALLBACK; - lp->flags &= ~ISDN_NET_CBOUT; - break; - case 2: - lp->flags |= ISDN_NET_CBOUT; - lp->flags &= ~ISDN_NET_CALLBACK; - break; - } - lp->flags &= ~ISDN_NET_DIALMODE_MASK; /* first all bits off */ - if (cfg->dialmode && !(cfg->dialmode & ISDN_NET_DIALMODE_MASK)) { - /* old isdnctrl version, where only 0 or 1 is given */ - printk(KERN_WARNING - "Old isdnctrl version detected! 
Please update.\n"); - lp->flags |= ISDN_NET_DM_OFF; /* turn on `off' bit */ - } - else { - lp->flags |= cfg->dialmode; /* turn on selected bits */ - } - if (cfg->chargehup) - lp->hupflags |= ISDN_CHARGEHUP; - else - lp->hupflags &= ~ISDN_CHARGEHUP; - if (cfg->ihup) - lp->hupflags |= ISDN_INHUP; - else - lp->hupflags &= ~ISDN_INHUP; - if (cfg->chargeint > 10) { - lp->hupflags |= ISDN_CHARGEHUP | ISDN_HAVECHARGE | ISDN_MANCHARGE; - lp->chargeint = cfg->chargeint * HZ; - } - if (cfg->p_encap != lp->p_encap) { - if (cfg->p_encap == ISDN_NET_ENCAP_RAWIP) { - p->dev->header_ops = NULL; - p->dev->flags = IFF_NOARP | IFF_POINTOPOINT; - } else { - p->dev->header_ops = &isdn_header_ops; - if (cfg->p_encap == ISDN_NET_ENCAP_ETHER) - p->dev->flags = IFF_BROADCAST | IFF_MULTICAST; - else - p->dev->flags = IFF_NOARP | IFF_POINTOPOINT; - } - } - lp->p_encap = cfg->p_encap; - return 0; - } - return -ENODEV; -} - -/* - * Perform get-interface-parameters.ioctl - */ -int -isdn_net_getcfg(isdn_net_ioctl_cfg *cfg) -{ - isdn_net_dev *p = isdn_net_findif(cfg->name); - - if (p) { - isdn_net_local *lp = p->local; - - strcpy(cfg->eaz, lp->msn); - cfg->exclusive = lp->exclusive; - if (lp->pre_device >= 0) { - sprintf(cfg->drvid, "%s,%d", dev->drvid[lp->pre_device], - lp->pre_channel); - } else - cfg->drvid[0] = '\0'; - cfg->onhtime = lp->onhtime; - cfg->charge = lp->charge; - cfg->l2_proto = lp->l2_proto; - cfg->l3_proto = lp->l3_proto; - cfg->p_encap = lp->p_encap; - cfg->secure = (lp->flags & ISDN_NET_SECURE) ? 1 : 0; - cfg->callback = 0; - if (lp->flags & ISDN_NET_CALLBACK) - cfg->callback = 1; - if (lp->flags & ISDN_NET_CBOUT) - cfg->callback = 2; - cfg->cbhup = (lp->flags & ISDN_NET_CBHUP) ? 1 : 0; - cfg->dialmode = lp->flags & ISDN_NET_DIALMODE_MASK; - cfg->chargehup = (lp->hupflags & ISDN_CHARGEHUP) ? 1 : 0; - cfg->ihup = (lp->hupflags & ISDN_INHUP) ? 
1 : 0; - cfg->cbdelay = lp->cbdelay; - cfg->dialmax = lp->dialmax; - cfg->triggercps = lp->triggercps; - cfg->slavedelay = lp->slavedelay / HZ; - cfg->chargeint = (lp->hupflags & ISDN_CHARGEHUP) ? - (lp->chargeint / HZ) : 0; - cfg->pppbind = lp->pppbind; - cfg->dialtimeout = lp->dialtimeout >= 0 ? lp->dialtimeout / HZ : -1; - cfg->dialwait = lp->dialwait / HZ; - if (lp->slave) { - if (strlen(lp->slave->name) >= 10) - strcpy(cfg->slave, "too-long"); - else - strcpy(cfg->slave, lp->slave->name); - } else - cfg->slave[0] = '\0'; - if (lp->master) { - if (strlen(lp->master->name) >= 10) - strcpy(cfg->master, "too-long"); - else - strcpy(cfg->master, lp->master->name); - } else - cfg->master[0] = '\0'; - return 0; - } - return -ENODEV; -} - -/* - * Add a phone-number to an interface. - */ -int -isdn_net_addphone(isdn_net_ioctl_phone *phone) -{ - isdn_net_dev *p = isdn_net_findif(phone->name); - isdn_net_phone *n; - - if (p) { - if (!(n = kmalloc(sizeof(isdn_net_phone), GFP_KERNEL))) - return -ENOMEM; - strlcpy(n->num, phone->phone, sizeof(n->num)); - n->next = p->local->phone[phone->outgoing & 1]; - p->local->phone[phone->outgoing & 1] = n; - return 0; - } - return -ENODEV; -} - -/* - * Copy a string of all phone-numbers of an interface to user space. - * This might sleep and must be called with the isdn semaphore down. 
- */ -int -isdn_net_getphones(isdn_net_ioctl_phone *phone, char __user *phones) -{ - isdn_net_dev *p = isdn_net_findif(phone->name); - int inout = phone->outgoing & 1; - int more = 0; - int count = 0; - isdn_net_phone *n; - - if (!p) - return -ENODEV; - inout &= 1; - for (n = p->local->phone[inout]; n; n = n->next) { - if (more) { - put_user(' ', phones++); - count++; - } - if (copy_to_user(phones, n->num, strlen(n->num) + 1)) { - return -EFAULT; - } - phones += strlen(n->num); - count += strlen(n->num); - more = 1; - } - put_user(0, phones); - count++; - return count; -} - -/* - * Copy a string containing the peer's phone number of a connected interface - * to user space. - */ -int -isdn_net_getpeer(isdn_net_ioctl_phone *phone, isdn_net_ioctl_phone __user *peer) -{ - isdn_net_dev *p = isdn_net_findif(phone->name); - int ch, dv, idx; - - if (!p) - return -ENODEV; - /* - * Theoretical race: while this executes, the remote number might - * become invalid (hang up) or change (new connection), resulting - * in (partially) wrong number copied to user. This race - * currently ignored. - */ - ch = p->local->isdn_channel; - dv = p->local->isdn_device; - if (ch < 0 && dv < 0) - return -ENOTCONN; - idx = isdn_dc2minor(dv, ch); - if (idx < 0) - return -ENODEV; - /* for pre-bound channels, we need this extra check */ - if (strncmp(dev->num[idx], "???", 3) == 0) - return -ENOTCONN; - strncpy(phone->phone, dev->num[idx], ISDN_MSNLEN); - phone->outgoing = USG_OUTGOING(dev->usage[idx]); - if (copy_to_user(peer, phone, sizeof(*peer))) - return -EFAULT; - return 0; -} -/* - * Delete a phone-number from an interface. 
- */ -int -isdn_net_delphone(isdn_net_ioctl_phone *phone) -{ - isdn_net_dev *p = isdn_net_findif(phone->name); - int inout = phone->outgoing & 1; - isdn_net_phone *n; - isdn_net_phone *m; - - if (p) { - n = p->local->phone[inout]; - m = NULL; - while (n) { - if (!strcmp(n->num, phone->phone)) { - if (p->local->dial == n) - p->local->dial = n->next; - if (m) - m->next = n->next; - else - p->local->phone[inout] = n->next; - kfree(n); - return 0; - } - m = n; - n = (isdn_net_phone *) n->next; - } - return -EINVAL; - } - return -ENODEV; -} - -/* - * Delete all phone-numbers of an interface. - */ -static int -isdn_net_rmallphone(isdn_net_dev *p) -{ - isdn_net_phone *n; - isdn_net_phone *m; - int i; - - for (i = 0; i < 2; i++) { - n = p->local->phone[i]; - while (n) { - m = n->next; - kfree(n); - n = m; - } - p->local->phone[i] = NULL; - } - p->local->dial = NULL; - return 0; -} - -/* - * Force a hangup of a network-interface. - */ -int -isdn_net_force_hangup(char *name) -{ - isdn_net_dev *p = isdn_net_findif(name); - struct net_device *q; - - if (p) { - if (p->local->isdn_device < 0) - return 1; - q = p->local->slave; - /* If this interface has slaves, do a hangup for them also. */ - while (q) { - isdn_net_hangup(q); - q = MASTER_TO_SLAVE(q); - } - isdn_net_hangup(p->dev); - return 0; - } - return -ENODEV; -} - -/* - * Helper-function for isdn_net_rm: Do the real work. 
- */ -static int -isdn_net_realrm(isdn_net_dev *p, isdn_net_dev *q) -{ - u_long flags; - - if (isdn_net_device_started(p)) { - return -EBUSY; - } -#ifdef CONFIG_ISDN_X25 - if (p->cprot && p->cprot->pops) - p->cprot->pops->proto_del(p->cprot); -#endif - /* Free all phone-entries */ - isdn_net_rmallphone(p); - /* If interface is bound exclusive, free channel-usage */ - if (p->local->exclusive != -1) - isdn_unexclusive_channel(p->local->pre_device, p->local->pre_channel); - if (p->local->master) { - /* It's a slave-device, so update master's slave-pointer if necessary */ - if (((isdn_net_local *) ISDN_MASTER_PRIV(p->local))->slave == - p->dev) - ((isdn_net_local *)ISDN_MASTER_PRIV(p->local))->slave = - p->local->slave; - } else { - /* Unregister only if it's a master-device */ - unregister_netdev(p->dev); - } - /* Unlink device from chain */ - spin_lock_irqsave(&dev->lock, flags); - if (q) - q->next = p->next; - else - dev->netdev = p->next; - if (p->local->slave) { - /* If this interface has a slave, remove it also */ - char *slavename = p->local->slave->name; - isdn_net_dev *n = dev->netdev; - q = NULL; - while (n) { - if (!strcmp(n->dev->name, slavename)) { - spin_unlock_irqrestore(&dev->lock, flags); - isdn_net_realrm(n, q); - spin_lock_irqsave(&dev->lock, flags); - break; - } - q = n; - n = (isdn_net_dev *)n->next; - } - } - spin_unlock_irqrestore(&dev->lock, flags); - /* If no more net-devices remain, disable auto-hangup timer */ - if (dev->netdev == NULL) - isdn_timer_ctrl(ISDN_TIMER_NETHANGUP, 0); - free_netdev(p->dev); - kfree(p); - - return 0; -} - -/* - * Remove a single network-interface. 
- */ -int -isdn_net_rm(char *name) -{ - u_long flags; - isdn_net_dev *p; - isdn_net_dev *q; - - /* Search name in netdev-chain */ - spin_lock_irqsave(&dev->lock, flags); - p = dev->netdev; - q = NULL; - while (p) { - if (!strcmp(p->dev->name, name)) { - spin_unlock_irqrestore(&dev->lock, flags); - return (isdn_net_realrm(p, q)); - } - q = p; - p = (isdn_net_dev *) p->next; - } - spin_unlock_irqrestore(&dev->lock, flags); - /* If no more net-devices remain, disable auto-hangup timer */ - if (dev->netdev == NULL) - isdn_timer_ctrl(ISDN_TIMER_NETHANGUP, 0); - return -ENODEV; -} - -/* - * Remove all network-interfaces - */ -int -isdn_net_rmall(void) -{ - u_long flags; - int ret; - - /* Walk through netdev-chain */ - spin_lock_irqsave(&dev->lock, flags); - while (dev->netdev) { - if (!dev->netdev->local->master) { - /* Remove master-devices only, slaves get removed with their master */ - spin_unlock_irqrestore(&dev->lock, flags); - if ((ret = isdn_net_realrm(dev->netdev, NULL))) { - return ret; - } - spin_lock_irqsave(&dev->lock, flags); - } - } - dev->netdev = NULL; - spin_unlock_irqrestore(&dev->lock, flags); - return 0; -} diff --git a/drivers/isdn/i4l/isdn_net.h b/drivers/isdn/i4l/isdn_net.h deleted file mode 100644 index cca6d68da171..000000000000 --- a/drivers/isdn/i4l/isdn_net.h +++ /dev/null @@ -1,151 +0,0 @@ -/* $Id: isdn_net.h,v 1.1.2.2 2004/01/12 22:37:19 keil Exp $ - * - * header for Linux ISDN subsystem, network related functions (linklevel). - * - * Copyright 1994-1999 by Fritz Elfert (fritz@isdn4linux.de) - * Copyright 1995,96 by Thinking Objects Software GmbH Wuerzburg - * Copyright 1995,96 by Michael Hipp (Michael.Hipp@student.uni-tuebingen.de) - * - * This software may be used and distributed according to the terms - * of the GNU General Public License, incorporated herein by reference. 
- * - */ - -/* Definitions for hupflags: */ -#define ISDN_WAITCHARGE 1 /* did not get a charge info yet */ -#define ISDN_HAVECHARGE 2 /* We know a charge info */ -#define ISDN_CHARGEHUP 4 /* We want to use the charge mechanism */ -#define ISDN_INHUP 8 /* Even if incoming, close after huptimeout */ -#define ISDN_MANCHARGE 16 /* Charge Interval manually set */ - -/* - * Definitions for Cisco-HDLC header. - */ - -#define CISCO_ADDR_UNICAST 0x0f -#define CISCO_ADDR_BROADCAST 0x8f -#define CISCO_CTRL 0x00 -#define CISCO_TYPE_CDP 0x2000 -#define CISCO_TYPE_SLARP 0x8035 -#define CISCO_SLARP_REQUEST 0 -#define CISCO_SLARP_REPLY 1 -#define CISCO_SLARP_KEEPALIVE 2 - -extern char *isdn_net_new(char *, struct net_device *); -extern char *isdn_net_newslave(char *); -extern int isdn_net_rm(char *); -extern int isdn_net_rmall(void); -extern int isdn_net_stat_callback(int, isdn_ctrl *); -extern int isdn_net_setcfg(isdn_net_ioctl_cfg *); -extern int isdn_net_getcfg(isdn_net_ioctl_cfg *); -extern int isdn_net_addphone(isdn_net_ioctl_phone *); -extern int isdn_net_getphones(isdn_net_ioctl_phone *, char __user *); -extern int isdn_net_getpeer(isdn_net_ioctl_phone *, isdn_net_ioctl_phone __user *); -extern int isdn_net_delphone(isdn_net_ioctl_phone *); -extern int isdn_net_find_icall(int, int, int, setup_parm *); -extern void isdn_net_hangup(struct net_device *); -extern void isdn_net_dial(void); -extern void isdn_net_autohup(void); -extern int isdn_net_force_hangup(char *); -extern int isdn_net_force_dial(char *); -extern isdn_net_dev *isdn_net_findif(char *); -extern int isdn_net_rcv_skb(int, struct sk_buff *); -extern int isdn_net_dial_req(isdn_net_local *); -extern void isdn_net_writebuf_skb(isdn_net_local *lp, struct sk_buff *skb); -extern void isdn_net_write_super(isdn_net_local *lp, struct sk_buff *skb); - -#define ISDN_NET_MAX_QUEUE_LENGTH 2 - -#define ISDN_MASTER_PRIV(lp) ((isdn_net_local *) netdev_priv(lp->master)) -#define ISDN_SLAVE_PRIV(lp) ((isdn_net_local *) 
netdev_priv(lp->slave)) -#define MASTER_TO_SLAVE(master) \ - (((isdn_net_local *) netdev_priv(master))->slave) - -/* - * is this particular channel busy? - */ -static __inline__ int isdn_net_lp_busy(isdn_net_local *lp) -{ - if (atomic_read(&lp->frame_cnt) < ISDN_NET_MAX_QUEUE_LENGTH) - return 0; - else - return 1; -} - -/* - * For the given net device, this will get a non-busy channel out of the - * corresponding bundle. The returned channel is locked. - */ -static __inline__ isdn_net_local *isdn_net_get_locked_lp(isdn_net_dev *nd) -{ - unsigned long flags; - isdn_net_local *lp; - - spin_lock_irqsave(&nd->queue_lock, flags); - lp = nd->queue; /* get lp on top of queue */ - while (isdn_net_lp_busy(nd->queue)) { - nd->queue = nd->queue->next; - if (nd->queue == lp) { /* not found -- should never happen */ - lp = NULL; - goto errout; - } - } - lp = nd->queue; - nd->queue = nd->queue->next; - spin_unlock_irqrestore(&nd->queue_lock, flags); - spin_lock(&lp->xmit_lock); - local_bh_disable(); - return lp; -errout: - spin_unlock_irqrestore(&nd->queue_lock, flags); - return lp; -} - -/* - * add a channel to a bundle - */ -static __inline__ void isdn_net_add_to_bundle(isdn_net_dev *nd, isdn_net_local *nlp) -{ - isdn_net_local *lp; - unsigned long flags; - - spin_lock_irqsave(&nd->queue_lock, flags); - - lp = nd->queue; -// printk(KERN_DEBUG "%s: lp:%s(%p) nlp:%s(%p) last(%p)\n", -// __func__, lp->name, lp, nlp->name, nlp, lp->last); - nlp->last = lp->last; - lp->last->next = nlp; - lp->last = nlp; - nlp->next = lp; - nd->queue = nlp; - - spin_unlock_irqrestore(&nd->queue_lock, flags); -} -/* - * remove a channel from the bundle it belongs to - */ -static __inline__ void isdn_net_rm_from_bundle(isdn_net_local *lp) -{ - isdn_net_local *master_lp = lp; - unsigned long flags; - - if (lp->master) - master_lp = ISDN_MASTER_PRIV(lp); - -// printk(KERN_DEBUG "%s: lp:%s(%p) mlp:%s(%p) last(%p) next(%p) mndq(%p)\n", -// __func__, lp->name, lp, master_lp->name, master_lp, lp->last, 
lp->next, master_lp->netdev->queue); - spin_lock_irqsave(&master_lp->netdev->queue_lock, flags); - lp->last->next = lp->next; - lp->next->last = lp->last; - if (master_lp->netdev->queue == lp) { - master_lp->netdev->queue = lp->next; - if (lp->next == lp) { /* last in queue */ - master_lp->netdev->queue = master_lp->netdev->local; - } - } - lp->next = lp->last = lp; /* (re)set own pointers */ -// printk(KERN_DEBUG "%s: mndq(%p)\n", -// __func__, master_lp->netdev->queue); - spin_unlock_irqrestore(&master_lp->netdev->queue_lock, flags); -} diff --git a/drivers/isdn/i4l/isdn_ppp.c b/drivers/isdn/i4l/isdn_ppp.c deleted file mode 100644 index 7e0f419c14f8..000000000000 --- a/drivers/isdn/i4l/isdn_ppp.c +++ /dev/null @@ -1,3046 +0,0 @@ -/* $Id: isdn_ppp.c,v 1.1.2.3 2004/02/10 01:07:13 keil Exp $ - * - * Linux ISDN subsystem, functions for synchronous PPP (linklevel). - * - * Copyright 1995,96 by Michael Hipp (Michael.Hipp@student.uni-tuebingen.de) - * - * This software may be used and distributed according to the terms - * of the GNU General Public License, incorporated herein by reference. 
- * - */ - -#include -#include -#include -#include -#ifdef CONFIG_IPPP_FILTER -#include -#endif - -#include "isdn_common.h" -#include "isdn_ppp.h" -#include "isdn_net.h" - -#ifndef PPP_IPX -#define PPP_IPX 0x002b -#endif - -/* Prototypes */ -static int isdn_ppp_fill_rq(unsigned char *buf, int len, int proto, int slot); -static int isdn_ppp_closewait(int slot); -static void isdn_ppp_push_higher(isdn_net_dev *net_dev, isdn_net_local *lp, - struct sk_buff *skb, int proto); -static int isdn_ppp_if_get_unit(char *namebuf); -static int isdn_ppp_set_compressor(struct ippp_struct *is, struct isdn_ppp_comp_data *); -static struct sk_buff *isdn_ppp_decompress(struct sk_buff *, - struct ippp_struct *, struct ippp_struct *, int *proto); -static void isdn_ppp_receive_ccp(isdn_net_dev *net_dev, isdn_net_local *lp, - struct sk_buff *skb, int proto); -static struct sk_buff *isdn_ppp_compress(struct sk_buff *skb_in, int *proto, - struct ippp_struct *is, struct ippp_struct *master, int type); -static void isdn_ppp_send_ccp(isdn_net_dev *net_dev, isdn_net_local *lp, - struct sk_buff *skb); - -/* New CCP stuff */ -static void isdn_ppp_ccp_kickup(struct ippp_struct *is); -static void isdn_ppp_ccp_xmit_reset(struct ippp_struct *is, int proto, - unsigned char code, unsigned char id, - unsigned char *data, int len); -static struct ippp_ccp_reset *isdn_ppp_ccp_reset_alloc(struct ippp_struct *is); -static void isdn_ppp_ccp_reset_free(struct ippp_struct *is); -static void isdn_ppp_ccp_reset_free_state(struct ippp_struct *is, - unsigned char id); -static void isdn_ppp_ccp_timer_callback(struct timer_list *t); -static struct ippp_ccp_reset_state *isdn_ppp_ccp_reset_alloc_state(struct ippp_struct *is, - unsigned char id); -static void isdn_ppp_ccp_reset_trans(struct ippp_struct *is, - struct isdn_ppp_resetparams *rp); -static void isdn_ppp_ccp_reset_ack_rcvd(struct ippp_struct *is, - unsigned char id); - - - -#ifdef CONFIG_ISDN_MPP -static ippp_bundle *isdn_ppp_bundle_arr = NULL; - -static int 
isdn_ppp_mp_bundle_array_init(void); -static int isdn_ppp_mp_init(isdn_net_local *lp, ippp_bundle *add_to); -static void isdn_ppp_mp_receive(isdn_net_dev *net_dev, isdn_net_local *lp, - struct sk_buff *skb); -static void isdn_ppp_mp_cleanup(isdn_net_local *lp); - -static int isdn_ppp_bundle(struct ippp_struct *, int unit); -#endif /* CONFIG_ISDN_MPP */ - -char *isdn_ppp_revision = "$Revision: 1.1.2.3 $"; - -static struct ippp_struct *ippp_table[ISDN_MAX_CHANNELS]; - -static struct isdn_ppp_compressor *ipc_head = NULL; - -/* - * frame log (debug) - */ -static void -isdn_ppp_frame_log(char *info, char *data, int len, int maxlen, int unit, int slot) -{ - int cnt, - j, - i; - char buf[80]; - - if (len < maxlen) - maxlen = len; - - for (i = 0, cnt = 0; cnt < maxlen; i++) { - for (j = 0; j < 16 && cnt < maxlen; j++, cnt++) - sprintf(buf + j * 3, "%02x ", (unsigned char)data[cnt]); - printk(KERN_DEBUG "[%d/%d].%s[%d]: %s\n", unit, slot, info, i, buf); - } -} - -/* - * unbind isdn_net_local <=> ippp-device - * note: it can happen, that we hangup/free the master before the slaves - * in this case we bind another lp to the master device - */ -int -isdn_ppp_free(isdn_net_local *lp) -{ - struct ippp_struct *is; - - if (lp->ppp_slot < 0 || lp->ppp_slot >= ISDN_MAX_CHANNELS) { - printk(KERN_ERR "%s: ppp_slot(%d) out of range\n", - __func__, lp->ppp_slot); - return 0; - } - -#ifdef CONFIG_ISDN_MPP - spin_lock(&lp->netdev->pb->lock); -#endif - isdn_net_rm_from_bundle(lp); -#ifdef CONFIG_ISDN_MPP - if (lp->netdev->pb->ref_ct == 1) /* last link in queue? 
*/ - isdn_ppp_mp_cleanup(lp); - - lp->netdev->pb->ref_ct--; - spin_unlock(&lp->netdev->pb->lock); -#endif /* CONFIG_ISDN_MPP */ - if (lp->ppp_slot < 0 || lp->ppp_slot >= ISDN_MAX_CHANNELS) { - printk(KERN_ERR "%s: ppp_slot(%d) now invalid\n", - __func__, lp->ppp_slot); - return 0; - } - is = ippp_table[lp->ppp_slot]; - if ((is->state & IPPP_CONNECT)) - isdn_ppp_closewait(lp->ppp_slot); /* force wakeup on ippp device */ - else if (is->state & IPPP_ASSIGNED) - is->state = IPPP_OPEN; /* fallback to 'OPEN but not ASSIGNED' state */ - - if (is->debug & 0x1) - printk(KERN_DEBUG "isdn_ppp_free %d %lx %lx\n", lp->ppp_slot, (long) lp, (long) is->lp); - - is->lp = NULL; /* link is down .. set lp to NULL */ - lp->ppp_slot = -1; /* is this OK ?? */ - - return 0; -} - -/* - * bind isdn_net_local <=> ippp-device - * - * This function is allways called with holding dev->lock so - * no additional lock is needed - */ -int -isdn_ppp_bind(isdn_net_local *lp) -{ - int i; - int unit = 0; - struct ippp_struct *is; - int retval; - - if (lp->pppbind < 0) { /* device bounded to ippp device ? */ - isdn_net_dev *net_dev = dev->netdev; - char exclusive[ISDN_MAX_CHANNELS]; /* exclusive flags */ - memset(exclusive, 0, ISDN_MAX_CHANNELS); - while (net_dev) { /* step through net devices to find exclusive minors */ - isdn_net_local *lp = net_dev->local; - if (lp->pppbind >= 0) - exclusive[lp->pppbind] = 1; - net_dev = net_dev->next; - } - /* - * search a free device / slot - */ - for (i = 0; i < ISDN_MAX_CHANNELS; i++) { - if (ippp_table[i]->state == IPPP_OPEN && !exclusive[ippp_table[i]->minor]) { /* OPEN, but not connected! 
*/ - break; - } - } - } else { - for (i = 0; i < ISDN_MAX_CHANNELS; i++) { - if (ippp_table[i]->minor == lp->pppbind && - (ippp_table[i]->state & IPPP_OPEN) == IPPP_OPEN) - break; - } - } - - if (i >= ISDN_MAX_CHANNELS) { - printk(KERN_WARNING "isdn_ppp_bind: Can't find a (free) connection to the ipppd daemon.\n"); - retval = -1; - goto out; - } - /* get unit number from interface name .. ugly! */ - unit = isdn_ppp_if_get_unit(lp->netdev->dev->name); - if (unit < 0) { - printk(KERN_ERR "isdn_ppp_bind: illegal interface name %s.\n", - lp->netdev->dev->name); - retval = -1; - goto out; - } - - lp->ppp_slot = i; - is = ippp_table[i]; - is->lp = lp; - is->unit = unit; - is->state = IPPP_OPEN | IPPP_ASSIGNED; /* assigned to a netdevice but not connected */ -#ifdef CONFIG_ISDN_MPP - retval = isdn_ppp_mp_init(lp, NULL); - if (retval < 0) - goto out; -#endif /* CONFIG_ISDN_MPP */ - - retval = lp->ppp_slot; - -out: - return retval; -} - -/* - * kick the ipppd on the device - * (wakes up daemon after B-channel connect) - */ - -void -isdn_ppp_wakeup_daemon(isdn_net_local *lp) -{ - if (lp->ppp_slot < 0 || lp->ppp_slot >= ISDN_MAX_CHANNELS) { - printk(KERN_ERR "%s: ppp_slot(%d) out of range\n", - __func__, lp->ppp_slot); - return; - } - ippp_table[lp->ppp_slot]->state = IPPP_OPEN | IPPP_CONNECT | IPPP_NOBLOCK; - wake_up_interruptible(&ippp_table[lp->ppp_slot]->wq); -} - -/* - * there was a hangup on the netdevice - * force wakeup of the ippp device - * go into 'device waits for release' state - */ -static int -isdn_ppp_closewait(int slot) -{ - struct ippp_struct *is; - - if (slot < 0 || slot >= ISDN_MAX_CHANNELS) { - printk(KERN_ERR "%s: slot(%d) out of range\n", - __func__, slot); - return 0; - } - is = ippp_table[slot]; - if (is->state) - wake_up_interruptible(&is->wq); - is->state = IPPP_CLOSEWAIT; - return 1; -} - -/* - * isdn_ppp_find_slot / isdn_ppp_free_slot - */ - -static int -isdn_ppp_get_slot(void) -{ - int i; - for (i = 0; i < ISDN_MAX_CHANNELS; i++) { - if 
(!ippp_table[i]->state) - return i; - } - return -1; -} - -/* - * isdn_ppp_open - */ - -int -isdn_ppp_open(int min, struct file *file) -{ - int slot; - struct ippp_struct *is; - - if (min < 0 || min >= ISDN_MAX_CHANNELS) - return -ENODEV; - - slot = isdn_ppp_get_slot(); - if (slot < 0) { - return -EBUSY; - } - is = file->private_data = ippp_table[slot]; - - printk(KERN_DEBUG "ippp, open, slot: %d, minor: %d, state: %04x\n", - slot, min, is->state); - - /* compression stuff */ - is->link_compressor = is->compressor = NULL; - is->link_decompressor = is->decompressor = NULL; - is->link_comp_stat = is->comp_stat = NULL; - is->link_decomp_stat = is->decomp_stat = NULL; - is->compflags = 0; - - is->reset = isdn_ppp_ccp_reset_alloc(is); - if (!is->reset) - return -ENOMEM; - - is->lp = NULL; - is->mp_seqno = 0; /* MP sequence number */ - is->pppcfg = 0; /* ppp configuration */ - is->mpppcfg = 0; /* mppp configuration */ - is->last_link_seqno = -1; /* MP: maybe set to Bundle-MIN, when joining a bundle ?? */ - is->unit = -1; /* set, when we have our interface */ - is->mru = 1524; /* MRU, default 1524 */ - is->maxcid = 16; /* VJ: maxcid */ - is->tk = current; - init_waitqueue_head(&is->wq); - is->first = is->rq + NUM_RCV_BUFFS - 1; /* receive queue */ - is->last = is->rq; - is->minor = min; -#ifdef CONFIG_ISDN_PPP_VJ - /* - * VJ header compression init - */ - is->slcomp = slhc_init(16, 16); /* not necessary for 2. 
link in bundle */ - if (IS_ERR(is->slcomp)) { - isdn_ppp_ccp_reset_free(is); - return PTR_ERR(is->slcomp); - } -#endif -#ifdef CONFIG_IPPP_FILTER - is->pass_filter = NULL; - is->active_filter = NULL; -#endif - is->state = IPPP_OPEN; - - return 0; -} - -/* - * release ippp device - */ -void -isdn_ppp_release(int min, struct file *file) -{ - int i; - struct ippp_struct *is; - - if (min < 0 || min >= ISDN_MAX_CHANNELS) - return; - is = file->private_data; - - if (!is) { - printk(KERN_ERR "%s: no file->private_data\n", __func__); - return; - } - if (is->debug & 0x1) - printk(KERN_DEBUG "ippp: release, minor: %d %lx\n", min, (long) is->lp); - - if (is->lp) { /* a lp address says: this link is still up */ - isdn_net_dev *p = is->lp->netdev; - - if (!p) { - printk(KERN_ERR "%s: no lp->netdev\n", __func__); - return; - } - is->state &= ~IPPP_CONNECT; /* -> effect: no call of wakeup */ - /* - * isdn_net_hangup() calls isdn_ppp_free() - * isdn_ppp_free() sets is->lp to NULL and lp->ppp_slot to -1 - * removing the IPPP_CONNECT flag omits calling of isdn_ppp_wakeup_daemon() - */ - isdn_net_hangup(p->dev); - } - for (i = 0; i < NUM_RCV_BUFFS; i++) { - kfree(is->rq[i].buf); - is->rq[i].buf = NULL; - } - is->first = is->rq + NUM_RCV_BUFFS - 1; /* receive queue */ - is->last = is->rq; - -#ifdef CONFIG_ISDN_PPP_VJ -/* TODO: if this was the previous master: link the slcomp to the new master */ - slhc_free(is->slcomp); - is->slcomp = NULL; -#endif -#ifdef CONFIG_IPPP_FILTER - if (is->pass_filter) { - bpf_prog_destroy(is->pass_filter); - is->pass_filter = NULL; - } - - if (is->active_filter) { - bpf_prog_destroy(is->active_filter); - is->active_filter = NULL; - } -#endif - -/* TODO: if this was the previous master: link the stuff to the new master */ - if (is->comp_stat) - is->compressor->free(is->comp_stat); - if (is->link_comp_stat) - is->link_compressor->free(is->link_comp_stat); - if (is->link_decomp_stat) - is->link_decompressor->free(is->link_decomp_stat); - if (is->decomp_stat) 
- is->decompressor->free(is->decomp_stat); - is->compressor = is->link_compressor = NULL; - is->decompressor = is->link_decompressor = NULL; - is->comp_stat = is->link_comp_stat = NULL; - is->decomp_stat = is->link_decomp_stat = NULL; - - /* Clean up if necessary */ - if (is->reset) - isdn_ppp_ccp_reset_free(is); - - /* this slot is ready for new connections */ - is->state = 0; -} - -/* - * get_arg .. ioctl helper - */ -static int -get_arg(void __user *b, void *val, int len) -{ - if (len <= 0) - len = sizeof(void *); - if (copy_from_user(val, b, len)) - return -EFAULT; - return 0; -} - -/* - * set arg .. ioctl helper - */ -static int -set_arg(void __user *b, void *val, int len) -{ - if (len <= 0) - len = sizeof(void *); - if (copy_to_user(b, val, len)) - return -EFAULT; - return 0; -} - -#ifdef CONFIG_IPPP_FILTER -static int get_filter(void __user *arg, struct sock_filter **p) -{ - struct sock_fprog uprog; - struct sock_filter *code = NULL; - int len; - - if (copy_from_user(&uprog, arg, sizeof(uprog))) - return -EFAULT; - - if (!uprog.len) { - *p = NULL; - return 0; - } - - /* uprog.len is unsigned short, so no overflow here */ - len = uprog.len * sizeof(struct sock_filter); - code = memdup_user(uprog.filter, len); - if (IS_ERR(code)) - return PTR_ERR(code); - - *p = code; - return uprog.len; -} -#endif /* CONFIG_IPPP_FILTER */ - -/* - * ippp device ioctl - */ -int -isdn_ppp_ioctl(int min, struct file *file, unsigned int cmd, unsigned long arg) -{ - unsigned long val; - int r, i, j; - struct ippp_struct *is; - isdn_net_local *lp; - struct isdn_ppp_comp_data data; - void __user *argp = (void __user *)arg; - - is = file->private_data; - lp = is->lp; - - if (is->debug & 0x1) - printk(KERN_DEBUG "isdn_ppp_ioctl: minor: %d cmd: %x state: %x\n", min, cmd, is->state); - - if (!(is->state & IPPP_OPEN)) - return -EINVAL; - - switch (cmd) { - case PPPIOCBUNDLE: -#ifdef CONFIG_ISDN_MPP - if (!(is->state & IPPP_CONNECT)) - return -EINVAL; - if ((r = get_arg(argp, &val, 
sizeof(val)))) - return r; - printk(KERN_DEBUG "iPPP-bundle: minor: %d, slave unit: %d, master unit: %d\n", - (int) min, (int) is->unit, (int) val); - return isdn_ppp_bundle(is, val); -#else - return -1; -#endif - break; - case PPPIOCGUNIT: /* get ppp/isdn unit number */ - if ((r = set_arg(argp, &is->unit, sizeof(is->unit)))) - return r; - break; - case PPPIOCGIFNAME: - if (!lp) - return -EINVAL; - if ((r = set_arg(argp, lp->netdev->dev->name, - strlen(lp->netdev->dev->name)))) - return r; - break; - case PPPIOCGMPFLAGS: /* get configuration flags */ - if ((r = set_arg(argp, &is->mpppcfg, sizeof(is->mpppcfg)))) - return r; - break; - case PPPIOCSMPFLAGS: /* set configuration flags */ - if ((r = get_arg(argp, &val, sizeof(val)))) - return r; - is->mpppcfg = val; - break; - case PPPIOCGFLAGS: /* get configuration flags */ - if ((r = set_arg(argp, &is->pppcfg, sizeof(is->pppcfg)))) - return r; - break; - case PPPIOCSFLAGS: /* set configuration flags */ - if ((r = get_arg(argp, &val, sizeof(val)))) { - return r; - } - if (val & SC_ENABLE_IP && !(is->pppcfg & SC_ENABLE_IP) && (is->state & IPPP_CONNECT)) { - if (lp) { - /* OK .. we are ready to send buffers */ - is->pppcfg = val; /* isdn_ppp_xmit test for SC_ENABLE_IP !!! 
*/ - netif_wake_queue(lp->netdev->dev); - break; - } - } - is->pppcfg = val; - break; - case PPPIOCGIDLE: /* get idle time information */ - if (lp) { - struct ppp_idle pidle; - pidle.xmit_idle = pidle.recv_idle = lp->huptimer; - if ((r = set_arg(argp, &pidle, sizeof(struct ppp_idle)))) - return r; - } - break; - case PPPIOCSMRU: /* set receive unit size for PPP */ - if ((r = get_arg(argp, &val, sizeof(val)))) - return r; - is->mru = val; - break; - case PPPIOCSMPMRU: - break; - case PPPIOCSMPMTU: - break; - case PPPIOCSMAXCID: /* set the maximum compression slot id */ - if ((r = get_arg(argp, &val, sizeof(val)))) - return r; - val++; - if (is->maxcid != val) { -#ifdef CONFIG_ISDN_PPP_VJ - struct slcompress *sltmp; -#endif - if (is->debug & 0x1) - printk(KERN_DEBUG "ippp, ioctl: changed MAXCID to %ld\n", val); - is->maxcid = val; -#ifdef CONFIG_ISDN_PPP_VJ - sltmp = slhc_init(16, val); - if (IS_ERR(sltmp)) - return PTR_ERR(sltmp); - if (is->slcomp) - slhc_free(is->slcomp); - is->slcomp = sltmp; -#endif - } - break; - case PPPIOCGDEBUG: - if ((r = set_arg(argp, &is->debug, sizeof(is->debug)))) - return r; - break; - case PPPIOCSDEBUG: - if ((r = get_arg(argp, &val, sizeof(val)))) - return r; - is->debug = val; - break; - case PPPIOCGCOMPRESSORS: - { - unsigned long protos[8] = {0,}; - struct isdn_ppp_compressor *ipc = ipc_head; - while (ipc) { - j = ipc->num / (sizeof(long) * 8); - i = ipc->num % (sizeof(long) * 8); - if (j < 8) - protos[j] |= (1UL << i); - ipc = ipc->next; - } - if ((r = set_arg(argp, protos, 8 * sizeof(long)))) - return r; - } - break; - case PPPIOCSCOMPRESSOR: - if ((r = get_arg(argp, &data, sizeof(struct isdn_ppp_comp_data)))) - return r; - return isdn_ppp_set_compressor(is, &data); - case PPPIOCGCALLINFO: - { - struct pppcallinfo pci; - memset((char *)&pci, 0, sizeof(struct pppcallinfo)); - if (lp) - { - strncpy(pci.local_num, lp->msn, 63); - if (lp->dial) { - strncpy(pci.remote_num, lp->dial->num, 63); - } - pci.charge_units = lp->charge; - if 
(lp->outgoing) - pci.calltype = CALLTYPE_OUTGOING; - else - pci.calltype = CALLTYPE_INCOMING; - if (lp->flags & ISDN_NET_CALLBACK) - pci.calltype |= CALLTYPE_CALLBACK; - } - return set_arg(argp, &pci, sizeof(struct pppcallinfo)); - } -#ifdef CONFIG_IPPP_FILTER - case PPPIOCSPASS: - { - struct sock_fprog_kern fprog; - struct sock_filter *code; - int err, len = get_filter(argp, &code); - - if (len < 0) - return len; - - fprog.len = len; - fprog.filter = code; - - if (is->pass_filter) { - bpf_prog_destroy(is->pass_filter); - is->pass_filter = NULL; - } - if (fprog.filter != NULL) - err = bpf_prog_create(&is->pass_filter, &fprog); - else - err = 0; - kfree(code); - - return err; - } - case PPPIOCSACTIVE: - { - struct sock_fprog_kern fprog; - struct sock_filter *code; - int err, len = get_filter(argp, &code); - - if (len < 0) - return len; - - fprog.len = len; - fprog.filter = code; - - if (is->active_filter) { - bpf_prog_destroy(is->active_filter); - is->active_filter = NULL; - } - if (fprog.filter != NULL) - err = bpf_prog_create(&is->active_filter, &fprog); - else - err = 0; - kfree(code); - - return err; - } -#endif /* CONFIG_IPPP_FILTER */ - default: - break; - } - return 0; -} - -__poll_t -isdn_ppp_poll(struct file *file, poll_table *wait) -{ - __poll_t mask; - struct ippp_buf_queue *bf, *bl; - u_long flags; - struct ippp_struct *is; - - is = file->private_data; - - if (is->debug & 0x2) - printk(KERN_DEBUG "isdn_ppp_poll: minor: %d\n", - iminor(file_inode(file))); - - /* just registers wait_queue hook. This doesn't really wait. */ - poll_wait(file, &is->wq, wait); - - if (!(is->state & IPPP_OPEN)) { - if (is->state == IPPP_CLOSEWAIT) - return EPOLLHUP; - printk(KERN_DEBUG "isdn_ppp: device not open\n"); - return EPOLLERR; - } - /* we're always ready to send .. 
*/ - mask = EPOLLOUT | EPOLLWRNORM; - - spin_lock_irqsave(&is->buflock, flags); - bl = is->last; - bf = is->first; - /* - * if IPPP_NOBLOCK is set we return even if we have nothing to read - */ - if (bf->next != bl || (is->state & IPPP_NOBLOCK)) { - is->state &= ~IPPP_NOBLOCK; - mask |= EPOLLIN | EPOLLRDNORM; - } - spin_unlock_irqrestore(&is->buflock, flags); - return mask; -} - -/* - * fill up isdn_ppp_read() queue .. - */ - -static int -isdn_ppp_fill_rq(unsigned char *buf, int len, int proto, int slot) -{ - struct ippp_buf_queue *bf, *bl; - u_long flags; - u_char *nbuf; - struct ippp_struct *is; - - if (slot < 0 || slot >= ISDN_MAX_CHANNELS) { - printk(KERN_WARNING "ippp: illegal slot(%d).\n", slot); - return 0; - } - is = ippp_table[slot]; - - if (!(is->state & IPPP_CONNECT)) { - printk(KERN_DEBUG "ippp: device not activated.\n"); - return 0; - } - nbuf = kmalloc(len + 4, GFP_ATOMIC); - if (!nbuf) { - printk(KERN_WARNING "ippp: Can't alloc buf\n"); - return 0; - } - nbuf[0] = PPP_ALLSTATIONS; - nbuf[1] = PPP_UI; - nbuf[2] = proto >> 8; - nbuf[3] = proto & 0xff; - memcpy(nbuf + 4, buf, len); - - spin_lock_irqsave(&is->buflock, flags); - bf = is->first; - bl = is->last; - - if (bf == bl) { - printk(KERN_WARNING "ippp: Queue is full; discarding first buffer\n"); - bf = bf->next; - kfree(bf->buf); - is->first = bf; - } - bl->buf = (char *) nbuf; - bl->len = len + 4; - - is->last = bl->next; - spin_unlock_irqrestore(&is->buflock, flags); - wake_up_interruptible(&is->wq); - return len; -} - -/* - * read() .. 
non-blocking: ipppd calls it only after select() - * reports, that there is data - */ - -int -isdn_ppp_read(int min, struct file *file, char __user *buf, int count) -{ - struct ippp_struct *is; - struct ippp_buf_queue *b; - u_long flags; - u_char *save_buf; - - is = file->private_data; - - if (!(is->state & IPPP_OPEN)) - return 0; - - spin_lock_irqsave(&is->buflock, flags); - b = is->first->next; - save_buf = b->buf; - if (!save_buf) { - spin_unlock_irqrestore(&is->buflock, flags); - return -EAGAIN; - } - if (b->len < count) - count = b->len; - b->buf = NULL; - is->first = b; - - spin_unlock_irqrestore(&is->buflock, flags); - if (copy_to_user(buf, save_buf, count)) - count = -EFAULT; - kfree(save_buf); - - return count; -} - -/* - * ipppd wanna write a packet to the card .. non-blocking - */ - -int -isdn_ppp_write(int min, struct file *file, const char __user *buf, int count) -{ - isdn_net_local *lp; - struct ippp_struct *is; - int proto; - - is = file->private_data; - - if (!(is->state & IPPP_CONNECT)) - return 0; - - lp = is->lp; - - /* -> push it directly to the lowlevel interface */ - - if (!lp) - printk(KERN_DEBUG "isdn_ppp_write: lp == NULL\n"); - else { - if (lp->isdn_device < 0 || lp->isdn_channel < 0) { - unsigned char protobuf[4]; - /* - * Don't reset huptimer for - * LCP packets. (Echo requests). - */ - if (copy_from_user(protobuf, buf, 4)) - return -EFAULT; - - proto = PPP_PROTOCOL(protobuf); - if (proto != PPP_LCP) - lp->huptimer = 0; - - return 0; - } - - if ((dev->drv[lp->isdn_device]->flags & DRV_FLAG_RUNNING) && - lp->dialstate == 0 && - (lp->flags & ISDN_NET_CONNECTED)) { - unsigned short hl; - struct sk_buff *skb; - unsigned char *cpy_buf; - /* - * we need to reserve enough space in front of - * sk_buff. 
old call to dev_alloc_skb only reserved - * 16 bytes, now we are looking what the driver want - */ - hl = dev->drv[lp->isdn_device]->interface->hl_hdrlen; - skb = alloc_skb(hl + count, GFP_ATOMIC); - if (!skb) { - printk(KERN_WARNING "isdn_ppp_write: out of memory!\n"); - return count; - } - skb_reserve(skb, hl); - cpy_buf = skb_put(skb, count); - if (copy_from_user(cpy_buf, buf, count)) - { - kfree_skb(skb); - return -EFAULT; - } - - /* - * Don't reset huptimer for - * LCP packets. (Echo requests). - */ - proto = PPP_PROTOCOL(cpy_buf); - if (proto != PPP_LCP) - lp->huptimer = 0; - - if (is->debug & 0x40) { - printk(KERN_DEBUG "ppp xmit: len %d\n", (int) skb->len); - isdn_ppp_frame_log("xmit", skb->data, skb->len, 32, is->unit, lp->ppp_slot); - } - - isdn_ppp_send_ccp(lp->netdev, lp, skb); /* keeps CCP/compression states in sync */ - - isdn_net_write_super(lp, skb); - } - } - return count; -} - -/* - * init memory, structures etc. - */ - -int -isdn_ppp_init(void) -{ - int i, - j; - -#ifdef CONFIG_ISDN_MPP - if (isdn_ppp_mp_bundle_array_init() < 0) - return -ENOMEM; -#endif /* CONFIG_ISDN_MPP */ - - for (i = 0; i < ISDN_MAX_CHANNELS; i++) { - if (!(ippp_table[i] = kzalloc(sizeof(struct ippp_struct), GFP_KERNEL))) { - printk(KERN_WARNING "isdn_ppp_init: Could not alloc ippp_table\n"); - for (j = 0; j < i; j++) - kfree(ippp_table[j]); - return -1; - } - spin_lock_init(&ippp_table[i]->buflock); - ippp_table[i]->state = 0; - ippp_table[i]->first = ippp_table[i]->rq + NUM_RCV_BUFFS - 1; - ippp_table[i]->last = ippp_table[i]->rq; - - for (j = 0; j < NUM_RCV_BUFFS; j++) { - ippp_table[i]->rq[j].buf = NULL; - ippp_table[i]->rq[j].last = ippp_table[i]->rq + - (NUM_RCV_BUFFS + j - 1) % NUM_RCV_BUFFS; - ippp_table[i]->rq[j].next = ippp_table[i]->rq + (j + 1) % NUM_RCV_BUFFS; - } - } - return 0; -} - -void -isdn_ppp_cleanup(void) -{ - int i; - - for (i = 0; i < ISDN_MAX_CHANNELS; i++) - kfree(ippp_table[i]); - -#ifdef CONFIG_ISDN_MPP - kfree(isdn_ppp_bundle_arr); -#endif /* 
CONFIG_ISDN_MPP */ - -} - -/* - * check for address/control field and skip if allowed - * retval != 0 -> discard packet silently - */ -static int isdn_ppp_skip_ac(struct ippp_struct *is, struct sk_buff *skb) -{ - if (skb->len < 1) - return -1; - - if (skb->data[0] == 0xff) { - if (skb->len < 2) - return -1; - - if (skb->data[1] != 0x03) - return -1; - - // skip address/control (AC) field - skb_pull(skb, 2); - } else { - if (is->pppcfg & SC_REJ_COMP_AC) - // if AC compression was not negotiated, but used, discard packet - return -1; - } - return 0; -} - -/* - * get the PPP protocol header and pull skb - * retval < 0 -> discard packet silently - */ -static int isdn_ppp_strip_proto(struct sk_buff *skb) -{ - int proto; - - if (skb->len < 1) - return -1; - - if (skb->data[0] & 0x1) { - // protocol field is compressed - proto = skb->data[0]; - skb_pull(skb, 1); - } else { - if (skb->len < 2) - return -1; - proto = ((int) skb->data[0] << 8) + skb->data[1]; - skb_pull(skb, 2); - } - return proto; -} - - -/* - * handler for incoming packets on a syncPPP interface - */ -void isdn_ppp_receive(isdn_net_dev *net_dev, isdn_net_local *lp, struct sk_buff *skb) -{ - struct ippp_struct *is; - int slot; - int proto; - - BUG_ON(net_dev->local->master); // we're called with the master device always - - slot = lp->ppp_slot; - if (slot < 0 || slot >= ISDN_MAX_CHANNELS) { - printk(KERN_ERR "isdn_ppp_receive: lp->ppp_slot(%d)\n", - lp->ppp_slot); - kfree_skb(skb); - return; - } - is = ippp_table[slot]; - - if (is->debug & 0x4) { - printk(KERN_DEBUG "ippp_receive: is:%08lx lp:%08lx slot:%d unit:%d len:%d\n", - (long)is, (long)lp, lp->ppp_slot, is->unit, (int)skb->len); - isdn_ppp_frame_log("receive", skb->data, skb->len, 32, is->unit, lp->ppp_slot); - } - - if (isdn_ppp_skip_ac(is, skb) < 0) { - kfree_skb(skb); - return; - } - proto = isdn_ppp_strip_proto(skb); - if (proto < 0) { - kfree_skb(skb); - return; - } - -#ifdef CONFIG_ISDN_MPP - if (is->compflags & SC_LINK_DECOMP_ON) { - skb = 
isdn_ppp_decompress(skb, is, NULL, &proto); - if (!skb) // decompression error - return; - } - - if (!(is->mpppcfg & SC_REJ_MP_PROT)) { // we agreed to receive MPPP - if (proto == PPP_MP) { - isdn_ppp_mp_receive(net_dev, lp, skb); - return; - } - } -#endif - isdn_ppp_push_higher(net_dev, lp, skb, proto); -} - -/* - * we receive a reassembled frame, MPPP has been taken care of before. - * address/control and protocol have been stripped from the skb - * note: net_dev has to be master net_dev - */ -static void -isdn_ppp_push_higher(isdn_net_dev *net_dev, isdn_net_local *lp, struct sk_buff *skb, int proto) -{ - struct net_device *dev = net_dev->dev; - struct ippp_struct *is, *mis; - isdn_net_local *mlp = NULL; - int slot; - - slot = lp->ppp_slot; - if (slot < 0 || slot >= ISDN_MAX_CHANNELS) { - printk(KERN_ERR "isdn_ppp_push_higher: lp->ppp_slot(%d)\n", - lp->ppp_slot); - goto drop_packet; - } - is = ippp_table[slot]; - - if (lp->master) { // FIXME? - mlp = ISDN_MASTER_PRIV(lp); - slot = mlp->ppp_slot; - if (slot < 0 || slot >= ISDN_MAX_CHANNELS) { - printk(KERN_ERR "isdn_ppp_push_higher: master->ppp_slot(%d)\n", - lp->ppp_slot); - goto drop_packet; - } - } - mis = ippp_table[slot]; - - if (is->debug & 0x10) { - printk(KERN_DEBUG "push, skb %d %04x\n", (int) skb->len, proto); - isdn_ppp_frame_log("rpush", skb->data, skb->len, 32, is->unit, lp->ppp_slot); - } - if (mis->compflags & SC_DECOMP_ON) { - skb = isdn_ppp_decompress(skb, is, mis, &proto); - if (!skb) // decompression error - return; - } - switch (proto) { - case PPP_IPX: /* untested */ - if (is->debug & 0x20) - printk(KERN_DEBUG "isdn_ppp: IPX\n"); - skb->protocol = htons(ETH_P_IPX); - break; - case PPP_IP: - if (is->debug & 0x20) - printk(KERN_DEBUG "isdn_ppp: IP\n"); - skb->protocol = htons(ETH_P_IP); - break; - case PPP_COMP: - case PPP_COMPFRAG: - printk(KERN_INFO "isdn_ppp: unexpected compressed frame dropped\n"); - goto drop_packet; -#ifdef CONFIG_ISDN_PPP_VJ - case PPP_VJC_UNCOMP: - if (is->debug & 0x20) 
- printk(KERN_DEBUG "isdn_ppp: VJC_UNCOMP\n"); - if (net_dev->local->ppp_slot < 0) { - printk(KERN_ERR "%s: net_dev->local->ppp_slot(%d) out of range\n", - __func__, net_dev->local->ppp_slot); - goto drop_packet; - } - if (slhc_remember(ippp_table[net_dev->local->ppp_slot]->slcomp, skb->data, skb->len) <= 0) { - printk(KERN_WARNING "isdn_ppp: received illegal VJC_UNCOMP frame!\n"); - goto drop_packet; - } - skb->protocol = htons(ETH_P_IP); - break; - case PPP_VJC_COMP: - if (is->debug & 0x20) - printk(KERN_DEBUG "isdn_ppp: VJC_COMP\n"); - { - struct sk_buff *skb_old = skb; - int pkt_len; - skb = dev_alloc_skb(skb_old->len + 128); - - if (!skb) { - printk(KERN_WARNING "%s: Memory squeeze, dropping packet.\n", dev->name); - skb = skb_old; - goto drop_packet; - } - skb_put(skb, skb_old->len + 128); - skb_copy_from_linear_data(skb_old, skb->data, - skb_old->len); - if (net_dev->local->ppp_slot < 0) { - printk(KERN_ERR "%s: net_dev->local->ppp_slot(%d) out of range\n", - __func__, net_dev->local->ppp_slot); - goto drop_packet; - } - pkt_len = slhc_uncompress(ippp_table[net_dev->local->ppp_slot]->slcomp, - skb->data, skb_old->len); - kfree_skb(skb_old); - if (pkt_len < 0) - goto drop_packet; - - skb_trim(skb, pkt_len); - skb->protocol = htons(ETH_P_IP); - } - break; -#endif - case PPP_CCP: - case PPP_CCPFRAG: - isdn_ppp_receive_ccp(net_dev, lp, skb, proto); - /* Dont pop up ResetReq/Ack stuff to the daemon any - longer - the job is done already */ - if (skb->data[0] == CCP_RESETREQ || - skb->data[0] == CCP_RESETACK) - break; - /* fall through */ - default: - isdn_ppp_fill_rq(skb->data, skb->len, proto, lp->ppp_slot); /* push data to pppd device */ - kfree_skb(skb); - return; - } - -#ifdef CONFIG_IPPP_FILTER - /* check if the packet passes the pass and active filters - * the filter instructions are constructed assuming - * a four-byte PPP header on each packet (which is still present) */ - skb_push(skb, 4); - - { - u_int16_t *p = (u_int16_t *) skb->data; - - *p = 0; /* 
indicate inbound */ - } - - if (is->pass_filter - && BPF_PROG_RUN(is->pass_filter, skb) == 0) { - if (is->debug & 0x2) - printk(KERN_DEBUG "IPPP: inbound frame filtered.\n"); - kfree_skb(skb); - return; - } - if (!(is->active_filter - && BPF_PROG_RUN(is->active_filter, skb) == 0)) { - if (is->debug & 0x2) - printk(KERN_DEBUG "IPPP: link-active filter: resetting huptimer.\n"); - lp->huptimer = 0; - if (mlp) - mlp->huptimer = 0; - } - skb_pull(skb, 4); -#else /* CONFIG_IPPP_FILTER */ - lp->huptimer = 0; - if (mlp) - mlp->huptimer = 0; -#endif /* CONFIG_IPPP_FILTER */ - skb->dev = dev; - skb_reset_mac_header(skb); - netif_rx(skb); - /* net_dev->local->stats.rx_packets++; done in isdn_net.c */ - return; - -drop_packet: - net_dev->local->stats.rx_dropped++; - kfree_skb(skb); -} - -/* - * isdn_ppp_skb_push .. - * checks whether we have enough space at the beginning of the skb - * and allocs a new SKB if necessary - */ -static unsigned char *isdn_ppp_skb_push(struct sk_buff **skb_p, int len) -{ - struct sk_buff *skb = *skb_p; - - if (skb_headroom(skb) < len) { - struct sk_buff *nskb = skb_realloc_headroom(skb, len); - - if (!nskb) { - printk(KERN_ERR "isdn_ppp_skb_push: can't realloc headroom!\n"); - dev_kfree_skb(skb); - return NULL; - } - printk(KERN_DEBUG "isdn_ppp_skb_push:under %d %d\n", skb_headroom(skb), len); - dev_kfree_skb(skb); - *skb_p = nskb; - return skb_push(nskb, len); - } - return skb_push(skb, len); -} - -/* - * send ppp frame .. we expect a PIDCOMPressable proto -- - * (here: currently always PPP_IP,PPP_VJC_COMP,PPP_VJC_UNCOMP) - * - * VJ compression may change skb pointer!!! .. requeue with old - * skb isn't allowed!! 
- */ - -int -isdn_ppp_xmit(struct sk_buff *skb, struct net_device *netdev) -{ - isdn_net_local *lp, *mlp; - isdn_net_dev *nd; - unsigned int proto = PPP_IP; /* 0x21 */ - struct ippp_struct *ipt, *ipts; - int slot, retval = NETDEV_TX_OK; - - mlp = netdev_priv(netdev); - nd = mlp->netdev; /* get master lp */ - - slot = mlp->ppp_slot; - if (slot < 0 || slot >= ISDN_MAX_CHANNELS) { - printk(KERN_ERR "isdn_ppp_xmit: lp->ppp_slot(%d)\n", - mlp->ppp_slot); - kfree_skb(skb); - goto out; - } - ipts = ippp_table[slot]; - - if (!(ipts->pppcfg & SC_ENABLE_IP)) { /* PPP connected ? */ - if (ipts->debug & 0x1) - printk(KERN_INFO "%s: IP frame delayed.\n", netdev->name); - retval = NETDEV_TX_BUSY; - goto out; - } - - switch (ntohs(skb->protocol)) { - case ETH_P_IP: - proto = PPP_IP; - break; - case ETH_P_IPX: - proto = PPP_IPX; /* untested */ - break; - default: - printk(KERN_ERR "isdn_ppp: skipped unsupported protocol: %#x.\n", - skb->protocol); - dev_kfree_skb(skb); - goto out; - } - - lp = isdn_net_get_locked_lp(nd); - if (!lp) { - printk(KERN_WARNING "%s: all channels busy - requeuing!\n", netdev->name); - retval = NETDEV_TX_BUSY; - goto out; - } - /* we have our lp locked from now on */ - - slot = lp->ppp_slot; - if (slot < 0 || slot >= ISDN_MAX_CHANNELS) { - printk(KERN_ERR "isdn_ppp_xmit: lp->ppp_slot(%d)\n", - lp->ppp_slot); - kfree_skb(skb); - goto unlock; - } - ipt = ippp_table[slot]; - - /* - * after this line .. requeueing in the device queue is no longer allowed!!! - */ - - /* Pull off the fake header we stuck on earlier to keep - * the fragmentation code happy. 
- */ - skb_pull(skb, IPPP_MAX_HEADER); - -#ifdef CONFIG_IPPP_FILTER - /* check if we should pass this packet - * the filter instructions are constructed assuming - * a four-byte PPP header on each packet */ - *(u8 *)skb_push(skb, 4) = 1; /* indicate outbound */ - - { - __be16 *p = (__be16 *)skb->data; - - p++; - *p = htons(proto); - } - - if (ipt->pass_filter - && BPF_PROG_RUN(ipt->pass_filter, skb) == 0) { - if (ipt->debug & 0x4) - printk(KERN_DEBUG "IPPP: outbound frame filtered.\n"); - kfree_skb(skb); - goto unlock; - } - if (!(ipt->active_filter - && BPF_PROG_RUN(ipt->active_filter, skb) == 0)) { - if (ipt->debug & 0x4) - printk(KERN_DEBUG "IPPP: link-active filter: resetting huptimer.\n"); - lp->huptimer = 0; - } - skb_pull(skb, 4); -#else /* CONFIG_IPPP_FILTER */ - lp->huptimer = 0; -#endif /* CONFIG_IPPP_FILTER */ - - if (ipt->debug & 0x4) - printk(KERN_DEBUG "xmit skb, len %d\n", (int) skb->len); - if (ipts->debug & 0x40) - isdn_ppp_frame_log("xmit0", skb->data, skb->len, 32, ipts->unit, lp->ppp_slot); - -#ifdef CONFIG_ISDN_PPP_VJ - if (proto == PPP_IP && ipts->pppcfg & SC_COMP_TCP) { /* ipts here? probably yes, but check this again */ - struct sk_buff *new_skb; - unsigned short hl; - /* - * we need to reserve enough space in front of - * sk_buff. old call to dev_alloc_skb only reserved - * 16 bytes, now we are looking what the driver want. - */ - hl = dev->drv[lp->isdn_device]->interface->hl_hdrlen + IPPP_MAX_HEADER; - /* - * Note: hl might still be insufficient because the method - * above does not account for a possibible MPPP slave channel - * which had larger HL header space requirements than the - * master. 
- */ - new_skb = alloc_skb(hl + skb->len, GFP_ATOMIC); - if (new_skb) { - u_char *buf; - int pktlen; - - skb_reserve(new_skb, hl); - new_skb->dev = skb->dev; - skb_put(new_skb, skb->len); - buf = skb->data; - - pktlen = slhc_compress(ipts->slcomp, skb->data, skb->len, new_skb->data, - &buf, !(ipts->pppcfg & SC_NO_TCP_CCID)); - - if (buf != skb->data) { - if (new_skb->data != buf) - printk(KERN_ERR "isdn_ppp: FATAL error after slhc_compress!!\n"); - dev_kfree_skb(skb); - skb = new_skb; - } else { - dev_kfree_skb(new_skb); - } - - skb_trim(skb, pktlen); - if (skb->data[0] & SL_TYPE_COMPRESSED_TCP) { /* cslip? style -> PPP */ - proto = PPP_VJC_COMP; - skb->data[0] ^= SL_TYPE_COMPRESSED_TCP; - } else { - if (skb->data[0] >= SL_TYPE_UNCOMPRESSED_TCP) - proto = PPP_VJC_UNCOMP; - skb->data[0] = (skb->data[0] & 0x0f) | 0x40; - } - } - } -#endif - - /* - * normal (single link) or bundle compression - */ - if (ipts->compflags & SC_COMP_ON) { - /* We send compressed only if both down- und upstream - compression is negotiated, that means, CCP is up */ - if (ipts->compflags & SC_DECOMP_ON) { - skb = isdn_ppp_compress(skb, &proto, ipt, ipts, 0); - } else { - printk(KERN_DEBUG "isdn_ppp: CCP not yet up - sending as-is\n"); - } - } - - if (ipt->debug & 0x24) - printk(KERN_DEBUG "xmit2 skb, len %d, proto %04x\n", (int) skb->len, proto); - -#ifdef CONFIG_ISDN_MPP - if (ipt->mpppcfg & SC_MP_PROT) { - /* we get mp_seqno from static isdn_net_local */ - long mp_seqno = ipts->mp_seqno; - ipts->mp_seqno++; - if (ipt->mpppcfg & SC_OUT_SHORT_SEQ) { - unsigned char *data = isdn_ppp_skb_push(&skb, 3); - if (!data) - goto unlock; - mp_seqno &= 0xfff; - data[0] = MP_BEGIN_FRAG | MP_END_FRAG | ((mp_seqno >> 8) & 0xf); /* (B)egin & (E)ndbit .. */ - data[1] = mp_seqno & 0xff; - data[2] = proto; /* PID compression */ - } else { - unsigned char *data = isdn_ppp_skb_push(&skb, 5); - if (!data) - goto unlock; - data[0] = MP_BEGIN_FRAG | MP_END_FRAG; /* (B)egin & (E)ndbit .. 
*/ - data[1] = (mp_seqno >> 16) & 0xff; /* sequence number: 24bit */ - data[2] = (mp_seqno >> 8) & 0xff; - data[3] = (mp_seqno >> 0) & 0xff; - data[4] = proto; /* PID compression */ - } - proto = PPP_MP; /* MP Protocol, 0x003d */ - } -#endif - - /* - * 'link in bundle' compression ... - */ - if (ipt->compflags & SC_LINK_COMP_ON) - skb = isdn_ppp_compress(skb, &proto, ipt, ipts, 1); - - if ((ipt->pppcfg & SC_COMP_PROT) && (proto <= 0xff)) { - unsigned char *data = isdn_ppp_skb_push(&skb, 1); - if (!data) - goto unlock; - data[0] = proto & 0xff; - } - else { - unsigned char *data = isdn_ppp_skb_push(&skb, 2); - if (!data) - goto unlock; - data[0] = (proto >> 8) & 0xff; - data[1] = proto & 0xff; - } - if (!(ipt->pppcfg & SC_COMP_AC)) { - unsigned char *data = isdn_ppp_skb_push(&skb, 2); - if (!data) - goto unlock; - data[0] = 0xff; /* All Stations */ - data[1] = 0x03; /* Unnumbered information */ - } - - /* tx-stats are now updated via BSENT-callback */ - - if (ipts->debug & 0x40) { - printk(KERN_DEBUG "skb xmit: len: %d\n", (int) skb->len); - isdn_ppp_frame_log("xmit", skb->data, skb->len, 32, ipt->unit, lp->ppp_slot); - } - - isdn_net_writebuf_skb(lp, skb); - -unlock: - spin_unlock_bh(&lp->xmit_lock); -out: - return retval; -} - -#ifdef CONFIG_IPPP_FILTER -/* - * check if this packet may trigger auto-dial. - */ - -int isdn_ppp_autodial_filter(struct sk_buff *skb, isdn_net_local *lp) -{ - struct ippp_struct *is = ippp_table[lp->ppp_slot]; - u_int16_t proto; - int drop = 0; - - switch (ntohs(skb->protocol)) { - case ETH_P_IP: - proto = PPP_IP; - break; - case ETH_P_IPX: - proto = PPP_IPX; - break; - default: - printk(KERN_ERR "isdn_ppp_autodial_filter: unsupported protocol 0x%x.\n", - skb->protocol); - return 1; - } - - /* the filter instructions are constructed assuming - * a four-byte PPP header on each packet. we have to - * temporarily remove part of the fake header stuck on - * earlier. 
- */ - *(u8 *)skb_pull(skb, IPPP_MAX_HEADER - 4) = 1; /* indicate outbound */ - - { - __be16 *p = (__be16 *)skb->data; - - p++; - *p = htons(proto); - } - - drop |= is->pass_filter - && BPF_PROG_RUN(is->pass_filter, skb) == 0; - drop |= is->active_filter - && BPF_PROG_RUN(is->active_filter, skb) == 0; - - skb_push(skb, IPPP_MAX_HEADER - 4); - return drop; -} -#endif -#ifdef CONFIG_ISDN_MPP - -/* this is _not_ rfc1990 header, but something we convert both short and long - * headers to for convinience's sake: - * byte 0 is flags as in rfc1990 - * bytes 1...4 is 24-bit seqence number converted to host byte order - */ -#define MP_HEADER_LEN 5 - -#define MP_LONGSEQ_MASK 0x00ffffff -#define MP_SHORTSEQ_MASK 0x00000fff -#define MP_LONGSEQ_MAX MP_LONGSEQ_MASK -#define MP_SHORTSEQ_MAX MP_SHORTSEQ_MASK -#define MP_LONGSEQ_MAXBIT ((MP_LONGSEQ_MASK + 1) >> 1) -#define MP_SHORTSEQ_MAXBIT ((MP_SHORTSEQ_MASK + 1) >> 1) - -/* sequence-wrap safe comparisons (for long sequence)*/ -#define MP_LT(a, b) ((a - b) & MP_LONGSEQ_MAXBIT) -#define MP_LE(a, b) !((b - a) & MP_LONGSEQ_MAXBIT) -#define MP_GT(a, b) ((b - a) & MP_LONGSEQ_MAXBIT) -#define MP_GE(a, b) !((a - b) & MP_LONGSEQ_MAXBIT) - -#define MP_SEQ(f) ((*(u32 *)(f->data + 1))) -#define MP_FLAGS(f) (f->data[0]) - -static int isdn_ppp_mp_bundle_array_init(void) -{ - int i; - int sz = ISDN_MAX_CHANNELS * sizeof(ippp_bundle); - if ((isdn_ppp_bundle_arr = kzalloc(sz, GFP_KERNEL)) == NULL) - return -ENOMEM; - for (i = 0; i < ISDN_MAX_CHANNELS; i++) - spin_lock_init(&isdn_ppp_bundle_arr[i].lock); - return 0; -} - -static ippp_bundle *isdn_ppp_mp_bundle_alloc(void) -{ - int i; - for (i = 0; i < ISDN_MAX_CHANNELS; i++) - if (isdn_ppp_bundle_arr[i].ref_ct <= 0) - return (isdn_ppp_bundle_arr + i); - return NULL; -} - -static int isdn_ppp_mp_init(isdn_net_local *lp, ippp_bundle *add_to) -{ - struct ippp_struct *is; - - if (lp->ppp_slot < 0) { - printk(KERN_ERR "%s: lp->ppp_slot(%d) out of range\n", - __func__, lp->ppp_slot); - return 
(-EINVAL); - } - - is = ippp_table[lp->ppp_slot]; - if (add_to) { - if (lp->netdev->pb) - lp->netdev->pb->ref_ct--; - lp->netdev->pb = add_to; - } else { /* first link in a bundle */ - is->mp_seqno = 0; - if ((lp->netdev->pb = isdn_ppp_mp_bundle_alloc()) == NULL) - return -ENOMEM; - lp->next = lp->last = lp; /* nobody else in a queue */ - lp->netdev->pb->frags = NULL; - lp->netdev->pb->frames = 0; - lp->netdev->pb->seq = UINT_MAX; - } - lp->netdev->pb->ref_ct++; - - is->last_link_seqno = 0; - return 0; -} - -static u32 isdn_ppp_mp_get_seq(int short_seq, - struct sk_buff *skb, u32 last_seq); -static struct sk_buff *isdn_ppp_mp_discard(ippp_bundle *mp, - struct sk_buff *from, struct sk_buff *to); -static void isdn_ppp_mp_reassembly(isdn_net_dev *net_dev, isdn_net_local *lp, - struct sk_buff *from, struct sk_buff *to); -static void isdn_ppp_mp_free_skb(ippp_bundle *mp, struct sk_buff *skb); -static void isdn_ppp_mp_print_recv_pkt(int slot, struct sk_buff *skb); - -static void isdn_ppp_mp_receive(isdn_net_dev *net_dev, isdn_net_local *lp, - struct sk_buff *skb) -{ - struct ippp_struct *is; - isdn_net_local *lpq; - ippp_bundle *mp; - isdn_mppp_stats *stats; - struct sk_buff *newfrag, *frag, *start, *nextf; - u32 newseq, minseq, thisseq; - unsigned long flags; - int slot; - - spin_lock_irqsave(&net_dev->pb->lock, flags); - mp = net_dev->pb; - stats = &mp->stats; - slot = lp->ppp_slot; - if (slot < 0 || slot >= ISDN_MAX_CHANNELS) { - printk(KERN_ERR "%s: lp->ppp_slot(%d)\n", - __func__, lp->ppp_slot); - stats->frame_drops++; - dev_kfree_skb(skb); - spin_unlock_irqrestore(&mp->lock, flags); - return; - } - is = ippp_table[slot]; - if (++mp->frames > stats->max_queue_len) - stats->max_queue_len = mp->frames; - - if (is->debug & 0x8) - isdn_ppp_mp_print_recv_pkt(lp->ppp_slot, skb); - - newseq = isdn_ppp_mp_get_seq(is->mpppcfg & SC_IN_SHORT_SEQ, - skb, is->last_link_seqno); - - - /* if this packet seq # is less than last already processed one, - * toss it right away, but 
check for sequence start case first - */ - if (mp->seq > MP_LONGSEQ_MAX && (newseq & MP_LONGSEQ_MAXBIT)) { - mp->seq = newseq; /* the first packet: required for - * rfc1990 non-compliant clients -- - * prevents constant packet toss */ - } else if (MP_LT(newseq, mp->seq)) { - stats->frame_drops++; - isdn_ppp_mp_free_skb(mp, skb); - spin_unlock_irqrestore(&mp->lock, flags); - return; - } - - /* find the minimum received sequence number over all links */ - is->last_link_seqno = minseq = newseq; - for (lpq = net_dev->queue;;) { - slot = lpq->ppp_slot; - if (slot < 0 || slot >= ISDN_MAX_CHANNELS) { - printk(KERN_ERR "%s: lpq->ppp_slot(%d)\n", - __func__, lpq->ppp_slot); - } else { - u32 lls = ippp_table[slot]->last_link_seqno; - if (MP_LT(lls, minseq)) - minseq = lls; - } - if ((lpq = lpq->next) == net_dev->queue) - break; - } - if (MP_LT(minseq, mp->seq)) - minseq = mp->seq; /* can't go beyond already processed - * packets */ - newfrag = skb; - - /* if this new fragment is before the first one, then enqueue it now. */ - if ((frag = mp->frags) == NULL || MP_LT(newseq, MP_SEQ(frag))) { - newfrag->next = frag; - mp->frags = frag = newfrag; - newfrag = NULL; - } - - start = MP_FLAGS(frag) & MP_BEGIN_FRAG && - MP_SEQ(frag) == mp->seq ? 
frag : NULL; - - /* - * main fragment traversing loop - * - * try to accomplish several tasks: - * - insert new fragment into the proper sequence slot (once that's done - * newfrag will be set to NULL) - * - reassemble any complete fragment sequence (non-null 'start' - * indicates there is a contiguous sequence present) - * - discard any incomplete sequences that are below minseq -- due - * to the fact that sender always increment sequence number, if there - * is an incomplete sequence below minseq, no new fragments would - * come to complete such sequence and it should be discarded - * - * loop completes when we accomplished the following tasks: - * - new fragment is inserted in the proper sequence ('newfrag' is - * set to NULL) - * - we hit a gap in the sequence, so no reassembly/processing is - * possible ('start' would be set to NULL) - * - * algorithm for this code is derived from code in the book - * 'PPP Design And Debugging' by James Carlson (Addison-Wesley) - */ - while (start != NULL || newfrag != NULL) { - - thisseq = MP_SEQ(frag); - nextf = frag->next; - - /* drop any duplicate fragments */ - if (newfrag != NULL && thisseq == newseq) { - isdn_ppp_mp_free_skb(mp, newfrag); - newfrag = NULL; - } - - /* insert new fragment before next element if possible. 
*/ - if (newfrag != NULL && (nextf == NULL || - MP_LT(newseq, MP_SEQ(nextf)))) { - newfrag->next = nextf; - frag->next = nextf = newfrag; - newfrag = NULL; - } - - if (start != NULL) { - /* check for misplaced start */ - if (start != frag && (MP_FLAGS(frag) & MP_BEGIN_FRAG)) { - printk(KERN_WARNING"isdn_mppp(seq %d): new " - "BEGIN flag with no prior END", thisseq); - stats->seqerrs++; - stats->frame_drops++; - start = isdn_ppp_mp_discard(mp, start, frag); - nextf = frag->next; - } - } else if (MP_LE(thisseq, minseq)) { - if (MP_FLAGS(frag) & MP_BEGIN_FRAG) - start = frag; - else { - if (MP_FLAGS(frag) & MP_END_FRAG) - stats->frame_drops++; - if (mp->frags == frag) - mp->frags = nextf; - isdn_ppp_mp_free_skb(mp, frag); - frag = nextf; - continue; - } - } - - /* if start is non-null and we have end fragment, then - * we have full reassembly sequence -- reassemble - * and process packet now - */ - if (start != NULL && (MP_FLAGS(frag) & MP_END_FRAG)) { - minseq = mp->seq = (thisseq + 1) & MP_LONGSEQ_MASK; - /* Reassemble the packet then dispatch it */ - isdn_ppp_mp_reassembly(net_dev, lp, start, nextf); - - start = NULL; - frag = NULL; - - mp->frags = nextf; - } - - /* check if need to update start pointer: if we just - * reassembled the packet and sequence is contiguous - * then next fragment should be the start of new reassembly - * if sequence is contiguous, but we haven't reassembled yet, - * keep going. - * if sequence is not contiguous, either clear everything - * below low watermark and set start to the next frag or - * clear start ptr. - */ - if (nextf != NULL && - ((thisseq + 1) & MP_LONGSEQ_MASK) == MP_SEQ(nextf)) { - /* if we just reassembled and the next one is here, - * then start another reassembly. 
*/ - - if (frag == NULL) { - if (MP_FLAGS(nextf) & MP_BEGIN_FRAG) - start = nextf; - else - { - printk(KERN_WARNING"isdn_mppp(seq %d):" - " END flag with no following " - "BEGIN", thisseq); - stats->seqerrs++; - } - } - - } else { - if (nextf != NULL && frag != NULL && - MP_LT(thisseq, minseq)) { - /* we've got a break in the sequence - * and we not at the end yet - * and we did not just reassembled - *(if we did, there wouldn't be anything before) - * and we below the low watermark - * discard all the frames below low watermark - * and start over */ - stats->frame_drops++; - mp->frags = isdn_ppp_mp_discard(mp, start, nextf); - } - /* break in the sequence, no reassembly */ - start = NULL; - } - - frag = nextf; - } /* while -- main loop */ - - if (mp->frags == NULL) - mp->frags = frag; - - /* rather straighforward way to deal with (not very) possible - * queue overflow */ - if (mp->frames > MP_MAX_QUEUE_LEN) { - stats->overflows++; - while (mp->frames > MP_MAX_QUEUE_LEN) { - frag = mp->frags->next; - isdn_ppp_mp_free_skb(mp, mp->frags); - mp->frags = frag; - } - } - spin_unlock_irqrestore(&mp->lock, flags); -} - -static void isdn_ppp_mp_cleanup(isdn_net_local *lp) -{ - struct sk_buff *frag = lp->netdev->pb->frags; - struct sk_buff *nextfrag; - while (frag) { - nextfrag = frag->next; - isdn_ppp_mp_free_skb(lp->netdev->pb, frag); - frag = nextfrag; - } - lp->netdev->pb->frags = NULL; -} - -static u32 isdn_ppp_mp_get_seq(int short_seq, - struct sk_buff *skb, u32 last_seq) -{ - u32 seq; - int flags = skb->data[0] & (MP_BEGIN_FRAG | MP_END_FRAG); - - if (!short_seq) - { - seq = ntohl(*(__be32 *)skb->data) & MP_LONGSEQ_MASK; - skb_push(skb, 1); - } - else - { - /* convert 12-bit short seq number to 24-bit long one - */ - seq = ntohs(*(__be16 *)skb->data) & MP_SHORTSEQ_MASK; - - /* check for seqence wrap */ - if (!(seq & MP_SHORTSEQ_MAXBIT) && - (last_seq & MP_SHORTSEQ_MAXBIT) && - (unsigned long)last_seq <= MP_LONGSEQ_MAX) - seq |= (last_seq + MP_SHORTSEQ_MAX + 1) & - 
(~MP_SHORTSEQ_MASK & MP_LONGSEQ_MASK); - else - seq |= last_seq & (~MP_SHORTSEQ_MASK & MP_LONGSEQ_MASK); - - skb_push(skb, 3); /* put converted seqence back in skb */ - } - *(u32 *)(skb->data + 1) = seq; /* put seqence back in _host_ byte - * order */ - skb->data[0] = flags; /* restore flags */ - return seq; -} - -static struct sk_buff *isdn_ppp_mp_discard(ippp_bundle *mp, - struct sk_buff *from, - struct sk_buff *to) -{ - if (from) - while (from != to) { - struct sk_buff *next = from->next; - isdn_ppp_mp_free_skb(mp, from); - from = next; - } - return from; -} - -static void isdn_ppp_mp_reassembly(isdn_net_dev *net_dev, isdn_net_local *lp, - struct sk_buff *from, struct sk_buff *to) -{ - ippp_bundle *mp = net_dev->pb; - int proto; - struct sk_buff *skb; - unsigned int tot_len; - - if (lp->ppp_slot < 0 || lp->ppp_slot >= ISDN_MAX_CHANNELS) { - printk(KERN_ERR "%s: lp->ppp_slot(%d) out of range\n", - __func__, lp->ppp_slot); - return; - } - if (MP_FLAGS(from) == (MP_BEGIN_FRAG | MP_END_FRAG)) { - if (ippp_table[lp->ppp_slot]->debug & 0x40) - printk(KERN_DEBUG "isdn_mppp: reassembly: frame %d, " - "len %d\n", MP_SEQ(from), from->len); - skb = from; - skb_pull(skb, MP_HEADER_LEN); - mp->frames--; - } else { - struct sk_buff *frag; - int n; - - for (tot_len = n = 0, frag = from; frag != to; frag = frag->next, n++) - tot_len += frag->len - MP_HEADER_LEN; - - if (ippp_table[lp->ppp_slot]->debug & 0x40) - printk(KERN_DEBUG"isdn_mppp: reassembling frames %d " - "to %d, len %d\n", MP_SEQ(from), - (MP_SEQ(from) + n - 1) & MP_LONGSEQ_MASK, tot_len); - if ((skb = dev_alloc_skb(tot_len)) == NULL) { - printk(KERN_ERR "isdn_mppp: cannot allocate sk buff " - "of size %d\n", tot_len); - isdn_ppp_mp_discard(mp, from, to); - return; - } - - while (from != to) { - unsigned int len = from->len - MP_HEADER_LEN; - - skb_copy_from_linear_data_offset(from, MP_HEADER_LEN, - skb_put(skb, len), - len); - frag = from->next; - isdn_ppp_mp_free_skb(mp, from); - from = frag; - } - } - proto = 
isdn_ppp_strip_proto(skb); - isdn_ppp_push_higher(net_dev, lp, skb, proto); -} - -static void isdn_ppp_mp_free_skb(ippp_bundle *mp, struct sk_buff *skb) -{ - dev_kfree_skb(skb); - mp->frames--; -} - -static void isdn_ppp_mp_print_recv_pkt(int slot, struct sk_buff *skb) -{ - printk(KERN_DEBUG "mp_recv: %d/%d -> %02x %02x %02x %02x %02x %02x\n", - slot, (int) skb->len, - (int) skb->data[0], (int) skb->data[1], (int) skb->data[2], - (int) skb->data[3], (int) skb->data[4], (int) skb->data[5]); -} - -static int -isdn_ppp_bundle(struct ippp_struct *is, int unit) -{ - char ifn[IFNAMSIZ + 1]; - isdn_net_dev *p; - isdn_net_local *lp, *nlp; - int rc; - unsigned long flags; - - sprintf(ifn, "ippp%d", unit); - p = isdn_net_findif(ifn); - if (!p) { - printk(KERN_ERR "ippp_bundle: cannot find %s\n", ifn); - return -EINVAL; - } - - spin_lock_irqsave(&p->pb->lock, flags); - - nlp = is->lp; - lp = p->queue; - if (nlp->ppp_slot < 0 || nlp->ppp_slot >= ISDN_MAX_CHANNELS || - lp->ppp_slot < 0 || lp->ppp_slot >= ISDN_MAX_CHANNELS) { - printk(KERN_ERR "ippp_bundle: binding to invalid slot %d\n", - nlp->ppp_slot < 0 || nlp->ppp_slot >= ISDN_MAX_CHANNELS ? 
- nlp->ppp_slot : lp->ppp_slot); - rc = -EINVAL; - goto out; - } - - isdn_net_add_to_bundle(p, nlp); - - ippp_table[nlp->ppp_slot]->unit = ippp_table[lp->ppp_slot]->unit; - - /* maybe also SC_CCP stuff */ - ippp_table[nlp->ppp_slot]->pppcfg |= ippp_table[lp->ppp_slot]->pppcfg & - (SC_ENABLE_IP | SC_NO_TCP_CCID | SC_REJ_COMP_TCP); - ippp_table[nlp->ppp_slot]->mpppcfg |= ippp_table[lp->ppp_slot]->mpppcfg & - (SC_MP_PROT | SC_REJ_MP_PROT | SC_OUT_SHORT_SEQ | SC_IN_SHORT_SEQ); - rc = isdn_ppp_mp_init(nlp, p->pb); -out: - spin_unlock_irqrestore(&p->pb->lock, flags); - return rc; -} - -#endif /* CONFIG_ISDN_MPP */ - -/* - * network device ioctl handlers - */ - -static int -isdn_ppp_dev_ioctl_stats(int slot, struct ifreq *ifr, struct net_device *dev) -{ - struct ppp_stats __user *res = ifr->ifr_data; - struct ppp_stats t; - isdn_net_local *lp = netdev_priv(dev); - - /* build a temporary stat struct and copy it to user space */ - - memset(&t, 0, sizeof(struct ppp_stats)); - if (dev->flags & IFF_UP) { - t.p.ppp_ipackets = lp->stats.rx_packets; - t.p.ppp_ibytes = lp->stats.rx_bytes; - t.p.ppp_ierrors = lp->stats.rx_errors; - t.p.ppp_opackets = lp->stats.tx_packets; - t.p.ppp_obytes = lp->stats.tx_bytes; - t.p.ppp_oerrors = lp->stats.tx_errors; -#ifdef CONFIG_ISDN_PPP_VJ - if (slot >= 0 && ippp_table[slot]->slcomp) { - struct slcompress *slcomp = ippp_table[slot]->slcomp; - t.vj.vjs_packets = slcomp->sls_o_compressed + slcomp->sls_o_uncompressed; - t.vj.vjs_compressed = slcomp->sls_o_compressed; - t.vj.vjs_searches = slcomp->sls_o_searches; - t.vj.vjs_misses = slcomp->sls_o_misses; - t.vj.vjs_errorin = slcomp->sls_i_error; - t.vj.vjs_tossed = slcomp->sls_i_tossed; - t.vj.vjs_uncompressedin = slcomp->sls_i_uncompressed; - t.vj.vjs_compressedin = slcomp->sls_i_compressed; - } -#endif - } - if (copy_to_user(res, &t, sizeof(struct ppp_stats))) - return -EFAULT; - return 0; -} - -int -isdn_ppp_dev_ioctl(struct net_device *dev, struct ifreq *ifr, int cmd) -{ - int error = 0; - int 
len; - isdn_net_local *lp = netdev_priv(dev); - - - if (lp->p_encap != ISDN_NET_ENCAP_SYNCPPP) - return -EINVAL; - - switch (cmd) { -#define PPP_VERSION "2.3.7" - case SIOCGPPPVER: - len = strlen(PPP_VERSION) + 1; - if (copy_to_user(ifr->ifr_data, PPP_VERSION, len)) - error = -EFAULT; - break; - - case SIOCGPPPSTATS: - error = isdn_ppp_dev_ioctl_stats(lp->ppp_slot, ifr, dev); - break; - default: - error = -EINVAL; - break; - } - return error; -} - -static int -isdn_ppp_if_get_unit(char *name) -{ - int len, - i, - unit = 0, - deci; - - len = strlen(name); - - if (strncmp("ippp", name, 4) || len > 8) - return -1; - - for (i = 0, deci = 1; i < len; i++, deci *= 10) { - char a = name[len - i - 1]; - if (a >= '0' && a <= '9') - unit += (a - '0') * deci; - else - break; - } - if (!i || len - i != 4) - unit = -1; - - return unit; -} - - -int -isdn_ppp_dial_slave(char *name) -{ -#ifdef CONFIG_ISDN_MPP - isdn_net_dev *ndev; - isdn_net_local *lp; - struct net_device *sdev; - - if (!(ndev = isdn_net_findif(name))) - return 1; - lp = ndev->local; - if (!(lp->flags & ISDN_NET_CONNECTED)) - return 5; - - sdev = lp->slave; - while (sdev) { - isdn_net_local *mlp = netdev_priv(sdev); - if (!(mlp->flags & ISDN_NET_CONNECTED)) - break; - sdev = mlp->slave; - } - if (!sdev) - return 2; - - isdn_net_dial_req(netdev_priv(sdev)); - return 0; -#else - return -1; -#endif -} - -int -isdn_ppp_hangup_slave(char *name) -{ -#ifdef CONFIG_ISDN_MPP - isdn_net_dev *ndev; - isdn_net_local *lp; - struct net_device *sdev; - - if (!(ndev = isdn_net_findif(name))) - return 1; - lp = ndev->local; - if (!(lp->flags & ISDN_NET_CONNECTED)) - return 5; - - sdev = lp->slave; - while (sdev) { - isdn_net_local *mlp = netdev_priv(sdev); - - if (mlp->slave) { /* find last connected link in chain */ - isdn_net_local *nlp = ISDN_SLAVE_PRIV(mlp); - - if (!(nlp->flags & ISDN_NET_CONNECTED)) - break; - } else if (mlp->flags & ISDN_NET_CONNECTED) - break; - - sdev = mlp->slave; - } - if (!sdev) - return 2; - - 
isdn_net_hangup(sdev); - return 0; -#else - return -1; -#endif -} - -/* - * PPP compression stuff - */ - - -/* Push an empty CCP Data Frame up to the daemon to wake it up and let it - generate a CCP Reset-Request or tear down CCP altogether */ - -static void isdn_ppp_ccp_kickup(struct ippp_struct *is) -{ - isdn_ppp_fill_rq(NULL, 0, PPP_COMP, is->lp->ppp_slot); -} - -/* In-kernel handling of CCP Reset-Request and Reset-Ack is necessary, - but absolutely nontrivial. The most abstruse problem we are facing is - that the generation, reception and all the handling of timeouts and - resends including proper request id management should be entirely left - to the (de)compressor, but indeed is not covered by the current API to - the (de)compressor. The API is a prototype version from PPP where only - some (de)compressors have yet been implemented and all of them are - rather simple in their reset handling. Especially, their is only one - outstanding ResetAck at a time with all of them and ResetReq/-Acks do - not have parameters. For this very special case it was sufficient to - just return an error code from the decompressor and have a single - reset() entry to communicate all the necessary information between - the framework and the (de)compressor. Bad enough, LZS is different - (and any other compressor may be different, too). It has multiple - histories (eventually) and needs to Reset each of them independently - and thus uses multiple outstanding Acks and history numbers as an - additional parameter to Reqs/Acks. - All that makes it harder to port the reset state engine into the - kernel because it is not just the same simple one as in (i)pppd but - it must be able to pass additional parameters and have multiple out- - standing Acks. We are trying to achieve the impossible by handling - reset transactions independent by their id. 
The id MUST change when - the data portion changes, thus any (de)compressor who uses more than - one resettable state must provide and recognize individual ids for - each individual reset transaction. The framework itself does _only_ - differentiate them by id, because it has no other semantics like the - (de)compressor might. - This looks like a major redesign of the interface would be nice, - but I don't have an idea how to do it better. */ - -/* Send a CCP Reset-Request or Reset-Ack directly from the kernel. This is - getting that lengthy because there is no simple "send-this-frame-out" - function above but every wrapper does a bit different. Hope I guess - correct in this hack... */ - -static void isdn_ppp_ccp_xmit_reset(struct ippp_struct *is, int proto, - unsigned char code, unsigned char id, - unsigned char *data, int len) -{ - struct sk_buff *skb; - unsigned char *p; - int hl; - int cnt = 0; - isdn_net_local *lp = is->lp; - - /* Alloc large enough skb */ - hl = dev->drv[lp->isdn_device]->interface->hl_hdrlen; - skb = alloc_skb(len + hl + 16, GFP_ATOMIC); - if (!skb) { - printk(KERN_WARNING - "ippp: CCP cannot send reset - out of memory\n"); - return; - } - skb_reserve(skb, hl); - - /* We may need to stuff an address and control field first */ - if (!(is->pppcfg & SC_COMP_AC)) { - p = skb_put(skb, 2); - *p++ = 0xff; - *p++ = 0x03; - } - - /* Stuff proto, code, id and length */ - p = skb_put(skb, 6); - *p++ = (proto >> 8); - *p++ = (proto & 0xff); - *p++ = code; - *p++ = id; - cnt = 4 + len; - *p++ = (cnt >> 8); - *p++ = (cnt & 0xff); - - /* Now stuff remaining bytes */ - if (len) { - skb_put_data(skb, data, len); - } - - /* skb is now ready for xmit */ - printk(KERN_DEBUG "Sending CCP Frame:\n"); - isdn_ppp_frame_log("ccp-xmit", skb->data, skb->len, 32, is->unit, lp->ppp_slot); - - isdn_net_write_super(lp, skb); -} - -/* Allocate the reset state vector */ -static struct ippp_ccp_reset *isdn_ppp_ccp_reset_alloc(struct ippp_struct *is) -{ - struct 
ippp_ccp_reset *r; - r = kzalloc(sizeof(struct ippp_ccp_reset), GFP_KERNEL); - if (!r) { - printk(KERN_ERR "ippp_ccp: failed to allocate reset data" - " structure - no mem\n"); - return NULL; - } - printk(KERN_DEBUG "ippp_ccp: allocated reset data structure %p\n", r); - is->reset = r; - return r; -} - -/* Destroy the reset state vector. Kill all pending timers first. */ -static void isdn_ppp_ccp_reset_free(struct ippp_struct *is) -{ - unsigned int id; - - printk(KERN_DEBUG "ippp_ccp: freeing reset data structure %p\n", - is->reset); - for (id = 0; id < 256; id++) { - if (is->reset->rs[id]) { - isdn_ppp_ccp_reset_free_state(is, (unsigned char)id); - } - } - kfree(is->reset); - is->reset = NULL; -} - -/* Free a given state and clear everything up for later reallocation */ -static void isdn_ppp_ccp_reset_free_state(struct ippp_struct *is, - unsigned char id) -{ - struct ippp_ccp_reset_state *rs; - - if (is->reset->rs[id]) { - printk(KERN_DEBUG "ippp_ccp: freeing state for id %d\n", id); - rs = is->reset->rs[id]; - /* Make sure the kernel will not call back later */ - if (rs->ta) - del_timer(&rs->timer); - is->reset->rs[id] = NULL; - kfree(rs); - } else { - printk(KERN_WARNING "ippp_ccp: id %d is not allocated\n", id); - } -} - -/* The timer callback function which is called when a ResetReq has timed out, - aka has never been answered by a ResetAck */ -static void isdn_ppp_ccp_timer_callback(struct timer_list *t) -{ - struct ippp_ccp_reset_state *rs = - from_timer(rs, t, timer); - - if (!rs) { - printk(KERN_ERR "ippp_ccp: timer cb with zero closure.\n"); - return; - } - if (rs->ta && rs->state == CCPResetSentReq) { - /* We are correct here */ - if (!rs->expra) { - /* Hmm, there is no Ack really expected. 
We can clean - up the state now, it will be reallocated if the - decompressor insists on another reset */ - rs->ta = 0; - isdn_ppp_ccp_reset_free_state(rs->is, rs->id); - return; - } - printk(KERN_DEBUG "ippp_ccp: CCP Reset timed out for id %d\n", - rs->id); - /* Push it again */ - isdn_ppp_ccp_xmit_reset(rs->is, PPP_CCP, CCP_RESETREQ, rs->id, - rs->data, rs->dlen); - /* Restart timer */ - rs->timer.expires = jiffies + HZ * 5; - add_timer(&rs->timer); - } else { - printk(KERN_WARNING "ippp_ccp: timer cb in wrong state %d\n", - rs->state); - } -} - -/* Allocate a new reset transaction state */ -static struct ippp_ccp_reset_state *isdn_ppp_ccp_reset_alloc_state(struct ippp_struct *is, - unsigned char id) -{ - struct ippp_ccp_reset_state *rs; - if (is->reset->rs[id]) { - printk(KERN_WARNING "ippp_ccp: old state exists for id %d\n", - id); - return NULL; - } else { - rs = kzalloc(sizeof(struct ippp_ccp_reset_state), GFP_ATOMIC); - if (!rs) - return NULL; - rs->state = CCPResetIdle; - rs->is = is; - rs->id = id; - timer_setup(&rs->timer, isdn_ppp_ccp_timer_callback, 0); - is->reset->rs[id] = rs; - } - return rs; -} - - -/* A decompressor wants a reset with a set of parameters - do what is - necessary to fulfill it */ -static void isdn_ppp_ccp_reset_trans(struct ippp_struct *is, - struct isdn_ppp_resetparams *rp) -{ - struct ippp_ccp_reset_state *rs; - - if (rp->valid) { - /* The decompressor defines parameters by itself */ - if (rp->rsend) { - /* And he wants us to send a request */ - if (!(rp->idval)) { - printk(KERN_ERR "ippp_ccp: decompressor must" - " specify reset id\n"); - return; - } - if (is->reset->rs[rp->id]) { - /* There is already a transaction in existence - for this id. May be still waiting for a - Ack or may be wrong. 
*/ - rs = is->reset->rs[rp->id]; - if (rs->state == CCPResetSentReq && rs->ta) { - printk(KERN_DEBUG "ippp_ccp: reset" - " trans still in progress" - " for id %d\n", rp->id); - } else { - printk(KERN_WARNING "ippp_ccp: reset" - " trans in wrong state %d for" - " id %d\n", rs->state, rp->id); - } - } else { - /* Ok, this is a new transaction */ - printk(KERN_DEBUG "ippp_ccp: new trans for id" - " %d to be started\n", rp->id); - rs = isdn_ppp_ccp_reset_alloc_state(is, rp->id); - if (!rs) { - printk(KERN_ERR "ippp_ccp: out of mem" - " allocing ccp trans\n"); - return; - } - rs->state = CCPResetSentReq; - rs->expra = rp->expra; - if (rp->dtval) { - rs->dlen = rp->dlen; - memcpy(rs->data, rp->data, rp->dlen); - } - /* HACK TODO - add link comp here */ - isdn_ppp_ccp_xmit_reset(is, PPP_CCP, - CCP_RESETREQ, rs->id, - rs->data, rs->dlen); - /* Start the timer */ - rs->timer.expires = jiffies + 5 * HZ; - add_timer(&rs->timer); - rs->ta = 1; - } - } else { - printk(KERN_DEBUG "ippp_ccp: no reset sent\n"); - } - } else { - /* The reset params are invalid. The decompressor does not - care about them, so we just send the minimal requests - and increase ids only when an Ack is received for a - given id */ - if (is->reset->rs[is->reset->lastid]) { - /* There is already a transaction in existence - for this id. May be still waiting for a - Ack or may be wrong. 
*/ - rs = is->reset->rs[is->reset->lastid]; - if (rs->state == CCPResetSentReq && rs->ta) { - printk(KERN_DEBUG "ippp_ccp: reset" - " trans still in progress" - " for id %d\n", rp->id); - } else { - printk(KERN_WARNING "ippp_ccp: reset" - " trans in wrong state %d for" - " id %d\n", rs->state, rp->id); - } - } else { - printk(KERN_DEBUG "ippp_ccp: new trans for id" - " %d to be started\n", is->reset->lastid); - rs = isdn_ppp_ccp_reset_alloc_state(is, - is->reset->lastid); - if (!rs) { - printk(KERN_ERR "ippp_ccp: out of mem" - " allocing ccp trans\n"); - return; - } - rs->state = CCPResetSentReq; - /* We always expect an Ack if the decompressor doesn't - know better */ - rs->expra = 1; - rs->dlen = 0; - /* HACK TODO - add link comp here */ - isdn_ppp_ccp_xmit_reset(is, PPP_CCP, CCP_RESETREQ, - rs->id, NULL, 0); - /* Start the timer */ - rs->timer.expires = jiffies + 5 * HZ; - add_timer(&rs->timer); - rs->ta = 1; - } - } -} - -/* An Ack was received for this id. This means we stop the timer and clean - up the state prior to calling the decompressors reset routine. */ -static void isdn_ppp_ccp_reset_ack_rcvd(struct ippp_struct *is, - unsigned char id) -{ - struct ippp_ccp_reset_state *rs = is->reset->rs[id]; - - if (rs) { - if (rs->ta && rs->state == CCPResetSentReq) { - /* Great, we are correct */ - if (!rs->expra) - printk(KERN_DEBUG "ippp_ccp: ResetAck received" - " for id %d but not expected\n", id); - } else { - printk(KERN_INFO "ippp_ccp: ResetAck received out of" - "sync for id %d\n", id); - } - if (rs->ta) { - rs->ta = 0; - del_timer(&rs->timer); - } - isdn_ppp_ccp_reset_free_state(is, id); - } else { - printk(KERN_INFO "ippp_ccp: ResetAck received for unknown id" - " %d\n", id); - } - /* Make sure the simple reset stuff uses a new id next time */ - is->reset->lastid++; -} - -/* - * decompress packet - * - * if master = 0, we're trying to uncompress an per-link compressed packet, - * as opposed to an compressed reconstructed-from-MPPP packet. 
- * proto is updated to protocol field of uncompressed packet. - * - * retval: decompressed packet, - * same packet if uncompressed, - * NULL if decompression error - */ - -static struct sk_buff *isdn_ppp_decompress(struct sk_buff *skb, struct ippp_struct *is, struct ippp_struct *master, - int *proto) -{ - void *stat = NULL; - struct isdn_ppp_compressor *ipc = NULL; - struct sk_buff *skb_out; - int len; - struct ippp_struct *ri; - struct isdn_ppp_resetparams rsparm; - unsigned char rsdata[IPPP_RESET_MAXDATABYTES]; - - if (!master) { - // per-link decompression - stat = is->link_decomp_stat; - ipc = is->link_decompressor; - ri = is; - } else { - stat = master->decomp_stat; - ipc = master->decompressor; - ri = master; - } - - if (!ipc) { - // no decompressor -> we can't decompress. - printk(KERN_DEBUG "ippp: no decompressor defined!\n"); - return skb; - } - BUG_ON(!stat); // if we have a compressor, stat has been set as well - - if ((master && *proto == PPP_COMP) || (!master && *proto == PPP_COMPFRAG)) { - // compressed packets are compressed by their protocol type - - // Set up reset params for the decompressor - memset(&rsparm, 0, sizeof(rsparm)); - rsparm.data = rsdata; - rsparm.maxdlen = IPPP_RESET_MAXDATABYTES; - - skb_out = dev_alloc_skb(is->mru + PPP_HDRLEN); - if (!skb_out) { - kfree_skb(skb); - printk(KERN_ERR "ippp: decomp memory allocation failure\n"); - return NULL; - } - len = ipc->decompress(stat, skb, skb_out, &rsparm); - kfree_skb(skb); - if (len <= 0) { - switch (len) { - case DECOMP_ERROR: - printk(KERN_INFO "ippp: decomp wants reset %s params\n", - rsparm.valid ? 
"with" : "without"); - - isdn_ppp_ccp_reset_trans(ri, &rsparm); - break; - case DECOMP_FATALERROR: - ri->pppcfg |= SC_DC_FERROR; - /* Kick ipppd to recognize the error */ - isdn_ppp_ccp_kickup(ri); - break; - } - kfree_skb(skb_out); - return NULL; - } - *proto = isdn_ppp_strip_proto(skb_out); - if (*proto < 0) { - kfree_skb(skb_out); - return NULL; - } - return skb_out; - } else { - // uncompressed packets are fed through the decompressor to - // update the decompressor state - ipc->incomp(stat, skb, *proto); - return skb; - } -} - -/* - * compress a frame - * type=0: normal/bundle compression - * =1: link compression - * returns original skb if we haven't compressed the frame - * and a new skb pointer if we've done it - */ -static struct sk_buff *isdn_ppp_compress(struct sk_buff *skb_in, int *proto, - struct ippp_struct *is, struct ippp_struct *master, int type) -{ - int ret; - int new_proto; - struct isdn_ppp_compressor *compressor; - void *stat; - struct sk_buff *skb_out; - - /* we do not compress control protocols */ - if (*proto < 0 || *proto > 0x3fff) { - return skb_in; - } - - if (type) { /* type=1 => Link compression */ - return skb_in; - } - else { - if (!master) { - compressor = is->compressor; - stat = is->comp_stat; - } - else { - compressor = master->compressor; - stat = master->comp_stat; - } - new_proto = PPP_COMP; - } - - if (!compressor) { - printk(KERN_ERR "isdn_ppp: No compressor set!\n"); - return skb_in; - } - if (!stat) { - printk(KERN_ERR "isdn_ppp: Compressor not initialized?\n"); - return skb_in; - } - - /* Allow for at least 150 % expansion (for now) */ - skb_out = alloc_skb(skb_in->len + skb_in->len / 2 + 32 + - skb_headroom(skb_in), GFP_ATOMIC); - if (!skb_out) - return skb_in; - skb_reserve(skb_out, skb_headroom(skb_in)); - - ret = (compressor->compress)(stat, skb_in, skb_out, *proto); - if (!ret) { - dev_kfree_skb(skb_out); - return skb_in; - } - - dev_kfree_skb(skb_in); - *proto = new_proto; - return skb_out; -} - -/* - * we received 
a CCP frame .. - * not a clean solution, but we MUST handle a few cases in the kernel - */ -static void isdn_ppp_receive_ccp(isdn_net_dev *net_dev, isdn_net_local *lp, - struct sk_buff *skb, int proto) -{ - struct ippp_struct *is; - struct ippp_struct *mis; - int len; - struct isdn_ppp_resetparams rsparm; - unsigned char rsdata[IPPP_RESET_MAXDATABYTES]; - - printk(KERN_DEBUG "Received CCP frame from peer slot(%d)\n", - lp->ppp_slot); - if (lp->ppp_slot < 0 || lp->ppp_slot >= ISDN_MAX_CHANNELS) { - printk(KERN_ERR "%s: lp->ppp_slot(%d) out of range\n", - __func__, lp->ppp_slot); - return; - } - is = ippp_table[lp->ppp_slot]; - isdn_ppp_frame_log("ccp-rcv", skb->data, skb->len, 32, is->unit, lp->ppp_slot); - - if (lp->master) { - int slot = ISDN_MASTER_PRIV(lp)->ppp_slot; - if (slot < 0 || slot >= ISDN_MAX_CHANNELS) { - printk(KERN_ERR "%s: slot(%d) out of range\n", - __func__, slot); - return; - } - mis = ippp_table[slot]; - } else - mis = is; - - switch (skb->data[0]) { - case CCP_CONFREQ: - if (is->debug & 0x10) - printk(KERN_DEBUG "Disable compression here!\n"); - if (proto == PPP_CCP) - mis->compflags &= ~SC_COMP_ON; - else - is->compflags &= ~SC_LINK_COMP_ON; - break; - case CCP_TERMREQ: - case CCP_TERMACK: - if (is->debug & 0x10) - printk(KERN_DEBUG "Disable (de)compression here!\n"); - if (proto == PPP_CCP) - mis->compflags &= ~(SC_DECOMP_ON | SC_COMP_ON); - else - is->compflags &= ~(SC_LINK_DECOMP_ON | SC_LINK_COMP_ON); - break; - case CCP_CONFACK: - /* if we RECEIVE an ackowledge we enable the decompressor */ - if (is->debug & 0x10) - printk(KERN_DEBUG "Enable decompression here!\n"); - if (proto == PPP_CCP) { - if (!mis->decompressor) - break; - mis->compflags |= SC_DECOMP_ON; - } else { - if (!is->decompressor) - break; - is->compflags |= SC_LINK_DECOMP_ON; - } - break; - - case CCP_RESETACK: - printk(KERN_DEBUG "Received ResetAck from peer\n"); - len = (skb->data[2] << 8) | skb->data[3]; - len -= 4; - - if (proto == PPP_CCP) { - /* If a reset Ack was 
outstanding for this id, then - clean up the state engine */ - isdn_ppp_ccp_reset_ack_rcvd(mis, skb->data[1]); - if (mis->decompressor && mis->decomp_stat) - mis->decompressor-> - reset(mis->decomp_stat, - skb->data[0], - skb->data[1], - len ? &skb->data[4] : NULL, - len, NULL); - /* TODO: This is not easy to decide here */ - mis->compflags &= ~SC_DECOMP_DISCARD; - } - else { - isdn_ppp_ccp_reset_ack_rcvd(is, skb->data[1]); - if (is->link_decompressor && is->link_decomp_stat) - is->link_decompressor-> - reset(is->link_decomp_stat, - skb->data[0], - skb->data[1], - len ? &skb->data[4] : NULL, - len, NULL); - /* TODO: neither here */ - is->compflags &= ~SC_LINK_DECOMP_DISCARD; - } - break; - - case CCP_RESETREQ: - printk(KERN_DEBUG "Received ResetReq from peer\n"); - /* Receiving a ResetReq means we must reset our compressor */ - /* Set up reset params for the reset entry */ - memset(&rsparm, 0, sizeof(rsparm)); - rsparm.data = rsdata; - rsparm.maxdlen = IPPP_RESET_MAXDATABYTES; - /* Isolate data length */ - len = (skb->data[2] << 8) | skb->data[3]; - len -= 4; - if (proto == PPP_CCP) { - if (mis->compressor && mis->comp_stat) - mis->compressor-> - reset(mis->comp_stat, - skb->data[0], - skb->data[1], - len ? &skb->data[4] : NULL, - len, &rsparm); - } - else { - if (is->link_compressor && is->link_comp_stat) - is->link_compressor-> - reset(is->link_comp_stat, - skb->data[0], - skb->data[1], - len ? &skb->data[4] : NULL, - len, &rsparm); - } - /* Ack the Req as specified by rsparm */ - if (rsparm.valid) { - /* Compressor reset handler decided how to answer */ - if (rsparm.rsend) { - /* We should send a Frame */ - isdn_ppp_ccp_xmit_reset(is, proto, CCP_RESETACK, - rsparm.idval ? rsparm.id - : skb->data[1], - rsparm.dtval ? - rsparm.data : NULL, - rsparm.dtval ? 
- rsparm.dlen : 0); - } else { - printk(KERN_DEBUG "ResetAck suppressed\n"); - } - } else { - /* We answer with a straight reflected Ack */ - isdn_ppp_ccp_xmit_reset(is, proto, CCP_RESETACK, - skb->data[1], - len ? &skb->data[4] : NULL, - len); - } - break; - } -} - - -/* - * Daemon sends a CCP frame ... - */ - -/* TODO: Clean this up with new Reset semantics */ - -/* I believe the CCP handling as-is is done wrong. Compressed frames - * should only be sent/received after CCP reaches UP state, which means - * both sides have sent CONF_ACK. Currently, we handle both directions - * independently, which means we may accept compressed frames too early - * (supposedly not a problem), but may also mean we send compressed frames - * too early, which may turn out to be a problem. - * This part of state machine should actually be handled by (i)pppd, but - * that's too big of a change now. --kai - */ - -/* Actually, we might turn this into an advantage: deal with the RFC in - * the old tradition of beeing generous on what we accept, but beeing - * strict on what we send. Thus we should just - * - accept compressed frames as soon as decompression is negotiated - * - send compressed frames only when decomp *and* comp are negotiated - * - drop rx compressed frames if we cannot decomp (instead of pushing them - * up to ipppd) - * and I tried to modify this file according to that. 
--abp - */ - -static void isdn_ppp_send_ccp(isdn_net_dev *net_dev, isdn_net_local *lp, struct sk_buff *skb) -{ - struct ippp_struct *mis, *is; - int proto, slot = lp->ppp_slot; - unsigned char *data; - - if (!skb || skb->len < 3) - return; - if (slot < 0 || slot >= ISDN_MAX_CHANNELS) { - printk(KERN_ERR "%s: lp->ppp_slot(%d) out of range\n", - __func__, slot); - return; - } - is = ippp_table[slot]; - /* Daemon may send with or without address and control field comp */ - data = skb->data; - if (!(is->pppcfg & SC_COMP_AC) && data[0] == 0xff && data[1] == 0x03) { - data += 2; - if (skb->len < 5) - return; - } - - proto = ((int)data[0]<<8) + data[1]; - if (proto != PPP_CCP && proto != PPP_CCPFRAG) - return; - - printk(KERN_DEBUG "Received CCP frame from daemon:\n"); - isdn_ppp_frame_log("ccp-xmit", skb->data, skb->len, 32, is->unit, lp->ppp_slot); - - if (lp->master) { - slot = ISDN_MASTER_PRIV(lp)->ppp_slot; - if (slot < 0 || slot >= ISDN_MAX_CHANNELS) { - printk(KERN_ERR "%s: slot(%d) out of range\n", - __func__, slot); - return; - } - mis = ippp_table[slot]; - } else - mis = is; - if (mis != is) - printk(KERN_DEBUG "isdn_ppp: Ouch! 
Master CCP sends on slave slot!\n"); - - switch (data[2]) { - case CCP_CONFREQ: - if (is->debug & 0x10) - printk(KERN_DEBUG "Disable decompression here!\n"); - if (proto == PPP_CCP) - is->compflags &= ~SC_DECOMP_ON; - else - is->compflags &= ~SC_LINK_DECOMP_ON; - break; - case CCP_TERMREQ: - case CCP_TERMACK: - if (is->debug & 0x10) - printk(KERN_DEBUG "Disable (de)compression here!\n"); - if (proto == PPP_CCP) - is->compflags &= ~(SC_DECOMP_ON | SC_COMP_ON); - else - is->compflags &= ~(SC_LINK_DECOMP_ON | SC_LINK_COMP_ON); - break; - case CCP_CONFACK: - /* if we SEND an ackowledge we can/must enable the compressor */ - if (is->debug & 0x10) - printk(KERN_DEBUG "Enable compression here!\n"); - if (proto == PPP_CCP) { - if (!is->compressor) - break; - is->compflags |= SC_COMP_ON; - } else { - if (!is->compressor) - break; - is->compflags |= SC_LINK_COMP_ON; - } - break; - case CCP_RESETACK: - /* If we send a ACK we should reset our compressor */ - if (is->debug & 0x10) - printk(KERN_DEBUG "Reset decompression state here!\n"); - printk(KERN_DEBUG "ResetAck from daemon passed by\n"); - if (proto == PPP_CCP) { - /* link to master? 
*/ - if (is->compressor && is->comp_stat) - is->compressor->reset(is->comp_stat, 0, 0, - NULL, 0, NULL); - is->compflags &= ~SC_COMP_DISCARD; - } - else { - if (is->link_compressor && is->link_comp_stat) - is->link_compressor->reset(is->link_comp_stat, - 0, 0, NULL, 0, NULL); - is->compflags &= ~SC_LINK_COMP_DISCARD; - } - break; - case CCP_RESETREQ: - /* Just let it pass by */ - printk(KERN_DEBUG "ResetReq from daemon passed by\n"); - break; - } -} - -int isdn_ppp_register_compressor(struct isdn_ppp_compressor *ipc) -{ - ipc->next = ipc_head; - ipc->prev = NULL; - if (ipc_head) { - ipc_head->prev = ipc; - } - ipc_head = ipc; - return 0; -} - -int isdn_ppp_unregister_compressor(struct isdn_ppp_compressor *ipc) -{ - if (ipc->prev) - ipc->prev->next = ipc->next; - else - ipc_head = ipc->next; - if (ipc->next) - ipc->next->prev = ipc->prev; - ipc->prev = ipc->next = NULL; - return 0; -} - -static int isdn_ppp_set_compressor(struct ippp_struct *is, struct isdn_ppp_comp_data *data) -{ - struct isdn_ppp_compressor *ipc = ipc_head; - int ret; - void *stat; - int num = data->num; - - if (is->debug & 0x10) - printk(KERN_DEBUG "[%d] Set %s type %d\n", is->unit, - (data->flags & IPPP_COMP_FLAG_XMIT) ? "compressor" : "decompressor", num); - - /* If is has no valid reset state vector, we cannot allocate a - decompressor. The decompressor would cause reset transactions - sooner or later, and they need that vector. 
*/ - - if (!(data->flags & IPPP_COMP_FLAG_XMIT) && !is->reset) { - printk(KERN_ERR "ippp_ccp: no reset data structure - can't" - " allow decompression.\n"); - return -ENOMEM; - } - - while (ipc) { - if (ipc->num == num) { - stat = ipc->alloc(data); - if (stat) { - ret = ipc->init(stat, data, is->unit, 0); - if (!ret) { - printk(KERN_ERR "Can't init (de)compression!\n"); - ipc->free(stat); - stat = NULL; - break; - } - } - else { - printk(KERN_ERR "Can't alloc (de)compression!\n"); - break; - } - - if (data->flags & IPPP_COMP_FLAG_XMIT) { - if (data->flags & IPPP_COMP_FLAG_LINK) { - if (is->link_comp_stat) - is->link_compressor->free(is->link_comp_stat); - is->link_comp_stat = stat; - is->link_compressor = ipc; - } - else { - if (is->comp_stat) - is->compressor->free(is->comp_stat); - is->comp_stat = stat; - is->compressor = ipc; - } - } - else { - if (data->flags & IPPP_COMP_FLAG_LINK) { - if (is->link_decomp_stat) - is->link_decompressor->free(is->link_decomp_stat); - is->link_decomp_stat = stat; - is->link_decompressor = ipc; - } - else { - if (is->decomp_stat) - is->decompressor->free(is->decomp_stat); - is->decomp_stat = stat; - is->decompressor = ipc; - } - } - return 0; - } - ipc = ipc->next; - } - return -EINVAL; -} diff --git a/drivers/isdn/i4l/isdn_ppp.h b/drivers/isdn/i4l/isdn_ppp.h deleted file mode 100644 index 34b8a2ce84f3..000000000000 --- a/drivers/isdn/i4l/isdn_ppp.h +++ /dev/null @@ -1,41 +0,0 @@ -/* $Id: isdn_ppp.h,v 1.1.2.2 2004/01/12 22:37:19 keil Exp $ - * - * header for Linux ISDN subsystem, functions for synchronous PPP (linklevel). - * - * Copyright 1995,96 by Michael Hipp (Michael.Hipp@student.uni-tuebingen.de) - * - * This software may be used and distributed according to the terms - * of the GNU General Public License, incorporated herein by reference. 
- * - */ - -#include /* for PPP_PROTOCOL */ -#include /* for isdn_ppp info */ - -extern int isdn_ppp_read(int, struct file *, char __user *, int); -extern int isdn_ppp_write(int, struct file *, const char __user *, int); -extern int isdn_ppp_open(int, struct file *); -extern int isdn_ppp_init(void); -extern void isdn_ppp_cleanup(void); -extern int isdn_ppp_free(isdn_net_local *); -extern int isdn_ppp_bind(isdn_net_local *); -extern int isdn_ppp_autodial_filter(struct sk_buff *, isdn_net_local *); -extern int isdn_ppp_xmit(struct sk_buff *, struct net_device *); -extern void isdn_ppp_receive(isdn_net_dev *, isdn_net_local *, struct sk_buff *); -extern int isdn_ppp_dev_ioctl(struct net_device *, struct ifreq *, int); -extern __poll_t isdn_ppp_poll(struct file *, struct poll_table_struct *); -extern int isdn_ppp_ioctl(int, struct file *, unsigned int, unsigned long); -extern void isdn_ppp_release(int, struct file *); -extern int isdn_ppp_dial_slave(char *); -extern void isdn_ppp_wakeup_daemon(isdn_net_local *); - -extern int isdn_ppp_register_compressor(struct isdn_ppp_compressor *ipc); -extern int isdn_ppp_unregister_compressor(struct isdn_ppp_compressor *ipc); - -#define IPPP_OPEN 0x01 -#define IPPP_CONNECT 0x02 -#define IPPP_CLOSEWAIT 0x04 -#define IPPP_NOBLOCK 0x08 -#define IPPP_ASSIGNED 0x10 - -#define IPPP_MAX_HEADER 10 diff --git a/drivers/isdn/i4l/isdn_tty.c b/drivers/isdn/i4l/isdn_tty.c deleted file mode 100644 index 43700fc19a31..000000000000 --- a/drivers/isdn/i4l/isdn_tty.c +++ /dev/null @@ -1,3756 +0,0 @@ -/* - * Linux ISDN subsystem, tty functions and AT-command emulator (linklevel). - * - * Copyright 1994-1999 by Fritz Elfert (fritz@isdn4linux.de) - * Copyright 1995,96 by Thinking Objects Software GmbH Wuerzburg - * - * This software may be used and distributed according to the terms - * of the GNU General Public License, incorporated herein by reference. 
- * - */ -#undef ISDN_TTY_STAT_DEBUG - -#include -#include /* ASYNC_* flags */ -#include -#include -#include -#include -#include "isdn_common.h" -#include "isdn_tty.h" -#ifdef CONFIG_ISDN_AUDIO -#include "isdn_audio.h" -#define VBUF 0x3e0 -#define VBUFX (VBUF/16) -#endif - -#define FIX_FILE_TRANSFER -#define DUMMY_HAYES_AT - -/* Prototypes */ - -static DEFINE_MUTEX(modem_info_mutex); -static int isdn_tty_edit_at(const char *, int, modem_info *); -static void isdn_tty_check_esc(const u_char *, u_char, int, int *, u_long *); -static void isdn_tty_modem_reset_regs(modem_info *, int); -static void isdn_tty_cmd_ATA(modem_info *); -static void isdn_tty_flush_buffer(struct tty_struct *); -static void isdn_tty_modem_result(int, modem_info *); -#ifdef CONFIG_ISDN_AUDIO -static int isdn_tty_countDLE(unsigned char *, int); -#endif - -/* Leave this unchanged unless you know what you do! */ -#define MODEM_PARANOIA_CHECK -#define MODEM_DO_RESTART - -static int bit2si[8] = -{1, 5, 7, 7, 7, 7, 7, 7}; -static int si2bit[8] = -{4, 1, 4, 4, 4, 4, 4, 4}; - -/* isdn_tty_try_read() is called from within isdn_tty_rcv_skb() - * to stuff incoming data directly into a tty's flip-buffer. This - * is done to speed up tty-receiving if the receive-queue is empty. - * This routine MUST be called with interrupts off. - * Return: - * 1 = Success - * 0 = Failure, data has to be buffered and later processed by - * isdn_tty_readmodem(). 
- */ -static int -isdn_tty_try_read(modem_info *info, struct sk_buff *skb) -{ - struct tty_port *port = &info->port; - int c; - int len; - char last; - - if (!info->online) - return 0; - - if (!(info->mcr & UART_MCR_RTS)) - return 0; - - len = skb->len -#ifdef CONFIG_ISDN_AUDIO - + ISDN_AUDIO_SKB_DLECOUNT(skb) -#endif - ; - - c = tty_buffer_request_room(port, len); - if (c < len) - return 0; - -#ifdef CONFIG_ISDN_AUDIO - if (ISDN_AUDIO_SKB_DLECOUNT(skb)) { - int l = skb->len; - unsigned char *dp = skb->data; - while (--l) { - if (*dp == DLE) - tty_insert_flip_char(port, DLE, 0); - tty_insert_flip_char(port, *dp++, 0); - } - if (*dp == DLE) - tty_insert_flip_char(port, DLE, 0); - last = *dp; - } else { -#endif - if (len > 1) - tty_insert_flip_string(port, skb->data, len - 1); - last = skb->data[len - 1]; -#ifdef CONFIG_ISDN_AUDIO - } -#endif - if (info->emu.mdmreg[REG_CPPP] & BIT_CPPP) - tty_insert_flip_char(port, last, 0xFF); - else - tty_insert_flip_char(port, last, TTY_NORMAL); - tty_flip_buffer_push(port); - kfree_skb(skb); - - return 1; -} - -/* isdn_tty_readmodem() is called periodically from within timer-interrupt. - * It tries getting received data from the receive queue an stuff it into - * the tty's flip-buffer. 
- */ -void -isdn_tty_readmodem(void) -{ - int resched = 0; - int midx; - int i; - int r; - modem_info *info; - - for (i = 0; i < ISDN_MAX_CHANNELS; i++) { - midx = dev->m_idx[i]; - if (midx < 0) - continue; - - info = &dev->mdm.info[midx]; - if (!info->online) - continue; - - r = 0; -#ifdef CONFIG_ISDN_AUDIO - isdn_audio_eval_dtmf(info); - if ((info->vonline & 1) && (info->emu.vpar[1])) - isdn_audio_eval_silence(info); -#endif - if (info->mcr & UART_MCR_RTS) { - /* CISCO AsyncPPP Hack */ - if (!(info->emu.mdmreg[REG_CPPP] & BIT_CPPP)) - r = isdn_readbchan_tty(info->isdn_driver, - info->isdn_channel, - &info->port, 0); - else - r = isdn_readbchan_tty(info->isdn_driver, - info->isdn_channel, - &info->port, 1); - if (r) - tty_flip_buffer_push(&info->port); - } else - r = 1; - - if (r) { - info->rcvsched = 0; - resched = 1; - } else - info->rcvsched = 1; - } - if (!resched) - isdn_timer_ctrl(ISDN_TIMER_MODEMREAD, 0); -} - -int -isdn_tty_rcv_skb(int i, int di, int channel, struct sk_buff *skb) -{ - ulong flags; - int midx; -#ifdef CONFIG_ISDN_AUDIO - int ifmt; -#endif - modem_info *info; - - if ((midx = dev->m_idx[i]) < 0) { - /* if midx is invalid, packet is not for tty */ - return 0; - } - info = &dev->mdm.info[midx]; -#ifdef CONFIG_ISDN_AUDIO - ifmt = 1; - - if ((info->vonline) && (!info->emu.vpar[4])) - isdn_audio_calc_dtmf(info, skb->data, skb->len, ifmt); - if ((info->vonline & 1) && (info->emu.vpar[1])) - isdn_audio_calc_silence(info, skb->data, skb->len, ifmt); -#endif - if ((info->online < 2) -#ifdef CONFIG_ISDN_AUDIO - && (!(info->vonline & 1)) -#endif - ) { - /* If Modem not listening, drop data */ - kfree_skb(skb); - return 1; - } - if (info->emu.mdmreg[REG_T70] & BIT_T70) { - if (info->emu.mdmreg[REG_T70] & BIT_T70_EXT) { - /* T.70 decoding: throw away the T.70 header (2 or 4 bytes) */ - if (skb->data[0] == 3) /* pure data packet -> 4 byte headers */ - skb_pull(skb, 4); - else - if (skb->data[0] == 1) /* keepalive packet -> 2 byte hdr */ - skb_pull(skb, 2); 
- } else - /* T.70 decoding: Simply throw away the T.70 header (4 bytes) */ - if ((skb->data[0] == 1) && ((skb->data[1] == 0) || (skb->data[1] == 1))) - skb_pull(skb, 4); - } -#ifdef CONFIG_ISDN_AUDIO - ISDN_AUDIO_SKB_DLECOUNT(skb) = 0; - ISDN_AUDIO_SKB_LOCK(skb) = 0; - if (info->vonline & 1) { - /* voice conversion/compression */ - switch (info->emu.vpar[3]) { - case 2: - case 3: - case 4: - /* adpcm - * Since compressed data takes less - * space, we can overwrite the buffer. - */ - skb_trim(skb, isdn_audio_xlaw2adpcm(info->adpcmr, - ifmt, - skb->data, - skb->data, - skb->len)); - break; - case 5: - /* a-law */ - if (!ifmt) - isdn_audio_ulaw2alaw(skb->data, skb->len); - break; - case 6: - /* u-law */ - if (ifmt) - isdn_audio_alaw2ulaw(skb->data, skb->len); - break; - } - ISDN_AUDIO_SKB_DLECOUNT(skb) = - isdn_tty_countDLE(skb->data, skb->len); - } -#ifdef CONFIG_ISDN_TTY_FAX - else { - if (info->faxonline & 2) { - isdn_tty_fax_bitorder(info, skb); - ISDN_AUDIO_SKB_DLECOUNT(skb) = - isdn_tty_countDLE(skb->data, skb->len); - } - } -#endif -#endif - /* Try to deliver directly via tty-buf if queue is empty */ - spin_lock_irqsave(&info->readlock, flags); - if (skb_queue_empty(&dev->drv[di]->rpqueue[channel])) - if (isdn_tty_try_read(info, skb)) { - spin_unlock_irqrestore(&info->readlock, flags); - return 1; - } - /* Direct deliver failed or queue wasn't empty. - * Queue up for later dequeueing via timer-irq. 
- */ - __skb_queue_tail(&dev->drv[di]->rpqueue[channel], skb); - dev->drv[di]->rcvcount[channel] += - (skb->len -#ifdef CONFIG_ISDN_AUDIO - + ISDN_AUDIO_SKB_DLECOUNT(skb) -#endif - ); - spin_unlock_irqrestore(&info->readlock, flags); - /* Schedule dequeuing */ - if ((dev->modempoll) && (info->rcvsched)) - isdn_timer_ctrl(ISDN_TIMER_MODEMREAD, 1); - return 1; -} - -static void -isdn_tty_cleanup_xmit(modem_info *info) -{ - skb_queue_purge(&info->xmit_queue); -#ifdef CONFIG_ISDN_AUDIO - skb_queue_purge(&info->dtmf_queue); -#endif -} - -static void -isdn_tty_tint(modem_info *info) -{ - struct sk_buff *skb = skb_dequeue(&info->xmit_queue); - int len, slen; - - if (!skb) - return; - len = skb->len; - if ((slen = isdn_writebuf_skb_stub(info->isdn_driver, - info->isdn_channel, 1, skb)) == len) { - struct tty_struct *tty = info->port.tty; - info->send_outstanding++; - info->msr &= ~UART_MSR_CTS; - info->lsr &= ~UART_LSR_TEMT; - tty_wakeup(tty); - return; - } - if (slen < 0) { - /* Error: no channel, already shutdown, or wrong parameter */ - dev_kfree_skb(skb); - return; - } - skb_queue_head(&info->xmit_queue, skb); -} - -#ifdef CONFIG_ISDN_AUDIO -static int -isdn_tty_countDLE(unsigned char *buf, int len) -{ - int count = 0; - - while (len--) - if (*buf++ == DLE) - count++; - return count; -} - -/* This routine is called from within isdn_tty_write() to perform - * DLE-decoding when sending audio-data. 
- */ -static int -isdn_tty_handleDLEdown(modem_info *info, atemu *m, int len) -{ - unsigned char *p = &info->port.xmit_buf[info->xmit_count]; - int count = 0; - - while (len > 0) { - if (m->lastDLE) { - m->lastDLE = 0; - switch (*p) { - case DLE: - /* Escape code */ - if (len > 1) - memmove(p, p + 1, len - 1); - p--; - count++; - break; - case ETX: - /* End of data */ - info->vonline |= 4; - return count; - case DC4: - /* Abort RX */ - info->vonline &= ~1; -#ifdef ISDN_DEBUG_MODEM_VOICE - printk(KERN_DEBUG - "DLEdown: got DLE-DC4, send DLE-ETX on ttyI%d\n", - info->line); -#endif - isdn_tty_at_cout("\020\003", info); - if (!info->vonline) { -#ifdef ISDN_DEBUG_MODEM_VOICE - printk(KERN_DEBUG - "DLEdown: send VCON on ttyI%d\n", - info->line); -#endif - isdn_tty_at_cout("\r\nVCON\r\n", info); - } - /* Fall through */ - case 'q': - case 's': - /* Silence */ - if (len > 1) - memmove(p, p + 1, len - 1); - p--; - break; - } - } else { - if (*p == DLE) - m->lastDLE = 1; - else - count++; - } - p++; - len--; - } - if (len < 0) { - printk(KERN_WARNING "isdn_tty: len<0 in DLEdown\n"); - return 0; - } - return count; -} - -/* This routine is called from within isdn_tty_write() when receiving - * audio-data. It interrupts receiving, if an character other than - * ^S or ^Q is sent. - */ -static int -isdn_tty_end_vrx(const char *buf, int c) -{ - char ch; - - while (c--) { - ch = *buf; - if ((ch != 0x11) && (ch != 0x13)) - return 1; - buf++; - } - return 0; -} - -static int voice_cf[7] = -{0, 0, 4, 3, 2, 0, 0}; - -#endif /* CONFIG_ISDN_AUDIO */ - -/* isdn_tty_senddown() is called either directly from within isdn_tty_write() - * or via timer-interrupt from within isdn_tty_modem_xmit(). It pulls - * outgoing data from the tty's xmit-buffer, handles voice-decompression or - * T.70 if necessary, and finally queues it up for sending via isdn_tty_tint. 
- */ -static void -isdn_tty_senddown(modem_info *info) -{ - int buflen; - int skb_res; -#ifdef CONFIG_ISDN_AUDIO - int audio_len; -#endif - struct sk_buff *skb; - -#ifdef CONFIG_ISDN_AUDIO - if (info->vonline & 4) { - info->vonline &= ~6; - if (!info->vonline) { -#ifdef ISDN_DEBUG_MODEM_VOICE - printk(KERN_DEBUG - "senddown: send VCON on ttyI%d\n", - info->line); -#endif - isdn_tty_at_cout("\r\nVCON\r\n", info); - } - } -#endif - if (!(buflen = info->xmit_count)) - return; - if ((info->emu.mdmreg[REG_CTS] & BIT_CTS) != 0) - info->msr &= ~UART_MSR_CTS; - info->lsr &= ~UART_LSR_TEMT; - /* info->xmit_count is modified here and in isdn_tty_write(). - * So we return here if isdn_tty_write() is in the - * critical section. - */ - atomic_inc(&info->xmit_lock); - if (!(atomic_dec_and_test(&info->xmit_lock))) - return; - if (info->isdn_driver < 0) { - info->xmit_count = 0; - return; - } - skb_res = dev->drv[info->isdn_driver]->interface->hl_hdrlen + 4; -#ifdef CONFIG_ISDN_AUDIO - if (info->vonline & 2) - audio_len = buflen * voice_cf[info->emu.vpar[3]]; - else - audio_len = 0; - skb = dev_alloc_skb(skb_res + buflen + audio_len); -#else - skb = dev_alloc_skb(skb_res + buflen); -#endif - if (!skb) { - printk(KERN_WARNING - "isdn_tty: Out of memory in ttyI%d senddown\n", - info->line); - return; - } - skb_reserve(skb, skb_res); - skb_put_data(skb, info->port.xmit_buf, buflen); - info->xmit_count = 0; -#ifdef CONFIG_ISDN_AUDIO - if (info->vonline & 2) { - /* For now, ifmt is fixed to 1 (alaw), since this - * is used with ISDN everywhere in the world, except - * US, Canada and Japan. - * Later, when US-ISDN protocols are implemented, - * this setting will depend on the D-channel protocol. 
- */ - int ifmt = 1; - - /* voice conversion/decompression */ - switch (info->emu.vpar[3]) { - case 2: - case 3: - case 4: - /* adpcm, compatible to ZyXel 1496 modem - * with ROM revision 6.01 - */ - audio_len = isdn_audio_adpcm2xlaw(info->adpcms, - ifmt, - skb->data, - skb_put(skb, audio_len), - buflen); - skb_pull(skb, buflen); - skb_trim(skb, audio_len); - break; - case 5: - /* a-law */ - if (!ifmt) - isdn_audio_alaw2ulaw(skb->data, - buflen); - break; - case 6: - /* u-law */ - if (ifmt) - isdn_audio_ulaw2alaw(skb->data, - buflen); - break; - } - } -#endif /* CONFIG_ISDN_AUDIO */ - if (info->emu.mdmreg[REG_T70] & BIT_T70) { - /* Add T.70 simplified header */ - if (info->emu.mdmreg[REG_T70] & BIT_T70_EXT) - memcpy(skb_push(skb, 2), "\1\0", 2); - else - memcpy(skb_push(skb, 4), "\1\0\1\0", 4); - } - skb_queue_tail(&info->xmit_queue, skb); -} - -/************************************************************ - * - * Modem-functions - * - * mostly "stolen" from original Linux-serial.c and friends. - * - ************************************************************/ - -/* The next routine is called once from within timer-interrupt - * triggered within isdn_tty_modem_ncarrier(). It calls - * isdn_tty_modem_result() to stuff a "NO CARRIER" Message - * into the tty's buffer. - */ -static void -isdn_tty_modem_do_ncarrier(struct timer_list *t) -{ - modem_info *info = from_timer(info, t, nc_timer); - isdn_tty_modem_result(RESULT_NO_CARRIER, info); -} - -/* Next routine is called, whenever the DTR-signal is raised. - * It checks the ncarrier-flag, and triggers the above routine - * when necessary. The ncarrier-flag is set, whenever DTR goes - * low. 
- */ -static void -isdn_tty_modem_ncarrier(modem_info *info) -{ - if (info->ncarrier) { - info->nc_timer.expires = jiffies + HZ; - add_timer(&info->nc_timer); - } -} - -/* - * return the usage calculated by si and layer 2 protocol - */ -static int -isdn_calc_usage(int si, int l2) -{ - int usg = ISDN_USAGE_MODEM; - -#ifdef CONFIG_ISDN_AUDIO - if (si == 1) { - switch (l2) { - case ISDN_PROTO_L2_MODEM: - usg = ISDN_USAGE_MODEM; - break; -#ifdef CONFIG_ISDN_TTY_FAX - case ISDN_PROTO_L2_FAX: - usg = ISDN_USAGE_FAX; - break; -#endif - case ISDN_PROTO_L2_TRANS: - default: - usg = ISDN_USAGE_VOICE; - break; - } - } -#endif - return (usg); -} - -/* isdn_tty_dial() performs dialing of a tty an the necessary - * setup of the lower levels before that. - */ -static void -isdn_tty_dial(char *n, modem_info *info, atemu *m) -{ - int usg = ISDN_USAGE_MODEM; - int si = 7; - int l2 = m->mdmreg[REG_L2PROT]; - u_long flags; - isdn_ctrl cmd; - int i; - int j; - - for (j = 7; j >= 0; j--) - if (m->mdmreg[REG_SI1] & (1 << j)) { - si = bit2si[j]; - break; - } - usg = isdn_calc_usage(si, l2); -#ifdef CONFIG_ISDN_AUDIO - if ((si == 1) && - (l2 != ISDN_PROTO_L2_MODEM) -#ifdef CONFIG_ISDN_TTY_FAX - && (l2 != ISDN_PROTO_L2_FAX) -#endif - ) { - l2 = ISDN_PROTO_L2_TRANS; - usg = ISDN_USAGE_VOICE; - } -#endif - m->mdmreg[REG_SI1I] = si2bit[si]; - spin_lock_irqsave(&dev->lock, flags); - i = isdn_get_free_channel(usg, l2, m->mdmreg[REG_L3PROT], -1, -1, m->msn); - if (i < 0) { - spin_unlock_irqrestore(&dev->lock, flags); - isdn_tty_modem_result(RESULT_NO_DIALTONE, info); - } else { - info->isdn_driver = dev->drvmap[i]; - info->isdn_channel = dev->chanmap[i]; - info->drv_index = i; - dev->m_idx[i] = info->line; - dev->usage[i] |= ISDN_USAGE_OUTGOING; - info->last_dir = 1; - strcpy(info->last_num, n); - isdn_info_update(); - spin_unlock_irqrestore(&dev->lock, flags); - cmd.driver = info->isdn_driver; - cmd.arg = info->isdn_channel; - cmd.command = ISDN_CMD_CLREAZ; - isdn_command(&cmd); - 
strcpy(cmd.parm.num, isdn_map_eaz2msn(m->msn, info->isdn_driver)); - cmd.driver = info->isdn_driver; - cmd.command = ISDN_CMD_SETEAZ; - isdn_command(&cmd); - cmd.driver = info->isdn_driver; - cmd.command = ISDN_CMD_SETL2; - info->last_l2 = l2; - cmd.arg = info->isdn_channel + (l2 << 8); - isdn_command(&cmd); - cmd.driver = info->isdn_driver; - cmd.command = ISDN_CMD_SETL3; - cmd.arg = info->isdn_channel + (m->mdmreg[REG_L3PROT] << 8); -#ifdef CONFIG_ISDN_TTY_FAX - if (l2 == ISDN_PROTO_L2_FAX) { - cmd.parm.fax = info->fax; - info->fax->direction = ISDN_TTY_FAX_CONN_OUT; - } -#endif - isdn_command(&cmd); - cmd.driver = info->isdn_driver; - cmd.arg = info->isdn_channel; - sprintf(cmd.parm.setup.phone, "%s", n); - sprintf(cmd.parm.setup.eazmsn, "%s", - isdn_map_eaz2msn(m->msn, info->isdn_driver)); - cmd.parm.setup.si1 = si; - cmd.parm.setup.si2 = m->mdmreg[REG_SI2]; - cmd.command = ISDN_CMD_DIAL; - info->dialing = 1; - info->emu.carrierwait = 0; - strcpy(dev->num[i], n); - isdn_info_update(); - isdn_command(&cmd); - isdn_timer_ctrl(ISDN_TIMER_CARRIER, 1); - } -} - -/* isdn_tty_hangup() disassociates a tty from the real - * ISDN-line (hangup). The usage-status is cleared - * and some cleanup is done also. 
- */ -void -isdn_tty_modem_hup(modem_info *info, int local) -{ - isdn_ctrl cmd; - int di, ch; - - if (!info) - return; - - di = info->isdn_driver; - ch = info->isdn_channel; - if (di < 0 || ch < 0) - return; - - info->isdn_driver = -1; - info->isdn_channel = -1; - -#ifdef ISDN_DEBUG_MODEM_HUP - printk(KERN_DEBUG "Mhup ttyI%d\n", info->line); -#endif - info->rcvsched = 0; - isdn_tty_flush_buffer(info->port.tty); - if (info->online) { - info->last_lhup = local; - info->online = 0; - isdn_tty_modem_result(RESULT_NO_CARRIER, info); - } -#ifdef CONFIG_ISDN_AUDIO - info->vonline = 0; -#ifdef CONFIG_ISDN_TTY_FAX - info->faxonline = 0; - info->fax->phase = ISDN_FAX_PHASE_IDLE; -#endif - info->emu.vpar[4] = 0; - info->emu.vpar[5] = 8; - kfree(info->dtmf_state); - info->dtmf_state = NULL; - kfree(info->silence_state); - info->silence_state = NULL; - kfree(info->adpcms); - info->adpcms = NULL; - kfree(info->adpcmr); - info->adpcmr = NULL; -#endif - if ((info->msr & UART_MSR_RI) && - (info->emu.mdmreg[REG_RUNG] & BIT_RUNG)) - isdn_tty_modem_result(RESULT_RUNG, info); - info->msr &= ~(UART_MSR_DCD | UART_MSR_RI); - info->lsr |= UART_LSR_TEMT; - - if (local) { - cmd.driver = di; - cmd.command = ISDN_CMD_HANGUP; - cmd.arg = ch; - isdn_command(&cmd); - } - - isdn_all_eaz(di, ch); - info->emu.mdmreg[REG_RINGCNT] = 0; - isdn_free_channel(di, ch, 0); - - if (info->drv_index >= 0) { - dev->m_idx[info->drv_index] = -1; - info->drv_index = -1; - } -} - -/* - * Begin of a CAPI like interface, currently used only for - * supplementary service (CAPI 2.0 part III) - */ -#include -#include - -int -isdn_tty_capi_facility(capi_msg *cm) { - return (-1); /* dummy */ -} - -/* isdn_tty_suspend() tries to suspend the current tty connection - */ -static void -isdn_tty_suspend(char *id, modem_info *info, atemu *m) -{ - isdn_ctrl cmd; - - int l; - - if (!info) - return; - -#ifdef ISDN_DEBUG_MODEM_SERVICES - printk(KERN_DEBUG "Msusp ttyI%d\n", info->line); -#endif - l = strlen(id); - if 
((info->isdn_driver >= 0)) { - cmd.parm.cmsg.Length = l + 18; - cmd.parm.cmsg.Command = CAPI_FACILITY; - cmd.parm.cmsg.Subcommand = CAPI_REQ; - cmd.parm.cmsg.adr.Controller = info->isdn_driver + 1; - cmd.parm.cmsg.para[0] = 3; /* 16 bit 0x0003 suplementary service */ - cmd.parm.cmsg.para[1] = 0; - cmd.parm.cmsg.para[2] = l + 3; - cmd.parm.cmsg.para[3] = 4; /* 16 bit 0x0004 Suspend */ - cmd.parm.cmsg.para[4] = 0; - cmd.parm.cmsg.para[5] = l; - memcpy(&cmd.parm.cmsg.para[6], id, l); - cmd.command = CAPI_PUT_MESSAGE; - cmd.driver = info->isdn_driver; - cmd.arg = info->isdn_channel; - isdn_command(&cmd); - } -} - -/* isdn_tty_resume() tries to resume a suspended call - * setup of the lower levels before that. unfortunately here is no - * checking for compatibility of used protocols implemented by Q931 - * It does the same things like isdn_tty_dial, the last command - * is different, may be we can merge it. - */ - -static void -isdn_tty_resume(char *id, modem_info *info, atemu *m) -{ - int usg = ISDN_USAGE_MODEM; - int si = 7; - int l2 = m->mdmreg[REG_L2PROT]; - isdn_ctrl cmd; - ulong flags; - int i; - int j; - int l; - - l = strlen(id); - for (j = 7; j >= 0; j--) - if (m->mdmreg[REG_SI1] & (1 << j)) { - si = bit2si[j]; - break; - } - usg = isdn_calc_usage(si, l2); -#ifdef CONFIG_ISDN_AUDIO - if ((si == 1) && - (l2 != ISDN_PROTO_L2_MODEM) -#ifdef CONFIG_ISDN_TTY_FAX - && (l2 != ISDN_PROTO_L2_FAX) -#endif - ) { - l2 = ISDN_PROTO_L2_TRANS; - usg = ISDN_USAGE_VOICE; - } -#endif - m->mdmreg[REG_SI1I] = si2bit[si]; - spin_lock_irqsave(&dev->lock, flags); - i = isdn_get_free_channel(usg, l2, m->mdmreg[REG_L3PROT], -1, -1, m->msn); - if (i < 0) { - spin_unlock_irqrestore(&dev->lock, flags); - isdn_tty_modem_result(RESULT_NO_DIALTONE, info); - } else { - info->isdn_driver = dev->drvmap[i]; - info->isdn_channel = dev->chanmap[i]; - info->drv_index = i; - dev->m_idx[i] = info->line; - dev->usage[i] |= ISDN_USAGE_OUTGOING; - info->last_dir = 1; -// strcpy(info->last_num, n); - 
isdn_info_update(); - spin_unlock_irqrestore(&dev->lock, flags); - cmd.driver = info->isdn_driver; - cmd.arg = info->isdn_channel; - cmd.command = ISDN_CMD_CLREAZ; - isdn_command(&cmd); - strcpy(cmd.parm.num, isdn_map_eaz2msn(m->msn, info->isdn_driver)); - cmd.driver = info->isdn_driver; - cmd.command = ISDN_CMD_SETEAZ; - isdn_command(&cmd); - cmd.driver = info->isdn_driver; - cmd.command = ISDN_CMD_SETL2; - info->last_l2 = l2; - cmd.arg = info->isdn_channel + (l2 << 8); - isdn_command(&cmd); - cmd.driver = info->isdn_driver; - cmd.command = ISDN_CMD_SETL3; - cmd.arg = info->isdn_channel + (m->mdmreg[REG_L3PROT] << 8); - isdn_command(&cmd); - cmd.driver = info->isdn_driver; - cmd.arg = info->isdn_channel; - cmd.parm.cmsg.Length = l + 18; - cmd.parm.cmsg.Command = CAPI_FACILITY; - cmd.parm.cmsg.Subcommand = CAPI_REQ; - cmd.parm.cmsg.adr.Controller = info->isdn_driver + 1; - cmd.parm.cmsg.para[0] = 3; /* 16 bit 0x0003 suplementary service */ - cmd.parm.cmsg.para[1] = 0; - cmd.parm.cmsg.para[2] = l + 3; - cmd.parm.cmsg.para[3] = 5; /* 16 bit 0x0005 Resume */ - cmd.parm.cmsg.para[4] = 0; - cmd.parm.cmsg.para[5] = l; - memcpy(&cmd.parm.cmsg.para[6], id, l); - cmd.command = CAPI_PUT_MESSAGE; - info->dialing = 1; -// strcpy(dev->num[i], n); - isdn_info_update(); - isdn_command(&cmd); - isdn_timer_ctrl(ISDN_TIMER_CARRIER, 1); - } -} - -/* isdn_tty_send_msg() sends a message to a HL driver - * This is used for hybrid modem cards to send AT commands to it - */ - -static void -isdn_tty_send_msg(modem_info *info, atemu *m, char *msg) -{ - int usg = ISDN_USAGE_MODEM; - int si = 7; - int l2 = m->mdmreg[REG_L2PROT]; - isdn_ctrl cmd; - ulong flags; - int i; - int j; - int l; - - l = min(strlen(msg), sizeof(cmd.parm) - sizeof(cmd.parm.cmsg) - + sizeof(cmd.parm.cmsg.para) - 2); - - if (!l) { - isdn_tty_modem_result(RESULT_ERROR, info); - return; - } - for (j = 7; j >= 0; j--) - if (m->mdmreg[REG_SI1] & (1 << j)) { - si = bit2si[j]; - break; - } - usg = isdn_calc_usage(si, l2); 
-#ifdef CONFIG_ISDN_AUDIO - if ((si == 1) && - (l2 != ISDN_PROTO_L2_MODEM) -#ifdef CONFIG_ISDN_TTY_FAX - && (l2 != ISDN_PROTO_L2_FAX) -#endif - ) { - l2 = ISDN_PROTO_L2_TRANS; - usg = ISDN_USAGE_VOICE; - } -#endif - m->mdmreg[REG_SI1I] = si2bit[si]; - spin_lock_irqsave(&dev->lock, flags); - i = isdn_get_free_channel(usg, l2, m->mdmreg[REG_L3PROT], -1, -1, m->msn); - if (i < 0) { - spin_unlock_irqrestore(&dev->lock, flags); - isdn_tty_modem_result(RESULT_NO_DIALTONE, info); - } else { - info->isdn_driver = dev->drvmap[i]; - info->isdn_channel = dev->chanmap[i]; - info->drv_index = i; - dev->m_idx[i] = info->line; - dev->usage[i] |= ISDN_USAGE_OUTGOING; - info->last_dir = 1; - isdn_info_update(); - spin_unlock_irqrestore(&dev->lock, flags); - cmd.driver = info->isdn_driver; - cmd.arg = info->isdn_channel; - cmd.command = ISDN_CMD_CLREAZ; - isdn_command(&cmd); - strcpy(cmd.parm.num, isdn_map_eaz2msn(m->msn, info->isdn_driver)); - cmd.driver = info->isdn_driver; - cmd.command = ISDN_CMD_SETEAZ; - isdn_command(&cmd); - cmd.driver = info->isdn_driver; - cmd.command = ISDN_CMD_SETL2; - info->last_l2 = l2; - cmd.arg = info->isdn_channel + (l2 << 8); - isdn_command(&cmd); - cmd.driver = info->isdn_driver; - cmd.command = ISDN_CMD_SETL3; - cmd.arg = info->isdn_channel + (m->mdmreg[REG_L3PROT] << 8); - isdn_command(&cmd); - cmd.driver = info->isdn_driver; - cmd.arg = info->isdn_channel; - cmd.parm.cmsg.Length = l + 14; - cmd.parm.cmsg.Command = CAPI_MANUFACTURER; - cmd.parm.cmsg.Subcommand = CAPI_REQ; - cmd.parm.cmsg.adr.Controller = info->isdn_driver + 1; - cmd.parm.cmsg.para[0] = l + 1; - strncpy(&cmd.parm.cmsg.para[1], msg, l); - cmd.parm.cmsg.para[l + 1] = 0xd; - cmd.command = CAPI_PUT_MESSAGE; -/* info->dialing = 1; - strcpy(dev->num[i], n); - isdn_info_update(); -*/ - isdn_command(&cmd); - } -} - -static inline int -isdn_tty_paranoia_check(modem_info *info, char *name, const char *routine) -{ -#ifdef MODEM_PARANOIA_CHECK - if (!info) { - printk(KERN_WARNING "isdn_tty: 
null info_struct for %s in %s\n", - name, routine); - return 1; - } - if (info->magic != ISDN_ASYNC_MAGIC) { - printk(KERN_WARNING "isdn_tty: bad magic for modem struct %s in %s\n", - name, routine); - return 1; - } -#endif - return 0; -} - -/* - * This routine is called to set the UART divisor registers to match - * the specified baud rate for a serial port. - */ -static void -isdn_tty_change_speed(modem_info *info) -{ - struct tty_port *port = &info->port; - uint cflag, - cval, - quot; - int i; - - if (!port->tty) - return; - cflag = port->tty->termios.c_cflag; - - quot = i = cflag & CBAUD; - if (i & CBAUDEX) { - i &= ~CBAUDEX; - if (i < 1 || i > 2) - port->tty->termios.c_cflag &= ~CBAUDEX; - else - i += 15; - } - if (quot) { - info->mcr |= UART_MCR_DTR; - isdn_tty_modem_ncarrier(info); - } else { - info->mcr &= ~UART_MCR_DTR; - if (info->emu.mdmreg[REG_DTRHUP] & BIT_DTRHUP) { -#ifdef ISDN_DEBUG_MODEM_HUP - printk(KERN_DEBUG "Mhup in changespeed\n"); -#endif - if (info->online) - info->ncarrier = 1; - isdn_tty_modem_reset_regs(info, 0); - isdn_tty_modem_hup(info, 1); - } - return; - } - /* byte size and parity */ - cval = cflag & (CSIZE | CSTOPB); - cval >>= 4; - if (cflag & PARENB) - cval |= UART_LCR_PARITY; - if (!(cflag & PARODD)) - cval |= UART_LCR_EPAR; - - tty_port_set_check_carrier(port, ~cflag & CLOCAL); -} - -static int -isdn_tty_startup(modem_info *info) -{ - if (tty_port_initialized(&info->port)) - return 0; - isdn_lock_drivers(); -#ifdef ISDN_DEBUG_MODEM_OPEN - printk(KERN_DEBUG "starting up ttyi%d ...\n", info->line); -#endif - /* - * Now, initialize the UART - */ - info->mcr = UART_MCR_DTR | UART_MCR_RTS | UART_MCR_OUT2; - if (info->port.tty) - clear_bit(TTY_IO_ERROR, &info->port.tty->flags); - /* - * and set the speed of the serial port - */ - isdn_tty_change_speed(info); - - tty_port_set_initialized(&info->port, 1); - info->msr |= (UART_MSR_DSR | UART_MSR_CTS); - info->send_outstanding = 0; - return 0; -} - -/* - * This routine will shutdown a 
serial port; interrupts are disabled, and - * DTR is dropped if the hangup on close termio flag is on. - */ -static void -isdn_tty_shutdown(modem_info *info) -{ - if (!tty_port_initialized(&info->port)) - return; -#ifdef ISDN_DEBUG_MODEM_OPEN - printk(KERN_DEBUG "Shutting down isdnmodem port %d ....\n", info->line); -#endif - isdn_unlock_drivers(); - info->msr &= ~UART_MSR_RI; - if (!info->port.tty || (info->port.tty->termios.c_cflag & HUPCL)) { - info->mcr &= ~(UART_MCR_DTR | UART_MCR_RTS); - if (info->emu.mdmreg[REG_DTRHUP] & BIT_DTRHUP) { - isdn_tty_modem_reset_regs(info, 0); -#ifdef ISDN_DEBUG_MODEM_HUP - printk(KERN_DEBUG "Mhup in isdn_tty_shutdown\n"); -#endif - isdn_tty_modem_hup(info, 1); - } - } - if (info->port.tty) - set_bit(TTY_IO_ERROR, &info->port.tty->flags); - - tty_port_set_initialized(&info->port, 0); -} - -/* isdn_tty_write() is the main send-routine. It is called from the upper - * levels within the kernel to perform sending data. Depending on the - * online-flag it either directs output to the at-command-interpreter or - * to the lower level. Additional tasks done here: - * - If online, check for escape-sequence (+++) - * - If sending audio-data, call isdn_tty_DLEdown() to parse DLE-codes. - * - If receiving audio-data, call isdn_tty_end_vrx() to abort if needed. - * - If dialing, abort dial. 
- */ -static int -isdn_tty_write(struct tty_struct *tty, const u_char *buf, int count) -{ - int c; - int total = 0; - modem_info *info = (modem_info *) tty->driver_data; - atemu *m = &info->emu; - - if (isdn_tty_paranoia_check(info, tty->name, "isdn_tty_write")) - return 0; - /* See isdn_tty_senddown() */ - atomic_inc(&info->xmit_lock); - while (1) { - c = count; - if (c > info->xmit_size - info->xmit_count) - c = info->xmit_size - info->xmit_count; - if (info->isdn_driver >= 0 && c > dev->drv[info->isdn_driver]->maxbufsize) - c = dev->drv[info->isdn_driver]->maxbufsize; - if (c <= 0) - break; - if ((info->online > 1) -#ifdef CONFIG_ISDN_AUDIO - || (info->vonline & 3) -#endif - ) { -#ifdef CONFIG_ISDN_AUDIO - if (!info->vonline) -#endif - isdn_tty_check_esc(buf, m->mdmreg[REG_ESC], c, - &(m->pluscount), - &(m->lastplus)); - memcpy(&info->port.xmit_buf[info->xmit_count], buf, c); -#ifdef CONFIG_ISDN_AUDIO - if (info->vonline) { - int cc = isdn_tty_handleDLEdown(info, m, c); - if (info->vonline & 2) { - if (!cc) { - /* If DLE decoding results in zero-transmit, but - * c originally was non-zero, do a wakeup. - */ - tty_wakeup(tty); - info->msr |= UART_MSR_CTS; - info->lsr |= UART_LSR_TEMT; - } - info->xmit_count += cc; - } - if ((info->vonline & 3) == 1) { - /* Do NOT handle Ctrl-Q or Ctrl-S - * when in full-duplex audio mode. 
- */ - if (isdn_tty_end_vrx(buf, c)) { - info->vonline &= ~1; -#ifdef ISDN_DEBUG_MODEM_VOICE - printk(KERN_DEBUG - "got !^Q/^S, send DLE-ETX,VCON on ttyI%d\n", - info->line); -#endif - isdn_tty_at_cout("\020\003\r\nVCON\r\n", info); - } - } - } else - if (TTY_IS_FCLASS1(info)) { - int cc = isdn_tty_handleDLEdown(info, m, c); - - if (info->vonline & 4) { /* ETX seen */ - isdn_ctrl c; - - c.command = ISDN_CMD_FAXCMD; - c.driver = info->isdn_driver; - c.arg = info->isdn_channel; - c.parm.aux.cmd = ISDN_FAX_CLASS1_CTRL; - c.parm.aux.subcmd = ETX; - isdn_command(&c); - } - info->vonline = 0; -#ifdef ISDN_DEBUG_MODEM_VOICE - printk(KERN_DEBUG "fax dle cc/c %d/%d\n", cc, c); -#endif - info->xmit_count += cc; - } else -#endif - info->xmit_count += c; - } else { - info->msr |= UART_MSR_CTS; - info->lsr |= UART_LSR_TEMT; - if (info->dialing) { - info->dialing = 0; -#ifdef ISDN_DEBUG_MODEM_HUP - printk(KERN_DEBUG "Mhup in isdn_tty_write\n"); -#endif - isdn_tty_modem_result(RESULT_NO_CARRIER, info); - isdn_tty_modem_hup(info, 1); - } else - c = isdn_tty_edit_at(buf, c, info); - } - buf += c; - count -= c; - total += c; - } - atomic_dec(&info->xmit_lock); - if ((info->xmit_count) || !skb_queue_empty(&info->xmit_queue)) { - if (m->mdmreg[REG_DXMT] & BIT_DXMT) { - isdn_tty_senddown(info); - isdn_tty_tint(info); - } - isdn_timer_ctrl(ISDN_TIMER_MODEMXMIT, 1); - } - return total; -} - -static int -isdn_tty_write_room(struct tty_struct *tty) -{ - modem_info *info = (modem_info *) tty->driver_data; - int ret; - - if (isdn_tty_paranoia_check(info, tty->name, "isdn_tty_write_room")) - return 0; - if (!info->online) - return info->xmit_size; - ret = info->xmit_size - info->xmit_count; - return (ret < 0) ? 
0 : ret; -} - -static int -isdn_tty_chars_in_buffer(struct tty_struct *tty) -{ - modem_info *info = (modem_info *) tty->driver_data; - - if (isdn_tty_paranoia_check(info, tty->name, "isdn_tty_chars_in_buffer")) - return 0; - if (!info->online) - return 0; - return (info->xmit_count); -} - -static void -isdn_tty_flush_buffer(struct tty_struct *tty) -{ - modem_info *info; - - if (!tty) { - return; - } - info = (modem_info *) tty->driver_data; - if (isdn_tty_paranoia_check(info, tty->name, "isdn_tty_flush_buffer")) { - return; - } - isdn_tty_cleanup_xmit(info); - info->xmit_count = 0; - tty_wakeup(tty); -} - -static void -isdn_tty_flush_chars(struct tty_struct *tty) -{ - modem_info *info = (modem_info *) tty->driver_data; - - if (isdn_tty_paranoia_check(info, tty->name, "isdn_tty_flush_chars")) - return; - if ((info->xmit_count) || !skb_queue_empty(&info->xmit_queue)) - isdn_timer_ctrl(ISDN_TIMER_MODEMXMIT, 1); -} - -/* - * ------------------------------------------------------------ - * isdn_tty_throttle() - * - * This routine is called by the upper-layer tty layer to signal that - * incoming characters should be throttled. 
- * ------------------------------------------------------------ - */ -static void -isdn_tty_throttle(struct tty_struct *tty) -{ - modem_info *info = (modem_info *) tty->driver_data; - - if (isdn_tty_paranoia_check(info, tty->name, "isdn_tty_throttle")) - return; - if (I_IXOFF(tty)) - info->x_char = STOP_CHAR(tty); - info->mcr &= ~UART_MCR_RTS; -} - -static void -isdn_tty_unthrottle(struct tty_struct *tty) -{ - modem_info *info = (modem_info *) tty->driver_data; - - if (isdn_tty_paranoia_check(info, tty->name, "isdn_tty_unthrottle")) - return; - if (I_IXOFF(tty)) { - if (info->x_char) - info->x_char = 0; - else - info->x_char = START_CHAR(tty); - } - info->mcr |= UART_MCR_RTS; -} - -/* - * ------------------------------------------------------------ - * isdn_tty_ioctl() and friends - * ------------------------------------------------------------ - */ - -/* - * isdn_tty_get_lsr_info - get line status register info - * - * Purpose: Let user call ioctl() to get info when the UART physically - * is emptied. On bus types like RS485, the transmitter must - * release the bus after transmitting. This must be done when - * the transmit shift register is empty, not be done when the - * transmit holding register is empty. This functionality - * allows RS485 driver to be written in user space. - */ -static int -isdn_tty_get_lsr_info(modem_info *info, uint __user *value) -{ - u_char status; - uint result; - - status = info->lsr; - result = ((status & UART_LSR_TEMT) ? 
TIOCSER_TEMT : 0); - return put_user(result, value); -} - - -static int -isdn_tty_tiocmget(struct tty_struct *tty) -{ - modem_info *info = (modem_info *) tty->driver_data; - u_char control, status; - - if (isdn_tty_paranoia_check(info, tty->name, __func__)) - return -ENODEV; - if (tty_io_error(tty)) - return -EIO; - - mutex_lock(&modem_info_mutex); -#ifdef ISDN_DEBUG_MODEM_IOCTL - printk(KERN_DEBUG "ttyI%d ioctl TIOCMGET\n", info->line); -#endif - - control = info->mcr; - status = info->msr; - mutex_unlock(&modem_info_mutex); - return ((control & UART_MCR_RTS) ? TIOCM_RTS : 0) - | ((control & UART_MCR_DTR) ? TIOCM_DTR : 0) - | ((status & UART_MSR_DCD) ? TIOCM_CAR : 0) - | ((status & UART_MSR_RI) ? TIOCM_RNG : 0) - | ((status & UART_MSR_DSR) ? TIOCM_DSR : 0) - | ((status & UART_MSR_CTS) ? TIOCM_CTS : 0); -} - -static int -isdn_tty_tiocmset(struct tty_struct *tty, - unsigned int set, unsigned int clear) -{ - modem_info *info = (modem_info *) tty->driver_data; - - if (isdn_tty_paranoia_check(info, tty->name, __func__)) - return -ENODEV; - if (tty_io_error(tty)) - return -EIO; - -#ifdef ISDN_DEBUG_MODEM_IOCTL - printk(KERN_DEBUG "ttyI%d ioctl TIOCMxxx: %x %x\n", info->line, set, clear); -#endif - - mutex_lock(&modem_info_mutex); - if (set & TIOCM_RTS) - info->mcr |= UART_MCR_RTS; - if (set & TIOCM_DTR) { - info->mcr |= UART_MCR_DTR; - isdn_tty_modem_ncarrier(info); - } - - if (clear & TIOCM_RTS) - info->mcr &= ~UART_MCR_RTS; - if (clear & TIOCM_DTR) { - info->mcr &= ~UART_MCR_DTR; - if (info->emu.mdmreg[REG_DTRHUP] & BIT_DTRHUP) { - isdn_tty_modem_reset_regs(info, 0); -#ifdef ISDN_DEBUG_MODEM_HUP - printk(KERN_DEBUG "Mhup in TIOCMSET\n"); -#endif - if (info->online) - info->ncarrier = 1; - isdn_tty_modem_hup(info, 1); - } - } - mutex_unlock(&modem_info_mutex); - return 0; -} - -static int -isdn_tty_ioctl(struct tty_struct *tty, uint cmd, ulong arg) -{ - modem_info *info = (modem_info *) tty->driver_data; - - if (isdn_tty_paranoia_check(info, tty->name, 
"isdn_tty_ioctl")) - return -ENODEV; - if (tty_io_error(tty)) - return -EIO; - switch (cmd) { - case TIOCSERGETLSR: /* Get line status register */ -#ifdef ISDN_DEBUG_MODEM_IOCTL - printk(KERN_DEBUG "ttyI%d ioctl TIOCSERGETLSR\n", info->line); -#endif - return isdn_tty_get_lsr_info(info, (uint __user *) arg); - default: -#ifdef ISDN_DEBUG_MODEM_IOCTL - printk(KERN_DEBUG "UNKNOWN ioctl 0x%08x on ttyi%d\n", cmd, info->line); -#endif - return -ENOIOCTLCMD; - } - return 0; -} - -static void -isdn_tty_set_termios(struct tty_struct *tty, struct ktermios *old_termios) -{ - modem_info *info = (modem_info *) tty->driver_data; - - mutex_lock(&modem_info_mutex); - if (!old_termios) - isdn_tty_change_speed(info); - else { - if (tty->termios.c_cflag == old_termios->c_cflag && - tty->termios.c_ispeed == old_termios->c_ispeed && - tty->termios.c_ospeed == old_termios->c_ospeed) { - mutex_unlock(&modem_info_mutex); - return; - } - isdn_tty_change_speed(info); - } - mutex_unlock(&modem_info_mutex); -} - -/* - * ------------------------------------------------------------ - * isdn_tty_open() and friends - * ------------------------------------------------------------ - */ - -static int isdn_tty_install(struct tty_driver *driver, struct tty_struct *tty) -{ - modem_info *info = &dev->mdm.info[tty->index]; - - if (isdn_tty_paranoia_check(info, tty->name, __func__)) - return -ENODEV; - - tty->driver_data = info; - - return tty_port_install(&info->port, driver, tty); -} - -/* - * This routine is called whenever a serial port is opened. It - * enables interrupts for a serial port, linking in its async structure into - * the IRQ chain. It also performs the serial-specific - * initialization for the tty structure. 
- */ -static int -isdn_tty_open(struct tty_struct *tty, struct file *filp) -{ - modem_info *info = tty->driver_data; - struct tty_port *port = &info->port; - int retval; - -#ifdef ISDN_DEBUG_MODEM_OPEN - printk(KERN_DEBUG "isdn_tty_open %s, count = %d\n", tty->name, - port->count); -#endif - port->count++; - port->tty = tty; - /* - * Start up serial port - */ - retval = isdn_tty_startup(info); - if (retval) { -#ifdef ISDN_DEBUG_MODEM_OPEN - printk(KERN_DEBUG "isdn_tty_open return after startup\n"); -#endif - return retval; - } - retval = tty_port_block_til_ready(port, tty, filp); - if (retval) { -#ifdef ISDN_DEBUG_MODEM_OPEN - printk(KERN_DEBUG "isdn_tty_open return after isdn_tty_block_til_ready \n"); -#endif - return retval; - } -#ifdef ISDN_DEBUG_MODEM_OPEN - printk(KERN_DEBUG "isdn_tty_open ttyi%d successful...\n", info->line); -#endif - dev->modempoll++; -#ifdef ISDN_DEBUG_MODEM_OPEN - printk(KERN_DEBUG "isdn_tty_open normal exit\n"); -#endif - return 0; -} - -static void -isdn_tty_close(struct tty_struct *tty, struct file *filp) -{ - modem_info *info = (modem_info *) tty->driver_data; - struct tty_port *port = &info->port; - ulong timeout; - - if (!info || isdn_tty_paranoia_check(info, tty->name, "isdn_tty_close")) - return; - if (tty_hung_up_p(filp)) { -#ifdef ISDN_DEBUG_MODEM_OPEN - printk(KERN_DEBUG "isdn_tty_close return after tty_hung_up_p\n"); -#endif - return; - } - if ((tty->count == 1) && (port->count != 1)) { - /* - * Uh, oh. tty->count is 1, which means that the tty - * structure will be freed. Info->count should always - * be one in these conditions. If it's greater than - * one, we've got real problems, since it means the - * serial port won't be shutdown. 
- */ - printk(KERN_ERR "isdn_tty_close: bad port count; tty->count is 1, " - "info->count is %d\n", port->count); - port->count = 1; - } - if (--port->count < 0) { - printk(KERN_ERR "isdn_tty_close: bad port count for ttyi%d: %d\n", - info->line, port->count); - port->count = 0; - } - if (port->count) { -#ifdef ISDN_DEBUG_MODEM_OPEN - printk(KERN_DEBUG "isdn_tty_close after info->count != 0\n"); -#endif - return; - } - info->closing = 1; - - tty->closing = 1; - /* - * At this point we stop accepting input. To do this, we - * disable the receive line status interrupts, and tell the - * interrupt driver to stop checking the data ready bit in the - * line status register. - */ - if (tty_port_initialized(port)) { - tty_wait_until_sent(tty, 3000); /* 30 seconds timeout */ - /* - * Before we drop DTR, make sure the UART transmitter - * has completely drained; this is especially - * important if there is a transmit FIFO! - */ - timeout = jiffies + HZ; - while (!(info->lsr & UART_LSR_TEMT)) { - schedule_timeout_interruptible(20); - if (time_after(jiffies, timeout)) - break; - } - } - dev->modempoll--; - isdn_tty_shutdown(info); - isdn_tty_flush_buffer(tty); - tty_ldisc_flush(tty); - port->tty = NULL; - info->ncarrier = 0; - - tty_port_close_end(port, tty); - info->closing = 0; -#ifdef ISDN_DEBUG_MODEM_OPEN - printk(KERN_DEBUG "isdn_tty_close normal exit\n"); -#endif -} - -/* - * isdn_tty_hangup() --- called by tty_hangup() when a hangup is signaled. - */ -static void -isdn_tty_hangup(struct tty_struct *tty) -{ - modem_info *info = (modem_info *) tty->driver_data; - struct tty_port *port = &info->port; - - if (isdn_tty_paranoia_check(info, tty->name, "isdn_tty_hangup")) - return; - isdn_tty_shutdown(info); - port->count = 0; - tty_port_set_active(port, 0); - port->tty = NULL; - wake_up_interruptible(&port->open_wait); -} - -/* This routine initializes all emulator-data. 
- */ -static void -isdn_tty_reset_profile(atemu *m) -{ - m->profile[0] = 0; - m->profile[1] = 0; - m->profile[2] = 43; - m->profile[3] = 13; - m->profile[4] = 10; - m->profile[5] = 8; - m->profile[6] = 3; - m->profile[7] = 60; - m->profile[8] = 2; - m->profile[9] = 6; - m->profile[10] = 7; - m->profile[11] = 70; - m->profile[12] = 0x45; - m->profile[13] = 4; - m->profile[14] = ISDN_PROTO_L2_X75I; - m->profile[15] = ISDN_PROTO_L3_TRANS; - m->profile[16] = ISDN_SERIAL_XMIT_SIZE / 16; - m->profile[17] = ISDN_MODEM_WINSIZE; - m->profile[18] = 4; - m->profile[19] = 0; - m->profile[20] = 0; - m->profile[23] = 0; - m->pmsn[0] = '\0'; - m->plmsn[0] = '\0'; -} - -#ifdef CONFIG_ISDN_AUDIO -static void -isdn_tty_modem_reset_vpar(atemu *m) -{ - m->vpar[0] = 2; /* Voice-device (2 = phone line) */ - m->vpar[1] = 0; /* Silence detection level (0 = none ) */ - m->vpar[2] = 70; /* Silence interval (7 sec. ) */ - m->vpar[3] = 2; /* Compression type (1 = ADPCM-2 ) */ - m->vpar[4] = 0; /* DTMF detection level (0 = softcode ) */ - m->vpar[5] = 8; /* DTMF interval (8 * 5 ms. 
) */ -} -#endif - -#ifdef CONFIG_ISDN_TTY_FAX -static void -isdn_tty_modem_reset_faxpar(modem_info *info) -{ - T30_s *f = info->fax; - - f->code = 0; - f->phase = ISDN_FAX_PHASE_IDLE; - f->direction = 0; - f->resolution = 1; /* fine */ - f->rate = 5; /* 14400 bit/s */ - f->width = 0; - f->length = 0; - f->compression = 0; - f->ecm = 0; - f->binary = 0; - f->scantime = 0; - memset(&f->id[0], 32, FAXIDLEN - 1); - f->id[FAXIDLEN - 1] = 0; - f->badlin = 0; - f->badmul = 0; - f->bor = 0; - f->nbc = 0; - f->cq = 0; - f->cr = 0; - f->ctcrty = 0; - f->minsp = 0; - f->phcto = 30; - f->rel = 0; - memset(&f->pollid[0], 32, FAXIDLEN - 1); - f->pollid[FAXIDLEN - 1] = 0; -} -#endif - -static void -isdn_tty_modem_reset_regs(modem_info *info, int force) -{ - atemu *m = &info->emu; - if ((m->mdmreg[REG_DTRR] & BIT_DTRR) || force) { - memcpy(m->mdmreg, m->profile, ISDN_MODEM_NUMREG); - memcpy(m->msn, m->pmsn, ISDN_MSNLEN); - memcpy(m->lmsn, m->plmsn, ISDN_LMSNLEN); - info->xmit_size = m->mdmreg[REG_PSIZE] * 16; - } -#ifdef CONFIG_ISDN_AUDIO - isdn_tty_modem_reset_vpar(m); -#endif -#ifdef CONFIG_ISDN_TTY_FAX - isdn_tty_modem_reset_faxpar(info); -#endif - m->mdmcmdl = 0; -} - -static void -modem_write_profile(atemu *m) -{ - memcpy(m->profile, m->mdmreg, ISDN_MODEM_NUMREG); - memcpy(m->pmsn, m->msn, ISDN_MSNLEN); - memcpy(m->plmsn, m->lmsn, ISDN_LMSNLEN); - if (dev->profd) - send_sig(SIGIO, dev->profd, 1); -} - -static const struct tty_operations modem_ops = { - .install = isdn_tty_install, - .open = isdn_tty_open, - .close = isdn_tty_close, - .write = isdn_tty_write, - .flush_chars = isdn_tty_flush_chars, - .write_room = isdn_tty_write_room, - .chars_in_buffer = isdn_tty_chars_in_buffer, - .flush_buffer = isdn_tty_flush_buffer, - .ioctl = isdn_tty_ioctl, - .throttle = isdn_tty_throttle, - .unthrottle = isdn_tty_unthrottle, - .set_termios = isdn_tty_set_termios, - .hangup = isdn_tty_hangup, - .tiocmget = isdn_tty_tiocmget, - .tiocmset = isdn_tty_tiocmset, -}; - -static int 
isdn_tty_carrier_raised(struct tty_port *port) -{ - modem_info *info = container_of(port, modem_info, port); - return info->msr & UART_MSR_DCD; -} - -static const struct tty_port_operations isdn_tty_port_ops = { - .carrier_raised = isdn_tty_carrier_raised, -}; - -int -isdn_tty_modem_init(void) -{ - isdn_modem_t *m; - int i, retval; - modem_info *info; - - m = &dev->mdm; - m->tty_modem = alloc_tty_driver(ISDN_MAX_CHANNELS); - if (!m->tty_modem) - return -ENOMEM; - m->tty_modem->name = "ttyI"; - m->tty_modem->major = ISDN_TTY_MAJOR; - m->tty_modem->minor_start = 0; - m->tty_modem->type = TTY_DRIVER_TYPE_SERIAL; - m->tty_modem->subtype = SERIAL_TYPE_NORMAL; - m->tty_modem->init_termios = tty_std_termios; - m->tty_modem->init_termios.c_cflag = B9600 | CS8 | CREAD | HUPCL | CLOCAL; - m->tty_modem->flags = TTY_DRIVER_REAL_RAW; - m->tty_modem->driver_name = "isdn_tty"; - tty_set_operations(m->tty_modem, &modem_ops); - retval = tty_register_driver(m->tty_modem); - if (retval) { - printk(KERN_WARNING "isdn_tty: Couldn't register modem-device\n"); - goto err; - } - for (i = 0; i < ISDN_MAX_CHANNELS; i++) { - info = &m->info[i]; -#ifdef CONFIG_ISDN_TTY_FAX - if (!(info->fax = kmalloc(sizeof(T30_s), GFP_KERNEL))) { - printk(KERN_ERR "Could not allocate fax t30-buffer\n"); - retval = -ENOMEM; - goto err_unregister; - } -#endif - tty_port_init(&info->port); - info->port.ops = &isdn_tty_port_ops; - spin_lock_init(&info->readlock); - sprintf(info->last_cause, "0000"); - sprintf(info->last_num, "none"); - info->last_dir = 0; - info->last_lhup = 1; - info->last_l2 = -1; - info->last_si = 0; - isdn_tty_reset_profile(&info->emu); - isdn_tty_modem_reset_regs(info, 1); - info->magic = ISDN_ASYNC_MAGIC; - info->line = i; - info->x_char = 0; - info->isdn_driver = -1; - info->isdn_channel = -1; - info->drv_index = -1; - info->xmit_size = ISDN_SERIAL_XMIT_SIZE; - timer_setup(&info->nc_timer, isdn_tty_modem_do_ncarrier, 0); - skb_queue_head_init(&info->xmit_queue); -#ifdef CONFIG_ISDN_AUDIO 
- skb_queue_head_init(&info->dtmf_queue); -#endif - info->port.xmit_buf = kmalloc(ISDN_SERIAL_XMIT_MAX + 5, - GFP_KERNEL); - if (!info->port.xmit_buf) { - printk(KERN_ERR "Could not allocate modem xmit-buffer\n"); - retval = -ENOMEM; - goto err_unregister; - } - /* Make room for T.70 header */ - info->port.xmit_buf += 4; - } - return 0; -err_unregister: - for (i--; i >= 0; i--) { - info = &m->info[i]; -#ifdef CONFIG_ISDN_TTY_FAX - kfree(info->fax); -#endif - kfree(info->port.xmit_buf - 4); - info->port.xmit_buf = NULL; - tty_port_destroy(&info->port); - } - tty_unregister_driver(m->tty_modem); -err: - put_tty_driver(m->tty_modem); - m->tty_modem = NULL; - return retval; -} - -void -isdn_tty_exit(void) -{ - modem_info *info; - int i; - - for (i = 0; i < ISDN_MAX_CHANNELS; i++) { - info = &dev->mdm.info[i]; - isdn_tty_cleanup_xmit(info); -#ifdef CONFIG_ISDN_TTY_FAX - kfree(info->fax); -#endif - kfree(info->port.xmit_buf - 4); - info->port.xmit_buf = NULL; - tty_port_destroy(&info->port); - } - tty_unregister_driver(dev->mdm.tty_modem); - put_tty_driver(dev->mdm.tty_modem); - dev->mdm.tty_modem = NULL; -} - - -/* - * isdn_tty_match_icall(char *MSN, atemu *tty_emulator, int dev_idx) - * match the MSN against the MSNs (glob patterns) defined for tty_emulator, - * and return 0 for match, 1 for no match, 2 if MSN could match if longer. 
- */ - -static int -isdn_tty_match_icall(char *cid, atemu *emu, int di) -{ -#ifdef ISDN_DEBUG_MODEM_ICALL - printk(KERN_DEBUG "m_fi: msn=%s lmsn=%s mmsn=%s mreg[SI1]=%d mreg[SI2]=%d\n", - emu->msn, emu->lmsn, isdn_map_eaz2msn(emu->msn, di), - emu->mdmreg[REG_SI1], emu->mdmreg[REG_SI2]); -#endif - if (strlen(emu->lmsn)) { - char *p = emu->lmsn; - char *q; - int tmp; - int ret = 0; - - while (1) { - if ((q = strchr(p, ';'))) - *q = '\0'; - if ((tmp = isdn_msncmp(cid, isdn_map_eaz2msn(p, di))) > ret) - ret = tmp; -#ifdef ISDN_DEBUG_MODEM_ICALL - printk(KERN_DEBUG "m_fi: lmsnX=%s mmsn=%s -> tmp=%d\n", - p, isdn_map_eaz2msn(emu->msn, di), tmp); -#endif - if (q) { - *q = ';'; - p = q; - p++; - } - if (!tmp) - return 0; - if (!q) - break; - } - return ret; - } else { - int tmp; - tmp = isdn_msncmp(cid, isdn_map_eaz2msn(emu->msn, di)); -#ifdef ISDN_DEBUG_MODEM_ICALL - printk(KERN_DEBUG "m_fi: mmsn=%s -> tmp=%d\n", - isdn_map_eaz2msn(emu->msn, di), tmp); -#endif - return tmp; - } -} - -/* - * An incoming call-request has arrived. - * Search the tty-devices for an appropriate device and bind - * it to the ISDN-Channel. - * Return: - * - * 0 = No matching device found. - * 1 = A matching device found. - * 3 = No match found, but eventually would match, if - * CID is longer. 
- */ -int -isdn_tty_find_icall(int di, int ch, setup_parm *setup) -{ - char *eaz; - int i; - int wret; - int idx; - int si1; - int si2; - char *nr; - ulong flags; - - if (!setup->phone[0]) { - nr = "0"; - printk(KERN_INFO "isdn_tty: Incoming call without OAD, assuming '0'\n"); - } else - nr = setup->phone; - si1 = (int) setup->si1; - si2 = (int) setup->si2; - if (!setup->eazmsn[0]) { - printk(KERN_WARNING "isdn_tty: Incoming call without CPN, assuming '0'\n"); - eaz = "0"; - } else - eaz = setup->eazmsn; -#ifdef ISDN_DEBUG_MODEM_ICALL - printk(KERN_DEBUG "m_fi: eaz=%s si1=%d si2=%d\n", eaz, si1, si2); -#endif - wret = 0; - spin_lock_irqsave(&dev->lock, flags); - for (i = 0; i < ISDN_MAX_CHANNELS; i++) { - modem_info *info = &dev->mdm.info[i]; - - if (info->port.count == 0) - continue; - if ((info->emu.mdmreg[REG_SI1] & si2bit[si1]) && /* SI1 is matching */ - (info->emu.mdmreg[REG_SI2] == si2)) { /* SI2 is matching */ - idx = isdn_dc2minor(di, ch); -#ifdef ISDN_DEBUG_MODEM_ICALL - printk(KERN_DEBUG "m_fi: match1 wret=%d\n", wret); - printk(KERN_DEBUG "m_fi: idx=%d flags=%08lx drv=%d ch=%d usg=%d\n", idx, - info->port.flags, info->isdn_driver, - info->isdn_channel, dev->usage[idx]); -#endif - if ( -#ifndef FIX_FILE_TRANSFER - tty_port_active(&info->port) && -#endif - (info->isdn_driver == -1) && - (info->isdn_channel == -1) && - (USG_NONE(dev->usage[idx]))) { - int matchret; - - if ((matchret = isdn_tty_match_icall(eaz, &info->emu, di)) > wret) - wret = matchret; - if (!matchret) { /* EAZ is matching */ - info->isdn_driver = di; - info->isdn_channel = ch; - info->drv_index = idx; - dev->m_idx[idx] = info->line; - dev->usage[idx] &= ISDN_USAGE_EXCLUSIVE; - dev->usage[idx] |= isdn_calc_usage(si1, info->emu.mdmreg[REG_L2PROT]); - strcpy(dev->num[idx], nr); - strcpy(info->emu.cpn, eaz); - info->emu.mdmreg[REG_SI1I] = si2bit[si1]; - info->emu.mdmreg[REG_PLAN] = setup->plan; - info->emu.mdmreg[REG_SCREEN] = setup->screen; - isdn_info_update(); - 
spin_unlock_irqrestore(&dev->lock, flags); - printk(KERN_INFO "isdn_tty: call from %s, -> RING on ttyI%d\n", nr, - info->line); - info->msr |= UART_MSR_RI; - isdn_tty_modem_result(RESULT_RING, info); - isdn_timer_ctrl(ISDN_TIMER_MODEMRING, 1); - return 1; - } - } - } - } - spin_unlock_irqrestore(&dev->lock, flags); - printk(KERN_INFO "isdn_tty: call from %s -> %s %s\n", nr, eaz, - ((dev->drv[di]->flags & DRV_FLAG_REJBUS) && (wret != 2)) ? "rejected" : "ignored"); - return (wret == 2) ? 3 : 0; -} - -int -isdn_tty_stat_callback(int i, isdn_ctrl *c) -{ - int mi; - modem_info *info; - char *e; - - if (i < 0) - return 0; - if ((mi = dev->m_idx[i]) >= 0) { - info = &dev->mdm.info[mi]; - switch (c->command) { - case ISDN_STAT_CINF: - printk(KERN_DEBUG "CHARGEINFO on ttyI%d: %ld %s\n", info->line, c->arg, c->parm.num); - info->emu.charge = (unsigned) simple_strtoul(c->parm.num, &e, 10); - if (e == (char *)c->parm.num) - info->emu.charge = 0; - - break; - case ISDN_STAT_BSENT: -#ifdef ISDN_TTY_STAT_DEBUG - printk(KERN_DEBUG "tty_STAT_BSENT ttyI%d\n", info->line); -#endif - if ((info->isdn_driver == c->driver) && - (info->isdn_channel == c->arg)) { - info->msr |= UART_MSR_CTS; - if (info->send_outstanding) - if (!(--info->send_outstanding)) - info->lsr |= UART_LSR_TEMT; - isdn_tty_tint(info); - return 1; - } - break; - case ISDN_STAT_CAUSE: -#ifdef ISDN_TTY_STAT_DEBUG - printk(KERN_DEBUG "tty_STAT_CAUSE ttyI%d\n", info->line); -#endif - /* Signal cause to tty-device */ - strncpy(info->last_cause, c->parm.num, 5); - return 1; - case ISDN_STAT_DISPLAY: -#ifdef ISDN_TTY_STAT_DEBUG - printk(KERN_DEBUG "tty_STAT_DISPLAY ttyI%d\n", info->line); -#endif - /* Signal display to tty-device */ - if ((info->emu.mdmreg[REG_DISPLAY] & BIT_DISPLAY) && - !(info->emu.mdmreg[REG_RESPNUM] & BIT_RESPNUM)) { - isdn_tty_at_cout("\r\n", info); - isdn_tty_at_cout("DISPLAY: ", info); - isdn_tty_at_cout(c->parm.display, info); - isdn_tty_at_cout("\r\n", info); - } - return 1; - case ISDN_STAT_DCONN: 
-#ifdef ISDN_TTY_STAT_DEBUG - printk(KERN_DEBUG "tty_STAT_DCONN ttyI%d\n", info->line); -#endif - if (tty_port_active(&info->port)) { - if (info->dialing == 1) { - info->dialing = 2; - return 1; - } - } - break; - case ISDN_STAT_DHUP: -#ifdef ISDN_TTY_STAT_DEBUG - printk(KERN_DEBUG "tty_STAT_DHUP ttyI%d\n", info->line); -#endif - if (tty_port_active(&info->port)) { - if (info->dialing == 1) - isdn_tty_modem_result(RESULT_BUSY, info); - if (info->dialing > 1) - isdn_tty_modem_result(RESULT_NO_CARRIER, info); - info->dialing = 0; -#ifdef ISDN_DEBUG_MODEM_HUP - printk(KERN_DEBUG "Mhup in ISDN_STAT_DHUP\n"); -#endif - isdn_tty_modem_hup(info, 0); - return 1; - } - break; - case ISDN_STAT_BCONN: -#ifdef ISDN_TTY_STAT_DEBUG - printk(KERN_DEBUG "tty_STAT_BCONN ttyI%d\n", info->line); -#endif - /* Wake up any processes waiting - * for incoming call of this device when - * DCD follow the state of incoming carrier - */ - if (info->port.blocked_open && - (info->emu.mdmreg[REG_DCD] & BIT_DCD)) { - wake_up_interruptible(&info->port.open_wait); - } - - /* Schedule CONNECT-Message to any tty - * waiting for it and - * set DCD-bit of its modem-status. 
- */ - if (tty_port_active(&info->port) || - (info->port.blocked_open && - (info->emu.mdmreg[REG_DCD] & BIT_DCD))) { - info->msr |= UART_MSR_DCD; - info->emu.charge = 0; - if (info->dialing & 0xf) - info->last_dir = 1; - else - info->last_dir = 0; - info->dialing = 0; - info->rcvsched = 1; - if (USG_MODEM(dev->usage[i])) { - if (info->emu.mdmreg[REG_L2PROT] == ISDN_PROTO_L2_MODEM) { - strcpy(info->emu.connmsg, c->parm.num); - isdn_tty_modem_result(RESULT_CONNECT, info); - } else - isdn_tty_modem_result(RESULT_CONNECT64000, info); - } - if (USG_VOICE(dev->usage[i])) - isdn_tty_modem_result(RESULT_VCON, info); - return 1; - } - break; - case ISDN_STAT_BHUP: -#ifdef ISDN_TTY_STAT_DEBUG - printk(KERN_DEBUG "tty_STAT_BHUP ttyI%d\n", info->line); -#endif - if (tty_port_active(&info->port)) { -#ifdef ISDN_DEBUG_MODEM_HUP - printk(KERN_DEBUG "Mhup in ISDN_STAT_BHUP\n"); -#endif - isdn_tty_modem_hup(info, 0); - return 1; - } - break; - case ISDN_STAT_NODCH: -#ifdef ISDN_TTY_STAT_DEBUG - printk(KERN_DEBUG "tty_STAT_NODCH ttyI%d\n", info->line); -#endif - if (tty_port_active(&info->port)) { - if (info->dialing) { - info->dialing = 0; - info->last_l2 = -1; - info->last_si = 0; - sprintf(info->last_cause, "0000"); - isdn_tty_modem_result(RESULT_NO_DIALTONE, info); - } - isdn_tty_modem_hup(info, 0); - return 1; - } - break; - case ISDN_STAT_UNLOAD: -#ifdef ISDN_TTY_STAT_DEBUG - printk(KERN_DEBUG "tty_STAT_UNLOAD ttyI%d\n", info->line); -#endif - for (i = 0; i < ISDN_MAX_CHANNELS; i++) { - info = &dev->mdm.info[i]; - if (info->isdn_driver == c->driver) { - if (info->online) - isdn_tty_modem_hup(info, 1); - } - } - return 1; -#ifdef CONFIG_ISDN_TTY_FAX - case ISDN_STAT_FAXIND: - if (tty_port_active(&info->port)) { - isdn_tty_fax_command(info, c); - } - break; -#endif -#ifdef CONFIG_ISDN_AUDIO - case ISDN_STAT_AUDIO: - if (tty_port_active(&info->port)) { - switch (c->parm.num[0]) { - case ISDN_AUDIO_DTMF: - if (info->vonline) { - isdn_audio_put_dle_code(info, - c->parm.num[1]); - } 
- break; - } - } - break; -#endif - } - } - return 0; -} - -/********************************************************************* - Modem-Emulator-Routines -*********************************************************************/ - -#define cmdchar(c) ((c >= ' ') && (c <= 0x7f)) - -/* - * Put a message from the AT-emulator into receive-buffer of tty, - * convert CR, LF, and BS to values in modem-registers 3, 4 and 5. - */ -void -isdn_tty_at_cout(char *msg, modem_info *info) -{ - struct tty_port *port = &info->port; - atemu *m = &info->emu; - char *p; - char c; - u_long flags; - struct sk_buff *skb = NULL; - char *sp = NULL; - int l; - - if (!msg) { - printk(KERN_WARNING "isdn_tty: Null-Message in isdn_tty_at_cout\n"); - return; - } - - l = strlen(msg); - - spin_lock_irqsave(&info->readlock, flags); - if (info->closing) { - spin_unlock_irqrestore(&info->readlock, flags); - return; - } - - /* use queue instead of direct, if online and */ - /* data is in queue or buffer is full */ - if (info->online && ((tty_buffer_request_room(port, l) < l) || - !skb_queue_empty(&dev->drv[info->isdn_driver]->rpqueue[info->isdn_channel]))) { - skb = alloc_skb(l, GFP_ATOMIC); - if (!skb) { - spin_unlock_irqrestore(&info->readlock, flags); - return; - } - sp = skb_put(skb, l); -#ifdef CONFIG_ISDN_AUDIO - ISDN_AUDIO_SKB_DLECOUNT(skb) = 0; - ISDN_AUDIO_SKB_LOCK(skb) = 0; -#endif - } - - for (p = msg; *p; p++) { - switch (*p) { - case '\r': - c = m->mdmreg[REG_CR]; - break; - case '\n': - c = m->mdmreg[REG_LF]; - break; - case '\b': - c = m->mdmreg[REG_BS]; - break; - default: - c = *p; - } - if (skb) { - *sp++ = c; - } else { - if (tty_insert_flip_char(port, c, TTY_NORMAL) == 0) - break; - } - } - if (skb) { - __skb_queue_tail(&dev->drv[info->isdn_driver]->rpqueue[info->isdn_channel], skb); - dev->drv[info->isdn_driver]->rcvcount[info->isdn_channel] += skb->len; - spin_unlock_irqrestore(&info->readlock, flags); - /* Schedule dequeuing */ - if (dev->modempoll && info->rcvsched) - 
isdn_timer_ctrl(ISDN_TIMER_MODEMREAD, 1); - - } else { - spin_unlock_irqrestore(&info->readlock, flags); - tty_flip_buffer_push(port); - } -} - -/* - * Perform ATH Hangup - */ -static void -isdn_tty_on_hook(modem_info *info) -{ - if (info->isdn_channel >= 0) { -#ifdef ISDN_DEBUG_MODEM_HUP - printk(KERN_DEBUG "Mhup in isdn_tty_on_hook\n"); -#endif - isdn_tty_modem_hup(info, 1); - } -} - -static void -isdn_tty_off_hook(void) -{ - printk(KERN_DEBUG "isdn_tty_off_hook\n"); -} - -#define PLUSWAIT1 (HZ / 2) /* 0.5 sec. */ -#define PLUSWAIT2 (HZ * 3 / 2) /* 1.5 sec */ - -/* - * Check Buffer for Modem-escape-sequence, activate timer-callback to - * isdn_tty_modem_escape() if sequence found. - * - * Parameters: - * p pointer to databuffer - * plus escape-character - * count length of buffer - * pluscount count of valid escape-characters so far - * lastplus timestamp of last character - */ -static void -isdn_tty_check_esc(const u_char *p, u_char plus, int count, int *pluscount, - u_long *lastplus) -{ - if (plus > 127) - return; - if (count > 3) { - p += count - 3; - count = 3; - *pluscount = 0; - } - while (count > 0) { - if (*(p++) == plus) { - if ((*pluscount)++) { - /* Time since last '+' > 0.5 sec. ? */ - if (time_after(jiffies, *lastplus + PLUSWAIT1)) - *pluscount = 1; - } else { - /* Time since last non-'+' < 1.5 sec. ? */ - if (time_before(jiffies, *lastplus + PLUSWAIT2)) - *pluscount = 0; - } - if ((*pluscount == 3) && (count == 1)) - isdn_timer_ctrl(ISDN_TIMER_MODEMPLUS, 1); - if (*pluscount > 3) - *pluscount = 1; - } else - *pluscount = 0; - *lastplus = jiffies; - count--; - } -} - -/* - * Return result of AT-emulator to tty-receive-buffer, depending on - * modem-register 12, bit 0 and 1. - * For CONNECT-messages also switch to online-mode. 
- * For RING-message handle auto-ATA if register 0 != 0 - */ - -static void -isdn_tty_modem_result(int code, modem_info *info) -{ - atemu *m = &info->emu; - static char *msg[] = - {"OK", "CONNECT", "RING", "NO CARRIER", "ERROR", - "CONNECT 64000", "NO DIALTONE", "BUSY", "NO ANSWER", - "RINGING", "NO MSN/EAZ", "VCON", "RUNG"}; - char s[ISDN_MSNLEN + 10]; - - switch (code) { - case RESULT_RING: - m->mdmreg[REG_RINGCNT]++; - if (m->mdmreg[REG_RINGCNT] == m->mdmreg[REG_RINGATA]) - /* Automatically accept incoming call */ - isdn_tty_cmd_ATA(info); - break; - case RESULT_NO_CARRIER: -#ifdef ISDN_DEBUG_MODEM_HUP - printk(KERN_DEBUG "modem_result: NO CARRIER %d %d\n", - info->closing, !info->port.tty); -#endif - m->mdmreg[REG_RINGCNT] = 0; - del_timer(&info->nc_timer); - info->ncarrier = 0; - if (info->closing || !info->port.tty) - return; - -#ifdef CONFIG_ISDN_AUDIO - if (info->vonline & 1) { -#ifdef ISDN_DEBUG_MODEM_VOICE - printk(KERN_DEBUG "res3: send DLE-ETX on ttyI%d\n", - info->line); -#endif - /* voice-recording, add DLE-ETX */ - isdn_tty_at_cout("\020\003", info); - } - if (info->vonline & 2) { -#ifdef ISDN_DEBUG_MODEM_VOICE - printk(KERN_DEBUG "res3: send DLE-DC4 on ttyI%d\n", - info->line); -#endif - /* voice-playing, add DLE-DC4 */ - isdn_tty_at_cout("\020\024", info); - } -#endif - break; - case RESULT_CONNECT: - case RESULT_CONNECT64000: - sprintf(info->last_cause, "0000"); - if (!info->online) - info->online = 2; - break; - case RESULT_VCON: -#ifdef ISDN_DEBUG_MODEM_VOICE - printk(KERN_DEBUG "res3: send VCON on ttyI%d\n", - info->line); -#endif - sprintf(info->last_cause, "0000"); - if (!info->online) - info->online = 1; - break; - } /* switch (code) */ - - if (m->mdmreg[REG_RESP] & BIT_RESP) { - /* Show results */ - if (m->mdmreg[REG_RESPNUM] & BIT_RESPNUM) { - /* Show numeric results only */ - sprintf(s, "\r\n%d\r\n", code); - isdn_tty_at_cout(s, info); - } else { - if (code == RESULT_RING) { - /* return if "show RUNG" and ringcounter>1 */ - if 
((m->mdmreg[REG_RUNG] & BIT_RUNG) && - (m->mdmreg[REG_RINGCNT] > 1)) - return; - /* print CID, _before_ _every_ ring */ - if (!(m->mdmreg[REG_CIDONCE] & BIT_CIDONCE)) { - isdn_tty_at_cout("\r\nCALLER NUMBER: ", info); - isdn_tty_at_cout(dev->num[info->drv_index], info); - if (m->mdmreg[REG_CDN] & BIT_CDN) { - isdn_tty_at_cout("\r\nCALLED NUMBER: ", info); - isdn_tty_at_cout(info->emu.cpn, info); - } - } - } - isdn_tty_at_cout("\r\n", info); - isdn_tty_at_cout(msg[code], info); - switch (code) { - case RESULT_CONNECT: - switch (m->mdmreg[REG_L2PROT]) { - case ISDN_PROTO_L2_MODEM: - isdn_tty_at_cout(" ", info); - isdn_tty_at_cout(m->connmsg, info); - break; - } - break; - case RESULT_RING: - /* Append CPN, if enabled */ - if ((m->mdmreg[REG_CPN] & BIT_CPN)) { - sprintf(s, "/%s", m->cpn); - isdn_tty_at_cout(s, info); - } - /* Print CID only once, _after_ 1st RING */ - if ((m->mdmreg[REG_CIDONCE] & BIT_CIDONCE) && - (m->mdmreg[REG_RINGCNT] == 1)) { - isdn_tty_at_cout("\r\n", info); - isdn_tty_at_cout("CALLER NUMBER: ", info); - isdn_tty_at_cout(dev->num[info->drv_index], info); - if (m->mdmreg[REG_CDN] & BIT_CDN) { - isdn_tty_at_cout("\r\nCALLED NUMBER: ", info); - isdn_tty_at_cout(info->emu.cpn, info); - } - } - break; - case RESULT_NO_CARRIER: - case RESULT_NO_DIALTONE: - case RESULT_BUSY: - case RESULT_NO_ANSWER: - m->mdmreg[REG_RINGCNT] = 0; - /* Append Cause-Message if enabled */ - if (m->mdmreg[REG_RESPXT] & BIT_RESPXT) { - sprintf(s, "/%s", info->last_cause); - isdn_tty_at_cout(s, info); - } - break; - case RESULT_CONNECT64000: - /* Append Protocol to CONNECT message */ - switch (m->mdmreg[REG_L2PROT]) { - case ISDN_PROTO_L2_X75I: - case ISDN_PROTO_L2_X75UI: - case ISDN_PROTO_L2_X75BUI: - isdn_tty_at_cout("/X.75", info); - break; - case ISDN_PROTO_L2_HDLC: - isdn_tty_at_cout("/HDLC", info); - break; - case ISDN_PROTO_L2_V11096: - isdn_tty_at_cout("/V110/9600", info); - break; - case ISDN_PROTO_L2_V11019: - isdn_tty_at_cout("/V110/19200", info); - break; - case 
ISDN_PROTO_L2_V11038: - isdn_tty_at_cout("/V110/38400", info); - break; - } - if (m->mdmreg[REG_T70] & BIT_T70) { - isdn_tty_at_cout("/T.70", info); - if (m->mdmreg[REG_T70] & BIT_T70_EXT) - isdn_tty_at_cout("+", info); - } - break; - } - isdn_tty_at_cout("\r\n", info); - } - } - if (code == RESULT_NO_CARRIER) { - if (info->closing || (!info->port.tty)) - return; - - if (tty_port_check_carrier(&info->port)) - tty_hangup(info->port.tty); - } -} - - -/* - * Display a modem-register-value. - */ -static void -isdn_tty_show_profile(int ridx, modem_info *info) -{ - char v[6]; - - sprintf(v, "\r\n%d", info->emu.mdmreg[ridx]); - isdn_tty_at_cout(v, info); -} - -/* - * Get MSN-string from char-pointer, set pointer to end of number - */ -static void -isdn_tty_get_msnstr(char *n, char **p) -{ - int limit = ISDN_MSNLEN - 1; - - while (((*p[0] >= '0' && *p[0] <= '9') || - /* Why a comma ??? */ - (*p[0] == ',') || (*p[0] == ':')) && - (limit--)) - *n++ = *p[0]++; - *n = '\0'; -} - -/* - * Get phone-number from modem-commandbuffer - */ -static void -isdn_tty_getdial(char *p, char *q, int cnt) -{ - int first = 1; - int limit = ISDN_MSNLEN - 1; /* MUST match the size of interface var to avoid - buffer overflow */ - - while (strchr(" 0123456789,#.*WPTSR-", *p) && *p && --cnt > 0) { - if ((*p >= '0' && *p <= '9') || ((*p == 'S') && first) || - ((*p == 'R') && first) || - (*p == '*') || (*p == '#')) { - *q++ = *p; - limit--; - } - if (!limit) - break; - p++; - first = 0; - } - *q = 0; -} - -#define PARSE_ERROR { isdn_tty_modem_result(RESULT_ERROR, info); return; } -#define PARSE_ERROR1 { isdn_tty_modem_result(RESULT_ERROR, info); return 1; } - -static void -isdn_tty_report(modem_info *info) -{ - atemu *m = &info->emu; - char s[80]; - - isdn_tty_at_cout("\r\nStatistics of last connection:\r\n\r\n", info); - sprintf(s, " Remote Number: %s\r\n", info->last_num); - isdn_tty_at_cout(s, info); - sprintf(s, " Direction: %s\r\n", info->last_dir ? 
"outgoing" : "incoming"); - isdn_tty_at_cout(s, info); - isdn_tty_at_cout(" Layer-2 Protocol: ", info); - switch (info->last_l2) { - case ISDN_PROTO_L2_X75I: - isdn_tty_at_cout("X.75i", info); - break; - case ISDN_PROTO_L2_X75UI: - isdn_tty_at_cout("X.75ui", info); - break; - case ISDN_PROTO_L2_X75BUI: - isdn_tty_at_cout("X.75bui", info); - break; - case ISDN_PROTO_L2_HDLC: - isdn_tty_at_cout("HDLC", info); - break; - case ISDN_PROTO_L2_V11096: - isdn_tty_at_cout("V.110 9600 Baud", info); - break; - case ISDN_PROTO_L2_V11019: - isdn_tty_at_cout("V.110 19200 Baud", info); - break; - case ISDN_PROTO_L2_V11038: - isdn_tty_at_cout("V.110 38400 Baud", info); - break; - case ISDN_PROTO_L2_TRANS: - isdn_tty_at_cout("transparent", info); - break; - case ISDN_PROTO_L2_MODEM: - isdn_tty_at_cout("modem", info); - break; - case ISDN_PROTO_L2_FAX: - isdn_tty_at_cout("fax", info); - break; - default: - isdn_tty_at_cout("unknown", info); - break; - } - if (m->mdmreg[REG_T70] & BIT_T70) { - isdn_tty_at_cout("/T.70", info); - if (m->mdmreg[REG_T70] & BIT_T70_EXT) - isdn_tty_at_cout("+", info); - } - isdn_tty_at_cout("\r\n", info); - isdn_tty_at_cout(" Service: ", info); - switch (info->last_si) { - case 1: - isdn_tty_at_cout("audio\r\n", info); - break; - case 5: - isdn_tty_at_cout("btx\r\n", info); - break; - case 7: - isdn_tty_at_cout("data\r\n", info); - break; - default: - sprintf(s, "%d\r\n", info->last_si); - isdn_tty_at_cout(s, info); - break; - } - sprintf(s, " Hangup location: %s\r\n", info->last_lhup ? "local" : "remote"); - isdn_tty_at_cout(s, info); - sprintf(s, " Last cause: %s\r\n", info->last_cause); - isdn_tty_at_cout(s, info); -} - -/* - * Parse AT&.. commands. 
- */ -static int -isdn_tty_cmd_ATand(char **p, modem_info *info) -{ - atemu *m = &info->emu; - int i; - char rb[100]; - -#define MAXRB (sizeof(rb) - 1) - - switch (*p[0]) { - case 'B': - /* &B - Set Buffersize */ - p[0]++; - i = isdn_getnum(p); - if ((i < 0) || (i > ISDN_SERIAL_XMIT_MAX)) - PARSE_ERROR1; -#ifdef CONFIG_ISDN_AUDIO - if ((m->mdmreg[REG_SI1] & 1) && (i > VBUF)) - PARSE_ERROR1; -#endif - m->mdmreg[REG_PSIZE] = i / 16; - info->xmit_size = m->mdmreg[REG_PSIZE] * 16; - switch (m->mdmreg[REG_L2PROT]) { - case ISDN_PROTO_L2_V11096: - case ISDN_PROTO_L2_V11019: - case ISDN_PROTO_L2_V11038: - info->xmit_size /= 10; - } - break; - case 'C': - /* &C - DCD Status */ - p[0]++; - switch (isdn_getnum(p)) { - case 0: - m->mdmreg[REG_DCD] &= ~BIT_DCD; - break; - case 1: - m->mdmreg[REG_DCD] |= BIT_DCD; - break; - default: - PARSE_ERROR1 - } - break; - case 'D': - /* &D - Set DTR-Low-behavior */ - p[0]++; - switch (isdn_getnum(p)) { - case 0: - m->mdmreg[REG_DTRHUP] &= ~BIT_DTRHUP; - m->mdmreg[REG_DTRR] &= ~BIT_DTRR; - break; - case 2: - m->mdmreg[REG_DTRHUP] |= BIT_DTRHUP; - m->mdmreg[REG_DTRR] &= ~BIT_DTRR; - break; - case 3: - m->mdmreg[REG_DTRHUP] |= BIT_DTRHUP; - m->mdmreg[REG_DTRR] |= BIT_DTRR; - break; - default: - PARSE_ERROR1 - } - break; - case 'E': - /* &E -Set EAZ/MSN */ - p[0]++; - isdn_tty_get_msnstr(m->msn, p); - break; - case 'F': - /* &F -Set Factory-Defaults */ - p[0]++; - if (info->msr & UART_MSR_DCD) - PARSE_ERROR1; - isdn_tty_reset_profile(m); - isdn_tty_modem_reset_regs(info, 1); - break; -#ifdef DUMMY_HAYES_AT - case 'K': - /* only for be compilant with common scripts */ - /* &K Flowcontrol - no function */ - p[0]++; - isdn_getnum(p); - break; -#endif - case 'L': - /* &L -Set Numbers to listen on */ - p[0]++; - i = 0; - while (*p[0] && (strchr("0123456789,-*[]?;", *p[0])) && - (i < ISDN_LMSNLEN - 1)) - m->lmsn[i++] = *p[0]++; - m->lmsn[i] = '\0'; - break; - case 'R': - /* &R - Set V.110 bitrate adaption */ - p[0]++; - i = isdn_getnum(p); - 
switch (i) { - case 0: - /* Switch off V.110, back to X.75 */ - m->mdmreg[REG_L2PROT] = ISDN_PROTO_L2_X75I; - m->mdmreg[REG_SI2] = 0; - info->xmit_size = m->mdmreg[REG_PSIZE] * 16; - break; - case 9600: - m->mdmreg[REG_L2PROT] = ISDN_PROTO_L2_V11096; - m->mdmreg[REG_SI2] = 197; - info->xmit_size = m->mdmreg[REG_PSIZE] * 16 / 10; - break; - case 19200: - m->mdmreg[REG_L2PROT] = ISDN_PROTO_L2_V11019; - m->mdmreg[REG_SI2] = 199; - info->xmit_size = m->mdmreg[REG_PSIZE] * 16 / 10; - break; - case 38400: - m->mdmreg[REG_L2PROT] = ISDN_PROTO_L2_V11038; - m->mdmreg[REG_SI2] = 198; /* no existing standard for this */ - info->xmit_size = m->mdmreg[REG_PSIZE] * 16 / 10; - break; - default: - PARSE_ERROR1; - } - /* Switch off T.70 */ - m->mdmreg[REG_T70] &= ~(BIT_T70 | BIT_T70_EXT); - /* Set Service 7 */ - m->mdmreg[REG_SI1] |= 4; - break; - case 'S': - /* &S - Set Windowsize */ - p[0]++; - i = isdn_getnum(p); - if ((i > 0) && (i < 9)) - m->mdmreg[REG_WSIZE] = i; - else - PARSE_ERROR1; - break; - case 'V': - /* &V - Show registers */ - p[0]++; - isdn_tty_at_cout("\r\n", info); - for (i = 0; i < ISDN_MODEM_NUMREG; i++) { - sprintf(rb, "S%02d=%03d%s", i, - m->mdmreg[i], ((i + 1) % 10) ? " " : "\r\n"); - isdn_tty_at_cout(rb, info); - } - sprintf(rb, "\r\nEAZ/MSN: %.50s\r\n", - strlen(m->msn) ? 
m->msn : "None"); - isdn_tty_at_cout(rb, info); - if (strlen(m->lmsn)) { - isdn_tty_at_cout("\r\nListen: ", info); - isdn_tty_at_cout(m->lmsn, info); - isdn_tty_at_cout("\r\n", info); - } - break; - case 'W': - /* &W - Write Profile */ - p[0]++; - switch (*p[0]) { - case '0': - p[0]++; - modem_write_profile(m); - break; - default: - PARSE_ERROR1; - } - break; - case 'X': - /* &X - Switch to BTX-Mode and T.70 */ - p[0]++; - switch (isdn_getnum(p)) { - case 0: - m->mdmreg[REG_T70] &= ~(BIT_T70 | BIT_T70_EXT); - info->xmit_size = m->mdmreg[REG_PSIZE] * 16; - break; - case 1: - m->mdmreg[REG_T70] |= BIT_T70; - m->mdmreg[REG_T70] &= ~BIT_T70_EXT; - m->mdmreg[REG_L2PROT] = ISDN_PROTO_L2_X75I; - info->xmit_size = 112; - m->mdmreg[REG_SI1] = 4; - m->mdmreg[REG_SI2] = 0; - break; - case 2: - m->mdmreg[REG_T70] |= (BIT_T70 | BIT_T70_EXT); - m->mdmreg[REG_L2PROT] = ISDN_PROTO_L2_X75I; - info->xmit_size = 112; - m->mdmreg[REG_SI1] = 4; - m->mdmreg[REG_SI2] = 0; - break; - default: - PARSE_ERROR1; - } - break; - default: - PARSE_ERROR1; - } - return 0; -} - -static int -isdn_tty_check_ats(int mreg, int mval, modem_info *info, atemu *m) -{ - /* Some plausibility checks */ - switch (mreg) { - case REG_L2PROT: - if (mval > ISDN_PROTO_L2_MAX) - return 1; - break; - case REG_PSIZE: - if ((mval * 16) > ISDN_SERIAL_XMIT_MAX) - return 1; -#ifdef CONFIG_ISDN_AUDIO - if ((m->mdmreg[REG_SI1] & 1) && (mval > VBUFX)) - return 1; -#endif - info->xmit_size = mval * 16; - switch (m->mdmreg[REG_L2PROT]) { - case ISDN_PROTO_L2_V11096: - case ISDN_PROTO_L2_V11019: - case ISDN_PROTO_L2_V11038: - info->xmit_size /= 10; - } - break; - case REG_SI1I: - case REG_PLAN: - case REG_SCREEN: - /* readonly registers */ - return 1; - } - return 0; -} - -/* - * Perform ATS command - */ -static int -isdn_tty_cmd_ATS(char **p, modem_info *info) -{ - atemu *m = &info->emu; - int bitpos; - int mreg; - int mval; - int bval; - - mreg = isdn_getnum(p); - if (mreg < 0 || mreg >= ISDN_MODEM_NUMREG) - PARSE_ERROR1; - 
switch (*p[0]) { - case '=': - p[0]++; - mval = isdn_getnum(p); - if (mval < 0 || mval > 255) - PARSE_ERROR1; - if (isdn_tty_check_ats(mreg, mval, info, m)) - PARSE_ERROR1; - m->mdmreg[mreg] = mval; - break; - case '.': - /* Set/Clear a single bit */ - p[0]++; - bitpos = isdn_getnum(p); - if ((bitpos < 0) || (bitpos > 7)) - PARSE_ERROR1; - switch (*p[0]) { - case '=': - p[0]++; - bval = isdn_getnum(p); - if (bval < 0 || bval > 1) - PARSE_ERROR1; - if (bval) - mval = m->mdmreg[mreg] | (1 << bitpos); - else - mval = m->mdmreg[mreg] & ~(1 << bitpos); - if (isdn_tty_check_ats(mreg, mval, info, m)) - PARSE_ERROR1; - m->mdmreg[mreg] = mval; - break; - case '?': - p[0]++; - isdn_tty_at_cout("\r\n", info); - isdn_tty_at_cout((m->mdmreg[mreg] & (1 << bitpos)) ? "1" : "0", - info); - break; - default: - PARSE_ERROR1; - } - break; - case '?': - p[0]++; - isdn_tty_show_profile(mreg, info); - break; - default: - PARSE_ERROR1; - break; - } - return 0; -} - -/* - * Perform ATA command - */ -static void -isdn_tty_cmd_ATA(modem_info *info) -{ - atemu *m = &info->emu; - isdn_ctrl cmd; - int l2; - - if (info->msr & UART_MSR_RI) { - /* Accept incoming call */ - info->last_dir = 0; - strcpy(info->last_num, dev->num[info->drv_index]); - m->mdmreg[REG_RINGCNT] = 0; - info->msr &= ~UART_MSR_RI; - l2 = m->mdmreg[REG_L2PROT]; -#ifdef CONFIG_ISDN_AUDIO - /* If more than one bit set in reg18, autoselect Layer2 */ - if ((m->mdmreg[REG_SI1] & m->mdmreg[REG_SI1I]) != m->mdmreg[REG_SI1]) { - if (m->mdmreg[REG_SI1I] == 1) { - if ((l2 != ISDN_PROTO_L2_MODEM) && (l2 != ISDN_PROTO_L2_FAX)) - l2 = ISDN_PROTO_L2_TRANS; - } else - l2 = ISDN_PROTO_L2_X75I; - } -#endif - cmd.driver = info->isdn_driver; - cmd.command = ISDN_CMD_SETL2; - cmd.arg = info->isdn_channel + (l2 << 8); - info->last_l2 = l2; - isdn_command(&cmd); - cmd.driver = info->isdn_driver; - cmd.command = ISDN_CMD_SETL3; - cmd.arg = info->isdn_channel + (m->mdmreg[REG_L3PROT] << 8); -#ifdef CONFIG_ISDN_TTY_FAX - if (l2 == ISDN_PROTO_L2_FAX) 
{ - cmd.parm.fax = info->fax; - info->fax->direction = ISDN_TTY_FAX_CONN_IN; - } -#endif - isdn_command(&cmd); - cmd.driver = info->isdn_driver; - cmd.arg = info->isdn_channel; - cmd.command = ISDN_CMD_ACCEPTD; - info->dialing = 16; - info->emu.carrierwait = 0; - isdn_command(&cmd); - isdn_timer_ctrl(ISDN_TIMER_CARRIER, 1); - } else - isdn_tty_modem_result(RESULT_NO_ANSWER, info); -} - -#ifdef CONFIG_ISDN_AUDIO -/* - * Parse AT+F.. commands - */ -static int -isdn_tty_cmd_PLUSF(char **p, modem_info *info) -{ - atemu *m = &info->emu; - char rs[20]; - - if (!strncmp(p[0], "CLASS", 5)) { - p[0] += 5; - switch (*p[0]) { - case '?': - p[0]++; - sprintf(rs, "\r\n%d", - (m->mdmreg[REG_SI1] & 1) ? 8 : 0); -#ifdef CONFIG_ISDN_TTY_FAX - if (TTY_IS_FCLASS2(info)) - sprintf(rs, "\r\n2"); - else if (TTY_IS_FCLASS1(info)) - sprintf(rs, "\r\n1"); -#endif - isdn_tty_at_cout(rs, info); - break; - case '=': - p[0]++; - switch (*p[0]) { - case '0': - p[0]++; - m->mdmreg[REG_L2PROT] = ISDN_PROTO_L2_X75I; - m->mdmreg[REG_L3PROT] = ISDN_PROTO_L3_TRANS; - m->mdmreg[REG_SI1] = 4; - info->xmit_size = - m->mdmreg[REG_PSIZE] * 16; - break; -#ifdef CONFIG_ISDN_TTY_FAX - case '1': - p[0]++; - if (!(dev->global_features & - ISDN_FEATURE_L3_FCLASS1)) - PARSE_ERROR1; - m->mdmreg[REG_SI1] = 1; - m->mdmreg[REG_L2PROT] = ISDN_PROTO_L2_FAX; - m->mdmreg[REG_L3PROT] = ISDN_PROTO_L3_FCLASS1; - info->xmit_size = - m->mdmreg[REG_PSIZE] * 16; - break; - case '2': - p[0]++; - if (!(dev->global_features & - ISDN_FEATURE_L3_FCLASS2)) - PARSE_ERROR1; - m->mdmreg[REG_SI1] = 1; - m->mdmreg[REG_L2PROT] = ISDN_PROTO_L2_FAX; - m->mdmreg[REG_L3PROT] = ISDN_PROTO_L3_FCLASS2; - info->xmit_size = - m->mdmreg[REG_PSIZE] * 16; - break; -#endif - case '8': - p[0]++; - /* L2 will change on dialout with si=1 */ - m->mdmreg[REG_L2PROT] = ISDN_PROTO_L2_X75I; - m->mdmreg[REG_L3PROT] = ISDN_PROTO_L3_TRANS; - m->mdmreg[REG_SI1] = 5; - info->xmit_size = VBUF; - break; - case '?': - p[0]++; - strcpy(rs, "\r\n0,"); -#ifdef 
CONFIG_ISDN_TTY_FAX - if (dev->global_features & - ISDN_FEATURE_L3_FCLASS1) - strcat(rs, "1,"); - if (dev->global_features & - ISDN_FEATURE_L3_FCLASS2) - strcat(rs, "2,"); -#endif - strcat(rs, "8"); - isdn_tty_at_cout(rs, info); - break; - default: - PARSE_ERROR1; - } - break; - default: - PARSE_ERROR1; - } - return 0; - } -#ifdef CONFIG_ISDN_TTY_FAX - return (isdn_tty_cmd_PLUSF_FAX(p, info)); -#else - PARSE_ERROR1; -#endif -} - -/* - * Parse AT+V.. commands - */ -static int -isdn_tty_cmd_PLUSV(char **p, modem_info *info) -{ - atemu *m = &info->emu; - isdn_ctrl cmd; - static char *vcmd[] = - {"NH", "IP", "LS", "RX", "SD", "SM", "TX", "DD", NULL}; - int i; - int par1; - int par2; - char rs[20]; - - i = 0; - while (vcmd[i]) { - if (!strncmp(vcmd[i], p[0], 2)) { - p[0] += 2; - break; - } - i++; - } - switch (i) { - case 0: - /* AT+VNH - Auto hangup feature */ - switch (*p[0]) { - case '?': - p[0]++; - isdn_tty_at_cout("\r\n1", info); - break; - case '=': - p[0]++; - switch (*p[0]) { - case '1': - p[0]++; - break; - case '?': - p[0]++; - isdn_tty_at_cout("\r\n1", info); - break; - default: - PARSE_ERROR1; - } - break; - default: - PARSE_ERROR1; - } - break; - case 1: - /* AT+VIP - Reset all voice parameters */ - isdn_tty_modem_reset_vpar(m); - break; - case 2: - /* AT+VLS - Select device, accept incoming call */ - switch (*p[0]) { - case '?': - p[0]++; - sprintf(rs, "\r\n%d", m->vpar[0]); - isdn_tty_at_cout(rs, info); - break; - case '=': - p[0]++; - switch (*p[0]) { - case '0': - p[0]++; - m->vpar[0] = 0; - break; - case '2': - p[0]++; - m->vpar[0] = 2; - break; - case '?': - p[0]++; - isdn_tty_at_cout("\r\n0,2", info); - break; - default: - PARSE_ERROR1; - } - break; - default: - PARSE_ERROR1; - } - break; - case 3: - /* AT+VRX - Start recording */ - if (!m->vpar[0]) - PARSE_ERROR1; - if (info->online != 1) { - isdn_tty_modem_result(RESULT_NO_ANSWER, info); - return 1; - } - info->dtmf_state = isdn_audio_dtmf_init(info->dtmf_state); - if (!info->dtmf_state) { - 
printk(KERN_WARNING "isdn_tty: Couldn't malloc dtmf state\n"); - PARSE_ERROR1; - } - info->silence_state = isdn_audio_silence_init(info->silence_state); - if (!info->silence_state) { - printk(KERN_WARNING "isdn_tty: Couldn't malloc silence state\n"); - PARSE_ERROR1; - } - if (m->vpar[3] < 5) { - info->adpcmr = isdn_audio_adpcm_init(info->adpcmr, m->vpar[3]); - if (!info->adpcmr) { - printk(KERN_WARNING "isdn_tty: Couldn't malloc adpcm state\n"); - PARSE_ERROR1; - } - } -#ifdef ISDN_DEBUG_AT - printk(KERN_DEBUG "AT: +VRX\n"); -#endif - info->vonline |= 1; - isdn_tty_modem_result(RESULT_CONNECT, info); - return 0; - break; - case 4: - /* AT+VSD - Silence detection */ - switch (*p[0]) { - case '?': - p[0]++; - sprintf(rs, "\r\n<%d>,<%d>", - m->vpar[1], - m->vpar[2]); - isdn_tty_at_cout(rs, info); - break; - case '=': - p[0]++; - if ((*p[0] >= '0') && (*p[0] <= '9')) { - par1 = isdn_getnum(p); - if ((par1 < 0) || (par1 > 31)) - PARSE_ERROR1; - if (*p[0] != ',') - PARSE_ERROR1; - p[0]++; - par2 = isdn_getnum(p); - if ((par2 < 0) || (par2 > 255)) - PARSE_ERROR1; - m->vpar[1] = par1; - m->vpar[2] = par2; - break; - } else - if (*p[0] == '?') { - p[0]++; - isdn_tty_at_cout("\r\n<0-31>,<0-255>", - info); - break; - } else - PARSE_ERROR1; - break; - default: - PARSE_ERROR1; - } - break; - case 5: - /* AT+VSM - Select compression */ - switch (*p[0]) { - case '?': - p[0]++; - sprintf(rs, "\r\n<%d>,<%d><8000>", - m->vpar[3], - m->vpar[1]); - isdn_tty_at_cout(rs, info); - break; - case '=': - p[0]++; - switch (*p[0]) { - case '2': - case '3': - case '4': - case '5': - case '6': - par1 = isdn_getnum(p); - if ((par1 < 2) || (par1 > 6)) - PARSE_ERROR1; - m->vpar[3] = par1; - break; - case '?': - p[0]++; - isdn_tty_at_cout("\r\n2;ADPCM;2;0;(8000)\r\n", - info); - isdn_tty_at_cout("3;ADPCM;3;0;(8000)\r\n", - info); - isdn_tty_at_cout("4;ADPCM;4;0;(8000)\r\n", - info); - isdn_tty_at_cout("5;ALAW;8;0;(8000)\r\n", - info); - isdn_tty_at_cout("6;ULAW;8;0;(8000)\r\n", - info); - break; - 
default: - PARSE_ERROR1; - } - break; - default: - PARSE_ERROR1; - } - break; - case 6: - /* AT+VTX - Start sending */ - if (!m->vpar[0]) - PARSE_ERROR1; - if (info->online != 1) { - isdn_tty_modem_result(RESULT_NO_ANSWER, info); - return 1; - } - info->dtmf_state = isdn_audio_dtmf_init(info->dtmf_state); - if (!info->dtmf_state) { - printk(KERN_WARNING "isdn_tty: Couldn't malloc dtmf state\n"); - PARSE_ERROR1; - } - if (m->vpar[3] < 5) { - info->adpcms = isdn_audio_adpcm_init(info->adpcms, m->vpar[3]); - if (!info->adpcms) { - printk(KERN_WARNING "isdn_tty: Couldn't malloc adpcm state\n"); - PARSE_ERROR1; - } - } -#ifdef ISDN_DEBUG_AT - printk(KERN_DEBUG "AT: +VTX\n"); -#endif - m->lastDLE = 0; - info->vonline |= 2; - isdn_tty_modem_result(RESULT_CONNECT, info); - return 0; - break; - case 7: - /* AT+VDD - DTMF detection */ - switch (*p[0]) { - case '?': - p[0]++; - sprintf(rs, "\r\n<%d>,<%d>", - m->vpar[4], - m->vpar[5]); - isdn_tty_at_cout(rs, info); - break; - case '=': - p[0]++; - if ((*p[0] >= '0') && (*p[0] <= '9')) { - if (info->online != 1) - PARSE_ERROR1; - par1 = isdn_getnum(p); - if ((par1 < 0) || (par1 > 15)) - PARSE_ERROR1; - if (*p[0] != ',') - PARSE_ERROR1; - p[0]++; - par2 = isdn_getnum(p); - if ((par2 < 0) || (par2 > 255)) - PARSE_ERROR1; - m->vpar[4] = par1; - m->vpar[5] = par2; - cmd.driver = info->isdn_driver; - cmd.command = ISDN_CMD_AUDIO; - cmd.arg = info->isdn_channel + (ISDN_AUDIO_SETDD << 8); - cmd.parm.num[0] = par1; - cmd.parm.num[1] = par2; - isdn_command(&cmd); - break; - } else - if (*p[0] == '?') { - p[0]++; - isdn_tty_at_cout("\r\n<0-15>,<0-255>", - info); - break; - } else - PARSE_ERROR1; - break; - default: - PARSE_ERROR1; - } - break; - default: - PARSE_ERROR1; - } - return 0; -} -#endif /* CONFIG_ISDN_AUDIO */ - -/* - * Parse and perform an AT-command-line. 
- */ -static void -isdn_tty_parse_at(modem_info *info) -{ - atemu *m = &info->emu; - char *p; - char ds[ISDN_MSNLEN]; - -#ifdef ISDN_DEBUG_AT - printk(KERN_DEBUG "AT: '%s'\n", m->mdmcmd); -#endif - for (p = &m->mdmcmd[2]; *p;) { - switch (*p) { - case ' ': - p++; - break; - case 'A': - /* A - Accept incoming call */ - p++; - isdn_tty_cmd_ATA(info); - return; - case 'D': - /* D - Dial */ - if (info->msr & UART_MSR_DCD) - PARSE_ERROR; - if (info->msr & UART_MSR_RI) { - isdn_tty_modem_result(RESULT_NO_CARRIER, info); - return; - } - isdn_tty_getdial(++p, ds, sizeof ds); - p += strlen(p); - if (!strlen(m->msn)) - isdn_tty_modem_result(RESULT_NO_MSN_EAZ, info); - else if (strlen(ds)) - isdn_tty_dial(ds, info, m); - else - PARSE_ERROR; - return; - case 'E': - /* E - Turn Echo on/off */ - p++; - switch (isdn_getnum(&p)) { - case 0: - m->mdmreg[REG_ECHO] &= ~BIT_ECHO; - break; - case 1: - m->mdmreg[REG_ECHO] |= BIT_ECHO; - break; - default: - PARSE_ERROR; - } - break; - case 'H': - /* H - On/Off-hook */ - p++; - switch (*p) { - case '0': - p++; - isdn_tty_on_hook(info); - break; - case '1': - p++; - isdn_tty_off_hook(); - break; - default: - isdn_tty_on_hook(info); - break; - } - break; - case 'I': - /* I - Information */ - p++; - isdn_tty_at_cout("\r\nLinux ISDN", info); - switch (*p) { - case '0': - case '1': - p++; - break; - case '2': - p++; - isdn_tty_report(info); - break; - case '3': - p++; - snprintf(ds, sizeof(ds), "\r\n%d", info->emu.charge); - isdn_tty_at_cout(ds, info); - break; - default:; - } - break; -#ifdef DUMMY_HAYES_AT - case 'L': - case 'M': - /* only for be compilant with common scripts */ - /* no function */ - p++; - isdn_getnum(&p); - break; -#endif - case 'O': - /* O - Go online */ - p++; - if (info->msr & UART_MSR_DCD) - /* if B-Channel is up */ - isdn_tty_modem_result((m->mdmreg[REG_L2PROT] == ISDN_PROTO_L2_MODEM) ? 
RESULT_CONNECT : RESULT_CONNECT64000, info); - else - isdn_tty_modem_result(RESULT_NO_CARRIER, info); - return; - case 'Q': - /* Q - Turn Emulator messages on/off */ - p++; - switch (isdn_getnum(&p)) { - case 0: - m->mdmreg[REG_RESP] |= BIT_RESP; - break; - case 1: - m->mdmreg[REG_RESP] &= ~BIT_RESP; - break; - default: - PARSE_ERROR; - } - break; - case 'S': - /* S - Set/Get Register */ - p++; - if (isdn_tty_cmd_ATS(&p, info)) - return; - break; - case 'V': - /* V - Numeric or ASCII Emulator-messages */ - p++; - switch (isdn_getnum(&p)) { - case 0: - m->mdmreg[REG_RESP] |= BIT_RESPNUM; - break; - case 1: - m->mdmreg[REG_RESP] &= ~BIT_RESPNUM; - break; - default: - PARSE_ERROR; - } - break; - case 'Z': - /* Z - Load Registers from Profile */ - p++; - if (info->msr & UART_MSR_DCD) { - info->online = 0; - isdn_tty_on_hook(info); - } - isdn_tty_modem_reset_regs(info, 1); - break; - case '+': - p++; - switch (*p) { -#ifdef CONFIG_ISDN_AUDIO - case 'F': - p++; - if (isdn_tty_cmd_PLUSF(&p, info)) - return; - break; - case 'V': - if ((!(m->mdmreg[REG_SI1] & 1)) || - (m->mdmreg[REG_L2PROT] == ISDN_PROTO_L2_MODEM)) - PARSE_ERROR; - p++; - if (isdn_tty_cmd_PLUSV(&p, info)) - return; - break; -#endif /* CONFIG_ISDN_AUDIO */ - case 'S': /* SUSPEND */ - p++; - isdn_tty_get_msnstr(ds, &p); - isdn_tty_suspend(ds, info, m); - break; - case 'R': /* RESUME */ - p++; - isdn_tty_get_msnstr(ds, &p); - isdn_tty_resume(ds, info, m); - break; - case 'M': /* MESSAGE */ - p++; - isdn_tty_send_msg(info, m, p); - break; - default: - PARSE_ERROR; - } - break; - case '&': - p++; - if (isdn_tty_cmd_ATand(&p, info)) - return; - break; - default: - PARSE_ERROR; - } - } -#ifdef CONFIG_ISDN_AUDIO - if (!info->vonline) -#endif - isdn_tty_modem_result(RESULT_OK, info); -} - -/* Need own toupper() because standard-toupper is not available - * within modules. - */ -#define my_toupper(c) (((c >= 'a') && (c <= 'z')) ? 
(c & 0xdf) : c) - -/* - * Perform line-editing of AT-commands - * - * Parameters: - * p inputbuffer - * count length of buffer - * channel index to line (minor-device) - */ -static int -isdn_tty_edit_at(const char *p, int count, modem_info *info) -{ - atemu *m = &info->emu; - int total = 0; - u_char c; - char eb[2]; - int cnt; - - for (cnt = count; cnt > 0; p++, cnt--) { - c = *p; - total++; - if (c == m->mdmreg[REG_CR] || c == m->mdmreg[REG_LF]) { - /* Separator (CR or LF) */ - m->mdmcmd[m->mdmcmdl] = 0; - if (m->mdmreg[REG_ECHO] & BIT_ECHO) { - eb[0] = c; - eb[1] = 0; - isdn_tty_at_cout(eb, info); - } - if ((m->mdmcmdl >= 2) && (!(strncmp(m->mdmcmd, "AT", 2)))) - isdn_tty_parse_at(info); - m->mdmcmdl = 0; - continue; - } - if (c == m->mdmreg[REG_BS] && m->mdmreg[REG_BS] < 128) { - /* Backspace-Function */ - if ((m->mdmcmdl > 2) || (!m->mdmcmdl)) { - if (m->mdmcmdl) - m->mdmcmdl--; - if (m->mdmreg[REG_ECHO] & BIT_ECHO) - isdn_tty_at_cout("\b", info); - } - continue; - } - if (cmdchar(c)) { - if (m->mdmreg[REG_ECHO] & BIT_ECHO) { - eb[0] = c; - eb[1] = 0; - isdn_tty_at_cout(eb, info); - } - if (m->mdmcmdl < 255) { - c = my_toupper(c); - switch (m->mdmcmdl) { - case 1: - if (c == 'T') { - m->mdmcmd[m->mdmcmdl] = c; - m->mdmcmd[++m->mdmcmdl] = 0; - break; - } else - m->mdmcmdl = 0; - /* Fall through - check for 'A' */ - case 0: - if (c == 'A') { - m->mdmcmd[m->mdmcmdl] = c; - m->mdmcmd[++m->mdmcmdl] = 0; - } - break; - default: - m->mdmcmd[m->mdmcmdl] = c; - m->mdmcmd[++m->mdmcmdl] = 0; - } - } - } - } - return total; -} - -/* - * Switch all modem-channels who are online and got a valid - * escape-sequence 1.5 seconds ago, to command-mode. 
- * This function is called every second via timer-interrupt from within - * timer-dispatcher isdn_timer_function() - */ -void -isdn_tty_modem_escape(void) -{ - int ton = 0; - int i; - int midx; - - for (i = 0; i < ISDN_MAX_CHANNELS; i++) - if (USG_MODEM(dev->usage[i]) && (midx = dev->m_idx[i]) >= 0) { - modem_info *info = &dev->mdm.info[midx]; - if (info->online) { - ton = 1; - if ((info->emu.pluscount == 3) && - time_after(jiffies, - info->emu.lastplus + PLUSWAIT2)) { - info->emu.pluscount = 0; - info->online = 0; - isdn_tty_modem_result(RESULT_OK, info); - } - } - } - isdn_timer_ctrl(ISDN_TIMER_MODEMPLUS, ton); -} - -/* - * Put a RING-message to all modem-channels who have the RI-bit set. - * This function is called every second via timer-interrupt from within - * timer-dispatcher isdn_timer_function() - */ -void -isdn_tty_modem_ring(void) -{ - int ton = 0; - int i; - - for (i = 0; i < ISDN_MAX_CHANNELS; i++) { - modem_info *info = &dev->mdm.info[i]; - if (info->msr & UART_MSR_RI) { - ton = 1; - isdn_tty_modem_result(RESULT_RING, info); - } - } - isdn_timer_ctrl(ISDN_TIMER_MODEMRING, ton); -} - -/* - * For all online tty's, try sending data to - * the lower levels. - */ -void -isdn_tty_modem_xmit(void) -{ - int ton = 1; - int i; - - for (i = 0; i < ISDN_MAX_CHANNELS; i++) { - modem_info *info = &dev->mdm.info[i]; - if (info->online) { - ton = 1; - isdn_tty_senddown(info); - isdn_tty_tint(info); - } - } - isdn_timer_ctrl(ISDN_TIMER_MODEMXMIT, ton); -} - -/* - * Check all channels if we have a 'no carrier' timeout. - * Timeout value is set by Register S7. 
- */ -void -isdn_tty_carrier_timeout(void) -{ - int ton = 0; - int i; - - for (i = 0; i < ISDN_MAX_CHANNELS; i++) { - modem_info *info = &dev->mdm.info[i]; - if (!info->dialing) - continue; - if (info->emu.carrierwait++ > info->emu.mdmreg[REG_WAITC]) { - info->dialing = 0; - isdn_tty_modem_result(RESULT_NO_CARRIER, info); - isdn_tty_modem_hup(info, 1); - } else - ton = 1; - } - isdn_timer_ctrl(ISDN_TIMER_CARRIER, ton); -} diff --git a/drivers/isdn/i4l/isdn_tty.h b/drivers/isdn/i4l/isdn_tty.h deleted file mode 100644 index a6f801d2263b..000000000000 --- a/drivers/isdn/i4l/isdn_tty.h +++ /dev/null @@ -1,120 +0,0 @@ -/* $Id: isdn_tty.h,v 1.1.2.2 2004/01/12 22:37:19 keil Exp $ - * - * header for Linux ISDN subsystem, tty related functions (linklevel). - * - * Copyright 1994-1999 by Fritz Elfert (fritz@isdn4linux.de) - * Copyright 1995,96 by Thinking Objects Software GmbH Wuerzburg - * - * This software may be used and distributed according to the terms - * of the GNU General Public License, incorporated herein by reference. 
- * - */ - - -#define DLE 0x10 -#define ETX 0x03 -#define DC4 0x14 - - -/* - * Definition of some special Registers of AT-Emulator - */ -#define REG_RINGATA 0 -#define REG_RINGCNT 1 /* ring counter register */ -#define REG_ESC 2 -#define REG_CR 3 -#define REG_LF 4 -#define REG_BS 5 - -#define REG_WAITC 7 - -#define REG_RESP 12 /* show response messages register */ -#define BIT_RESP 1 /* show response messages bit */ -#define REG_RESPNUM 12 /* show numeric responses register */ -#define BIT_RESPNUM 2 /* show numeric responses bit */ -#define REG_ECHO 12 -#define BIT_ECHO 4 -#define REG_DCD 12 -#define BIT_DCD 8 -#define REG_CTS 12 -#define BIT_CTS 16 -#define REG_DTRR 12 -#define BIT_DTRR 32 -#define REG_DSR 12 -#define BIT_DSR 64 -#define REG_CPPP 12 -#define BIT_CPPP 128 - -#define REG_DXMT 13 -#define BIT_DXMT 1 -#define REG_T70 13 -#define BIT_T70 2 -#define BIT_T70_EXT 32 -#define REG_DTRHUP 13 -#define BIT_DTRHUP 4 -#define REG_RESPXT 13 -#define BIT_RESPXT 8 -#define REG_CIDONCE 13 -#define BIT_CIDONCE 16 -#define REG_RUNG 13 /* show RUNG message register */ -#define BIT_RUNG 64 /* show RUNG message bit */ -#define REG_DISPLAY 13 -#define BIT_DISPLAY 128 - -#define REG_L2PROT 14 -#define REG_L3PROT 15 -#define REG_PSIZE 16 -#define REG_WSIZE 17 -#define REG_SI1 18 -#define REG_SI2 19 -#define REG_SI1I 20 -#define REG_PLAN 21 -#define REG_SCREEN 22 - -#define REG_CPN 23 -#define BIT_CPN 1 -#define REG_CPNFCON 23 -#define BIT_CPNFCON 2 -#define REG_CDN 23 -#define BIT_CDN 4 - -/* defines for result codes */ -#define RESULT_OK 0 -#define RESULT_CONNECT 1 -#define RESULT_RING 2 -#define RESULT_NO_CARRIER 3 -#define RESULT_ERROR 4 -#define RESULT_CONNECT64000 5 -#define RESULT_NO_DIALTONE 6 -#define RESULT_BUSY 7 -#define RESULT_NO_ANSWER 8 -#define RESULT_RINGING 9 -#define RESULT_NO_MSN_EAZ 10 -#define RESULT_VCON 11 -#define RESULT_RUNG 12 - -#define TTY_IS_FCLASS1(info) \ - ((info->emu.mdmreg[REG_L2PROT] == ISDN_PROTO_L2_FAX) && \ - 
(info->emu.mdmreg[REG_L3PROT] == ISDN_PROTO_L3_FCLASS1)) -#define TTY_IS_FCLASS2(info) \ - ((info->emu.mdmreg[REG_L2PROT] == ISDN_PROTO_L2_FAX) && \ - (info->emu.mdmreg[REG_L3PROT] == ISDN_PROTO_L3_FCLASS2)) - -extern void isdn_tty_modem_escape(void); -extern void isdn_tty_modem_ring(void); -extern void isdn_tty_carrier_timeout(void); -extern void isdn_tty_modem_xmit(void); -extern int isdn_tty_modem_init(void); -extern void isdn_tty_exit(void); -extern void isdn_tty_readmodem(void); -extern int isdn_tty_find_icall(int, int, setup_parm *); -extern int isdn_tty_stat_callback(int, isdn_ctrl *); -extern int isdn_tty_rcv_skb(int, int, int, struct sk_buff *); -extern int isdn_tty_capi_facility(capi_msg *cm); -extern void isdn_tty_at_cout(char *, modem_info *); -extern void isdn_tty_modem_hup(modem_info *, int); -#ifdef CONFIG_ISDN_TTY_FAX -extern int isdn_tty_cmd_PLUSF_FAX(char **, modem_info *); -extern int isdn_tty_fax_command(modem_info *, isdn_ctrl *); -extern void isdn_tty_fax_bitorder(modem_info *, struct sk_buff *); -#endif diff --git a/drivers/isdn/i4l/isdn_ttyfax.c b/drivers/isdn/i4l/isdn_ttyfax.c deleted file mode 100644 index 47aae4916730..000000000000 --- a/drivers/isdn/i4l/isdn_ttyfax.c +++ /dev/null @@ -1,1123 +0,0 @@ -/* $Id: isdn_ttyfax.c,v 1.1.2.2 2004/01/12 22:37:19 keil Exp $ - * - * Linux ISDN subsystem, tty_fax AT-command emulator (linklevel). - * - * Copyright 1999 by Armin Schindler (mac@melware.de) - * Copyright 1999 by Ralf Spachmann (mel@melware.de) - * Copyright 1999 by Cytronics & Melware - * - * This software may be used and distributed according to the terms - * of the GNU General Public License, incorporated herein by reference. 
- * - */ - -#undef ISDN_TTY_FAX_STAT_DEBUG -#undef ISDN_TTY_FAX_CMD_DEBUG - -#include -#include "isdn_common.h" -#include "isdn_tty.h" -#include "isdn_ttyfax.h" - - -static char *isdn_tty_fax_revision = "$Revision: 1.1.2.2 $"; - -#define PARSE_ERROR1 { isdn_tty_fax_modem_result(1, info); return 1; } - -static char * -isdn_getrev(const char *revision) -{ - char *rev; - char *p; - - if ((p = strchr(revision, ':'))) { - rev = p + 2; - p = strchr(rev, '$'); - *--p = 0; - } else - rev = "???"; - return rev; -} - -/* - * Fax Class 2 Modem results - * - */ - -static void -isdn_tty_fax_modem_result(int code, modem_info *info) -{ - atemu *m = &info->emu; - T30_s *f = info->fax; - char rs[50]; - char rss[50]; - char *rp; - int i; - static char *msg[] = - {"OK", "ERROR", "+FCON", "+FCSI:", "+FDIS:", - "+FHNG:", "+FDCS:", "CONNECT", "+FTSI:", - "+FCFR", "+FPTS:", "+FET:"}; - - - isdn_tty_at_cout("\r\n", info); - isdn_tty_at_cout(msg[code], info); - -#ifdef ISDN_TTY_FAX_CMD_DEBUG - printk(KERN_DEBUG "isdn_tty: Fax send %s on ttyI%d\n", - msg[code], info->line); -#endif - switch (code) { - case 0: /* OK */ - break; - case 1: /* ERROR */ - break; - case 2: /* +FCON */ - /* Append CPN, if enabled */ - if ((m->mdmreg[REG_CPNFCON] & BIT_CPNFCON) && - (!(dev->usage[info->isdn_channel] & ISDN_USAGE_OUTGOING))) { - sprintf(rs, "/%s", m->cpn); - isdn_tty_at_cout(rs, info); - } - info->online = 1; - f->fet = 0; - if (f->phase == ISDN_FAX_PHASE_A) - f->phase = ISDN_FAX_PHASE_B; - break; - case 3: /* +FCSI */ - case 8: /* +FTSI */ - sprintf(rs, "\"%s\"", f->r_id); - isdn_tty_at_cout(rs, info); - break; - case 4: /* +FDIS */ - rs[0] = 0; - rp = &f->r_resolution; - for (i = 0; i < 8; i++) { - sprintf(rss, "%c%s", rp[i] + 48, - (i < 7) ? 
"," : ""); - strcat(rs, rss); - } - isdn_tty_at_cout(rs, info); -#ifdef ISDN_TTY_FAX_CMD_DEBUG - printk(KERN_DEBUG "isdn_tty: Fax DIS=%s on ttyI%d\n", - rs, info->line); -#endif - break; - case 5: /* +FHNG */ - sprintf(rs, "%d", f->code); - isdn_tty_at_cout(rs, info); - info->faxonline = 0; - break; - case 6: /* +FDCS */ - rs[0] = 0; - rp = &f->r_resolution; - for (i = 0; i < 8; i++) { - sprintf(rss, "%c%s", rp[i] + 48, - (i < 7) ? "," : ""); - strcat(rs, rss); - } - isdn_tty_at_cout(rs, info); -#ifdef ISDN_TTY_FAX_CMD_DEBUG - printk(KERN_DEBUG "isdn_tty: Fax DCS=%s on ttyI%d\n", - rs, info->line); -#endif - break; - case 7: /* CONNECT */ - info->faxonline |= 2; - break; - case 9: /* FCFR */ - break; - case 10: /* FPTS */ - isdn_tty_at_cout("1", info); - break; - case 11: /* FET */ - sprintf(rs, "%d", f->fet); - isdn_tty_at_cout(rs, info); - break; - } - - isdn_tty_at_cout("\r\n", info); - - switch (code) { - case 7: /* CONNECT */ - info->online = 2; - if (info->faxonline & 1) { - sprintf(rs, "%c", XON); - isdn_tty_at_cout(rs, info); - } - break; - } -} - -static int -isdn_tty_fax_command1(modem_info *info, isdn_ctrl *c) -{ - static char *msg[] = - {"OK", "CONNECT", "NO CARRIER", "ERROR", "FCERROR"}; - -#ifdef ISDN_TTY_FAX_CMD_DEBUG - printk(KERN_DEBUG "isdn_tty: FCLASS1 cmd(%d)\n", c->parm.aux.cmd); -#endif - if (c->parm.aux.cmd < ISDN_FAX_CLASS1_QUERY) { - if (info->online) - info->online = 1; - isdn_tty_at_cout("\r\n", info); - isdn_tty_at_cout(msg[c->parm.aux.cmd], info); - isdn_tty_at_cout("\r\n", info); - } - switch (c->parm.aux.cmd) { - case ISDN_FAX_CLASS1_CONNECT: - info->online = 2; - break; - case ISDN_FAX_CLASS1_OK: - case ISDN_FAX_CLASS1_FCERROR: - case ISDN_FAX_CLASS1_ERROR: - case ISDN_FAX_CLASS1_NOCARR: - break; - case ISDN_FAX_CLASS1_QUERY: - isdn_tty_at_cout("\r\n", info); - if (!c->parm.aux.para[0]) { - isdn_tty_at_cout(msg[ISDN_FAX_CLASS1_ERROR], info); - isdn_tty_at_cout("\r\n", info); - } else { - isdn_tty_at_cout(c->parm.aux.para, info); - 
isdn_tty_at_cout("\r\nOK\r\n", info); - } - break; - } - return (0); -} - -int -isdn_tty_fax_command(modem_info *info, isdn_ctrl *c) -{ - T30_s *f = info->fax; - char rs[10]; - - if (TTY_IS_FCLASS1(info)) - return (isdn_tty_fax_command1(info, c)); - -#ifdef ISDN_TTY_FAX_CMD_DEBUG - printk(KERN_DEBUG "isdn_tty: Fax cmd %d on ttyI%d\n", - f->r_code, info->line); -#endif - switch (f->r_code) { - case ISDN_TTY_FAX_FCON: - info->faxonline = 1; - isdn_tty_fax_modem_result(2, info); /* +FCON */ - return (0); - case ISDN_TTY_FAX_FCON_I: - info->faxonline = 16; - isdn_tty_fax_modem_result(2, info); /* +FCON */ - return (0); - case ISDN_TTY_FAX_RID: - if (info->faxonline & 1) - isdn_tty_fax_modem_result(3, info); /* +FCSI */ - if (info->faxonline & 16) - isdn_tty_fax_modem_result(8, info); /* +FTSI */ - return (0); - case ISDN_TTY_FAX_DIS: - isdn_tty_fax_modem_result(4, info); /* +FDIS */ - return (0); - case ISDN_TTY_FAX_HNG: - if (f->phase == ISDN_FAX_PHASE_C) { - if (f->direction == ISDN_TTY_FAX_CONN_IN) { - sprintf(rs, "%c%c", DLE, ETX); - isdn_tty_at_cout(rs, info); - } else { - sprintf(rs, "%c", 0x18); - isdn_tty_at_cout(rs, info); - } - info->faxonline &= ~2; /* leave data mode */ - info->online = 1; - } - f->phase = ISDN_FAX_PHASE_E; - isdn_tty_fax_modem_result(5, info); /* +FHNG */ - isdn_tty_fax_modem_result(0, info); /* OK */ - return (0); - case ISDN_TTY_FAX_DCS: - isdn_tty_fax_modem_result(6, info); /* +FDCS */ - isdn_tty_fax_modem_result(7, info); /* CONNECT */ - f->phase = ISDN_FAX_PHASE_C; - return (0); - case ISDN_TTY_FAX_TRAIN_OK: - isdn_tty_fax_modem_result(6, info); /* +FDCS */ - isdn_tty_fax_modem_result(0, info); /* OK */ - return (0); - case ISDN_TTY_FAX_SENT: - isdn_tty_fax_modem_result(0, info); /* OK */ - return (0); - case ISDN_TTY_FAX_CFR: - isdn_tty_fax_modem_result(9, info); /* +FCFR */ - return (0); - case ISDN_TTY_FAX_ET: - sprintf(rs, "%c%c", DLE, ETX); - isdn_tty_at_cout(rs, info); - isdn_tty_fax_modem_result(10, info); /* +FPTS */ - 
isdn_tty_fax_modem_result(11, info); /* +FET */ - isdn_tty_fax_modem_result(0, info); /* OK */ - info->faxonline &= ~2; /* leave data mode */ - info->online = 1; - f->phase = ISDN_FAX_PHASE_D; - return (0); - case ISDN_TTY_FAX_PTS: - isdn_tty_fax_modem_result(10, info); /* +FPTS */ - if (f->direction == ISDN_TTY_FAX_CONN_OUT) { - if (f->fet == 1) - f->phase = ISDN_FAX_PHASE_B; - if (f->fet == 0) - isdn_tty_fax_modem_result(0, info); /* OK */ - } - return (0); - case ISDN_TTY_FAX_EOP: - info->faxonline &= ~2; /* leave data mode */ - info->online = 1; - f->phase = ISDN_FAX_PHASE_D; - return (0); - - } - return (-1); -} - - -void -isdn_tty_fax_bitorder(modem_info *info, struct sk_buff *skb) -{ - __u8 LeftMask; - __u8 RightMask; - __u8 fBit; - __u8 Data; - int i; - - if (!info->fax->bor) { - for (i = 0; i < skb->len; i++) { - Data = skb->data[i]; - for ( - LeftMask = 0x80, RightMask = 0x01; - LeftMask > RightMask; - LeftMask >>= 1, RightMask <<= 1 - ) { - fBit = (Data & LeftMask); - if (Data & RightMask) - Data |= LeftMask; - else - Data &= ~LeftMask; - if (fBit) - Data |= RightMask; - else - Data &= ~RightMask; - - } - skb->data[i] = Data; - } - } -} - -/* - * Parse AT+F.. 
FAX class 1 commands - */ - -static int -isdn_tty_cmd_FCLASS1(char **p, modem_info *info) -{ - static char *cmd[] = - {"AE", "TS", "RS", "TM", "RM", "TH", "RH"}; - isdn_ctrl c; - int par, i; - u_long flags; - - for (c.parm.aux.cmd = 0; c.parm.aux.cmd < 7; c.parm.aux.cmd++) - if (!strncmp(p[0], cmd[c.parm.aux.cmd], 2)) - break; - -#ifdef ISDN_TTY_FAX_CMD_DEBUG - printk(KERN_DEBUG "isdn_tty_cmd_FCLASS1 (%s,%d)\n", p[0], c.parm.aux.cmd); -#endif - if (c.parm.aux.cmd == 7) - PARSE_ERROR1; - - p[0] += 2; - switch (*p[0]) { - case '?': - p[0]++; - c.parm.aux.subcmd = AT_QUERY; - break; - case '=': - p[0]++; - if (*p[0] == '?') { - p[0]++; - c.parm.aux.subcmd = AT_EQ_QUERY; - } else { - par = isdn_getnum(p); - if ((par < 0) || (par > 255)) - PARSE_ERROR1; - c.parm.aux.subcmd = AT_EQ_VALUE; - c.parm.aux.para[0] = par; - } - break; - case 0: - c.parm.aux.subcmd = AT_COMMAND; - break; - default: - PARSE_ERROR1; - } - c.command = ISDN_CMD_FAXCMD; -#ifdef ISDN_TTY_FAX_CMD_DEBUG - printk(KERN_DEBUG "isdn_tty_cmd_FCLASS1 %d/%d/%d)\n", - c.parm.aux.cmd, c.parm.aux.subcmd, c.parm.aux.para[0]); -#endif - if (info->isdn_driver < 0) { - if ((c.parm.aux.subcmd == AT_EQ_VALUE) || - (c.parm.aux.subcmd == AT_COMMAND)) { - PARSE_ERROR1; - } - spin_lock_irqsave(&dev->lock, flags); - /* get a temporary connection to the first free fax driver */ - i = isdn_get_free_channel(ISDN_USAGE_FAX, ISDN_PROTO_L2_FAX, - ISDN_PROTO_L3_FCLASS1, -1, -1, "00"); - if (i < 0) { - spin_unlock_irqrestore(&dev->lock, flags); - PARSE_ERROR1; - } - info->isdn_driver = dev->drvmap[i]; - info->isdn_channel = dev->chanmap[i]; - info->drv_index = i; - dev->m_idx[i] = info->line; - spin_unlock_irqrestore(&dev->lock, flags); - c.driver = info->isdn_driver; - c.arg = info->isdn_channel; - isdn_command(&c); - spin_lock_irqsave(&dev->lock, flags); - isdn_free_channel(info->isdn_driver, info->isdn_channel, - ISDN_USAGE_FAX); - info->isdn_driver = -1; - info->isdn_channel = -1; - if (info->drv_index >= 0) { - 
dev->m_idx[info->drv_index] = -1; - info->drv_index = -1; - } - spin_unlock_irqrestore(&dev->lock, flags); - } else { - c.driver = info->isdn_driver; - c.arg = info->isdn_channel; - isdn_command(&c); - } - return 1; -} - -/* - * Parse AT+F.. FAX class 2 commands - */ - -static int -isdn_tty_cmd_FCLASS2(char **p, modem_info *info) -{ - atemu *m = &info->emu; - T30_s *f = info->fax; - isdn_ctrl cmd; - int par; - char rs[50]; - char rss[50]; - int maxdccval[] = - {1, 5, 2, 2, 3, 2, 0, 7}; - - /* FAA still unchanged */ - if (!strncmp(p[0], "AA", 2)) { /* TODO */ - p[0] += 2; - switch (*p[0]) { - case '?': - p[0]++; - sprintf(rs, "\r\n%d", 0); - isdn_tty_at_cout(rs, info); - break; - case '=': - p[0]++; - par = isdn_getnum(p); - if ((par < 0) || (par > 255)) - PARSE_ERROR1; - break; - default: - PARSE_ERROR1; - } - return 0; - } - /* BADLIN=value - dummy 0=disable errorchk disabled, 1-255 nr. of lines for making page bad */ - if (!strncmp(p[0], "BADLIN", 6)) { - p[0] += 6; - switch (*p[0]) { - case '?': - p[0]++; - sprintf(rs, "\r\n%d", f->badlin); - isdn_tty_at_cout(rs, info); - break; - case '=': - p[0]++; - if (*p[0] == '?') { - p[0]++; - sprintf(rs, "\r\n0-255"); - isdn_tty_at_cout(rs, info); - } else { - par = isdn_getnum(p); - if ((par < 0) || (par > 255)) - PARSE_ERROR1; - f->badlin = par; -#ifdef ISDN_TTY_FAX_STAT_DEBUG - printk(KERN_DEBUG "isdn_tty: Fax FBADLIN=%d\n", par); -#endif - } - break; - default: - PARSE_ERROR1; - } - return 0; - } - /* BADMUL=value - dummy 0=disable errorchk disabled (threshold multiplier) */ - if (!strncmp(p[0], "BADMUL", 6)) { - p[0] += 6; - switch (*p[0]) { - case '?': - p[0]++; - sprintf(rs, "\r\n%d", f->badmul); - isdn_tty_at_cout(rs, info); - break; - case '=': - p[0]++; - if (*p[0] == '?') { - p[0]++; - sprintf(rs, "\r\n0-255"); - isdn_tty_at_cout(rs, info); - } else { - par = isdn_getnum(p); - if ((par < 0) || (par > 255)) - PARSE_ERROR1; - f->badmul = par; -#ifdef ISDN_TTY_FAX_STAT_DEBUG - printk(KERN_DEBUG "isdn_tty: Fax 
FBADMUL=%d\n", par); -#endif - } - break; - default: - PARSE_ERROR1; - } - return 0; - } - /* BOR=n - Phase C bit order, 0=direct, 1=reverse */ - if (!strncmp(p[0], "BOR", 3)) { - p[0] += 3; - switch (*p[0]) { - case '?': - p[0]++; - sprintf(rs, "\r\n%d", f->bor); - isdn_tty_at_cout(rs, info); - break; - case '=': - p[0]++; - if (*p[0] == '?') { - p[0]++; - sprintf(rs, "\r\n0,1"); - isdn_tty_at_cout(rs, info); - } else { - par = isdn_getnum(p); - if ((par < 0) || (par > 1)) - PARSE_ERROR1; - f->bor = par; -#ifdef ISDN_TTY_FAX_STAT_DEBUG - printk(KERN_DEBUG "isdn_tty: Fax FBOR=%d\n", par); -#endif - } - break; - default: - PARSE_ERROR1; - } - return 0; - } - /* NBC=n - No Best Capabilities */ - if (!strncmp(p[0], "NBC", 3)) { - p[0] += 3; - switch (*p[0]) { - case '?': - p[0]++; - sprintf(rs, "\r\n%d", f->nbc); - isdn_tty_at_cout(rs, info); - break; - case '=': - p[0]++; - if (*p[0] == '?') { - p[0]++; - sprintf(rs, "\r\n0,1"); - isdn_tty_at_cout(rs, info); - } else { - par = isdn_getnum(p); - if ((par < 0) || (par > 1)) - PARSE_ERROR1; - f->nbc = par; -#ifdef ISDN_TTY_FAX_STAT_DEBUG - printk(KERN_DEBUG "isdn_tty: Fax FNBC=%d\n", par); -#endif - } - break; - default: - PARSE_ERROR1; - } - return 0; - } - /* BUF? - Readonly buffersize readout */ - if (!strncmp(p[0], "BUF?", 4)) { - p[0] += 4; -#ifdef ISDN_TTY_FAX_STAT_DEBUG - printk(KERN_DEBUG "isdn_tty: Fax FBUF? 
(%d) \n", (16 * m->mdmreg[REG_PSIZE])); -#endif - p[0]++; - sprintf(rs, "\r\n %d ", (16 * m->mdmreg[REG_PSIZE])); - isdn_tty_at_cout(rs, info); - return 0; - } - /* CIG=string - local fax station id string for polling rx */ - if (!strncmp(p[0], "CIG", 3)) { - int i, r; - p[0] += 3; - switch (*p[0]) { - case '?': - p[0]++; - sprintf(rs, "\r\n\"%s\"", f->pollid); - isdn_tty_at_cout(rs, info); - break; - case '=': - p[0]++; - if (*p[0] == '?') { - p[0]++; - sprintf(rs, "\r\n\"STRING\""); - isdn_tty_at_cout(rs, info); - } else { - if (*p[0] == '"') - p[0]++; - for (i = 0; (*p[0]) && i < (FAXIDLEN - 1) && (*p[0] != '"'); i++) { - f->pollid[i] = *p[0]++; - } - if (*p[0] == '"') - p[0]++; - for (r = i; r < FAXIDLEN; r++) { - f->pollid[r] = 32; - } - f->pollid[FAXIDLEN - 1] = 0; -#ifdef ISDN_TTY_FAX_STAT_DEBUG - printk(KERN_DEBUG "isdn_tty: Fax local poll ID rx \"%s\"\n", f->pollid); -#endif - } - break; - default: - PARSE_ERROR1; - } - return 0; - } - /* CQ=n - copy qlty chk, 0= no chk, 1=only 1D chk, 2=1D+2D chk */ - if (!strncmp(p[0], "CQ", 2)) { - p[0] += 2; - switch (*p[0]) { - case '?': - p[0]++; - sprintf(rs, "\r\n%d", f->cq); - isdn_tty_at_cout(rs, info); - break; - case '=': - p[0]++; - if (*p[0] == '?') { - p[0]++; - sprintf(rs, "\r\n0,1,2"); - isdn_tty_at_cout(rs, info); - } else { - par = isdn_getnum(p); - if ((par < 0) || (par > 2)) - PARSE_ERROR1; - f->cq = par; -#ifdef ISDN_TTY_FAX_STAT_DEBUG - printk(KERN_DEBUG "isdn_tty: Fax FCQ=%d\n", par); -#endif - } - break; - default: - PARSE_ERROR1; - } - return 0; - } - /* CR=n - can receive? 
0= no data rx or poll remote dev, 1=do receive data or poll remote dev */ - if (!strncmp(p[0], "CR", 2)) { - p[0] += 2; - switch (*p[0]) { - case '?': - p[0]++; - sprintf(rs, "\r\n%d", f->cr); /* read actual value from struct and print */ - isdn_tty_at_cout(rs, info); - break; - case '=': - p[0]++; - if (*p[0] == '?') { - p[0]++; - sprintf(rs, "\r\n0,1"); /* display online help */ - isdn_tty_at_cout(rs, info); - } else { - par = isdn_getnum(p); - if ((par < 0) || (par > 1)) - PARSE_ERROR1; - f->cr = par; -#ifdef ISDN_TTY_FAX_STAT_DEBUG - printk(KERN_DEBUG "isdn_tty: Fax FCR=%d\n", par); -#endif - } - break; - default: - PARSE_ERROR1; - } - return 0; - } - /* CTCRTY=value - ECM retry count */ - if (!strncmp(p[0], "CTCRTY", 6)) { - p[0] += 6; - switch (*p[0]) { - case '?': - p[0]++; - sprintf(rs, "\r\n%d", f->ctcrty); - isdn_tty_at_cout(rs, info); - break; - case '=': - p[0]++; - if (*p[0] == '?') { - p[0]++; - sprintf(rs, "\r\n0-255"); - isdn_tty_at_cout(rs, info); - } else { - par = isdn_getnum(p); - if ((par < 0) || (par > 255)) - PARSE_ERROR1; - f->ctcrty = par; -#ifdef ISDN_TTY_FAX_STAT_DEBUG - printk(KERN_DEBUG "isdn_tty: Fax FCTCRTY=%d\n", par); -#endif - } - break; - default: - PARSE_ERROR1; - } - return 0; - } - /* DCC=vr,br,wd,ln,df,ec,bf,st - DCE capabilities parms */ - if (!strncmp(p[0], "DCC", 3)) { - char *rp = &f->resolution; - int i; - - p[0] += 3; - switch (*p[0]) { - case '?': - p[0]++; - strcpy(rs, "\r\n"); - for (i = 0; i < 8; i++) { - sprintf(rss, "%c%s", rp[i] + 48, - (i < 7) ? 
"," : ""); - strcat(rs, rss); - } - isdn_tty_at_cout(rs, info); - break; - case '=': - p[0]++; - if (*p[0] == '?') { - isdn_tty_at_cout("\r\n(0,1),(0-5),(0-2),(0-2),(0-3),(0-2),(0),(0-7)", info); - p[0]++; - } else { - for (i = 0; (((*p[0] >= '0') && (*p[0] <= '9')) || (*p[0] == ',')) && (i < 8); i++) { - if (*p[0] != ',') { - if ((*p[0] - 48) > maxdccval[i]) { - PARSE_ERROR1; - } - rp[i] = *p[0] - 48; - p[0]++; - if (*p[0] == ',') - p[0]++; - } else - p[0]++; - } -#ifdef ISDN_TTY_FAX_STAT_DEBUG - printk(KERN_DEBUG "isdn_tty: Fax FDCC capabilities DCE=%d,%d,%d,%d,%d,%d,%d,%d\n", - rp[0], rp[1], rp[2], rp[3], rp[4], rp[5], rp[6], rp[7]); -#endif - } - break; - default: - PARSE_ERROR1; - } - return 0; - } - /* DIS=vr,br,wd,ln,df,ec,bf,st - current session parms */ - if (!strncmp(p[0], "DIS", 3)) { - char *rp = &f->resolution; - int i; - - p[0] += 3; - switch (*p[0]) { - case '?': - p[0]++; - strcpy(rs, "\r\n"); - for (i = 0; i < 8; i++) { - sprintf(rss, "%c%s", rp[i] + 48, - (i < 7) ? "," : ""); - strcat(rs, rss); - } - isdn_tty_at_cout(rs, info); - break; - case '=': - p[0]++; - if (*p[0] == '?') { - isdn_tty_at_cout("\r\n(0,1),(0-5),(0-2),(0-2),(0-3),(0-2),(0),(0-7)", info); - p[0]++; - } else { - for (i = 0; (((*p[0] >= '0') && (*p[0] <= '9')) || (*p[0] == ',')) && (i < 8); i++) { - if (*p[0] != ',') { - if ((*p[0] - 48) > maxdccval[i]) { - PARSE_ERROR1; - } - rp[i] = *p[0] - 48; - p[0]++; - if (*p[0] == ',') - p[0]++; - } else - p[0]++; - } -#ifdef ISDN_TTY_FAX_STAT_DEBUG - printk(KERN_DEBUG "isdn_tty: Fax FDIS session parms=%d,%d,%d,%d,%d,%d,%d,%d\n", - rp[0], rp[1], rp[2], rp[3], rp[4], rp[5], rp[6], rp[7]); -#endif - } - break; - default: - PARSE_ERROR1; - } - return 0; - } - /* DR - Receive Phase C data command, initiates document reception */ - if (!strncmp(p[0], "DR", 2)) { - p[0] += 2; - if ((info->faxonline & 16) && /* incoming connection */ - ((f->phase == ISDN_FAX_PHASE_B) || (f->phase == ISDN_FAX_PHASE_D))) { -#ifdef ISDN_TTY_FAX_STAT_DEBUG - 
printk(KERN_DEBUG "isdn_tty: Fax FDR\n"); -#endif - f->code = ISDN_TTY_FAX_DR; - cmd.driver = info->isdn_driver; - cmd.arg = info->isdn_channel; - cmd.command = ISDN_CMD_FAXCMD; - isdn_command(&cmd); - if (f->phase == ISDN_FAX_PHASE_B) { - f->phase = ISDN_FAX_PHASE_C; - } else if (f->phase == ISDN_FAX_PHASE_D) { - switch (f->fet) { - case 0: /* next page will be received */ - f->phase = ISDN_FAX_PHASE_C; - isdn_tty_fax_modem_result(7, info); /* CONNECT */ - break; - case 1: /* next doc will be received */ - f->phase = ISDN_FAX_PHASE_B; - break; - case 2: /* fax session is terminating */ - f->phase = ISDN_FAX_PHASE_E; - break; - default: - PARSE_ERROR1; - } - } - } else { - PARSE_ERROR1; - } - return 1; - } - /* DT=df,vr,wd,ln - TX phase C data command (release DCE to proceed with negotiation) */ - if (!strncmp(p[0], "DT", 2)) { - int i, val[] = - {4, 0, 2, 3}; - char *rp = &f->resolution; - - p[0] += 2; - if (!(info->faxonline & 1)) /* not outgoing connection */ - PARSE_ERROR1; - - for (i = 0; (((*p[0] >= '0') && (*p[0] <= '9')) || (*p[0] == ',')) && (i < 4); i++) { - if (*p[0] != ',') { - if ((*p[0] - 48) > maxdccval[val[i]]) { - PARSE_ERROR1; - } - rp[val[i]] = *p[0] - 48; - p[0]++; - if (*p[0] == ',') - p[0]++; - } else - p[0]++; - } -#ifdef ISDN_TTY_FAX_STAT_DEBUG - printk(KERN_DEBUG "isdn_tty: Fax FDT tx data command parms=%d,%d,%d,%d\n", - rp[4], rp[0], rp[2], rp[3]); -#endif - if ((f->phase == ISDN_FAX_PHASE_B) || (f->phase == ISDN_FAX_PHASE_D)) { - f->code = ISDN_TTY_FAX_DT; - cmd.driver = info->isdn_driver; - cmd.arg = info->isdn_channel; - cmd.command = ISDN_CMD_FAXCMD; - isdn_command(&cmd); - if (f->phase == ISDN_FAX_PHASE_D) { - f->phase = ISDN_FAX_PHASE_C; - isdn_tty_fax_modem_result(7, info); /* CONNECT */ - } - } else { - PARSE_ERROR1; - } - return 1; - } - /* ECM=n - Error mode control 0=disabled, 2=enabled, handled by DCE alone incl. 
buff of partial pages */ - if (!strncmp(p[0], "ECM", 3)) { - p[0] += 3; - switch (*p[0]) { - case '?': - p[0]++; - sprintf(rs, "\r\n%d", f->ecm); - isdn_tty_at_cout(rs, info); - break; - case '=': - p[0]++; - if (*p[0] == '?') { - p[0]++; - sprintf(rs, "\r\n0,2"); - isdn_tty_at_cout(rs, info); - } else { - par = isdn_getnum(p); - if ((par != 0) && (par != 2)) - PARSE_ERROR1; - f->ecm = par; -#ifdef ISDN_TTY_FAX_STAT_DEBUG - printk(KERN_DEBUG "isdn_tty: Fax FECM=%d\n", par); -#endif - } - break; - default: - PARSE_ERROR1; - } - return 0; - } - /* ET=n - End of page or document */ - if (!strncmp(p[0], "ET=", 3)) { - p[0] += 3; - if (*p[0] == '?') { - p[0]++; - sprintf(rs, "\r\n0-2"); - isdn_tty_at_cout(rs, info); - } else { - if ((f->phase != ISDN_FAX_PHASE_D) || - (!(info->faxonline & 1))) - PARSE_ERROR1; - par = isdn_getnum(p); - if ((par < 0) || (par > 2)) - PARSE_ERROR1; - f->fet = par; - f->code = ISDN_TTY_FAX_ET; - cmd.driver = info->isdn_driver; - cmd.arg = info->isdn_channel; - cmd.command = ISDN_CMD_FAXCMD; - isdn_command(&cmd); -#ifdef ISDN_TTY_FAX_STAT_DEBUG - printk(KERN_DEBUG "isdn_tty: Fax FET=%d\n", par); -#endif - return 1; - } - return 0; - } - /* K - terminate */ - if (!strncmp(p[0], "K", 1)) { - p[0] += 1; - if ((f->phase == ISDN_FAX_PHASE_IDLE) || (f->phase == ISDN_FAX_PHASE_E)) - PARSE_ERROR1; - isdn_tty_modem_hup(info, 1); - return 1; - } - /* LID=string - local fax ID */ - if (!strncmp(p[0], "LID", 3)) { - int i, r; - p[0] += 3; - switch (*p[0]) { - case '?': - p[0]++; - sprintf(rs, "\r\n\"%s\"", f->id); - isdn_tty_at_cout(rs, info); - break; - case '=': - p[0]++; - if (*p[0] == '?') { - p[0]++; - sprintf(rs, "\r\n\"STRING\""); - isdn_tty_at_cout(rs, info); - } else { - if (*p[0] == '"') - p[0]++; - for (i = 0; (*p[0]) && i < (FAXIDLEN - 1) && (*p[0] != '"'); i++) { - f->id[i] = *p[0]++; - } - if (*p[0] == '"') - p[0]++; - for (r = i; r < FAXIDLEN; r++) { - f->id[r] = 32; - } - f->id[FAXIDLEN - 1] = 0; -#ifdef ISDN_TTY_FAX_STAT_DEBUG - 
printk(KERN_DEBUG "isdn_tty: Fax local ID \"%s\"\n", f->id); -#endif - } - break; - default: - PARSE_ERROR1; - } - return 0; - } - - /* MDL? - DCE Model */ - if (!strncmp(p[0], "MDL?", 4)) { - p[0] += 4; -#ifdef ISDN_TTY_FAX_STAT_DEBUG - printk(KERN_DEBUG "isdn_tty: FMDL?\n"); -#endif - isdn_tty_at_cout("\r\nisdn4linux", info); - return 0; - } - /* MFR? - DCE Manufacturer */ - if (!strncmp(p[0], "MFR?", 4)) { - p[0] += 4; -#ifdef ISDN_TTY_FAX_STAT_DEBUG - printk(KERN_DEBUG "isdn_tty: FMFR?\n"); -#endif - isdn_tty_at_cout("\r\nisdn4linux", info); - return 0; - } - /* MINSP=n - Minimum Speed for Phase C */ - if (!strncmp(p[0], "MINSP", 5)) { - p[0] += 5; - switch (*p[0]) { - case '?': - p[0]++; - sprintf(rs, "\r\n%d", f->minsp); - isdn_tty_at_cout(rs, info); - break; - case '=': - p[0]++; - if (*p[0] == '?') { - p[0]++; - sprintf(rs, "\r\n0-5"); - isdn_tty_at_cout(rs, info); - } else { - par = isdn_getnum(p); - if ((par < 0) || (par > 5)) - PARSE_ERROR1; - f->minsp = par; -#ifdef ISDN_TTY_FAX_STAT_DEBUG - printk(KERN_DEBUG "isdn_tty: Fax FMINSP=%d\n", par); -#endif - } - break; - default: - PARSE_ERROR1; - } - return 0; - } - /* PHCTO=value - DTE phase C timeout */ - if (!strncmp(p[0], "PHCTO", 5)) { - p[0] += 5; - switch (*p[0]) { - case '?': - p[0]++; - sprintf(rs, "\r\n%d", f->phcto); - isdn_tty_at_cout(rs, info); - break; - case '=': - p[0]++; - if (*p[0] == '?') { - p[0]++; - sprintf(rs, "\r\n0-255"); - isdn_tty_at_cout(rs, info); - } else { - par = isdn_getnum(p); - if ((par < 0) || (par > 255)) - PARSE_ERROR1; - f->phcto = par; -#ifdef ISDN_TTY_FAX_STAT_DEBUG - printk(KERN_DEBUG "isdn_tty: Fax FPHCTO=%d\n", par); -#endif - } - break; - default: - PARSE_ERROR1; - } - return 0; - } - - /* REL=n - Phase C received EOL alignment */ - if (!strncmp(p[0], "REL", 3)) { - p[0] += 3; - switch (*p[0]) { - case '?': - p[0]++; - sprintf(rs, "\r\n%d", f->rel); - isdn_tty_at_cout(rs, info); - break; - case '=': - p[0]++; - if (*p[0] == '?') { - p[0]++; - sprintf(rs, 
"\r\n0,1"); - isdn_tty_at_cout(rs, info); - } else { - par = isdn_getnum(p); - if ((par < 0) || (par > 1)) - PARSE_ERROR1; - f->rel = par; -#ifdef ISDN_TTY_FAX_STAT_DEBUG - printk(KERN_DEBUG "isdn_tty: Fax FREL=%d\n", par); -#endif - } - break; - default: - PARSE_ERROR1; - } - return 0; - } - /* REV? - DCE Revision */ - if (!strncmp(p[0], "REV?", 4)) { - p[0] += 4; -#ifdef ISDN_TTY_FAX_STAT_DEBUG - printk(KERN_DEBUG "isdn_tty: FREV?\n"); -#endif - strcpy(rss, isdn_tty_fax_revision); - sprintf(rs, "\r\nRev: %s", isdn_getrev(rss)); - isdn_tty_at_cout(rs, info); - return 0; - } - - /* Phase C Transmit Data Block Size */ - if (!strncmp(p[0], "TBC=", 4)) { /* dummy, not used */ - p[0] += 4; -#ifdef ISDN_TTY_FAX_STAT_DEBUG - printk(KERN_DEBUG "isdn_tty: Fax FTBC=%c\n", *p[0]); -#endif - switch (*p[0]) { - case '0': - p[0]++; - break; - default: - PARSE_ERROR1; - } - return 0; - } - printk(KERN_DEBUG "isdn_tty: unknown token=>AT+F%s<\n", p[0]); - PARSE_ERROR1; -} - -int -isdn_tty_cmd_PLUSF_FAX(char **p, modem_info *info) -{ - if (TTY_IS_FCLASS2(info)) - return (isdn_tty_cmd_FCLASS2(p, info)); - else if (TTY_IS_FCLASS1(info)) - return (isdn_tty_cmd_FCLASS1(p, info)); - PARSE_ERROR1; -} diff --git a/drivers/isdn/i4l/isdn_ttyfax.h b/drivers/isdn/i4l/isdn_ttyfax.h deleted file mode 100644 index ccda4fcf8f7b..000000000000 --- a/drivers/isdn/i4l/isdn_ttyfax.h +++ /dev/null @@ -1,17 +0,0 @@ -/* $Id: isdn_ttyfax.h,v 1.1.2.2 2004/01/12 22:37:19 keil Exp $ - * - * header for Linux ISDN subsystem, tty_fax related functions (linklevel). - * - * Copyright 1999 by Armin Schindler (mac@melware.de) - * Copyright 1999 by Ralf Spachmann (mel@melware.de) - * Copyright 1999 by Cytronics & Melware - * - * This software may be used and distributed according to the terms - * of the GNU General Public License, incorporated herein by reference. 
- * - */ - - -#define XON 0x11 -#define XOFF 0x13 -#define DC2 0x12 diff --git a/drivers/isdn/i4l/isdn_v110.c b/drivers/isdn/i4l/isdn_v110.c deleted file mode 100644 index d11fe76f138f..000000000000 --- a/drivers/isdn/i4l/isdn_v110.c +++ /dev/null @@ -1,625 +0,0 @@ -/* $Id: isdn_v110.c,v 1.1.2.2 2004/01/12 22:37:19 keil Exp $ - * - * Linux ISDN subsystem, V.110 related functions (linklevel). - * - * Copyright by Thomas Pfeiffer (pfeiffer@pds.de) - * - * This software may be used and distributed according to the terms - * of the GNU General Public License, incorporated herein by reference. - * - */ - -#include -#include -#include -#include -#include - -#include -#include "isdn_v110.h" - -#undef ISDN_V110_DEBUG - -char *isdn_v110_revision = "$Revision: 1.1.2.2 $"; - -#define V110_38400 255 -#define V110_19200 15 -#define V110_9600 3 - -/* - * The following data are precoded matrices, online and offline matrix - * for 9600, 19200 und 38400, respectively - */ -static unsigned char V110_OnMatrix_9600[] = -{0xfc, 0xfc, 0xfc, 0xfc, 0xff, 0xff, 0xff, 0xfd, 0xff, 0xff, - 0xff, 0xfd, 0xff, 0xff, 0xff, 0xfd, 0xff, 0xff, 0xff, 0xfd, - 0xfd, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xfd, 0xff, 0xff, - 0xff, 0xfd, 0xff, 0xff, 0xff, 0xfd, 0xff, 0xff, 0xff, 0xfd}; - -static unsigned char V110_OffMatrix_9600[] = -{0xfc, 0xfc, 0xfc, 0xfc, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, - 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, - 0xfd, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, - 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff}; - -static unsigned char V110_OnMatrix_19200[] = -{0xf0, 0xf0, 0xff, 0xf7, 0xff, 0xf7, 0xff, 0xf7, 0xff, 0xf7, - 0xfd, 0xff, 0xff, 0xf7, 0xff, 0xf7, 0xff, 0xf7, 0xff, 0xf7}; - -static unsigned char V110_OffMatrix_19200[] = -{0xf0, 0xf0, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, - 0xfd, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff}; - -static unsigned char V110_OnMatrix_38400[] = -{0x00, 0x7f, 0x7f, 0x7f, 0x7f, 0xfd, 0x7f, 
0x7f, 0x7f, 0x7f}; - -static unsigned char V110_OffMatrix_38400[] = -{0x00, 0xff, 0xff, 0xff, 0xff, 0xfd, 0xff, 0xff, 0xff, 0xff}; - -/* - * FlipBits reorders sequences of keylen bits in one byte. - * E.g. source order 7654321 will be converted to 45670123 when keylen = 4, - * and to 67452301 when keylen = 2. This is necessary because ordering on - * the isdn line is the other way. - */ -static inline unsigned char -FlipBits(unsigned char c, int keylen) -{ - unsigned char b = c; - unsigned char bit = 128; - int i; - int j; - int hunks = (8 / keylen); - - c = 0; - for (i = 0; i < hunks; i++) { - for (j = 0; j < keylen; j++) { - if (b & (bit >> j)) - c |= bit >> (keylen - j - 1); - } - bit >>= keylen; - } - return c; -} - - -/* isdn_v110_open allocates and initializes private V.110 data - * structures and returns a pointer to these. - */ -static isdn_v110_stream * -isdn_v110_open(unsigned char key, int hdrlen, int maxsize) -{ - int i; - isdn_v110_stream *v; - - if ((v = kzalloc(sizeof(isdn_v110_stream), GFP_ATOMIC)) == NULL) - return NULL; - v->key = key; - v->nbits = 0; - for (i = 0; key & (1 << i); i++) - v->nbits++; - - v->nbytes = 8 / v->nbits; - v->decodelen = 0; - - switch (key) { - case V110_38400: - v->OnlineFrame = V110_OnMatrix_38400; - v->OfflineFrame = V110_OffMatrix_38400; - break; - case V110_19200: - v->OnlineFrame = V110_OnMatrix_19200; - v->OfflineFrame = V110_OffMatrix_19200; - break; - default: - v->OnlineFrame = V110_OnMatrix_9600; - v->OfflineFrame = V110_OffMatrix_9600; - break; - } - v->framelen = v->nbytes * 10; - v->SyncInit = 5; - v->introducer = 0; - v->dbit = 1; - v->b = 0; - v->skbres = hdrlen; - v->maxsize = maxsize - hdrlen; - if ((v->encodebuf = kmalloc(maxsize, GFP_ATOMIC)) == NULL) { - kfree(v); - return NULL; - } - return v; -} - -/* isdn_v110_close frees private V.110 data structures */ -void -isdn_v110_close(isdn_v110_stream *v) -{ - if (v == NULL) - return; -#ifdef ISDN_V110_DEBUG - printk(KERN_DEBUG "v110 close\n"); -#endif - 
kfree(v->encodebuf); - kfree(v); -} - - -/* - * ValidHeaderBytes return the number of valid bytes in v->decodebuf - */ -static int -ValidHeaderBytes(isdn_v110_stream *v) -{ - int i; - for (i = 0; (i < v->decodelen) && (i < v->nbytes); i++) - if ((v->decodebuf[i] & v->key) != 0) - break; - return i; -} - -/* - * SyncHeader moves the decodebuf ptr to the next valid header - */ -static void -SyncHeader(isdn_v110_stream *v) -{ - unsigned char *rbuf = v->decodebuf; - int len = v->decodelen; - - if (len == 0) - return; - for (rbuf++, len--; len > 0; len--, rbuf++) /* such den SyncHeader in buf ! */ - if ((*rbuf & v->key) == 0) /* erstes byte gefunden ? */ - break; /* jupp! */ - if (len) - memcpy(v->decodebuf, rbuf, len); - - v->decodelen = len; -#ifdef ISDN_V110_DEBUG - printk(KERN_DEBUG "isdn_v110: Header resync\n"); -#endif -} - -/* DecodeMatrix takes n (n>=1) matrices (v110 frames, 10 bytes) where - len is the number of matrix-lines. len must be a multiple of 10, i.e. - only complete matices must be given. - From these, netto data is extracted and returned in buf. The return-value - is the bytecount of the decoded data. -*/ -static int -DecodeMatrix(isdn_v110_stream *v, unsigned char *m, int len, unsigned char *buf) -{ - int line = 0; - int buflen = 0; - int mbit = 64; - int introducer = v->introducer; - int dbit = v->dbit; - unsigned char b = v->b; - - while (line < len) { /* Are we done with all lines of the matrix? */ - if ((line % 10) == 0) { /* the 0. line of the matrix is always 0 ! */ - if (m[line] != 0x00) { /* not 0 ? -> error! */ -#ifdef ISDN_V110_DEBUG - printk(KERN_DEBUG "isdn_v110: DecodeMatrix, V110 Bad Header\n"); - /* returning now is not the right thing, though :-( */ -#endif - } - line++; /* next line of matrix */ - continue; - } else if ((line % 10) == 5) { /* in line 5 there's only e-bits ! */ - if ((m[line] & 0x70) != 0x30) { /* 011 has to be at the beginning! 
*/ -#ifdef ISDN_V110_DEBUG - printk(KERN_DEBUG "isdn_v110: DecodeMatrix, V110 Bad 5th line\n"); - /* returning now is not the right thing, though :-( */ -#endif - } - line++; /* next line */ - continue; - } else if (!introducer) { /* every byte starts with 10 (stopbit, startbit) */ - introducer = (m[line] & mbit) ? 0 : 1; /* current bit of the matrix */ - next_byte: - if (mbit > 2) { /* was it the last bit in this line ? */ - mbit >>= 1; /* no -> take next */ - continue; - } /* otherwise start with leftmost bit in the next line */ - mbit = 64; - line++; - continue; - } else { /* otherwise we need to set a data bit */ - if (m[line] & mbit) /* was that bit set in the matrix ? */ - b |= dbit; /* yes -> set it in the data byte */ - else - b &= dbit - 1; /* no -> clear it in the data byte */ - if (dbit < 128) /* is that data byte done ? */ - dbit <<= 1; /* no, got the next bit */ - else { /* data byte is done */ - buf[buflen++] = b; /* copy byte into the output buffer */ - introducer = b = 0; /* init of the intro sequence and of the data byte */ - dbit = 1; /* next we look for the 0th bit */ - } - goto next_byte; /* look for next bit in the matrix */ - } - } - v->introducer = introducer; - v->dbit = dbit; - v->b = b; - return buflen; /* return number of bytes in the output buffer */ -} - -/* - * DecodeStream receives V.110 coded data from the input stream. It recovers the - * original frames. - * The input stream doesn't need to be framed - */ -struct sk_buff * -isdn_v110_decode(isdn_v110_stream *v, struct sk_buff *skb) -{ - int i; - int j; - int len; - unsigned char *v110_buf; - unsigned char *rbuf; - - if (!skb) { - printk(KERN_WARNING "isdn_v110_decode called with NULL skb!\n"); - return NULL; - } - rbuf = skb->data; - len = skb->len; - if (v == NULL) { - /* invalid handle, no chance to proceed */ - printk(KERN_WARNING "isdn_v110_decode called with NULL stream!\n"); - dev_kfree_skb(skb); - return NULL; - } - if (v->decodelen == 0) /* cache empty? 
*/ - for (; len > 0; len--, rbuf++) /* scan for SyncHeader in buf */ - if ((*rbuf & v->key) == 0) - break; /* found first byte */ - if (len == 0) { - dev_kfree_skb(skb); - return NULL; - } - /* copy new data to decode-buffer */ - memcpy(&(v->decodebuf[v->decodelen]), rbuf, len); - v->decodelen += len; -ReSync: - if (v->decodelen < v->nbytes) { /* got a new header ? */ - dev_kfree_skb(skb); - return NULL; /* no, try later */ - } - if (ValidHeaderBytes(v) != v->nbytes) { /* is that a valid header? */ - SyncHeader(v); /* no -> look for header */ - goto ReSync; - } - len = (v->decodelen - (v->decodelen % (10 * v->nbytes))) / v->nbytes; - if ((v110_buf = kmalloc(len, GFP_ATOMIC)) == NULL) { - printk(KERN_WARNING "isdn_v110_decode: Couldn't allocate v110_buf\n"); - dev_kfree_skb(skb); - return NULL; - } - for (i = 0; i < len; i++) { - v110_buf[i] = 0; - for (j = 0; j < v->nbytes; j++) - v110_buf[i] |= (v->decodebuf[(i * v->nbytes) + j] & v->key) << (8 - ((j + 1) * v->nbits)); - v110_buf[i] = FlipBits(v110_buf[i], v->nbits); - } - v->decodelen = (v->decodelen % (10 * v->nbytes)); - memcpy(v->decodebuf, &(v->decodebuf[len * v->nbytes]), v->decodelen); - - skb_trim(skb, DecodeMatrix(v, v110_buf, len, skb->data)); - kfree(v110_buf); - if (skb->len) - return skb; - else { - kfree_skb(skb); - return NULL; - } -} - -/* EncodeMatrix takes input data in buf, len is the bytecount. - Data is encoded into v110 frames in m. Return value is the number of - matrix-lines generated. -*/ -static int -EncodeMatrix(unsigned char *buf, int len, unsigned char *m, int mlen) -{ - int line = 0; - int i = 0; - int mbit = 128; - int dbit = 1; - int introducer = 3; - int ibit[] = {0, 1, 1}; - - while ((i < len) && (line < mlen)) { /* while we still have input data */ - switch (line % 10) { /* in which line of the matrix are we? 
*/ - case 0: - m[line++] = 0x00; /* line 0 is always 0 */ - mbit = 128; /* go on with the 7th bit */ - break; - case 5: - m[line++] = 0xbf; /* line 5 is always 10111111 */ - mbit = 128; /* go on with the 7th bit */ - break; - } - if (line >= mlen) { - printk(KERN_WARNING "isdn_v110 (EncodeMatrix): buffer full!\n"); - return line; - } - next_bit: - switch (mbit) { /* leftmost or rightmost bit ? */ - case 1: - line++; /* rightmost -> go to next line */ - if (line >= mlen) { - printk(KERN_WARNING "isdn_v110 (EncodeMatrix): buffer full!\n"); - return line; - } - /* fall through */ - case 128: - m[line] = 128; /* leftmost -> set byte to 1000000 */ - mbit = 64; /* current bit in the matrix line */ - continue; - } - if (introducer) { /* set 110 sequence ? */ - introducer--; /* set on digit less */ - m[line] |= ibit[introducer] ? mbit : 0; /* set corresponding bit */ - mbit >>= 1; /* bit of matrix line >> 1 */ - goto next_bit; /* and go on there */ - } /* else push data bits into the matrix! */ - m[line] |= (buf[i] & dbit) ? mbit : 0; /* set data bit in matrix */ - if (dbit == 128) { /* was it the last one? */ - dbit = 1; /* then go on with first bit of */ - i++; /* next byte in input buffer */ - if (i < len) /* input buffer done ? */ - introducer = 3; /* no, write introducer 110 */ - else { /* input buffer done ! */ - m[line] |= (mbit - 1) & 0xfe; /* set remaining bits in line to 1 */ - break; - } - } else /* not the last data bit */ - dbit <<= 1; /* then go to next data bit */ - mbit >>= 1; /* go to next bit of matrix */ - goto next_bit; - - } - /* if necessary, generate remaining lines of the matrix... 
*/ - if ((line) && ((line + 10) < mlen)) - switch (++line % 10) { - case 1: - m[line++] = 0xfe; - /* fall through */ - case 2: - m[line++] = 0xfe; - /* fall through */ - case 3: - m[line++] = 0xfe; - /* fall through */ - case 4: - m[line++] = 0xfe; - /* fall through */ - case 5: - m[line++] = 0xbf; - /* fall through */ - case 6: - m[line++] = 0xfe; - /* fall through */ - case 7: - m[line++] = 0xfe; - /* fall through */ - case 8: - m[line++] = 0xfe; - /* fall through */ - case 9: - m[line++] = 0xfe; - } - return line; /* that's how many lines we have */ -} - -/* - * Build a sync frame. - */ -static struct sk_buff * -isdn_v110_sync(isdn_v110_stream *v) -{ - struct sk_buff *skb; - - if (v == NULL) { - /* invalid handle, no chance to proceed */ - printk(KERN_WARNING "isdn_v110_sync called with NULL stream!\n"); - return NULL; - } - if ((skb = dev_alloc_skb(v->framelen + v->skbres))) { - skb_reserve(skb, v->skbres); - skb_put_data(skb, v->OfflineFrame, v->framelen); - } - return skb; -} - -/* - * Build an idle frame. 
- */ -static struct sk_buff * -isdn_v110_idle(isdn_v110_stream *v) -{ - struct sk_buff *skb; - - if (v == NULL) { - /* invalid handle, no chance to proceed */ - printk(KERN_WARNING "isdn_v110_sync called with NULL stream!\n"); - return NULL; - } - if ((skb = dev_alloc_skb(v->framelen + v->skbres))) { - skb_reserve(skb, v->skbres); - skb_put_data(skb, v->OnlineFrame, v->framelen); - } - return skb; -} - -struct sk_buff * -isdn_v110_encode(isdn_v110_stream *v, struct sk_buff *skb) -{ - int i; - int j; - int rlen; - int mlen; - int olen; - int size; - int sval1; - int sval2; - int nframes; - unsigned char *v110buf; - unsigned char *rbuf; - struct sk_buff *nskb; - - if (v == NULL) { - /* invalid handle, no chance to proceed */ - printk(KERN_WARNING "isdn_v110_encode called with NULL stream!\n"); - return NULL; - } - if (!skb) { - /* invalid skb, no chance to proceed */ - printk(KERN_WARNING "isdn_v110_encode called with NULL skb!\n"); - return NULL; - } - rlen = skb->len; - nframes = (rlen + 3) / 4; - v110buf = v->encodebuf; - if ((nframes * 40) > v->maxsize) { - size = v->maxsize; - rlen = v->maxsize / 40; - } else - size = nframes * 40; - if (!(nskb = dev_alloc_skb(size + v->skbres + sizeof(int)))) { - printk(KERN_WARNING "isdn_v110_encode: Couldn't alloc skb\n"); - return NULL; - } - skb_reserve(nskb, v->skbres + sizeof(int)); - if (skb->len == 0) { - skb_put_data(nskb, v->OnlineFrame, v->framelen); - *((int *)skb_push(nskb, sizeof(int))) = 0; - return nskb; - } - mlen = EncodeMatrix(skb->data, rlen, v110buf, size); - /* now distribute 2 or 4 bits each to the output stream! 
*/ - rbuf = skb_put(nskb, size); - olen = 0; - sval1 = 8 - v->nbits; - sval2 = v->key << sval1; - for (i = 0; i < mlen; i++) { - v110buf[i] = FlipBits(v110buf[i], v->nbits); - for (j = 0; j < v->nbytes; j++) { - if (size--) - *rbuf++ = ~v->key | (((v110buf[i] << (j * v->nbits)) & sval2) >> sval1); - else { - printk(KERN_WARNING "isdn_v110_encode: buffers full!\n"); - goto buffer_full; - } - olen++; - } - } -buffer_full: - skb_trim(nskb, olen); - *((int *)skb_push(nskb, sizeof(int))) = rlen; - return nskb; -} - -int -isdn_v110_stat_callback(int idx, isdn_ctrl *c) -{ - isdn_v110_stream *v = NULL; - int i; - int ret = 0; - - if (idx < 0) - return 0; - switch (c->command) { - case ISDN_STAT_BSENT: - /* Keep the send-queue of the driver filled - * with frames: - * If number of outstanding frames < 3, - * send down an Idle-Frame (or an Sync-Frame, if - * v->SyncInit != 0). - */ - if (!(v = dev->v110[idx])) - return 0; - atomic_inc(&dev->v110use[idx]); - for (i = 0; i * v->framelen < c->parm.length; i++) { - if (v->skbidle > 0) { - v->skbidle--; - ret = 1; - } else { - if (v->skbuser > 0) - v->skbuser--; - ret = 0; - } - } - for (i = v->skbuser + v->skbidle; i < 2; i++) { - struct sk_buff *skb; - if (v->SyncInit > 0) - skb = isdn_v110_sync(v); - else - skb = isdn_v110_idle(v); - if (skb) { - if (dev->drv[c->driver]->interface->writebuf_skb(c->driver, c->arg, 1, skb) <= 0) { - dev_kfree_skb(skb); - break; - } else { - if (v->SyncInit) - v->SyncInit--; - v->skbidle++; - } - } else - break; - } - atomic_dec(&dev->v110use[idx]); - return ret; - case ISDN_STAT_DHUP: - case ISDN_STAT_BHUP: - while (1) { - atomic_inc(&dev->v110use[idx]); - if (atomic_dec_and_test(&dev->v110use[idx])) { - isdn_v110_close(dev->v110[idx]); - dev->v110[idx] = NULL; - break; - } - mdelay(1); - } - break; - case ISDN_STAT_BCONN: - if (dev->v110emu[idx] && (dev->v110[idx] == NULL)) { - int hdrlen = dev->drv[c->driver]->interface->hl_hdrlen; - int maxsize = dev->drv[c->driver]->interface->maxbufsize; - 
atomic_inc(&dev->v110use[idx]); - switch (dev->v110emu[idx]) { - case ISDN_PROTO_L2_V11096: - dev->v110[idx] = isdn_v110_open(V110_9600, hdrlen, maxsize); - break; - case ISDN_PROTO_L2_V11019: - dev->v110[idx] = isdn_v110_open(V110_19200, hdrlen, maxsize); - break; - case ISDN_PROTO_L2_V11038: - dev->v110[idx] = isdn_v110_open(V110_38400, hdrlen, maxsize); - break; - default:; - } - if ((v = dev->v110[idx])) { - while (v->SyncInit) { - struct sk_buff *skb = isdn_v110_sync(v); - if (dev->drv[c->driver]->interface->writebuf_skb(c->driver, c->arg, 1, skb) <= 0) { - dev_kfree_skb(skb); - /* Unable to send, try later */ - break; - } - v->SyncInit--; - v->skbidle++; - } - } else - printk(KERN_WARNING "isdn_v110: Couldn't open stream for chan %d\n", idx); - atomic_dec(&dev->v110use[idx]); - } - break; - default: - return 0; - } - return 0; -} diff --git a/drivers/isdn/i4l/isdn_v110.h b/drivers/isdn/i4l/isdn_v110.h deleted file mode 100644 index de774ab598c9..000000000000 --- a/drivers/isdn/i4l/isdn_v110.h +++ /dev/null @@ -1,29 +0,0 @@ -/* $Id: isdn_v110.h,v 1.1.2.2 2004/01/12 22:37:19 keil Exp $ - * - * Linux ISDN subsystem, V.110 related functions (linklevel). - * - * Copyright by Thomas Pfeiffer (pfeiffer@pds.de) - * - * This software may be used and distributed according to the terms - * of the GNU General Public License, incorporated herein by reference. - * - */ - -#ifndef _isdn_v110_h_ -#define _isdn_v110_h_ - -/* - * isdn_v110_encode will take raw data and encode it using V.110 - */ -extern struct sk_buff *isdn_v110_encode(isdn_v110_stream *, struct sk_buff *); - -/* - * isdn_v110_decode receives V.110 coded data from the stream and rebuilds - * frames from them. The source stream doesn't need to be framed. 
- */ -extern struct sk_buff *isdn_v110_decode(isdn_v110_stream *, struct sk_buff *); - -extern int isdn_v110_stat_callback(int, isdn_ctrl *); -extern void isdn_v110_close(isdn_v110_stream *v); - -#endif diff --git a/drivers/isdn/i4l/isdn_x25iface.c b/drivers/isdn/i4l/isdn_x25iface.c deleted file mode 100644 index 48bfbcb4a09d..000000000000 --- a/drivers/isdn/i4l/isdn_x25iface.c +++ /dev/null @@ -1,332 +0,0 @@ -/* $Id: isdn_x25iface.c,v 1.1.2.2 2004/01/12 22:37:19 keil Exp $ - * - * Linux ISDN subsystem, X.25 related functions - * - * This software may be used and distributed according to the terms - * of the GNU General Public License, incorporated herein by reference. - * - * stuff needed to support the Linux X.25 PLP code on top of devices that - * can provide a lab_b service using the concap_proto mechanism. - * This module supports a network interface which provides lapb_sematics - * -- as defined in Documentation/networking/x25-iface.txt -- to - * the upper layer and assumes that the lower layer provides a reliable - * data link service by means of the concap_device_ops callbacks. - * - * Only protocol specific stuff goes here. Device specific stuff - * goes to another -- device related -- concap_proto support source file. - * - */ - -/* #include */ -#include -#include -#include -#include -#include -#include "isdn_x25iface.h" - -/* for debugging messages not to cause an oops when device pointer is NULL*/ -#define MY_DEVNAME(dev) ((dev) ? (dev)->name : "DEVICE UNSPECIFIED") - - -typedef struct isdn_x25iface_proto_data { - int magic; - enum wan_states state; - /* Private stuff, not to be accessed via proto_data. 
We provide the - other storage for the concap_proto instance here as well, - enabling us to allocate both with just one kmalloc(): */ - struct concap_proto priv; -} ix25_pdata_t; - - - -/* is now in header file (extern): struct concap_proto * isdn_x25iface_proto_new(void); */ -static void isdn_x25iface_proto_del(struct concap_proto *); -static int isdn_x25iface_proto_close(struct concap_proto *); -static int isdn_x25iface_proto_restart(struct concap_proto *, - struct net_device *, - struct concap_device_ops *); -static int isdn_x25iface_xmit(struct concap_proto *, struct sk_buff *); -static int isdn_x25iface_receive(struct concap_proto *, struct sk_buff *); -static int isdn_x25iface_connect_ind(struct concap_proto *); -static int isdn_x25iface_disconn_ind(struct concap_proto *); - - -static struct concap_proto_ops ix25_pops = { - .proto_new = &isdn_x25iface_proto_new, - .proto_del = &isdn_x25iface_proto_del, - .restart = &isdn_x25iface_proto_restart, - .close = &isdn_x25iface_proto_close, - .encap_and_xmit = &isdn_x25iface_xmit, - .data_ind = &isdn_x25iface_receive, - .connect_ind = &isdn_x25iface_connect_ind, - .disconn_ind = &isdn_x25iface_disconn_ind -}; - -/* error message helper function */ -static void illegal_state_warn(unsigned state, unsigned char firstbyte) -{ - printk(KERN_WARNING "isdn_x25iface: firstbyte %x illegal in" - "current state %d\n", firstbyte, state); -} - -/* check protocol data field for consistency */ -static int pdata_is_bad(ix25_pdata_t *pda) { - - if (pda && pda->magic == ISDN_X25IFACE_MAGIC) return 0; - printk(KERN_WARNING - "isdn_x25iface_xxx: illegal pointer to proto data\n"); - return 1; -} - -/* create a new x25 interface protocol instance - */ -struct concap_proto *isdn_x25iface_proto_new(void) -{ - ix25_pdata_t *tmp = kmalloc(sizeof(ix25_pdata_t), GFP_KERNEL); - IX25DEBUG("isdn_x25iface_proto_new\n"); - if (tmp) { - tmp->magic = ISDN_X25IFACE_MAGIC; - tmp->state = WAN_UNCONFIGURED; - /* private data space used to hold the 
concap_proto data. - Only to be accessed via the returned pointer */ - spin_lock_init(&tmp->priv.lock); - tmp->priv.dops = NULL; - tmp->priv.net_dev = NULL; - tmp->priv.pops = &ix25_pops; - tmp->priv.flags = 0; - tmp->priv.proto_data = tmp; - return (&(tmp->priv)); - } - return NULL; -}; - -/* close the x25iface encapsulation protocol - */ -static int isdn_x25iface_proto_close(struct concap_proto *cprot) { - - ix25_pdata_t *tmp; - int ret = 0; - ulong flags; - - if (!cprot) { - printk(KERN_ERR "isdn_x25iface_proto_close: " - "invalid concap_proto pointer\n"); - return -1; - } - IX25DEBUG("isdn_x25iface_proto_close %s \n", MY_DEVNAME(cprot->net_dev)); - spin_lock_irqsave(&cprot->lock, flags); - cprot->dops = NULL; - cprot->net_dev = NULL; - tmp = cprot->proto_data; - if (pdata_is_bad(tmp)) { - ret = -1; - } else { - tmp->state = WAN_UNCONFIGURED; - } - spin_unlock_irqrestore(&cprot->lock, flags); - return ret; -} - -/* Delete the x25iface encapsulation protocol instance - */ -static void isdn_x25iface_proto_del(struct concap_proto *cprot) { - - ix25_pdata_t *tmp; - - IX25DEBUG("isdn_x25iface_proto_del \n"); - if (!cprot) { - printk(KERN_ERR "isdn_x25iface_proto_del: " - "concap_proto pointer is NULL\n"); - return; - } - tmp = cprot->proto_data; - if (tmp == NULL) { - printk(KERN_ERR "isdn_x25iface_proto_del: inconsistent " - "proto_data pointer (maybe already deleted?)\n"); - return; - } - /* close if the protocol is still open */ - if (cprot->dops) isdn_x25iface_proto_close(cprot); - /* freeing the storage should be sufficient now. 
But some additional - settings might help to catch wild pointer bugs */ - tmp->magic = 0; - cprot->proto_data = NULL; - - kfree(tmp); - return; -} - -/* (re-)initialize the data structures for x25iface encapsulation - */ -static int isdn_x25iface_proto_restart(struct concap_proto *cprot, - struct net_device *ndev, - struct concap_device_ops *dops) -{ - ix25_pdata_t *pda = cprot->proto_data; - ulong flags; - - IX25DEBUG("isdn_x25iface_proto_restart %s \n", MY_DEVNAME(ndev)); - - if (pdata_is_bad(pda)) return -1; - - if (!(dops && dops->data_req && dops->connect_req - && dops->disconn_req)) { - printk(KERN_WARNING "isdn_x25iface_restart: required dops" - " missing\n"); - isdn_x25iface_proto_close(cprot); - return -1; - } - spin_lock_irqsave(&cprot->lock, flags); - cprot->net_dev = ndev; - cprot->pops = &ix25_pops; - cprot->dops = dops; - pda->state = WAN_DISCONNECTED; - spin_unlock_irqrestore(&cprot->lock, flags); - return 0; -} - -/* deliver a dl_data frame received from i4l HL driver to the network layer - */ -static int isdn_x25iface_receive(struct concap_proto *cprot, struct sk_buff *skb) -{ - IX25DEBUG("isdn_x25iface_receive %s \n", MY_DEVNAME(cprot->net_dev)); - if (((ix25_pdata_t *)(cprot->proto_data)) - ->state == WAN_CONNECTED) { - if (skb_push(skb, 1)) { - skb->data[0] = X25_IFACE_DATA; - skb->protocol = x25_type_trans(skb, cprot->net_dev); - netif_rx(skb); - return 0; - } - } - printk(KERN_WARNING "isdn_x25iface_receive %s: not connected, skb dropped\n", MY_DEVNAME(cprot->net_dev)); - dev_kfree_skb(skb); - return -1; -} - -/* a connection set up is indicated by lower layer - */ -static int isdn_x25iface_connect_ind(struct concap_proto *cprot) -{ - struct sk_buff *skb; - enum wan_states *state_p - = &(((ix25_pdata_t *)(cprot->proto_data))->state); - IX25DEBUG("isdn_x25iface_connect_ind %s \n" - , MY_DEVNAME(cprot->net_dev)); - if (*state_p == WAN_UNCONFIGURED) { - printk(KERN_WARNING - "isdn_x25iface_connect_ind while unconfigured %s\n" - , 
MY_DEVNAME(cprot->net_dev)); - return -1; - } - *state_p = WAN_CONNECTED; - - skb = dev_alloc_skb(1); - if (skb) { - skb_put_u8(skb, X25_IFACE_CONNECT); - skb->protocol = x25_type_trans(skb, cprot->net_dev); - netif_rx(skb); - return 0; - } else { - printk(KERN_WARNING "isdn_x25iface_connect_ind: " - " out of memory -- disconnecting\n"); - cprot->dops->disconn_req(cprot); - return -1; - } -} - -/* a disconnect is indicated by lower layer - */ -static int isdn_x25iface_disconn_ind(struct concap_proto *cprot) -{ - struct sk_buff *skb; - enum wan_states *state_p - = &(((ix25_pdata_t *)(cprot->proto_data))->state); - IX25DEBUG("isdn_x25iface_disconn_ind %s \n", MY_DEVNAME(cprot->net_dev)); - if (*state_p == WAN_UNCONFIGURED) { - printk(KERN_WARNING - "isdn_x25iface_disconn_ind while unconfigured\n"); - return -1; - } - if (!cprot->net_dev) return -1; - *state_p = WAN_DISCONNECTED; - skb = dev_alloc_skb(1); - if (skb) { - skb_put_u8(skb, X25_IFACE_DISCONNECT); - skb->protocol = x25_type_trans(skb, cprot->net_dev); - netif_rx(skb); - return 0; - } else { - printk(KERN_WARNING "isdn_x25iface_disconn_ind:" - " out of memory\n"); - return -1; - } -} - -/* process a frame handed over to us from linux network layer. 
First byte - semantics as defined in Documentation/networking/x25-iface.txt -*/ -static int isdn_x25iface_xmit(struct concap_proto *cprot, struct sk_buff *skb) -{ - unsigned char firstbyte = skb->data[0]; - enum wan_states *state = &((ix25_pdata_t *)cprot->proto_data)->state; - int ret = 0; - IX25DEBUG("isdn_x25iface_xmit: %s first=%x state=%d\n", - MY_DEVNAME(cprot->net_dev), firstbyte, *state); - switch (firstbyte) { - case X25_IFACE_DATA: - if (*state == WAN_CONNECTED) { - skb_pull(skb, 1); - netif_trans_update(cprot->net_dev); - ret = (cprot->dops->data_req(cprot, skb)); - /* prepare for future retransmissions */ - if (ret) skb_push(skb, 1); - return ret; - } - illegal_state_warn(*state, firstbyte); - break; - case X25_IFACE_CONNECT: - if (*state == WAN_DISCONNECTED) { - *state = WAN_CONNECTING; - ret = cprot->dops->connect_req(cprot); - if (ret) { - /* reset state and notify upper layer about - * immidiatly failed attempts */ - isdn_x25iface_disconn_ind(cprot); - } - } else { - illegal_state_warn(*state, firstbyte); - } - break; - case X25_IFACE_DISCONNECT: - switch (*state) { - case WAN_DISCONNECTED: - /* Should not happen. 
However, give upper layer a - chance to recover from inconstistency but don't - trust the lower layer sending the disconn_confirm - when already disconnected */ - printk(KERN_WARNING "isdn_x25iface_xmit: disconnect " - " requested while disconnected\n"); - isdn_x25iface_disconn_ind(cprot); - break; /* prevent infinite loops */ - case WAN_CONNECTING: - case WAN_CONNECTED: - *state = WAN_DISCONNECTED; - cprot->dops->disconn_req(cprot); - break; - default: - illegal_state_warn(*state, firstbyte); - } - break; - case X25_IFACE_PARAMS: - printk(KERN_WARNING "isdn_x25iface_xmit: setting of lapb" - " options not yet supported\n"); - break; - default: - printk(KERN_WARNING "isdn_x25iface_xmit: frame with illegal" - " first byte %x ignored:\n", firstbyte); - } - dev_kfree_skb(skb); - return 0; -} diff --git a/drivers/isdn/i4l/isdn_x25iface.h b/drivers/isdn/i4l/isdn_x25iface.h deleted file mode 100644 index ca08e082cf7c..000000000000 --- a/drivers/isdn/i4l/isdn_x25iface.h +++ /dev/null @@ -1,30 +0,0 @@ -/* $Id: isdn_x25iface.h,v 1.1.2.2 2004/01/12 22:37:19 keil Exp $ - * - * header for Linux ISDN subsystem, x.25 related functions - * - * This software may be used and distributed according to the terms - * of the GNU General Public License, incorporated herein by reference. - * - */ - -#ifndef _LINUX_ISDN_X25IFACE_H -#define _LINUX_ISDN_X25IFACE_H - -#define ISDN_X25IFACE_MAGIC 0x1e75a2b9 -/* #define DEBUG_ISDN_X25 if you want isdn_x25 debugging messages */ -#ifdef DEBUG_ISDN_X25 -# define IX25DEBUG(fmt, args...) printk(KERN_DEBUG fmt, ##args) -#else -# define IX25DEBUG(fmt, args...) 
-#endif - -#include -#include -#include - -extern struct concap_proto_ops *isdn_x25iface_concap_proto_ops_pt; -extern struct concap_proto *isdn_x25iface_proto_new(void); - - - -#endif diff --git a/drivers/isdn/isdnloop/Makefile b/drivers/isdn/isdnloop/Makefile deleted file mode 100644 index 5ff4c0e09768..000000000000 --- a/drivers/isdn/isdnloop/Makefile +++ /dev/null @@ -1,6 +0,0 @@ -# SPDX-License-Identifier: GPL-2.0-only -# Makefile for the isdnloop ISDN device driver - -# Each configuration option enables a list of files. - -obj-$(CONFIG_ISDN_DRV_LOOP) += isdnloop.o diff --git a/drivers/isdn/isdnloop/isdnloop.c b/drivers/isdn/isdnloop/isdnloop.c deleted file mode 100644 index 755c6bbc9553..000000000000 --- a/drivers/isdn/isdnloop/isdnloop.c +++ /dev/null @@ -1,1528 +0,0 @@ -/* $Id: isdnloop.c,v 1.11.6.7 2001/11/11 19:54:31 kai Exp $ - * - * ISDN low-level module implementing a dummy loop driver. - * - * Copyright 1997 by Fritz Elfert (fritz@isdn4linux.de) - * - * This software may be used and distributed according to the terms - * of the GNU General Public License, incorporated herein by reference. - * - */ - -#include -#include -#include -#include -#include -#include "isdnloop.h" - -static char *isdnloop_id = "loop0"; - -MODULE_DESCRIPTION("ISDN4Linux: Pseudo Driver that simulates an ISDN card"); -MODULE_AUTHOR("Fritz Elfert"); -MODULE_LICENSE("GPL"); -module_param(isdnloop_id, charp, 0); -MODULE_PARM_DESC(isdnloop_id, "ID-String of first card"); - -static int isdnloop_addcard(char *); - -/* - * Free queue completely. - * - * Parameter: - * card = pointer to card struct - * channel = channel number - */ -static void -isdnloop_free_queue(isdnloop_card *card, int channel) -{ - struct sk_buff_head *queue = &card->bqueue[channel]; - - skb_queue_purge(queue); - card->sndcount[channel] = 0; -} - -/* - * Send B-Channel data to another virtual card. - * This routine is called via timer-callback from isdnloop_pollbchan(). 
- * - * Parameter: - * card = pointer to card struct. - * ch = channel number (0-based) - */ -static void -isdnloop_bchan_send(isdnloop_card *card, int ch) -{ - isdnloop_card *rcard = card->rcard[ch]; - int rch = card->rch[ch], len, ack; - struct sk_buff *skb; - isdn_ctrl cmd; - - while (card->sndcount[ch]) { - skb = skb_dequeue(&card->bqueue[ch]); - if (skb) { - len = skb->len; - card->sndcount[ch] -= len; - ack = *(skb->head); /* used as scratch area */ - cmd.driver = card->myid; - cmd.arg = ch; - if (rcard) { - rcard->interface.rcvcallb_skb(rcard->myid, rch, skb); - } else { - printk(KERN_WARNING "isdnloop: no rcard, skb dropped\n"); - dev_kfree_skb(skb); - - } - cmd.command = ISDN_STAT_BSENT; - cmd.parm.length = len; - card->interface.statcallb(&cmd); - } else - card->sndcount[ch] = 0; - } -} - -/* - * Send/Receive Data to/from the B-Channel. - * This routine is called via timer-callback. - * It schedules itself while any B-Channel is open. - * - * Parameter: - * data = pointer to card struct, set by kernel timer.data - */ -static void -isdnloop_pollbchan(struct timer_list *t) -{ - isdnloop_card *card = from_timer(card, t, rb_timer); - unsigned long flags; - - if (card->flags & ISDNLOOP_FLAGS_B1ACTIVE) - isdnloop_bchan_send(card, 0); - if (card->flags & ISDNLOOP_FLAGS_B2ACTIVE) - isdnloop_bchan_send(card, 1); - if (card->flags & (ISDNLOOP_FLAGS_B1ACTIVE | ISDNLOOP_FLAGS_B2ACTIVE)) { - /* schedule b-channel polling again */ - spin_lock_irqsave(&card->isdnloop_lock, flags); - card->rb_timer.expires = jiffies + ISDNLOOP_TIMER_BCREAD; - add_timer(&card->rb_timer); - card->flags |= ISDNLOOP_FLAGS_RBTIMER; - spin_unlock_irqrestore(&card->isdnloop_lock, flags); - } else - card->flags &= ~ISDNLOOP_FLAGS_RBTIMER; -} - -/* - * Parse ICN-type setup string and fill fields of setup-struct - * with parsed data. - * - * Parameter: - * setup = setup string, format: [caller-id],si1,si2,[called-id] - * cmd = pointer to struct to be filled. 
- */ -static void -isdnloop_parse_setup(char *setup, isdn_ctrl *cmd) -{ - char *t = setup; - char *s = strchr(t, ','); - - *s++ = '\0'; - strlcpy(cmd->parm.setup.phone, t, sizeof(cmd->parm.setup.phone)); - s = strchr(t = s, ','); - *s++ = '\0'; - if (!strlen(t)) - cmd->parm.setup.si1 = 0; - else - cmd->parm.setup.si1 = simple_strtoul(t, NULL, 10); - s = strchr(t = s, ','); - *s++ = '\0'; - if (!strlen(t)) - cmd->parm.setup.si2 = 0; - else - cmd->parm.setup.si2 = - simple_strtoul(t, NULL, 10); - strlcpy(cmd->parm.setup.eazmsn, s, sizeof(cmd->parm.setup.eazmsn)); - cmd->parm.setup.plan = 0; - cmd->parm.setup.screen = 0; -} - -typedef struct isdnloop_stat { - char *statstr; - int command; - int action; -} isdnloop_stat; -/* *INDENT-OFF* */ -static isdnloop_stat isdnloop_stat_table[] = { - {"BCON_", ISDN_STAT_BCONN, 1}, /* B-Channel connected */ - {"BDIS_", ISDN_STAT_BHUP, 2}, /* B-Channel disconnected */ - {"DCON_", ISDN_STAT_DCONN, 0}, /* D-Channel connected */ - {"DDIS_", ISDN_STAT_DHUP, 0}, /* D-Channel disconnected */ - {"DCAL_I", ISDN_STAT_ICALL, 3}, /* Incoming call dialup-line */ - {"DSCA_I", ISDN_STAT_ICALL, 3}, /* Incoming call 1TR6-SPV */ - {"FCALL", ISDN_STAT_ICALL, 4}, /* Leased line connection up */ - {"CIF", ISDN_STAT_CINF, 5}, /* Charge-info, 1TR6-type */ - {"AOC", ISDN_STAT_CINF, 6}, /* Charge-info, DSS1-type */ - {"CAU", ISDN_STAT_CAUSE, 7}, /* Cause code */ - {"TEI OK", ISDN_STAT_RUN, 0}, /* Card connected to wallplug */ - {"E_L1: ACT FAIL", ISDN_STAT_BHUP, 8}, /* Layer-1 activation failed */ - {"E_L2: DATA LIN", ISDN_STAT_BHUP, 8}, /* Layer-2 data link lost */ - {"E_L1: ACTIVATION FAILED", - ISDN_STAT_BHUP, 8}, /* Layer-1 activation failed */ - {NULL, 0, -1} -}; -/* *INDENT-ON* */ - - -/* - * Parse Status message-strings from virtual card. - * Depending on status, call statcallb for sending messages to upper - * levels. Also set/reset B-Channel active-flags. - * - * Parameter: - * status = status string to parse. 
- * channel = channel where message comes from. - * card = card where message comes from. - */ -static void -isdnloop_parse_status(u_char *status, int channel, isdnloop_card *card) -{ - isdnloop_stat *s = isdnloop_stat_table; - int action = -1; - isdn_ctrl cmd; - - while (s->statstr) { - if (!strncmp(status, s->statstr, strlen(s->statstr))) { - cmd.command = s->command; - action = s->action; - break; - } - s++; - } - if (action == -1) - return; - cmd.driver = card->myid; - cmd.arg = channel; - switch (action) { - case 1: - /* BCON_x */ - card->flags |= (channel) ? - ISDNLOOP_FLAGS_B2ACTIVE : ISDNLOOP_FLAGS_B1ACTIVE; - break; - case 2: - /* BDIS_x */ - card->flags &= ~((channel) ? - ISDNLOOP_FLAGS_B2ACTIVE : ISDNLOOP_FLAGS_B1ACTIVE); - isdnloop_free_queue(card, channel); - break; - case 3: - /* DCAL_I and DSCA_I */ - isdnloop_parse_setup(status + 6, &cmd); - break; - case 4: - /* FCALL */ - sprintf(cmd.parm.setup.phone, "LEASED%d", card->myid); - sprintf(cmd.parm.setup.eazmsn, "%d", channel + 1); - cmd.parm.setup.si1 = 7; - cmd.parm.setup.si2 = 0; - cmd.parm.setup.plan = 0; - cmd.parm.setup.screen = 0; - break; - case 5: - /* CIF */ - strlcpy(cmd.parm.num, status + 3, sizeof(cmd.parm.num)); - break; - case 6: - /* AOC */ - snprintf(cmd.parm.num, sizeof(cmd.parm.num), "%d", - (int) simple_strtoul(status + 7, NULL, 16)); - break; - case 7: - /* CAU */ - status += 3; - if (strlen(status) == 4) - snprintf(cmd.parm.num, sizeof(cmd.parm.num), "%s%c%c", - status + 2, *status, *(status + 1)); - else - strlcpy(cmd.parm.num, status + 1, sizeof(cmd.parm.num)); - break; - case 8: - /* Misc Errors on L1 and L2 */ - card->flags &= ~ISDNLOOP_FLAGS_B1ACTIVE; - isdnloop_free_queue(card, 0); - cmd.arg = 0; - cmd.driver = card->myid; - card->interface.statcallb(&cmd); - cmd.command = ISDN_STAT_DHUP; - cmd.arg = 0; - cmd.driver = card->myid; - card->interface.statcallb(&cmd); - cmd.command = ISDN_STAT_BHUP; - card->flags &= ~ISDNLOOP_FLAGS_B2ACTIVE; - isdnloop_free_queue(card, 1); - 
cmd.arg = 1; - cmd.driver = card->myid; - card->interface.statcallb(&cmd); - cmd.command = ISDN_STAT_DHUP; - cmd.arg = 1; - cmd.driver = card->myid; - break; - } - card->interface.statcallb(&cmd); -} - -/* - * Store a cwcharacter into ringbuffer for reading from /dev/isdnctrl - * - * Parameter: - * card = pointer to card struct. - * c = char to store. - */ -static void -isdnloop_putmsg(isdnloop_card *card, unsigned char c) -{ - ulong flags; - - spin_lock_irqsave(&card->isdnloop_lock, flags); - *card->msg_buf_write++ = (c == 0xff) ? '\n' : c; - if (card->msg_buf_write == card->msg_buf_read) { - if (++card->msg_buf_read > card->msg_buf_end) - card->msg_buf_read = card->msg_buf; - } - if (card->msg_buf_write > card->msg_buf_end) - card->msg_buf_write = card->msg_buf; - spin_unlock_irqrestore(&card->isdnloop_lock, flags); -} - -/* - * Poll a virtual cards message queue. - * If there are new status-replies from the card, copy them to - * ringbuffer for reading on /dev/isdnctrl and call - * isdnloop_parse_status() for processing them. Watch for special - * Firmware bootmessage and parse it, to get the D-Channel protocol. - * If there are B-Channels open, initiate a timer-callback to - * isdnloop_pollbchan(). - * This routine is called periodically via timer interrupt. 
- * - * Parameter: - * data = pointer to card struct - */ -static void -isdnloop_polldchan(struct timer_list *t) -{ - isdnloop_card *card = from_timer(card, t, st_timer); - struct sk_buff *skb; - int avail; - int left; - u_char c; - int ch; - unsigned long flags; - u_char *p; - isdn_ctrl cmd; - - skb = skb_dequeue(&card->dqueue); - if (skb) - avail = skb->len; - else - avail = 0; - for (left = avail; left > 0; left--) { - c = *skb->data; - skb_pull(skb, 1); - isdnloop_putmsg(card, c); - card->imsg[card->iptr] = c; - if (card->iptr < 59) - card->iptr++; - if (!skb->len) { - avail++; - isdnloop_putmsg(card, '\n'); - card->imsg[card->iptr] = 0; - card->iptr = 0; - if (card->imsg[0] == '0' && card->imsg[1] >= '0' && - card->imsg[1] <= '2' && card->imsg[2] == ';') { - ch = (card->imsg[1] - '0') - 1; - p = &card->imsg[3]; - isdnloop_parse_status(p, ch, card); - } else { - p = card->imsg; - if (!strncmp(p, "DRV1.", 5)) { - printk(KERN_INFO "isdnloop: (%s) %s\n", CID, p); - if (!strncmp(p + 7, "TC", 2)) { - card->ptype = ISDN_PTYPE_1TR6; - card->interface.features |= ISDN_FEATURE_P_1TR6; - printk(KERN_INFO - "isdnloop: (%s) 1TR6-Protocol loaded and running\n", CID); - } - if (!strncmp(p + 7, "EC", 2)) { - card->ptype = ISDN_PTYPE_EURO; - card->interface.features |= ISDN_FEATURE_P_EURO; - printk(KERN_INFO - "isdnloop: (%s) Euro-Protocol loaded and running\n", CID); - } - continue; - - } - } - } - } - if (avail) { - cmd.command = ISDN_STAT_STAVAIL; - cmd.driver = card->myid; - cmd.arg = avail; - card->interface.statcallb(&cmd); - } - if (card->flags & (ISDNLOOP_FLAGS_B1ACTIVE | ISDNLOOP_FLAGS_B2ACTIVE)) - if (!(card->flags & ISDNLOOP_FLAGS_RBTIMER)) { - /* schedule b-channel polling */ - card->flags |= ISDNLOOP_FLAGS_RBTIMER; - spin_lock_irqsave(&card->isdnloop_lock, flags); - del_timer(&card->rb_timer); - card->rb_timer.expires = jiffies + ISDNLOOP_TIMER_BCREAD; - add_timer(&card->rb_timer); - spin_unlock_irqrestore(&card->isdnloop_lock, flags); - } - /* schedule again */ - 
spin_lock_irqsave(&card->isdnloop_lock, flags); - card->st_timer.expires = jiffies + ISDNLOOP_TIMER_DCREAD; - add_timer(&card->st_timer); - spin_unlock_irqrestore(&card->isdnloop_lock, flags); -} - -/* - * Append a packet to the transmit buffer-queue. - * - * Parameter: - * channel = Number of B-channel - * skb = packet to send. - * card = pointer to card-struct - * Return: - * Number of bytes transferred, -E??? on error - */ -static int -isdnloop_sendbuf(int channel, struct sk_buff *skb, isdnloop_card *card) -{ - int len = skb->len; - unsigned long flags; - struct sk_buff *nskb; - - if (len > 4000) { - printk(KERN_WARNING - "isdnloop: Send packet too large\n"); - return -EINVAL; - } - if (len) { - if (!(card->flags & (channel ? ISDNLOOP_FLAGS_B2ACTIVE : ISDNLOOP_FLAGS_B1ACTIVE))) - return 0; - if (card->sndcount[channel] > ISDNLOOP_MAX_SQUEUE) - return 0; - spin_lock_irqsave(&card->isdnloop_lock, flags); - nskb = dev_alloc_skb(skb->len); - if (nskb) { - skb_copy_from_linear_data(skb, - skb_put(nskb, len), len); - skb_queue_tail(&card->bqueue[channel], nskb); - dev_kfree_skb(skb); - } else - len = 0; - card->sndcount[channel] += len; - spin_unlock_irqrestore(&card->isdnloop_lock, flags); - } - return len; -} - -/* - * Read the messages from the card's ringbuffer - * - * Parameter: - * buf = pointer to buffer. - * len = number of bytes to read. - * user = flag, 1: called from userlevel 0: called from kernel. - * card = pointer to card struct. - * Return: - * number of bytes actually transferred. 
- */ -static int -isdnloop_readstatus(u_char __user *buf, int len, isdnloop_card *card) -{ - int count; - u_char __user *p; - - for (p = buf, count = 0; count < len; p++, count++) { - if (card->msg_buf_read == card->msg_buf_write) - return count; - if (put_user(*card->msg_buf_read++, p)) - return -EFAULT; - if (card->msg_buf_read > card->msg_buf_end) - card->msg_buf_read = card->msg_buf; - } - return count; -} - -/* - * Simulate a card's response by appending it to the cards - * message queue. - * - * Parameter: - * card = pointer to card struct. - * s = pointer to message-string. - * ch = channel: 0 = generic messages, 1 and 2 = D-channel messages. - * Return: - * 0 on success, 1 on memory squeeze. - */ -static int -isdnloop_fake(isdnloop_card *card, char *s, int ch) -{ - struct sk_buff *skb; - int len = strlen(s) + ((ch >= 0) ? 3 : 0); - skb = dev_alloc_skb(len); - if (!skb) { - printk(KERN_WARNING "isdnloop: Out of memory in isdnloop_fake\n"); - return 1; - } - if (ch >= 0) - sprintf(skb_put(skb, 3), "%02d;", ch); - skb_put_data(skb, s, strlen(s)); - skb_queue_tail(&card->dqueue, skb); - return 0; -} -/* *INDENT-OFF* */ -static isdnloop_stat isdnloop_cmd_table[] = { - {"BCON_R", 0, 1}, /* B-Channel connect */ - {"BCON_I", 0, 17}, /* B-Channel connect ind */ - {"BDIS_R", 0, 2}, /* B-Channel disconnect */ - {"DDIS_R", 0, 3}, /* D-Channel disconnect */ - {"DCON_R", 0, 16}, /* D-Channel connect */ - {"DSCA_R", 0, 4}, /* Dial 1TR6-SPV */ - {"DCAL_R", 0, 5}, /* Dial */ - {"EAZC", 0, 6}, /* Clear EAZ listener */ - {"EAZ", 0, 7}, /* Set EAZ listener */ - {"SEEAZ", 0, 8}, /* Get EAZ listener */ - {"MSN", 0, 9}, /* Set/Clear MSN listener */ - {"MSALL", 0, 10}, /* Set multi MSN listeners */ - {"SETSIL", 0, 11}, /* Set SI list */ - {"SEESIL", 0, 12}, /* Get SI list */ - {"SILC", 0, 13}, /* Clear SI list */ - {"LOCK", 0, -1}, /* LOCK channel */ - {"UNLOCK", 0, -1}, /* UNLOCK channel */ - {"FV2ON", 1, 14}, /* Leased mode on */ - {"FV2OFF", 1, 15}, /* Leased mode off */ - 
{NULL, 0, -1} -}; -/* *INDENT-ON* */ - - -/* - * Simulate an error-response from a card. - * - * Parameter: - * card = pointer to card struct. - */ -static void -isdnloop_fake_err(isdnloop_card *card) -{ - char buf[64]; - - snprintf(buf, sizeof(buf), "E%s", card->omsg); - isdnloop_fake(card, buf, -1); - isdnloop_fake(card, "NAK", -1); -} - -static u_char ctable_eu[] = {0x00, 0x11, 0x01, 0x12}; -static u_char ctable_1t[] = {0x00, 0x3b, 0x01, 0x3a}; - -/* - * Assemble a simplified cause message depending on the - * D-channel protocol used. - * - * Parameter: - * card = pointer to card struct. - * loc = location: 0 = local, 1 = remote. - * cau = cause: 1 = busy, 2 = nonexistent callerid, 3 = no user responding. - * Return: - * Pointer to buffer containing the assembled message. - */ -static char * -isdnloop_unicause(isdnloop_card *card, int loc, int cau) -{ - static char buf[6]; - - switch (card->ptype) { - case ISDN_PTYPE_EURO: - sprintf(buf, "E%02X%02X", (loc) ? 4 : 2, ctable_eu[cau]); - break; - case ISDN_PTYPE_1TR6: - sprintf(buf, "%02X44", ctable_1t[cau]); - break; - default: - return "0000"; - } - return buf; -} - -/* - * Release a virtual connection. Called from timer interrupt, when - * called party did not respond. - * - * Parameter: - * card = pointer to card struct. - * ch = channel (0-based) - */ -static void -isdnloop_atimeout(isdnloop_card *card, int ch) -{ - unsigned long flags; - char buf[60]; - - spin_lock_irqsave(&card->isdnloop_lock, flags); - if (card->rcard[ch]) { - isdnloop_fake(card->rcard[ch], "DDIS_I", card->rch[ch] + 1); - card->rcard[ch]->rcard[card->rch[ch]] = NULL; - card->rcard[ch] = NULL; - } - isdnloop_fake(card, "DDIS_I", ch + 1); - /* No user responding */ - sprintf(buf, "CAU%s", isdnloop_unicause(card, 1, 3)); - isdnloop_fake(card, buf, ch + 1); - spin_unlock_irqrestore(&card->isdnloop_lock, flags); -} - -/* - * Wrapper for isdnloop_atimeout(). 
- */ -static void -isdnloop_atimeout0(struct timer_list *t) -{ - isdnloop_card *card = from_timer(card, t, c_timer[0]); - - isdnloop_atimeout(card, 0); -} - -/* - * Wrapper for isdnloop_atimeout(). - */ -static void -isdnloop_atimeout1(struct timer_list *t) -{ - isdnloop_card *card = from_timer(card, t, c_timer[1]); - - isdnloop_atimeout(card, 1); -} - -/* - * Install a watchdog for a user, not responding. - * - * Parameter: - * card = pointer to card struct. - * ch = channel to watch for. - */ -static void -isdnloop_start_ctimer(isdnloop_card *card, int ch) -{ - unsigned long flags; - - spin_lock_irqsave(&card->isdnloop_lock, flags); - timer_setup(&card->c_timer[ch], ch ? isdnloop_atimeout1 - : isdnloop_atimeout0, 0); - card->c_timer[ch].expires = jiffies + ISDNLOOP_TIMER_ALERTWAIT; - add_timer(&card->c_timer[ch]); - spin_unlock_irqrestore(&card->isdnloop_lock, flags); -} - -/* - * Kill a pending channel watchdog. - * - * Parameter: - * card = pointer to card struct. - * ch = channel (0-based). - */ -static void -isdnloop_kill_ctimer(isdnloop_card *card, int ch) -{ - unsigned long flags; - - spin_lock_irqsave(&card->isdnloop_lock, flags); - del_timer(&card->c_timer[ch]); - spin_unlock_irqrestore(&card->isdnloop_lock, flags); -} - -static u_char si2bit[] = {0, 1, 0, 0, 0, 2, 0, 4, 0, 0}; -static u_char bit2si[] = {1, 5, 7}; - -/* - * Try finding a listener for an outgoing call. - * - * Parameter: - * card = pointer to calling card. - * p = pointer to ICN-type setup-string. - * lch = channel of calling card. - * cmd = pointer to struct to be filled when parsing setup. - * Return: - * 0 = found match, alerting should happen. - * 1 = found matching number but it is busy. - * 2 = no matching listener. - * 3 = found matching number but SI does not match. 
- */ -static int -isdnloop_try_call(isdnloop_card *card, char *p, int lch, isdn_ctrl *cmd) -{ - isdnloop_card *cc = cards; - unsigned long flags; - int ch; - int num_match; - int i; - char *e; - char nbuf[32]; - - isdnloop_parse_setup(p, cmd); - while (cc) { - for (ch = 0; ch < 2; ch++) { - /* Exclude ourself */ - if ((cc == card) && (ch == lch)) - continue; - num_match = 0; - switch (cc->ptype) { - case ISDN_PTYPE_EURO: - for (i = 0; i < 3; i++) - if (!(strcmp(cc->s0num[i], cmd->parm.setup.phone))) - num_match = 1; - break; - case ISDN_PTYPE_1TR6: - e = cc->eazlist[ch]; - while (*e) { - sprintf(nbuf, "%s%c", cc->s0num[0], *e); - if (!(strcmp(nbuf, cmd->parm.setup.phone))) - num_match = 1; - e++; - } - } - if (num_match) { - spin_lock_irqsave(&card->isdnloop_lock, flags); - /* channel idle? */ - if (!(cc->rcard[ch])) { - /* Check SI */ - if (!(si2bit[cmd->parm.setup.si1] & cc->sil[ch])) { - spin_unlock_irqrestore(&card->isdnloop_lock, flags); - return 3; - } - /* ch is idle, si and number matches */ - cc->rcard[ch] = card; - cc->rch[ch] = lch; - card->rcard[lch] = cc; - card->rch[lch] = ch; - spin_unlock_irqrestore(&card->isdnloop_lock, flags); - return 0; - } else { - spin_unlock_irqrestore(&card->isdnloop_lock, flags); - /* num matches, but busy */ - if (ch == 1) - return 1; - } - } - } - cc = cc->next; - } - return 2; -} - -/* - * Depending on D-channel protocol and caller/called, modify - * phone number. - * - * Parameter: - * card = pointer to card struct. - * phone = pointer phone number. - * caller = flag: 1 = caller, 0 = called. - * Return: - * pointer to new phone number. 
- */ -static char * -isdnloop_vstphone(isdnloop_card *card, char *phone, int caller) -{ - int i; - static char nphone[30]; - - if (!card) { - printk("BUG!!!\n"); - return ""; - } - switch (card->ptype) { - case ISDN_PTYPE_EURO: - if (caller) { - for (i = 0; i < 2; i++) - if (!(strcmp(card->s0num[i], phone))) - return phone; - return card->s0num[0]; - } - return phone; - break; - case ISDN_PTYPE_1TR6: - if (caller) { - sprintf(nphone, "%s%c", card->s0num[0], phone[0]); - return nphone; - } else - return &phone[strlen(phone) - 1]; - break; - } - return ""; -} - -/* - * Parse an ICN-type command string sent to the 'card'. - * Perform misc. actions depending on the command. - * - * Parameter: - * card = pointer to card struct. - */ -static void -isdnloop_parse_cmd(isdnloop_card *card) -{ - char *p = card->omsg; - isdn_ctrl cmd; - char buf[60]; - isdnloop_stat *s = isdnloop_cmd_table; - int action = -1; - int i; - int ch; - - if ((card->omsg[0] != '0') && (card->omsg[2] != ';')) { - isdnloop_fake_err(card); - return; - } - ch = card->omsg[1] - '0'; - if ((ch < 0) || (ch > 2)) { - isdnloop_fake_err(card); - return; - } - p += 3; - while (s->statstr) { - if (!strncmp(p, s->statstr, strlen(s->statstr))) { - action = s->action; - if (s->command && (ch != 0)) { - isdnloop_fake_err(card); - return; - } - break; - } - s++; - } - if (action == -1) - return; - switch (action) { - case 1: - /* 0x;BCON_R */ - if (card->rcard[ch - 1]) { - isdnloop_fake(card->rcard[ch - 1], "BCON_I", - card->rch[ch - 1] + 1); - isdnloop_fake(card, "BCON_C", ch); - } - break; - case 17: - /* 0x;BCON_I */ - if (card->rcard[ch - 1]) { - isdnloop_fake(card->rcard[ch - 1], "BCON_C", - card->rch[ch - 1] + 1); - } - break; - case 2: - /* 0x;BDIS_R */ - isdnloop_fake(card, "BDIS_C", ch); - if (card->rcard[ch - 1]) { - isdnloop_fake(card->rcard[ch - 1], "BDIS_I", - card->rch[ch - 1] + 1); - } - break; - case 16: - /* 0x;DCON_R */ - isdnloop_kill_ctimer(card, ch - 1); - if (card->rcard[ch - 1]) { - 
isdnloop_kill_ctimer(card->rcard[ch - 1], card->rch[ch - 1]); - isdnloop_fake(card->rcard[ch - 1], "DCON_C", - card->rch[ch - 1] + 1); - isdnloop_fake(card, "DCON_C", ch); - } - break; - case 3: - /* 0x;DDIS_R */ - isdnloop_kill_ctimer(card, ch - 1); - if (card->rcard[ch - 1]) { - isdnloop_kill_ctimer(card->rcard[ch - 1], card->rch[ch - 1]); - isdnloop_fake(card->rcard[ch - 1], "DDIS_I", - card->rch[ch - 1] + 1); - card->rcard[ch - 1] = NULL; - } - isdnloop_fake(card, "DDIS_C", ch); - break; - case 4: - /* 0x;DSCA_Rdd,yy,zz,oo */ - if (card->ptype != ISDN_PTYPE_1TR6) { - isdnloop_fake_err(card); - return; - } - /* Fall through */ - case 5: - /* 0x;DCAL_Rdd,yy,zz,oo */ - p += 6; - switch (isdnloop_try_call(card, p, ch - 1, &cmd)) { - case 0: - /* Alerting */ - sprintf(buf, "D%s_I%s,%02d,%02d,%s", - (action == 4) ? "SCA" : "CAL", - isdnloop_vstphone(card, cmd.parm.setup.eazmsn, 1), - cmd.parm.setup.si1, - cmd.parm.setup.si2, - isdnloop_vstphone(card->rcard[ch - 1], - cmd.parm.setup.phone, 0)); - isdnloop_fake(card->rcard[ch - 1], buf, card->rch[ch - 1] + 1); - /* Fall through */ - case 3: - /* si1 does not match, don't alert but start timer */ - isdnloop_start_ctimer(card, ch - 1); - break; - case 1: - /* Remote busy */ - isdnloop_fake(card, "DDIS_I", ch); - sprintf(buf, "CAU%s", isdnloop_unicause(card, 1, 1)); - isdnloop_fake(card, buf, ch); - break; - case 2: - /* No such user */ - isdnloop_fake(card, "DDIS_I", ch); - sprintf(buf, "CAU%s", isdnloop_unicause(card, 1, 2)); - isdnloop_fake(card, buf, ch); - break; - } - break; - case 6: - /* 0x;EAZC */ - card->eazlist[ch - 1][0] = '\0'; - break; - case 7: - /* 0x;EAZ */ - p += 3; - if (strlen(p) >= sizeof(card->eazlist[0])) - break; - strcpy(card->eazlist[ch - 1], p); - break; - case 8: - /* 0x;SEEAZ */ - sprintf(buf, "EAZ-LIST: %s", card->eazlist[ch - 1]); - isdnloop_fake(card, buf, ch + 1); - break; - case 9: - /* 0x;MSN */ - break; - case 10: - /* 0x;MSNALL */ - break; - case 11: - /* 0x;SETSIL */ - p += 6; - i = 
0; - while (strchr("0157", *p)) { - if (i) - card->sil[ch - 1] |= si2bit[*p - '0']; - i = (*p++ == '0'); - } - if (*p) - isdnloop_fake_err(card); - break; - case 12: - /* 0x;SEESIL */ - sprintf(buf, "SIN-LIST: "); - p = buf + 10; - for (i = 0; i < 3; i++) - if (card->sil[ch - 1] & (1 << i)) - p += sprintf(p, "%02d", bit2si[i]); - isdnloop_fake(card, buf, ch + 1); - break; - case 13: - /* 0x;SILC */ - card->sil[ch - 1] = 0; - break; - case 14: - /* 00;FV2ON */ - break; - case 15: - /* 00;FV2OFF */ - break; - } -} - -/* - * Put command-strings into the of the 'card'. In reality, execute them - * right in place by calling isdnloop_parse_cmd(). Also copy every - * command to the read message ringbuffer, preceding it with a '>'. - * These mesagges can be read at /dev/isdnctrl. - * - * Parameter: - * buf = pointer to command buffer. - * len = length of buffer data. - * user = flag: 1 = called form userlevel, 0 called from kernel. - * card = pointer to card struct. - * Return: - * number of bytes transferred (currently always equals len). 
- */ -static int -isdnloop_writecmd(const u_char *buf, int len, int user, isdnloop_card *card) -{ - int xcount = 0; - int ocount = 1; - isdn_ctrl cmd; - - while (len) { - int count = len; - u_char *p; - u_char msg[0x100]; - - if (count > 255) - count = 255; - if (user) { - if (copy_from_user(msg, buf, count)) - return -EFAULT; - } else - memcpy(msg, buf, count); - isdnloop_putmsg(card, '>'); - for (p = msg; count > 0; count--, p++) { - len--; - xcount++; - isdnloop_putmsg(card, *p); - card->omsg[card->optr] = *p; - if (*p == '\n') { - card->omsg[card->optr] = '\0'; - card->optr = 0; - isdnloop_parse_cmd(card); - if (len) { - isdnloop_putmsg(card, '>'); - ocount++; - } - } else { - if (card->optr < 59) - card->optr++; - } - ocount++; - } - } - cmd.command = ISDN_STAT_STAVAIL; - cmd.driver = card->myid; - cmd.arg = ocount; - card->interface.statcallb(&cmd); - return xcount; -} - -/* - * Delete card's pending timers, send STOP to linklevel - */ -static void -isdnloop_stopcard(isdnloop_card *card) -{ - unsigned long flags; - isdn_ctrl cmd; - - spin_lock_irqsave(&card->isdnloop_lock, flags); - if (card->flags & ISDNLOOP_FLAGS_RUNNING) { - card->flags &= ~ISDNLOOP_FLAGS_RUNNING; - del_timer(&card->st_timer); - del_timer(&card->rb_timer); - del_timer(&card->c_timer[0]); - del_timer(&card->c_timer[1]); - cmd.command = ISDN_STAT_STOP; - cmd.driver = card->myid; - card->interface.statcallb(&cmd); - } - spin_unlock_irqrestore(&card->isdnloop_lock, flags); -} - -/* - * Stop all cards before unload. - */ -static void -isdnloop_stopallcards(void) -{ - isdnloop_card *p = cards; - - while (p) { - isdnloop_stopcard(p); - p = p->next; - } -} - -/* - * Start a 'card'. Simulate card's boot message and set the phone - * number(s) of the virtual 'S0-Interface'. Install D-channel - * poll timer. - * - * Parameter: - * card = pointer to card struct. - * sdefp = pointer to struct holding ioctl parameters. - * Return: - * 0 on success, -E??? otherwise. 
- */ -static int -isdnloop_start(isdnloop_card *card, isdnloop_sdef *sdefp) -{ - unsigned long flags; - isdnloop_sdef sdef; - int i; - - if (card->flags & ISDNLOOP_FLAGS_RUNNING) - return -EBUSY; - if (copy_from_user((char *) &sdef, (char *) sdefp, sizeof(sdef))) - return -EFAULT; - - for (i = 0; i < 3; i++) { - if (!memchr(sdef.num[i], 0, sizeof(sdef.num[i]))) - return -EINVAL; - } - - spin_lock_irqsave(&card->isdnloop_lock, flags); - switch (sdef.ptype) { - case ISDN_PTYPE_EURO: - if (isdnloop_fake(card, "DRV1.23EC-Q.931-CAPI-CNS-BASIS-20.02.96", - -1)) { - spin_unlock_irqrestore(&card->isdnloop_lock, flags); - return -ENOMEM; - } - card->sil[0] = card->sil[1] = 4; - if (isdnloop_fake(card, "TEI OK", 0)) { - spin_unlock_irqrestore(&card->isdnloop_lock, flags); - return -ENOMEM; - } - for (i = 0; i < 3; i++) { - strlcpy(card->s0num[i], sdef.num[i], - sizeof(card->s0num[0])); - } - break; - case ISDN_PTYPE_1TR6: - if (isdnloop_fake(card, "DRV1.04TC-1TR6-CAPI-CNS-BASIS-29.11.95", - -1)) { - spin_unlock_irqrestore(&card->isdnloop_lock, flags); - return -ENOMEM; - } - card->sil[0] = card->sil[1] = 4; - if (isdnloop_fake(card, "TEI OK", 0)) { - spin_unlock_irqrestore(&card->isdnloop_lock, flags); - return -ENOMEM; - } - strlcpy(card->s0num[0], sdef.num[0], sizeof(card->s0num[0])); - card->s0num[1][0] = '\0'; - card->s0num[2][0] = '\0'; - break; - default: - spin_unlock_irqrestore(&card->isdnloop_lock, flags); - printk(KERN_WARNING "isdnloop: Illegal D-channel protocol %d\n", - sdef.ptype); - return -EINVAL; - } - timer_setup(&card->rb_timer, isdnloop_pollbchan, 0); - timer_setup(&card->st_timer, isdnloop_polldchan, 0); - card->st_timer.expires = jiffies + ISDNLOOP_TIMER_DCREAD; - add_timer(&card->st_timer); - card->flags |= ISDNLOOP_FLAGS_RUNNING; - spin_unlock_irqrestore(&card->isdnloop_lock, flags); - return 0; -} - -/* - * Main handler for commands sent by linklevel. 
- */ -static int -isdnloop_command(isdn_ctrl *c, isdnloop_card *card) -{ - ulong a; - int i; - char cbuf[80]; - isdn_ctrl cmd; - isdnloop_cdef cdef; - - switch (c->command) { - case ISDN_CMD_IOCTL: - memcpy(&a, c->parm.num, sizeof(ulong)); - switch (c->arg) { - case ISDNLOOP_IOCTL_DEBUGVAR: - return (ulong) card; - case ISDNLOOP_IOCTL_STARTUP: - return isdnloop_start(card, (isdnloop_sdef *) a); - break; - case ISDNLOOP_IOCTL_ADDCARD: - if (copy_from_user((char *)&cdef, - (char *)a, - sizeof(cdef))) - return -EFAULT; - return isdnloop_addcard(cdef.id1); - break; - case ISDNLOOP_IOCTL_LEASEDCFG: - if (a) { - if (!card->leased) { - card->leased = 1; - while (card->ptype == ISDN_PTYPE_UNKNOWN) - schedule_timeout_interruptible(10); - schedule_timeout_interruptible(10); - sprintf(cbuf, "00;FV2ON\n01;EAZ1\n02;EAZ2\n"); - i = isdnloop_writecmd(cbuf, strlen(cbuf), 0, card); - printk(KERN_INFO - "isdnloop: (%s) Leased-line mode enabled\n", - CID); - cmd.command = ISDN_STAT_RUN; - cmd.driver = card->myid; - cmd.arg = 0; - card->interface.statcallb(&cmd); - } - } else { - if (card->leased) { - card->leased = 0; - sprintf(cbuf, "00;FV2OFF\n"); - i = isdnloop_writecmd(cbuf, strlen(cbuf), 0, card); - printk(KERN_INFO - "isdnloop: (%s) Leased-line mode disabled\n", - CID); - cmd.command = ISDN_STAT_RUN; - cmd.driver = card->myid; - cmd.arg = 0; - card->interface.statcallb(&cmd); - } - } - return 0; - default: - return -EINVAL; - } - break; - case ISDN_CMD_DIAL: - if (!(card->flags & ISDNLOOP_FLAGS_RUNNING)) - return -ENODEV; - if (card->leased) - break; - if ((c->arg & 255) < ISDNLOOP_BCH) { - char *p; - char dcode[4]; - - a = c->arg; - p = c->parm.setup.phone; - if (*p == 's' || *p == 'S') { - /* Dial for SPV */ - p++; - strcpy(dcode, "SCA"); - } else - /* Normal Dial */ - strcpy(dcode, "CAL"); - snprintf(cbuf, sizeof(cbuf), - "%02d;D%s_R%s,%02d,%02d,%s\n", (int) (a + 1), - dcode, p, c->parm.setup.si1, - c->parm.setup.si2, c->parm.setup.eazmsn); - i = isdnloop_writecmd(cbuf, 
strlen(cbuf), 0, card); - } - break; - case ISDN_CMD_ACCEPTD: - if (!(card->flags & ISDNLOOP_FLAGS_RUNNING)) - return -ENODEV; - if (c->arg < ISDNLOOP_BCH) { - a = c->arg + 1; - cbuf[0] = 0; - switch (card->l2_proto[a - 1]) { - case ISDN_PROTO_L2_X75I: - sprintf(cbuf, "%02d;BX75\n", (int) a); - break; -#ifdef CONFIG_ISDN_X25 - case ISDN_PROTO_L2_X25DTE: - sprintf(cbuf, "%02d;BX2T\n", (int) a); - break; - case ISDN_PROTO_L2_X25DCE: - sprintf(cbuf, "%02d;BX2C\n", (int) a); - break; -#endif - case ISDN_PROTO_L2_HDLC: - sprintf(cbuf, "%02d;BTRA\n", (int) a); - break; - } - if (strlen(cbuf)) - i = isdnloop_writecmd(cbuf, strlen(cbuf), 0, card); - sprintf(cbuf, "%02d;DCON_R\n", (int) a); - i = isdnloop_writecmd(cbuf, strlen(cbuf), 0, card); - } - break; - case ISDN_CMD_ACCEPTB: - if (!(card->flags & ISDNLOOP_FLAGS_RUNNING)) - return -ENODEV; - if (c->arg < ISDNLOOP_BCH) { - a = c->arg + 1; - switch (card->l2_proto[a - 1]) { - case ISDN_PROTO_L2_X75I: - sprintf(cbuf, "%02d;BCON_R,BX75\n", (int) a); - break; -#ifdef CONFIG_ISDN_X25 - case ISDN_PROTO_L2_X25DTE: - sprintf(cbuf, "%02d;BCON_R,BX2T\n", (int) a); - break; - case ISDN_PROTO_L2_X25DCE: - sprintf(cbuf, "%02d;BCON_R,BX2C\n", (int) a); - break; -#endif - case ISDN_PROTO_L2_HDLC: - sprintf(cbuf, "%02d;BCON_R,BTRA\n", (int) a); - break; - default: - sprintf(cbuf, "%02d;BCON_R\n", (int) a); - } - printk(KERN_DEBUG "isdnloop writecmd '%s'\n", cbuf); - i = isdnloop_writecmd(cbuf, strlen(cbuf), 0, card); - break; - case ISDN_CMD_HANGUP: - if (!(card->flags & ISDNLOOP_FLAGS_RUNNING)) - return -ENODEV; - if (c->arg < ISDNLOOP_BCH) { - a = c->arg + 1; - sprintf(cbuf, "%02d;BDIS_R\n%02d;DDIS_R\n", (int) a, (int) a); - i = isdnloop_writecmd(cbuf, strlen(cbuf), 0, card); - } - break; - case ISDN_CMD_SETEAZ: - if (!(card->flags & ISDNLOOP_FLAGS_RUNNING)) - return -ENODEV; - if (card->leased) - break; - if (c->arg < ISDNLOOP_BCH) { - a = c->arg + 1; - if (card->ptype == ISDN_PTYPE_EURO) { - sprintf(cbuf, "%02d;MS%s%s\n", (int) a, 
- c->parm.num[0] ? "N" : "ALL", c->parm.num); - } else - sprintf(cbuf, "%02d;EAZ%s\n", (int) a, - c->parm.num[0] ? c->parm.num : (u_char *) "0123456789"); - i = isdnloop_writecmd(cbuf, strlen(cbuf), 0, card); - } - break; - case ISDN_CMD_CLREAZ: - if (!(card->flags & ISDNLOOP_FLAGS_RUNNING)) - return -ENODEV; - if (card->leased) - break; - if (c->arg < ISDNLOOP_BCH) { - a = c->arg + 1; - if (card->ptype == ISDN_PTYPE_EURO) - sprintf(cbuf, "%02d;MSNC\n", (int) a); - else - sprintf(cbuf, "%02d;EAZC\n", (int) a); - i = isdnloop_writecmd(cbuf, strlen(cbuf), 0, card); - } - break; - case ISDN_CMD_SETL2: - if (!(card->flags & ISDNLOOP_FLAGS_RUNNING)) - return -ENODEV; - if ((c->arg & 255) < ISDNLOOP_BCH) { - a = c->arg; - switch (a >> 8) { - case ISDN_PROTO_L2_X75I: - sprintf(cbuf, "%02d;BX75\n", (int) (a & 255) + 1); - break; -#ifdef CONFIG_ISDN_X25 - case ISDN_PROTO_L2_X25DTE: - sprintf(cbuf, "%02d;BX2T\n", (int) (a & 255) + 1); - break; - case ISDN_PROTO_L2_X25DCE: - sprintf(cbuf, "%02d;BX2C\n", (int) (a & 255) + 1); - break; -#endif - case ISDN_PROTO_L2_HDLC: - sprintf(cbuf, "%02d;BTRA\n", (int) (a & 255) + 1); - break; - case ISDN_PROTO_L2_TRANS: - sprintf(cbuf, "%02d;BTRA\n", (int) (a & 255) + 1); - break; - default: - return -EINVAL; - } - i = isdnloop_writecmd(cbuf, strlen(cbuf), 0, card); - card->l2_proto[a & 255] = (a >> 8); - } - break; - case ISDN_CMD_SETL3: - if (!(card->flags & ISDNLOOP_FLAGS_RUNNING)) - return -ENODEV; - return 0; - default: - return -EINVAL; - } - } - return 0; -} - -/* - * Find card with given driverId - */ -static inline isdnloop_card * -isdnloop_findcard(int driverid) -{ - isdnloop_card *p = cards; - - while (p) { - if (p->myid == driverid) - return p; - p = p->next; - } - return (isdnloop_card *) 0; -} - -/* - * Wrapper functions for interface to linklevel - */ -static int -if_command(isdn_ctrl *c) -{ - isdnloop_card *card = isdnloop_findcard(c->driver); - - if (card) - return isdnloop_command(c, card); - printk(KERN_ERR - "isdnloop: 
if_command called with invalid driverId!\n"); - return -ENODEV; -} - -static int -if_writecmd(const u_char __user *buf, int len, int id, int channel) -{ - isdnloop_card *card = isdnloop_findcard(id); - - if (card) { - if (!(card->flags & ISDNLOOP_FLAGS_RUNNING)) - return -ENODEV; - return isdnloop_writecmd(buf, len, 1, card); - } - printk(KERN_ERR - "isdnloop: if_writecmd called with invalid driverId!\n"); - return -ENODEV; -} - -static int -if_readstatus(u_char __user *buf, int len, int id, int channel) -{ - isdnloop_card *card = isdnloop_findcard(id); - - if (card) { - if (!(card->flags & ISDNLOOP_FLAGS_RUNNING)) - return -ENODEV; - return isdnloop_readstatus(buf, len, card); - } - printk(KERN_ERR - "isdnloop: if_readstatus called with invalid driverId!\n"); - return -ENODEV; -} - -static int -if_sendbuf(int id, int channel, int ack, struct sk_buff *skb) -{ - isdnloop_card *card = isdnloop_findcard(id); - - if (card) { - if (!(card->flags & ISDNLOOP_FLAGS_RUNNING)) - return -ENODEV; - /* ack request stored in skb scratch area */ - *(skb->head) = ack; - return isdnloop_sendbuf(channel, skb, card); - } - printk(KERN_ERR - "isdnloop: if_sendbuf called with invalid driverId!\n"); - return -ENODEV; -} - -/* - * Allocate a new card-struct, initialize it - * link it into cards-list and register it at linklevel. 
- */ -static isdnloop_card * -isdnloop_initcard(char *id) -{ - isdnloop_card *card; - int i; - card = kzalloc(sizeof(isdnloop_card), GFP_KERNEL); - if (!card) { - printk(KERN_WARNING - "isdnloop: (%s) Could not allocate card-struct.\n", id); - return (isdnloop_card *) 0; - } - card->interface.owner = THIS_MODULE; - card->interface.channels = ISDNLOOP_BCH; - card->interface.hl_hdrlen = 1; /* scratch area for storing ack flag*/ - card->interface.maxbufsize = 4000; - card->interface.command = if_command; - card->interface.writebuf_skb = if_sendbuf; - card->interface.writecmd = if_writecmd; - card->interface.readstat = if_readstatus; - card->interface.features = ISDN_FEATURE_L2_X75I | -#ifdef CONFIG_ISDN_X25 - ISDN_FEATURE_L2_X25DTE | - ISDN_FEATURE_L2_X25DCE | -#endif - ISDN_FEATURE_L2_HDLC | - ISDN_FEATURE_L3_TRANS | - ISDN_FEATURE_P_UNKNOWN; - card->ptype = ISDN_PTYPE_UNKNOWN; - strlcpy(card->interface.id, id, sizeof(card->interface.id)); - card->msg_buf_write = card->msg_buf; - card->msg_buf_read = card->msg_buf; - card->msg_buf_end = &card->msg_buf[sizeof(card->msg_buf) - 1]; - for (i = 0; i < ISDNLOOP_BCH; i++) { - card->l2_proto[i] = ISDN_PROTO_L2_X75I; - skb_queue_head_init(&card->bqueue[i]); - } - skb_queue_head_init(&card->dqueue); - spin_lock_init(&card->isdnloop_lock); - card->next = cards; - cards = card; - if (!register_isdn(&card->interface)) { - cards = cards->next; - printk(KERN_WARNING - "isdnloop: Unable to register %s\n", id); - kfree(card); - return (isdnloop_card *) 0; - } - card->myid = card->interface.channels; - return card; -} - -static int -isdnloop_addcard(char *id1) -{ - isdnloop_card *card; - card = isdnloop_initcard(id1); - if (!card) { - return -EIO; - } - printk(KERN_INFO - "isdnloop: (%s) virtual card added\n", - card->interface.id); - return 0; -} - -static int __init -isdnloop_init(void) -{ - if (isdnloop_id) - return isdnloop_addcard(isdnloop_id); - - return 0; -} - -static void __exit -isdnloop_exit(void) -{ - isdn_ctrl cmd; - 
isdnloop_card *card = cards; - isdnloop_card *last; - int i; - - isdnloop_stopallcards(); - while (card) { - cmd.command = ISDN_STAT_UNLOAD; - cmd.driver = card->myid; - card->interface.statcallb(&cmd); - for (i = 0; i < ISDNLOOP_BCH; i++) - isdnloop_free_queue(card, i); - card = card->next; - } - card = cards; - while (card) { - last = card; - skb_queue_purge(&card->dqueue); - card = card->next; - kfree(last); - } - printk(KERN_NOTICE "isdnloop-ISDN-driver unloaded\n"); -} - -module_init(isdnloop_init); -module_exit(isdnloop_exit); diff --git a/drivers/isdn/isdnloop/isdnloop.h b/drivers/isdn/isdnloop/isdnloop.h deleted file mode 100644 index e9e035552bb4..000000000000 --- a/drivers/isdn/isdnloop/isdnloop.h +++ /dev/null @@ -1,112 +0,0 @@ -/* $Id: isdnloop.h,v 1.5.6.3 2001/09/23 22:24:56 kai Exp $ - * - * Loopback lowlevel module for testing of linklevel. - * - * Copyright 1997 by Fritz Elfert (fritz@isdn4linux.de) - * - * This software may be used and distributed according to the terms - * of the GNU General Public License, incorporated herein by reference. 
- * - */ - -#ifndef isdnloop_h -#define isdnloop_h - -#define ISDNLOOP_IOCTL_DEBUGVAR 0 -#define ISDNLOOP_IOCTL_ADDCARD 1 -#define ISDNLOOP_IOCTL_LEASEDCFG 2 -#define ISDNLOOP_IOCTL_STARTUP 3 - -/* Struct for adding new cards */ -typedef struct isdnloop_cdef { - char id1[10]; -} isdnloop_cdef; - -/* Struct for configuring cards */ -typedef struct isdnloop_sdef { - int ptype; - char num[3][20]; -} isdnloop_sdef; - -#if defined(__KERNEL__) || defined(__DEBUGVAR__) - -#ifdef __KERNEL__ -/* Kernel includes */ - -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -#endif /* __KERNEL__ */ - -#define ISDNLOOP_FLAGS_B1ACTIVE 1 /* B-Channel-1 is open */ -#define ISDNLOOP_FLAGS_B2ACTIVE 2 /* B-Channel-2 is open */ -#define ISDNLOOP_FLAGS_RUNNING 4 /* Cards driver activated */ -#define ISDNLOOP_FLAGS_RBTIMER 8 /* scheduling of B-Channel-poll */ -#define ISDNLOOP_TIMER_BCREAD 1 /* B-Channel poll-cycle */ -#define ISDNLOOP_TIMER_DCREAD (HZ/2) /* D-Channel poll-cycle */ -#define ISDNLOOP_TIMER_ALERTWAIT (10 * HZ) /* Alert timeout */ -#define ISDNLOOP_MAX_SQUEUE 65536 /* Max. outstanding send-data */ -#define ISDNLOOP_BCH 2 /* channels per card */ - -/* - * Per card driver data - */ -typedef struct isdnloop_card { - struct isdnloop_card *next; /* Pointer to next device struct */ - struct isdnloop_card - *rcard[ISDNLOOP_BCH]; /* Pointer to 'remote' card */ - int rch[ISDNLOOP_BCH]; /* 'remote' channel */ - int myid; /* Driver-Nr. 
assigned by linklevel */ - int leased; /* Flag: This Adapter is connected */ - /* to a leased line */ - int sil[ISDNLOOP_BCH]; /* SI's to listen for */ - char eazlist[ISDNLOOP_BCH][11]; - /* EAZ's to listen for */ - char s0num[3][20]; /* 1TR6 base-number or MSN's */ - unsigned short flags; /* Statusflags */ - int ptype; /* Protocol type (1TR6 or Euro) */ - struct timer_list st_timer; /* Timer for Status-Polls */ - struct timer_list rb_timer; /* Timer for B-Channel-Polls */ - struct timer_list - c_timer[ISDNLOOP_BCH]; /* Timer for Alerting */ - int l2_proto[ISDNLOOP_BCH]; /* Current layer-2-protocol */ - isdn_if interface; /* Interface to upper layer */ - int iptr; /* Index to imsg-buffer */ - char imsg[60]; /* Internal buf for status-parsing */ - int optr; /* Index to omsg-buffer */ - char omsg[60]; /* Internal buf for cmd-parsing */ - char msg_buf[2048]; /* Buffer for status-messages */ - char *msg_buf_write; /* Writepointer for statusbuffer */ - char *msg_buf_read; /* Readpointer for statusbuffer */ - char *msg_buf_end; /* Pointer to end of statusbuffer */ - int sndcount[ISDNLOOP_BCH]; /* Byte-counters for B-Ch.-send */ - struct sk_buff_head - bqueue[ISDNLOOP_BCH]; /* B-Channel queues */ - struct sk_buff_head dqueue; /* D-Channel queue */ - spinlock_t isdnloop_lock; -} isdnloop_card; - -/* - * Main driver data - */ -#ifdef __KERNEL__ -static isdnloop_card *cards = (isdnloop_card *) 0; -#endif /* __KERNEL__ */ - -/* Utility-Macros */ - -#define CID (card->interface.id) - -#endif /* defined(__KERNEL__) || defined(__DEBUGVAR__) */ -#endif /* isdnloop_h */ diff --git a/include/linux/concap.h b/include/linux/concap.h deleted file mode 100644 index 977acb3d1fb2..000000000000 --- a/include/linux/concap.h +++ /dev/null @@ -1,112 +0,0 @@ -/* $Id: concap.h,v 1.3.2.2 2004/01/12 23:08:35 keil Exp $ - * - * Copyright 1997 by Henner Eisen - * - * This software may be used and distributed according to the terms - * of the GNU General Public License, incorporated herein by 
reference. - */ - -#ifndef _LINUX_CONCAP_H -#define _LINUX_CONCAP_H - -#include -#include - -/* Stuff to support encapsulation protocols genericly. The encapsulation - protocol is processed at the uppermost layer of the network interface. - - Based on a ideas developed in a 'synchronous device' thread in the - linux-x25 mailing list contributed by Alan Cox, Thomasz Motylewski - and Jonathan Naylor. - - For more documetation on this refer to Documentation/isdn/README.concap -*/ - -struct concap_proto_ops; -struct concap_device_ops; - -/* this manages all data needed by the encapsulation protocol - */ -struct concap_proto{ - struct net_device *net_dev; /* net device using our service */ - struct concap_device_ops *dops; /* callbacks provided by device */ - struct concap_proto_ops *pops; /* callbacks provided by us */ - spinlock_t lock; - int flags; - void *proto_data; /* protocol specific private data, to - be accessed via *pops methods only*/ - /* - : - whatever - : - */ -}; - -/* Operations to be supported by the net device. Called by the encapsulation - * protocol entity. No receive method is offered because the encapsulation - * protocol directly calls netif_rx(). - */ -struct concap_device_ops{ - - /* to request data is submitted by device*/ - int (*data_req)(struct concap_proto *, struct sk_buff *); - - /* Control methods must be set to NULL by devices which do not - support connection control.*/ - /* to request a connection is set up */ - int (*connect_req)(struct concap_proto *); - - /* to request a connection is released */ - int (*disconn_req)(struct concap_proto *); -}; - -/* Operations to be supported by the encapsulation protocol. Called by - * device driver. - */ -struct concap_proto_ops{ - - /* create a new encapsulation protocol instance of same type */ - struct concap_proto * (*proto_new) (void); - - /* delete encapsulation protocol instance and free all its resources. 
- cprot may no loger be referenced after calling this */ - void (*proto_del)(struct concap_proto *cprot); - - /* initialize the protocol's data. To be called at interface startup - or when the device driver resets the interface. All services of the - encapsulation protocol may be used after this*/ - int (*restart)(struct concap_proto *cprot, - struct net_device *ndev, - struct concap_device_ops *dops); - - /* inactivate an encapsulation protocol instance. The encapsulation - protocol may not call any *dops methods after this. */ - int (*close)(struct concap_proto *cprot); - - /* process a frame handed down to us by upper layer */ - int (*encap_and_xmit)(struct concap_proto *cprot, struct sk_buff *skb); - - /* to be called for each data entity received from lower layer*/ - int (*data_ind)(struct concap_proto *cprot, struct sk_buff *skb); - - /* to be called when a connection was set up/down. - Protocols that don't process these primitives might fill in - dummy methods here */ - int (*connect_ind)(struct concap_proto *cprot); - int (*disconn_ind)(struct concap_proto *cprot); - /* - Some network device support functions, like net_header(), rebuild_header(), - and others, that depend solely on the encapsulation protocol, might - be provided here, too. The net device would just fill them in its - corresponding fields when it is opened. - */ -}; - -/* dummy restart/close/connect/reset/disconn methods - */ -extern int concap_nop(struct concap_proto *cprot); - -/* dummy submit method - */ -extern int concap_drop_skb(struct concap_proto *cprot, struct sk_buff *skb); -#endif diff --git a/include/linux/isdn.h b/include/linux/isdn.h deleted file mode 100644 index df97c8444f5d..000000000000 --- a/include/linux/isdn.h +++ /dev/null @@ -1,473 +0,0 @@ -/* $Id: isdn.h,v 1.125.2.3 2004/02/10 01:07:14 keil Exp $ - * - * Main header for the Linux ISDN subsystem (linklevel). 
- * - * Copyright 1994,95,96 by Fritz Elfert (fritz@isdn4linux.de) - * Copyright 1995,96 by Thinking Objects Software GmbH Wuerzburg - * Copyright 1995,96 by Michael Hipp (Michael.Hipp@student.uni-tuebingen.de) - * - * This software may be used and distributed according to the terms - * of the GNU General Public License, incorporated herein by reference. - * - */ -#ifndef __ISDN_H__ -#define __ISDN_H__ - - -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -#define ISDN_TTY_MAJOR 43 -#define ISDN_TTYAUX_MAJOR 44 -#define ISDN_MAJOR 45 - -/* The minor-devicenumbers for Channel 0 and 1 are used as arguments for - * physical Channel-Mapping, so they MUST NOT be changed without changing - * the correspondent code in isdn.c - */ - -#define ISDN_MINOR_B 0 -#define ISDN_MINOR_BMAX (ISDN_MAX_CHANNELS-1) -#define ISDN_MINOR_CTRL 64 -#define ISDN_MINOR_CTRLMAX (64 + (ISDN_MAX_CHANNELS-1)) -#define ISDN_MINOR_PPP 128 -#define ISDN_MINOR_PPPMAX (128 + (ISDN_MAX_CHANNELS-1)) -#define ISDN_MINOR_STATUS 255 - -#ifdef CONFIG_ISDN_PPP - -#ifdef CONFIG_ISDN_PPP_VJ -# include -#endif - -#include -#include - -#include -#endif - -#ifdef CONFIG_ISDN_X25 -# include -#endif - -#include - -#define ISDN_DRVIOCTL_MASK 0x7f /* Mask for Device-ioctl */ - -/* Until now unused */ -#define ISDN_SERVICE_VOICE 1 -#define ISDN_SERVICE_AB 1<<1 -#define ISDN_SERVICE_X21 1<<2 -#define ISDN_SERVICE_G4 1<<3 -#define ISDN_SERVICE_BTX 1<<4 -#define ISDN_SERVICE_DFUE 1<<5 -#define ISDN_SERVICE_X25 1<<6 -#define ISDN_SERVICE_TTX 1<<7 -#define ISDN_SERVICE_MIXED 1<<8 -#define ISDN_SERVICE_FW 1<<9 -#define ISDN_SERVICE_GTEL 1<<10 -#define ISDN_SERVICE_BTXN 1<<11 -#define ISDN_SERVICE_BTEL 1<<12 - -/* Macros checking plain usage */ -#define USG_NONE(x) ((x & ISDN_USAGE_MASK)==ISDN_USAGE_NONE) -#define USG_RAW(x) ((x & 
ISDN_USAGE_MASK)==ISDN_USAGE_RAW) -#define USG_MODEM(x) ((x & ISDN_USAGE_MASK)==ISDN_USAGE_MODEM) -#define USG_VOICE(x) ((x & ISDN_USAGE_MASK)==ISDN_USAGE_VOICE) -#define USG_NET(x) ((x & ISDN_USAGE_MASK)==ISDN_USAGE_NET) -#define USG_FAX(x) ((x & ISDN_USAGE_MASK)==ISDN_USAGE_FAX) -#define USG_OUTGOING(x) ((x & ISDN_USAGE_OUTGOING)==ISDN_USAGE_OUTGOING) -#define USG_MODEMORVOICE(x) (((x & ISDN_USAGE_MASK)==ISDN_USAGE_MODEM) || \ - ((x & ISDN_USAGE_MASK)==ISDN_USAGE_VOICE) ) - -/* Timer-delays and scheduling-flags */ -#define ISDN_TIMER_RES 4 /* Main Timer-Resolution */ -#define ISDN_TIMER_02SEC (HZ/ISDN_TIMER_RES/5) /* Slow-Timer1 .2 sec */ -#define ISDN_TIMER_1SEC (HZ/ISDN_TIMER_RES) /* Slow-Timer2 1 sec */ -#define ISDN_TIMER_RINGING 5 /* tty RINGs = ISDN_TIMER_1SEC * this factor */ -#define ISDN_TIMER_KEEPINT 10 /* Cisco-Keepalive = ISDN_TIMER_1SEC * this factor */ -#define ISDN_TIMER_MODEMREAD 1 -#define ISDN_TIMER_MODEMPLUS 2 -#define ISDN_TIMER_MODEMRING 4 -#define ISDN_TIMER_MODEMXMIT 8 -#define ISDN_TIMER_NETDIAL 16 -#define ISDN_TIMER_NETHANGUP 32 -#define ISDN_TIMER_CARRIER 256 /* Wait for Carrier */ -#define ISDN_TIMER_FAST (ISDN_TIMER_MODEMREAD | ISDN_TIMER_MODEMPLUS | \ - ISDN_TIMER_MODEMXMIT) -#define ISDN_TIMER_SLOW (ISDN_TIMER_MODEMRING | ISDN_TIMER_NETHANGUP | \ - ISDN_TIMER_NETDIAL | ISDN_TIMER_CARRIER) - -/* Timeout-Values for isdn_net_dial() */ -#define ISDN_TIMER_DTIMEOUT10 (10*HZ/(ISDN_TIMER_02SEC*(ISDN_TIMER_RES+1))) -#define ISDN_TIMER_DTIMEOUT15 (15*HZ/(ISDN_TIMER_02SEC*(ISDN_TIMER_RES+1))) -#define ISDN_TIMER_DTIMEOUT60 (60*HZ/(ISDN_TIMER_02SEC*(ISDN_TIMER_RES+1))) - -/* GLOBAL_FLAGS */ -#define ISDN_GLOBAL_STOPPED 1 - -/*=================== Start of ip-over-ISDN stuff =========================*/ - -/* Feature- and status-flags for a net-interface */ -#define ISDN_NET_CONNECTED 0x01 /* Bound to ISDN-Channel */ -#define ISDN_NET_SECURE 0x02 /* Accept calls from phonelist only */ -#define ISDN_NET_CALLBACK 0x04 /* activate callback */ 
-#define ISDN_NET_CBHUP 0x08 /* hangup before callback */ -#define ISDN_NET_CBOUT 0x10 /* remote machine does callback */ - -#define ISDN_NET_MAGIC 0x49344C02 /* for paranoia-checking */ - -/* Phone-list-element */ -typedef struct { - void *next; - char num[ISDN_MSNLEN]; -} isdn_net_phone; - -/* - Principles when extending structures for generic encapsulation protocol - ("concap") support: - - Stuff which is hardware specific (here i4l-specific) goes in - the netdev -> local structure (here: isdn_net_local) - - Stuff which is encapsulation protocol specific goes in the structure - which holds the linux device structure (here: isdn_net_device) -*/ - -/* Local interface-data */ -typedef struct isdn_net_local_s { - ulong magic; - struct net_device_stats stats; /* Ethernet Statistics */ - int isdn_device; /* Index to isdn-device */ - int isdn_channel; /* Index to isdn-channel */ - int ppp_slot; /* PPPD device slot number */ - int pre_device; /* Preselected isdn-device */ - int pre_channel; /* Preselected isdn-channel */ - int exclusive; /* If non-zero idx to reserved chan.*/ - int flags; /* Connection-flags */ - int dialretry; /* Counter for Dialout-retries */ - int dialmax; /* Max. 
Number of Dial-retries */ - int cbdelay; /* Delay before Callback starts */ - int dtimer; /* Timeout-counter for dialing */ - char msn[ISDN_MSNLEN]; /* MSNs/EAZs for this interface */ - u_char cbhup; /* Flag: Reject Call before Callback*/ - u_char dialstate; /* State for dialing */ - u_char p_encap; /* Packet encapsulation */ - /* 0 = Ethernet over ISDN */ - /* 1 = RAW-IP */ - /* 2 = IP with type field */ - u_char l2_proto; /* Layer-2-protocol */ - /* See ISDN_PROTO_L2..-constants in */ - /* isdnif.h */ - /* 0 = X75/LAPB with I-Frames */ - /* 1 = X75/LAPB with UI-Frames */ - /* 2 = X75/LAPB with BUI-Frames */ - /* 3 = HDLC */ - u_char l3_proto; /* Layer-3-protocol */ - /* See ISDN_PROTO_L3..-constants in */ - /* isdnif.h */ - /* 0 = Transparent */ - int huptimer; /* Timeout-counter for auto-hangup */ - int charge; /* Counter for charging units */ - ulong chargetime; /* Timer for Charging info */ - int hupflags; /* Flags for charge-unit-hangup: */ - /* bit0: chargeint is invalid */ - /* bit1: Getting charge-interval */ - /* bit2: Do charge-unit-hangup */ - /* bit3: Do hangup even on incoming */ - int outgoing; /* Flag: outgoing call */ - int onhtime; /* Time to keep link up */ - int chargeint; /* Interval between charge-infos */ - int onum; /* Flag: at least 1 outgoing number */ - int cps; /* current speed of this interface */ - int transcount; /* byte-counter for cps-calculation */ - int sqfull; /* Flag: netdev-queue overloaded */ - ulong sqfull_stamp; /* Start-Time of overload */ - ulong slavedelay; /* Dynamic bundling delaytime */ - int triggercps; /* BogoCPS needed for trigger slave */ - isdn_net_phone *phone[2]; /* List of remote-phonenumbers */ - /* phone[0] = Incoming Numbers */ - /* phone[1] = Outgoing Numbers */ - isdn_net_phone *dial; /* Pointer to dialed number */ - struct net_device *master; /* Ptr to Master device for slaves */ - struct net_device *slave; /* Ptr to Slave device for masters */ - struct isdn_net_local_s *next; /* Ptr to next link in 
bundle */ - struct isdn_net_local_s *last; /* Ptr to last link in bundle */ - struct isdn_net_dev_s *netdev; /* Ptr to netdev */ - struct sk_buff_head super_tx_queue; /* List of supervisory frames to */ - /* be transmitted asap */ - atomic_t frame_cnt; /* number of frames currently */ - /* queued in HL driver */ - /* Ptr to orig. hard_header_cache */ - spinlock_t xmit_lock; /* used to protect the xmit path of */ - /* a particular channel (including */ - /* the frame_cnt */ - - int pppbind; /* ippp device for bindings */ - int dialtimeout; /* How long shall we try on dialing? (jiffies) */ - int dialwait; /* How long shall we wait after failed attempt? (jiffies) */ - ulong dialstarted; /* jiffies of first dialing-attempt */ - ulong dialwait_timer; /* jiffies of earliest next dialing-attempt */ - int huptimeout; /* How long will the connection be up? (seconds) */ -#ifdef CONFIG_ISDN_X25 - struct concap_device_ops *dops; /* callbacks used by encapsulator */ -#endif - /* use an own struct for that in later versions */ - ulong cisco_myseq; /* Local keepalive seq. for Cisco */ - ulong cisco_mineseen; /* returned keepalive seq. from remote */ - ulong cisco_yourseq; /* Remote keepalive seq. 
for Cisco */ - int cisco_keepalive_period; /* keepalive period */ - ulong cisco_last_slarp_in; /* jiffie of last keepalive packet we received */ - char cisco_line_state; /* state of line according to keepalive packets */ - char cisco_debserint; /* debugging flag of cisco hdlc with slarp */ - struct timer_list cisco_timer; - struct work_struct tqueue; -} isdn_net_local; - -/* the interface itself */ -typedef struct isdn_net_dev_s { - isdn_net_local *local; - isdn_net_local *queue; /* circular list of all bundled - channels, which are currently - online */ - spinlock_t queue_lock; /* lock to protect queue */ - void *next; /* Pointer to next isdn-interface */ - struct net_device *dev; /* interface to upper levels */ -#ifdef CONFIG_ISDN_PPP - ippp_bundle * pb; /* pointer to the common bundle structure - * with the per-bundle data */ -#endif -#ifdef CONFIG_ISDN_X25 - struct concap_proto *cprot; /* connection oriented encapsulation protocol */ -#endif - -} isdn_net_dev; - -/*===================== End of ip-over-ISDN stuff ===========================*/ - -/*======================= Start of ISDN-tty stuff ===========================*/ - -#define ISDN_ASYNC_MAGIC 0x49344C01 /* for paranoia-checking */ -#define ISDN_SERIAL_XMIT_SIZE 1024 /* Default bufsize for write */ -#define ISDN_SERIAL_XMIT_MAX 4000 /* Maximum bufsize for write */ - -#ifdef CONFIG_ISDN_AUDIO -/* For using sk_buffs with audio we need some private variables - * within each sk_buff. For this purpose, we declare a struct here, - * and put it always at the private skb->cb data array. A few macros help - * accessing the variables. 
- */ -typedef struct _isdn_audio_data { - unsigned short dle_count; - unsigned char lock; -} isdn_audio_data_t; - -#define ISDN_AUDIO_SKB_DLECOUNT(skb) (((isdn_audio_data_t *)&skb->cb[0])->dle_count) -#define ISDN_AUDIO_SKB_LOCK(skb) (((isdn_audio_data_t *)&skb->cb[0])->lock) -#endif - -/* Private data of AT-command-interpreter */ -typedef struct atemu { - u_char profile[ISDN_MODEM_NUMREG]; /* Modem-Regs. Profile 0 */ - u_char mdmreg[ISDN_MODEM_NUMREG]; /* Modem-Registers */ - char pmsn[ISDN_MSNLEN]; /* EAZ/MSNs Profile 0 */ - char msn[ISDN_MSNLEN]; /* EAZ/MSN */ - char plmsn[ISDN_LMSNLEN]; /* Listening MSNs Profile 0 */ - char lmsn[ISDN_LMSNLEN]; /* Listening MSNs */ - char cpn[ISDN_MSNLEN]; /* CalledPartyNumber on incoming call */ - char connmsg[ISDN_CMSGLEN]; /* CONNECT-Msg from HL-Driver */ -#ifdef CONFIG_ISDN_AUDIO - u_char vpar[10]; /* Voice-parameters */ - int lastDLE; /* Flag for voice-coding: DLE seen */ -#endif - int mdmcmdl; /* Length of Modem-Commandbuffer */ - int pluscount; /* Counter for +++ sequence */ - u_long lastplus; /* Timestamp of last + */ - int carrierwait; /* Seconds of carrier waiting */ - char mdmcmd[255]; /* Modem-Commandbuffer */ - unsigned int charge; /* Charge units of current connection */ -} atemu; - -/* Private data (similar to async_struct in ) */ -typedef struct modem_info { - int magic; - struct tty_port port; - int x_char; /* xon/xoff character */ - int mcr; /* Modem control register */ - int msr; /* Modem status register */ - int lsr; /* Line status register */ - int line; - int online; /* 1 = B-Channel is up, drop data */ - /* 2 = B-Channel is up, deliver d.*/ - int dialing; /* Dial in progress or ATA */ - int closing; - int rcvsched; /* Receive needs schedule */ - int isdn_driver; /* Index to isdn-driver */ - int isdn_channel; /* Index to isdn-channel */ - int drv_index; /* Index to dev->usage */ - int ncarrier; /* Flag: schedule NO CARRIER */ - unsigned char last_cause[8]; /* Last cause message */ - unsigned char 
last_num[ISDN_MSNLEN]; - /* Last phone-number */ - unsigned char last_l2; /* Last layer-2 protocol */ - unsigned char last_si; /* Last service */ - unsigned char last_lhup; /* Last hangup local? */ - unsigned char last_dir; /* Last direction (in or out) */ - struct timer_list nc_timer; /* Timer for delayed NO CARRIER */ - int send_outstanding;/* # of outstanding send-requests */ - int xmit_size; /* max. # of chars in xmit_buf */ - int xmit_count; /* # of chars in xmit_buf */ - struct sk_buff_head xmit_queue; /* transmit queue */ - atomic_t xmit_lock; /* Semaphore for isdn_tty_write */ -#ifdef CONFIG_ISDN_AUDIO - int vonline; /* Voice-channel status */ - /* Bit 0 = recording */ - /* Bit 1 = playback */ - /* Bit 2 = playback, DLE-ETX seen */ - struct sk_buff_head dtmf_queue; /* queue for dtmf results */ - void *adpcms; /* state for adpcm decompression */ - void *adpcmr; /* state for adpcm compression */ - void *dtmf_state; /* state for dtmf decoder */ - void *silence_state; /* state for silence detection */ -#endif -#ifdef CONFIG_ISDN_TTY_FAX - struct T30_s *fax; /* T30 Fax Group 3 data/interface */ - int faxonline; /* Fax-channel status */ -#endif - atemu emu; /* AT-emulator data */ - spinlock_t readlock; -} modem_info; - -#define ISDN_MODEM_WINSIZE 8 - -/* Description of one ISDN-tty */ -typedef struct _isdn_modem { - int refcount; /* Number of opens */ - struct tty_driver *tty_modem; /* tty-device */ - struct tty_struct *modem_table[ISDN_MAX_CHANNELS]; /* ?? 
copied from Orig */ - struct ktermios *modem_termios[ISDN_MAX_CHANNELS]; - struct ktermios *modem_termios_locked[ISDN_MAX_CHANNELS]; - modem_info info[ISDN_MAX_CHANNELS]; /* Private data */ -} isdn_modem_t; - -/*======================= End of ISDN-tty stuff ============================*/ - -/*======================== Start of V.110 stuff ============================*/ -#define V110_BUFSIZE 1024 - -typedef struct { - int nbytes; /* 1 Matrixbyte -> nbytes in stream */ - int nbits; /* Number of used bits in streambyte */ - unsigned char key; /* Bitmask in stream eg. 11 (nbits=2) */ - int decodelen; /* Amount of data in decodebuf */ - int SyncInit; /* Number of sync frames to send */ - unsigned char *OnlineFrame; /* Precalculated V110 idle frame */ - unsigned char *OfflineFrame; /* Precalculated V110 sync Frame */ - int framelen; /* Length of frames */ - int skbuser; /* Number of unacked userdata skbs */ - int skbidle; /* Number of unacked idle/sync skbs */ - int introducer; /* Local vars for decoder */ - int dbit; - unsigned char b; - int skbres; /* space to reserve in outgoing skb */ - int maxsize; /* maxbufsize of lowlevel driver */ - unsigned char *encodebuf; /* temporary buffer for encoding */ - unsigned char decodebuf[V110_BUFSIZE]; /* incomplete V110 matrices */ -} isdn_v110_stream; - -/*========================= End of V.110 stuff =============================*/ - -/*======================= Start of general stuff ===========================*/ - -typedef struct { - char *next; - char *private; -} infostruct; - -#define DRV_FLAG_RUNNING 1 -#define DRV_FLAG_REJBUS 2 -#define DRV_FLAG_LOADED 4 - -/* Description of hardware-level-driver */ -typedef struct _isdn_driver { - ulong online; /* Channel-Online flags */ - ulong flags; /* Misc driver Flags */ - int locks; /* Number of locks for this driver */ - int channels; /* Number of channels */ - wait_queue_head_t st_waitq; /* Wait-Queue for status-read's */ - int maxbufsize; /* Maximum Buffersize supported */ - 
unsigned long pktcount; /* Until now: unused */ - int stavail; /* Chars avail on Status-device */ - isdn_if *interface; /* Interface to driver */ - int *rcverr; /* Error-counters for B-Ch.-receive */ - int *rcvcount; /* Byte-counters for B-Ch.-receive */ -#ifdef CONFIG_ISDN_AUDIO - unsigned long DLEflag; /* Flags: Insert DLE at next read */ -#endif - struct sk_buff_head *rpqueue; /* Pointers to start of Rcv-Queue */ - wait_queue_head_t *rcv_waitq; /* Wait-Queues for B-Channel-Reads */ - wait_queue_head_t *snd_waitq; /* Wait-Queue for B-Channel-Send's */ - char msn2eaz[10][ISDN_MSNLEN]; /* Mapping-Table MSN->EAZ */ -} isdn_driver_t; - -/* Main driver-data */ -typedef struct isdn_devt { - struct module *owner; - spinlock_t lock; - unsigned short flags; /* Bitmapped Flags: */ - int drivers; /* Current number of drivers */ - int channels; /* Current number of channels */ - int net_verbose; /* Verbose-Flag */ - int modempoll; /* Flag: tty-read active */ - spinlock_t timerlock; - int tflags; /* Timer-Flags: */ - /* see ISDN_TIMER_..defines */ - int global_flags; - infostruct *infochain; /* List of open info-devs. */ - wait_queue_head_t info_waitq; /* Wait-Queue for isdninfo */ - struct timer_list timer; /* Misc.-function Timer */ - int chanmap[ISDN_MAX_CHANNELS]; /* Map minor->device-channel */ - int drvmap[ISDN_MAX_CHANNELS]; /* Map minor->driver-index */ - int usage[ISDN_MAX_CHANNELS]; /* Used by tty/ip/voice */ - char num[ISDN_MAX_CHANNELS][ISDN_MSNLEN]; - /* Remote number of active ch.*/ - int m_idx[ISDN_MAX_CHANNELS]; /* Index for mdm.... 
*/ - isdn_driver_t *drv[ISDN_MAX_DRIVERS]; /* Array of drivers */ - isdn_net_dev *netdev; /* Linked list of net-if's */ - char drvid[ISDN_MAX_DRIVERS][20];/* Driver-ID */ - struct task_struct *profd; /* For iprofd */ - isdn_modem_t mdm; /* tty-driver-data */ - isdn_net_dev *rx_netdev[ISDN_MAX_CHANNELS]; /* rx netdev-pointers */ - isdn_net_dev *st_netdev[ISDN_MAX_CHANNELS]; /* stat netdev-pointers */ - ulong ibytes[ISDN_MAX_CHANNELS]; /* Statistics incoming bytes */ - ulong obytes[ISDN_MAX_CHANNELS]; /* Statistics outgoing bytes */ - int v110emu[ISDN_MAX_CHANNELS]; /* V.110 emulator-mode 0=none */ - atomic_t v110use[ISDN_MAX_CHANNELS]; /* Usage-Semaphore for stream */ - isdn_v110_stream *v110[ISDN_MAX_CHANNELS]; /* V.110 private data */ - struct mutex mtx; /* serialize list access*/ - unsigned long global_features; -} isdn_dev; - -extern isdn_dev *dev; - - -#endif /* __ISDN_H__ */ diff --git a/include/linux/isdn_divertif.h b/include/linux/isdn_divertif.h deleted file mode 100644 index 19ab361f9f07..000000000000 --- a/include/linux/isdn_divertif.h +++ /dev/null @@ -1,35 +0,0 @@ -/* $Id: isdn_divertif.h,v 1.4.6.1 2001/09/23 22:25:05 kai Exp $ - * - * Header for the diversion supplementary interface for i4l. - * - * Author Werner Cornelius (werner@titro.de) - * Copyright by Werner Cornelius (werner@titro.de) - * - * This software may be used and distributed according to the terms - * of the GNU General Public License, incorporated herein by reference. 
- * - */ -#ifndef _LINUX_ISDN_DIVERTIF_H -#define _LINUX_ISDN_DIVERTIF_H - -#include -#include -#include - -/***************************************************************/ -/* structure exchanging data between isdn hl and divert module */ -/***************************************************************/ -typedef struct - { ulong if_magic; /* magic info and version */ - int cmd; /* command */ - int (*stat_callback)(isdn_ctrl *); /* supplied by divert module when calling */ - int (*ll_cmd)(isdn_ctrl *); /* supplied by hl on return */ - char * (*drv_to_name)(int); /* map a driver id to name, supplied by hl */ - int (*name_to_drv)(char *); /* map a driver id to name, supplied by hl */ - } isdn_divert_if; - -/*********************/ -/* function register */ -/*********************/ -extern int DIVERT_REG_NAME(isdn_divert_if *); -#endif /* _LINUX_ISDN_DIVERTIF_H */ diff --git a/include/linux/isdn_ppp.h b/include/linux/isdn_ppp.h deleted file mode 100644 index a0070c6dfaf8..000000000000 --- a/include/linux/isdn_ppp.h +++ /dev/null @@ -1,194 +0,0 @@ -/* Linux ISDN subsystem, sync PPP, interface to ipppd - * - * Copyright 1994-1999 by Fritz Elfert (fritz@isdn4linux.de) - * Copyright 1995,96 Thinking Objects Software GmbH Wuerzburg - * Copyright 1995,96 by Michael Hipp (Michael.Hipp@student.uni-tuebingen.de) - * Copyright 2000-2002 by Kai Germaschewski (kai@germaschewski.name) - * - * This software may be used and distributed according to the terms - * of the GNU General Public License, incorporated herein by reference. - * - */ -#ifndef _LINUX_ISDN_PPP_H -#define _LINUX_ISDN_PPP_H - - - - -#ifdef CONFIG_IPPP_FILTER -#include -#endif -#include - -#define DECOMP_ERR_NOMEM (-10) - -#define MP_END_FRAG 0x40 -#define MP_BEGIN_FRAG 0x80 - -#define MP_MAX_QUEUE_LEN 16 - -/* - * We need a way for the decompressor to influence the generation of CCP - * Reset-Requests in a variety of ways. 
The decompressor is already returning - * a lot of information (generated skb length, error conditions) so we use - * another parameter. This parameter is a pointer to a structure which is - * to be marked valid by the decompressor and only in this case is ever used. - * Furthermore, the only case where this data is used is when the decom- - * pressor returns DECOMP_ERROR. - * - * We use this same struct for the reset entry of the compressor to commu- - * nicate to its caller how to deal with sending of a Reset Ack. In this - * case, expra is not used, but other options still apply (suppressing - * sending with rsend, appending arbitrary data, etc). - */ - -#define IPPP_RESET_MAXDATABYTES 32 - -struct isdn_ppp_resetparams { - unsigned char valid:1; /* rw Is this structure filled at all ? */ - unsigned char rsend:1; /* rw Should we send one at all ? */ - unsigned char idval:1; /* rw Is the id field valid ? */ - unsigned char dtval:1; /* rw Is the data field valid ? */ - unsigned char expra:1; /* rw Is an Ack expected for this Req ? */ - unsigned char id; /* wo Send CCP ResetReq with this id */ - unsigned short maxdlen; /* ro Max bytes to be stored in data field */ - unsigned short dlen; /* rw Bytes stored in data field */ - unsigned char *data; /* wo Data for ResetReq info field */ -}; - -/* - * this is an 'old friend' from ppp-comp.h under a new name - * check the original include for more information - */ -struct isdn_ppp_compressor { - struct isdn_ppp_compressor *next, *prev; - struct module *owner; - int num; /* CCP compression protocol number */ - - void *(*alloc) (struct isdn_ppp_comp_data *); - void (*free) (void *state); - int (*init) (void *state, struct isdn_ppp_comp_data *, - int unit,int debug); - - /* The reset entry needs to get more exact information about the - ResetReq or ResetAck it was called with. The parameters are - obvious. If reset is called without a Req or Ack frame which - could be handed into it, code MUST be set to 0. 
Using rsparm, - the reset entry can control if and how a ResetAck is returned. */ - - void (*reset) (void *state, unsigned char code, unsigned char id, - unsigned char *data, unsigned len, - struct isdn_ppp_resetparams *rsparm); - - int (*compress) (void *state, struct sk_buff *in, - struct sk_buff *skb_out, int proto); - - int (*decompress) (void *state,struct sk_buff *in, - struct sk_buff *skb_out, - struct isdn_ppp_resetparams *rsparm); - - void (*incomp) (void *state, struct sk_buff *in,int proto); - void (*stat) (void *state, struct compstat *stats); -}; - -extern int isdn_ppp_register_compressor(struct isdn_ppp_compressor *); -extern int isdn_ppp_unregister_compressor(struct isdn_ppp_compressor *); -extern int isdn_ppp_dial_slave(char *); -extern int isdn_ppp_hangup_slave(char *); - -typedef struct { - unsigned long seqerrs; - unsigned long frame_drops; - unsigned long overflows; - unsigned long max_queue_len; -} isdn_mppp_stats; - -typedef struct { - int mp_mrru; /* unused */ - struct sk_buff * frags; /* fragments sl list -- use skb->next */ - long frames; /* number of frames in the frame list */ - unsigned int seq; /* last processed packet seq #: any packets - * with smaller seq # will be dropped - * unconditionally */ - spinlock_t lock; - int ref_ct; - /* statistics */ - isdn_mppp_stats stats; -} ippp_bundle; - -#define NUM_RCV_BUFFS 64 - -struct ippp_buf_queue { - struct ippp_buf_queue *next; - struct ippp_buf_queue *last; - char *buf; /* NULL here indicates end of queue */ - int len; -}; - -/* The data structure for one CCP reset transaction */ -enum ippp_ccp_reset_states { - CCPResetIdle, - CCPResetSentReq, - CCPResetRcvdReq, - CCPResetSentAck, - CCPResetRcvdAck -}; - -struct ippp_ccp_reset_state { - enum ippp_ccp_reset_states state; /* State of this transaction */ - struct ippp_struct *is; /* Backlink to device stuff */ - unsigned char id; /* Backlink id index */ - unsigned char ta:1; /* The timer is active (flag) */ - unsigned char expra:1; /* We 
expect a ResetAck at all */ - int dlen; /* Databytes stored in data */ - struct timer_list timer; /* For timeouts/retries */ - /* This is a hack but seems sufficient for the moment. We do not want - to have this be yet another allocation for some bytes, it is more - memory management overhead than the whole mess is worth. */ - unsigned char data[IPPP_RESET_MAXDATABYTES]; -}; - -/* The data structure keeping track of the currently outstanding CCP Reset - transactions. */ -struct ippp_ccp_reset { - struct ippp_ccp_reset_state *rs[256]; /* One per possible id */ - unsigned char lastid; /* Last id allocated by the engine */ -}; - -struct ippp_struct { - struct ippp_struct *next_link; - int state; - spinlock_t buflock; - struct ippp_buf_queue rq[NUM_RCV_BUFFS]; /* packet queue for isdn_ppp_read() */ - struct ippp_buf_queue *first; /* pointer to (current) first packet */ - struct ippp_buf_queue *last; /* pointer to (current) last used packet in queue */ - wait_queue_head_t wq; - struct task_struct *tk; - unsigned int mpppcfg; - unsigned int pppcfg; - unsigned int mru; - unsigned int mpmru; - unsigned int mpmtu; - unsigned int maxcid; - struct isdn_net_local_s *lp; - int unit; - int minor; - unsigned int last_link_seqno; - long mp_seqno; -#ifdef CONFIG_ISDN_PPP_VJ - unsigned char *cbuf; - struct slcompress *slcomp; -#endif -#ifdef CONFIG_IPPP_FILTER - struct bpf_prog *pass_filter; /* filter for packets to pass */ - struct bpf_prog *active_filter; /* filter for pkts to reset idle */ -#endif - unsigned long debug; - struct isdn_ppp_compressor *compressor,*decompressor; - struct isdn_ppp_compressor *link_compressor,*link_decompressor; - void *decomp_stat,*comp_stat,*link_decomp_stat,*link_comp_stat; - struct ippp_ccp_reset *reset; /* Allocated on demand, may never be needed */ - unsigned long compflags; -}; - -#endif /* _LINUX_ISDN_PPP_H */ diff --git a/include/linux/isdnif.h b/include/linux/isdnif.h deleted file mode 100644 index 8d80fdc68647..000000000000 --- 
a/include/linux/isdnif.h +++ /dev/null @@ -1,505 +0,0 @@ -/* $Id: isdnif.h,v 1.43.2.2 2004/01/12 23:08:35 keil Exp $ - * - * Linux ISDN subsystem - * Definition of the interface between the subsystem and its low-level drivers. - * - * Copyright 1994,95,96 by Fritz Elfert (fritz@isdn4linux.de) - * Copyright 1995,96 Thinking Objects Software GmbH Wuerzburg - * - * This software may be used and distributed according to the terms - * of the GNU General Public License, incorporated herein by reference. - * - */ -#ifndef __ISDNIF_H__ -#define __ISDNIF_H__ - - -#include -#include - -/***************************************************************************/ -/* Extensions made by Werner Cornelius (werner@ikt.de) */ -/* */ -/* The proceed command holds a incoming call in a state to leave processes */ -/* enough time to check whether ist should be accepted. */ -/* The PROT_IO Command extends the interface to make protocol dependent */ -/* features available (call diversion, call waiting...). */ -/* */ -/* The PROT_IO Command is executed with the desired driver id and the arg */ -/* parameter coded as follows: */ -/* The lower 8 bits of arg contain the desired protocol from ISDN_PTYPE */ -/* definitions. The upper 24 bits represent the protocol specific cmd/stat.*/ -/* Any additional data is protocol and command specific. */ -/* This mechanism also applies to the statcallb callback STAT_PROT. */ -/* */ -/* This suggested extension permits an easy expansion of protocol specific */ -/* handling. Extensions may be added at any time without changing the HL */ -/* driver code and not getting conflicts without certifications. */ -/* The well known CAPI 2.0 interface handles such extensions in a similar */ -/* way. Perhaps a protocol specific module may be added and separately */ -/* loaded and linked to the basic isdn module for handling. 
*/ -/***************************************************************************/ - -/*****************/ -/* DSS1 commands */ -/*****************/ -#define DSS1_CMD_INVOKE ((0x00 << 8) | ISDN_PTYPE_EURO) /* invoke a supplementary service */ -#define DSS1_CMD_INVOKE_ABORT ((0x01 << 8) | ISDN_PTYPE_EURO) /* abort a invoke cmd */ - -/*******************************/ -/* DSS1 Status callback values */ -/*******************************/ -#define DSS1_STAT_INVOKE_RES ((0x80 << 8) | ISDN_PTYPE_EURO) /* Result for invocation */ -#define DSS1_STAT_INVOKE_ERR ((0x81 << 8) | ISDN_PTYPE_EURO) /* Error Return for invocation */ -#define DSS1_STAT_INVOKE_BRD ((0x82 << 8) | ISDN_PTYPE_EURO) /* Deliver invoke broadcast info */ - - -/*********************************************************************/ -/* structures for DSS1 commands and callback */ -/* */ -/* An action is invoked by sending a DSS1_CMD_INVOKE. The ll_id, proc*/ -/* timeout, datalen and data fields must be set before calling. */ -/* */ -/* The return value is a positive hl_id value also delivered in the */ -/* hl_id field. A value of zero signals no more left hl_id capacitys.*/ -/* A negative return value signals errors in LL. So if the return */ -/* value is <= 0 no action in LL will be taken -> request ignored */ -/* */ -/* The timeout field must be filled with a positive value specifying */ -/* the amount of time the INVOKED process waits for a reaction from */ -/* the network. */ -/* If a response (either error or result) is received during this */ -/* intervall, a reporting callback is initiated and the process will */ -/* be deleted, the hl identifier will be freed. */ -/* If no response is received during the specified intervall, a error*/ -/* callback is initiated with timeout set to -1 and a datalen set */ -/* to 0. */ -/* If timeout is set to a value <= 0 during INVOCATION the process is*/ -/* immediately deleted after sending the data. No callback occurs ! 
*/ -/* */ -/* A currently waiting process may be aborted with INVOKE_ABORT. No */ -/* callback will occur when a process has been aborted. */ -/* */ -/* Broadcast invoke frames from the network are reported via the */ -/* STAT_INVOKE_BRD callback. The ll_id is set to 0, the other fields */ -/* are supplied by the network and not by the HL. */ -/*********************************************************************/ - -/*****************/ -/* NI1 commands */ -/*****************/ -#define NI1_CMD_INVOKE ((0x00 << 8) | ISDN_PTYPE_NI1) /* invoke a supplementary service */ -#define NI1_CMD_INVOKE_ABORT ((0x01 << 8) | ISDN_PTYPE_NI1) /* abort a invoke cmd */ - -/*******************************/ -/* NI1 Status callback values */ -/*******************************/ -#define NI1_STAT_INVOKE_RES ((0x80 << 8) | ISDN_PTYPE_NI1) /* Result for invocation */ -#define NI1_STAT_INVOKE_ERR ((0x81 << 8) | ISDN_PTYPE_NI1) /* Error Return for invocation */ -#define NI1_STAT_INVOKE_BRD ((0x82 << 8) | ISDN_PTYPE_NI1) /* Deliver invoke broadcast info */ - -typedef struct - { ulong ll_id; /* ID supplied by LL when executing */ - /* a command and returned by HL for */ - /* INVOKE_RES and INVOKE_ERR */ - int hl_id; /* ID supplied by HL when called */ - /* for executing a cmd and delivered */ - /* for results and errors */ - /* must be supplied by LL when aborting*/ - int proc; /* invoke procedure used by CMD_INVOKE */ - /* returned by callback and broadcast */ - int timeout; /* timeout for INVOKE CMD in ms */ - /* -1 in stat callback when timed out */ - /* error value when error callback */ - int datalen; /* length of cmd or stat data */ - u_char *data;/* pointer to data delivered or send */ - } isdn_cmd_stat; - -/* - * Commands from linklevel to lowlevel - * - */ -#define ISDN_CMD_IOCTL 0 /* Perform ioctl */ -#define ISDN_CMD_DIAL 1 /* Dial out */ -#define ISDN_CMD_ACCEPTD 2 /* Accept an incoming call on D-Chan. */ -#define ISDN_CMD_ACCEPTB 3 /* Request B-Channel connect. 
*/ -#define ISDN_CMD_HANGUP 4 /* Hangup */ -#define ISDN_CMD_CLREAZ 5 /* Clear EAZ(s) of channel */ -#define ISDN_CMD_SETEAZ 6 /* Set EAZ(s) of channel */ -#define ISDN_CMD_GETEAZ 7 /* Get EAZ(s) of channel */ -#define ISDN_CMD_SETSIL 8 /* Set Service-Indicator-List of channel */ -#define ISDN_CMD_GETSIL 9 /* Get Service-Indicator-List of channel */ -#define ISDN_CMD_SETL2 10 /* Set B-Chan. Layer2-Parameter */ -#define ISDN_CMD_GETL2 11 /* Get B-Chan. Layer2-Parameter */ -#define ISDN_CMD_SETL3 12 /* Set B-Chan. Layer3-Parameter */ -#define ISDN_CMD_GETL3 13 /* Get B-Chan. Layer3-Parameter */ -// #define ISDN_CMD_LOCK 14 /* Signal usage by upper levels */ -// #define ISDN_CMD_UNLOCK 15 /* Release usage-lock */ -#define ISDN_CMD_SUSPEND 16 /* Suspend connection */ -#define ISDN_CMD_RESUME 17 /* Resume connection */ -#define ISDN_CMD_PROCEED 18 /* Proceed with call establishment */ -#define ISDN_CMD_ALERT 19 /* Alert after Proceeding */ -#define ISDN_CMD_REDIR 20 /* Redir a incoming call */ -#define ISDN_CMD_PROT_IO 21 /* Protocol specific commands */ -#define CAPI_PUT_MESSAGE 22 /* CAPI message send down or up */ -#define ISDN_CMD_FAXCMD 23 /* FAX commands to HL-driver */ -#define ISDN_CMD_AUDIO 24 /* DSP, DTMF, ... settings */ - -/* - * Status-Values delivered from lowlevel to linklevel via - * statcallb(). 
- * - */ -#define ISDN_STAT_STAVAIL 256 /* Raw status-data available */ -#define ISDN_STAT_ICALL 257 /* Incoming call detected */ -#define ISDN_STAT_RUN 258 /* Signal protocol-code is running */ -#define ISDN_STAT_STOP 259 /* Signal halt of protocol-code */ -#define ISDN_STAT_DCONN 260 /* Signal D-Channel connect */ -#define ISDN_STAT_BCONN 261 /* Signal B-Channel connect */ -#define ISDN_STAT_DHUP 262 /* Signal D-Channel disconnect */ -#define ISDN_STAT_BHUP 263 /* Signal B-Channel disconnect */ -#define ISDN_STAT_CINF 264 /* Charge-Info */ -#define ISDN_STAT_LOAD 265 /* Signal new lowlevel-driver is loaded */ -#define ISDN_STAT_UNLOAD 266 /* Signal unload of lowlevel-driver */ -#define ISDN_STAT_BSENT 267 /* Signal packet sent */ -#define ISDN_STAT_NODCH 268 /* Signal no D-Channel */ -#define ISDN_STAT_ADDCH 269 /* Add more Channels */ -#define ISDN_STAT_CAUSE 270 /* Cause-Message */ -#define ISDN_STAT_ICALLW 271 /* Incoming call without B-chan waiting */ -#define ISDN_STAT_REDIR 272 /* Redir result */ -#define ISDN_STAT_PROT 273 /* protocol IO specific callback */ -#define ISDN_STAT_DISPLAY 274 /* deliver a received display message */ -#define ISDN_STAT_L1ERR 275 /* Signal Layer-1 Error */ -#define ISDN_STAT_FAXIND 276 /* FAX indications from HL-driver */ -#define ISDN_STAT_AUDIO 277 /* DTMF, DSP indications */ -#define ISDN_STAT_DISCH 278 /* Disable/Enable channel usage */ - -/* - * Audio commands - */ -#define ISDN_AUDIO_SETDD 0 /* Set DTMF detection */ -#define ISDN_AUDIO_DTMF 1 /* Rx/Tx DTMF */ - -/* - * Values for errcode field - */ -#define ISDN_STAT_L1ERR_SEND 1 -#define ISDN_STAT_L1ERR_RECV 2 - -/* - * Values for feature-field of interface-struct. 
- */ -/* Layer 2 */ -#define ISDN_FEATURE_L2_X75I (0x0001 << ISDN_PROTO_L2_X75I) -#define ISDN_FEATURE_L2_X75UI (0x0001 << ISDN_PROTO_L2_X75UI) -#define ISDN_FEATURE_L2_X75BUI (0x0001 << ISDN_PROTO_L2_X75BUI) -#define ISDN_FEATURE_L2_HDLC (0x0001 << ISDN_PROTO_L2_HDLC) -#define ISDN_FEATURE_L2_TRANS (0x0001 << ISDN_PROTO_L2_TRANS) -#define ISDN_FEATURE_L2_X25DTE (0x0001 << ISDN_PROTO_L2_X25DTE) -#define ISDN_FEATURE_L2_X25DCE (0x0001 << ISDN_PROTO_L2_X25DCE) -#define ISDN_FEATURE_L2_V11096 (0x0001 << ISDN_PROTO_L2_V11096) -#define ISDN_FEATURE_L2_V11019 (0x0001 << ISDN_PROTO_L2_V11019) -#define ISDN_FEATURE_L2_V11038 (0x0001 << ISDN_PROTO_L2_V11038) -#define ISDN_FEATURE_L2_MODEM (0x0001 << ISDN_PROTO_L2_MODEM) -#define ISDN_FEATURE_L2_FAX (0x0001 << ISDN_PROTO_L2_FAX) -#define ISDN_FEATURE_L2_HDLC_56K (0x0001 << ISDN_PROTO_L2_HDLC_56K) - -#define ISDN_FEATURE_L2_MASK (0x0FFFF) /* Max. 16 protocols */ -#define ISDN_FEATURE_L2_SHIFT (0) - -/* Layer 3 */ -#define ISDN_FEATURE_L3_TRANS (0x10000 << ISDN_PROTO_L3_TRANS) -#define ISDN_FEATURE_L3_TRANSDSP (0x10000 << ISDN_PROTO_L3_TRANSDSP) -#define ISDN_FEATURE_L3_FCLASS2 (0x10000 << ISDN_PROTO_L3_FCLASS2) -#define ISDN_FEATURE_L3_FCLASS1 (0x10000 << ISDN_PROTO_L3_FCLASS1) - -#define ISDN_FEATURE_L3_MASK (0x0FF0000) /* Max. 8 Protocols */ -#define ISDN_FEATURE_L3_SHIFT (16) - -/* Signaling */ -#define ISDN_FEATURE_P_UNKNOWN (0x1000000 << ISDN_PTYPE_UNKNOWN) -#define ISDN_FEATURE_P_1TR6 (0x1000000 << ISDN_PTYPE_1TR6) -#define ISDN_FEATURE_P_EURO (0x1000000 << ISDN_PTYPE_EURO) -#define ISDN_FEATURE_P_NI1 (0x1000000 << ISDN_PTYPE_NI1) - -#define ISDN_FEATURE_P_MASK (0x0FF000000) /* Max. 
8 Protocols */ -#define ISDN_FEATURE_P_SHIFT (24) - -typedef struct setup_parm { - unsigned char phone[32]; /* Remote Phone-Number */ - unsigned char eazmsn[32]; /* Local EAZ or MSN */ - unsigned char si1; /* Service Indicator 1 */ - unsigned char si2; /* Service Indicator 2 */ - unsigned char plan; /* Numbering plan */ - unsigned char screen; /* Screening info */ -} setup_parm; - - -#ifdef CONFIG_ISDN_TTY_FAX -/* T.30 Fax G3 */ - -#define FAXIDLEN 21 - -typedef struct T30_s { - /* session parameters */ - __u8 resolution; - __u8 rate; - __u8 width; - __u8 length; - __u8 compression; - __u8 ecm; - __u8 binary; - __u8 scantime; - __u8 id[FAXIDLEN]; - /* additional parameters */ - __u8 phase; - __u8 direction; - __u8 code; - __u8 badlin; - __u8 badmul; - __u8 bor; - __u8 fet; - __u8 pollid[FAXIDLEN]; - __u8 cq; - __u8 cr; - __u8 ctcrty; - __u8 minsp; - __u8 phcto; - __u8 rel; - __u8 nbc; - /* remote station parameters */ - __u8 r_resolution; - __u8 r_rate; - __u8 r_width; - __u8 r_length; - __u8 r_compression; - __u8 r_ecm; - __u8 r_binary; - __u8 r_scantime; - __u8 r_id[FAXIDLEN]; - __u8 r_code; -} __packed T30_s; - -#define ISDN_TTY_FAX_CONN_IN 0 -#define ISDN_TTY_FAX_CONN_OUT 1 - -#define ISDN_TTY_FAX_FCON 0 -#define ISDN_TTY_FAX_DIS 1 -#define ISDN_TTY_FAX_FTT 2 -#define ISDN_TTY_FAX_MCF 3 -#define ISDN_TTY_FAX_DCS 4 -#define ISDN_TTY_FAX_TRAIN_OK 5 -#define ISDN_TTY_FAX_EOP 6 -#define ISDN_TTY_FAX_EOM 7 -#define ISDN_TTY_FAX_MPS 8 -#define ISDN_TTY_FAX_DTC 9 -#define ISDN_TTY_FAX_RID 10 -#define ISDN_TTY_FAX_HNG 11 -#define ISDN_TTY_FAX_DT 12 -#define ISDN_TTY_FAX_FCON_I 13 -#define ISDN_TTY_FAX_DR 14 -#define ISDN_TTY_FAX_ET 15 -#define ISDN_TTY_FAX_CFR 16 -#define ISDN_TTY_FAX_PTS 17 -#define ISDN_TTY_FAX_SENT 18 - -#define ISDN_FAX_PHASE_IDLE 0 -#define ISDN_FAX_PHASE_A 1 -#define ISDN_FAX_PHASE_B 2 -#define ISDN_FAX_PHASE_C 3 -#define ISDN_FAX_PHASE_D 4 -#define ISDN_FAX_PHASE_E 5 - -#endif /* TTY_FAX */ - -#define ISDN_FAX_CLASS1_FAE 0 -#define 
ISDN_FAX_CLASS1_FTS 1 -#define ISDN_FAX_CLASS1_FRS 2 -#define ISDN_FAX_CLASS1_FTM 3 -#define ISDN_FAX_CLASS1_FRM 4 -#define ISDN_FAX_CLASS1_FTH 5 -#define ISDN_FAX_CLASS1_FRH 6 -#define ISDN_FAX_CLASS1_CTRL 7 - -#define ISDN_FAX_CLASS1_OK 0 -#define ISDN_FAX_CLASS1_CONNECT 1 -#define ISDN_FAX_CLASS1_NOCARR 2 -#define ISDN_FAX_CLASS1_ERROR 3 -#define ISDN_FAX_CLASS1_FCERROR 4 -#define ISDN_FAX_CLASS1_QUERY 5 - -typedef struct { - __u8 cmd; - __u8 subcmd; - __u8 para[50]; -} aux_s; - -#define AT_COMMAND 0 -#define AT_EQ_VALUE 1 -#define AT_QUERY 2 -#define AT_EQ_QUERY 3 - -/* CAPI structs */ - -/* this is compatible to the old union size */ -#define MAX_CAPI_PARA_LEN 50 - -typedef struct { - /* Header */ - __u16 Length; - __u16 ApplId; - __u8 Command; - __u8 Subcommand; - __u16 Messagenumber; - - /* Parameter */ - union { - __u32 Controller; - __u32 PLCI; - __u32 NCCI; - } adr; - __u8 para[MAX_CAPI_PARA_LEN]; -} capi_msg; - -/* - * Structure for exchanging above infos - * - */ -typedef struct { - int driver; /* Lowlevel-Driver-ID */ - int command; /* Command or Status (see above) */ - ulong arg; /* Additional Data */ - union { - ulong errcode; /* Type of error with STAT_L1ERR */ - int length; /* Amount of bytes sent with STAT_BSENT */ - u_char num[50]; /* Additional Data */ - setup_parm setup;/* For SETUP msg */ - capi_msg cmsg; /* For CAPI like messages */ - char display[85];/* display message data */ - isdn_cmd_stat isdn_io; /* ISDN IO-parameter/result */ - aux_s aux; /* for modem commands/indications */ -#ifdef CONFIG_ISDN_TTY_FAX - T30_s *fax; /* Pointer to ttys fax struct */ -#endif - ulong userdata; /* User Data */ - } parm; -} isdn_ctrl; - -#define dss1_io isdn_io -#define ni1_io isdn_io - -/* - * The interface-struct itself (initialized at load-time of lowlevel-driver) - * - * See Documentation/isdn/INTERFACE for a description, how the communication - * between the ISDN subsystem and its drivers is done. 
- * - */ -typedef struct { - struct module *owner; - - /* Number of channels supported by this driver - */ - int channels; - - /* - * Maximum Size of transmit/receive-buffer this driver supports. - */ - int maxbufsize; - - /* Feature-Flags for this driver. - * See defines ISDN_FEATURE_... for Values - */ - unsigned long features; - - /* - * Needed for calculating - * dev->hard_header_len = linklayer header + hl_hdrlen; - * Drivers, not supporting sk_buff's should set this to 0. - */ - unsigned short hl_hdrlen; - - /* - * Receive-Callback using sk_buff's - * Parameters: - * int Driver-ID - * int local channel-number (0 ...) - * struct sk_buff *skb received Data - */ - void (*rcvcallb_skb)(int, int, struct sk_buff *); - - /* Status-Callback - * Parameters: - * isdn_ctrl* - * driver = Driver ID. - * command = One of above ISDN_STAT_... constants. - * arg = depending on status-type. - * num = depending on status-type. - */ - int (*statcallb)(isdn_ctrl*); - - /* Send command - * Parameters: - * isdn_ctrl* - * driver = Driver ID. - * command = One of above ISDN_CMD_... constants. - * arg = depending on command. - * num = depending on command. - */ - int (*command)(isdn_ctrl*); - - /* - * Send data using sk_buff's - * Parameters: - * int driverId - * int local channel-number (0...) - * int Flag: Need ACK for this packet. - * struct sk_buff *skb Data to send - */ - int (*writebuf_skb) (int, int, int, struct sk_buff *); - - /* Send raw D-Channel-Commands - * Parameters: - * u_char pointer data - * int length of data - * int driverId - * int local channel-number (0 ...) - */ - int (*writecmd)(const u_char __user *, int, int, int); - - /* Read raw Status replies - * u_char pointer data (volatile) - * int length of buffer - * int driverId - * int local channel-number (0 ...) 
- */ - int (*readstat)(u_char __user *, int, int, int); - - char id[20]; -} isdn_if; - -/* - * Function which must be called by lowlevel-driver at loadtime with - * the following fields of above struct set: - * - * channels Number of channels that will be supported. - * hl_hdrlen Space to preserve in sk_buff's when sending. Drivers, not - * supporting sk_buff's should set this to 0. - * command Address of Command-Handler. - * features Bitwise coded Features of this driver. (use ISDN_FEATURE_...) - * writebuf_skb Address of Skbuff-Send-Handler. - * writecmd " " D-Channel " which accepts raw D-Ch-Commands. - * readstat " " D-Channel " which delivers raw Status-Data. - * - * The linklevel-driver fills the following fields: - * - * channels Driver-ID assigned to this driver. (Must be used on all - * subsequent callbacks. - * rcvcallb_skb Address of handler for received Skbuff's. - * statcallb " " " for status-changes. - * - */ -extern int register_isdn(isdn_if*); -#include - -#endif /* __ISDNIF_H__ */ diff --git a/include/linux/wanrouter.h b/include/linux/wanrouter.h deleted file mode 100644 index f6358558f9f5..000000000000 --- a/include/linux/wanrouter.h +++ /dev/null @@ -1,11 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 */ -/* - * wanrouter.h Legacy declarations kept around until X25 is removed - */ - -#ifndef _ROUTER_H -#define _ROUTER_H - -#include - -#endif /* _ROUTER_H */ diff --git a/include/uapi/linux/isdn.h b/include/uapi/linux/isdn.h deleted file mode 100644 index f371fd52ed75..000000000000 --- a/include/uapi/linux/isdn.h +++ /dev/null @@ -1,144 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note */ -/* $Id: isdn.h,v 1.125.2.3 2004/02/10 01:07:14 keil Exp $ - * - * Main header for the Linux ISDN subsystem (linklevel). 
- * - * Copyright 1994,95,96 by Fritz Elfert (fritz@isdn4linux.de) - * Copyright 1995,96 by Thinking Objects Software GmbH Wuerzburg - * Copyright 1995,96 by Michael Hipp (Michael.Hipp@student.uni-tuebingen.de) - * - * This software may be used and distributed according to the terms - * of the GNU General Public License, incorporated herein by reference. - * - */ - -#ifndef _UAPI__ISDN_H__ -#define _UAPI__ISDN_H__ - -#include -#include - -#define ISDN_MAX_DRIVERS 32 -#define ISDN_MAX_CHANNELS 64 - -/* New ioctl-codes */ -#define IIOCNETAIF _IO('I',1) -#define IIOCNETDIF _IO('I',2) -#define IIOCNETSCF _IO('I',3) -#define IIOCNETGCF _IO('I',4) -#define IIOCNETANM _IO('I',5) -#define IIOCNETDNM _IO('I',6) -#define IIOCNETGNM _IO('I',7) -#define IIOCGETSET _IO('I',8) /* no longer supported */ -#define IIOCSETSET _IO('I',9) /* no longer supported */ -#define IIOCSETVER _IO('I',10) -#define IIOCNETHUP _IO('I',11) -#define IIOCSETGST _IO('I',12) -#define IIOCSETBRJ _IO('I',13) -#define IIOCSIGPRF _IO('I',14) -#define IIOCGETPRF _IO('I',15) -#define IIOCSETPRF _IO('I',16) -#define IIOCGETMAP _IO('I',17) -#define IIOCSETMAP _IO('I',18) -#define IIOCNETASL _IO('I',19) -#define IIOCNETDIL _IO('I',20) -#define IIOCGETCPS _IO('I',21) -#define IIOCGETDVR _IO('I',22) -#define IIOCNETLCR _IO('I',23) /* dwabc ioctl for LCR from isdnlog */ -#define IIOCNETDWRSET _IO('I',24) /* dwabc ioctl to reset abc-values to default on a net-interface */ - -#define IIOCNETALN _IO('I',32) -#define IIOCNETDLN _IO('I',33) - -#define IIOCNETGPN _IO('I',34) - -#define IIOCDBGVAR _IO('I',127) - -#define IIOCDRVCTL _IO('I',128) - -/* cisco hdlck device private ioctls */ -#define SIOCGKEEPPERIOD (SIOCDEVPRIVATE + 0) -#define SIOCSKEEPPERIOD (SIOCDEVPRIVATE + 1) -#define SIOCGDEBSERINT (SIOCDEVPRIVATE + 2) -#define SIOCSDEBSERINT (SIOCDEVPRIVATE + 3) - -/* Packet encapsulations for net-interfaces */ -#define ISDN_NET_ENCAP_ETHER 0 -#define ISDN_NET_ENCAP_RAWIP 1 -#define ISDN_NET_ENCAP_IPTYP 2 -#define 
ISDN_NET_ENCAP_CISCOHDLC 3 /* Without SLARP and keepalive */ -#define ISDN_NET_ENCAP_SYNCPPP 4 -#define ISDN_NET_ENCAP_UIHDLC 5 -#define ISDN_NET_ENCAP_CISCOHDLCK 6 /* With SLARP and keepalive */ -#define ISDN_NET_ENCAP_X25IFACE 7 /* Documentation/networking/x25-iface.txt */ -#define ISDN_NET_ENCAP_MAX_ENCAP ISDN_NET_ENCAP_X25IFACE - -/* Facility which currently uses an ISDN-channel */ -#define ISDN_USAGE_NONE 0 -#define ISDN_USAGE_RAW 1 -#define ISDN_USAGE_MODEM 2 -#define ISDN_USAGE_NET 3 -#define ISDN_USAGE_VOICE 4 -#define ISDN_USAGE_FAX 5 -#define ISDN_USAGE_MASK 7 /* Mask to get plain usage */ -#define ISDN_USAGE_DISABLED 32 /* This bit is set, if channel is disabled */ -#define ISDN_USAGE_EXCLUSIVE 64 /* This bit is set, if channel is exclusive */ -#define ISDN_USAGE_OUTGOING 128 /* This bit is set, if channel is outgoing */ - -#define ISDN_MODEM_NUMREG 24 /* Number of Modem-Registers */ -#define ISDN_LMSNLEN 255 /* Length of tty's Listen-MSN string */ -#define ISDN_CMSGLEN 50 /* Length of CONNECT-Message to add for Modem */ - -#define ISDN_MSNLEN 32 -#define NET_DV 0x06 /* Data version for isdn_net_ioctl_cfg */ -#define TTY_DV 0x06 /* Data version for iprofd etc. 
*/ - -#define INF_DV 0x01 /* Data version for /dev/isdninfo */ - -typedef struct { - char drvid[25]; - unsigned long arg; -} isdn_ioctl_struct; - -typedef struct { - char name[10]; - char phone[ISDN_MSNLEN]; - int outgoing; -} isdn_net_ioctl_phone; - -typedef struct { - char name[10]; /* Name of interface */ - char master[10]; /* Name of Master for Bundling */ - char slave[10]; /* Name of Slave for Bundling */ - char eaz[256]; /* EAZ/MSN */ - char drvid[25]; /* DriverId for Bindings */ - int onhtime; /* Hangup-Timeout */ - int charge; /* Charge-Units */ - int l2_proto; /* Layer-2 protocol */ - int l3_proto; /* Layer-3 protocol */ - int p_encap; /* Encapsulation */ - int exclusive; /* Channel, if bound exclusive */ - int dialmax; /* Dial Retry-Counter */ - int slavedelay; /* Delay until slave starts up */ - int cbdelay; /* Delay before Callback */ - int chargehup; /* Flag: Charge-Hangup */ - int ihup; /* Flag: Hangup-Timeout on incoming line */ - int secure; /* Flag: Secure */ - int callback; /* Flag: Callback */ - int cbhup; /* Flag: Reject Call before Callback */ - int pppbind; /* ippp device for bindings */ - int chargeint; /* Use fixed charge interval length */ - int triggercps; /* BogoCPS needed for triggering slave */ - int dialtimeout; /* Dial-Timeout */ - int dialwait; /* Time to wait after failed dial */ - int dialmode; /* Flag: off / on / auto */ -} isdn_net_ioctl_cfg; - -#define ISDN_NET_DIALMODE_MASK 0xC0 /* bits for status */ -#define ISDN_NET_DM_OFF 0x00 /* this interface is stopped */ -#define ISDN_NET_DM_MANUAL 0x40 /* this interface is on (manual) */ -#define ISDN_NET_DM_AUTO 0x80 /* this interface is autodial */ -#define ISDN_NET_DIALMODE(x) ((&(x))->flags & ISDN_NET_DIALMODE_MASK) - - -#endif /* _UAPI__ISDN_H__ */ diff --git a/include/uapi/linux/isdn_divertif.h b/include/uapi/linux/isdn_divertif.h deleted file mode 100644 index 0a17bb1bcb1b..000000000000 --- a/include/uapi/linux/isdn_divertif.h +++ /dev/null @@ -1,31 +0,0 @@ -/* 
SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note */ -/* $Id: isdn_divertif.h,v 1.4.6.1 2001/09/23 22:25:05 kai Exp $ - * - * Header for the diversion supplementary interface for i4l. - * - * Author Werner Cornelius (werner@titro.de) - * Copyright by Werner Cornelius (werner@titro.de) - * - * This software may be used and distributed according to the terms - * of the GNU General Public License, incorporated herein by reference. - * - */ - -#ifndef _UAPI_LINUX_ISDN_DIVERTIF_H -#define _UAPI_LINUX_ISDN_DIVERTIF_H - -/***********************************************************/ -/* magic value is also used to control version information */ -/***********************************************************/ -#define DIVERT_IF_MAGIC 0x25873401 -#define DIVERT_CMD_REG 0x00 /* register command */ -#define DIVERT_CMD_REL 0x01 /* release command */ -#define DIVERT_NO_ERR 0x00 /* return value no error */ -#define DIVERT_CMD_ERR 0x01 /* invalid cmd */ -#define DIVERT_VER_ERR 0x02 /* magic/version invalid */ -#define DIVERT_REG_ERR 0x03 /* module already registered */ -#define DIVERT_REL_ERR 0x04 /* module not registered */ -#define DIVERT_REG_NAME isdn_register_divert - - -#endif /* _UAPI_LINUX_ISDN_DIVERTIF_H */ diff --git a/include/uapi/linux/isdn_ppp.h b/include/uapi/linux/isdn_ppp.h deleted file mode 100644 index 0bdc4efaacb2..000000000000 --- a/include/uapi/linux/isdn_ppp.h +++ /dev/null @@ -1,68 +0,0 @@ -/* SPDX-License-Identifier: GPL-1.0+ WITH Linux-syscall-note */ -/* Linux ISDN subsystem, sync PPP, interface to ipppd - * - * Copyright 1994-1999 by Fritz Elfert (fritz@isdn4linux.de) - * Copyright 1995,96 Thinking Objects Software GmbH Wuerzburg - * Copyright 1995,96 by Michael Hipp (Michael.Hipp@student.uni-tuebingen.de) - * Copyright 2000-2002 by Kai Germaschewski (kai@germaschewski.name) - * - * This software may be used and distributed according to the terms - * of the GNU General Public License, incorporated herein by reference. 
- * - */ - -#ifndef _UAPI_LINUX_ISDN_PPP_H -#define _UAPI_LINUX_ISDN_PPP_H - -#define CALLTYPE_INCOMING 0x1 -#define CALLTYPE_OUTGOING 0x2 -#define CALLTYPE_CALLBACK 0x4 - -#define IPPP_VERSION "2.2.0" - -struct pppcallinfo -{ - int calltype; - unsigned char local_num[64]; - unsigned char remote_num[64]; - int charge_units; -}; - -#define PPPIOCGCALLINFO _IOWR('t',128,struct pppcallinfo) -#define PPPIOCBUNDLE _IOW('t',129,int) -#define PPPIOCGMPFLAGS _IOR('t',130,int) -#define PPPIOCSMPFLAGS _IOW('t',131,int) -#define PPPIOCSMPMTU _IOW('t',132,int) -#define PPPIOCSMPMRU _IOW('t',133,int) -#define PPPIOCGCOMPRESSORS _IOR('t',134,unsigned long [8]) -#define PPPIOCSCOMPRESSOR _IOW('t',135,int) -#define PPPIOCGIFNAME _IOR('t',136, char [IFNAMSIZ] ) - - -#define SC_MP_PROT 0x00000200 -#define SC_REJ_MP_PROT 0x00000400 -#define SC_OUT_SHORT_SEQ 0x00000800 -#define SC_IN_SHORT_SEQ 0x00004000 - -#define SC_DECOMP_ON 0x01 -#define SC_COMP_ON 0x02 -#define SC_DECOMP_DISCARD 0x04 -#define SC_COMP_DISCARD 0x08 -#define SC_LINK_DECOMP_ON 0x10 -#define SC_LINK_COMP_ON 0x20 -#define SC_LINK_DECOMP_DISCARD 0x40 -#define SC_LINK_COMP_DISCARD 0x80 - -#define ISDN_PPP_COMP_MAX_OPTIONS 16 - -#define IPPP_COMP_FLAG_XMIT 0x1 -#define IPPP_COMP_FLAG_LINK 0x2 - -struct isdn_ppp_comp_data { - int num; - unsigned char options[ISDN_PPP_COMP_MAX_OPTIONS]; - int optlen; - int flags; -}; - -#endif /* _UAPI_LINUX_ISDN_PPP_H */ diff --git a/include/uapi/linux/isdnif.h b/include/uapi/linux/isdnif.h deleted file mode 100644 index 611a69196738..000000000000 --- a/include/uapi/linux/isdnif.h +++ /dev/null @@ -1,57 +0,0 @@ -/* SPDX-License-Identifier: GPL-1.0+ WITH Linux-syscall-note */ -/* $Id: isdnif.h,v 1.43.2.2 2004/01/12 23:08:35 keil Exp $ - * - * Linux ISDN subsystem - * Definition of the interface between the subsystem and its low-level drivers. 
- * - * Copyright 1994,95,96 by Fritz Elfert (fritz@isdn4linux.de) - * Copyright 1995,96 Thinking Objects Software GmbH Wuerzburg - * - * This software may be used and distributed according to the terms - * of the GNU General Public License, incorporated herein by reference. - * - */ - -#ifndef _UAPI__ISDNIF_H__ -#define _UAPI__ISDNIF_H__ - - -/* - * Values for general protocol-selection - */ -#define ISDN_PTYPE_UNKNOWN 0 /* Protocol undefined */ -#define ISDN_PTYPE_1TR6 1 /* german 1TR6-protocol */ -#define ISDN_PTYPE_EURO 2 /* EDSS1-protocol */ -#define ISDN_PTYPE_LEASED 3 /* for leased lines */ -#define ISDN_PTYPE_NI1 4 /* US NI-1 protocol */ -#define ISDN_PTYPE_MAX 7 /* Max. 8 Protocols */ - -/* - * Values for Layer-2-protocol-selection - */ -#define ISDN_PROTO_L2_X75I 0 /* X75/LAPB with I-Frames */ -#define ISDN_PROTO_L2_X75UI 1 /* X75/LAPB with UI-Frames */ -#define ISDN_PROTO_L2_X75BUI 2 /* X75/LAPB with UI-Frames */ -#define ISDN_PROTO_L2_HDLC 3 /* HDLC */ -#define ISDN_PROTO_L2_TRANS 4 /* Transparent (Voice) */ -#define ISDN_PROTO_L2_X25DTE 5 /* X25/LAPB DTE mode */ -#define ISDN_PROTO_L2_X25DCE 6 /* X25/LAPB DCE mode */ -#define ISDN_PROTO_L2_V11096 7 /* V.110 bitrate adaption 9600 Baud */ -#define ISDN_PROTO_L2_V11019 8 /* V.110 bitrate adaption 19200 Baud */ -#define ISDN_PROTO_L2_V11038 9 /* V.110 bitrate adaption 38400 Baud */ -#define ISDN_PROTO_L2_MODEM 10 /* Analog Modem on Board */ -#define ISDN_PROTO_L2_FAX 11 /* Fax Group 2/3 */ -#define ISDN_PROTO_L2_HDLC_56K 12 /* HDLC 56k */ -#define ISDN_PROTO_L2_MAX 15 /* Max. 16 Protocols */ - -/* - * Values for Layer-3-protocol-selection - */ -#define ISDN_PROTO_L3_TRANS 0 /* Transparent */ -#define ISDN_PROTO_L3_TRANSDSP 1 /* Transparent with DSP */ -#define ISDN_PROTO_L3_FCLASS2 2 /* Fax Group 2/3 CLASS 2 */ -#define ISDN_PROTO_L3_FCLASS1 3 /* Fax Group 2/3 CLASS 1 */ -#define ISDN_PROTO_L3_MAX 7 /* Max. 
8 Protocols */ - - -#endif /* _UAPI__ISDNIF_H__ */ diff --git a/include/uapi/linux/wanrouter.h b/include/uapi/linux/wanrouter.h deleted file mode 100644 index 2f1216d00caa..000000000000 --- a/include/uapi/linux/wanrouter.h +++ /dev/null @@ -1,18 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note */ -/* - * wanrouter.h Legacy declarations kept around until X25 is removed - */ - -#ifndef _UAPI_ROUTER_H -#define _UAPI_ROUTER_H - -/* 'state' defines */ -enum wan_states -{ - WAN_UNCONFIGURED, /* link/channel is not configured */ - WAN_DISCONNECTED, /* link/channel is disconnected */ - WAN_CONNECTING, /* connection is in progress */ - WAN_CONNECTED /* link/channel is operational */ -}; - -#endif /* _UAPI_ROUTER_H */ -- cgit v1.2.3-71-gd317 From ea6cc2fd8a2b89ab6dcd096ba6dbc1ecbdf26564 Mon Sep 17 00:00:00 2001 From: Lukasz Pawelczyk Date: Fri, 10 May 2019 13:46:22 +0200 Subject: netfilter: xt_owner: Add supplementary groups option The XT_OWNER_SUPPL_GROUPS flag causes GIDs specified with XT_OWNER_GID to be also checked in the supplementary groups of a process. f_cred->group_info cannot be modified during its lifetime and f_cred holds a reference to it so it's safe to use. 
Signed-off-by: Lukasz Pawelczyk Signed-off-by: Pablo Neira Ayuso --- include/uapi/linux/netfilter/xt_owner.h | 7 ++++--- net/netfilter/xt_owner.c | 23 ++++++++++++++++++++--- 2 files changed, 24 insertions(+), 6 deletions(-) (limited to 'include/uapi/linux') diff --git a/include/uapi/linux/netfilter/xt_owner.h b/include/uapi/linux/netfilter/xt_owner.h index fa3ad84957d5..9e98c09eda32 100644 --- a/include/uapi/linux/netfilter/xt_owner.h +++ b/include/uapi/linux/netfilter/xt_owner.h @@ -5,9 +5,10 @@ #include enum { - XT_OWNER_UID = 1 << 0, - XT_OWNER_GID = 1 << 1, - XT_OWNER_SOCKET = 1 << 2, + XT_OWNER_UID = 1 << 0, + XT_OWNER_GID = 1 << 1, + XT_OWNER_SOCKET = 1 << 2, + XT_OWNER_SUPPL_GROUPS = 1 << 3, }; struct xt_owner_match_info { diff --git a/net/netfilter/xt_owner.c b/net/netfilter/xt_owner.c index 46686fb73784..a8784502aca6 100644 --- a/net/netfilter/xt_owner.c +++ b/net/netfilter/xt_owner.c @@ -91,11 +91,28 @@ owner_mt(const struct sk_buff *skb, struct xt_action_param *par) } if (info->match & XT_OWNER_GID) { + unsigned int i, match = false; kgid_t gid_min = make_kgid(net->user_ns, info->gid_min); kgid_t gid_max = make_kgid(net->user_ns, info->gid_max); - if ((gid_gte(filp->f_cred->fsgid, gid_min) && - gid_lte(filp->f_cred->fsgid, gid_max)) ^ - !(info->invert & XT_OWNER_GID)) + struct group_info *gi = filp->f_cred->group_info; + + if (gid_gte(filp->f_cred->fsgid, gid_min) && + gid_lte(filp->f_cred->fsgid, gid_max)) + match = true; + + if (!match && (info->match & XT_OWNER_SUPPL_GROUPS) && gi) { + for (i = 0; i < gi->ngroups; ++i) { + kgid_t group = gi->gid[i]; + + if (gid_gte(group, gid_min) && + gid_lte(group, gid_max)) { + match = true; + break; + } + } + } + + if (match ^ !(info->invert & XT_OWNER_GID)) return false; } -- cgit v1.2.3-71-gd317 From 29930e314da3833437a2ddc7b17f6a954f38d8fb Mon Sep 17 00:00:00 2001 From: Jacky Hu Date: Thu, 30 May 2019 08:16:40 +0800 Subject: ipvs: add checksum support for gue encapsulation Add checksum support for gue 
encapsulation with the tun_flags parameter, which could be one of the values below: IP_VS_TUNNEL_ENCAP_FLAG_NOCSUM IP_VS_TUNNEL_ENCAP_FLAG_CSUM IP_VS_TUNNEL_ENCAP_FLAG_REMCSUM Signed-off-by: Jacky Hu Signed-off-by: Julian Anastasov Signed-off-by: Simon Horman Signed-off-by: Pablo Neira Ayuso --- include/net/ip_vs.h | 2 + include/uapi/linux/ip_vs.h | 7 ++ net/netfilter/ipvs/ip_vs_ctl.c | 11 +++- net/netfilter/ipvs/ip_vs_xmit.c | 143 +++++++++++++++++++++++++++++++++++----- 4 files changed, 146 insertions(+), 17 deletions(-) (limited to 'include/uapi/linux') diff --git a/include/net/ip_vs.h b/include/net/ip_vs.h index b01a94ebfc0e..cb1ad0cc5c7b 100644 --- a/include/net/ip_vs.h +++ b/include/net/ip_vs.h @@ -603,6 +603,7 @@ struct ip_vs_dest_user_kern { u16 tun_type; /* tunnel type */ __be16 tun_port; /* tunnel port */ + u16 tun_flags; /* tunnel flags */ }; @@ -665,6 +666,7 @@ struct ip_vs_dest { atomic_t last_weight; /* server latest weight */ __u16 tun_type; /* tunnel type */ __be16 tun_port; /* tunnel port */ + __u16 tun_flags; /* tunnel flags */ refcount_t refcnt; /* reference counter */ struct ip_vs_stats stats; /* statistics */ diff --git a/include/uapi/linux/ip_vs.h b/include/uapi/linux/ip_vs.h index e34f436fc79d..e4f18061a4fd 100644 --- a/include/uapi/linux/ip_vs.h +++ b/include/uapi/linux/ip_vs.h @@ -131,6 +131,11 @@ enum { IP_VS_CONN_F_TUNNEL_TYPE_MAX, }; +/* Tunnel encapsulation flags */ +#define IP_VS_TUNNEL_ENCAP_FLAG_NOCSUM (0) +#define IP_VS_TUNNEL_ENCAP_FLAG_CSUM (1 << 0) +#define IP_VS_TUNNEL_ENCAP_FLAG_REMCSUM (1 << 1) + /* * The struct ip_vs_service_user and struct ip_vs_dest_user are * used to set IPVS rules through setsockopt. 
@@ -403,6 +408,8 @@ enum { IPVS_DEST_ATTR_TUN_PORT, /* tunnel port */ + IPVS_DEST_ATTR_TUN_FLAGS, /* tunnel flags */ + __IPVS_DEST_ATTR_MAX, }; diff --git a/net/netfilter/ipvs/ip_vs_ctl.c b/net/netfilter/ipvs/ip_vs_ctl.c index d5847e06350f..ad19ac08622f 100644 --- a/net/netfilter/ipvs/ip_vs_ctl.c +++ b/net/netfilter/ipvs/ip_vs_ctl.c @@ -893,6 +893,7 @@ __ip_vs_update_dest(struct ip_vs_service *svc, struct ip_vs_dest *dest, /* set the tunnel info */ dest->tun_type = udest->tun_type; dest->tun_port = udest->tun_port; + dest->tun_flags = udest->tun_flags; /* set the IP_VS_CONN_F_NOOUTPUT flag if not masquerading/NAT */ if ((conn_flags & IP_VS_CONN_F_FWD_MASK) != IP_VS_CONN_F_MASQ) { @@ -2967,6 +2968,7 @@ static const struct nla_policy ip_vs_dest_policy[IPVS_DEST_ATTR_MAX + 1] = { [IPVS_DEST_ATTR_ADDR_FAMILY] = { .type = NLA_U16 }, [IPVS_DEST_ATTR_TUN_TYPE] = { .type = NLA_U8 }, [IPVS_DEST_ATTR_TUN_PORT] = { .type = NLA_U16 }, + [IPVS_DEST_ATTR_TUN_FLAGS] = { .type = NLA_U16 }, }; static int ip_vs_genl_fill_stats(struct sk_buff *skb, int container_type, @@ -3273,6 +3275,8 @@ static int ip_vs_genl_fill_dest(struct sk_buff *skb, struct ip_vs_dest *dest) dest->tun_type) || nla_put_be16(skb, IPVS_DEST_ATTR_TUN_PORT, dest->tun_port) || + nla_put_u16(skb, IPVS_DEST_ATTR_TUN_FLAGS, + dest->tun_flags) || nla_put_u32(skb, IPVS_DEST_ATTR_U_THRESH, dest->u_threshold) || nla_put_u32(skb, IPVS_DEST_ATTR_L_THRESH, dest->l_threshold) || nla_put_u32(skb, IPVS_DEST_ATTR_ACTIVE_CONNS, @@ -3393,7 +3397,8 @@ static int ip_vs_genl_parse_dest(struct ip_vs_dest_user_kern *udest, /* If a full entry was requested, check for the additional fields */ if (full_entry) { struct nlattr *nla_fwd, *nla_weight, *nla_u_thresh, - *nla_l_thresh, *nla_tun_type, *nla_tun_port; + *nla_l_thresh, *nla_tun_type, *nla_tun_port, + *nla_tun_flags; nla_fwd = attrs[IPVS_DEST_ATTR_FWD_METHOD]; nla_weight = attrs[IPVS_DEST_ATTR_WEIGHT]; @@ -3401,6 +3406,7 @@ static int ip_vs_genl_parse_dest(struct ip_vs_dest_user_kern 
*udest, nla_l_thresh = attrs[IPVS_DEST_ATTR_L_THRESH]; nla_tun_type = attrs[IPVS_DEST_ATTR_TUN_TYPE]; nla_tun_port = attrs[IPVS_DEST_ATTR_TUN_PORT]; + nla_tun_flags = attrs[IPVS_DEST_ATTR_TUN_FLAGS]; if (!(nla_fwd && nla_weight && nla_u_thresh && nla_l_thresh)) return -EINVAL; @@ -3416,6 +3422,9 @@ static int ip_vs_genl_parse_dest(struct ip_vs_dest_user_kern *udest, if (nla_tun_port) udest->tun_port = nla_get_be16(nla_tun_port); + + if (nla_tun_flags) + udest->tun_flags = nla_get_u16(nla_tun_flags); } return 0; diff --git a/net/netfilter/ipvs/ip_vs_xmit.c b/net/netfilter/ipvs/ip_vs_xmit.c index 0b41d0504429..af3379d5e5bc 100644 --- a/net/netfilter/ipvs/ip_vs_xmit.c +++ b/net/netfilter/ipvs/ip_vs_xmit.c @@ -40,6 +40,7 @@ #include #include #include +#include #include #include #include @@ -385,8 +386,13 @@ __ip_vs_get_out_rt(struct netns_ipvs *ipvs, int skb_af, struct sk_buff *skb, mtu = dst_mtu(&rt->dst) - sizeof(struct iphdr); if (!dest) goto err_put; - if (dest->tun_type == IP_VS_CONN_F_TUNNEL_TYPE_GUE) + if (dest->tun_type == IP_VS_CONN_F_TUNNEL_TYPE_GUE) { mtu -= sizeof(struct udphdr) + sizeof(struct guehdr); + if ((dest->tun_flags & + IP_VS_TUNNEL_ENCAP_FLAG_REMCSUM) && + skb->ip_summed == CHECKSUM_PARTIAL) + mtu -= GUE_PLEN_REMCSUM + GUE_LEN_PRIV; + } if (mtu < 68) { IP_VS_DBG_RL("%s(): mtu less than 68\n", __func__); goto err_put; @@ -540,8 +546,13 @@ __ip_vs_get_out_rt_v6(struct netns_ipvs *ipvs, int skb_af, struct sk_buff *skb, mtu = dst_mtu(&rt->dst) - sizeof(struct ipv6hdr); if (!dest) goto err_put; - if (dest->tun_type == IP_VS_CONN_F_TUNNEL_TYPE_GUE) + if (dest->tun_type == IP_VS_CONN_F_TUNNEL_TYPE_GUE) { mtu -= sizeof(struct udphdr) + sizeof(struct guehdr); + if ((dest->tun_flags & + IP_VS_TUNNEL_ENCAP_FLAG_REMCSUM) && + skb->ip_summed == CHECKSUM_PARTIAL) + mtu -= GUE_PLEN_REMCSUM + GUE_LEN_PRIV; + } if (mtu < IPV6_MIN_MTU) { IP_VS_DBG_RL("%s(): mtu less than %d\n", __func__, IPV6_MIN_MTU); @@ -1006,17 +1017,56 @@ ipvs_gue_encap(struct net *net, struct 
sk_buff *skb, __be16 sport = udp_flow_src_port(net, skb, 0, 0, false); struct udphdr *udph; /* Our new UDP header */ struct guehdr *gueh; /* Our new GUE header */ + size_t hdrlen, optlen = 0; + void *data; + bool need_priv = false; + + if ((cp->dest->tun_flags & IP_VS_TUNNEL_ENCAP_FLAG_REMCSUM) && + skb->ip_summed == CHECKSUM_PARTIAL) { + optlen += GUE_PLEN_REMCSUM + GUE_LEN_PRIV; + need_priv = true; + } - skb_push(skb, sizeof(struct guehdr)); + hdrlen = sizeof(struct guehdr) + optlen; + + skb_push(skb, hdrlen); gueh = (struct guehdr *)skb->data; gueh->control = 0; gueh->version = 0; - gueh->hlen = 0; + gueh->hlen = optlen >> 2; gueh->flags = 0; gueh->proto_ctype = *next_protocol; + data = &gueh[1]; + + if (need_priv) { + __be32 *flags = data; + u16 csum_start = skb_checksum_start_offset(skb); + __be16 *pd; + + gueh->flags |= GUE_FLAG_PRIV; + *flags = 0; + data += GUE_LEN_PRIV; + + if (csum_start < hdrlen) + return -EINVAL; + + csum_start -= hdrlen; + pd = data; + pd[0] = htons(csum_start); + pd[1] = htons(csum_start + skb->csum_offset); + + if (!skb_is_gso(skb)) { + skb->ip_summed = CHECKSUM_NONE; + skb->encapsulation = 0; + } + + *flags |= GUE_PFLAG_REMCSUM; + data += GUE_PLEN_REMCSUM; + } + skb_push(skb, sizeof(struct udphdr)); skb_reset_transport_header(skb); @@ -1070,6 +1120,7 @@ ip_vs_tunnel_xmit(struct sk_buff *skb, struct ip_vs_conn *cp, unsigned int max_headroom; /* The extra header space needed */ int ret, local; int tun_type, gso_type; + int tun_flags; EnterFunction(10); @@ -1092,9 +1143,19 @@ ip_vs_tunnel_xmit(struct sk_buff *skb, struct ip_vs_conn *cp, max_headroom = LL_RESERVED_SPACE(tdev) + sizeof(struct iphdr); tun_type = cp->dest->tun_type; + tun_flags = cp->dest->tun_flags; - if (tun_type == IP_VS_CONN_F_TUNNEL_TYPE_GUE) - max_headroom += sizeof(struct udphdr) + sizeof(struct guehdr); + if (tun_type == IP_VS_CONN_F_TUNNEL_TYPE_GUE) { + size_t gue_hdrlen, gue_optlen = 0; + + if ((tun_flags & IP_VS_TUNNEL_ENCAP_FLAG_REMCSUM) && + skb->ip_summed == 
CHECKSUM_PARTIAL) { + gue_optlen += GUE_PLEN_REMCSUM + GUE_LEN_PRIV; + } + gue_hdrlen = sizeof(struct guehdr) + gue_optlen; + + max_headroom += sizeof(struct udphdr) + gue_hdrlen; + } /* We only care about the df field if sysctl_pmtu_disc(ipvs) is set */ dfp = sysctl_pmtu_disc(ipvs) ? &df : NULL; @@ -1105,8 +1166,17 @@ ip_vs_tunnel_xmit(struct sk_buff *skb, struct ip_vs_conn *cp, goto tx_error; gso_type = __tun_gso_type_mask(AF_INET, cp->af); - if (tun_type == IP_VS_CONN_F_TUNNEL_TYPE_GUE) - gso_type |= SKB_GSO_UDP_TUNNEL; + if (tun_type == IP_VS_CONN_F_TUNNEL_TYPE_GUE) { + if ((tun_flags & IP_VS_TUNNEL_ENCAP_FLAG_CSUM) || + (tun_flags & IP_VS_TUNNEL_ENCAP_FLAG_REMCSUM)) + gso_type |= SKB_GSO_UDP_TUNNEL_CSUM; + else + gso_type |= SKB_GSO_UDP_TUNNEL; + if ((tun_flags & IP_VS_TUNNEL_ENCAP_FLAG_REMCSUM) && + skb->ip_summed == CHECKSUM_PARTIAL) { + gso_type |= SKB_GSO_TUNNEL_REMCSUM; + } + } if (iptunnel_handle_offloads(skb, gso_type)) goto tx_error; @@ -1115,8 +1185,19 @@ ip_vs_tunnel_xmit(struct sk_buff *skb, struct ip_vs_conn *cp, skb_set_inner_ipproto(skb, next_protocol); - if (tun_type == IP_VS_CONN_F_TUNNEL_TYPE_GUE) - ipvs_gue_encap(net, skb, cp, &next_protocol); + if (tun_type == IP_VS_CONN_F_TUNNEL_TYPE_GUE) { + bool check = false; + + if (ipvs_gue_encap(net, skb, cp, &next_protocol)) + goto tx_error; + + if ((tun_flags & IP_VS_TUNNEL_ENCAP_FLAG_CSUM) || + (tun_flags & IP_VS_TUNNEL_ENCAP_FLAG_REMCSUM)) + check = true; + + udp_set_csum(!check, skb, saddr, cp->daddr.ip, skb->len); + } + skb_push(skb, sizeof(struct iphdr)); skb_reset_network_header(skb); @@ -1174,6 +1255,7 @@ ip_vs_tunnel_xmit_v6(struct sk_buff *skb, struct ip_vs_conn *cp, unsigned int max_headroom; /* The extra header space needed */ int ret, local; int tun_type, gso_type; + int tun_flags; EnterFunction(10); @@ -1197,9 +1279,19 @@ ip_vs_tunnel_xmit_v6(struct sk_buff *skb, struct ip_vs_conn *cp, max_headroom = LL_RESERVED_SPACE(tdev) + sizeof(struct ipv6hdr); tun_type = cp->dest->tun_type; + 
tun_flags = cp->dest->tun_flags; - if (tun_type == IP_VS_CONN_F_TUNNEL_TYPE_GUE) - max_headroom += sizeof(struct udphdr) + sizeof(struct guehdr); + if (tun_type == IP_VS_CONN_F_TUNNEL_TYPE_GUE) { + size_t gue_hdrlen, gue_optlen = 0; + + if ((tun_flags & IP_VS_TUNNEL_ENCAP_FLAG_REMCSUM) && + skb->ip_summed == CHECKSUM_PARTIAL) { + gue_optlen += GUE_PLEN_REMCSUM + GUE_LEN_PRIV; + } + gue_hdrlen = sizeof(struct guehdr) + gue_optlen; + + max_headroom += sizeof(struct udphdr) + gue_hdrlen; + } skb = ip_vs_prepare_tunneled_skb(skb, cp->af, max_headroom, &next_protocol, &payload_len, @@ -1208,8 +1300,17 @@ ip_vs_tunnel_xmit_v6(struct sk_buff *skb, struct ip_vs_conn *cp, goto tx_error; gso_type = __tun_gso_type_mask(AF_INET6, cp->af); - if (tun_type == IP_VS_CONN_F_TUNNEL_TYPE_GUE) - gso_type |= SKB_GSO_UDP_TUNNEL; + if (tun_type == IP_VS_CONN_F_TUNNEL_TYPE_GUE) { + if ((tun_flags & IP_VS_TUNNEL_ENCAP_FLAG_CSUM) || + (tun_flags & IP_VS_TUNNEL_ENCAP_FLAG_REMCSUM)) + gso_type |= SKB_GSO_UDP_TUNNEL_CSUM; + else + gso_type |= SKB_GSO_UDP_TUNNEL; + if ((tun_flags & IP_VS_TUNNEL_ENCAP_FLAG_REMCSUM) && + skb->ip_summed == CHECKSUM_PARTIAL) { + gso_type |= SKB_GSO_TUNNEL_REMCSUM; + } + } if (iptunnel_handle_offloads(skb, gso_type)) goto tx_error; @@ -1218,8 +1319,18 @@ ip_vs_tunnel_xmit_v6(struct sk_buff *skb, struct ip_vs_conn *cp, skb_set_inner_ipproto(skb, next_protocol); - if (tun_type == IP_VS_CONN_F_TUNNEL_TYPE_GUE) - ipvs_gue_encap(net, skb, cp, &next_protocol); + if (tun_type == IP_VS_CONN_F_TUNNEL_TYPE_GUE) { + bool check = false; + + if (ipvs_gue_encap(net, skb, cp, &next_protocol)) + goto tx_error; + + if ((tun_flags & IP_VS_TUNNEL_ENCAP_FLAG_CSUM) || + (tun_flags & IP_VS_TUNNEL_ENCAP_FLAG_REMCSUM)) + check = true; + + udp6_set_csum(!check, skb, &saddr, &cp->daddr.in6, skb->len); + } skb_push(skb, sizeof(struct ipv6hdr)); skb_reset_network_header(skb); -- cgit v1.2.3-71-gd317 From b51700632e0e53254733ff706e5bdca22d19dbe5 Mon Sep 17 00:00:00 2001 From: Wanpeng Li Date: 
Tue, 21 May 2019 14:06:53 +0800 Subject: KVM: X86: Provide a capability to disable cstate msr read intercepts MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Allow guest reads CORE cstate when exposing host CPU power management capabilities to the guest. PKG cstate is restricted to avoid a guest to get the whole package information in multi-tenant scenario. Cc: Paolo Bonzini Cc: Radim Krčmář Cc: Sean Christopherson Cc: Liran Alon Signed-off-by: Wanpeng Li Signed-off-by: Paolo Bonzini --- Documentation/virtual/kvm/api.txt | 1 + arch/x86/include/asm/kvm_host.h | 1 + arch/x86/kvm/vmx/vmx.c | 6 ++++++ arch/x86/kvm/x86.c | 5 ++++- arch/x86/kvm/x86.h | 5 +++++ include/uapi/linux/kvm.h | 4 +++- tools/include/uapi/linux/kvm.h | 4 +++- 7 files changed, 23 insertions(+), 3 deletions(-) (limited to 'include/uapi/linux') diff --git a/Documentation/virtual/kvm/api.txt b/Documentation/virtual/kvm/api.txt index 33cd92dd6aa5..91fd86fcc49f 100644 --- a/Documentation/virtual/kvm/api.txt +++ b/Documentation/virtual/kvm/api.txt @@ -4894,6 +4894,7 @@ Valid bits in args[0] are #define KVM_X86_DISABLE_EXITS_MWAIT (1 << 0) #define KVM_X86_DISABLE_EXITS_HLT (1 << 1) #define KVM_X86_DISABLE_EXITS_PAUSE (1 << 2) +#define KVM_X86_DISABLE_EXITS_CSTATE (1 << 3) Enabling this capability on a VM provides userspace with a way to no longer intercept some instructions for improved latency in some diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index 15e973d9b840..aeadbc770eb2 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h @@ -882,6 +882,7 @@ struct kvm_arch { bool mwait_in_guest; bool hlt_in_guest; bool pause_in_guest; + bool cstate_in_guest; unsigned long irq_sources_bitmap; s64 kvmclock_offset; diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c index 0861c71a4379..da24f1858acc 100644 --- a/arch/x86/kvm/vmx/vmx.c +++ b/arch/x86/kvm/vmx/vmx.c @@ -6637,6 +6637,12 @@ static struct 
kvm_vcpu *vmx_create_vcpu(struct kvm *kvm, unsigned int id) vmx_disable_intercept_for_msr(msr_bitmap, MSR_IA32_SYSENTER_CS, MSR_TYPE_RW); vmx_disable_intercept_for_msr(msr_bitmap, MSR_IA32_SYSENTER_ESP, MSR_TYPE_RW); vmx_disable_intercept_for_msr(msr_bitmap, MSR_IA32_SYSENTER_EIP, MSR_TYPE_RW); + if (kvm_cstate_in_guest(kvm)) { + vmx_disable_intercept_for_msr(msr_bitmap, MSR_CORE_C1_RES, MSR_TYPE_R); + vmx_disable_intercept_for_msr(msr_bitmap, MSR_CORE_C3_RESIDENCY, MSR_TYPE_R); + vmx_disable_intercept_for_msr(msr_bitmap, MSR_CORE_C6_RESIDENCY, MSR_TYPE_R); + vmx_disable_intercept_for_msr(msr_bitmap, MSR_CORE_C7_RESIDENCY, MSR_TYPE_R); + } vmx->msr_bitmap_mode = 0; vmx->loaded_vmcs = &vmx->vmcs01; diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 8aa6b5a75e7a..17e9533f51eb 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -3098,7 +3098,8 @@ int kvm_vm_ioctl_check_extension(struct kvm *kvm, long ext) r = KVM_CLOCK_TSC_STABLE; break; case KVM_CAP_X86_DISABLE_EXITS: - r |= KVM_X86_DISABLE_EXITS_HLT | KVM_X86_DISABLE_EXITS_PAUSE; + r |= KVM_X86_DISABLE_EXITS_HLT | KVM_X86_DISABLE_EXITS_PAUSE | + KVM_X86_DISABLE_EXITS_CSTATE; if(kvm_can_mwait_in_guest()) r |= KVM_X86_DISABLE_EXITS_MWAIT; break; @@ -4615,6 +4616,8 @@ split_irqchip_unlock: kvm->arch.hlt_in_guest = true; if (cap->args[0] & KVM_X86_DISABLE_EXITS_PAUSE) kvm->arch.pause_in_guest = true; + if (cap->args[0] & KVM_X86_DISABLE_EXITS_CSTATE) + kvm->arch.cstate_in_guest = true; r = 0; break; case KVM_CAP_MSR_PLATFORM_INFO: diff --git a/arch/x86/kvm/x86.h b/arch/x86/kvm/x86.h index a470ff0868c5..275b3b646023 100644 --- a/arch/x86/kvm/x86.h +++ b/arch/x86/kvm/x86.h @@ -333,6 +333,11 @@ static inline bool kvm_pause_in_guest(struct kvm *kvm) return kvm->arch.pause_in_guest; } +static inline bool kvm_cstate_in_guest(struct kvm *kvm) +{ + return kvm->arch.cstate_in_guest; +} + DECLARE_PER_CPU(struct kvm_vcpu *, current_vcpu); static inline void kvm_before_interrupt(struct kvm_vcpu *vcpu) diff --git 
a/include/uapi/linux/kvm.h b/include/uapi/linux/kvm.h index 2fe12b40d503..c2152f3dd02d 100644 --- a/include/uapi/linux/kvm.h +++ b/include/uapi/linux/kvm.h @@ -696,9 +696,11 @@ struct kvm_ioeventfd { #define KVM_X86_DISABLE_EXITS_MWAIT (1 << 0) #define KVM_X86_DISABLE_EXITS_HLT (1 << 1) #define KVM_X86_DISABLE_EXITS_PAUSE (1 << 2) +#define KVM_X86_DISABLE_EXITS_CSTATE (1 << 3) #define KVM_X86_DISABLE_VALID_EXITS (KVM_X86_DISABLE_EXITS_MWAIT | \ KVM_X86_DISABLE_EXITS_HLT | \ - KVM_X86_DISABLE_EXITS_PAUSE) + KVM_X86_DISABLE_EXITS_PAUSE | \ + KVM_X86_DISABLE_EXITS_CSTATE) /* for KVM_ENABLE_CAP */ struct kvm_enable_cap { diff --git a/tools/include/uapi/linux/kvm.h b/tools/include/uapi/linux/kvm.h index 6d4ea4b6c922..ef3303f72c46 100644 --- a/tools/include/uapi/linux/kvm.h +++ b/tools/include/uapi/linux/kvm.h @@ -696,9 +696,11 @@ struct kvm_ioeventfd { #define KVM_X86_DISABLE_EXITS_MWAIT (1 << 0) #define KVM_X86_DISABLE_EXITS_HLT (1 << 1) #define KVM_X86_DISABLE_EXITS_PAUSE (1 << 2) +#define KVM_X86_DISABLE_EXITS_CSTATE (1 << 3) #define KVM_X86_DISABLE_VALID_EXITS (KVM_X86_DISABLE_EXITS_MWAIT | \ KVM_X86_DISABLE_EXITS_HLT | \ - KVM_X86_DISABLE_EXITS_PAUSE) + KVM_X86_DISABLE_EXITS_PAUSE | \ + KVM_X86_DISABLE_EXITS_CSTATE) /* for KVM_ENABLE_CAP */ struct kvm_enable_cap { -- cgit v1.2.3-71-gd317 From 191ed2024de9fcfaab24106f9dbf7e544b07d633 Mon Sep 17 00:00:00 2001 From: Jiri Pirko Date: Tue, 4 Jun 2019 15:40:40 +0200 Subject: devlink: allow driver to update progress of flash update Introduce a function to be called from drivers during flash. It sends notification to userspace about flash update progress. Signed-off-by: Jiri Pirko Reviewed-by: Jakub Kicinski Reviewed-by: Ido Schimmel Signed-off-by: David S. 
Miller --- include/net/devlink.h | 8 ++++ include/uapi/linux/devlink.h | 5 +++ net/core/devlink.c | 102 +++++++++++++++++++++++++++++++++++++++++++ 3 files changed, 115 insertions(+) (limited to 'include/uapi/linux') diff --git a/include/net/devlink.h b/include/net/devlink.h index 151eb930d329..8f65356132be 100644 --- a/include/net/devlink.h +++ b/include/net/devlink.h @@ -741,6 +741,14 @@ void devlink_health_reporter_state_update(struct devlink_health_reporter *reporter, enum devlink_health_reporter_state state); +void devlink_flash_update_begin_notify(struct devlink *devlink); +void devlink_flash_update_end_notify(struct devlink *devlink); +void devlink_flash_update_status_notify(struct devlink *devlink, + const char *status_msg, + const char *component, + unsigned long done, + unsigned long total); + #if IS_ENABLED(CONFIG_NET_DEVLINK) void devlink_compat_running_version(struct net_device *dev, diff --git a/include/uapi/linux/devlink.h b/include/uapi/linux/devlink.h index 5bb4ea67d84f..5287b42c181f 100644 --- a/include/uapi/linux/devlink.h +++ b/include/uapi/linux/devlink.h @@ -104,6 +104,8 @@ enum devlink_command { DEVLINK_CMD_HEALTH_REPORTER_DUMP_CLEAR, DEVLINK_CMD_FLASH_UPDATE, + DEVLINK_CMD_FLASH_UPDATE_END, /* notification only */ + DEVLINK_CMD_FLASH_UPDATE_STATUS, /* notification only */ /* add new commands above here */ __DEVLINK_CMD_MAX, @@ -331,6 +333,9 @@ enum devlink_attr { DEVLINK_ATTR_FLASH_UPDATE_FILE_NAME, /* string */ DEVLINK_ATTR_FLASH_UPDATE_COMPONENT, /* string */ + DEVLINK_ATTR_FLASH_UPDATE_STATUS_MSG, /* string */ + DEVLINK_ATTR_FLASH_UPDATE_STATUS_DONE, /* u64 */ + DEVLINK_ATTR_FLASH_UPDATE_STATUS_TOTAL, /* u64 */ /* add new attributes above here, update the policy in devlink.c */ diff --git a/net/core/devlink.c b/net/core/devlink.c index 9716a7f382cb..963178d32dda 100644 --- a/net/core/devlink.c +++ b/net/core/devlink.c @@ -2673,6 +2673,108 @@ static int devlink_nl_cmd_reload(struct sk_buff *skb, struct genl_info *info) return 
devlink->ops->reload(devlink, info->extack); } +static int devlink_nl_flash_update_fill(struct sk_buff *msg, + struct devlink *devlink, + enum devlink_command cmd, + const char *status_msg, + const char *component, + unsigned long done, unsigned long total) +{ + void *hdr; + + hdr = genlmsg_put(msg, 0, 0, &devlink_nl_family, 0, cmd); + if (!hdr) + return -EMSGSIZE; + + if (devlink_nl_put_handle(msg, devlink)) + goto nla_put_failure; + + if (cmd != DEVLINK_CMD_FLASH_UPDATE_STATUS) + goto out; + + if (status_msg && + nla_put_string(msg, DEVLINK_ATTR_FLASH_UPDATE_STATUS_MSG, + status_msg)) + goto nla_put_failure; + if (component && + nla_put_string(msg, DEVLINK_ATTR_FLASH_UPDATE_COMPONENT, + component)) + goto nla_put_failure; + if (nla_put_u64_64bit(msg, DEVLINK_ATTR_FLASH_UPDATE_STATUS_DONE, + done, DEVLINK_ATTR_PAD)) + goto nla_put_failure; + if (nla_put_u64_64bit(msg, DEVLINK_ATTR_FLASH_UPDATE_STATUS_TOTAL, + total, DEVLINK_ATTR_PAD)) + goto nla_put_failure; + +out: + genlmsg_end(msg, hdr); + return 0; + +nla_put_failure: + genlmsg_cancel(msg, hdr); + return -EMSGSIZE; +} + +static void __devlink_flash_update_notify(struct devlink *devlink, + enum devlink_command cmd, + const char *status_msg, + const char *component, + unsigned long done, + unsigned long total) +{ + struct sk_buff *msg; + int err; + + WARN_ON(cmd != DEVLINK_CMD_FLASH_UPDATE && + cmd != DEVLINK_CMD_FLASH_UPDATE_END && + cmd != DEVLINK_CMD_FLASH_UPDATE_STATUS); + + msg = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL); + if (!msg) + return; + + err = devlink_nl_flash_update_fill(msg, devlink, cmd, status_msg, + component, done, total); + if (err) + goto out_free_msg; + + genlmsg_multicast_netns(&devlink_nl_family, devlink_net(devlink), + msg, 0, DEVLINK_MCGRP_CONFIG, GFP_KERNEL); + return; + +out_free_msg: + nlmsg_free(msg); +} + +void devlink_flash_update_begin_notify(struct devlink *devlink) +{ + __devlink_flash_update_notify(devlink, + DEVLINK_CMD_FLASH_UPDATE, + NULL, NULL, 0, 0); +} 
+EXPORT_SYMBOL_GPL(devlink_flash_update_begin_notify); + +void devlink_flash_update_end_notify(struct devlink *devlink) +{ + __devlink_flash_update_notify(devlink, + DEVLINK_CMD_FLASH_UPDATE_END, + NULL, NULL, 0, 0); +} +EXPORT_SYMBOL_GPL(devlink_flash_update_end_notify); + +void devlink_flash_update_status_notify(struct devlink *devlink, + const char *status_msg, + const char *component, + unsigned long done, + unsigned long total) +{ + __devlink_flash_update_notify(devlink, + DEVLINK_CMD_FLASH_UPDATE_STATUS, + status_msg, component, done, total); +} +EXPORT_SYMBOL_GPL(devlink_flash_update_status_notify); + static int devlink_nl_cmd_flash_update(struct sk_buff *skb, struct genl_info *info) { -- cgit v1.2.3-71-gd317 From e9ca90074c26c50c16805fb54de45d1b46a0f1e5 Mon Sep 17 00:00:00 2001 From: Masahiro Yamada Date: Tue, 4 Jun 2019 07:13:34 -0400 Subject: media: do not use C++ style comments in uapi headers Linux kernel tolerates C++ style comments these days. Actually, the SPDX License tags for .c files start with //. On the other hand, uapi headers are written in more strict C, where the C++ comment style is forbidden. [mchehab+samsung@kernel.org: fix a checkpatch --strict warning] Signed-off-by: Masahiro Yamada Signed-off-by: Mauro Carvalho Chehab --- include/uapi/linux/dvb/audio.h | 2 +- include/uapi/linux/dvb/osd.h | 170 ++++++++++++++++++++++++----------------- 2 files changed, 103 insertions(+), 69 deletions(-) (limited to 'include/uapi/linux') diff --git a/include/uapi/linux/dvb/audio.h b/include/uapi/linux/dvb/audio.h index afeae063e640..977bed135e22 100644 --- a/include/uapi/linux/dvb/audio.h +++ b/include/uapi/linux/dvb/audio.h @@ -52,7 +52,7 @@ typedef enum { typedef struct audio_mixer { unsigned int volume_left; unsigned int volume_right; - // what else do we need? bass, pass-through, ... + /* what else do we need? bass, pass-through, ... 
*/ } audio_mixer_t; diff --git a/include/uapi/linux/dvb/osd.h b/include/uapi/linux/dvb/osd.h index e163508b9ae8..07572bc0c864 100644 --- a/include/uapi/linux/dvb/osd.h +++ b/include/uapi/linux/dvb/osd.h @@ -28,74 +28,108 @@ #include <linux/compiler.h> typedef enum { - // All functions return -2 on "not open" - OSD_Close=1, // () - // Disables OSD and releases the buffers - // returns 0 on success - OSD_Open, // (x0,y0,x1,y1,BitPerPixel[2/4/8](color&0x0F),mix[0..15](color&0xF0)) - // Opens OSD with this size and bit depth - // returns 0 on success, -1 on DRAM allocation error, -2 on "already open" - OSD_Show, // () - // enables OSD mode - // returns 0 on success - OSD_Hide, // () - // disables OSD mode - // returns 0 on success - OSD_Clear, // () - // Sets all pixel to color 0 - // returns 0 on success - OSD_Fill, // (color) - // Sets all pixel to color <col> - // returns 0 on success - OSD_SetColor, // (color,R{x0},G{y0},B{x1},opacity{y1}) - // set palette entry <num> to <r,g,b>, <mix> and <trans> apply - // R,G,B: 0..255 - // R=Red, G=Green, B=Blue - // opacity=0: pixel opacity 0% (only video pixel shows) - // opacity=1..254: pixel opacity as specified in header - // opacity=255: pixel opacity 100% (only OSD pixel shows) - // returns 0 on success, -1 on error - OSD_SetPalette, // (firstcolor{color},lastcolor{x0},data) - // Set a number of entries in the palette - // sets the entries "firstcolor" through "lastcolor" from the array "data" - // data has 4 byte for each color: - // R,G,B, and a opacity value: 0->transparent, 1..254->mix, 255->pixel - OSD_SetTrans, // (transparency{color}) - // Sets transparency of mixed pixel (0..15) - // returns 0 on success - OSD_SetPixel, // (x0,y0,color) - // sets pixel <x>,<y> to color number <col> - // returns 0 on success, -1 on error - OSD_GetPixel, // (x0,y0) - // returns color number of pixel <x>,<y>, or -1 - OSD_SetRow, // (x0,y0,x1,data) - // fills pixels x0,y through x1,y with the content of data[] - // returns 0 on success, -1 on clipping all pixel (no pixel drawn) - OSD_SetBlock, //
(x0,y0,x1,y1,increment{color},data) - // fills pixels x0,y0 through x1,y1 with the content of data[] - // inc contains the width of one line in the data block, - // inc<=0 uses blockwidth as linewidth - // returns 0 on success, -1 on clipping all pixel - OSD_FillRow, // (x0,y0,x1,color) - // fills pixels x0,y through x1,y with the color <col> - // returns 0 on success, -1 on clipping all pixel - OSD_FillBlock, // (x0,y0,x1,y1,color) - // fills pixels x0,y0 through x1,y1 with the color <col> - // returns 0 on success, -1 on clipping all pixel - OSD_Line, // (x0,y0,x1,y1,color) - // draw a line from x0,y0 to x1,y1 with the color <col> - // returns 0 on success - OSD_Query, // (x0,y0,x1,y1,xasp{color}}), yasp=11 - // fills parameters with the picture dimensions and the pixel aspect ratio - // returns 0 on success - OSD_Test, // () - // draws a test picture. for debugging purposes only - // returns 0 on success -// TODO: remove "test" in final version - OSD_Text, // (x0,y0,size,color,text) - OSD_SetWindow, // (x0) set window with number 0 + * returns 0 on success + */ + OSD_SetColor, /* (color,R{x0},G{y0},B{x1},opacity{y1}) */ + /* + * set palette entry <num> to <r,g,b>, <mix> and <trans> apply + * R,G,B: 0..255 + * R=Red, G=Green, B=Blue + * opacity=0: pixel opacity 0% (only video pixel shows) + * opacity=1..254: pixel opacity as specified in header + * opacity=255: pixel opacity 100% (only OSD pixel shows) + * returns 0 on success, -1 on error + */ + OSD_SetPalette, /* (firstcolor{color},lastcolor{x0},data) */ + /* + * Set a number of entries in the palette + * sets the entries "firstcolor" through "lastcolor" from the array "data" + * data has 4 byte for each color: + * R,G,B, and a opacity value: 0->transparent, 1..254->mix, 255->pixel + */ + OSD_SetTrans, /* (transparency{color}) */ + /* + * Sets transparency of mixed pixel (0..15) + * returns 0 on success + */ + OSD_SetPixel, /* (x0,y0,color) */ + /* + * sets pixel <x>,<y> to color number <col> + * returns 0 on success, -1 on error + */ + OSD_GetPixel, /* (x0,y0) */ +
/* returns color number of pixel <x>,<y>, or -1 */ + OSD_SetRow, /* (x0,y0,x1,data) */ + /* + * fills pixels x0,y through x1,y with the content of data[] + * returns 0 on success, -1 on clipping all pixel (no pixel drawn) + */ + OSD_SetBlock, /* (x0,y0,x1,y1,increment{color},data) */ + /* + * fills pixels x0,y0 through x1,y1 with the content of data[] + * inc contains the width of one line in the data block, + * inc<=0 uses blockwidth as linewidth + * returns 0 on success, -1 on clipping all pixel + */ + OSD_FillRow, /* (x0,y0,x1,color) */ + /* + * fills pixels x0,y through x1,y with the color <col> + * returns 0 on success, -1 on clipping all pixel + */ + OSD_FillBlock, /* (x0,y0,x1,y1,color) */ + /* + * fills pixels x0,y0 through x1,y1 with the color <col> + * returns 0 on success, -1 on clipping all pixel + */ + OSD_Line, /* (x0,y0,x1,y1,color) */ + /* + * draw a line from x0,y0 to x1,y1 with the color <col> + * returns 0 on success + */ + OSD_Query, /* (x0,y0,x1,y1,xasp{color}}), yasp=11 */ + /* + * fills parameters with the picture dimensions and the pixel aspect ratio + * returns 0 on success + */ + OSD_Test, /* () */ + /* + * draws a test picture. for debugging purposes only + * returns 0 on success + * TODO: remove "test" in final version + */ + OSD_Text, /* (x0,y0,size,color,text) */ + OSD_SetWindow, /* (x0) set window with number 0 Date: Tue, 4 Jun 2019 10:41:20 -0400 Subject: media: dvb: tag deprecated DVB APIs as such There are three headers at DVB that should not be used on future projects: audio.h, osd.h and video.h. While this is already clear at the docs, make clear also at the headers that those files should not be used on future drivers.
Signed-off-by: Mauro Carvalho Chehab --- include/uapi/linux/dvb/audio.h | 4 +++- include/uapi/linux/dvb/osd.h | 4 +++- include/uapi/linux/dvb/video.h | 4 +++- 3 files changed, 9 insertions(+), 3 deletions(-) (limited to 'include/uapi/linux') diff --git a/include/uapi/linux/dvb/audio.h b/include/uapi/linux/dvb/audio.h index 977bed135e22..2f869da69171 100644 --- a/include/uapi/linux/dvb/audio.h +++ b/include/uapi/linux/dvb/audio.h @@ -1,6 +1,8 @@ /* SPDX-License-Identifier: LGPL-2.1+ WITH Linux-syscall-note */ /* - * audio.h + * audio.h - DEPRECATED MPEG-TS audio decoder API + * + * NOTE: should not be used on future drivers * * Copyright (C) 2000 Ralph Metzler * & Marcus Metzler diff --git a/include/uapi/linux/dvb/osd.h b/include/uapi/linux/dvb/osd.h index 07572bc0c864..858997c74043 100644 --- a/include/uapi/linux/dvb/osd.h +++ b/include/uapi/linux/dvb/osd.h @@ -1,6 +1,8 @@ /* SPDX-License-Identifier: LGPL-2.1+ WITH Linux-syscall-note */ /* - * osd.h + * osd.h - DEPRECATED On Screen Display API + * + * NOTE: should not be used on future drivers * * Copyright (C) 2001 Ralph Metzler * & Marcus Metzler diff --git a/include/uapi/linux/dvb/video.h b/include/uapi/linux/dvb/video.h index 43ba8b0a3d14..179f1ec60af6 100644 --- a/include/uapi/linux/dvb/video.h +++ b/include/uapi/linux/dvb/video.h @@ -1,6 +1,8 @@ /* SPDX-License-Identifier: LGPL-2.1+ WITH Linux-syscall-note */ /* - * video.h + * video.h - DEPRECATED MPEG-TS video decoder API + * + * NOTE: should not be used on future drivers * * Copyright (C) 2000 Marcus Metzler * & Ralph Metzler -- cgit v1.2.3-71-gd317 From c54c2c72b2b90a3ba61b8cad032a578ce2bf5b35 Mon Sep 17 00:00:00 2001 From: Anirudh Venkataramanan Date: Thu, 11 Apr 2019 09:11:33 -0700 Subject: net: Add a define for LLDP ethertype Add a new define ETH_P_LLDP for Link Layer Discovery Protocol (LLDP) ethertype. 
Suggested-by: Bruce Allan Signed-off-by: Anirudh Venkataramanan Tested-by: Andrew Bowers Signed-off-by: Jeff Kirsher --- include/uapi/linux/if_ether.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/uapi/linux') diff --git a/include/uapi/linux/if_ether.h b/include/uapi/linux/if_ether.h index 3158ba672b72..f6ceb2e63d1e 100644 --- a/include/uapi/linux/if_ether.h +++ b/include/uapi/linux/if_ether.h @@ -91,6 +91,7 @@ #define ETH_P_802_EX1 0x88B5 /* 802.1 Local Experimental 1. */ #define ETH_P_PREAUTH 0x88C7 /* 802.11 Preauthentication */ #define ETH_P_TIPC 0x88CA /* TIPC */ +#define ETH_P_LLDP 0x88CC /* Link Layer Discovery Protocol */ #define ETH_P_MACSEC 0x88E5 /* 802.1ae MACsec */ #define ETH_P_8021AH 0x88E7 /* 802.1ah Backbone Service Tag */ #define ETH_P_MVRP 0x88F5 /* 802.1Q MVRP */ -- cgit v1.2.3-71-gd317 From fe3475af3bdf38fac78787ec2fe9eedaf2518188 Mon Sep 17 00:00:00 2001 From: Zhu Yanjun Date: Mon, 3 Jun 2019 00:28:01 -0400 Subject: net: rds: add per rds connection cache statistics The variable cache_allocs is to indicate how many frags (KiB) are in one rds connection frag cache. The command "rds-info -Iv" will output the rds connection cache statistics as below: " RDS IB Connections: LocalAddr RemoteAddr Tos SL LocalDev RemoteDev 1.1.1.14 1.1.1.14 58 255 fe80::2:c903:a:7a31 fe80::2:c903:a:7a31 send_wr=256, recv_wr=1024, send_sge=8, rdma_mr_max=4096, rdma_mr_size=257, cache_allocs=12 " This means that there are about 12KiB frag in this rds connection frag cache. Since rds.h in rds-tools is not related with the kernel rds.h, the change in kernel rds.h does not affect rds-tools. rds-info in rds-tools 2.0.5 and 2.0.6 is tested with this commit. It works well. Signed-off-by: Zhu Yanjun Signed-off-by: David S. 
Miller --- include/uapi/linux/rds.h | 2 ++ net/rds/ib.c | 2 ++ 2 files changed, 4 insertions(+) (limited to 'include/uapi/linux') diff --git a/include/uapi/linux/rds.h b/include/uapi/linux/rds.h index 5d0f76c780e5..fd6b5f66e2c5 100644 --- a/include/uapi/linux/rds.h +++ b/include/uapi/linux/rds.h @@ -250,6 +250,7 @@ struct rds_info_rdma_connection { __u32 rdma_mr_max; __u32 rdma_mr_size; __u8 tos; + __u32 cache_allocs; }; struct rds6_info_rdma_connection { @@ -264,6 +265,7 @@ struct rds6_info_rdma_connection { __u32 rdma_mr_max; __u32 rdma_mr_size; __u8 tos; + __u32 cache_allocs; }; /* RDS message Receive Path Latency points */ diff --git a/net/rds/ib.c b/net/rds/ib.c index 2da9b75bad16..f9baf2d5a82a 100644 --- a/net/rds/ib.c +++ b/net/rds/ib.c @@ -318,6 +318,7 @@ static int rds_ib_conn_info_visitor(struct rds_connection *conn, iinfo->max_recv_wr = ic->i_recv_ring.w_nr; iinfo->max_send_sge = rds_ibdev->max_sge; rds_ib_get_mr_info(rds_ibdev, iinfo); + iinfo->cache_allocs = atomic_read(&ic->i_cache_allocs); } return 1; } @@ -351,6 +352,7 @@ static int rds6_ib_conn_info_visitor(struct rds_connection *conn, iinfo6->max_recv_wr = ic->i_recv_ring.w_nr; iinfo6->max_send_sge = rds_ibdev->max_sge; rds6_ib_get_mr_info(rds_ibdev, iinfo6); + iinfo6->cache_allocs = atomic_read(&ic->i_cache_allocs); } return 1; } -- cgit v1.2.3-71-gd317 From ca72efb6bdc733006f335ca12ca615395077a873 Mon Sep 17 00:00:00 2001 From: Robert Hancock Date: Tue, 4 Jun 2019 16:15:01 -0600 Subject: net: phy: Add detection of 1000BaseX link mode support Add 1000BaseX to the link modes which are detected based on the MII_ESTATUS register as per 802.3 Clause 22. This allows PHYs which support 1000BaseX to work properly with drivers using phylink. Previously 1000BaseX support was not detected, and if that was the only mode the PHY indicated support for, phylink would refuse to attach it due to the list of supported modes being empty. Signed-off-by: Robert Hancock Signed-off-by: David S. 
Miller --- drivers/net/phy/phy_device.c | 3 +++ include/uapi/linux/mii.h | 2 ++ 2 files changed, 5 insertions(+) (limited to 'include/uapi/linux') diff --git a/drivers/net/phy/phy_device.c b/drivers/net/phy/phy_device.c index 2c879ba01f35..03c885ec1f98 100644 --- a/drivers/net/phy/phy_device.c +++ b/drivers/net/phy/phy_device.c @@ -1930,6 +1930,9 @@ int genphy_config_init(struct phy_device *phydev) if (val & ESTATUS_1000_THALF) linkmode_set_bit(ETHTOOL_LINK_MODE_1000baseT_Half_BIT, features); + if (val & ESTATUS_1000_XFULL) + linkmode_set_bit(ETHTOOL_LINK_MODE_1000baseX_Full_BIT, + features); } linkmode_and(phydev->supported, phydev->supported, features); diff --git a/include/uapi/linux/mii.h b/include/uapi/linux/mii.h index a506216591d6..51b48e4be1f2 100644 --- a/include/uapi/linux/mii.h +++ b/include/uapi/linux/mii.h @@ -121,6 +121,8 @@ #define EXPANSION_MFAULTS 0x0010 /* Multiple faults detected */ #define EXPANSION_RESV 0xffe0 /* Unused... */ +#define ESTATUS_1000_XFULL 0x8000 /* Can do 1000BaseX Full */ +#define ESTATUS_1000_XHALF 0x4000 /* Can do 1000BaseX Half */ #define ESTATUS_1000_TFULL 0x2000 /* Can do 1000BT Full */ #define ESTATUS_1000_THALF 0x1000 /* Can do 1000BT Half */ -- cgit v1.2.3-71-gd317 From edcd69ab9a323b7ac7a86e1c44b6c9c46598391f Mon Sep 17 00:00:00 2001 From: Jean-Philippe Brucker Date: Tue, 15 Jan 2019 12:19:57 +0000 Subject: iommu: Add virtio-iommu driver The virtio IOMMU is a para-virtualized device, allowing to send IOMMU requests such as map/unmap over virtio transport without emulating page tables. This implementation handles ATTACH, DETACH, MAP and UNMAP requests. The bulk of the code transforms calls coming from the IOMMU API into corresponding virtio requests. Mappings are kept in an interval tree instead of page tables. A little more work is required for modular and x86 support, so for the moment the driver depends on CONFIG_VIRTIO=y and CONFIG_ARM64. 
Tested-by: Bharat Bhushan Tested-by: Eric Auger Reviewed-by: Eric Auger Signed-off-by: Jean-Philippe Brucker Signed-off-by: Michael S. Tsirkin --- MAINTAINERS | 7 + drivers/iommu/Kconfig | 11 + drivers/iommu/Makefile | 1 + drivers/iommu/virtio-iommu.c | 916 ++++++++++++++++++++++++++++++++++++++ include/uapi/linux/virtio_ids.h | 1 + include/uapi/linux/virtio_iommu.h | 106 +++++ 6 files changed, 1042 insertions(+) create mode 100644 drivers/iommu/virtio-iommu.c create mode 100644 include/uapi/linux/virtio_iommu.h (limited to 'include/uapi/linux') diff --git a/MAINTAINERS b/MAINTAINERS index 429c6c624861..62bd1834d95a 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -16807,6 +16807,13 @@ S: Maintained F: drivers/virtio/virtio_input.c F: include/uapi/linux/virtio_input.h +VIRTIO IOMMU DRIVER +M: Jean-Philippe Brucker +L: virtualization@lists.linux-foundation.org +S: Maintained +F: drivers/iommu/virtio-iommu.c +F: include/uapi/linux/virtio_iommu.h + VIRTUAL BOX GUEST DEVICE DRIVER M: Hans de Goede M: Arnd Bergmann diff --git a/drivers/iommu/Kconfig b/drivers/iommu/Kconfig index 83664db5221d..e15cdcd8cb3c 100644 --- a/drivers/iommu/Kconfig +++ b/drivers/iommu/Kconfig @@ -473,4 +473,15 @@ config HYPERV_IOMMU Stub IOMMU driver to handle IRQs as to allow Hyper-V Linux guests to run with x2APIC mode enabled. +config VIRTIO_IOMMU + bool "Virtio IOMMU driver" + depends on VIRTIO=y + depends on ARM64 + select IOMMU_API + select INTERVAL_TREE + help + Para-virtualised IOMMU driver with virtio. + + Say Y here if you intend to run this kernel as a guest. 
+ endif # IOMMU_SUPPORT diff --git a/drivers/iommu/Makefile b/drivers/iommu/Makefile index 8c71a15e986b..f13f36ae1af6 100644 --- a/drivers/iommu/Makefile +++ b/drivers/iommu/Makefile @@ -33,3 +33,4 @@ obj-$(CONFIG_FSL_PAMU) += fsl_pamu.o fsl_pamu_domain.o obj-$(CONFIG_S390_IOMMU) += s390-iommu.o obj-$(CONFIG_QCOM_IOMMU) += qcom_iommu.o obj-$(CONFIG_HYPERV_IOMMU) += hyperv-iommu.o +obj-$(CONFIG_VIRTIO_IOMMU) += virtio-iommu.o diff --git a/drivers/iommu/virtio-iommu.c b/drivers/iommu/virtio-iommu.c new file mode 100644 index 000000000000..6fa012cd727e --- /dev/null +++ b/drivers/iommu/virtio-iommu.c @@ -0,0 +1,916 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Virtio driver for the paravirtualized IOMMU + * + * Copyright (C) 2018 Arm Limited + */ + +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include + +#define MSI_IOVA_BASE 0x8000000 +#define MSI_IOVA_LENGTH 0x100000 + +#define VIOMMU_REQUEST_VQ 0 +#define VIOMMU_NR_VQS 1 + +struct viommu_dev { + struct iommu_device iommu; + struct device *dev; + struct virtio_device *vdev; + + struct ida domain_ids; + + struct virtqueue *vqs[VIOMMU_NR_VQS]; + spinlock_t request_lock; + struct list_head requests; + + /* Device configuration */ + struct iommu_domain_geometry geometry; + u64 pgsize_bitmap; + u8 domain_bits; +}; + +struct viommu_mapping { + phys_addr_t paddr; + struct interval_tree_node iova; + u32 flags; +}; + +struct viommu_domain { + struct iommu_domain domain; + struct viommu_dev *viommu; + struct mutex mutex; /* protects viommu pointer */ + unsigned int id; + + spinlock_t mappings_lock; + struct rb_root_cached mappings; + + unsigned long nr_endpoints; +}; + +struct viommu_endpoint { + struct viommu_dev *viommu; + struct viommu_domain *vdomain; +}; + +struct viommu_request { + struct list_head list; + void *writeback; + unsigned int write_offset; + unsigned int 
len; + char buf[]; +}; + +#define to_viommu_domain(domain) \ + container_of(domain, struct viommu_domain, domain) + +static int viommu_get_req_errno(void *buf, size_t len) +{ + struct virtio_iommu_req_tail *tail = buf + len - sizeof(*tail); + + switch (tail->status) { + case VIRTIO_IOMMU_S_OK: + return 0; + case VIRTIO_IOMMU_S_UNSUPP: + return -ENOSYS; + case VIRTIO_IOMMU_S_INVAL: + return -EINVAL; + case VIRTIO_IOMMU_S_RANGE: + return -ERANGE; + case VIRTIO_IOMMU_S_NOENT: + return -ENOENT; + case VIRTIO_IOMMU_S_FAULT: + return -EFAULT; + case VIRTIO_IOMMU_S_IOERR: + case VIRTIO_IOMMU_S_DEVERR: + default: + return -EIO; + } +} + +static void viommu_set_req_status(void *buf, size_t len, int status) +{ + struct virtio_iommu_req_tail *tail = buf + len - sizeof(*tail); + + tail->status = status; +} + +static off_t viommu_get_write_desc_offset(struct viommu_dev *viommu, + struct virtio_iommu_req_head *req, + size_t len) +{ + size_t tail_size = sizeof(struct virtio_iommu_req_tail); + + return len - tail_size; +} + +/* + * __viommu_sync_req - Complete all in-flight requests + * + * Wait for all added requests to complete. When this function returns, all + * requests that were in-flight at the time of the call have completed. 
+ */ +static int __viommu_sync_req(struct viommu_dev *viommu) +{ + int ret = 0; + unsigned int len; + size_t write_len; + struct viommu_request *req; + struct virtqueue *vq = viommu->vqs[VIOMMU_REQUEST_VQ]; + + assert_spin_locked(&viommu->request_lock); + + virtqueue_kick(vq); + + while (!list_empty(&viommu->requests)) { + len = 0; + req = virtqueue_get_buf(vq, &len); + if (!req) + continue; + + if (!len) + viommu_set_req_status(req->buf, req->len, + VIRTIO_IOMMU_S_IOERR); + + write_len = req->len - req->write_offset; + if (req->writeback && len == write_len) + memcpy(req->writeback, req->buf + req->write_offset, + write_len); + + list_del(&req->list); + kfree(req); + } + + return ret; +} + +static int viommu_sync_req(struct viommu_dev *viommu) +{ + int ret; + unsigned long flags; + + spin_lock_irqsave(&viommu->request_lock, flags); + ret = __viommu_sync_req(viommu); + if (ret) + dev_dbg(viommu->dev, "could not sync requests (%d)\n", ret); + spin_unlock_irqrestore(&viommu->request_lock, flags); + + return ret; +} + +/* + * __viommu_add_request - Add one request to the queue + * @buf: pointer to the request buffer + * @len: length of the request buffer + * @writeback: copy data back to the buffer when the request completes. + * + * Add a request to the queue. Only synchronize the queue if it's already full. + * Otherwise don't kick the queue nor wait for requests to complete. + * + * When @writeback is true, data written by the device, including the request + * status, is copied into @buf after the request completes. This is unsafe if + * the caller allocates @buf on stack and drops the lock between add_req() and + * sync_req(). + * + * Return 0 if the request was successfully added to the queue. 
+ */ +static int __viommu_add_req(struct viommu_dev *viommu, void *buf, size_t len, + bool writeback) +{ + int ret; + off_t write_offset; + struct viommu_request *req; + struct scatterlist top_sg, bottom_sg; + struct scatterlist *sg[2] = { &top_sg, &bottom_sg }; + struct virtqueue *vq = viommu->vqs[VIOMMU_REQUEST_VQ]; + + assert_spin_locked(&viommu->request_lock); + + write_offset = viommu_get_write_desc_offset(viommu, buf, len); + if (write_offset <= 0) + return -EINVAL; + + req = kzalloc(sizeof(*req) + len, GFP_ATOMIC); + if (!req) + return -ENOMEM; + + req->len = len; + if (writeback) { + req->writeback = buf + write_offset; + req->write_offset = write_offset; + } + memcpy(&req->buf, buf, write_offset); + + sg_init_one(&top_sg, req->buf, write_offset); + sg_init_one(&bottom_sg, req->buf + write_offset, len - write_offset); + + ret = virtqueue_add_sgs(vq, sg, 1, 1, req, GFP_ATOMIC); + if (ret == -ENOSPC) { + /* If the queue is full, sync and retry */ + if (!__viommu_sync_req(viommu)) + ret = virtqueue_add_sgs(vq, sg, 1, 1, req, GFP_ATOMIC); + } + if (ret) + goto err_free; + + list_add_tail(&req->list, &viommu->requests); + return 0; + +err_free: + kfree(req); + return ret; +} + +static int viommu_add_req(struct viommu_dev *viommu, void *buf, size_t len) +{ + int ret; + unsigned long flags; + + spin_lock_irqsave(&viommu->request_lock, flags); + ret = __viommu_add_req(viommu, buf, len, false); + if (ret) + dev_dbg(viommu->dev, "could not add request: %d\n", ret); + spin_unlock_irqrestore(&viommu->request_lock, flags); + + return ret; +} + +/* + * Send a request and wait for it to complete. 
Return the request status (as an + * errno) + */ +static int viommu_send_req_sync(struct viommu_dev *viommu, void *buf, + size_t len) +{ + int ret; + unsigned long flags; + + spin_lock_irqsave(&viommu->request_lock, flags); + + ret = __viommu_add_req(viommu, buf, len, true); + if (ret) { + dev_dbg(viommu->dev, "could not add request (%d)\n", ret); + goto out_unlock; + } + + ret = __viommu_sync_req(viommu); + if (ret) { + dev_dbg(viommu->dev, "could not sync requests (%d)\n", ret); + /* Fall-through (get the actual request status) */ + } + + ret = viommu_get_req_errno(buf, len); +out_unlock: + spin_unlock_irqrestore(&viommu->request_lock, flags); + return ret; +} + +/* + * viommu_add_mapping - add a mapping to the internal tree + * + * On success, return the new mapping. Otherwise return NULL. + */ +static int viommu_add_mapping(struct viommu_domain *vdomain, unsigned long iova, + phys_addr_t paddr, size_t size, u32 flags) +{ + unsigned long irqflags; + struct viommu_mapping *mapping; + + mapping = kzalloc(sizeof(*mapping), GFP_ATOMIC); + if (!mapping) + return -ENOMEM; + + mapping->paddr = paddr; + mapping->iova.start = iova; + mapping->iova.last = iova + size - 1; + mapping->flags = flags; + + spin_lock_irqsave(&vdomain->mappings_lock, irqflags); + interval_tree_insert(&mapping->iova, &vdomain->mappings); + spin_unlock_irqrestore(&vdomain->mappings_lock, irqflags); + + return 0; +} + +/* + * viommu_del_mappings - remove mappings from the internal tree + * + * @vdomain: the domain + * @iova: start of the range + * @size: size of the range. A size of 0 corresponds to the entire address + * space. 
+ * + * On success, returns the number of unmapped bytes (>= size) + */ +static size_t viommu_del_mappings(struct viommu_domain *vdomain, + unsigned long iova, size_t size) +{ + size_t unmapped = 0; + unsigned long flags; + unsigned long last = iova + size - 1; + struct viommu_mapping *mapping = NULL; + struct interval_tree_node *node, *next; + + spin_lock_irqsave(&vdomain->mappings_lock, flags); + next = interval_tree_iter_first(&vdomain->mappings, iova, last); + while (next) { + node = next; + mapping = container_of(node, struct viommu_mapping, iova); + next = interval_tree_iter_next(node, iova, last); + + /* Trying to split a mapping? */ + if (mapping->iova.start < iova) + break; + + /* + * Virtio-iommu doesn't allow UNMAP to split a mapping created + * with a single MAP request, so remove the full mapping. + */ + unmapped += mapping->iova.last - mapping->iova.start + 1; + + interval_tree_remove(node, &vdomain->mappings); + kfree(mapping); + } + spin_unlock_irqrestore(&vdomain->mappings_lock, flags); + + return unmapped; +} + +/* + * viommu_replay_mappings - re-send MAP requests + * + * When reattaching a domain that was previously detached from all endpoints, + * mappings were deleted from the device. Re-create the mappings available in + * the internal tree. 
+ */ +static int viommu_replay_mappings(struct viommu_domain *vdomain) +{ + int ret = 0; + unsigned long flags; + struct viommu_mapping *mapping; + struct interval_tree_node *node; + struct virtio_iommu_req_map map; + + spin_lock_irqsave(&vdomain->mappings_lock, flags); + node = interval_tree_iter_first(&vdomain->mappings, 0, -1UL); + while (node) { + mapping = container_of(node, struct viommu_mapping, iova); + map = (struct virtio_iommu_req_map) { + .head.type = VIRTIO_IOMMU_T_MAP, + .domain = cpu_to_le32(vdomain->id), + .virt_start = cpu_to_le64(mapping->iova.start), + .virt_end = cpu_to_le64(mapping->iova.last), + .phys_start = cpu_to_le64(mapping->paddr), + .flags = cpu_to_le32(mapping->flags), + }; + + ret = viommu_send_req_sync(vdomain->viommu, &map, sizeof(map)); + if (ret) + break; + + node = interval_tree_iter_next(node, 0, -1UL); + } + spin_unlock_irqrestore(&vdomain->mappings_lock, flags); + + return ret; +} + +/* IOMMU API */ + +static struct iommu_domain *viommu_domain_alloc(unsigned type) +{ + struct viommu_domain *vdomain; + + if (type != IOMMU_DOMAIN_UNMANAGED && type != IOMMU_DOMAIN_DMA) + return NULL; + + vdomain = kzalloc(sizeof(*vdomain), GFP_KERNEL); + if (!vdomain) + return NULL; + + mutex_init(&vdomain->mutex); + spin_lock_init(&vdomain->mappings_lock); + vdomain->mappings = RB_ROOT_CACHED; + + if (type == IOMMU_DOMAIN_DMA && + iommu_get_dma_cookie(&vdomain->domain)) { + kfree(vdomain); + return NULL; + } + + return &vdomain->domain; +} + +static int viommu_domain_finalise(struct viommu_dev *viommu, + struct iommu_domain *domain) +{ + int ret; + struct viommu_domain *vdomain = to_viommu_domain(domain); + unsigned int max_domain = viommu->domain_bits > 31 ? 
~0 : + (1U << viommu->domain_bits) - 1; + + vdomain->viommu = viommu; + + domain->pgsize_bitmap = viommu->pgsize_bitmap; + domain->geometry = viommu->geometry; + + ret = ida_alloc_max(&viommu->domain_ids, max_domain, GFP_KERNEL); + if (ret >= 0) + vdomain->id = (unsigned int)ret; + + return ret > 0 ? 0 : ret; +} + +static void viommu_domain_free(struct iommu_domain *domain) +{ + struct viommu_domain *vdomain = to_viommu_domain(domain); + + iommu_put_dma_cookie(domain); + + /* Free all remaining mappings (size 2^64) */ + viommu_del_mappings(vdomain, 0, 0); + + if (vdomain->viommu) + ida_free(&vdomain->viommu->domain_ids, vdomain->id); + + kfree(vdomain); +} + +static int viommu_attach_dev(struct iommu_domain *domain, struct device *dev) +{ + int i; + int ret = 0; + struct virtio_iommu_req_attach req; + struct iommu_fwspec *fwspec = dev_iommu_fwspec_get(dev); + struct viommu_endpoint *vdev = fwspec->iommu_priv; + struct viommu_domain *vdomain = to_viommu_domain(domain); + + mutex_lock(&vdomain->mutex); + if (!vdomain->viommu) { + /* + * Properly initialize the domain now that we know which viommu + * owns it. + */ + ret = viommu_domain_finalise(vdev->viommu, domain); + } else if (vdomain->viommu != vdev->viommu) { + dev_err(dev, "cannot attach to foreign vIOMMU\n"); + ret = -EXDEV; + } + mutex_unlock(&vdomain->mutex); + + if (ret) + return ret; + + /* + * In the virtio-iommu device, when attaching the endpoint to a new + * domain, it is detached from the old one and, if as as a result the + * old domain isn't attached to any endpoint, all mappings are removed + * from the old domain and it is freed. + * + * In the driver the old domain still exists, and its mappings will be + * recreated if it gets reattached to an endpoint. Otherwise it will be + * freed explicitly. 
+ * + * vdev->vdomain is protected by group->mutex + */ + if (vdev->vdomain) + vdev->vdomain->nr_endpoints--; + + req = (struct virtio_iommu_req_attach) { + .head.type = VIRTIO_IOMMU_T_ATTACH, + .domain = cpu_to_le32(vdomain->id), + }; + + for (i = 0; i < fwspec->num_ids; i++) { + req.endpoint = cpu_to_le32(fwspec->ids[i]); + + ret = viommu_send_req_sync(vdomain->viommu, &req, sizeof(req)); + if (ret) + return ret; + } + + if (!vdomain->nr_endpoints) { + /* + * This endpoint is the first to be attached to the domain. + * Replay existing mappings (e.g. SW MSI). + */ + ret = viommu_replay_mappings(vdomain); + if (ret) + return ret; + } + + vdomain->nr_endpoints++; + vdev->vdomain = vdomain; + + return 0; +} + +static int viommu_map(struct iommu_domain *domain, unsigned long iova, + phys_addr_t paddr, size_t size, int prot) +{ + int ret; + int flags; + struct virtio_iommu_req_map map; + struct viommu_domain *vdomain = to_viommu_domain(domain); + + flags = (prot & IOMMU_READ ? VIRTIO_IOMMU_MAP_F_READ : 0) | + (prot & IOMMU_WRITE ? VIRTIO_IOMMU_MAP_F_WRITE : 0) | + (prot & IOMMU_MMIO ? 
VIRTIO_IOMMU_MAP_F_MMIO : 0); + + ret = viommu_add_mapping(vdomain, iova, paddr, size, flags); + if (ret) + return ret; + + map = (struct virtio_iommu_req_map) { + .head.type = VIRTIO_IOMMU_T_MAP, + .domain = cpu_to_le32(vdomain->id), + .virt_start = cpu_to_le64(iova), + .phys_start = cpu_to_le64(paddr), + .virt_end = cpu_to_le64(iova + size - 1), + .flags = cpu_to_le32(flags), + }; + + if (!vdomain->nr_endpoints) + return 0; + + ret = viommu_send_req_sync(vdomain->viommu, &map, sizeof(map)); + if (ret) + viommu_del_mappings(vdomain, iova, size); + + return ret; +} + +static size_t viommu_unmap(struct iommu_domain *domain, unsigned long iova, + size_t size) +{ + int ret = 0; + size_t unmapped; + struct virtio_iommu_req_unmap unmap; + struct viommu_domain *vdomain = to_viommu_domain(domain); + + unmapped = viommu_del_mappings(vdomain, iova, size); + if (unmapped < size) + return 0; + + /* Device already removed all mappings after detach. */ + if (!vdomain->nr_endpoints) + return unmapped; + + unmap = (struct virtio_iommu_req_unmap) { + .head.type = VIRTIO_IOMMU_T_UNMAP, + .domain = cpu_to_le32(vdomain->id), + .virt_start = cpu_to_le64(iova), + .virt_end = cpu_to_le64(iova + unmapped - 1), + }; + + ret = viommu_add_req(vdomain->viommu, &unmap, sizeof(unmap)); + return ret ? 
0 : unmapped; +} + +static phys_addr_t viommu_iova_to_phys(struct iommu_domain *domain, + dma_addr_t iova) +{ + u64 paddr = 0; + unsigned long flags; + struct viommu_mapping *mapping; + struct interval_tree_node *node; + struct viommu_domain *vdomain = to_viommu_domain(domain); + + spin_lock_irqsave(&vdomain->mappings_lock, flags); + node = interval_tree_iter_first(&vdomain->mappings, iova, iova); + if (node) { + mapping = container_of(node, struct viommu_mapping, iova); + paddr = mapping->paddr + (iova - mapping->iova.start); + } + spin_unlock_irqrestore(&vdomain->mappings_lock, flags); + + return paddr; +} + +static void viommu_iotlb_sync(struct iommu_domain *domain) +{ + struct viommu_domain *vdomain = to_viommu_domain(domain); + + viommu_sync_req(vdomain->viommu); +} + +static void viommu_get_resv_regions(struct device *dev, struct list_head *head) +{ + struct iommu_resv_region *region; + int prot = IOMMU_WRITE | IOMMU_NOEXEC | IOMMU_MMIO; + + region = iommu_alloc_resv_region(MSI_IOVA_BASE, MSI_IOVA_LENGTH, prot, + IOMMU_RESV_SW_MSI); + if (!region) + return; + + list_add_tail(&region->list, head); + iommu_dma_get_resv_regions(dev, head); +} + +static void viommu_put_resv_regions(struct device *dev, struct list_head *head) +{ + struct iommu_resv_region *entry, *next; + + list_for_each_entry_safe(entry, next, head, list) + kfree(entry); +} + +static struct iommu_ops viommu_ops; +static struct virtio_driver virtio_iommu_drv; + +static int viommu_match_node(struct device *dev, void *data) +{ + return dev->parent->fwnode == data; +} + +static struct viommu_dev *viommu_get_by_fwnode(struct fwnode_handle *fwnode) +{ + struct device *dev = driver_find_device(&virtio_iommu_drv.driver, NULL, + fwnode, viommu_match_node); + put_device(dev); + + return dev ?
dev_to_virtio(dev)->priv : NULL; +} + +static int viommu_add_device(struct device *dev) +{ + int ret; + struct iommu_group *group; + struct viommu_endpoint *vdev; + struct viommu_dev *viommu = NULL; + struct iommu_fwspec *fwspec = dev_iommu_fwspec_get(dev); + + if (!fwspec || fwspec->ops != &viommu_ops) + return -ENODEV; + + viommu = viommu_get_by_fwnode(fwspec->iommu_fwnode); + if (!viommu) + return -ENODEV; + + vdev = kzalloc(sizeof(*vdev), GFP_KERNEL); + if (!vdev) + return -ENOMEM; + + vdev->viommu = viommu; + fwspec->iommu_priv = vdev; + + ret = iommu_device_link(&viommu->iommu, dev); + if (ret) + goto err_free_dev; + + /* + * Last step creates a default domain and attaches to it. Everything + * must be ready. + */ + group = iommu_group_get_for_dev(dev); + if (IS_ERR(group)) { + ret = PTR_ERR(group); + goto err_unlink_dev; + } + + iommu_group_put(group); + + return PTR_ERR_OR_ZERO(group); + +err_unlink_dev: + iommu_device_unlink(&viommu->iommu, dev); +err_free_dev: + kfree(vdev); + + return ret; +} + +static void viommu_remove_device(struct device *dev) +{ + struct viommu_endpoint *vdev; + struct iommu_fwspec *fwspec = dev_iommu_fwspec_get(dev); + + if (!fwspec || fwspec->ops != &viommu_ops) + return; + + vdev = fwspec->iommu_priv; + + iommu_group_remove_device(dev); + iommu_device_unlink(&vdev->viommu->iommu, dev); + kfree(vdev); +} + +static struct iommu_group *viommu_device_group(struct device *dev) +{ + if (dev_is_pci(dev)) + return pci_device_group(dev); + else + return generic_device_group(dev); +} + +static int viommu_of_xlate(struct device *dev, struct of_phandle_args *args) +{ + return iommu_fwspec_add_ids(dev, args->args, 1); +} + +static struct iommu_ops viommu_ops = { + .domain_alloc = viommu_domain_alloc, + .domain_free = viommu_domain_free, + .attach_dev = viommu_attach_dev, + .map = viommu_map, + .unmap = viommu_unmap, + .iova_to_phys = viommu_iova_to_phys, + .iotlb_sync = viommu_iotlb_sync, + .add_device = viommu_add_device, + .remove_device = 
viommu_remove_device, + .device_group = viommu_device_group, + .get_resv_regions = viommu_get_resv_regions, + .put_resv_regions = viommu_put_resv_regions, + .of_xlate = viommu_of_xlate, +}; + +static int viommu_init_vqs(struct viommu_dev *viommu) +{ + struct virtio_device *vdev = dev_to_virtio(viommu->dev); + const char *name = "request"; + void *ret; + + ret = virtio_find_single_vq(vdev, NULL, name); + if (IS_ERR(ret)) { + dev_err(viommu->dev, "cannot find VQ\n"); + return PTR_ERR(ret); + } + + viommu->vqs[VIOMMU_REQUEST_VQ] = ret; + + return 0; +} + +static int viommu_probe(struct virtio_device *vdev) +{ + struct device *parent_dev = vdev->dev.parent; + struct viommu_dev *viommu = NULL; + struct device *dev = &vdev->dev; + u64 input_start = 0; + u64 input_end = -1UL; + int ret; + + if (!virtio_has_feature(vdev, VIRTIO_F_VERSION_1) || + !virtio_has_feature(vdev, VIRTIO_IOMMU_F_MAP_UNMAP)) + return -ENODEV; + + viommu = devm_kzalloc(dev, sizeof(*viommu), GFP_KERNEL); + if (!viommu) + return -ENOMEM; + + spin_lock_init(&viommu->request_lock); + ida_init(&viommu->domain_ids); + viommu->dev = dev; + viommu->vdev = vdev; + INIT_LIST_HEAD(&viommu->requests); + + ret = viommu_init_vqs(viommu); + if (ret) + return ret; + + virtio_cread(vdev, struct virtio_iommu_config, page_size_mask, + &viommu->pgsize_bitmap); + + if (!viommu->pgsize_bitmap) { + ret = -EINVAL; + goto err_free_vqs; + } + + viommu->domain_bits = 32; + + /* Optional features */ + virtio_cread_feature(vdev, VIRTIO_IOMMU_F_INPUT_RANGE, + struct virtio_iommu_config, input_range.start, + &input_start); + + virtio_cread_feature(vdev, VIRTIO_IOMMU_F_INPUT_RANGE, + struct virtio_iommu_config, input_range.end, + &input_end); + + virtio_cread_feature(vdev, VIRTIO_IOMMU_F_DOMAIN_BITS, + struct virtio_iommu_config, domain_bits, + &viommu->domain_bits); + + viommu->geometry = (struct iommu_domain_geometry) { + .aperture_start = input_start, + .aperture_end = input_end, + .force_aperture = true, + }; + + 
viommu_ops.pgsize_bitmap = viommu->pgsize_bitmap; + + virtio_device_ready(vdev); + + ret = iommu_device_sysfs_add(&viommu->iommu, dev, NULL, "%s", + virtio_bus_name(vdev)); + if (ret) + goto err_free_vqs; + + iommu_device_set_ops(&viommu->iommu, &viommu_ops); + iommu_device_set_fwnode(&viommu->iommu, parent_dev->fwnode); + + iommu_device_register(&viommu->iommu); + +#ifdef CONFIG_PCI + if (pci_bus_type.iommu_ops != &viommu_ops) { + pci_request_acs(); + ret = bus_set_iommu(&pci_bus_type, &viommu_ops); + if (ret) + goto err_unregister; + } +#endif +#ifdef CONFIG_ARM_AMBA + if (amba_bustype.iommu_ops != &viommu_ops) { + ret = bus_set_iommu(&amba_bustype, &viommu_ops); + if (ret) + goto err_unregister; + } +#endif + if (platform_bus_type.iommu_ops != &viommu_ops) { + ret = bus_set_iommu(&platform_bus_type, &viommu_ops); + if (ret) + goto err_unregister; + } + + vdev->priv = viommu; + + dev_info(dev, "input address: %u bits\n", + order_base_2(viommu->geometry.aperture_end)); + dev_info(dev, "page mask: %#llx\n", viommu->pgsize_bitmap); + + return 0; + +err_unregister: + iommu_device_sysfs_remove(&viommu->iommu); + iommu_device_unregister(&viommu->iommu); +err_free_vqs: + vdev->config->del_vqs(vdev); + + return ret; +} + +static void viommu_remove(struct virtio_device *vdev) +{ + struct viommu_dev *viommu = vdev->priv; + + iommu_device_sysfs_remove(&viommu->iommu); + iommu_device_unregister(&viommu->iommu); + + /* Stop all virtqueues */ + vdev->config->reset(vdev); + vdev->config->del_vqs(vdev); + + dev_info(&vdev->dev, "device removed\n"); +} + +static void viommu_config_changed(struct virtio_device *vdev) +{ + dev_warn(&vdev->dev, "config changed\n"); +} + +static unsigned int features[] = { + VIRTIO_IOMMU_F_MAP_UNMAP, + VIRTIO_IOMMU_F_DOMAIN_BITS, + VIRTIO_IOMMU_F_INPUT_RANGE, +}; + +static struct virtio_device_id id_table[] = { + { VIRTIO_ID_IOMMU, VIRTIO_DEV_ANY_ID }, + { 0 }, +}; + +static struct virtio_driver virtio_iommu_drv = { + .driver.name = KBUILD_MODNAME, + 
.driver.owner = THIS_MODULE, + .id_table = id_table, + .feature_table = features, + .feature_table_size = ARRAY_SIZE(features), + .probe = viommu_probe, + .remove = viommu_remove, + .config_changed = viommu_config_changed, +}; + +module_virtio_driver(virtio_iommu_drv); + +MODULE_DESCRIPTION("Virtio IOMMU driver"); +MODULE_AUTHOR("Jean-Philippe Brucker "); +MODULE_LICENSE("GPL v2"); diff --git a/include/uapi/linux/virtio_ids.h b/include/uapi/linux/virtio_ids.h index 6d5c3b2d4f4d..cfe47c5d9a56 100644 --- a/include/uapi/linux/virtio_ids.h +++ b/include/uapi/linux/virtio_ids.h @@ -43,5 +43,6 @@ #define VIRTIO_ID_INPUT 18 /* virtio input */ #define VIRTIO_ID_VSOCK 19 /* virtio vsock transport */ #define VIRTIO_ID_CRYPTO 20 /* virtio crypto */ +#define VIRTIO_ID_IOMMU 23 /* virtio IOMMU */ #endif /* _LINUX_VIRTIO_IDS_H */ diff --git a/include/uapi/linux/virtio_iommu.h b/include/uapi/linux/virtio_iommu.h new file mode 100644 index 000000000000..5e5fd62689fb --- /dev/null +++ b/include/uapi/linux/virtio_iommu.h @@ -0,0 +1,106 @@ +/* SPDX-License-Identifier: BSD-3-Clause */ +/* + * Virtio-iommu definition v0.9 + * + * Copyright (C) 2018 Arm Ltd. 
+ */ +#ifndef _UAPI_LINUX_VIRTIO_IOMMU_H +#define _UAPI_LINUX_VIRTIO_IOMMU_H + +#include + +/* Feature bits */ +#define VIRTIO_IOMMU_F_INPUT_RANGE 0 +#define VIRTIO_IOMMU_F_DOMAIN_BITS 1 +#define VIRTIO_IOMMU_F_MAP_UNMAP 2 +#define VIRTIO_IOMMU_F_BYPASS 3 + +struct virtio_iommu_range { + __u64 start; + __u64 end; +}; + +struct virtio_iommu_config { + /* Supported page sizes */ + __u64 page_size_mask; + /* Supported IOVA range */ + struct virtio_iommu_range input_range; + /* Max domain ID size */ + __u8 domain_bits; + __u8 padding[3]; + /* Probe buffer size */ + __u32 probe_size; +}; + +/* Request types */ +#define VIRTIO_IOMMU_T_ATTACH 0x01 +#define VIRTIO_IOMMU_T_DETACH 0x02 +#define VIRTIO_IOMMU_T_MAP 0x03 +#define VIRTIO_IOMMU_T_UNMAP 0x04 + +/* Status types */ +#define VIRTIO_IOMMU_S_OK 0x00 +#define VIRTIO_IOMMU_S_IOERR 0x01 +#define VIRTIO_IOMMU_S_UNSUPP 0x02 +#define VIRTIO_IOMMU_S_DEVERR 0x03 +#define VIRTIO_IOMMU_S_INVAL 0x04 +#define VIRTIO_IOMMU_S_RANGE 0x05 +#define VIRTIO_IOMMU_S_NOENT 0x06 +#define VIRTIO_IOMMU_S_FAULT 0x07 + +struct virtio_iommu_req_head { + __u8 type; + __u8 reserved[3]; +}; + +struct virtio_iommu_req_tail { + __u8 status; + __u8 reserved[3]; +}; + +struct virtio_iommu_req_attach { + struct virtio_iommu_req_head head; + __le32 domain; + __le32 endpoint; + __u8 reserved[8]; + struct virtio_iommu_req_tail tail; +}; + +struct virtio_iommu_req_detach { + struct virtio_iommu_req_head head; + __le32 domain; + __le32 endpoint; + __u8 reserved[8]; + struct virtio_iommu_req_tail tail; +}; + +#define VIRTIO_IOMMU_MAP_F_READ (1 << 0) +#define VIRTIO_IOMMU_MAP_F_WRITE (1 << 1) +#define VIRTIO_IOMMU_MAP_F_EXEC (1 << 2) +#define VIRTIO_IOMMU_MAP_F_MMIO (1 << 3) + +#define VIRTIO_IOMMU_MAP_F_MASK (VIRTIO_IOMMU_MAP_F_READ | \ + VIRTIO_IOMMU_MAP_F_WRITE | \ + VIRTIO_IOMMU_MAP_F_EXEC | \ + VIRTIO_IOMMU_MAP_F_MMIO) + +struct virtio_iommu_req_map { + struct virtio_iommu_req_head head; + __le32 domain; + __le64 virt_start; + __le64 virt_end; + __le64 
phys_start; + __le32 flags; + struct virtio_iommu_req_tail tail; +}; + +struct virtio_iommu_req_unmap { + struct virtio_iommu_req_head head; + __le32 domain; + __le64 virt_start; + __le64 virt_end; + __u8 reserved[4]; + struct virtio_iommu_req_tail tail; +}; + +#endif -- cgit v1.2.3-71-gd317 From 2a5a314874450decec244923209ce6ba97e3ed93 Mon Sep 17 00:00:00 2001 From: Jean-Philippe Brucker Date: Tue, 15 Jan 2019 12:19:58 +0000 Subject: iommu/virtio: Add probe request When the device offers the probe feature, send a probe request for each device managed by the IOMMU. Extract RESV_MEM information. When we encounter a MSI doorbell region, set it up as a IOMMU_RESV_MSI region. This will tell other subsystems that there is no need to map the MSI doorbell in the virtio-iommu, because MSIs bypass it. Tested-by: Bharat Bhushan Tested-by: Eric Auger Reviewed-by: Eric Auger Signed-off-by: Jean-Philippe Brucker Signed-off-by: Michael S. Tsirkin --- drivers/iommu/virtio-iommu.c | 157 ++++++++++++++++++++++++++++++++++++-- include/uapi/linux/virtio_iommu.h | 36 +++++++++ 2 files changed, 187 insertions(+), 6 deletions(-) (limited to 'include/uapi/linux') diff --git a/drivers/iommu/virtio-iommu.c b/drivers/iommu/virtio-iommu.c index 6fa012cd727e..5e194493a531 100644 --- a/drivers/iommu/virtio-iommu.c +++ b/drivers/iommu/virtio-iommu.c @@ -46,6 +46,7 @@ struct viommu_dev { struct iommu_domain_geometry geometry; u64 pgsize_bitmap; u8 domain_bits; + u32 probe_size; }; struct viommu_mapping { @@ -67,8 +68,10 @@ struct viommu_domain { }; struct viommu_endpoint { + struct device *dev; struct viommu_dev *viommu; struct viommu_domain *vdomain; + struct list_head resv_regions; }; struct viommu_request { @@ -119,6 +122,9 @@ static off_t viommu_get_write_desc_offset(struct viommu_dev *viommu, { size_t tail_size = sizeof(struct virtio_iommu_req_tail); + if (req->type == VIRTIO_IOMMU_T_PROBE) + return len - viommu->probe_size - tail_size; + return len - tail_size; } @@ -393,6 +399,110 @@ 
static int viommu_replay_mappings(struct viommu_domain *vdomain) return ret; } +static int viommu_add_resv_mem(struct viommu_endpoint *vdev, + struct virtio_iommu_probe_resv_mem *mem, + size_t len) +{ + size_t size; + u64 start64, end64; + phys_addr_t start, end; + struct iommu_resv_region *region = NULL; + unsigned long prot = IOMMU_WRITE | IOMMU_NOEXEC | IOMMU_MMIO; + + start = start64 = le64_to_cpu(mem->start); + end = end64 = le64_to_cpu(mem->end); + size = end64 - start64 + 1; + + /* Catch any overflow, including the unlikely end64 - start64 + 1 = 0 */ + if (start != start64 || end != end64 || size < end64 - start64) + return -EOVERFLOW; + + if (len < sizeof(*mem)) + return -EINVAL; + + switch (mem->subtype) { + default: + dev_warn(vdev->dev, "unknown resv mem subtype 0x%x\n", + mem->subtype); + /* Fall-through */ + case VIRTIO_IOMMU_RESV_MEM_T_RESERVED: + region = iommu_alloc_resv_region(start, size, 0, + IOMMU_RESV_RESERVED); + break; + case VIRTIO_IOMMU_RESV_MEM_T_MSI: + region = iommu_alloc_resv_region(start, size, prot, + IOMMU_RESV_MSI); + break; + } + if (!region) + return -ENOMEM; + + list_add(&vdev->resv_regions, &region->list); + return 0; +} + +static int viommu_probe_endpoint(struct viommu_dev *viommu, struct device *dev) +{ + int ret; + u16 type, len; + size_t cur = 0; + size_t probe_len; + struct virtio_iommu_req_probe *probe; + struct virtio_iommu_probe_property *prop; + struct iommu_fwspec *fwspec = dev_iommu_fwspec_get(dev); + struct viommu_endpoint *vdev = fwspec->iommu_priv; + + if (!fwspec->num_ids) + return -EINVAL; + + probe_len = sizeof(*probe) + viommu->probe_size + + sizeof(struct virtio_iommu_req_tail); + probe = kzalloc(probe_len, GFP_KERNEL); + if (!probe) + return -ENOMEM; + + probe->head.type = VIRTIO_IOMMU_T_PROBE; + /* + * For now, assume that properties of an endpoint that outputs multiple + * IDs are consistent. Only probe the first one. 
+ */ + probe->endpoint = cpu_to_le32(fwspec->ids[0]); + + ret = viommu_send_req_sync(viommu, probe, probe_len); + if (ret) + goto out_free; + + prop = (void *)probe->properties; + type = le16_to_cpu(prop->type) & VIRTIO_IOMMU_PROBE_T_MASK; + + while (type != VIRTIO_IOMMU_PROBE_T_NONE && + cur < viommu->probe_size) { + len = le16_to_cpu(prop->length) + sizeof(*prop); + + switch (type) { + case VIRTIO_IOMMU_PROBE_T_RESV_MEM: + ret = viommu_add_resv_mem(vdev, (void *)prop, len); + break; + default: + dev_err(dev, "unknown viommu prop 0x%x\n", type); + } + + if (ret) + dev_err(dev, "failed to parse viommu prop 0x%x\n", type); + + cur += len; + if (cur >= viommu->probe_size) + break; + + prop = (void *)probe->properties + cur; + type = le16_to_cpu(prop->type) & VIRTIO_IOMMU_PROBE_T_MASK; + } + +out_free: + kfree(probe); + return ret; +} + /* IOMMU API */ static struct iommu_domain *viommu_domain_alloc(unsigned type) @@ -614,15 +724,34 @@ static void viommu_iotlb_sync(struct iommu_domain *domain) static void viommu_get_resv_regions(struct device *dev, struct list_head *head) { - struct iommu_resv_region *region; + struct iommu_resv_region *entry, *new_entry, *msi = NULL; + struct iommu_fwspec *fwspec = dev_iommu_fwspec_get(dev); + struct viommu_endpoint *vdev = fwspec->iommu_priv; int prot = IOMMU_WRITE | IOMMU_NOEXEC | IOMMU_MMIO; - region = iommu_alloc_resv_region(MSI_IOVA_BASE, MSI_IOVA_LENGTH, prot, - IOMMU_RESV_SW_MSI); - if (!region) - return; + list_for_each_entry(entry, &vdev->resv_regions, list) { + if (entry->type == IOMMU_RESV_MSI) + msi = entry; + + new_entry = kmemdup(entry, sizeof(*entry), GFP_KERNEL); + if (!new_entry) + return; + list_add_tail(&new_entry->list, head); + } + + /* + * If the device didn't register any bypass MSI window, add a + * software-mapped region. 
+ */ + if (!msi) { + msi = iommu_alloc_resv_region(MSI_IOVA_BASE, MSI_IOVA_LENGTH, + prot, IOMMU_RESV_SW_MSI); + if (!msi) + return; + + list_add_tail(&msi->list, head); + } - list_add_tail(&region->list, head); iommu_dma_get_resv_regions(dev, head); } @@ -670,9 +799,18 @@ static int viommu_add_device(struct device *dev) if (!vdev) return -ENOMEM; + vdev->dev = dev; vdev->viommu = viommu; + INIT_LIST_HEAD(&vdev->resv_regions); fwspec->iommu_priv = vdev; + if (viommu->probe_size) { + /* Get additional information for this endpoint */ + ret = viommu_probe_endpoint(viommu, dev); + if (ret) + goto err_free_dev; + } + ret = iommu_device_link(&viommu->iommu, dev); if (ret) goto err_free_dev; @@ -694,6 +832,7 @@ static int viommu_add_device(struct device *dev) err_unlink_dev: iommu_device_unlink(&viommu->iommu, dev); err_free_dev: + viommu_put_resv_regions(dev, &vdev->resv_regions); kfree(vdev); return ret; @@ -711,6 +850,7 @@ static void viommu_remove_device(struct device *dev) iommu_group_remove_device(dev); iommu_device_unlink(&vdev->viommu->iommu, dev); + viommu_put_resv_regions(dev, &vdev->resv_regions); kfree(vdev); } @@ -810,6 +950,10 @@ static int viommu_probe(struct virtio_device *vdev) struct virtio_iommu_config, domain_bits, &viommu->domain_bits); + virtio_cread_feature(vdev, VIRTIO_IOMMU_F_PROBE, + struct virtio_iommu_config, probe_size, + &viommu->probe_size); + viommu->geometry = (struct iommu_domain_geometry) { .aperture_start = input_start, .aperture_end = input_end, @@ -891,6 +1035,7 @@ static unsigned int features[] = { VIRTIO_IOMMU_F_MAP_UNMAP, VIRTIO_IOMMU_F_DOMAIN_BITS, VIRTIO_IOMMU_F_INPUT_RANGE, + VIRTIO_IOMMU_F_PROBE, }; static struct virtio_device_id id_table[] = { diff --git a/include/uapi/linux/virtio_iommu.h b/include/uapi/linux/virtio_iommu.h index 5e5fd62689fb..ae6145cf5928 100644 --- a/include/uapi/linux/virtio_iommu.h +++ b/include/uapi/linux/virtio_iommu.h @@ -14,6 +14,7 @@ #define VIRTIO_IOMMU_F_DOMAIN_BITS 1 #define VIRTIO_IOMMU_F_MAP_UNMAP 
2 #define VIRTIO_IOMMU_F_BYPASS 3 +#define VIRTIO_IOMMU_F_PROBE 4 struct virtio_iommu_range { __u64 start; @@ -37,6 +38,7 @@ struct virtio_iommu_config { #define VIRTIO_IOMMU_T_DETACH 0x02 #define VIRTIO_IOMMU_T_MAP 0x03 #define VIRTIO_IOMMU_T_UNMAP 0x04 +#define VIRTIO_IOMMU_T_PROBE 0x05 /* Status types */ #define VIRTIO_IOMMU_S_OK 0x00 @@ -103,4 +105,38 @@ struct virtio_iommu_req_unmap { struct virtio_iommu_req_tail tail; }; +#define VIRTIO_IOMMU_PROBE_T_NONE 0 +#define VIRTIO_IOMMU_PROBE_T_RESV_MEM 1 + +#define VIRTIO_IOMMU_PROBE_T_MASK 0xfff + +struct virtio_iommu_probe_property { + __le16 type; + __le16 length; +}; + +#define VIRTIO_IOMMU_RESV_MEM_T_RESERVED 0 +#define VIRTIO_IOMMU_RESV_MEM_T_MSI 1 + +struct virtio_iommu_probe_resv_mem { + struct virtio_iommu_probe_property head; + __u8 subtype; + __u8 reserved[3]; + __le64 start; + __le64 end; +}; + +struct virtio_iommu_req_probe { + struct virtio_iommu_req_head head; + __le32 endpoint; + __u8 reserved[64]; + + __u8 properties[]; + + /* + * Tail follows the variable-length properties array. No padding, + * property lengths are all aligned on 8 bytes. + */ +}; + #endif -- cgit v1.2.3-71-gd317 From 169a126c6e88a99578a309a9021f314b5d532c5f Mon Sep 17 00:00:00 2001 From: Jean-Philippe Brucker Date: Tue, 15 Jan 2019 12:19:59 +0000 Subject: iommu/virtio: Add event queue The event queue offers a way for the device to report access faults from endpoints. It is implemented on virtqueue #1. Whenever the host needs to signal a fault, it fills one of the buffers offered by the guest and interrupts it. Tested-by: Bharat Bhushan Tested-by: Eric Auger Reviewed-by: Eric Auger Signed-off-by: Jean-Philippe Brucker Signed-off-by: Michael S. 
Tsirkin --- drivers/iommu/virtio-iommu.c | 115 +++++++++++++++++++++++++++++++++++--- include/uapi/linux/virtio_iommu.h | 19 +++++++ 2 files changed, 125 insertions(+), 9 deletions(-) (limited to 'include/uapi/linux') diff --git a/drivers/iommu/virtio-iommu.c b/drivers/iommu/virtio-iommu.c index 5e194493a531..4620dd221ffd 100644 --- a/drivers/iommu/virtio-iommu.c +++ b/drivers/iommu/virtio-iommu.c @@ -29,7 +29,8 @@ #define MSI_IOVA_LENGTH 0x100000 #define VIOMMU_REQUEST_VQ 0 -#define VIOMMU_NR_VQS 1 +#define VIOMMU_EVENT_VQ 1 +#define VIOMMU_NR_VQS 2 struct viommu_dev { struct iommu_device iommu; @@ -41,6 +42,7 @@ struct viommu_dev { struct virtqueue *vqs[VIOMMU_NR_VQS]; spinlock_t request_lock; struct list_head requests; + void *evts; /* Device configuration */ struct iommu_domain_geometry geometry; @@ -82,6 +84,15 @@ struct viommu_request { char buf[]; }; +#define VIOMMU_FAULT_RESV_MASK 0xffffff00 + +struct viommu_event { + union { + u32 head; + struct virtio_iommu_fault fault; + }; +}; + #define to_viommu_domain(domain) \ container_of(domain, struct viommu_domain, domain) @@ -503,6 +514,68 @@ out_free: return ret; } +static int viommu_fault_handler(struct viommu_dev *viommu, + struct virtio_iommu_fault *fault) +{ + char *reason_str; + + u8 reason = fault->reason; + u32 flags = le32_to_cpu(fault->flags); + u32 endpoint = le32_to_cpu(fault->endpoint); + u64 address = le64_to_cpu(fault->address); + + switch (reason) { + case VIRTIO_IOMMU_FAULT_R_DOMAIN: + reason_str = "domain"; + break; + case VIRTIO_IOMMU_FAULT_R_MAPPING: + reason_str = "page"; + break; + case VIRTIO_IOMMU_FAULT_R_UNKNOWN: + default: + reason_str = "unknown"; + break; + } + + /* TODO: find EP by ID and report_iommu_fault */ + if (flags & VIRTIO_IOMMU_FAULT_F_ADDRESS) + dev_err_ratelimited(viommu->dev, "%s fault from EP %u at %#llx [%s%s%s]\n", + reason_str, endpoint, address, + flags & VIRTIO_IOMMU_FAULT_F_READ ? "R" : "", + flags & VIRTIO_IOMMU_FAULT_F_WRITE ? 
"W" : "", + flags & VIRTIO_IOMMU_FAULT_F_EXEC ? "X" : ""); + else + dev_err_ratelimited(viommu->dev, "%s fault from EP %u\n", + reason_str, endpoint); + return 0; +} + +static void viommu_event_handler(struct virtqueue *vq) +{ + int ret; + unsigned int len; + struct scatterlist sg[1]; + struct viommu_event *evt; + struct viommu_dev *viommu = vq->vdev->priv; + + while ((evt = virtqueue_get_buf(vq, &len)) != NULL) { + if (len > sizeof(*evt)) { + dev_err(viommu->dev, + "invalid event buffer (len %u != %zu)\n", + len, sizeof(*evt)); + } else if (!(evt->head & VIOMMU_FAULT_RESV_MASK)) { + viommu_fault_handler(viommu, &evt->fault); + } + + sg_init_one(sg, evt, sizeof(*evt)); + ret = virtqueue_add_inbuf(vq, sg, 1, evt, GFP_ATOMIC); + if (ret) + dev_err(viommu->dev, "could not add event buffer\n"); + } + + virtqueue_kick(vq); +} + /* IOMMU API */ static struct iommu_domain *viommu_domain_alloc(unsigned type) @@ -886,16 +959,35 @@ static struct iommu_ops viommu_ops = { static int viommu_init_vqs(struct viommu_dev *viommu) { struct virtio_device *vdev = dev_to_virtio(viommu->dev); - const char *name = "request"; - void *ret; + const char *names[] = { "request", "event" }; + vq_callback_t *callbacks[] = { + NULL, /* No async requests */ + viommu_event_handler, + }; - ret = virtio_find_single_vq(vdev, NULL, name); - if (IS_ERR(ret)) { - dev_err(viommu->dev, "cannot find VQ\n"); - return PTR_ERR(ret); - } + return virtio_find_vqs(vdev, VIOMMU_NR_VQS, viommu->vqs, callbacks, + names, NULL); +} - viommu->vqs[VIOMMU_REQUEST_VQ] = ret; +static int viommu_fill_evtq(struct viommu_dev *viommu) +{ + int i, ret; + struct scatterlist sg[1]; + struct viommu_event *evts; + struct virtqueue *vq = viommu->vqs[VIOMMU_EVENT_VQ]; + size_t nr_evts = vq->num_free; + + viommu->evts = evts = devm_kmalloc_array(viommu->dev, nr_evts, + sizeof(*evts), GFP_KERNEL); + if (!evts) + return -ENOMEM; + + for (i = 0; i < nr_evts; i++) { + sg_init_one(sg, &evts[i], sizeof(*evts)); + ret = 
virtqueue_add_inbuf(vq, sg, 1, &evts[i], GFP_KERNEL); + if (ret) + return ret; + } return 0; } @@ -964,6 +1056,11 @@ static int viommu_probe(struct virtio_device *vdev) virtio_device_ready(vdev); + /* Populate the event queue with buffers */ + ret = viommu_fill_evtq(viommu); + if (ret) + goto err_free_vqs; + ret = iommu_device_sysfs_add(&viommu->iommu, dev, NULL, "%s", virtio_bus_name(vdev)); if (ret) diff --git a/include/uapi/linux/virtio_iommu.h b/include/uapi/linux/virtio_iommu.h index ae6145cf5928..ba1b460c9944 100644 --- a/include/uapi/linux/virtio_iommu.h +++ b/include/uapi/linux/virtio_iommu.h @@ -139,4 +139,23 @@ struct virtio_iommu_req_probe { */ }; +/* Fault types */ +#define VIRTIO_IOMMU_FAULT_R_UNKNOWN 0 +#define VIRTIO_IOMMU_FAULT_R_DOMAIN 1 +#define VIRTIO_IOMMU_FAULT_R_MAPPING 2 + +#define VIRTIO_IOMMU_FAULT_F_READ (1 << 0) +#define VIRTIO_IOMMU_FAULT_F_WRITE (1 << 1) +#define VIRTIO_IOMMU_FAULT_F_EXEC (1 << 2) +#define VIRTIO_IOMMU_FAULT_F_ADDRESS (1 << 8) + +struct virtio_iommu_fault { + __u8 reason; + __u8 reserved[3]; + __le32 flags; + __le32 endpoint; + __u8 reserved2[4]; + __le64 address; +}; + #endif -- cgit v1.2.3-71-gd317 From 7f192e3cd316ba58c88dfa26796cf77789dd9872 Mon Sep 17 00:00:00 2001 From: Christian Brauner Date: Sat, 25 May 2019 11:36:41 +0200 Subject: fork: add clone3 This adds the clone3 system call. As mentioned several times already (cf. [7], [8]) here's the promised patchset for clone3(). We recently merged the CLONE_PIDFD patchset (cf. [1]). It took the last free flag from clone(). Independent of the CLONE_PIDFD patchset a time namespace has been discussed at Linux Plumber Conference last year and has been sent out and reviewed (cf. [5]). It is expected that it will go upstream in the not too distant future. However, it relies on the addition of the CLONE_NEWTIME flag to clone(). 
The only other good candidate - CLONE_DETACHED - is currently not recyclable as we have identified at least two large or widely used codebases that currently pass this flag (cf. [2], [3], and [4]). Given that CLONE_PIDFD grabbed the last clone() flag the time namespace is effectively blocked. clone3() has the advantage that it will unblock this patchset again. In general, clone3() is extensible and allows for the implementation of new features. The idea is to keep clone3() very simple and close to the original clone(), specifically, to keep on supporting old clone()-based workloads. We know there have been various creative proposals how a new process creation syscall or even api is supposed to look like. Some people even going so far as to argue that the traditional fork()+exec() split should be abandoned in favor of an in-kernel version of spawn(). Independent of whether or not we personally think spawn() is a good idea this patchset has and does not want to have anything to do with this. One stance we take is that there's no real good alternative to clone()+exec() and we need and want to support this model going forward; independent of spawn(). The following requirements guided clone3(): - bump the number of available flags - move arguments that are currently passed as separate arguments in clone() into a dedicated struct clone_args - choose a struct layout that is easy to handle on 32 and on 64 bit - choose a struct layout that is extensible - give new flags that currently need to abuse another flag's dedicated return argument in clone() their own dedicated return argument (e.g. 
CLONE_PIDFD) - use a separate kernel internal struct kernel_clone_args that is properly typed according to current kernel conventions in fork.c and is different from the uapi struct clone_args - port _do_fork() to use kernel_clone_args so that all process creation syscalls such as fork(), vfork(), clone(), and clone3() behave identical (Arnd suggested, that we can probably also port do_fork() itself in a separate patchset.) - ease of transition for userspace from clone() to clone3() This very much means that we do *not* remove functionality that userspace currently relies on as the latter is a good way of creating a syscall that won't be adopted. - do not try to be clever or complex: keep clone3() as dumb as possible In accordance with Linus suggestions (cf. [11]), clone3() has the following signature: /* uapi */ struct clone_args { __aligned_u64 flags; __aligned_u64 pidfd; __aligned_u64 child_tid; __aligned_u64 parent_tid; __aligned_u64 exit_signal; __aligned_u64 stack; __aligned_u64 stack_size; __aligned_u64 tls; }; /* kernel internal */ struct kernel_clone_args { u64 flags; int __user *pidfd; int __user *child_tid; int __user *parent_tid; int exit_signal; unsigned long stack; unsigned long stack_size; unsigned long tls; }; long sys_clone3(struct clone_args __user *uargs, size_t size) clone3() cleanly supports all of the supported flags from clone() and thus all legacy workloads. The advantage of sticking close to the old clone() is the low cost for userspace to switch to this new api. Quite a lot of userspace apis (e.g. pthreads) are based on the clone() syscall. With the new clone3() syscall supporting all of the old workloads and opening up the ability to add new features should make switching to it for userspace more appealing. In essence, glibc can just write a simple wrapper to switch from clone() to clone3(). There has been some interest in this patchset already. 
We have received a patch from the CRIU corner for clone3() that would set the PID/TID of a restored process without /proc/sys/kernel/ns_last_pid to eliminate a race. /* User visible differences to legacy clone() */ - CLONE_DETACHED will cause EINVAL with clone3() - CSIGNAL is deprecated It is superseded by a dedicated "exit_signal" argument in struct clone_args freeing up space for additional flags. This is based on a suggestion from Andrei and Linus (cf. [9] and [10]) /* References */ [1]: b3e5838252665ee4cfa76b82bdf1198dca81e5be [2]: https://dxr.mozilla.org/mozilla-central/source/security/sandbox/linux/SandboxFilter.cpp#343 [3]: https://git.musl-libc.org/cgit/musl/tree/src/thread/pthread_create.c#n233 [4]: https://sources.debian.org/src/blcr/0.8.5-2.3/cr_module/cr_dump_self.c/?hl=740#L740 [5]: https://lore.kernel.org/lkml/20190425161416.26600-1-dima@arista.com/ [6]: https://lore.kernel.org/lkml/20190425161416.26600-2-dima@arista.com/ [7]: https://lore.kernel.org/lkml/CAHrFyr5HxpGXA2YrKza-oB-GGwJCqwPfyhD-Y5wbktWZdt0sGQ@mail.gmail.com/ [8]: https://lore.kernel.org/lkml/20190524102756.qjsjxukuq2f4t6bo@brauner.io/ [9]: https://lore.kernel.org/lkml/20190529222414.GA6492@gmail.com/ [10]: https://lore.kernel.org/lkml/CAHk-=whQP-Ykxi=zSYaV9iXsHsENa+2fdj-zYKwyeyed63Lsfw@mail.gmail.com/ [11]: https://lore.kernel.org/lkml/CAHk-=wieuV4hGwznPsX-8E0G2FKhx3NjZ9X3dTKh5zKd+iqOBw@mail.gmail.com/ Suggested-by: Linus Torvalds Signed-off-by: Christian Brauner Acked-by: Arnd Bergmann Acked-by: Serge Hallyn Cc: Kees Cook Cc: Pavel Emelyanov Cc: Jann Horn Cc: David Howells Cc: Andrew Morton Cc: Oleg Nesterov Cc: Adrian Reber Cc: Linus Torvalds Cc: Andrei Vagin Cc: Al Viro Cc: Florian Weimer Cc: linux-api@vger.kernel.org --- arch/x86/ia32/sys_ia32.c | 12 ++- include/linux/sched/task.h | 17 +++- include/linux/syscalls.h | 4 + include/uapi/linux/sched.h | 16 ++++ kernel/fork.c | 201 ++++++++++++++++++++++++++++++++++----------- 5 files changed, 199 insertions(+), 51 deletions(-) (limited 
to 'include/uapi/linux') diff --git a/arch/x86/ia32/sys_ia32.c b/arch/x86/ia32/sys_ia32.c index a43212036257..64a6c952091e 100644 --- a/arch/x86/ia32/sys_ia32.c +++ b/arch/x86/ia32/sys_ia32.c @@ -237,6 +237,14 @@ COMPAT_SYSCALL_DEFINE5(x86_clone, unsigned long, clone_flags, unsigned long, newsp, int __user *, parent_tidptr, unsigned long, tls_val, int __user *, child_tidptr) { - return _do_fork(clone_flags, newsp, 0, parent_tidptr, child_tidptr, - tls_val); + struct kernel_clone_args args = { + .flags = (clone_flags & ~CSIGNAL), + .child_tid = child_tidptr, + .parent_tid = parent_tidptr, + .exit_signal = (clone_flags & CSIGNAL), + .stack = newsp, + .tls = tls_val, + }; + + return _do_fork(&args); } diff --git a/include/linux/sched/task.h b/include/linux/sched/task.h index f1227f2c38a4..109a0df5af39 100644 --- a/include/linux/sched/task.h +++ b/include/linux/sched/task.h @@ -8,11 +8,26 @@ */ #include +#include struct task_struct; struct rusage; union thread_union; +/* All the bits taken by the old clone syscall. 
*/ +#define CLONE_LEGACY_FLAGS 0xffffffffULL + +struct kernel_clone_args { + u64 flags; + int __user *pidfd; + int __user *child_tid; + int __user *parent_tid; + int exit_signal; + unsigned long stack; + unsigned long stack_size; + unsigned long tls; +}; + /* * This serializes "schedule()" and also protects * the run-queue from deletions/modifications (but @@ -73,7 +88,7 @@ extern void do_group_exit(int); extern void exit_files(struct task_struct *); extern void exit_itimers(struct signal_struct *); -extern long _do_fork(unsigned long, unsigned long, unsigned long, int __user *, int __user *, unsigned long); +extern long _do_fork(struct kernel_clone_args *kargs); extern long do_fork(unsigned long, unsigned long, unsigned long, int __user *, int __user *); struct task_struct *fork_idle(int); struct mm_struct *copy_init_mm(void); diff --git a/include/linux/syscalls.h b/include/linux/syscalls.h index e2870fe1be5b..60a81f374ca3 100644 --- a/include/linux/syscalls.h +++ b/include/linux/syscalls.h @@ -70,6 +70,7 @@ struct sigaltstack; struct rseq; union bpf_attr; struct io_uring_params; +struct clone_args; #include #include @@ -852,6 +853,9 @@ asmlinkage long sys_clone(unsigned long, unsigned long, int __user *, int __user *, unsigned long); #endif #endif + +asmlinkage long sys_clone3(struct clone_args __user *uargs, size_t size); + asmlinkage long sys_execve(const char __user *filename, const char __user *const __user *argv, const char __user *const __user *envp); diff --git a/include/uapi/linux/sched.h b/include/uapi/linux/sched.h index ed4ee170bee2..f5331dbdcaa2 100644 --- a/include/uapi/linux/sched.h +++ b/include/uapi/linux/sched.h @@ -2,6 +2,8 @@ #ifndef _UAPI_LINUX_SCHED_H #define _UAPI_LINUX_SCHED_H +#include + /* * cloning flags: */ @@ -31,6 +33,20 @@ #define CLONE_NEWNET 0x40000000 /* New network namespace */ #define CLONE_IO 0x80000000 /* Clone io context */ +/* + * Arguments for the clone3 syscall + */ +struct clone_args { + __aligned_u64 flags; + 
__aligned_u64 pidfd; + __aligned_u64 child_tid; + __aligned_u64 parent_tid; + __aligned_u64 exit_signal; + __aligned_u64 stack; + __aligned_u64 stack_size; + __aligned_u64 tls; +}; + /* * Scheduling policies */ diff --git a/kernel/fork.c b/kernel/fork.c index b4cba953040a..08ff131f26b4 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -1760,19 +1760,15 @@ static __always_inline void delayed_free_task(struct task_struct *tsk) * flags). The actual kick-off is left to the caller. */ static __latent_entropy struct task_struct *copy_process( - unsigned long clone_flags, - unsigned long stack_start, - unsigned long stack_size, - int __user *parent_tidptr, - int __user *child_tidptr, struct pid *pid, int trace, - unsigned long tls, - int node) + int node, + struct kernel_clone_args *args) { int pidfd = -1, retval; struct task_struct *p; struct multiprocess_signals delayed; + u64 clone_flags = args->flags; /* * Don't allow sharing the root directory with processes in a different @@ -1821,27 +1817,12 @@ static __latent_entropy struct task_struct *copy_process( } if (clone_flags & CLONE_PIDFD) { - int reserved; - /* - * - CLONE_PARENT_SETTID is useless for pidfds and also - * parent_tidptr is used to return pidfds. * - CLONE_DETACHED is blocked so that we can potentially * reuse it later for CLONE_PIDFD. * - CLONE_THREAD is blocked until someone really needs it. */ - if (clone_flags & - (CLONE_DETACHED | CLONE_PARENT_SETTID | CLONE_THREAD)) - return ERR_PTR(-EINVAL); - - /* - * Verify that parent_tidptr is sane so we can potentially - * reuse it later. - */ - if (get_user(reserved, parent_tidptr)) - return ERR_PTR(-EFAULT); - - if (reserved != 0) + if (clone_flags & (CLONE_DETACHED | CLONE_THREAD)) return ERR_PTR(-EINVAL); } @@ -1874,11 +1855,11 @@ static __latent_entropy struct task_struct *copy_process( * p->set_child_tid which is (ab)used as a kthread's data pointer for * kernel threads (PF_KTHREAD). */ - p->set_child_tid = (clone_flags & CLONE_CHILD_SETTID) ? 
child_tidptr : NULL; + p->set_child_tid = (clone_flags & CLONE_CHILD_SETTID) ? args->child_tid : NULL; /* * Clear TID on mm_release()? */ - p->clear_child_tid = (clone_flags & CLONE_CHILD_CLEARTID) ? child_tidptr : NULL; + p->clear_child_tid = (clone_flags & CLONE_CHILD_CLEARTID) ? args->child_tid : NULL; ftrace_graph_init_task(p); @@ -2037,7 +2018,8 @@ static __latent_entropy struct task_struct *copy_process( retval = copy_io(clone_flags, p); if (retval) goto bad_fork_cleanup_namespaces; - retval = copy_thread_tls(clone_flags, stack_start, stack_size, p, tls); + retval = copy_thread_tls(clone_flags, args->stack, args->stack_size, p, + args->tls); if (retval) goto bad_fork_cleanup_io; @@ -2062,7 +2044,7 @@ static __latent_entropy struct task_struct *copy_process( goto bad_fork_free_pid; pidfd = retval; - retval = put_user(pidfd, parent_tidptr); + retval = put_user(pidfd, args->pidfd); if (retval) goto bad_fork_put_pidfd; } @@ -2105,7 +2087,7 @@ static __latent_entropy struct task_struct *copy_process( if (clone_flags & CLONE_PARENT) p->exit_signal = current->group_leader->exit_signal; else - p->exit_signal = (clone_flags & CSIGNAL); + p->exit_signal = args->exit_signal; p->group_leader = p; p->tgid = p->pid; } @@ -2313,8 +2295,11 @@ static inline void init_idle_pids(struct task_struct *idle) struct task_struct *fork_idle(int cpu) { struct task_struct *task; - task = copy_process(CLONE_VM, 0, 0, NULL, NULL, &init_struct_pid, 0, 0, - cpu_to_node(cpu)); + struct kernel_clone_args args = { + .flags = CLONE_VM, + }; + + task = copy_process(&init_struct_pid, 0, cpu_to_node(cpu), &args); if (!IS_ERR(task)) { init_idle_pids(task); init_idle(task, cpu); @@ -2334,13 +2319,9 @@ struct mm_struct *copy_init_mm(void) * It copies the process, and if successful kick-starts * it and waits for it to finish using the VM if required. 
*/ -long _do_fork(unsigned long clone_flags, - unsigned long stack_start, - unsigned long stack_size, - int __user *parent_tidptr, - int __user *child_tidptr, - unsigned long tls) +long _do_fork(struct kernel_clone_args *args) { + u64 clone_flags = args->flags; struct completion vfork; struct pid *pid; struct task_struct *p; @@ -2356,7 +2337,7 @@ long _do_fork(unsigned long clone_flags, if (!(clone_flags & CLONE_UNTRACED)) { if (clone_flags & CLONE_VFORK) trace = PTRACE_EVENT_VFORK; - else if ((clone_flags & CSIGNAL) != SIGCHLD) + else if (args->exit_signal != SIGCHLD) trace = PTRACE_EVENT_CLONE; else trace = PTRACE_EVENT_FORK; @@ -2365,8 +2346,7 @@ long _do_fork(unsigned long clone_flags, trace = 0; } - p = copy_process(clone_flags, stack_start, stack_size, parent_tidptr, - child_tidptr, NULL, trace, tls, NUMA_NO_NODE); + p = copy_process(NULL, trace, NUMA_NO_NODE, args); add_latent_entropy(); if (IS_ERR(p)) @@ -2382,7 +2362,7 @@ long _do_fork(unsigned long clone_flags, nr = pid_vnr(pid); if (clone_flags & CLONE_PARENT_SETTID) - put_user(nr, parent_tidptr); + put_user(nr, args->parent_tid); if (clone_flags & CLONE_VFORK) { p->vfork_done = &vfork; @@ -2414,8 +2394,16 @@ long do_fork(unsigned long clone_flags, int __user *parent_tidptr, int __user *child_tidptr) { - return _do_fork(clone_flags, stack_start, stack_size, - parent_tidptr, child_tidptr, 0); + struct kernel_clone_args args = { + .flags = (clone_flags & ~CSIGNAL), + .child_tid = child_tidptr, + .parent_tid = parent_tidptr, + .exit_signal = (clone_flags & CSIGNAL), + .stack = stack_start, + .stack_size = stack_size, + }; + + return _do_fork(&args); } #endif @@ -2424,15 +2412,25 @@ long do_fork(unsigned long clone_flags, */ pid_t kernel_thread(int (*fn)(void *), void *arg, unsigned long flags) { - return _do_fork(flags|CLONE_VM|CLONE_UNTRACED, (unsigned long)fn, - (unsigned long)arg, NULL, NULL, 0); + struct kernel_clone_args args = { + .flags = ((flags | CLONE_VM | CLONE_UNTRACED) & ~CSIGNAL), + 
.exit_signal = (flags & CSIGNAL), + .stack = (unsigned long)fn, + .stack_size = (unsigned long)arg, + }; + + return _do_fork(&args); } #ifdef __ARCH_WANT_SYS_FORK SYSCALL_DEFINE0(fork) { #ifdef CONFIG_MMU - return _do_fork(SIGCHLD, 0, 0, NULL, NULL, 0); + struct kernel_clone_args args = { + .exit_signal = SIGCHLD, + }; + + return _do_fork(&args); #else /* can not support in nommu mode */ return -EINVAL; @@ -2443,8 +2441,12 @@ SYSCALL_DEFINE0(fork) #ifdef __ARCH_WANT_SYS_VFORK SYSCALL_DEFINE0(vfork) { - return _do_fork(CLONE_VFORK | CLONE_VM | SIGCHLD, 0, - 0, NULL, NULL, 0); + struct kernel_clone_args args = { + .flags = CLONE_VFORK | CLONE_VM, + .exit_signal = SIGCHLD, + }; + + return _do_fork(&args); } #endif @@ -2472,7 +2474,110 @@ SYSCALL_DEFINE5(clone, unsigned long, clone_flags, unsigned long, newsp, unsigned long, tls) #endif { - return _do_fork(clone_flags, newsp, 0, parent_tidptr, child_tidptr, tls); + struct kernel_clone_args args = { + .flags = (clone_flags & ~CSIGNAL), + .pidfd = parent_tidptr, + .child_tid = child_tidptr, + .parent_tid = parent_tidptr, + .exit_signal = (clone_flags & CSIGNAL), + .stack = newsp, + .tls = tls, + }; + + /* clone(CLONE_PIDFD) uses parent_tidptr to return a pidfd */ + if ((clone_flags & CLONE_PIDFD) && (clone_flags & CLONE_PARENT_SETTID)) + return -EINVAL; + + return _do_fork(&args); +} + +noinline static int copy_clone_args_from_user(struct kernel_clone_args *kargs, + struct clone_args __user *uargs, + size_t size) +{ + struct clone_args args; + + if (unlikely(size > PAGE_SIZE)) + return -E2BIG; + + if (unlikely(size < sizeof(struct clone_args))) + return -EINVAL; + + if (unlikely(!access_ok(uargs, size))) + return -EFAULT; + + if (size > sizeof(struct clone_args)) { + unsigned char __user *addr; + unsigned char __user *end; + unsigned char val; + + addr = (void __user *)uargs + sizeof(struct clone_args); + end = (void __user *)uargs + size; + + for (; addr < end; addr++) { + if (get_user(val, addr)) + return -EFAULT; + if 
(val) + return -E2BIG; + } + + size = sizeof(struct clone_args); + } + + if (copy_from_user(&args, uargs, size)) + return -EFAULT; + + *kargs = (struct kernel_clone_args){ + .flags = args.flags, + .pidfd = u64_to_user_ptr(args.pidfd), + .child_tid = u64_to_user_ptr(args.child_tid), + .parent_tid = u64_to_user_ptr(args.parent_tid), + .exit_signal = args.exit_signal, + .stack = args.stack, + .stack_size = args.stack_size, + .tls = args.tls, + }; + + return 0; +} + +static bool clone3_args_valid(const struct kernel_clone_args *kargs) +{ + /* + * All lower bits of the flag word are taken. + * Verify that no other unknown flags are passed along. + */ + if (kargs->flags & ~CLONE_LEGACY_FLAGS) + return false; + + /* + * - make the CLONE_DETACHED bit reuseable for clone3 + * - make the CSIGNAL bits reuseable for clone3 + */ + if (kargs->flags & (CLONE_DETACHED | CSIGNAL)) + return false; + + if ((kargs->flags & (CLONE_THREAD | CLONE_PARENT)) && + kargs->exit_signal) + return false; + + return true; +} + +SYSCALL_DEFINE2(clone3, struct clone_args __user *, uargs, size_t, size) +{ + int err; + + struct kernel_clone_args kargs; + + err = copy_clone_args_from_user(&kargs, uargs, size); + if (err) + return err; + + if (!clone3_args_valid(&kargs)) + return -EINVAL; + + return _do_fork(&kargs); } #endif -- cgit v1.2.3-71-gd317 From fe03d4745675cbd678cb8c50d951df0abafdcaee Mon Sep 17 00:00:00 2001 From: Jozsef Kadlecsik Date: Mon, 10 Jun 2019 13:00:24 +0200 Subject: Update my email address It's better to use my kadlec@netfilter.org email address in the source code. I might not be able to use kadlec@blackhole.kfki.hu in the future. 
Signed-off-by: Jozsef Kadlecsik Signed-off-by: Jozsef Kadlecsik --- CREDITS | 2 +- MAINTAINERS | 2 +- include/linux/jhash.h | 2 +- include/linux/netfilter/ipset/ip_set.h | 2 +- include/linux/netfilter/ipset/ip_set_counter.h | 2 +- include/linux/netfilter/ipset/ip_set_skbinfo.h | 2 +- include/linux/netfilter/ipset/ip_set_timeout.h | 2 +- include/uapi/linux/netfilter/ipset/ip_set.h | 2 +- net/ipv4/netfilter/iptable_raw.c | 2 +- net/ipv4/netfilter/nf_nat_h323.c | 2 +- net/ipv6/netfilter/ip6table_raw.c | 2 +- net/netfilter/ipset/ip_set_bitmap_gen.h | 2 +- net/netfilter/ipset/ip_set_bitmap_ip.c | 4 ++-- net/netfilter/ipset/ip_set_bitmap_ipmac.c | 4 ++-- net/netfilter/ipset/ip_set_bitmap_port.c | 4 ++-- net/netfilter/ipset/ip_set_core.c | 4 ++-- net/netfilter/ipset/ip_set_getport.c | 2 +- net/netfilter/ipset/ip_set_hash_gen.h | 2 +- net/netfilter/ipset/ip_set_hash_ip.c | 4 ++-- net/netfilter/ipset/ip_set_hash_ipmark.c | 2 +- net/netfilter/ipset/ip_set_hash_ipport.c | 4 ++-- net/netfilter/ipset/ip_set_hash_ipportip.c | 4 ++-- net/netfilter/ipset/ip_set_hash_ipportnet.c | 4 ++-- net/netfilter/ipset/ip_set_hash_mac.c | 4 ++-- net/netfilter/ipset/ip_set_hash_net.c | 4 ++-- net/netfilter/ipset/ip_set_hash_netiface.c | 4 ++-- net/netfilter/ipset/ip_set_hash_netnet.c | 2 +- net/netfilter/ipset/ip_set_hash_netport.c | 4 ++-- net/netfilter/ipset/ip_set_hash_netportnet.c | 2 +- net/netfilter/ipset/ip_set_list_set.c | 4 ++-- net/netfilter/nf_conntrack_h323_main.c | 2 +- net/netfilter/nf_conntrack_proto_tcp.c | 2 +- net/netfilter/xt_iprange.c | 4 ++-- net/netfilter/xt_set.c | 4 ++-- 34 files changed, 49 insertions(+), 49 deletions(-) (limited to 'include/uapi/linux') diff --git a/CREDITS b/CREDITS index 8e0342620a06..4200f4f91a16 100644 --- a/CREDITS +++ b/CREDITS @@ -1800,7 +1800,7 @@ S: 2300 Copenhagen S. 
S: Denmark N: Jozsef Kadlecsik -E: kadlec@blackhole.kfki.hu +E: kadlec@netfilter.org P: 1024D/470DB964 4CB3 1A05 713E 9BF7 FAC5 5809 DD8C B7B1 470D B964 D: netfilter: TCP window tracking code D: netfilter: raw table diff --git a/MAINTAINERS b/MAINTAINERS index fcbd648b960e..4c65ce86fc9e 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -10858,7 +10858,7 @@ F: drivers/net/ethernet/neterion/ NETFILTER M: Pablo Neira Ayuso -M: Jozsef Kadlecsik +M: Jozsef Kadlecsik M: Florian Westphal L: netfilter-devel@vger.kernel.org L: coreteam@netfilter.org diff --git a/include/linux/jhash.h b/include/linux/jhash.h index 8037850f3104..ba2f6a9776b6 100644 --- a/include/linux/jhash.h +++ b/include/linux/jhash.h @@ -17,7 +17,7 @@ * if SELF_TEST is defined. You can use this free for any purpose. It's in * the public domain. It has no warranty. * - * Copyright (C) 2009-2010 Jozsef Kadlecsik (kadlec@blackhole.kfki.hu) + * Copyright (C) 2009-2010 Jozsef Kadlecsik (kadlec@netfilter.org) * * I've modified Bob's hash to be useful in the Linux kernel, and * any bugs present are my fault. 
diff --git a/include/linux/netfilter/ipset/ip_set.h b/include/linux/netfilter/ipset/ip_set.h index e499d170f12d..f5c6e7cd6469 100644 --- a/include/linux/netfilter/ipset/ip_set.h +++ b/include/linux/netfilter/ipset/ip_set.h @@ -1,7 +1,7 @@ /* Copyright (C) 2000-2002 Joakim Axelsson * Patrick Schaaf * Martin Josefsson - * Copyright (C) 2003-2013 Jozsef Kadlecsik + * Copyright (C) 2003-2013 Jozsef Kadlecsik * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License version 2 as diff --git a/include/linux/netfilter/ipset/ip_set_counter.h b/include/linux/netfilter/ipset/ip_set_counter.h index 3d33a2c3f39f..305aeda2a899 100644 --- a/include/linux/netfilter/ipset/ip_set_counter.h +++ b/include/linux/netfilter/ipset/ip_set_counter.h @@ -1,7 +1,7 @@ #ifndef _IP_SET_COUNTER_H #define _IP_SET_COUNTER_H -/* Copyright (C) 2015 Jozsef Kadlecsik +/* Copyright (C) 2015 Jozsef Kadlecsik * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License version 2 as diff --git a/include/linux/netfilter/ipset/ip_set_skbinfo.h b/include/linux/netfilter/ipset/ip_set_skbinfo.h index 29d7ef2bc3fa..fac57ef854c2 100644 --- a/include/linux/netfilter/ipset/ip_set_skbinfo.h +++ b/include/linux/netfilter/ipset/ip_set_skbinfo.h @@ -1,7 +1,7 @@ #ifndef _IP_SET_SKBINFO_H #define _IP_SET_SKBINFO_H -/* Copyright (C) 2015 Jozsef Kadlecsik +/* Copyright (C) 2015 Jozsef Kadlecsik * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License version 2 as diff --git a/include/linux/netfilter/ipset/ip_set_timeout.h b/include/linux/netfilter/ipset/ip_set_timeout.h index 8ce271e187b6..dc74150f3432 100644 --- a/include/linux/netfilter/ipset/ip_set_timeout.h +++ b/include/linux/netfilter/ipset/ip_set_timeout.h @@ -1,7 +1,7 @@ #ifndef _IP_SET_TIMEOUT_H #define _IP_SET_TIMEOUT_H -/* Copyright (C) 2003-2013 Jozsef 
Kadlecsik +/* Copyright (C) 2003-2013 Jozsef Kadlecsik * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License version 2 as diff --git a/include/uapi/linux/netfilter/ipset/ip_set.h b/include/uapi/linux/netfilter/ipset/ip_set.h index ea69ca21ff23..eea166c52c36 100644 --- a/include/uapi/linux/netfilter/ipset/ip_set.h +++ b/include/uapi/linux/netfilter/ipset/ip_set.h @@ -2,7 +2,7 @@ /* Copyright (C) 2000-2002 Joakim Axelsson * Patrick Schaaf * Martin Josefsson - * Copyright (C) 2003-2011 Jozsef Kadlecsik + * Copyright (C) 2003-2011 Jozsef Kadlecsik * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License version 2 as diff --git a/net/ipv4/netfilter/iptable_raw.c b/net/ipv4/netfilter/iptable_raw.c index 6eefde5bc468..69697eb4bfc6 100644 --- a/net/ipv4/netfilter/iptable_raw.c +++ b/net/ipv4/netfilter/iptable_raw.c @@ -2,7 +2,7 @@ /* * 'raw' table, which is the very first hooked in at PRE_ROUTING and LOCAL_OUT . * - * Copyright (C) 2003 Jozsef Kadlecsik + * Copyright (C) 2003 Jozsef Kadlecsik */ #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt #include diff --git a/net/ipv4/netfilter/nf_nat_h323.c b/net/ipv4/netfilter/nf_nat_h323.c index 15f2b2604890..076b6b29d66d 100644 --- a/net/ipv4/netfilter/nf_nat_h323.c +++ b/net/ipv4/netfilter/nf_nat_h323.c @@ -7,7 +7,7 @@ * This source code is licensed under General Public License version 2. 
* * Based on the 'brute force' H.323 NAT module by - * Jozsef Kadlecsik + * Jozsef Kadlecsik */ #include diff --git a/net/ipv6/netfilter/ip6table_raw.c b/net/ipv6/netfilter/ip6table_raw.c index 3f7d4691c423..a22100b1cf2c 100644 --- a/net/ipv6/netfilter/ip6table_raw.c +++ b/net/ipv6/netfilter/ip6table_raw.c @@ -2,7 +2,7 @@ /* * IPv6 raw table, a port of the IPv4 raw table to IPv6 * - * Copyright (C) 2003 Jozsef Kadlecsik + * Copyright (C) 2003 Jozsef Kadlecsik */ #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt #include diff --git a/net/netfilter/ipset/ip_set_bitmap_gen.h b/net/netfilter/ipset/ip_set_bitmap_gen.h index 38ef2ea838cb..29c1e9a50601 100644 --- a/net/netfilter/ipset/ip_set_bitmap_gen.h +++ b/net/netfilter/ipset/ip_set_bitmap_gen.h @@ -1,4 +1,4 @@ -/* Copyright (C) 2013 Jozsef Kadlecsik +/* Copyright (C) 2013 Jozsef Kadlecsik * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License version 2 as diff --git a/net/netfilter/ipset/ip_set_bitmap_ip.c b/net/netfilter/ipset/ip_set_bitmap_ip.c index 488d6d05c65c..5a66c5499700 100644 --- a/net/netfilter/ipset/ip_set_bitmap_ip.c +++ b/net/netfilter/ipset/ip_set_bitmap_ip.c @@ -1,6 +1,6 @@ /* Copyright (C) 2000-2002 Joakim Axelsson * Patrick Schaaf - * Copyright (C) 2003-2013 Jozsef Kadlecsik + * Copyright (C) 2003-2013 Jozsef Kadlecsik * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License version 2 as @@ -31,7 +31,7 @@ #define IPSET_TYPE_REV_MAX 3 /* skbinfo support added */ MODULE_LICENSE("GPL"); -MODULE_AUTHOR("Jozsef Kadlecsik "); +MODULE_AUTHOR("Jozsef Kadlecsik "); IP_SET_MODULE_DESC("bitmap:ip", IPSET_TYPE_REV_MIN, IPSET_TYPE_REV_MAX); MODULE_ALIAS("ip_set_bitmap:ip"); diff --git a/net/netfilter/ipset/ip_set_bitmap_ipmac.c b/net/netfilter/ipset/ip_set_bitmap_ipmac.c index 980000fc3b50..ec7a8b12642c 100644 --- a/net/netfilter/ipset/ip_set_bitmap_ipmac.c +++ 
b/net/netfilter/ipset/ip_set_bitmap_ipmac.c @@ -1,7 +1,7 @@ /* Copyright (C) 2000-2002 Joakim Axelsson * Patrick Schaaf * Martin Josefsson - * Copyright (C) 2003-2013 Jozsef Kadlecsik + * Copyright (C) 2003-2013 Jozsef Kadlecsik * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License version 2 as @@ -31,7 +31,7 @@ #define IPSET_TYPE_REV_MAX 3 /* skbinfo support added */ MODULE_LICENSE("GPL"); -MODULE_AUTHOR("Jozsef Kadlecsik "); +MODULE_AUTHOR("Jozsef Kadlecsik "); IP_SET_MODULE_DESC("bitmap:ip,mac", IPSET_TYPE_REV_MIN, IPSET_TYPE_REV_MAX); MODULE_ALIAS("ip_set_bitmap:ip,mac"); diff --git a/net/netfilter/ipset/ip_set_bitmap_port.c b/net/netfilter/ipset/ip_set_bitmap_port.c index b561ca8b3659..18275ec4924c 100644 --- a/net/netfilter/ipset/ip_set_bitmap_port.c +++ b/net/netfilter/ipset/ip_set_bitmap_port.c @@ -1,4 +1,4 @@ -/* Copyright (C) 2003-2013 Jozsef Kadlecsik +/* Copyright (C) 2003-2013 Jozsef Kadlecsik * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License version 2 as @@ -26,7 +26,7 @@ #define IPSET_TYPE_REV_MAX 3 /* skbinfo support added */ MODULE_LICENSE("GPL"); -MODULE_AUTHOR("Jozsef Kadlecsik "); +MODULE_AUTHOR("Jozsef Kadlecsik "); IP_SET_MODULE_DESC("bitmap:port", IPSET_TYPE_REV_MIN, IPSET_TYPE_REV_MAX); MODULE_ALIAS("ip_set_bitmap:port"); diff --git a/net/netfilter/ipset/ip_set_core.c b/net/netfilter/ipset/ip_set_core.c index 039892cd2b7d..18430ad2fdf2 100644 --- a/net/netfilter/ipset/ip_set_core.c +++ b/net/netfilter/ipset/ip_set_core.c @@ -1,6 +1,6 @@ /* Copyright (C) 2000-2002 Joakim Axelsson * Patrick Schaaf - * Copyright (C) 2003-2013 Jozsef Kadlecsik + * Copyright (C) 2003-2013 Jozsef Kadlecsik * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License version 2 as @@ -51,7 +51,7 @@ static unsigned int max_sets; module_param(max_sets, 
int, 0600); MODULE_PARM_DESC(max_sets, "maximal number of sets"); MODULE_LICENSE("GPL"); -MODULE_AUTHOR("Jozsef Kadlecsik "); +MODULE_AUTHOR("Jozsef Kadlecsik "); MODULE_DESCRIPTION("core IP set support"); MODULE_ALIAS_NFNL_SUBSYS(NFNL_SUBSYS_IPSET); diff --git a/net/netfilter/ipset/ip_set_getport.c b/net/netfilter/ipset/ip_set_getport.c index 3f09cdb42562..dc7b46b41354 100644 --- a/net/netfilter/ipset/ip_set_getport.c +++ b/net/netfilter/ipset/ip_set_getport.c @@ -1,4 +1,4 @@ -/* Copyright (C) 2003-2011 Jozsef Kadlecsik +/* Copyright (C) 2003-2011 Jozsef Kadlecsik * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License version 2 as diff --git a/net/netfilter/ipset/ip_set_hash_gen.h b/net/netfilter/ipset/ip_set_hash_gen.h index 623e0d675725..07ef941130a6 100644 --- a/net/netfilter/ipset/ip_set_hash_gen.h +++ b/net/netfilter/ipset/ip_set_hash_gen.h @@ -1,4 +1,4 @@ -/* Copyright (C) 2013 Jozsef Kadlecsik +/* Copyright (C) 2013 Jozsef Kadlecsik * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License version 2 as diff --git a/net/netfilter/ipset/ip_set_hash_ip.c b/net/netfilter/ipset/ip_set_hash_ip.c index 613eb212cb48..7b82bf1104ce 100644 --- a/net/netfilter/ipset/ip_set_hash_ip.c +++ b/net/netfilter/ipset/ip_set_hash_ip.c @@ -1,4 +1,4 @@ -/* Copyright (C) 2003-2013 Jozsef Kadlecsik +/* Copyright (C) 2003-2013 Jozsef Kadlecsik * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License version 2 as @@ -30,7 +30,7 @@ #define IPSET_TYPE_REV_MAX 4 /* skbinfo support */ MODULE_LICENSE("GPL"); -MODULE_AUTHOR("Jozsef Kadlecsik "); +MODULE_AUTHOR("Jozsef Kadlecsik "); IP_SET_MODULE_DESC("hash:ip", IPSET_TYPE_REV_MIN, IPSET_TYPE_REV_MAX); MODULE_ALIAS("ip_set_hash:ip"); diff --git a/net/netfilter/ipset/ip_set_hash_ipmark.c b/net/netfilter/ipset/ip_set_hash_ipmark.c 
index f3ba8348cf9d..7d468f98a252 100644 --- a/net/netfilter/ipset/ip_set_hash_ipmark.c +++ b/net/netfilter/ipset/ip_set_hash_ipmark.c @@ -1,4 +1,4 @@ -/* Copyright (C) 2003-2013 Jozsef Kadlecsik +/* Copyright (C) 2003-2013 Jozsef Kadlecsik * Copyright (C) 2013 Smoothwall Ltd. * * This program is free software; you can redistribute it and/or modify diff --git a/net/netfilter/ipset/ip_set_hash_ipport.c b/net/netfilter/ipset/ip_set_hash_ipport.c index ddb8039ec1d2..d358ee69d04b 100644 --- a/net/netfilter/ipset/ip_set_hash_ipport.c +++ b/net/netfilter/ipset/ip_set_hash_ipport.c @@ -1,4 +1,4 @@ -/* Copyright (C) 2003-2013 Jozsef Kadlecsik +/* Copyright (C) 2003-2013 Jozsef Kadlecsik * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License version 2 as @@ -32,7 +32,7 @@ #define IPSET_TYPE_REV_MAX 5 /* skbinfo support added */ MODULE_LICENSE("GPL"); -MODULE_AUTHOR("Jozsef Kadlecsik "); +MODULE_AUTHOR("Jozsef Kadlecsik "); IP_SET_MODULE_DESC("hash:ip,port", IPSET_TYPE_REV_MIN, IPSET_TYPE_REV_MAX); MODULE_ALIAS("ip_set_hash:ip,port"); diff --git a/net/netfilter/ipset/ip_set_hash_ipportip.c b/net/netfilter/ipset/ip_set_hash_ipportip.c index a7f4d7a85420..0a304785f912 100644 --- a/net/netfilter/ipset/ip_set_hash_ipportip.c +++ b/net/netfilter/ipset/ip_set_hash_ipportip.c @@ -1,4 +1,4 @@ -/* Copyright (C) 2003-2013 Jozsef Kadlecsik +/* Copyright (C) 2003-2013 Jozsef Kadlecsik * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License version 2 as @@ -32,7 +32,7 @@ #define IPSET_TYPE_REV_MAX 5 /* skbinfo support added */ MODULE_LICENSE("GPL"); -MODULE_AUTHOR("Jozsef Kadlecsik "); +MODULE_AUTHOR("Jozsef Kadlecsik "); IP_SET_MODULE_DESC("hash:ip,port,ip", IPSET_TYPE_REV_MIN, IPSET_TYPE_REV_MAX); MODULE_ALIAS("ip_set_hash:ip,port,ip"); diff --git a/net/netfilter/ipset/ip_set_hash_ipportnet.c b/net/netfilter/ipset/ip_set_hash_ipportnet.c index 
88b83d6d3084..245f7d714870 100644 --- a/net/netfilter/ipset/ip_set_hash_ipportnet.c +++ b/net/netfilter/ipset/ip_set_hash_ipportnet.c @@ -1,4 +1,4 @@ -/* Copyright (C) 2003-2013 Jozsef Kadlecsik +/* Copyright (C) 2003-2013 Jozsef Kadlecsik * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License version 2 as @@ -34,7 +34,7 @@ #define IPSET_TYPE_REV_MAX 7 /* skbinfo support added */ MODULE_LICENSE("GPL"); -MODULE_AUTHOR("Jozsef Kadlecsik "); +MODULE_AUTHOR("Jozsef Kadlecsik "); IP_SET_MODULE_DESC("hash:ip,port,net", IPSET_TYPE_REV_MIN, IPSET_TYPE_REV_MAX); MODULE_ALIAS("ip_set_hash:ip,port,net"); diff --git a/net/netfilter/ipset/ip_set_hash_mac.c b/net/netfilter/ipset/ip_set_hash_mac.c index 4fe5f243d0a3..3d1fc71dac38 100644 --- a/net/netfilter/ipset/ip_set_hash_mac.c +++ b/net/netfilter/ipset/ip_set_hash_mac.c @@ -1,4 +1,4 @@ -/* Copyright (C) 2014 Jozsef Kadlecsik +/* Copyright (C) 2014 Jozsef Kadlecsik * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License version 2 as @@ -23,7 +23,7 @@ #define IPSET_TYPE_REV_MAX 0 MODULE_LICENSE("GPL"); -MODULE_AUTHOR("Jozsef Kadlecsik "); +MODULE_AUTHOR("Jozsef Kadlecsik "); IP_SET_MODULE_DESC("hash:mac", IPSET_TYPE_REV_MIN, IPSET_TYPE_REV_MAX); MODULE_ALIAS("ip_set_hash:mac"); diff --git a/net/netfilter/ipset/ip_set_hash_net.c b/net/netfilter/ipset/ip_set_hash_net.c index 5449e23af13a..470701fda231 100644 --- a/net/netfilter/ipset/ip_set_hash_net.c +++ b/net/netfilter/ipset/ip_set_hash_net.c @@ -1,4 +1,4 @@ -/* Copyright (C) 2003-2013 Jozsef Kadlecsik +/* Copyright (C) 2003-2013 Jozsef Kadlecsik * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License version 2 as @@ -31,7 +31,7 @@ #define IPSET_TYPE_REV_MAX 6 /* skbinfo mapping support added */ MODULE_LICENSE("GPL"); -MODULE_AUTHOR("Jozsef Kadlecsik "); 
+MODULE_AUTHOR("Jozsef Kadlecsik "); IP_SET_MODULE_DESC("hash:net", IPSET_TYPE_REV_MIN, IPSET_TYPE_REV_MAX); MODULE_ALIAS("ip_set_hash:net"); diff --git a/net/netfilter/ipset/ip_set_hash_netiface.c b/net/netfilter/ipset/ip_set_hash_netiface.c index f5164c1efce2..1df8656ad84d 100644 --- a/net/netfilter/ipset/ip_set_hash_netiface.c +++ b/net/netfilter/ipset/ip_set_hash_netiface.c @@ -1,4 +1,4 @@ -/* Copyright (C) 2011-2013 Jozsef Kadlecsik +/* Copyright (C) 2011-2013 Jozsef Kadlecsik * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License version 2 as @@ -32,7 +32,7 @@ #define IPSET_TYPE_REV_MAX 6 /* skbinfo support added */ MODULE_LICENSE("GPL"); -MODULE_AUTHOR("Jozsef Kadlecsik "); +MODULE_AUTHOR("Jozsef Kadlecsik "); IP_SET_MODULE_DESC("hash:net,iface", IPSET_TYPE_REV_MIN, IPSET_TYPE_REV_MAX); MODULE_ALIAS("ip_set_hash:net,iface"); diff --git a/net/netfilter/ipset/ip_set_hash_netnet.c b/net/netfilter/ipset/ip_set_hash_netnet.c index 5a2b923bd81f..e0553be89600 100644 --- a/net/netfilter/ipset/ip_set_hash_netnet.c +++ b/net/netfilter/ipset/ip_set_hash_netnet.c @@ -1,4 +1,4 @@ -/* Copyright (C) 2003-2013 Jozsef Kadlecsik +/* Copyright (C) 2003-2013 Jozsef Kadlecsik * Copyright (C) 2013 Oliver Smith * * This program is free software; you can redistribute it and/or modify diff --git a/net/netfilter/ipset/ip_set_hash_netport.c b/net/netfilter/ipset/ip_set_hash_netport.c index 1a187be9ebc8..943d55d76fcf 100644 --- a/net/netfilter/ipset/ip_set_hash_netport.c +++ b/net/netfilter/ipset/ip_set_hash_netport.c @@ -1,4 +1,4 @@ -/* Copyright (C) 2003-2013 Jozsef Kadlecsik +/* Copyright (C) 2003-2013 Jozsef Kadlecsik * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License version 2 as @@ -33,7 +33,7 @@ #define IPSET_TYPE_REV_MAX 7 /* skbinfo support added */ MODULE_LICENSE("GPL"); -MODULE_AUTHOR("Jozsef Kadlecsik "); 
+MODULE_AUTHOR("Jozsef Kadlecsik "); IP_SET_MODULE_DESC("hash:net,port", IPSET_TYPE_REV_MIN, IPSET_TYPE_REV_MAX); MODULE_ALIAS("ip_set_hash:net,port"); diff --git a/net/netfilter/ipset/ip_set_hash_netportnet.c b/net/netfilter/ipset/ip_set_hash_netportnet.c index 613e18e720a4..afaff99e578c 100644 --- a/net/netfilter/ipset/ip_set_hash_netportnet.c +++ b/net/netfilter/ipset/ip_set_hash_netportnet.c @@ -1,4 +1,4 @@ -/* Copyright (C) 2003-2013 Jozsef Kadlecsik +/* Copyright (C) 2003-2013 Jozsef Kadlecsik * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License version 2 as diff --git a/net/netfilter/ipset/ip_set_list_set.c b/net/netfilter/ipset/ip_set_list_set.c index 4f894165cdcd..ed4360072f64 100644 --- a/net/netfilter/ipset/ip_set_list_set.c +++ b/net/netfilter/ipset/ip_set_list_set.c @@ -1,4 +1,4 @@ -/* Copyright (C) 2008-2013 Jozsef Kadlecsik +/* Copyright (C) 2008-2013 Jozsef Kadlecsik * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License version 2 as @@ -22,7 +22,7 @@ #define IPSET_TYPE_REV_MAX 3 /* skbinfo support added */ MODULE_LICENSE("GPL"); -MODULE_AUTHOR("Jozsef Kadlecsik "); +MODULE_AUTHOR("Jozsef Kadlecsik "); IP_SET_MODULE_DESC("list:set", IPSET_TYPE_REV_MIN, IPSET_TYPE_REV_MAX); MODULE_ALIAS("ip_set_list:set"); diff --git a/net/netfilter/nf_conntrack_h323_main.c b/net/netfilter/nf_conntrack_h323_main.c index 12de40390e97..1ff66e070cb2 100644 --- a/net/netfilter/nf_conntrack_h323_main.c +++ b/net/netfilter/nf_conntrack_h323_main.c @@ -7,7 +7,7 @@ * This source code is licensed under General Public License version 2. 
* * Based on the 'brute force' H.323 connection tracking module by - * Jozsef Kadlecsik + * Jozsef Kadlecsik * * For more information, please see http://nath323.sourceforge.net/ */ diff --git a/net/netfilter/nf_conntrack_proto_tcp.c b/net/netfilter/nf_conntrack_proto_tcp.c index 7ba01d8ee165..60b68400435d 100644 --- a/net/netfilter/nf_conntrack_proto_tcp.c +++ b/net/netfilter/nf_conntrack_proto_tcp.c @@ -1,6 +1,6 @@ /* (C) 1999-2001 Paul `Rusty' Russell * (C) 2002-2004 Netfilter Core Team - * (C) 2002-2013 Jozsef Kadlecsik + * (C) 2002-2013 Jozsef Kadlecsik * (C) 2006-2012 Patrick McHardy * * This program is free software; you can redistribute it and/or modify diff --git a/net/netfilter/xt_iprange.c b/net/netfilter/xt_iprange.c index b46626cddd93..4ab4155706d7 100644 --- a/net/netfilter/xt_iprange.c +++ b/net/netfilter/xt_iprange.c @@ -1,7 +1,7 @@ /* * xt_iprange - Netfilter module to match IP address ranges * - * (C) 2003 Jozsef Kadlecsik + * (C) 2003 Jozsef Kadlecsik * (C) CC Computer Consultants GmbH, 2008 * * This program is free software; you can redistribute it and/or modify @@ -133,7 +133,7 @@ static void __exit iprange_mt_exit(void) module_init(iprange_mt_init); module_exit(iprange_mt_exit); MODULE_LICENSE("GPL"); -MODULE_AUTHOR("Jozsef Kadlecsik "); +MODULE_AUTHOR("Jozsef Kadlecsik "); MODULE_AUTHOR("Jan Engelhardt "); MODULE_DESCRIPTION("Xtables: arbitrary IPv4 range matching"); MODULE_ALIAS("ipt_iprange"); diff --git a/net/netfilter/xt_set.c b/net/netfilter/xt_set.c index cf67bbe07dc2..f025c51ba375 100644 --- a/net/netfilter/xt_set.c +++ b/net/netfilter/xt_set.c @@ -1,7 +1,7 @@ /* Copyright (C) 2000-2002 Joakim Axelsson * Patrick Schaaf * Martin Josefsson - * Copyright (C) 2003-2013 Jozsef Kadlecsik + * Copyright (C) 2003-2013 Jozsef Kadlecsik * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License version 2 as @@ -21,7 +21,7 @@ #include MODULE_LICENSE("GPL"); -MODULE_AUTHOR("Jozsef 
Kadlecsik "); +MODULE_AUTHOR("Jozsef Kadlecsik "); MODULE_DESCRIPTION("Xtables: IP set match and target module"); MODULE_ALIAS("xt_SET"); MODULE_ALIAS("ipt_set"); -- cgit v1.2.3-71-gd317 From fada7fdc83c0bf8755956bff707c42b609223301 Mon Sep 17 00:00:00 2001 From: Jonathan Lemon Date: Thu, 6 Jun 2019 13:59:40 -0700 Subject: bpf: Allow bpf_map_lookup_elem() on an xskmap Currently, the AF_XDP code uses a separate map in order to determine if an xsk is bound to a queue. Instead of doing this, have bpf_map_lookup_elem() return a xdp_sock. Rearrange some xdp_sock members to eliminate structure holes. Remove selftest - will be added back in later patch. Signed-off-by: Jonathan Lemon Acked-by: Martin KaFai Lau Signed-off-by: Alexei Starovoitov --- include/linux/bpf.h | 8 +++++ include/net/xdp_sock.h | 4 +-- include/uapi/linux/bpf.h | 4 +++ kernel/bpf/verifier.c | 26 ++++++++++++-- kernel/bpf/xskmap.c | 7 ++++ net/core/filter.c | 40 ++++++++++++++++++++++ .../selftests/bpf/verifier/prevent_map_lookup.c | 15 -------- 7 files changed, 85 insertions(+), 19 deletions(-) (limited to 'include/uapi/linux') diff --git a/include/linux/bpf.h b/include/linux/bpf.h index e5a309e6a400..1fe137afa898 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -280,6 +280,7 @@ enum bpf_reg_type { PTR_TO_TCP_SOCK, /* reg points to struct tcp_sock */ PTR_TO_TCP_SOCK_OR_NULL, /* reg points to struct tcp_sock or NULL */ PTR_TO_TP_BUFFER, /* reg points to a writable raw tp's buffer */ + PTR_TO_XDP_SOCK, /* reg points to struct xdp_sock */ }; /* The information passed from prog-specific *_is_valid_access @@ -727,6 +728,13 @@ void __cpu_map_insert_ctx(struct bpf_map *map, u32 index); void __cpu_map_flush(struct bpf_map *map); int cpu_map_enqueue(struct bpf_cpu_map_entry *rcpu, struct xdp_buff *xdp, struct net_device *dev_rx); +bool bpf_xdp_sock_is_valid_access(int off, int size, enum bpf_access_type type, + struct bpf_insn_access_aux *info); +u32 bpf_xdp_sock_convert_ctx_access(enum 
bpf_access_type type, + const struct bpf_insn *si, + struct bpf_insn *insn_buf, + struct bpf_prog *prog, + u32 *target_size); /* Return map's numa specified by userspace */ static inline int bpf_map_attr_numa_node(const union bpf_attr *attr) diff --git a/include/net/xdp_sock.h b/include/net/xdp_sock.h index d074b6d60f8a..ae0f368a62bb 100644 --- a/include/net/xdp_sock.h +++ b/include/net/xdp_sock.h @@ -58,11 +58,11 @@ struct xdp_sock { struct xdp_umem *umem; struct list_head flush_node; u16 queue_id; - struct xsk_queue *tx ____cacheline_aligned_in_smp; - struct list_head list; bool zc; /* Protects multiple processes in the control path */ struct mutex mutex; + struct xsk_queue *tx ____cacheline_aligned_in_smp; + struct list_head list; /* Mutual exclusion of NAPI TX thread and sendmsg error paths * in the SKB destructor callback. */ diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index 7c6aef253173..ae0907d8c03a 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -3083,6 +3083,10 @@ struct bpf_sock_tuple { }; }; +struct bpf_xdp_sock { + __u32 queue_id; +}; + #define XDP_PACKET_HEADROOM 256 /* User return codes for XDP prog type. 
diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 5c2cb5bd84ce..8d1786357a09 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -334,7 +334,8 @@ static bool type_is_sk_pointer(enum bpf_reg_type type) { return type == PTR_TO_SOCKET || type == PTR_TO_SOCK_COMMON || - type == PTR_TO_TCP_SOCK; + type == PTR_TO_TCP_SOCK || + type == PTR_TO_XDP_SOCK; } static bool reg_type_may_be_null(enum bpf_reg_type type) @@ -406,6 +407,7 @@ static const char * const reg_type_str[] = { [PTR_TO_TCP_SOCK] = "tcp_sock", [PTR_TO_TCP_SOCK_OR_NULL] = "tcp_sock_or_null", [PTR_TO_TP_BUFFER] = "tp_buffer", + [PTR_TO_XDP_SOCK] = "xdp_sock", }; static char slot_type_char[] = { @@ -1363,6 +1365,7 @@ static bool is_spillable_regtype(enum bpf_reg_type type) case PTR_TO_SOCK_COMMON_OR_NULL: case PTR_TO_TCP_SOCK: case PTR_TO_TCP_SOCK_OR_NULL: + case PTR_TO_XDP_SOCK: return true; default: return false; @@ -1843,6 +1846,9 @@ static int check_sock_access(struct bpf_verifier_env *env, int insn_idx, case PTR_TO_TCP_SOCK: valid = bpf_tcp_sock_is_valid_access(off, size, t, &info); break; + case PTR_TO_XDP_SOCK: + valid = bpf_xdp_sock_is_valid_access(off, size, t, &info); + break; default: valid = false; } @@ -2007,6 +2013,9 @@ static int check_ptr_alignment(struct bpf_verifier_env *env, case PTR_TO_TCP_SOCK: pointer_desc = "tcp_sock "; break; + case PTR_TO_XDP_SOCK: + pointer_desc = "xdp_sock "; + break; default: break; } @@ -2905,10 +2914,14 @@ static int check_map_func_compatibility(struct bpf_verifier_env *env, * appear. 
*/ case BPF_MAP_TYPE_CPUMAP: - case BPF_MAP_TYPE_XSKMAP: if (func_id != BPF_FUNC_redirect_map) goto error; break; + case BPF_MAP_TYPE_XSKMAP: + if (func_id != BPF_FUNC_redirect_map && + func_id != BPF_FUNC_map_lookup_elem) + goto error; + break; case BPF_MAP_TYPE_ARRAY_OF_MAPS: case BPF_MAP_TYPE_HASH_OF_MAPS: if (func_id != BPF_FUNC_map_lookup_elem) @@ -3799,6 +3812,7 @@ static int adjust_ptr_min_max_vals(struct bpf_verifier_env *env, case PTR_TO_SOCK_COMMON_OR_NULL: case PTR_TO_TCP_SOCK: case PTR_TO_TCP_SOCK_OR_NULL: + case PTR_TO_XDP_SOCK: verbose(env, "R%d pointer arithmetic on %s prohibited\n", dst, reg_type_str[ptr_reg->type]); return -EACCES; @@ -5038,6 +5052,9 @@ static void mark_ptr_or_null_reg(struct bpf_func_state *state, if (reg->map_ptr->inner_map_meta) { reg->type = CONST_PTR_TO_MAP; reg->map_ptr = reg->map_ptr->inner_map_meta; + } else if (reg->map_ptr->map_type == + BPF_MAP_TYPE_XSKMAP) { + reg->type = PTR_TO_XDP_SOCK; } else { reg->type = PTR_TO_MAP_VALUE; } @@ -6299,6 +6316,7 @@ static bool regsafe(struct bpf_reg_state *rold, struct bpf_reg_state *rcur, case PTR_TO_SOCK_COMMON_OR_NULL: case PTR_TO_TCP_SOCK: case PTR_TO_TCP_SOCK_OR_NULL: + case PTR_TO_XDP_SOCK: /* Only valid matches are exact, which memcmp() above * would have accepted */ @@ -6693,6 +6711,7 @@ static bool reg_type_mismatch_ok(enum bpf_reg_type type) case PTR_TO_SOCK_COMMON_OR_NULL: case PTR_TO_TCP_SOCK: case PTR_TO_TCP_SOCK_OR_NULL: + case PTR_TO_XDP_SOCK: return false; default: return true; @@ -7826,6 +7845,9 @@ static int convert_ctx_accesses(struct bpf_verifier_env *env) case PTR_TO_TCP_SOCK: convert_ctx_access = bpf_tcp_sock_convert_ctx_access; break; + case PTR_TO_XDP_SOCK: + convert_ctx_access = bpf_xdp_sock_convert_ctx_access; + break; default: continue; } diff --git a/kernel/bpf/xskmap.c b/kernel/bpf/xskmap.c index 413d75f4fc72..ef7338cebd18 100644 --- a/kernel/bpf/xskmap.c +++ b/kernel/bpf/xskmap.c @@ -151,6 +151,12 @@ void __xsk_map_flush(struct bpf_map *map) } static void 
*xsk_map_lookup_elem(struct bpf_map *map, void *key) +{ + WARN_ON_ONCE(!rcu_read_lock_held()); + return __xsk_map_lookup_elem(map, *(u32 *)key); +} + +static void *xsk_map_lookup_elem_sys_only(struct bpf_map *map, void *key) { return ERR_PTR(-EOPNOTSUPP); } @@ -218,6 +224,7 @@ const struct bpf_map_ops xsk_map_ops = { .map_free = xsk_map_free, .map_get_next_key = xsk_map_get_next_key, .map_lookup_elem = xsk_map_lookup_elem, + .map_lookup_elem_sys_only = xsk_map_lookup_elem_sys_only, .map_update_elem = xsk_map_update_elem, .map_delete_elem = xsk_map_delete_elem, .map_check_btf = map_check_no_btf, diff --git a/net/core/filter.c b/net/core/filter.c index f2777dc0b624..a5e4ac7fcbe5 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -5680,6 +5680,46 @@ BPF_CALL_1(bpf_skb_ecn_set_ce, struct sk_buff *, skb) return INET_ECN_set_ce(skb); } +bool bpf_xdp_sock_is_valid_access(int off, int size, enum bpf_access_type type, + struct bpf_insn_access_aux *info) +{ + if (off < 0 || off >= offsetofend(struct bpf_xdp_sock, queue_id)) + return false; + + if (off % size != 0) + return false; + + switch (off) { + default: + return size == sizeof(__u32); + } +} + +u32 bpf_xdp_sock_convert_ctx_access(enum bpf_access_type type, + const struct bpf_insn *si, + struct bpf_insn *insn_buf, + struct bpf_prog *prog, u32 *target_size) +{ + struct bpf_insn *insn = insn_buf; + +#define BPF_XDP_SOCK_GET(FIELD) \ + do { \ + BUILD_BUG_ON(FIELD_SIZEOF(struct xdp_sock, FIELD) > \ + FIELD_SIZEOF(struct bpf_xdp_sock, FIELD)); \ + *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct xdp_sock, FIELD),\ + si->dst_reg, si->src_reg, \ + offsetof(struct xdp_sock, FIELD)); \ + } while (0) + + switch (si->off) { + case offsetof(struct bpf_xdp_sock, queue_id): + BPF_XDP_SOCK_GET(queue_id); + break; + } + + return insn - insn_buf; +} + static const struct bpf_func_proto bpf_skb_ecn_set_ce_proto = { .func = bpf_skb_ecn_set_ce, .gpl_only = false, diff --git a/tools/testing/selftests/bpf/verifier/prevent_map_lookup.c 
b/tools/testing/selftests/bpf/verifier/prevent_map_lookup.c index bbdba990fefb..da7a4b37cb98 100644 --- a/tools/testing/selftests/bpf/verifier/prevent_map_lookup.c +++ b/tools/testing/selftests/bpf/verifier/prevent_map_lookup.c @@ -28,21 +28,6 @@ .errstr = "cannot pass map_type 18 into func bpf_map_lookup_elem", .prog_type = BPF_PROG_TYPE_SOCK_OPS, }, -{ - "prevent map lookup in xskmap", - .insns = { - BPF_ST_MEM(BPF_DW, BPF_REG_10, -8, 0), - BPF_MOV64_REG(BPF_REG_2, BPF_REG_10), - BPF_ALU64_IMM(BPF_ADD, BPF_REG_2, -8), - BPF_LD_MAP_FD(BPF_REG_1, 0), - BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 0, 0, BPF_FUNC_map_lookup_elem), - BPF_EXIT_INSN(), - }, - .fixup_map_xskmap = { 3 }, - .result = REJECT, - .errstr = "cannot pass map_type 17 into func bpf_map_lookup_elem", - .prog_type = BPF_PROG_TYPE_XDP, -}, { "prevent map lookup in stack trace", .insns = { -- cgit v1.2.3-71-gd317 From 4e32348ba5269aac1165f496b78189201568dd8c Mon Sep 17 00:00:00 2001 From: Jacob Pan Date: Mon, 3 Jun 2019 15:57:47 +0100 Subject: iommu: Introduce device fault data Device faults detected by IOMMU can be reported outside the IOMMU subsystem for further processing. This patch introduces a generic device fault data structure. The fault can be either an unrecoverable fault or a page request, also referred to as a recoverable fault. We only care about non internal faults that are likely to be reported to an external subsystem. 
Signed-off-by: Jacob Pan Signed-off-by: Jean-Philippe Brucker Signed-off-by: Liu, Yi L Signed-off-by: Ashok Raj Signed-off-by: Eric Auger Signed-off-by: Joerg Roedel --- include/linux/iommu.h | 39 +++++++++++++++ include/uapi/linux/iommu.h | 118 +++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 157 insertions(+) create mode 100644 include/uapi/linux/iommu.h (limited to 'include/uapi/linux') diff --git a/include/linux/iommu.h b/include/linux/iommu.h index a815cf6f6f47..2b05056d5fa7 100644 --- a/include/linux/iommu.h +++ b/include/linux/iommu.h @@ -25,6 +25,7 @@ #include #include #include +#include #define IOMMU_READ (1 << 0) #define IOMMU_WRITE (1 << 1) @@ -49,6 +50,7 @@ struct device; struct iommu_domain; struct notifier_block; struct iommu_sva; +struct iommu_fault_event; /* iommu fault flags */ #define IOMMU_FAULT_READ 0x0 @@ -58,6 +60,7 @@ typedef int (*iommu_fault_handler_t)(struct iommu_domain *, struct device *, unsigned long, int, void *); typedef int (*iommu_mm_exit_handler_t)(struct device *dev, struct iommu_sva *, void *); +typedef int (*iommu_dev_fault_handler_t)(struct iommu_fault *, void *); struct iommu_domain_geometry { dma_addr_t aperture_start; /* First address that can be mapped */ @@ -301,6 +304,41 @@ struct iommu_device { struct device *dev; }; +/** + * struct iommu_fault_event - Generic fault event + * + * Can represent recoverable faults such as a page requests or + * unrecoverable faults such as DMA or IRQ remapping faults. 
+ * + * @fault: fault descriptor + */ +struct iommu_fault_event { + struct iommu_fault fault; +}; + +/** + * struct iommu_fault_param - per-device IOMMU fault data + * @handler: Callback function to handle IOMMU faults at device level + * @data: handler private data + */ +struct iommu_fault_param { + iommu_dev_fault_handler_t handler; + void *data; +}; + +/** + * struct iommu_param - collection of per-device IOMMU data + * + * @fault_param: IOMMU detected device fault reporting data + * + * TODO: migrate other per device data pointers under iommu_dev_data, e.g. + * struct iommu_group *iommu_group; + * struct iommu_fwspec *iommu_fwspec; + */ +struct iommu_param { + struct iommu_fault_param *fault_param; +}; + int iommu_device_register(struct iommu_device *iommu); void iommu_device_unregister(struct iommu_device *iommu); int iommu_device_sysfs_add(struct iommu_device *iommu, @@ -504,6 +542,7 @@ struct iommu_ops {}; struct iommu_group {}; struct iommu_fwspec {}; struct iommu_device {}; +struct iommu_fault_param {}; static inline bool iommu_present(struct bus_type *bus) { diff --git a/include/uapi/linux/iommu.h b/include/uapi/linux/iommu.h new file mode 100644 index 000000000000..796402174d6c --- /dev/null +++ b/include/uapi/linux/iommu.h @@ -0,0 +1,118 @@ +/* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note */ +/* + * IOMMU user API definitions + */ + +#ifndef _UAPI_IOMMU_H +#define _UAPI_IOMMU_H + +#include + +#define IOMMU_FAULT_PERM_READ (1 << 0) /* read */ +#define IOMMU_FAULT_PERM_WRITE (1 << 1) /* write */ +#define IOMMU_FAULT_PERM_EXEC (1 << 2) /* exec */ +#define IOMMU_FAULT_PERM_PRIV (1 << 3) /* privileged */ + +/* Generic fault types, can be expanded IRQ remapping fault */ +enum iommu_fault_type { + IOMMU_FAULT_DMA_UNRECOV = 1, /* unrecoverable fault */ + IOMMU_FAULT_PAGE_REQ, /* page request fault */ +}; + +enum iommu_fault_reason { + IOMMU_FAULT_REASON_UNKNOWN = 0, + + /* Could not access the PASID table (fetch caused external abort) */ + 
IOMMU_FAULT_REASON_PASID_FETCH, + + /* PASID entry is invalid or has configuration errors */ + IOMMU_FAULT_REASON_BAD_PASID_ENTRY, + + /* + * PASID is out of range (e.g. exceeds the maximum PASID + * supported by the IOMMU) or disabled. + */ + IOMMU_FAULT_REASON_PASID_INVALID, + + /* + * An external abort occurred fetching (or updating) a translation + * table descriptor + */ + IOMMU_FAULT_REASON_WALK_EABT, + + /* + * Could not access the page table entry (Bad address), + * actual translation fault + */ + IOMMU_FAULT_REASON_PTE_FETCH, + + /* Protection flag check failed */ + IOMMU_FAULT_REASON_PERMISSION, + + /* access flag check failed */ + IOMMU_FAULT_REASON_ACCESS, + + /* Output address of a translation stage caused Address Size fault */ + IOMMU_FAULT_REASON_OOR_ADDRESS, +}; + +/** + * struct iommu_fault_unrecoverable - Unrecoverable fault data + * @reason: reason of the fault, from &enum iommu_fault_reason + * @flags: parameters of this fault (IOMMU_FAULT_UNRECOV_* values) + * @pasid: Process Address Space ID + * @perm: requested permission access using by the incoming transaction + * (IOMMU_FAULT_PERM_* values) + * @addr: offending page address + * @fetch_addr: address that caused a fetch abort, if any + */ +struct iommu_fault_unrecoverable { + __u32 reason; +#define IOMMU_FAULT_UNRECOV_PASID_VALID (1 << 0) +#define IOMMU_FAULT_UNRECOV_ADDR_VALID (1 << 1) +#define IOMMU_FAULT_UNRECOV_FETCH_ADDR_VALID (1 << 2) + __u32 flags; + __u32 pasid; + __u32 perm; + __u64 addr; + __u64 fetch_addr; +}; + +/** + * struct iommu_fault_page_request - Page Request data + * @flags: encodes whether the corresponding fields are valid and whether this + * is the last page in group (IOMMU_FAULT_PAGE_REQUEST_* values) + * @pasid: Process Address Space ID + * @grpid: Page Request Group Index + * @perm: requested page permissions (IOMMU_FAULT_PERM_* values) + * @addr: page address + * @private_data: device-specific private information + */ +struct iommu_fault_page_request { +#define 
IOMMU_FAULT_PAGE_REQUEST_PASID_VALID (1 << 0) +#define IOMMU_FAULT_PAGE_REQUEST_LAST_PAGE (1 << 1) +#define IOMMU_FAULT_PAGE_REQUEST_PRIV_DATA (1 << 2) + __u32 flags; + __u32 pasid; + __u32 grpid; + __u32 perm; + __u64 addr; + __u64 private_data[2]; +}; + +/** + * struct iommu_fault - Generic fault data + * @type: fault type from &enum iommu_fault_type + * @padding: reserved for future use (should be zero) + * @event: fault event, when @type is %IOMMU_FAULT_DMA_UNRECOV + * @prm: Page Request message, when @type is %IOMMU_FAULT_PAGE_REQ + */ +struct iommu_fault { + __u32 type; + __u32 padding; + union { + struct iommu_fault_unrecoverable event; + struct iommu_fault_page_request prm; + }; +}; +#endif /* _UAPI_IOMMU_H */ -- cgit v1.2.3-71-gd317 From bf3255b3cfe2d06280340dbac3f44b65d3ee6da3 Mon Sep 17 00:00:00 2001 From: Jean-Philippe Brucker Date: Mon, 3 Jun 2019 15:57:49 +0100 Subject: iommu: Add recoverable fault reporting Some IOMMU hardware features, for example PCI PRI and Arm SMMU Stall, enable recoverable I/O page faults. Allow IOMMU drivers to report PRI Page Requests and Stall events through the new fault reporting API. The consumer of the fault can be either an I/O page fault handler in the host, or a guest OS. Once handled, the fault must be completed by sending a page response back to the IOMMU. Add an iommu_page_response() function to complete a page fault. There are two ways to extend the userspace API: * Add a field to iommu_page_response and a flag to iommu_page_response::flags describing the validity of this field. * Introduce a new iommu_page_response_X structure with a different version number. The kernel must then support both versions. 
Signed-off-by: Jacob Pan Signed-off-by: Jean-Philippe Brucker Signed-off-by: Joerg Roedel --- drivers/iommu/iommu.c | 94 +++++++++++++++++++++++++++++++++++++++++++++- include/linux/iommu.h | 19 ++++++++++ include/uapi/linux/iommu.h | 35 +++++++++++++++++ 3 files changed, 146 insertions(+), 2 deletions(-) (limited to 'include/uapi/linux') diff --git a/drivers/iommu/iommu.c b/drivers/iommu/iommu.c index 293a6fa716e0..ac1f29c19e59 100644 --- a/drivers/iommu/iommu.c +++ b/drivers/iommu/iommu.c @@ -891,7 +891,14 @@ EXPORT_SYMBOL_GPL(iommu_group_unregister_notifier); * @data: private data passed as argument to the handler * * When an IOMMU fault event is received, this handler gets called with the - * fault event and data as argument. The handler should return 0 on success. + * fault event and data as argument. The handler should return 0 on success. If + * the fault is recoverable (IOMMU_FAULT_PAGE_REQ), the consumer should also + * complete the fault by calling iommu_page_response() with one of the following + * response code: + * - IOMMU_PAGE_RESP_SUCCESS: retry the translation + * - IOMMU_PAGE_RESP_INVALID: terminate the fault + * - IOMMU_PAGE_RESP_FAILURE: terminate the fault and stop reporting + * page faults if possible. * * Return 0 if the fault handler was installed successfully, or an error. 
*/ @@ -921,6 +928,8 @@ int iommu_register_device_fault_handler(struct device *dev, } param->fault_param->handler = handler; param->fault_param->data = data; + mutex_init(&param->fault_param->lock); + INIT_LIST_HEAD(&param->fault_param->faults); done_unlock: mutex_unlock(&param->lock); @@ -951,6 +960,12 @@ int iommu_unregister_device_fault_handler(struct device *dev) if (!param->fault_param) goto unlock; + /* we cannot unregister handler if there are pending faults */ + if (!list_empty(&param->fault_param->faults)) { + ret = -EBUSY; + goto unlock; + } + kfree(param->fault_param); param->fault_param = NULL; put_device(dev); @@ -967,13 +982,15 @@ EXPORT_SYMBOL_GPL(iommu_unregister_device_fault_handler); * @evt: fault event data * * Called by IOMMU drivers when a fault is detected, typically in a threaded IRQ - * handler. + * handler. When this function fails and the fault is recoverable, it is the + * caller's responsibility to complete the fault. * * Return 0 on success, or an error. */ int iommu_report_device_fault(struct device *dev, struct iommu_fault_event *evt) { struct iommu_param *param = dev->iommu_param; + struct iommu_fault_event *evt_pending = NULL; struct iommu_fault_param *fparam; int ret = 0; @@ -987,13 +1004,86 @@ int iommu_report_device_fault(struct device *dev, struct iommu_fault_event *evt) ret = -EINVAL; goto done_unlock; } + + if (evt->fault.type == IOMMU_FAULT_PAGE_REQ && + (evt->fault.prm.flags & IOMMU_FAULT_PAGE_REQUEST_LAST_PAGE)) { + evt_pending = kmemdup(evt, sizeof(struct iommu_fault_event), + GFP_KERNEL); + if (!evt_pending) { + ret = -ENOMEM; + goto done_unlock; + } + mutex_lock(&fparam->lock); + list_add_tail(&evt_pending->list, &fparam->faults); + mutex_unlock(&fparam->lock); + } + ret = fparam->handler(&evt->fault, fparam->data); + if (ret && evt_pending) { + mutex_lock(&fparam->lock); + list_del(&evt_pending->list); + mutex_unlock(&fparam->lock); + kfree(evt_pending); + } done_unlock: mutex_unlock(&param->lock); return ret; }
EXPORT_SYMBOL_GPL(iommu_report_device_fault); +int iommu_page_response(struct device *dev, + struct iommu_page_response *msg) +{ + bool pasid_valid; + int ret = -EINVAL; + struct iommu_fault_event *evt; + struct iommu_fault_page_request *prm; + struct iommu_param *param = dev->iommu_param; + struct iommu_domain *domain = iommu_get_domain_for_dev(dev); + + if (!domain || !domain->ops->page_response) + return -ENODEV; + + if (!param || !param->fault_param) + return -EINVAL; + + if (msg->version != IOMMU_PAGE_RESP_VERSION_1 || + msg->flags & ~IOMMU_PAGE_RESP_PASID_VALID) + return -EINVAL; + + /* Only send response if there is a fault report pending */ + mutex_lock(&param->fault_param->lock); + if (list_empty(&param->fault_param->faults)) { + dev_warn_ratelimited(dev, "no pending PRQ, drop response\n"); + goto done_unlock; + } + /* + * Check if we have a matching page request pending to respond, + * otherwise return -EINVAL + */ + list_for_each_entry(evt, &param->fault_param->faults, list) { + prm = &evt->fault.prm; + pasid_valid = prm->flags & IOMMU_FAULT_PAGE_REQUEST_PASID_VALID; + + if ((pasid_valid && prm->pasid != msg->pasid) || + prm->grpid != msg->grpid) + continue; + + /* Sanitize the reply */ + msg->flags = pasid_valid ?
IOMMU_PAGE_RESP_PASID_VALID : 0; + + ret = domain->ops->page_response(dev, evt, msg); + list_del(&evt->list); + kfree(evt); + break; + } + +done_unlock: + mutex_unlock(&param->fault_param->lock); + return ret; +} +EXPORT_SYMBOL_GPL(iommu_page_response); + /** * iommu_group_id - Return ID for a group * @group: the group to ID diff --git a/include/linux/iommu.h b/include/linux/iommu.h index 3e783f5bf472..76c8cda61dfd 100644 --- a/include/linux/iommu.h +++ b/include/linux/iommu.h @@ -227,6 +227,7 @@ struct iommu_sva_ops { * @sva_bind: Bind process address space to device * @sva_unbind: Unbind process address space from device * @sva_get_pasid: Get PASID associated to a SVA handle + * @page_response: handle page request response * @pgsize_bitmap: bitmap of all possible supported page sizes */ struct iommu_ops { @@ -287,6 +288,10 @@ struct iommu_ops { void (*sva_unbind)(struct iommu_sva *handle); int (*sva_get_pasid)(struct iommu_sva *handle); + int (*page_response)(struct device *dev, + struct iommu_fault_event *evt, + struct iommu_page_response *msg); + unsigned long pgsize_bitmap; }; @@ -311,19 +316,25 @@ struct iommu_device { * unrecoverable faults such as DMA or IRQ remapping faults.
* * @fault: fault descriptor + * @list: pending fault event list, used for tracking responses */ struct iommu_fault_event { struct iommu_fault fault; + struct list_head list; }; /** * struct iommu_fault_param - per-device IOMMU fault data * @handler: Callback function to handle IOMMU faults at device level * @data: handler private data + * @faults: holds the pending faults which needs response + * @lock: protect pending faults list */ struct iommu_fault_param { iommu_dev_fault_handler_t handler; void *data; + struct list_head faults; + struct mutex lock; }; /** @@ -437,6 +448,8 @@ extern int iommu_unregister_device_fault_handler(struct device *dev); extern int iommu_report_device_fault(struct device *dev, struct iommu_fault_event *evt); +extern int iommu_page_response(struct device *dev, + struct iommu_page_response *msg); extern int iommu_group_id(struct iommu_group *group); extern struct iommu_group *iommu_group_get_for_dev(struct device *dev); @@ -765,6 +778,12 @@ int iommu_report_device_fault(struct device *dev, struct iommu_fault_event *evt) return -ENODEV; } +static inline int iommu_page_response(struct device *dev, + struct iommu_page_response *msg) +{ + return -ENODEV; +} + static inline int iommu_group_id(struct iommu_group *group) { return -ENODEV; diff --git a/include/uapi/linux/iommu.h b/include/uapi/linux/iommu.h index 796402174d6c..f45d8e9e59c3 100644 --- a/include/uapi/linux/iommu.h +++ b/include/uapi/linux/iommu.h @@ -115,4 +115,39 @@ struct iommu_fault { struct iommu_fault_page_request prm; }; }; + +/** + * enum iommu_page_response_code - Return status of fault handlers + * @IOMMU_PAGE_RESP_SUCCESS: Fault has been handled and the page tables + * populated, retry the access. This is "Success" in PCI PRI. + * @IOMMU_PAGE_RESP_FAILURE: General error. Drop all subsequent faults from + * this device if possible. This is "Response Failure" in PCI PRI. + * @IOMMU_PAGE_RESP_INVALID: Could not handle this fault, don't retry the + * access. 
This is "Invalid Request" in PCI PRI. + */ +enum iommu_page_response_code { + IOMMU_PAGE_RESP_SUCCESS = 0, + IOMMU_PAGE_RESP_INVALID, + IOMMU_PAGE_RESP_FAILURE, +}; + +/** + * struct iommu_page_response - Generic page response information + * @version: API version of this structure + * @flags: encodes whether the corresponding fields are valid + * (IOMMU_FAULT_PAGE_RESPONSE_* values) + * @pasid: Process Address Space ID + * @grpid: Page Request Group Index + * @code: response code from &enum iommu_page_response_code + */ +struct iommu_page_response { +#define IOMMU_PAGE_RESP_VERSION_1 1 + __u32 version; +#define IOMMU_PAGE_RESP_PASID_VALID (1 << 0) + __u32 flags; + __u32 pasid; + __u32 grpid; + __u32 code; +}; + #endif /* _UAPI_IOMMU_H */ -- cgit v1.2.3-71-gd317 From a842fe1425cb20f457abd3f8ef98b468f83ca98b Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Wed, 12 Jun 2019 11:57:25 -0700 Subject: tcp: add optional per socket transmit delay Adding delays to TCP flows is crucial for studying behavior of TCP stacks, including congestion control modules. Linux offers netem module, but it has impractical constraints : - Need root access to change qdisc - Hard to set up on egress if combined with non trivial qdisc like FQ - Single delay for all flows. EDT (Earliest Departure Time) adoption in TCP stack allows us to enable a per socket delay at a very small cost. Networking tools can now establish thousands of flows, each of them with a different delay, simulating real world conditions. This requires FQ packet scheduler or an EDT-enabled NIC. This patch adds TCP_TX_DELAY socket option, to set a delay in usec units. unsigned int tx_delay = 10000; /* 10 msec */ setsockopt(fd, SOL_TCP, TCP_TX_DELAY, &tx_delay, sizeof(tx_delay)); Note that FQ packet scheduler limits might need some tweaking : man tc-fq PARAMETERS limit Hard limit on the real queue size. When this limit is reached, new packets are dropped.
If the value is lowered, packets are dropped so that the new limit is met. Default is 10000 packets. flow_limit Hard limit on the maximum number of packets queued per flow. Default value is 100. Use of TCP_TX_DELAY option will increase number of skbs in FQ qdisc, so packets would be dropped if any of the previous limits is hit. Use of a jump label makes this support runtime-free, for hosts never using the option. Also note that TSQ (TCP Small Queues) limits are slightly changed with this patch : we need to account that skbs artificially delayed won't stop us providing more skbs to feed the pipe (netem uses skb_orphan_partial() for this purpose, but FQ can not use this trick). Because of that, using big delays might very well trigger old bugs in TSO auto defer logic and/or sndbuf limited detection. Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- include/linux/tcp.h | 2 ++ include/net/tcp.h | 19 +++++++++++++++++++ include/uapi/linux/tcp.h | 3 +++ net/ipv4/tcp.c | 24 ++++++++++++++++++++++++ net/ipv4/tcp_ipv4.c | 10 ++++++---- net/ipv4/tcp_minisocks.c | 2 +- net/ipv4/tcp_output.c | 23 ++++++++++++++++++++--- net/ipv6/tcp_ipv6.c | 1 + 8 files changed, 76 insertions(+), 8 deletions(-) (limited to 'include/uapi/linux') diff --git a/include/linux/tcp.h b/include/linux/tcp.h index 711361af9ce0..c23019a3b264 100644 --- a/include/linux/tcp.h +++ b/include/linux/tcp.h @@ -245,6 +245,7 @@ struct tcp_sock { syn_smc:1; /* SYN includes SMC */ u32 tlp_high_seq; /* snd_nxt at the time of TLP retransmit.
*/ + u32 tcp_tx_delay; /* delay (in usec) added to TX packets */ u64 tcp_wstamp_ns; /* departure time for next sent data packet */ u64 tcp_clock_cache; /* cache last tcp_clock_ns() (see tcp_mstamp_refresh()) */ @@ -436,6 +437,7 @@ struct tcp_timewait_sock { u32 tw_last_oow_ack_time; int tw_ts_recent_stamp; + u32 tw_tx_delay; #ifdef CONFIG_TCP_MD5SIG struct tcp_md5sig_key *tw_md5_key; #endif diff --git a/include/net/tcp.h b/include/net/tcp.h index 204328b88412..49a178b8d5b2 100644 --- a/include/net/tcp.h +++ b/include/net/tcp.h @@ -2232,4 +2232,23 @@ void clean_acked_data_disable(struct inet_connection_sock *icsk); void clean_acked_data_flush(void); #endif +DECLARE_STATIC_KEY_FALSE(tcp_tx_delay_enabled); +static inline void tcp_add_tx_delay(struct sk_buff *skb, + const struct tcp_sock *tp) +{ + if (static_branch_unlikely(&tcp_tx_delay_enabled)) + skb->skb_mstamp_ns += (u64)tp->tcp_tx_delay * NSEC_PER_USEC; +} + +static inline void tcp_set_tx_time(struct sk_buff *skb, + const struct sock *sk) +{ + if (static_branch_unlikely(&tcp_tx_delay_enabled)) { + u32 delay = (sk->sk_state == TCP_TIME_WAIT) ? 
+ tcp_twsk(sk)->tw_tx_delay : tcp_sk(sk)->tcp_tx_delay; + + skb->skb_mstamp_ns = tcp_clock_ns() + (u64)delay * NSEC_PER_USEC; + } +} + #endif /* _TCP_H */ diff --git a/include/uapi/linux/tcp.h b/include/uapi/linux/tcp.h index b521464ea962..b3564f85a762 100644 --- a/include/uapi/linux/tcp.h +++ b/include/uapi/linux/tcp.h @@ -127,6 +127,9 @@ enum { #define TCP_CM_INQ TCP_INQ +#define TCP_TX_DELAY 37 /* delay outgoing packets by XX usec */ + + #define TCP_REPAIR_ON 1 #define TCP_REPAIR_OFF 0 #define TCP_REPAIR_OFF_NO_WP -1 /* Turn off without window probes */ diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c index bd0856ac680a..5542e3d778e6 100644 --- a/net/ipv4/tcp.c +++ b/net/ipv4/tcp.c @@ -2736,6 +2736,21 @@ static int tcp_repair_options_est(struct sock *sk, return 0; } +DEFINE_STATIC_KEY_FALSE(tcp_tx_delay_enabled); +EXPORT_SYMBOL(tcp_tx_delay_enabled); + +static void tcp_enable_tx_delay(void) +{ + if (!static_branch_unlikely(&tcp_tx_delay_enabled)) { + static int __tcp_tx_delay_enabled = 0; + + if (cmpxchg(&__tcp_tx_delay_enabled, 0, 1) == 0) { + static_branch_enable(&tcp_tx_delay_enabled); + pr_info("TCP_TX_DELAY enabled\n"); + } + } +} + /* * Socket option code for TCP. */ @@ -3087,6 +3102,11 @@ static int do_tcp_setsockopt(struct sock *sk, int level, else tp->recvmsg_inq = val; break; + case TCP_TX_DELAY: + if (val) + tcp_enable_tx_delay(); + tp->tcp_tx_delay = val; + break; default: err = -ENOPROTOOPT; break; @@ -3546,6 +3566,10 @@ static int do_tcp_getsockopt(struct sock *sk, int level, val = tp->fastopen_no_cookie; break; + case TCP_TX_DELAY: + val = tp->tcp_tx_delay; + break; + case TCP_TIMESTAMP: val = tcp_time_stamp_raw() + tp->tsoffset; break; diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c index f059fbd81a84..1b7e9e1fbd3b 100644 --- a/net/ipv4/tcp_ipv4.c +++ b/net/ipv4/tcp_ipv4.c @@ -767,9 +767,11 @@ static void tcp_v4_send_reset(const struct sock *sk, struct sk_buff *skb) arg.uid = sock_net_uid(net, sk && sk_fullsock(sk) ? 
sk : NULL); local_bh_disable(); ctl_sk = this_cpu_read(*net->ipv4.tcp_sk); - if (sk) + if (sk) { ctl_sk->sk_mark = (sk->sk_state == TCP_TIME_WAIT) ? inet_twsk(sk)->tw_mark : sk->sk_mark; + tcp_set_tx_time(skb, sk); + } ip_send_unicast_reply(ctl_sk, skb, &TCP_SKB_CB(skb)->header.h4.opt, ip_hdr(skb)->saddr, ip_hdr(skb)->daddr, @@ -859,9 +861,9 @@ static void tcp_v4_send_ack(const struct sock *sk, arg.uid = sock_net_uid(net, sk_fullsock(sk) ? sk : NULL); local_bh_disable(); ctl_sk = this_cpu_read(*net->ipv4.tcp_sk); - if (sk) - ctl_sk->sk_mark = (sk->sk_state == TCP_TIME_WAIT) ? - inet_twsk(sk)->tw_mark : sk->sk_mark; + ctl_sk->sk_mark = (sk->sk_state == TCP_TIME_WAIT) ? + inet_twsk(sk)->tw_mark : sk->sk_mark; + tcp_set_tx_time(skb, sk); ip_send_unicast_reply(ctl_sk, skb, &TCP_SKB_CB(skb)->header.h4.opt, ip_hdr(skb)->saddr, ip_hdr(skb)->daddr, diff --git a/net/ipv4/tcp_minisocks.c b/net/ipv4/tcp_minisocks.c index 11011e8386dc..8bcaf2586b68 100644 --- a/net/ipv4/tcp_minisocks.c +++ b/net/ipv4/tcp_minisocks.c @@ -274,7 +274,7 @@ void tcp_time_wait(struct sock *sk, int state, int timeo) tcptw->tw_ts_recent_stamp = tp->rx_opt.ts_recent_stamp; tcptw->tw_ts_offset = tp->tsoffset; tcptw->tw_last_oow_ack_time = 0; - + tcptw->tw_tx_delay = tp->tcp_tx_delay; #if IS_ENABLED(CONFIG_IPV6) if (tw->tw_family == PF_INET6) { struct ipv6_pinfo *np = inet6_sk(sk); diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c index f429e856e263..d954ff9069e8 100644 --- a/net/ipv4/tcp_output.c +++ b/net/ipv4/tcp_output.c @@ -1153,6 +1153,8 @@ static int __tcp_transmit_skb(struct sock *sk, struct sk_buff *skb, memset(skb->cb, 0, max(sizeof(struct inet_skb_parm), sizeof(struct inet6_skb_parm))); + tcp_add_tx_delay(skb, tp); + err = icsk->icsk_af_ops->queue_xmit(sk, skb, &inet->cork.fl); if (unlikely(err > 0)) { @@ -2234,6 +2236,18 @@ static bool tcp_small_queue_check(struct sock *sk, const struct sk_buff *skb, sock_net(sk)->ipv4.sysctl_tcp_limit_output_bytes); limit <<= factor; + if 
(static_branch_unlikely(&tcp_tx_delay_enabled) && + tcp_sk(sk)->tcp_tx_delay) { + u64 extra_bytes = (u64)sk->sk_pacing_rate * tcp_sk(sk)->tcp_tx_delay; + + /* TSQ is based on skb truesize sum (sk_wmem_alloc), so we + * approximate our needs assuming an ~100% skb->truesize overhead. + * USEC_PER_SEC is approximated by 2^20. + * do_div(extra_bytes, USEC_PER_SEC/2) is replaced by a right shift. + */ + extra_bytes >>= (20 - 1); + limit += extra_bytes; + } if (refcount_read(&sk->sk_wmem_alloc) > limit) { /* Always send skb if rtx queue is empty. * No need to wait for TX completion to call us back, @@ -3212,6 +3226,7 @@ struct sk_buff *tcp_make_synack(const struct sock *sk, struct dst_entry *dst, int tcp_header_size; struct tcphdr *th; int mss; + u64 now; skb = alloc_skb(MAX_TCP_HEADER, GFP_ATOMIC); if (unlikely(!skb)) { @@ -3243,13 +3258,14 @@ struct sk_buff *tcp_make_synack(const struct sock *sk, struct dst_entry *dst, mss = tcp_mss_clamp(tp, dst_metric_advmss(dst)); memset(&opts, 0, sizeof(opts)); + now = tcp_clock_ns(); #ifdef CONFIG_SYN_COOKIES if (unlikely(req->cookie_ts)) skb->skb_mstamp_ns = cookie_init_timestamp(req); else #endif { - skb->skb_mstamp_ns = tcp_clock_ns(); + skb->skb_mstamp_ns = now; if (!tcp_rsk(req)->snt_synack) /* Timestamp first SYNACK */ tcp_rsk(req)->snt_synack = tcp_skb_timestamp_us(skb); } @@ -3292,8 +3308,9 @@ struct sk_buff *tcp_make_synack(const struct sock *sk, struct dst_entry *dst, rcu_read_unlock(); #endif - /* Do not fool tcpdump (if any), clean our debris */ - skb->tstamp = 0; + skb->skb_mstamp_ns = now; + tcp_add_tx_delay(skb, tp); + return skb; } EXPORT_SYMBOL(tcp_make_synack); diff --git a/net/ipv6/tcp_ipv6.c b/net/ipv6/tcp_ipv6.c index ad7039137a20..5606b2131b65 100644 --- a/net/ipv6/tcp_ipv6.c +++ b/net/ipv6/tcp_ipv6.c @@ -892,6 +892,7 @@ static void tcp_v6_send_response(const struct sock *sk, struct sk_buff *skb, u32 } else { mark = sk->sk_mark; } + tcp_set_tx_time(buff, sk); } fl6.flowi6_mark = IP6_REPLY_MARK(net, skb->mark) 
?: mark; fl6.fl6_dport = t1->dest; -- cgit v1.2.3-71-gd317 From de76cda215d56256ffcda7ffa538b70f9fb301a7 Mon Sep 17 00:00:00 2001 From: Gustavo Pimentel Date: Tue, 4 Jun 2019 18:24:43 +0200 Subject: PCI: Decode PCIe 32 GT/s link speed PCIe r5.0, sec 7.5.3.18, defines a new 32.0 GT/s bit in the Supported Link Speeds Vector of Link Capabilities 2. Decode this new speed. This does not affect the speed of the link, which should be negotiated automatically by the hardware; it only adds decoding when showing the speed to the user. Previously, reading the speed of a link operating at this speed showed "Unknown speed" instead of "32.0 GT/s". Link: https://lore.kernel.org/lkml/92365e3caf0fc559f9ab14bcd053bfc92d4f661c.1559664969.git.gustavo.pimentel@synopsys.com Signed-off-by: Gustavo Pimentel [bhelgaas: changelog] Signed-off-by: Bjorn Helgaas --- drivers/pci/pci-sysfs.c | 3 +++ drivers/pci/pci.c | 4 +++- drivers/pci/probe.c | 2 +- drivers/pci/slot.c | 1 + include/linux/pci.h | 1 + include/uapi/linux/pci_regs.h | 4 ++++ 6 files changed, 13 insertions(+), 2 deletions(-) (limited to 'include/uapi/linux') diff --git a/drivers/pci/pci-sysfs.c b/drivers/pci/pci-sysfs.c index 6d27475e39b2..d52d30448e41 100644 --- a/drivers/pci/pci-sysfs.c +++ b/drivers/pci/pci-sysfs.c @@ -182,6 +182,9 @@ static ssize_t current_link_speed_show(struct device *dev, return -EINVAL; switch (linkstat & PCI_EXP_LNKSTA_CLS) { + case PCI_EXP_LNKSTA_CLS_32_0GB: + speed = "32 GT/s"; + break; case PCI_EXP_LNKSTA_CLS_16_0GB: speed = "16 GT/s"; break; diff --git a/drivers/pci/pci.c b/drivers/pci/pci.c index 8abc843b1615..4729a7c7a9d9 100644 --- a/drivers/pci/pci.c +++ b/drivers/pci/pci.c @@ -5621,7 +5621,9 @@ enum pci_bus_speed pcie_get_speed_cap(struct pci_dev *dev) */ pcie_capability_read_dword(dev, PCI_EXP_LNKCAP2, &lnkcap2); if (lnkcap2) { /* PCIe r3.0-compliant */ - if (lnkcap2 & PCI_EXP_LNKCAP2_SLS_16_0GB) + if (lnkcap2 & PCI_EXP_LNKCAP2_SLS_32_0GB) + return PCIE_SPEED_32_0GT; + else if (lnkcap2 & 
PCI_EXP_LNKCAP2_SLS_16_0GB) return PCIE_SPEED_16_0GT; else if (lnkcap2 & PCI_EXP_LNKCAP2_SLS_8_0GB) return PCIE_SPEED_8_0GT; diff --git a/drivers/pci/probe.c b/drivers/pci/probe.c index 0e8e2c186f50..c5f27c8cd140 100644 --- a/drivers/pci/probe.c +++ b/drivers/pci/probe.c @@ -668,7 +668,7 @@ const unsigned char pcie_link_speed[] = { PCIE_SPEED_5_0GT, /* 2 */ PCIE_SPEED_8_0GT, /* 3 */ PCIE_SPEED_16_0GT, /* 4 */ - PCI_SPEED_UNKNOWN, /* 5 */ + PCIE_SPEED_32_0GT, /* 5 */ PCI_SPEED_UNKNOWN, /* 6 */ PCI_SPEED_UNKNOWN, /* 7 */ PCI_SPEED_UNKNOWN, /* 8 */ diff --git a/drivers/pci/slot.c b/drivers/pci/slot.c index f4d92b1afe7b..ae4aa0e1f2f4 100644 --- a/drivers/pci/slot.c +++ b/drivers/pci/slot.c @@ -75,6 +75,7 @@ static const char *pci_bus_speed_strings[] = { "5.0 GT/s PCIe", /* 0x15 */ "8.0 GT/s PCIe", /* 0x16 */ "16.0 GT/s PCIe", /* 0x17 */ + "32.0 GT/s PCIe", /* 0x18 */ }; static ssize_t bus_speed_read(enum pci_bus_speed speed, char *buf) diff --git a/include/linux/pci.h b/include/linux/pci.h index 4a5a84d7bdd4..2173e6b75579 100644 --- a/include/linux/pci.h +++ b/include/linux/pci.h @@ -258,6 +258,7 @@ enum pci_bus_speed { PCIE_SPEED_5_0GT = 0x15, PCIE_SPEED_8_0GT = 0x16, PCIE_SPEED_16_0GT = 0x17, + PCIE_SPEED_32_0GT = 0x18, PCI_SPEED_UNKNOWN = 0xff, }; diff --git a/include/uapi/linux/pci_regs.h b/include/uapi/linux/pci_regs.h index 27164769d184..f28e562d7ca8 100644 --- a/include/uapi/linux/pci_regs.h +++ b/include/uapi/linux/pci_regs.h @@ -528,6 +528,7 @@ #define PCI_EXP_LNKCAP_SLS_5_0GB 0x00000002 /* LNKCAP2 SLS Vector bit 1 */ #define PCI_EXP_LNKCAP_SLS_8_0GB 0x00000003 /* LNKCAP2 SLS Vector bit 2 */ #define PCI_EXP_LNKCAP_SLS_16_0GB 0x00000004 /* LNKCAP2 SLS Vector bit 3 */ +#define PCI_EXP_LNKCAP_SLS_32_0GB 0x00000005 /* LNKCAP2 SLS Vector bit 4 */ #define PCI_EXP_LNKCAP_MLW 0x000003f0 /* Maximum Link Width */ #define PCI_EXP_LNKCAP_ASPMS 0x00000c00 /* ASPM Support */ #define PCI_EXP_LNKCAP_L0SEL 0x00007000 /* L0s Exit Latency */ @@ -556,6 +557,7 @@ #define 
PCI_EXP_LNKSTA_CLS_5_0GB 0x0002 /* Current Link Speed 5.0GT/s */ #define PCI_EXP_LNKSTA_CLS_8_0GB 0x0003 /* Current Link Speed 8.0GT/s */ #define PCI_EXP_LNKSTA_CLS_16_0GB 0x0004 /* Current Link Speed 16.0GT/s */ +#define PCI_EXP_LNKSTA_CLS_32_0GB 0x0005 /* Current Link Speed 32.0GT/s */ #define PCI_EXP_LNKSTA_NLW 0x03f0 /* Negotiated Link Width */ #define PCI_EXP_LNKSTA_NLW_X1 0x0010 /* Current Link Width x1 */ #define PCI_EXP_LNKSTA_NLW_X2 0x0020 /* Current Link Width x2 */ @@ -661,6 +663,7 @@ #define PCI_EXP_LNKCAP2_SLS_5_0GB 0x00000004 /* Supported Speed 5GT/s */ #define PCI_EXP_LNKCAP2_SLS_8_0GB 0x00000008 /* Supported Speed 8GT/s */ #define PCI_EXP_LNKCAP2_SLS_16_0GB 0x00000010 /* Supported Speed 16GT/s */ +#define PCI_EXP_LNKCAP2_SLS_32_0GB 0x00000020 /* Supported Speed 32GT/s */ #define PCI_EXP_LNKCAP2_CROSSLINK 0x00000100 /* Crosslink supported */ #define PCI_EXP_LNKCTL2 48 /* Link Control 2 */ #define PCI_EXP_LNKCTL2_TLS 0x000f @@ -668,6 +671,7 @@ #define PCI_EXP_LNKCTL2_TLS_5_0GT 0x0002 /* Supported Speed 5GT/s */ #define PCI_EXP_LNKCTL2_TLS_8_0GT 0x0003 /* Supported Speed 8GT/s */ #define PCI_EXP_LNKCTL2_TLS_16_0GT 0x0004 /* Supported Speed 16GT/s */ +#define PCI_EXP_LNKCTL2_TLS_32_0GT 0x0005 /* Supported Speed 32GT/s */ #define PCI_EXP_LNKSTA2 50 /* Link Status 2 */ #define PCI_CAP_EXP_ENDPOINT_SIZEOF_V2 52 /* v2 endpoints with link end here */ #define PCI_EXP_SLTCAP2 52 /* Slot Capabilities 2 */ -- cgit v1.2.3-71-gd317 From ed63bb1d1f8469586006a9ca63c42344401aa2ab Mon Sep 17 00:00:00 2001 From: Greg Hackmann Date: Thu, 13 Jun 2019 15:34:06 -0700 Subject: dma-buf: give each buffer a full-fledged inode By traversing /proc/*/fd and /proc/*/map_files, processes with CAP_ADMIN can get a lot of fine-grained data about how shmem buffers are shared among processes. stat(2) on each entry gives the caller a unique ID (st_ino), the buffer's size (st_size), and even the number of pages currently charged to the buffer (st_blocks / 512). 
In contrast, all dma-bufs share the same anonymous inode. So while we can count how many dma-buf fds or mappings a process has, we can't get the size of the backing buffers or tell if two entries point to the same dma-buf. On systems with debugfs, we can get a per-buffer breakdown of size and reference count, but can't tell which processes are actually holding the references to each buffer. Replace the singleton inode with full-fledged inodes allocated by alloc_anon_inode(). This involves creating and mounting a mini-pseudo-filesystem for dma-buf, following the example in fs/aio.c. Signed-off-by: Greg Hackmann Signed-off-by: Chenbo Feng Signed-off-by: Sumit Semwal Link: https://patchwork.freedesktop.org/patch/msgid/20190613223408.139221-2-fengc@google.com --- drivers/dma-buf/dma-buf.c | 63 +++++++++++++++++++++++++++++++++++++++++----- include/uapi/linux/magic.h | 1 + 2 files changed, 58 insertions(+), 6 deletions(-) (limited to 'include/uapi/linux') diff --git a/drivers/dma-buf/dma-buf.c b/drivers/dma-buf/dma-buf.c index f4104a21b069..3612ccededd6 100644 --- a/drivers/dma-buf/dma-buf.c +++ b/drivers/dma-buf/dma-buf.c @@ -34,8 +34,10 @@ #include #include #include +#include #include +#include static inline int is_dma_buf_file(struct file *); @@ -46,6 +48,25 @@ struct dma_buf_list { static struct dma_buf_list db_list; +static const struct dentry_operations dma_buf_dentry_ops = { + .d_dname = simple_dname, +}; + +static struct vfsmount *dma_buf_mnt; + +static struct dentry *dma_buf_fs_mount(struct file_system_type *fs_type, + int flags, const char *name, void *data) +{ + return mount_pseudo(fs_type, "dmabuf:", NULL, &dma_buf_dentry_ops, + DMA_BUF_MAGIC); +} + +static struct file_system_type dma_buf_fs_type = { + .name = "dmabuf", + .mount = dma_buf_fs_mount, + .kill_sb = kill_anon_super, +}; + static int dma_buf_release(struct inode *inode, struct file *file) { struct dma_buf *dmabuf; @@ -342,6 +363,31 @@ static inline int is_dma_buf_file(struct file *file) return 
file->f_op == &dma_buf_fops; } +static struct file *dma_buf_getfile(struct dma_buf *dmabuf, int flags) +{ + struct file *file; + struct inode *inode = alloc_anon_inode(dma_buf_mnt->mnt_sb); + + if (IS_ERR(inode)) + return ERR_CAST(inode); + + inode->i_size = dmabuf->size; + inode_set_bytes(inode, dmabuf->size); + + file = alloc_file_pseudo(inode, dma_buf_mnt, "dmabuf", + flags, &dma_buf_fops); + if (IS_ERR(file)) + goto err_alloc_file; + file->f_flags = flags & (O_ACCMODE | O_NONBLOCK); + file->private_data = dmabuf; + + return file; + +err_alloc_file: + iput(inode); + return file; +} + /** * DOC: dma buf device access * @@ -436,8 +482,7 @@ struct dma_buf *dma_buf_export(const struct dma_buf_export_info *exp_info) } dmabuf->resv = resv; - file = anon_inode_getfile("dmabuf", &dma_buf_fops, dmabuf, - exp_info->flags); + file = dma_buf_getfile(dmabuf, exp_info->flags); if (IS_ERR(file)) { ret = PTR_ERR(file); goto err_dmabuf; @@ -1055,8 +1100,8 @@ static int dma_buf_debug_show(struct seq_file *s, void *unused) return ret; seq_puts(s, "\nDma-buf Objects:\n"); - seq_printf(s, "%-8s\t%-8s\t%-8s\t%-8s\texp_name\n", - "size", "flags", "mode", "count"); + seq_printf(s, "%-8s\t%-8s\t%-8s\t%-8s\texp_name\t%-8s\n", + "size", "flags", "mode", "count", "ino"); list_for_each_entry(buf_obj, &db_list.head, list_node) { ret = mutex_lock_interruptible(&buf_obj->lock); @@ -1067,11 +1112,12 @@ static int dma_buf_debug_show(struct seq_file *s, void *unused) continue; } - seq_printf(s, "%08zu\t%08x\t%08x\t%08ld\t%s\n", + seq_printf(s, "%08zu\t%08x\t%08x\t%08ld\t%s\t%08lu\n", buf_obj->size, buf_obj->file->f_flags, buf_obj->file->f_mode, file_count(buf_obj->file), - buf_obj->exp_name); + buf_obj->exp_name, + file_inode(buf_obj->file)->i_ino); robj = buf_obj->resv; while (true) { @@ -1167,6 +1213,10 @@ static inline void dma_buf_uninit_debugfs(void) static int __init dma_buf_init(void) { + dma_buf_mnt = kern_mount(&dma_buf_fs_type); + if (IS_ERR(dma_buf_mnt)) + return PTR_ERR(dma_buf_mnt); 
+ mutex_init(&db_list.lock); INIT_LIST_HEAD(&db_list.head); dma_buf_init_debugfs(); @@ -1177,5 +1227,6 @@ subsys_initcall(dma_buf_init); static void __exit dma_buf_deinit(void) { dma_buf_uninit_debugfs(); + kern_unmount(dma_buf_mnt); } __exitcall(dma_buf_deinit); diff --git a/include/uapi/linux/magic.h b/include/uapi/linux/magic.h index f8c00045d537..665e18627f78 100644 --- a/include/uapi/linux/magic.h +++ b/include/uapi/linux/magic.h @@ -91,5 +91,6 @@ #define UDF_SUPER_MAGIC 0x15013346 #define BALLOON_KVM_MAGIC 0x13661366 #define ZSMALLOC_MAGIC 0x58295829 +#define DMA_BUF_MAGIC 0x444d4142 /* "DMAB" */ #endif /* __LINUX_MAGIC_H__ */ -- cgit v1.2.3-71-gd317 From bb2bb903042517b8fb17b2bc21e00512f2dcac01 Mon Sep 17 00:00:00 2001 From: Greg Hackmann Date: Thu, 13 Jun 2019 15:34:07 -0700 Subject: dma-buf: add DMA_BUF_SET_NAME ioctls This patch adds complementary DMA_BUF_SET_NAME ioctls, which let userspace processes attach a free-form name to each buffer. This information can be extremely helpful for tracking and accounting shared buffers. For example, on Android, we know what each buffer will be used for at allocation time: GL, multimedia, camera, etc. The userspace allocator can use DMA_BUF_SET_NAME to associate that information with the buffer, so we can later give developers a breakdown of how much memory they're allocating for graphics, camera, etc.
Signed-off-by: Greg Hackmann Signed-off-by: Chenbo Feng Signed-off-by: Sumit Semwal Link: https://patchwork.freedesktop.org/patch/msgid/20190613223408.139221-3-fengc@google.com --- drivers/dma-buf/dma-buf.c | 65 ++++++++++++++++++++++++++++++++++++++++++-- include/linux/dma-buf.h | 5 +++- include/uapi/linux/dma-buf.h | 3 ++ 3 files changed, 69 insertions(+), 4 deletions(-) (limited to 'include/uapi/linux') diff --git a/drivers/dma-buf/dma-buf.c b/drivers/dma-buf/dma-buf.c index 3612ccededd6..ab96410d1dcd 100644 --- a/drivers/dma-buf/dma-buf.c +++ b/drivers/dma-buf/dma-buf.c @@ -48,8 +48,24 @@ struct dma_buf_list { static struct dma_buf_list db_list; +static char *dmabuffs_dname(struct dentry *dentry, char *buffer, int buflen) +{ + struct dma_buf *dmabuf; + char name[DMA_BUF_NAME_LEN]; + size_t ret = 0; + + dmabuf = dentry->d_fsdata; + mutex_lock(&dmabuf->lock); + if (dmabuf->name) + ret = strlcpy(name, dmabuf->name, DMA_BUF_NAME_LEN); + mutex_unlock(&dmabuf->lock); + + return dynamic_dname(dentry, buffer, buflen, "/%s:%s", + dentry->d_name.name, ret > 0 ? name : ""); +} + static const struct dentry_operations dma_buf_dentry_ops = { - .d_dname = simple_dname, + .d_dname = dmabuffs_dname, }; static struct vfsmount *dma_buf_mnt; @@ -301,6 +317,43 @@ out: return events; } +/** + * dma_buf_set_name - Set a name to a specific dma_buf to track the usage. + * The name of the dma-buf buffer can only be set when the dma-buf is not + * attached to any devices. It could theoretically support changing the + * name of the dma-buf if the same piece of memory is used for multiple + * purpose between different devices. + * + * @dmabuf: [in] dmabuf buffer that will be renamed. + * @buf: [in] A piece of userspace memory that contains the name of + * the dma-buf. + * + * Returns 0 on success. If the dma-buf buffer is already attached to + * devices, return -EBUSY.
+ * + */ +static long dma_buf_set_name(struct dma_buf *dmabuf, const char __user *buf) +{ + char *name = strndup_user(buf, DMA_BUF_NAME_LEN); + long ret = 0; + + if (IS_ERR(name)) + return PTR_ERR(name); + + mutex_lock(&dmabuf->lock); + if (!list_empty(&dmabuf->attachments)) { + ret = -EBUSY; + kfree(name); + goto out_unlock; + } + kfree(dmabuf->name); + dmabuf->name = name; + +out_unlock: + mutex_unlock(&dmabuf->lock); + return ret; +} + static long dma_buf_ioctl(struct file *file, unsigned int cmd, unsigned long arg) { @@ -339,6 +392,10 @@ static long dma_buf_ioctl(struct file *file, ret = dma_buf_begin_cpu_access(dmabuf, direction); return ret; + + case DMA_BUF_SET_NAME: + return dma_buf_set_name(dmabuf, (const char __user *)arg); + default: return -ENOTTY; } @@ -380,6 +437,7 @@ static struct file *dma_buf_getfile(struct dma_buf *dmabuf, int flags) goto err_alloc_file; file->f_flags = flags & (O_ACCMODE | O_NONBLOCK); file->private_data = dmabuf; + file->f_path.dentry->d_fsdata = dmabuf; return file; @@ -1112,12 +1170,13 @@ static int dma_buf_debug_show(struct seq_file *s, void *unused) continue; } - seq_printf(s, "%08zu\t%08x\t%08x\t%08ld\t%s\t%08lu\n", + seq_printf(s, "%08zu\t%08x\t%08x\t%08ld\t%s\t%08lu\t%s\n", buf_obj->size, buf_obj->file->f_flags, buf_obj->file->f_mode, file_count(buf_obj->file), buf_obj->exp_name, - file_inode(buf_obj->file)->i_ino); + file_inode(buf_obj->file)->i_ino, + buf_obj->name ?: ""); robj = buf_obj->resv; while (true) { diff --git a/include/linux/dma-buf.h b/include/linux/dma-buf.h index 8a327566d7f4..01ad5b942a6f 100644 --- a/include/linux/dma-buf.h +++ b/include/linux/dma-buf.h @@ -280,10 +280,12 @@ struct dma_buf_ops { * @file: file pointer used for sharing buffers across, and for refcounting. * @attachments: list of dma_buf_attachment that denotes all devices attached. * @ops: dma_buf_ops associated with this buffer object. 
- * @lock: used internally to serialize list manipulation, attach/detach and vmap/unmap + * @lock: used internally to serialize list manipulation, attach/detach and + * vmap/unmap, and accesses to name * @vmapping_counter: used internally to refcnt the vmaps * @vmap_ptr: the current vmap ptr if vmapping_counter > 0 * @exp_name: name of the exporter; useful for debugging. + * @name: userspace-provided name; useful for accounting and debugging. * @owner: pointer to exporter module; used for refcounting when exporter is a * kernel module. * @list_node: node for dma_buf accounting and debugging. @@ -311,6 +313,7 @@ struct dma_buf { unsigned vmapping_counter; void *vmap_ptr; const char *exp_name; + const char *name; struct module *owner; struct list_head list_node; void *priv; diff --git a/include/uapi/linux/dma-buf.h b/include/uapi/linux/dma-buf.h index d75df5210a4a..dbc7092e04b5 100644 --- a/include/uapi/linux/dma-buf.h +++ b/include/uapi/linux/dma-buf.h @@ -35,7 +35,10 @@ struct dma_buf_sync { #define DMA_BUF_SYNC_VALID_FLAGS_MASK \ (DMA_BUF_SYNC_RW | DMA_BUF_SYNC_END) +#define DMA_BUF_NAME_LEN 32 + #define DMA_BUF_BASE 'b' #define DMA_BUF_IOCTL_SYNC _IOW(DMA_BUF_BASE, 0, struct dma_buf_sync) +#define DMA_BUF_SET_NAME _IOW(DMA_BUF_BASE, 1, const char *) #endif -- cgit v1.2.3-71-gd317 From cc3e14c21ae928b3f8bce584b2c7d53d332b9738 Mon Sep 17 00:00:00 2001 From: Chung-Hsien Hsu Date: Thu, 9 May 2019 09:49:05 +0000 Subject: nl80211: add WPA3 definition for SAE authentication Add definition of WPA version 3 for SAE authentication. 
Signed-off-by: Chung-Hsien Hsu Signed-off-by: Chi-Hsien Lin Signed-off-by: Johannes Berg --- include/uapi/linux/nl80211.h | 1 + net/wireless/nl80211.c | 3 ++- 2 files changed, 3 insertions(+), 1 deletion(-) (limited to 'include/uapi/linux') diff --git a/include/uapi/linux/nl80211.h b/include/uapi/linux/nl80211.h index 6f09d1500960..e9bf3d69d847 100644 --- a/include/uapi/linux/nl80211.h +++ b/include/uapi/linux/nl80211.h @@ -4406,6 +4406,7 @@ enum nl80211_mfp { enum nl80211_wpa_versions { NL80211_WPA_VERSION_1 = 1 << 0, NL80211_WPA_VERSION_2 = 1 << 1, + NL80211_WPA_VERSION_3 = 1 << 2, }; /** diff --git a/net/wireless/nl80211.c b/net/wireless/nl80211.c index 1c74ca377bd8..8332a5731c57 100644 --- a/net/wireless/nl80211.c +++ b/net/wireless/nl80211.c @@ -8736,7 +8736,8 @@ static int nl80211_dump_survey(struct sk_buff *skb, struct netlink_callback *cb) static bool nl80211_valid_wpa_versions(u32 wpa_versions) { return !(wpa_versions & ~(NL80211_WPA_VERSION_1 | - NL80211_WPA_VERSION_2)); + NL80211_WPA_VERSION_2 | + NL80211_WPA_VERSION_3)); } static int nl80211_authenticate(struct sk_buff *skb, struct genl_info *info) -- cgit v1.2.3-71-gd317 From 26f7044e95042daabcf1c71796a0e804a83c979f Mon Sep 17 00:00:00 2001 From: Chung-Hsien Hsu Date: Thu, 9 May 2019 09:49:06 +0000 Subject: nl80211: add support for SAE authentication offload Let drivers advertise support for station-mode SAE authentication offload with a new NL80211_EXT_FEATURE_SAE_OFFLOAD flag. 
Signed-off-by: Chung-Hsien Hsu Signed-off-by: Chi-Hsien Lin Signed-off-by: Johannes Berg --- include/linux/ieee80211.h | 1 + include/net/cfg80211.h | 5 +++++ include/uapi/linux/nl80211.h | 19 +++++++++++++++++++ net/wireless/nl80211.c | 14 ++++++++++++++ 4 files changed, 39 insertions(+) (limited to 'include/uapi/linux') diff --git a/include/linux/ieee80211.h b/include/linux/ieee80211.h index 61f0a316c6ac..5dfd949ade25 100644 --- a/include/linux/ieee80211.h +++ b/include/linux/ieee80211.h @@ -2612,6 +2612,7 @@ enum ieee80211_key_len { #define FILS_ERP_MAX_RRK_LEN 64 #define PMK_MAX_LEN 64 +#define SAE_PASSWORD_MAX_LEN 128 /* Public action codes (IEEE Std 802.11-2016, 9.6.8.1, Table 9-307) */ enum ieee80211_pub_actioncode { diff --git a/include/net/cfg80211.h b/include/net/cfg80211.h index c19687833493..4b45056dbb25 100644 --- a/include/net/cfg80211.h +++ b/include/net/cfg80211.h @@ -742,6 +742,9 @@ struct survey_info { * CFG80211_MAX_WEP_KEYS WEP keys * @wep_tx_key: key index (0..3) of the default TX static WEP key * @psk: PSK (for devices supporting 4-way-handshake offload) + * @sae_pwd: password for SAE authentication (for devices supporting SAE + * offload) + * @sae_pwd_len: length of SAE password (for devices supporting SAE offload) */ struct cfg80211_crypto_settings { u32 wpa_versions; @@ -757,6 +760,8 @@ struct cfg80211_crypto_settings { struct key_params *wep_keys; int wep_tx_key; const u8 *psk; + const u8 *sae_pwd; + u8 sae_pwd_len; }; /** diff --git a/include/uapi/linux/nl80211.h b/include/uapi/linux/nl80211.h index e9bf3d69d847..8b1e43fecd25 100644 --- a/include/uapi/linux/nl80211.h +++ b/include/uapi/linux/nl80211.h @@ -234,6 +234,15 @@ * use in a FILS shared key connection with PMKSA caching. */ +/** + * DOC: SAE authentication offload + * + * By setting @NL80211_EXT_FEATURE_SAE_OFFLOAD flag drivers can indicate they + * support offloading SAE authentication for WPA3-Personal networks. 
In + * %NL80211_CMD_CONNECT the password for SAE should be specified using + * %NL80211_ATTR_SAE_PASSWORD. + */ + /** * enum nl80211_commands - supported nl80211 commands * @@ -2341,6 +2350,10 @@ enum nl80211_commands { * should be picking up the lowest tx power, either tx power per-interface * or per-station. * + * @NL80211_ATTR_SAE_PASSWORD: attribute for passing SAE password material. It + * is used with %NL80211_CMD_CONNECT to provide password for offloading + * SAE authentication for WPA3-Personal networks. + * * @NUM_NL80211_ATTR: total number of nl80211_attrs available * @NL80211_ATTR_MAX: highest attribute number currently defined * @__NL80211_ATTR_AFTER_LAST: internal use @@ -2794,6 +2807,8 @@ enum nl80211_attrs { NL80211_ATTR_STA_TX_POWER_SETTING, NL80211_ATTR_STA_TX_POWER, + NL80211_ATTR_SAE_PASSWORD, + /* add attributes here, update the policy in nl80211.c */ __NL80211_ATTR_AFTER_LAST, @@ -5423,6 +5438,9 @@ enum nl80211_feature_flags { * @NL80211_EXT_FEATURE_STA_TX_PWR: This driver supports controlling tx power * to a station. * + * @NL80211_EXT_FEATURE_SAE_OFFLOAD: Device wants to do SAE authentication in + * station mode (SAE password is passed as part of the connect command). + * * @NUM_NL80211_EXT_FEATURES: number of extended features. * @MAX_NL80211_EXT_FEATURES: highest extended feature index. 
*/ @@ -5467,6 +5485,7 @@ enum nl80211_ext_feature_index { NL80211_EXT_FEATURE_SCHED_SCAN_BAND_SPECIFIC_RSSI_THOLD, NL80211_EXT_FEATURE_EXT_KEY_ID, NL80211_EXT_FEATURE_STA_TX_PWR, + NL80211_EXT_FEATURE_SAE_OFFLOAD, /* add new features before the definition below */ NUM_NL80211_EXT_FEATURES, diff --git a/net/wireless/nl80211.c b/net/wireless/nl80211.c index 8332a5731c57..80e514872719 100644 --- a/net/wireless/nl80211.c +++ b/net/wireless/nl80211.c @@ -571,6 +571,8 @@ const struct nla_policy nl80211_policy[NUM_NL80211_ATTR] = { [NL80211_ATTR_PEER_MEASUREMENTS] = NLA_POLICY_NESTED(nl80211_pmsr_attr_policy), [NL80211_ATTR_AIRTIME_WEIGHT] = NLA_POLICY_MIN(NLA_U16, 1), + [NL80211_ATTR_SAE_PASSWORD] = { .type = NLA_BINARY, + .len = SAE_PASSWORD_MAX_LEN }, }; /* policy for the key attributes */ @@ -4434,6 +4436,8 @@ static bool nl80211_valid_auth_type(struct cfg80211_registered_device *rdev, return true; case NL80211_CMD_CONNECT: if (!(rdev->wiphy.features & NL80211_FEATURE_SAE) && + !wiphy_ext_feature_isset(&rdev->wiphy, + NL80211_EXT_FEATURE_SAE_OFFLOAD) && auth_type == NL80211_AUTHTYPE_SAE) return false; @@ -8973,6 +8977,16 @@ static int nl80211_crypto_settings(struct cfg80211_registered_device *rdev, settings->psk = nla_data(info->attrs[NL80211_ATTR_PMK]); } + if (info->attrs[NL80211_ATTR_SAE_PASSWORD]) { + if (!wiphy_ext_feature_isset(&rdev->wiphy, + NL80211_EXT_FEATURE_SAE_OFFLOAD)) + return -EINVAL; + settings->sae_pwd = + nla_data(info->attrs[NL80211_ATTR_SAE_PASSWORD]); + settings->sae_pwd_len = + nla_len(info->attrs[NL80211_ATTR_SAE_PASSWORD]); + } + return 0; } -- cgit v1.2.3-71-gd317 From a0de1ca383c77a1ae123d7c0cea45e327b61876a Mon Sep 17 00:00:00 2001 From: John Crispin Date: Tue, 28 May 2019 13:49:48 +0200 Subject: mac80211: allow turning TWT responder support on and off via netlink Allow the userland daemon to en/disable TWT support for an AP. 
Signed-off-by: Shashidhar Lakkavalli Signed-off-by: John Crispin [simplify parsing code] Signed-off-by: Johannes Berg --- include/net/cfg80211.h | 2 ++ include/net/mac80211.h | 3 +++ include/uapi/linux/nl80211.h | 4 ++++ net/mac80211/cfg.c | 4 +++- net/wireless/nl80211.c | 4 ++++ 5 files changed, 16 insertions(+), 1 deletion(-) (limited to 'include/uapi/linux') diff --git a/include/net/cfg80211.h b/include/net/cfg80211.h index 7c4aa868e7a5..ac758a54e971 100644 --- a/include/net/cfg80211.h +++ b/include/net/cfg80211.h @@ -897,6 +897,7 @@ enum cfg80211_ap_settings_flags { * @he_cap: HE capabilities (or %NULL if HE isn't enabled) * @ht_required: stations must support HT * @vht_required: stations must support VHT + * @twt_responder: Enable Target Wake Time * @flags: flags, as defined in enum cfg80211_ap_settings_flags */ struct cfg80211_ap_settings { @@ -923,6 +924,7 @@ struct cfg80211_ap_settings { const struct ieee80211_vht_cap *vht_cap; const struct ieee80211_he_cap_elem *he_cap; bool ht_required, vht_required; + bool twt_responder; u32 flags; }; diff --git a/include/net/mac80211.h b/include/net/mac80211.h index e8fdb786b228..ed4911306f03 100644 --- a/include/net/mac80211.h +++ b/include/net/mac80211.h @@ -506,6 +506,8 @@ struct ieee80211_ftm_responder_params { * @he_support: does this BSS support HE * @twt_requester: does this BSS support TWT requester (relevant for managed * mode only, set if the AP advertises TWT responder role) + * @twt_responder: does this BSS support TWT responder (relevant for AP + * mode only, set when the AP is started with TWT responder support enabled) * @assoc: association status * @ibss_joined: indicates whether this station is part of an IBSS * or not @@ -613,6 +615,7 @@ struct ieee80211_bss_conf { u16 frame_time_rts_th; bool he_support; bool twt_requester; + bool twt_responder; /* association related data */ bool assoc, ibss_joined; bool ibss_creator; diff --git a/include/uapi/linux/nl80211.h b/include/uapi/linux/nl80211.h index
8b1e43fecd25..8fc3a43cac75 100644 --- a/include/uapi/linux/nl80211.h +++ b/include/uapi/linux/nl80211.h @@ -2354,6 +2354,8 @@ enum nl80211_commands { * is used with %NL80211_CMD_CONNECT to provide password for offloading * SAE authentication for WPA3-Personal networks. * + * @NL80211_ATTR_TWT_RESPONDER: Enable target wake time responder support. + * * @NUM_NL80211_ATTR: total number of nl80211_attrs available * @NL80211_ATTR_MAX: highest attribute number currently defined * @__NL80211_ATTR_AFTER_LAST: internal use @@ -2809,6 +2811,8 @@ enum nl80211_attrs { NL80211_ATTR_SAE_PASSWORD, + NL80211_ATTR_TWT_RESPONDER, + /* add attributes here, update the policy in nl80211.c */ __NL80211_ATTR_AFTER_LAST, diff --git a/net/mac80211/cfg.c b/net/mac80211/cfg.c index 52e6a091b7e4..023e8751d223 100644 --- a/net/mac80211/cfg.c +++ b/net/mac80211/cfg.c @@ -975,7 +975,8 @@ static int ieee80211_start_ap(struct wiphy *wiphy, struct net_device *dev, BSS_CHANGED_BEACON | BSS_CHANGED_SSID | BSS_CHANGED_P2P_PS | - BSS_CHANGED_TXPOWER; + BSS_CHANGED_TXPOWER | + BSS_CHANGED_TWT; int err; int prev_beacon_int; @@ -1045,6 +1046,7 @@ static int ieee80211_start_ap(struct wiphy *wiphy, struct net_device *dev, sdata->vif.bss_conf.dtim_period = params->dtim_period; sdata->vif.bss_conf.enable_beacon = true; sdata->vif.bss_conf.allow_p2p_go_ps = sdata->vif.p2p; + sdata->vif.bss_conf.twt_responder = params->twt_responder; sdata->vif.bss_conf.ssid_len = params->ssid_len; if (params->ssid_len) diff --git a/net/wireless/nl80211.c b/net/wireless/nl80211.c index 34e86539552e..68e5ab5394dd 100644 --- a/net/wireless/nl80211.c +++ b/net/wireless/nl80211.c @@ -573,6 +573,7 @@ const struct nla_policy nl80211_policy[NUM_NL80211_ATTR] = { [NL80211_ATTR_AIRTIME_WEIGHT] = NLA_POLICY_MIN(NLA_U16, 1), [NL80211_ATTR_SAE_PASSWORD] = { .type = NLA_BINARY, .len = SAE_PASSWORD_MAX_LEN }, + [NL80211_ATTR_TWT_RESPONDER] = { .type = NLA_FLAG }, }; /* policy for the key attributes */ @@ -4628,6 +4629,9 @@ static int
nl80211_start_ap(struct sk_buff *skb, struct genl_info *info) return PTR_ERR(params.acl); } + params.twt_responder = + nla_get_flag(info->attrs[NL80211_ATTR_TWT_RESPONDER]); + nl80211_calculate_ap_params(&params); if (info->attrs[NL80211_ATTR_EXTERNAL_AUTH_SUPPORT]) -- cgit v1.2.3-71-gd317 From 99c8b231ae6c6ca4ca2fd1c0b3701071f589661f Mon Sep 17 00:00:00 2001 From: Mauro Carvalho Chehab Date: Wed, 12 Jun 2019 14:52:41 -0300 Subject: docs: cgroup-v1: convert docs to ReST and rename to *.rst Convert the cgroup-v1 files to ReST format, in order to allow a later addition to the admin-guide. The conversion is actually: - add blank lines and indentation in order to identify paragraphs; - fix tables markups; - add some lists markups; - mark literal blocks; - adjust title markups. At its new index.rst, let's add a :orphan: while this is not linked to the main index.rst file, in order to avoid build warnings. Signed-off-by: Mauro Carvalho Chehab Acked-by: Tejun Heo Signed-off-by: Tejun Heo --- Documentation/admin-guide/hw-vuln/l1tf.rst | 2 +- Documentation/admin-guide/kernel-parameters.txt | 4 +- .../admin-guide/mm/numa_memory_policy.rst | 2 +- Documentation/block/bfq-iosched.txt | 2 +- Documentation/cgroup-v1/blkio-controller.rst | 391 ++++++++ Documentation/cgroup-v1/blkio-controller.txt | 375 -------- Documentation/cgroup-v1/cgroups.rst | 695 ++++++++++++++ Documentation/cgroup-v1/cgroups.txt | 677 ------------- Documentation/cgroup-v1/cpuacct.rst | 50 + Documentation/cgroup-v1/cpuacct.txt | 49 - Documentation/cgroup-v1/cpusets.rst | 866 +++++++++++++++++ Documentation/cgroup-v1/cpusets.txt | 839 ---------------- Documentation/cgroup-v1/devices.rst | 132 +++ Documentation/cgroup-v1/devices.txt | 116 --- Documentation/cgroup-v1/freezer-subsystem.rst | 127 +++ Documentation/cgroup-v1/freezer-subsystem.txt | 123 --- Documentation/cgroup-v1/hugetlb.rst | 50 + Documentation/cgroup-v1/hugetlb.txt | 49 - Documentation/cgroup-v1/index.rst | 30 + Documentation/cgroup-v1/memcg_test.rst
| 355 +++++++ Documentation/cgroup-v1/memcg_test.txt | 280 ------ Documentation/cgroup-v1/memory.rst | 1003 ++++++++++++++++++++ Documentation/cgroup-v1/memory.txt | 892 ----------------- Documentation/cgroup-v1/net_cls.rst | 44 + Documentation/cgroup-v1/net_cls.txt | 39 - Documentation/cgroup-v1/net_prio.rst | 57 ++ Documentation/cgroup-v1/net_prio.txt | 55 -- Documentation/cgroup-v1/pids.rst | 92 ++ Documentation/cgroup-v1/pids.txt | 88 -- Documentation/cgroup-v1/rdma.rst | 117 +++ Documentation/cgroup-v1/rdma.txt | 109 --- Documentation/filesystems/tmpfs.txt | 2 +- Documentation/scheduler/sched-deadline.txt | 2 +- Documentation/scheduler/sched-design-CFS.txt | 2 +- Documentation/scheduler/sched-rt-group.txt | 2 +- Documentation/vm/numa.rst | 4 +- Documentation/vm/page_migration.rst | 2 +- Documentation/vm/unevictable-lru.rst | 2 +- Documentation/x86/x86_64/fake-numa-for-cpusets.rst | 4 +- MAINTAINERS | 2 +- block/Kconfig | 2 +- include/linux/cgroup-defs.h | 2 +- include/uapi/linux/bpf.h | 2 +- init/Kconfig | 2 +- kernel/cgroup/cpuset.c | 2 +- security/device_cgroup.c | 2 +- tools/include/uapi/linux/bpf.h | 2 +- 47 files changed, 4032 insertions(+), 3714 deletions(-) create mode 100644 Documentation/cgroup-v1/blkio-controller.rst delete mode 100644 Documentation/cgroup-v1/blkio-controller.txt create mode 100644 Documentation/cgroup-v1/cgroups.rst delete mode 100644 Documentation/cgroup-v1/cgroups.txt create mode 100644 Documentation/cgroup-v1/cpuacct.rst delete mode 100644 Documentation/cgroup-v1/cpuacct.txt create mode 100644 Documentation/cgroup-v1/cpusets.rst delete mode 100644 Documentation/cgroup-v1/cpusets.txt create mode 100644 Documentation/cgroup-v1/devices.rst delete mode 100644 Documentation/cgroup-v1/devices.txt create mode 100644 Documentation/cgroup-v1/freezer-subsystem.rst delete mode 100644 Documentation/cgroup-v1/freezer-subsystem.txt create mode 100644 Documentation/cgroup-v1/hugetlb.rst delete mode 100644 Documentation/cgroup-v1/hugetlb.txt 
create mode 100644 Documentation/cgroup-v1/memory.rst delete mode 100644 Documentation/cgroup-v1/memory.txt create mode 100644 Documentation/cgroup-v1/net_cls.rst delete mode 100644 Documentation/cgroup-v1/net_cls.txt create mode 100644 Documentation/cgroup-v1/net_prio.rst delete mode 100644 Documentation/cgroup-v1/net_prio.txt create mode 100644 Documentation/cgroup-v1/pids.rst delete mode 100644 Documentation/cgroup-v1/pids.txt create mode 100644 Documentation/cgroup-v1/rdma.rst delete mode 100644 Documentation/cgroup-v1/rdma.txt (limited to 'include/uapi/linux') diff --git a/Documentation/admin-guide/hw-vuln/l1tf.rst b/Documentation/admin-guide/hw-vuln/l1tf.rst index 31653a9f0e1b..656aee262e23 100644 --- a/Documentation/admin-guide/hw-vuln/l1tf.rst +++ b/Documentation/admin-guide/hw-vuln/l1tf.rst @@ -241,7 +241,7 @@ Guest mitigation mechanisms For further information about confining guests to a single or to a group of cores consult the cpusets documentation: - https://www.kernel.org/doc/Documentation/cgroup-v1/cpusets.txt + https://www.kernel.org/doc/Documentation/cgroup-v1/cpusets.rst .. _interrupt_isolation: diff --git a/Documentation/admin-guide/kernel-parameters.txt b/Documentation/admin-guide/kernel-parameters.txt index 138f6664b2e2..da0e84ecee32 100644 --- a/Documentation/admin-guide/kernel-parameters.txt +++ b/Documentation/admin-guide/kernel-parameters.txt @@ -4078,7 +4078,7 @@ relax_domain_level= [KNL, SMP] Set scheduler's default relax_domain_level. - See Documentation/cgroup-v1/cpusets.txt. + See Documentation/cgroup-v1/cpusets.rst. reserve= [KNL,BUGS] Force kernel to ignore I/O ports or memory Format: <base1>,<size1>[,<base2>,<size2>,...]
@@ -4588,7 +4588,7 @@ swapaccount=[0|1] [KNL] Enable accounting of swap in memory resource controller if no parameter or 1 is given or disable - it if 0 is given (See Documentation/cgroup-v1/memory.txt) + it if 0 is given (See Documentation/cgroup-v1/memory.rst) swiotlb= [ARM,IA-64,PPC,MIPS,X86] Format: { <int> | force | noforce } diff --git a/Documentation/admin-guide/mm/numa_memory_policy.rst b/Documentation/admin-guide/mm/numa_memory_policy.rst index d78c5b315f72..546f174e5d6a 100644 --- a/Documentation/admin-guide/mm/numa_memory_policy.rst +++ b/Documentation/admin-guide/mm/numa_memory_policy.rst @@ -15,7 +15,7 @@ document attempts to describe the concepts and APIs of the 2.6 memory policy support. Memory policies should not be confused with cpusets -(``Documentation/cgroup-v1/cpusets.txt``) +(``Documentation/cgroup-v1/cpusets.rst``) which is an administrative mechanism for restricting the nodes from which memory may be allocated by a set of processes. Memory policies are a programming interface that a NUMA-aware application can take advantage of. When diff --git a/Documentation/block/bfq-iosched.txt b/Documentation/block/bfq-iosched.txt index 1a0f2ac02eb6..b2265cf6c9c3 100644 --- a/Documentation/block/bfq-iosched.txt +++ b/Documentation/block/bfq-iosched.txt @@ -539,7 +539,7 @@ As for cgroups-v1 (blkio controller), the exact set of stat files created, and kept up-to-date by bfq, depends on whether CONFIG_DEBUG_BLK_CGROUP is set. If it is set, then bfq creates all the stat files documented in -Documentation/cgroup-v1/blkio-controller.txt. If, instead, +Documentation/cgroup-v1/blkio-controller.rst.
If, instead, CONFIG_DEBUG_BLK_CGROUP is not set, then bfq creates only the files blkio.bfq.io_service_bytes blkio.bfq.io_service_bytes_recursive diff --git a/Documentation/cgroup-v1/blkio-controller.rst b/Documentation/cgroup-v1/blkio-controller.rst new file mode 100644 index 000000000000..2c1b907afc14 --- /dev/null +++ b/Documentation/cgroup-v1/blkio-controller.rst @@ -0,0 +1,391 @@ +=================== +Block IO Controller +=================== + +Overview +======== +cgroup subsys "blkio" implements the block io controller. There seems to be +a need of various kinds of IO control policies (like proportional BW, max BW) +both at leaf nodes as well as at intermediate nodes in a storage hierarchy. +Plan is to use the same cgroup based management interface for blkio controller +and based on user options switch IO policies in the background. + +Currently two IO control policies are implemented. First one is proportional +weight time based division of disk policy. It is implemented in CFQ. Hence +this policy takes effect only on leaf nodes when CFQ is being used. The second +one is throttling policy which can be used to specify upper IO rate limits +on devices. This policy is implemented in generic block layer and can be +used on leaf nodes as well as higher level logical devices like device mapper. + +HOWTO +===== +Proportional Weight division of bandwidth +----------------------------------------- +You can do a very simple testing of running two dd threads in two different +cgroups. Here is what you can do. + +- Enable Block IO controller:: + + CONFIG_BLK_CGROUP=y + +- Enable group scheduling in CFQ: + + + CONFIG_CFQ_GROUP_IOSCHED=y + +- Compile and boot into kernel and mount IO controller (blkio); see + cgroups.txt, Why are cgroups needed?. 
+ + :: + + mount -t tmpfs cgroup_root /sys/fs/cgroup + mkdir /sys/fs/cgroup/blkio + mount -t cgroup -o blkio none /sys/fs/cgroup/blkio + +- Create two cgroups:: + + mkdir -p /sys/fs/cgroup/blkio/test1/ /sys/fs/cgroup/blkio/test2 + +- Set weights of group test1 and test2:: + + echo 1000 > /sys/fs/cgroup/blkio/test1/blkio.weight + echo 500 > /sys/fs/cgroup/blkio/test2/blkio.weight + +- Create two same size files (say 512MB each) on same disk (file1, file2) and + launch two dd threads in different cgroup to read those files:: + + sync + echo 3 > /proc/sys/vm/drop_caches + + dd if=/mnt/sdb/zerofile1 of=/dev/null & + echo $! > /sys/fs/cgroup/blkio/test1/tasks + cat /sys/fs/cgroup/blkio/test1/tasks + + dd if=/mnt/sdb/zerofile2 of=/dev/null & + echo $! > /sys/fs/cgroup/blkio/test2/tasks + cat /sys/fs/cgroup/blkio/test2/tasks + +- At macro level, first dd should finish first. To get more precise data, keep + on looking at (with the help of script), at blkio.disk_time and + blkio.disk_sectors files of both test1 and test2 groups. This will tell how + much disk time (in milliseconds), each group got and how many sectors each + group dispatched to the disk. We provide fairness in terms of disk time, so + ideally io.disk_time of cgroups should be in proportion to the weight. + +Throttling/Upper Limit policy +----------------------------- +- Enable Block IO controller:: + + CONFIG_BLK_CGROUP=y + +- Enable throttling in block layer:: + + CONFIG_BLK_DEV_THROTTLING=y + +- Mount blkio controller (see cgroups.txt, Why are cgroups needed?):: + + mount -t cgroup -o blkio none /sys/fs/cgroup/blkio + +- Specify a bandwidth rate on particular device for root group. The format + for policy is "<major>:<minor> <bytes_per_second>":: + + echo "8:16 1048576" > /sys/fs/cgroup/blkio/blkio.throttle.read_bps_device + + Above will put a limit of 1MB/second on reads happening for root group + on device having major/minor number 8:16.
+ +- Run dd to read a file and see if rate is throttled to 1MB/s or not:: + + # dd iflag=direct if=/mnt/common/zerofile of=/dev/null bs=4K count=1024 + 1024+0 records in + 1024+0 records out + 4194304 bytes (4.2 MB) copied, 4.0001 s, 1.0 MB/s + + Limits for writes can be put using blkio.throttle.write_bps_device file. + +Hierarchical Cgroups +==================== + +Both CFQ and throttling implement hierarchy support; however, +throttling's hierarchy support is enabled iff "sane_behavior" is +enabled from cgroup side, which currently is a development option and +not publicly available. + +If somebody created a hierarchy like as follows:: + + root + / \ + test1 test2 + | + test3 + +CFQ by default and throttling with "sane_behavior" will handle the +hierarchy correctly. For details on CFQ hierarchy support, refer to +Documentation/block/cfq-iosched.txt. For throttling, all limits apply +to the whole subtree while all statistics are local to the IOs +directly generated by tasks in that cgroup. + +Throttling without "sane_behavior" enabled from cgroup side will +practically treat all groups at same level as if it looks like the +following:: + + pivot + / / \ \ + root test1 test2 test3 + +Various user visible config options +=================================== +CONFIG_BLK_CGROUP + - Block IO controller. + +CONFIG_DEBUG_BLK_CGROUP + - Debug help. Right now some additional stats file show up in cgroup + if this option is enabled. + +CONFIG_CFQ_GROUP_IOSCHED + - Enables group scheduling in CFQ. Currently only 1 level of group + creation is allowed. + +CONFIG_BLK_DEV_THROTTLING + - Enable block device throttling support in block layer. + +Details of cgroup files +======================= +Proportional weight policy files +-------------------------------- +- blkio.weight + - Specifies per cgroup weight. This is default weight of the group + on all the devices until and unless overridden by per device rule. + (See blkio.weight_device). 
+ Currently allowed range of weights is from 10 to 1000. + +- blkio.weight_device + - One can specify per cgroup per device rules using this interface. + These rules override the default value of group weight as specified + by blkio.weight. + + Following is the format:: + + # echo dev_maj:dev_minor weight > blkio.weight_device + + Configure weight=300 on /dev/sdb (8:16) in this cgroup:: + + # echo 8:16 300 > blkio.weight_device + # cat blkio.weight_device + dev weight + 8:16 300 + + Configure weight=500 on /dev/sda (8:0) in this cgroup:: + + # echo 8:0 500 > blkio.weight_device + # cat blkio.weight_device + dev weight + 8:0 500 + 8:16 300 + + Remove specific weight for /dev/sda in this cgroup:: + + # echo 8:0 0 > blkio.weight_device + # cat blkio.weight_device + dev weight + 8:16 300 + +- blkio.leaf_weight[_device] + - Equivalents of blkio.weight[_device] for the purpose of + deciding how much weight tasks in the given cgroup has while + competing with the cgroup's child cgroups. For details, + please refer to Documentation/block/cfq-iosched.txt. + +- blkio.time + - disk time allocated to cgroup per device in milliseconds. First + two fields specify the major and minor number of the device and + third field specifies the disk time allocated to group in + milliseconds. + +- blkio.sectors + - number of sectors transferred to/from disk by the group. First + two fields specify the major and minor number of the device and + third field specifies the number of sectors transferred by the + group to/from the device. + +- blkio.io_service_bytes + - Number of bytes transferred to/from the disk by the group. These + are further divided by the type of operation - read or write, sync + or async. First two fields specify the major and minor number of the + device, third field specifies the operation type and the fourth field + specifies the number of bytes. + +- blkio.io_serviced + - Number of IOs (bio) issued to the disk by the group. 
These + are further divided by the type of operation - read or write, sync + or async. First two fields specify the major and minor number of the + device, third field specifies the operation type and the fourth field + specifies the number of IOs. + +- blkio.io_service_time + - Total amount of time between request dispatch and request completion + for the IOs done by this cgroup. This is in nanoseconds to make it + meaningful for flash devices too. For devices with queue depth of 1, + this time represents the actual service time. When queue_depth > 1, + that is no longer true as requests may be served out of order. This + may cause the service time for a given IO to include the service time + of multiple IOs when served out of order which may result in total + io_service_time > actual time elapsed. This time is further divided by + the type of operation - read or write, sync or async. First two fields + specify the major and minor number of the device, third field + specifies the operation type and the fourth field specifies the + io_service_time in ns. + +- blkio.io_wait_time + - Total amount of time the IOs for this cgroup spent waiting in the + scheduler queues for service. This can be greater than the total time + elapsed since it is cumulative io_wait_time for all IOs. It is not a + measure of total time the cgroup spent waiting but rather a measure of + the wait_time for its individual IOs. For devices with queue_depth > 1 + this metric does not include the time spent waiting for service once + the IO is dispatched to the device but till it actually gets serviced + (there might be a time lag here due to re-ordering of requests by the + device). This is in nanoseconds to make it meaningful for flash + devices too. This time is further divided by the type of operation - + read or write, sync or async. 
First two fields specify the major and + minor number of the device, third field specifies the operation type + and the fourth field specifies the io_wait_time in ns. + +- blkio.io_merged + - Total number of bios/requests merged into requests belonging to this + cgroup. This is further divided by the type of operation - read or + write, sync or async. + +- blkio.io_queued + - Total number of requests queued up at any given instant for this + cgroup. This is further divided by the type of operation - read or + write, sync or async. + +- blkio.avg_queue_size + - Debugging aid only enabled if CONFIG_DEBUG_BLK_CGROUP=y. + The average queue size for this cgroup over the entire time of this + cgroup's existence. Queue size samples are taken each time one of the + queues of this cgroup gets a timeslice. + +- blkio.group_wait_time + - Debugging aid only enabled if CONFIG_DEBUG_BLK_CGROUP=y. + This is the amount of time the cgroup had to wait since it became busy + (i.e., went from 0 to 1 request queued) to get a timeslice for one of + its queues. This is different from the io_wait_time which is the + cumulative total of the amount of time spent by each IO in that cgroup + waiting in the scheduler queue. This is in nanoseconds. If this is + read when the cgroup is in a waiting (for timeslice) state, the stat + will only report the group_wait_time accumulated till the last time it + got a timeslice and will not include the current delta. + +- blkio.empty_time + - Debugging aid only enabled if CONFIG_DEBUG_BLK_CGROUP=y. + This is the amount of time a cgroup spends without any pending + requests when not being served, i.e., it does not include any time + spent idling for one of the queues of the cgroup. This is in + nanoseconds. If this is read when the cgroup is in an empty state, + the stat will only report the empty_time accumulated till the last + time it had a pending request and will not include the current delta. 
+ +- blkio.idle_time + - Debugging aid only enabled if CONFIG_DEBUG_BLK_CGROUP=y. + This is the amount of time spent by the IO scheduler idling for a + given cgroup in anticipation of a better request than the existing ones + from other queues/cgroups. This is in nanoseconds. If this is read + when the cgroup is in an idling state, the stat will only report the + idle_time accumulated till the last idle period and will not include + the current delta. + +- blkio.dequeue + - Debugging aid only enabled if CONFIG_DEBUG_BLK_CGROUP=y. This + gives the statistics about how many times a group was dequeued + from the service tree of the device. First two fields specify the major + and minor number of the device and third field specifies the number + of times a group was dequeued from a particular device. + +- blkio.*_recursive + - Recursive version of various stats. These files show the + same information as their non-recursive counterparts but + include stats from all the descendant cgroups. + +Throttling/Upper limit policy files +----------------------------------- +- blkio.throttle.read_bps_device + - Specifies upper limit on READ rate from the device. IO rate is + specified in bytes per second. Rules are per device. Following is + the format:: + + echo "<major>:<minor> <rate_bytes_per_second>" > /cgrp/blkio.throttle.read_bps_device + +- blkio.throttle.write_bps_device + - Specifies upper limit on WRITE rate to the device. IO rate is + specified in bytes per second. Rules are per device. Following is + the format:: + + echo "<major>:<minor> <rate_bytes_per_second>" > /cgrp/blkio.throttle.write_bps_device + +- blkio.throttle.read_iops_device + - Specifies upper limit on READ rate from the device. IO rate is + specified in IO per second. Rules are per device. Following is + the format:: + + echo "<major>:<minor> <rate_io_per_second>" > /cgrp/blkio.throttle.read_iops_device + +- blkio.throttle.write_iops_device + - Specifies upper limit on WRITE rate to the device. IO rate is + specified in IO per second. Rules are per device.
Following is + the format:: + + echo "<major>:<minor> <rate_io_per_second>" > /cgrp/blkio.throttle.write_iops_device + +Note: If both BW and IOPS rules are specified for a device, then IO is + subjected to both the constraints. + +- blkio.throttle.io_serviced + - Number of IOs (bio) issued to the disk by the group. These + are further divided by the type of operation - read or write, sync + or async. First two fields specify the major and minor number of the + device, third field specifies the operation type and the fourth field + specifies the number of IOs. + +- blkio.throttle.io_service_bytes + - Number of bytes transferred to/from the disk by the group. These + are further divided by the type of operation - read or write, sync + or async. First two fields specify the major and minor number of the + device, third field specifies the operation type and the fourth field + specifies the number of bytes. + +Common files among various policies +----------------------------------- +- blkio.reset_stats + - Writing an int to this file will result in resetting all the stats + for that cgroup. + +CFQ sysfs tunable +================= +/sys/block/<disk>/queue/iosched/slice_idle +------------------------------------------ +On faster hardware CFQ can be slow, especially with sequential workload. +This happens because CFQ idles on a single queue and a single queue might not +drive deeper request queue depths to keep the storage busy. In such scenarios +one can try setting slice_idle=0 and that would switch CFQ to IOPS +(IO operations per second) mode on NCQ supporting hardware. + +That means CFQ will not idle between cfq queues of a cfq group and hence be +able to drive higher queue depth and achieve better throughput. That also +means that cfq provides fairness among groups in terms of IOPS and not in +terms of disk time.
+ +/sys/block/<disk>/queue/iosched/group_idle +------------------------------------------ +If one disables idling on individual cfq queues and cfq service trees by +setting slice_idle=0, group_idle kicks in. That means CFQ will still idle +on the group in an attempt to provide fairness among groups. + +By default group_idle is the same as slice_idle and does not do anything if +slice_idle is enabled. + +One can experience an overall throughput drop if one has created multiple +groups and put applications in those groups which are not driving enough +IO to keep disk busy. In that case set group_idle=0, and CFQ will not idle +on individual groups and throughput should improve. diff --git a/Documentation/cgroup-v1/blkio-controller.txt b/Documentation/cgroup-v1/blkio-controller.txt deleted file mode 100644 index 673dc34d3f78..000000000000 --- a/Documentation/cgroup-v1/blkio-controller.txt +++ /dev/null @@ -1,375 +0,0 @@ - Block IO Controller - =================== -Overview -======== -cgroup subsys "blkio" implements the block io controller. There seems to be -a need of various kinds of IO control policies (like proportional BW, max BW) -both at leaf nodes as well as at intermediate nodes in a storage hierarchy. -Plan is to use the same cgroup based management interface for blkio controller -and based on user options switch IO policies in the background. - -Currently two IO control policies are implemented. First one is proportional -weight time based division of disk policy. It is implemented in CFQ. Hence -this policy takes effect only on leaf nodes when CFQ is being used. The second -one is throttling policy which can be used to specify upper IO rate limits -on devices. This policy is implemented in generic block layer and can be -used on leaf nodes as well as higher level logical devices like device mapper.
- -HOWTO -===== -Proportional Weight division of bandwidth ------------------------------------------ -You can do a very simple testing of running two dd threads in two different -cgroups. Here is what you can do. - -- Enable Block IO controller - CONFIG_BLK_CGROUP=y - -- Enable group scheduling in CFQ - CONFIG_CFQ_GROUP_IOSCHED=y - -- Compile and boot into kernel and mount IO controller (blkio); see - cgroups.txt, Why are cgroups needed?. - - mount -t tmpfs cgroup_root /sys/fs/cgroup - mkdir /sys/fs/cgroup/blkio - mount -t cgroup -o blkio none /sys/fs/cgroup/blkio - -- Create two cgroups - mkdir -p /sys/fs/cgroup/blkio/test1/ /sys/fs/cgroup/blkio/test2 - -- Set weights of group test1 and test2 - echo 1000 > /sys/fs/cgroup/blkio/test1/blkio.weight - echo 500 > /sys/fs/cgroup/blkio/test2/blkio.weight - -- Create two same size files (say 512MB each) on same disk (file1, file2) and - launch two dd threads in different cgroup to read those files. - - sync - echo 3 > /proc/sys/vm/drop_caches - - dd if=/mnt/sdb/zerofile1 of=/dev/null & - echo $! > /sys/fs/cgroup/blkio/test1/tasks - cat /sys/fs/cgroup/blkio/test1/tasks - - dd if=/mnt/sdb/zerofile2 of=/dev/null & - echo $! > /sys/fs/cgroup/blkio/test2/tasks - cat /sys/fs/cgroup/blkio/test2/tasks - -- At macro level, first dd should finish first. To get more precise data, keep - on looking at (with the help of script), at blkio.disk_time and - blkio.disk_sectors files of both test1 and test2 groups. This will tell how - much disk time (in milliseconds), each group got and how many sectors each - group dispatched to the disk. We provide fairness in terms of disk time, so - ideally io.disk_time of cgroups should be in proportion to the weight. - -Throttling/Upper Limit policy ------------------------------ -- Enable Block IO controller - CONFIG_BLK_CGROUP=y - -- Enable throttling in block layer - CONFIG_BLK_DEV_THROTTLING=y - -- Mount blkio controller (see cgroups.txt, Why are cgroups needed?) 
- mount -t cgroup -o blkio none /sys/fs/cgroup/blkio - -- Specify a bandwidth rate on particular device for root group. The format - for policy is ": ". - - echo "8:16 1048576" > /sys/fs/cgroup/blkio/blkio.throttle.read_bps_device - - Above will put a limit of 1MB/second on reads happening for root group - on device having major/minor number 8:16. - -- Run dd to read a file and see if rate is throttled to 1MB/s or not. - - # dd iflag=direct if=/mnt/common/zerofile of=/dev/null bs=4K count=1024 - 1024+0 records in - 1024+0 records out - 4194304 bytes (4.2 MB) copied, 4.0001 s, 1.0 MB/s - - Limits for writes can be put using blkio.throttle.write_bps_device file. - -Hierarchical Cgroups -==================== - -Both CFQ and throttling implement hierarchy support; however, -throttling's hierarchy support is enabled iff "sane_behavior" is -enabled from cgroup side, which currently is a development option and -not publicly available. - -If somebody created a hierarchy like as follows. - - root - / \ - test1 test2 - | - test3 - -CFQ by default and throttling with "sane_behavior" will handle the -hierarchy correctly. For details on CFQ hierarchy support, refer to -Documentation/block/cfq-iosched.txt. For throttling, all limits apply -to the whole subtree while all statistics are local to the IOs -directly generated by tasks in that cgroup. - -Throttling without "sane_behavior" enabled from cgroup side will -practically treat all groups at same level as if it looks like the -following. - - pivot - / / \ \ - root test1 test2 test3 - -Various user visible config options -=================================== -CONFIG_BLK_CGROUP - - Block IO controller. - -CONFIG_DEBUG_BLK_CGROUP - - Debug help. Right now some additional stats file show up in cgroup - if this option is enabled. - -CONFIG_CFQ_GROUP_IOSCHED - - Enables group scheduling in CFQ. Currently only 1 level of group - creation is allowed. 
- -CONFIG_BLK_DEV_THROTTLING - - Enable block device throttling support in block layer. - -Details of cgroup files -======================= -Proportional weight policy files --------------------------------- -- blkio.weight - - Specifies per cgroup weight. This is default weight of the group - on all the devices until and unless overridden by per device rule. - (See blkio.weight_device). - Currently allowed range of weights is from 10 to 1000. - -- blkio.weight_device - - One can specify per cgroup per device rules using this interface. - These rules override the default value of group weight as specified - by blkio.weight. - - Following is the format. - - # echo dev_maj:dev_minor weight > blkio.weight_device - Configure weight=300 on /dev/sdb (8:16) in this cgroup - # echo 8:16 300 > blkio.weight_device - # cat blkio.weight_device - dev weight - 8:16 300 - - Configure weight=500 on /dev/sda (8:0) in this cgroup - # echo 8:0 500 > blkio.weight_device - # cat blkio.weight_device - dev weight - 8:0 500 - 8:16 300 - - Remove specific weight for /dev/sda in this cgroup - # echo 8:0 0 > blkio.weight_device - # cat blkio.weight_device - dev weight - 8:16 300 - -- blkio.leaf_weight[_device] - - Equivalents of blkio.weight[_device] for the purpose of - deciding how much weight tasks in the given cgroup has while - competing with the cgroup's child cgroups. For details, - please refer to Documentation/block/cfq-iosched.txt. - -- blkio.time - - disk time allocated to cgroup per device in milliseconds. First - two fields specify the major and minor number of the device and - third field specifies the disk time allocated to group in - milliseconds. - -- blkio.sectors - - number of sectors transferred to/from disk by the group. First - two fields specify the major and minor number of the device and - third field specifies the number of sectors transferred by the - group to/from the device. 
- -- blkio.io_service_bytes - - Number of bytes transferred to/from the disk by the group. These - are further divided by the type of operation - read or write, sync - or async. First two fields specify the major and minor number of the - device, third field specifies the operation type and the fourth field - specifies the number of bytes. - -- blkio.io_serviced - - Number of IOs (bio) issued to the disk by the group. These - are further divided by the type of operation - read or write, sync - or async. First two fields specify the major and minor number of the - device, third field specifies the operation type and the fourth field - specifies the number of IOs. - -- blkio.io_service_time - - Total amount of time between request dispatch and request completion - for the IOs done by this cgroup. This is in nanoseconds to make it - meaningful for flash devices too. For devices with queue depth of 1, - this time represents the actual service time. When queue_depth > 1, - that is no longer true as requests may be served out of order. This - may cause the service time for a given IO to include the service time - of multiple IOs when served out of order which may result in total - io_service_time > actual time elapsed. This time is further divided by - the type of operation - read or write, sync or async. First two fields - specify the major and minor number of the device, third field - specifies the operation type and the fourth field specifies the - io_service_time in ns. - -- blkio.io_wait_time - - Total amount of time the IOs for this cgroup spent waiting in the - scheduler queues for service. This can be greater than the total time - elapsed since it is cumulative io_wait_time for all IOs. It is not a - measure of total time the cgroup spent waiting but rather a measure of - the wait_time for its individual IOs. 
For devices with queue_depth > 1 - this metric does not include the time spent waiting for service once - the IO is dispatched to the device but till it actually gets serviced - (there might be a time lag here due to re-ordering of requests by the - device). This is in nanoseconds to make it meaningful for flash - devices too. This time is further divided by the type of operation - - read or write, sync or async. First two fields specify the major and - minor number of the device, third field specifies the operation type - and the fourth field specifies the io_wait_time in ns. - -- blkio.io_merged - - Total number of bios/requests merged into requests belonging to this - cgroup. This is further divided by the type of operation - read or - write, sync or async. - -- blkio.io_queued - - Total number of requests queued up at any given instant for this - cgroup. This is further divided by the type of operation - read or - write, sync or async. - -- blkio.avg_queue_size - - Debugging aid only enabled if CONFIG_DEBUG_BLK_CGROUP=y. - The average queue size for this cgroup over the entire time of this - cgroup's existence. Queue size samples are taken each time one of the - queues of this cgroup gets a timeslice. - -- blkio.group_wait_time - - Debugging aid only enabled if CONFIG_DEBUG_BLK_CGROUP=y. - This is the amount of time the cgroup had to wait since it became busy - (i.e., went from 0 to 1 request queued) to get a timeslice for one of - its queues. This is different from the io_wait_time which is the - cumulative total of the amount of time spent by each IO in that cgroup - waiting in the scheduler queue. This is in nanoseconds. If this is - read when the cgroup is in a waiting (for timeslice) state, the stat - will only report the group_wait_time accumulated till the last time it - got a timeslice and will not include the current delta. - -- blkio.empty_time - - Debugging aid only enabled if CONFIG_DEBUG_BLK_CGROUP=y. 
- This is the amount of time a cgroup spends without any pending - requests when not being served, i.e., it does not include any time - spent idling for one of the queues of the cgroup. This is in - nanoseconds. If this is read when the cgroup is in an empty state, - the stat will only report the empty_time accumulated till the last - time it had a pending request and will not include the current delta. - -- blkio.idle_time - - Debugging aid only enabled if CONFIG_DEBUG_BLK_CGROUP=y. - This is the amount of time spent by the IO scheduler idling for a - given cgroup in anticipation of a better request than the existing ones - from other queues/cgroups. This is in nanoseconds. If this is read - when the cgroup is in an idling state, the stat will only report the - idle_time accumulated till the last idle period and will not include - the current delta. - -- blkio.dequeue - - Debugging aid only enabled if CONFIG_DEBUG_BLK_CGROUP=y. This - gives the statistics about how many a times a group was dequeued - from service tree of the device. First two fields specify the major - and minor number of the device and third field specifies the number - of times a group was dequeued from a particular device. - -- blkio.*_recursive - - Recursive version of various stats. These files show the - same information as their non-recursive counterparts but - include stats from all the descendant cgroups. - -Throttling/Upper limit policy files ------------------------------------ -- blkio.throttle.read_bps_device - - Specifies upper limit on READ rate from the device. IO rate is - specified in bytes per second. Rules are per device. Following is - the format. - - echo ": " > /cgrp/blkio.throttle.read_bps_device - -- blkio.throttle.write_bps_device - - Specifies upper limit on WRITE rate to the device. IO rate is - specified in bytes per second. Rules are per device. Following is - the format. 
- - echo ": " > /cgrp/blkio.throttle.write_bps_device - -- blkio.throttle.read_iops_device - - Specifies upper limit on READ rate from the device. IO rate is - specified in IO per second. Rules are per device. Following is - the format. - - echo ": " > /cgrp/blkio.throttle.read_iops_device - -- blkio.throttle.write_iops_device - - Specifies upper limit on WRITE rate to the device. IO rate is - specified in io per second. Rules are per device. Following is - the format. - - echo ": " > /cgrp/blkio.throttle.write_iops_device - -Note: If both BW and IOPS rules are specified for a device, then IO is - subjected to both the constraints. - -- blkio.throttle.io_serviced - - Number of IOs (bio) issued to the disk by the group. These - are further divided by the type of operation - read or write, sync - or async. First two fields specify the major and minor number of the - device, third field specifies the operation type and the fourth field - specifies the number of IOs. - -- blkio.throttle.io_service_bytes - - Number of bytes transferred to/from the disk by the group. These - are further divided by the type of operation - read or write, sync - or async. First two fields specify the major and minor number of the - device, third field specifies the operation type and the fourth field - specifies the number of bytes. - -Common files among various policies ------------------------------------ -- blkio.reset_stats - - Writing an int to this file will result in resetting all the stats - for that cgroup. - -CFQ sysfs tunable -================= -/sys/block//queue/iosched/slice_idle ------------------------------------------- -On a faster hardware CFQ can be slow, especially with sequential workload. -This happens because CFQ idles on a single queue and single queue might not -drive deeper request queue depths to keep the storage busy. In such scenarios -one can try setting slice_idle=0 and that would switch CFQ to IOPS -(IO operations per second) mode on NCQ supporting hardware. 
- -That means CFQ will not idle between cfq queues of a cfq group and hence be -able to driver higher queue depth and achieve better throughput. That also -means that cfq provides fairness among groups in terms of IOPS and not in -terms of disk time. - -/sys/block//queue/iosched/group_idle ------------------------------------------- -If one disables idling on individual cfq queues and cfq service trees by -setting slice_idle=0, group_idle kicks in. That means CFQ will still idle -on the group in an attempt to provide fairness among groups. - -By default group_idle is same as slice_idle and does not do anything if -slice_idle is enabled. - -One can experience an overall throughput drop if you have created multiple -groups and put applications in that group which are not driving enough -IO to keep disk busy. In that case set group_idle=0, and CFQ will not idle -on individual groups and throughput should improve. diff --git a/Documentation/cgroup-v1/cgroups.rst b/Documentation/cgroup-v1/cgroups.rst new file mode 100644 index 000000000000..46bbe7e022d4 --- /dev/null +++ b/Documentation/cgroup-v1/cgroups.rst @@ -0,0 +1,695 @@ +============== +Control Groups +============== + +Written by Paul Menage based on +Documentation/cgroup-v1/cpusets.rst + +Original copyright statements from cpusets.txt: + +Portions Copyright (C) 2004 BULL SA. + +Portions Copyright (c) 2004-2006 Silicon Graphics, Inc. + +Modified by Paul Jackson + +Modified by Christoph Lameter + +.. CONTENTS: + + 1. Control Groups + 1.1 What are cgroups ? + 1.2 Why are cgroups needed ? + 1.3 How are cgroups implemented ? + 1.4 What does notify_on_release do ? + 1.5 What does clone_children do ? + 1.6 How do I use cgroups ? + 2. Usage Examples and Syntax + 2.1 Basic Usage + 2.2 Attaching processes + 2.3 Mounting hierarchies by name + 3. Kernel API + 3.1 Overview + 3.2 Synchronization + 3.3 Subsystem API + 4. Extended attributes usage + 5. Questions + +1. Control Groups +================= + +1.1 What are cgroups ? 
+---------------------- + +Control Groups provide a mechanism for aggregating/partitioning sets of +tasks, and all their future children, into hierarchical groups with +specialized behaviour. + +Definitions: + +A *cgroup* associates a set of tasks with a set of parameters for one +or more subsystems. + +A *subsystem* is a module that makes use of the task grouping +facilities provided by cgroups to treat groups of tasks in +particular ways. A subsystem is typically a "resource controller" that +schedules a resource or applies per-cgroup limits, but it may be +anything that wants to act on a group of processes, e.g. a +virtualization subsystem. + +A *hierarchy* is a set of cgroups arranged in a tree, such that +every task in the system is in exactly one of the cgroups in the +hierarchy, and a set of subsystems; each subsystem has system-specific +state attached to each cgroup in the hierarchy. Each hierarchy has +an instance of the cgroup virtual filesystem associated with it. + +At any one time there may be multiple active hierarchies of task +cgroups. Each hierarchy is a partition of all tasks in the system. + +User-level code may create and destroy cgroups by name in an +instance of the cgroup virtual file system, specify and query to +which cgroup a task is assigned, and list the task PIDs assigned to +a cgroup. Those creations and assignments only affect the hierarchy +associated with that instance of the cgroup file system. + +On their own, the only use for cgroups is for simple job +tracking. The intention is that other subsystems hook into the generic +cgroup support to provide new attributes for cgroups, such as +accounting/limiting the resources which processes in a cgroup can +access. For example, cpusets (see Documentation/cgroup-v1/cpusets.rst) allow +you to associate a set of CPUs and a set of memory nodes with the +tasks in each cgroup. + +1.2 Why are cgroups needed ? 
+---------------------------- + +There are multiple efforts to provide process aggregations in the +Linux kernel, mainly for resource-tracking purposes. Such efforts +include cpusets, CKRM/ResGroups, UserBeanCounters, and virtual server +namespaces. These all require the basic notion of a +grouping/partitioning of processes, with newly forked processes ending +up in the same group (cgroup) as their parent process. + +The kernel cgroup patch provides the minimum essential kernel +mechanisms required to efficiently implement such groups. It has +minimal impact on the system fast paths, and provides hooks for +specific subsystems such as cpusets to provide additional behaviour as +desired. + +Multiple hierarchy support is provided to allow for situations where +the division of tasks into cgroups is distinctly different for +different subsystems - having parallel hierarchies allows each +hierarchy to be a natural division of tasks, without having to handle +complex combinations of tasks that would be present if several +unrelated subsystems needed to be forced into the same tree of +cgroups. + +At one extreme, each resource controller or subsystem could be in a +separate hierarchy; at the other extreme, all subsystems +would be attached to the same hierarchy. + +As an example of a scenario (originally proposed by vatsa@in.ibm.com) +that can benefit from multiple hierarchies, consider a large +university server with various users - students, professors, system +tasks etc. 
The resource planning for this server could be along the +following lines:: + + CPU : "Top cpuset" + / \ + CPUSet1 CPUSet2 + | | + (Professors) (Students) + + In addition (system tasks) are attached to topcpuset (so + that they can run anywhere) with a limit of 20% + + Memory : Professors (50%), Students (30%), system (20%) + + Disk : Professors (50%), Students (30%), system (20%) + + Network : WWW browsing (20%), Network File System (60%), others (20%) + / \ + Professors (15%) students (5%) + +Browsers like Firefox/Lynx go into the WWW network class, while (k)nfsd goes +into the NFS network class. + +At the same time Firefox/Lynx will share an appropriate CPU/Memory class +depending on who launched it (prof/student). + +With the ability to classify tasks differently for different resources +(by putting those resource subsystems in different hierarchies), +the admin can easily set up a script which receives exec notifications +and depending on who is launching the browser he can:: + + # echo browser_pid > /sys/fs/cgroup/<restype>/<userclass>/tasks + +With only a single hierarchy, he now would potentially have to create +a separate cgroup for every browser launched and associate it with +the appropriate network and other resource classes. This may lead to +proliferation of such cgroups. + +Also let's say that the administrator would like to give enhanced network +access temporarily to a student's browser (since it is night and the user +wants to do online gaming :)) OR give one of the student's simulation +apps enhanced CPU power. + +With the ability to write PIDs directly to resource classes, it's just a +matter of:: + + # echo pid > /sys/fs/cgroup/network/<new_class>/tasks + (after some time) + # echo pid > /sys/fs/cgroup/network/<orig_class>/tasks + +Without this ability, the administrator would have to split the cgroup into +multiple separate ones and then associate the new cgroups with the +new resource classes. + + + +1.3 How are cgroups implemented ?
+--------------------------------- + +Control Groups extends the kernel as follows: + + - Each task in the system has a reference-counted pointer to a + css_set. + + - A css_set contains a set of reference-counted pointers to + cgroup_subsys_state objects, one for each cgroup subsystem + registered in the system. There is no direct link from a task to + the cgroup of which it's a member in each hierarchy, but this + can be determined by following pointers through the + cgroup_subsys_state objects. This is because accessing the + subsystem state is something that's expected to happen frequently + and in performance-critical code, whereas operations that require a + task's actual cgroup assignments (in particular, moving between + cgroups) are less common. A linked list runs through the cg_list + field of each task_struct using the css_set, anchored at + css_set->tasks. + + - A cgroup hierarchy filesystem can be mounted for browsing and + manipulation from user space. + + - You can list all the tasks (by PID) attached to any cgroup. + +The implementation of cgroups requires a few, simple hooks +into the rest of the kernel, none in performance-critical paths: + + - in init/main.c, to initialize the root cgroups and initial + css_set at system boot. + + - in fork and exit, to attach and detach a task from its css_set. + +In addition, a new file system of type "cgroup" may be mounted, to +enable browsing and modifying the cgroups presently known to the +kernel. When mounting a cgroup hierarchy, you may specify a +comma-separated list of subsystems to mount as the filesystem mount +options. By default, mounting the cgroup filesystem attempts to +mount a hierarchy containing all registered subsystems. + +If an active hierarchy with exactly the same set of subsystems already +exists, it will be reused for the new mount. If no existing hierarchy +matches, and any of the requested subsystems are in use in an existing +hierarchy, the mount will fail with -EBUSY. 
Otherwise, a new hierarchy +is activated, associated with the requested subsystems. + +It's not currently possible to bind a new subsystem to an active +cgroup hierarchy, or to unbind a subsystem from an active cgroup +hierarchy. This may be possible in future, but is fraught with nasty +error-recovery issues. + +When a cgroup filesystem is unmounted, if there are any +child cgroups created below the top-level cgroup, that hierarchy +will remain active even though unmounted; if there are no +child cgroups then the hierarchy will be deactivated. + +No new system calls are added for cgroups - all support for +querying and modifying cgroups is via this cgroup file system. + +Each task under /proc has an added file named 'cgroup' displaying, +for each active hierarchy, the subsystem names and the cgroup name +as the path relative to the root of the cgroup file system. + +Each cgroup is represented by a directory in the cgroup file system +containing the following files describing that cgroup: + + - tasks: list of tasks (by PID) attached to that cgroup. This list + is not guaranteed to be sorted. Writing a thread ID into this file + moves the thread into this cgroup. + - cgroup.procs: list of thread group IDs in the cgroup. This list is + not guaranteed to be sorted or free of duplicate TGIDs, and userspace + should sort/uniquify the list if this property is required. + Writing a thread group ID into this file moves all threads in that + group into this cgroup. + - notify_on_release flag: run the release agent on exit? + - release_agent: the path to use for release notifications (this file + exists in the top cgroup only) + +Other subsystems such as cpusets may add additional files in each +cgroup dir. + +New cgroups are created using the mkdir system call or shell +command. The properties of a cgroup, such as its flags, are +modified by writing to the appropriate file in that cgroups +directory, as listed above. 
+ +The named hierarchical structure of nested cgroups allows partitioning +a large system into nested, dynamically changeable, "soft-partitions". + +The attachment of each task, automatically inherited at fork by any +children of that task, to a cgroup allows organizing the work load +on a system into related sets of tasks. A task may be re-attached to +any other cgroup, if allowed by the permissions on the necessary +cgroup file system directories. + +When a task is moved from one cgroup to another, it gets a new +css_set pointer - if there's an already existing css_set with the +desired collection of cgroups then that group is reused, otherwise a new +css_set is allocated. The appropriate existing css_set is located by +looking into a hash table. + +To allow access from a cgroup to the css_sets (and hence tasks) +that comprise it, a set of cg_cgroup_link objects form a lattice; +each cg_cgroup_link is linked into a list of cg_cgroup_links for +a single cgroup on its cgrp_link_list field, and a list of +cg_cgroup_links for a single css_set on its cg_link_list. + +Thus the set of tasks in a cgroup can be listed by iterating over +each css_set that references the cgroup, and sub-iterating over +each css_set's task set. + +The use of a Linux virtual file system (vfs) to represent the +cgroup hierarchy provides for a familiar permission and name space +for cgroups, with a minimum of additional kernel code. + +1.4 What does notify_on_release do ? +------------------------------------ + +If the notify_on_release flag is enabled (1) in a cgroup, then +whenever the last task in the cgroup leaves (exits or attaches to +some other cgroup) and the last child cgroup of that cgroup +is removed, then the kernel runs the command specified by the contents +of the "release_agent" file in that hierarchy's root directory, +supplying the pathname (relative to the mount point of the cgroup +file system) of the abandoned cgroup. This enables automatic +removal of abandoned cgroups. 
The default value of +notify_on_release in the root cgroup at system boot is disabled +(0). The default value of other cgroups at creation is the current +value of their parents' notify_on_release settings. The default value of +a cgroup hierarchy's release_agent path is empty. + +1.5 What does clone_children do ? +--------------------------------- + +This flag only affects the cpuset controller. If the clone_children +flag is enabled (1) in a cgroup, a new cpuset cgroup will copy its +configuration from the parent during initialization. + +1.6 How do I use cgroups ? +-------------------------- + +To start a new job that is to be contained within a cgroup, using +the "cpuset" cgroup subsystem, the steps are something like:: + + 1) mount -t tmpfs cgroup_root /sys/fs/cgroup + 2) mkdir /sys/fs/cgroup/cpuset + 3) mount -t cgroup -ocpuset cpuset /sys/fs/cgroup/cpuset + 4) Create the new cgroup by doing mkdir's and write's (or echo's) in + the /sys/fs/cgroup/cpuset virtual file system. + 5) Start a task that will be the "founding father" of the new job. + 6) Attach that task to the new cgroup by writing its PID to the + /sys/fs/cgroup/cpuset tasks file for that cgroup. + 7) fork, exec or clone the job tasks from this founding father task. + +For example, the following sequence of commands will setup a cgroup +named "Charlie", containing just CPUs 2 and 3, and Memory Node 1, +and then start a subshell 'sh' in that cgroup:: + + mount -t tmpfs cgroup_root /sys/fs/cgroup + mkdir /sys/fs/cgroup/cpuset + mount -t cgroup cpuset -ocpuset /sys/fs/cgroup/cpuset + cd /sys/fs/cgroup/cpuset + mkdir Charlie + cd Charlie + /bin/echo 2-3 > cpuset.cpus + /bin/echo 1 > cpuset.mems + /bin/echo $$ > tasks + sh + # The subshell 'sh' is now running in cgroup Charlie + # The next line should display '/Charlie' + cat /proc/self/cgroup + +2. 
Usage Examples and Syntax +============================ + +2.1 Basic Usage +--------------- + +Creating, modifying, using cgroups can be done through the cgroup +virtual filesystem. + +To mount a cgroup hierarchy with all available subsystems, type:: + + # mount -t cgroup xxx /sys/fs/cgroup + +The "xxx" is not interpreted by the cgroup code, but will appear in +/proc/mounts so may be any useful identifying string that you like. + +Note: Some subsystems do not work without some user input first. For instance, +if cpusets are enabled the user will have to populate the cpus and mems files +for each new cgroup created before that group can be used. + +As explained in section `1.2 Why are cgroups needed?` you should create +different hierarchies of cgroups for each single resource or group of +resources you want to control. Therefore, you should mount a tmpfs on +/sys/fs/cgroup and create directories for each cgroup resource or resource +group:: + + # mount -t tmpfs cgroup_root /sys/fs/cgroup + # mkdir /sys/fs/cgroup/rg1 + +To mount a cgroup hierarchy with just the cpuset and memory +subsystems, type:: + + # mount -t cgroup -o cpuset,memory hier1 /sys/fs/cgroup/rg1 + +While remounting cgroups is currently supported, it is not recommended +to use it. Remounting allows changing bound subsystems and +release_agent. Rebinding is hardly useful as it only works when the +hierarchy is empty and release_agent itself should be replaced with +conventional fsnotify. The support for remounting will be removed in +the future. + +To specify a hierarchy's release_agent:: + + # mount -t cgroup -o cpuset,release_agent="/sbin/cpuset_release_agent" \ + xxx /sys/fs/cgroup/rg1 + +Note that specifying 'release_agent' more than once will return failure. + +Note that changing the set of subsystems is currently only supported +when the hierarchy consists of a single (root) cgroup. 
Supporting +the ability to arbitrarily bind/unbind subsystems from an existing +cgroup hierarchy is intended to be implemented in the future. + +Then under /sys/fs/cgroup/rg1 you can find a tree that corresponds to the +tree of the cgroups in the system. For instance, /sys/fs/cgroup/rg1 +is the cgroup that holds the whole system. + +If you want to change the value of release_agent:: + + # echo "/sbin/new_release_agent" > /sys/fs/cgroup/rg1/release_agent + +It can also be changed via remount. + +If you want to create a new cgroup under /sys/fs/cgroup/rg1:: + + # cd /sys/fs/cgroup/rg1 + # mkdir my_cgroup + +Now you want to do something with this cgroup: + + # cd my_cgroup + +In this directory you can find several files:: + + # ls + cgroup.procs notify_on_release tasks + (plus whatever files added by the attached subsystems) + +Now attach your shell to this cgroup:: + + # /bin/echo $$ > tasks + +You can also create cgroups inside your cgroup by using mkdir in this +directory:: + + # mkdir my_sub_cs + +To remove a cgroup, just use rmdir:: + + # rmdir my_sub_cs + +This will fail if the cgroup is in use (has cgroups inside, or +has processes attached, or is held alive by other subsystem-specific +reference). + +2.2 Attaching processes +----------------------- + +:: + + # /bin/echo PID > tasks + +Note that it is PID, not PIDs. You can only attach ONE task at a time. +If you have several tasks to attach, you have to do it one after another:: + + # /bin/echo PID1 > tasks + # /bin/echo PID2 > tasks + ... + # /bin/echo PIDn > tasks + +You can attach the current shell task by echoing 0:: + + # echo 0 > tasks + +You can use the cgroup.procs file instead of the tasks file to move all +threads in a threadgroup at once. Echoing the PID of any task in a +threadgroup to cgroup.procs causes all tasks in that threadgroup to be +attached to the cgroup. Writing 0 to cgroup.procs moves all tasks +in the writing task's threadgroup. 
+ +Note: Since every task is always a member of exactly one cgroup in each +mounted hierarchy, to remove a task from its current cgroup you must +move it into a new cgroup (possibly the root cgroup) by writing to the +new cgroup's tasks file. + +Note: Due to some restrictions enforced by some cgroup subsystems, moving +a process to another cgroup can fail. + +2.3 Mounting hierarchies by name +-------------------------------- + +Passing the name=<x> option when mounting a cgroups hierarchy +associates the given name with the hierarchy. This can be used when +mounting a pre-existing hierarchy, in order to refer to it by name +rather than by its set of active subsystems. Each hierarchy is either +nameless, or has a unique name. + +The name should match [\w.-]+ + +When passing a name=<x> option for a new hierarchy, you need to +specify subsystems manually; the legacy behaviour of mounting all +subsystems when none are explicitly specified is not supported when +you give a subsystem a name. + +The name of the subsystem appears as part of the hierarchy description +in /proc/mounts and /proc/<pid>/cgroups. + + +3. Kernel API +============= + +3.1 Overview +------------ + +Each kernel subsystem that wants to hook into the generic cgroup +system needs to create a cgroup_subsys object. This contains +various methods, which are callbacks from the cgroup system, along +with a subsystem ID which will be assigned by the cgroup system. + +Other fields in the cgroup_subsys object include: + +- subsys_id: a unique array index for the subsystem, indicating which + entry in cgroup->subsys[] this subsystem should be managing. + +- name: should be initialized to a unique subsystem name. Should be + no longer than MAX_CGROUP_TYPE_NAMELEN. + +- early_init: indicate if the subsystem needs early initialization + at system boot. 
+ +Each cgroup object created by the system has an array of pointers, +indexed by subsystem ID; this pointer is entirely managed by the +subsystem; the generic cgroup code will never touch this pointer. + +3.2 Synchronization +------------------- + +There is a global mutex, cgroup_mutex, used by the cgroup +system. This should be taken by anything that wants to modify a +cgroup. It may also be taken to prevent cgroups from being +modified, but more specific locks may be more appropriate in that +situation. + +See kernel/cgroup.c for more details. + +Subsystems can take/release the cgroup_mutex via the functions +cgroup_lock()/cgroup_unlock(). + +Accessing a task's cgroup pointer may be done in the following ways: +- while holding cgroup_mutex +- while holding the task's alloc_lock (via task_lock()) +- inside an rcu_read_lock() section via rcu_dereference() + +3.3 Subsystem API +----------------- + +Each subsystem should: + +- add an entry in linux/cgroup_subsys.h +- define a cgroup_subsys object called <name>_cgrp_subsys + +Each subsystem may export the following methods. The only mandatory +methods are css_alloc/free. Any others that are null are presumed to +be successful no-ops. + +``struct cgroup_subsys_state *css_alloc(struct cgroup *cgrp)`` +(cgroup_mutex held by caller) + +Called to allocate a subsystem state object for a cgroup. The +subsystem should allocate its subsystem state object for the passed +cgroup, returning a pointer to the new object on success or an +ERR_PTR() value. On success, the subsystem pointer should point to +a structure of type cgroup_subsys_state (typically embedded in a +larger subsystem-specific object), which will be initialized by the +cgroup system. Note that this will be called at initialization to +create the root subsystem state for this subsystem; this case can be +identified by the passed cgroup object having a NULL parent (since +it's the root of the hierarchy) and may be an appropriate place for +initialization code. 
+ +``int css_online(struct cgroup *cgrp)`` +(cgroup_mutex held by caller) + +Called after @cgrp successfully completed all allocations and made +visible to cgroup_for_each_child/descendant_*() iterators. The +subsystem may choose to fail creation by returning -errno. This +callback can be used to implement reliable state sharing and +propagation along the hierarchy. See the comment on +cgroup_for_each_descendant_pre() for details. + +``void css_offline(struct cgroup *cgrp);`` +(cgroup_mutex held by caller) + +This is the counterpart of css_online() and called iff css_online() +has succeeded on @cgrp. This signifies the beginning of the end of +@cgrp. @cgrp is being removed and the subsystem should start dropping +all references it's holding on @cgrp. When all references are dropped, +cgroup removal will proceed to the next step - css_free(). After this +callback, @cgrp should be considered dead to the subsystem. + +``void css_free(struct cgroup *cgrp)`` +(cgroup_mutex held by caller) + +The cgroup system is about to free @cgrp; the subsystem should free +its subsystem state object. By the time this method is called, @cgrp +is completely unused; @cgrp->parent is still valid. (Note - can also +be called for a newly-created cgroup if an error occurs after this +subsystem's create() method has been called for the new cgroup). + +``int can_attach(struct cgroup *cgrp, struct cgroup_taskset *tset)`` +(cgroup_mutex held by caller) + +Called prior to moving one or more tasks into a cgroup; if the +subsystem returns an error, this will abort the attach operation. +@tset contains the tasks to be attached and is guaranteed to have at +least one task in it. 
+ +If there are multiple tasks in the taskset, then: + - it's guaranteed that all are from the same thread group + - @tset contains all tasks from the thread group whether or not + they're switching cgroups + - the first task is the leader + +Each @tset entry also contains the task's old cgroup and tasks which +aren't switching cgroup can be skipped easily using the +cgroup_taskset_for_each() iterator. Note that this isn't called on a +fork. If this method returns 0 (success) then this should remain valid +while the caller holds cgroup_mutex and it is ensured that either +attach() or cancel_attach() will be called in future. + +``void css_reset(struct cgroup_subsys_state *css)`` +(cgroup_mutex held by caller) + +An optional operation which should restore @css's configuration to the +initial state. This is currently only used on the unified hierarchy +when a subsystem is disabled on a cgroup through +"cgroup.subtree_control" but should remain enabled because other +subsystems depend on it. cgroup core makes such a css invisible by +removing the associated interface files and invokes this callback so +that the hidden subsystem can return to the initial neutral state. +This prevents unexpected resource control from a hidden css and +ensures that the configuration is in the initial state when it is made +visible again later. + +``void cancel_attach(struct cgroup *cgrp, struct cgroup_taskset *tset)`` +(cgroup_mutex held by caller) + +Called when a task attach operation has failed after can_attach() has succeeded. +A subsystem whose can_attach() has some side-effects should provide this +function, so that the subsystem can implement a rollback. If not, not necessary. +This will be called only about subsystems whose can_attach() operation have +succeeded. The parameters are identical to can_attach(). 
+ +``void attach(struct cgroup *cgrp, struct cgroup_taskset *tset)`` +(cgroup_mutex held by caller) + +Called after the task has been attached to the cgroup, to allow any +post-attachment activity that requires memory allocations or blocking. +The parameters are identical to can_attach(). + +``void fork(struct task_struct *task)`` + +Called when a task is forked into a cgroup. + +``void exit(struct task_struct *task)`` + +Called during task exit. + +``void free(struct task_struct *task)`` + +Called when the task_struct is freed. + +``void bind(struct cgroup *root)`` +(cgroup_mutex held by caller) + +Called when a cgroup subsystem is rebound to a different hierarchy +and root cgroup. Currently this will only involve movement between +the default hierarchy (which never has sub-cgroups) and a hierarchy +that is being created/destroyed (and hence has no sub-cgroups). + +4. Extended attribute usage +=========================== + +cgroup filesystem supports certain types of extended attributes in its +directories and files. The current supported types are: + + - Trusted (XATTR_TRUSTED) + - Security (XATTR_SECURITY) + +Both require CAP_SYS_ADMIN capability to set. + +Like in tmpfs, the extended attributes in cgroup filesystem are stored +using kernel memory and it's advised to keep the usage at minimum. This +is the reason why user defined extended attributes are not supported, since +any user can do it and there's no limit in the value size. + +The current known users for this feature are SELinux to limit cgroup usage +in containers and systemd for assorted meta data like main PID in a cgroup +(systemd creates a cgroup per service). + +5. Questions +============ + +:: + + Q: what's up with this '/bin/echo' ? + A: bash's builtin 'echo' command does not check calls to write() against + errors. If you use it in the cgroup file system, you won't be + able to tell whether a command succeeded or failed. 
+ + Q: When I attach processes, only the first of the line gets really attached ! + A: We can only return one error code per call to write(). So you should also + put only ONE PID. diff --git a/Documentation/cgroup-v1/cgroups.txt b/Documentation/cgroup-v1/cgroups.txt deleted file mode 100644 index 059f7063eea6..000000000000 --- a/Documentation/cgroup-v1/cgroups.txt +++ /dev/null @@ -1,677 +0,0 @@ - CGROUPS - ------- - -Written by Paul Menage based on -Documentation/cgroup-v1/cpusets.txt - -Original copyright statements from cpusets.txt: -Portions Copyright (C) 2004 BULL SA. -Portions Copyright (c) 2004-2006 Silicon Graphics, Inc. -Modified by Paul Jackson -Modified by Christoph Lameter - -CONTENTS: -========= - -1. Control Groups - 1.1 What are cgroups ? - 1.2 Why are cgroups needed ? - 1.3 How are cgroups implemented ? - 1.4 What does notify_on_release do ? - 1.5 What does clone_children do ? - 1.6 How do I use cgroups ? -2. Usage Examples and Syntax - 2.1 Basic Usage - 2.2 Attaching processes - 2.3 Mounting hierarchies by name -3. Kernel API - 3.1 Overview - 3.2 Synchronization - 3.3 Subsystem API -4. Extended attributes usage -5. Questions - -1. Control Groups -================= - -1.1 What are cgroups ? ----------------------- - -Control Groups provide a mechanism for aggregating/partitioning sets of -tasks, and all their future children, into hierarchical groups with -specialized behaviour. - -Definitions: - -A *cgroup* associates a set of tasks with a set of parameters for one -or more subsystems. - -A *subsystem* is a module that makes use of the task grouping -facilities provided by cgroups to treat groups of tasks in -particular ways. A subsystem is typically a "resource controller" that -schedules a resource or applies per-cgroup limits, but it may be -anything that wants to act on a group of processes, e.g. a -virtualization subsystem. 
- -A *hierarchy* is a set of cgroups arranged in a tree, such that -every task in the system is in exactly one of the cgroups in the -hierarchy, and a set of subsystems; each subsystem has system-specific -state attached to each cgroup in the hierarchy. Each hierarchy has -an instance of the cgroup virtual filesystem associated with it. - -At any one time there may be multiple active hierarchies of task -cgroups. Each hierarchy is a partition of all tasks in the system. - -User-level code may create and destroy cgroups by name in an -instance of the cgroup virtual file system, specify and query to -which cgroup a task is assigned, and list the task PIDs assigned to -a cgroup. Those creations and assignments only affect the hierarchy -associated with that instance of the cgroup file system. - -On their own, the only use for cgroups is for simple job -tracking. The intention is that other subsystems hook into the generic -cgroup support to provide new attributes for cgroups, such as -accounting/limiting the resources which processes in a cgroup can -access. For example, cpusets (see Documentation/cgroup-v1/cpusets.txt) allow -you to associate a set of CPUs and a set of memory nodes with the -tasks in each cgroup. - -1.2 Why are cgroups needed ? ----------------------------- - -There are multiple efforts to provide process aggregations in the -Linux kernel, mainly for resource-tracking purposes. Such efforts -include cpusets, CKRM/ResGroups, UserBeanCounters, and virtual server -namespaces. These all require the basic notion of a -grouping/partitioning of processes, with newly forked processes ending -up in the same group (cgroup) as their parent process. - -The kernel cgroup patch provides the minimum essential kernel -mechanisms required to efficiently implement such groups. It has -minimal impact on the system fast paths, and provides hooks for -specific subsystems such as cpusets to provide additional behaviour as -desired. 
- -Multiple hierarchy support is provided to allow for situations where -the division of tasks into cgroups is distinctly different for -different subsystems - having parallel hierarchies allows each -hierarchy to be a natural division of tasks, without having to handle -complex combinations of tasks that would be present if several -unrelated subsystems needed to be forced into the same tree of -cgroups. - -At one extreme, each resource controller or subsystem could be in a -separate hierarchy; at the other extreme, all subsystems -would be attached to the same hierarchy. - -As an example of a scenario (originally proposed by vatsa@in.ibm.com) -that can benefit from multiple hierarchies, consider a large -university server with various users - students, professors, system -tasks etc. The resource planning for this server could be along the -following lines: - - CPU : "Top cpuset" - / \ - CPUSet1 CPUSet2 - | | - (Professors) (Students) - - In addition (system tasks) are attached to topcpuset (so - that they can run anywhere) with a limit of 20% - - Memory : Professors (50%), Students (30%), system (20%) - - Disk : Professors (50%), Students (30%), system (20%) - - Network : WWW browsing (20%), Network File System (60%), others (20%) - / \ - Professors (15%) students (5%) - -Browsers like Firefox/Lynx go into the WWW network class, while (k)nfsd goes -into the NFS network class. - -At the same time Firefox/Lynx will share an appropriate CPU/Memory class -depending on who launched it (prof/student). 
- -With the ability to classify tasks differently for different resources -(by putting those resource subsystems in different hierarchies), -the admin can easily set up a script which receives exec notifications -and depending on who is launching the browser he can - - # echo browser_pid > /sys/fs/cgroup/<restype>/<userclass>/tasks - -With only a single hierarchy, he now would potentially have to create -a separate cgroup for every browser launched and associate it with -appropriate network and other resource class. This may lead to -proliferation of such cgroups. - -Also let's say that the administrator would like to give enhanced network -access temporarily to a student's browser (since it is night and the user -wants to do online gaming :)) OR give one of the student's simulation -apps enhanced CPU power. - -With ability to write PIDs directly to resource classes, it's just a -matter of: - - # echo pid > /sys/fs/cgroup/network/<new_class>/tasks - (after some time) - # echo pid > /sys/fs/cgroup/network/<orig_class>/tasks - -Without this ability, the administrator would have to split the cgroup into -multiple separate ones and then associate the new cgroups with the -new resource classes. - - - -1.3 How are cgroups implemented ? ---------------------------------- - -Control Groups extends the kernel as follows: - - - Each task in the system has a reference-counted pointer to a - css_set. - - - A css_set contains a set of reference-counted pointers to - cgroup_subsys_state objects, one for each cgroup subsystem - registered in the system. There is no direct link from a task to - the cgroup of which it's a member in each hierarchy, but this - can be determined by following pointers through the - cgroup_subsys_state objects. This is because accessing the - subsystem state is something that's expected to happen frequently - and in performance-critical code, whereas operations that require a - task's actual cgroup assignments (in particular, moving between - cgroups) are less common. 
A linked list runs through the cg_list - field of each task_struct using the css_set, anchored at - css_set->tasks. - - - A cgroup hierarchy filesystem can be mounted for browsing and - manipulation from user space. - - - You can list all the tasks (by PID) attached to any cgroup. - -The implementation of cgroups requires a few, simple hooks -into the rest of the kernel, none in performance-critical paths: - - - in init/main.c, to initialize the root cgroups and initial - css_set at system boot. - - - in fork and exit, to attach and detach a task from its css_set. - -In addition, a new file system of type "cgroup" may be mounted, to -enable browsing and modifying the cgroups presently known to the -kernel. When mounting a cgroup hierarchy, you may specify a -comma-separated list of subsystems to mount as the filesystem mount -options. By default, mounting the cgroup filesystem attempts to -mount a hierarchy containing all registered subsystems. - -If an active hierarchy with exactly the same set of subsystems already -exists, it will be reused for the new mount. If no existing hierarchy -matches, and any of the requested subsystems are in use in an existing -hierarchy, the mount will fail with -EBUSY. Otherwise, a new hierarchy -is activated, associated with the requested subsystems. - -It's not currently possible to bind a new subsystem to an active -cgroup hierarchy, or to unbind a subsystem from an active cgroup -hierarchy. This may be possible in future, but is fraught with nasty -error-recovery issues. - -When a cgroup filesystem is unmounted, if there are any -child cgroups created below the top-level cgroup, that hierarchy -will remain active even though unmounted; if there are no -child cgroups then the hierarchy will be deactivated. - -No new system calls are added for cgroups - all support for -querying and modifying cgroups is via this cgroup file system. 
- -Each task under /proc has an added file named 'cgroup' displaying, -for each active hierarchy, the subsystem names and the cgroup name -as the path relative to the root of the cgroup file system. - -Each cgroup is represented by a directory in the cgroup file system -containing the following files describing that cgroup: - - - tasks: list of tasks (by PID) attached to that cgroup. This list - is not guaranteed to be sorted. Writing a thread ID into this file - moves the thread into this cgroup. - - cgroup.procs: list of thread group IDs in the cgroup. This list is - not guaranteed to be sorted or free of duplicate TGIDs, and userspace - should sort/uniquify the list if this property is required. - Writing a thread group ID into this file moves all threads in that - group into this cgroup. - - notify_on_release flag: run the release agent on exit? - - release_agent: the path to use for release notifications (this file - exists in the top cgroup only) - -Other subsystems such as cpusets may add additional files in each -cgroup dir. - -New cgroups are created using the mkdir system call or shell -command. The properties of a cgroup, such as its flags, are -modified by writing to the appropriate file in that cgroups -directory, as listed above. - -The named hierarchical structure of nested cgroups allows partitioning -a large system into nested, dynamically changeable, "soft-partitions". - -The attachment of each task, automatically inherited at fork by any -children of that task, to a cgroup allows organizing the work load -on a system into related sets of tasks. A task may be re-attached to -any other cgroup, if allowed by the permissions on the necessary -cgroup file system directories. - -When a task is moved from one cgroup to another, it gets a new -css_set pointer - if there's an already existing css_set with the -desired collection of cgroups then that group is reused, otherwise a new -css_set is allocated. 
The appropriate existing css_set is located by -looking into a hash table. - -To allow access from a cgroup to the css_sets (and hence tasks) -that comprise it, a set of cg_cgroup_link objects form a lattice; -each cg_cgroup_link is linked into a list of cg_cgroup_links for -a single cgroup on its cgrp_link_list field, and a list of -cg_cgroup_links for a single css_set on its cg_link_list. - -Thus the set of tasks in a cgroup can be listed by iterating over -each css_set that references the cgroup, and sub-iterating over -each css_set's task set. - -The use of a Linux virtual file system (vfs) to represent the -cgroup hierarchy provides for a familiar permission and name space -for cgroups, with a minimum of additional kernel code. - -1.4 What does notify_on_release do ? ------------------------------------- - -If the notify_on_release flag is enabled (1) in a cgroup, then -whenever the last task in the cgroup leaves (exits or attaches to -some other cgroup) and the last child cgroup of that cgroup -is removed, then the kernel runs the command specified by the contents -of the "release_agent" file in that hierarchy's root directory, -supplying the pathname (relative to the mount point of the cgroup -file system) of the abandoned cgroup. This enables automatic -removal of abandoned cgroups. The default value of -notify_on_release in the root cgroup at system boot is disabled -(0). The default value of other cgroups at creation is the current -value of their parents' notify_on_release settings. The default value of -a cgroup hierarchy's release_agent path is empty. - -1.5 What does clone_children do ? ---------------------------------- - -This flag only affects the cpuset controller. If the clone_children -flag is enabled (1) in a cgroup, a new cpuset cgroup will copy its -configuration from the parent during initialization. - -1.6 How do I use cgroups ? 
--------------------------- - -To start a new job that is to be contained within a cgroup, using -the "cpuset" cgroup subsystem, the steps are something like: - - 1) mount -t tmpfs cgroup_root /sys/fs/cgroup - 2) mkdir /sys/fs/cgroup/cpuset - 3) mount -t cgroup -ocpuset cpuset /sys/fs/cgroup/cpuset - 4) Create the new cgroup by doing mkdir's and write's (or echo's) in - the /sys/fs/cgroup/cpuset virtual file system. - 5) Start a task that will be the "founding father" of the new job. - 6) Attach that task to the new cgroup by writing its PID to the - /sys/fs/cgroup/cpuset tasks file for that cgroup. - 7) fork, exec or clone the job tasks from this founding father task. - -For example, the following sequence of commands will setup a cgroup -named "Charlie", containing just CPUs 2 and 3, and Memory Node 1, -and then start a subshell 'sh' in that cgroup: - - mount -t tmpfs cgroup_root /sys/fs/cgroup - mkdir /sys/fs/cgroup/cpuset - mount -t cgroup cpuset -ocpuset /sys/fs/cgroup/cpuset - cd /sys/fs/cgroup/cpuset - mkdir Charlie - cd Charlie - /bin/echo 2-3 > cpuset.cpus - /bin/echo 1 > cpuset.mems - /bin/echo $$ > tasks - sh - # The subshell 'sh' is now running in cgroup Charlie - # The next line should display '/Charlie' - cat /proc/self/cgroup - -2. Usage Examples and Syntax -============================ - -2.1 Basic Usage ---------------- - -Creating, modifying, using cgroups can be done through the cgroup -virtual filesystem. - -To mount a cgroup hierarchy with all available subsystems, type: -# mount -t cgroup xxx /sys/fs/cgroup - -The "xxx" is not interpreted by the cgroup code, but will appear in -/proc/mounts so may be any useful identifying string that you like. - -Note: Some subsystems do not work without some user input first. For instance, -if cpusets are enabled the user will have to populate the cpus and mems files -for each new cgroup created before that group can be used. - -As explained in section `1.2 Why are cgroups needed?' 
you should create -different hierarchies of cgroups for each single resource or group of -resources you want to control. Therefore, you should mount a tmpfs on -/sys/fs/cgroup and create directories for each cgroup resource or resource -group. - -# mount -t tmpfs cgroup_root /sys/fs/cgroup -# mkdir /sys/fs/cgroup/rg1 - -To mount a cgroup hierarchy with just the cpuset and memory -subsystems, type: -# mount -t cgroup -o cpuset,memory hier1 /sys/fs/cgroup/rg1 - -While remounting cgroups is currently supported, it is not recommend -to use it. Remounting allows changing bound subsystems and -release_agent. Rebinding is hardly useful as it only works when the -hierarchy is empty and release_agent itself should be replaced with -conventional fsnotify. The support for remounting will be removed in -the future. - -To Specify a hierarchy's release_agent: -# mount -t cgroup -o cpuset,release_agent="/sbin/cpuset_release_agent" \ - xxx /sys/fs/cgroup/rg1 - -Note that specifying 'release_agent' more than once will return failure. - -Note that changing the set of subsystems is currently only supported -when the hierarchy consists of a single (root) cgroup. Supporting -the ability to arbitrarily bind/unbind subsystems from an existing -cgroup hierarchy is intended to be implemented in the future. - -Then under /sys/fs/cgroup/rg1 you can find a tree that corresponds to the -tree of the cgroups in the system. For instance, /sys/fs/cgroup/rg1 -is the cgroup that holds the whole system. - -If you want to change the value of release_agent: -# echo "/sbin/new_release_agent" > /sys/fs/cgroup/rg1/release_agent - -It can also be changed via remount. - -If you want to create a new cgroup under /sys/fs/cgroup/rg1: -# cd /sys/fs/cgroup/rg1 -# mkdir my_cgroup - -Now you want to do something with this cgroup. 
-# cd my_cgroup - -In this directory you can find several files: -# ls -cgroup.procs notify_on_release tasks -(plus whatever files added by the attached subsystems) - -Now attach your shell to this cgroup: -# /bin/echo $$ > tasks - -You can also create cgroups inside your cgroup by using mkdir in this -directory. -# mkdir my_sub_cs - -To remove a cgroup, just use rmdir: -# rmdir my_sub_cs - -This will fail if the cgroup is in use (has cgroups inside, or -has processes attached, or is held alive by other subsystem-specific -reference). - -2.2 Attaching processes ------------------------ - -# /bin/echo PID > tasks - -Note that it is PID, not PIDs. You can only attach ONE task at a time. -If you have several tasks to attach, you have to do it one after another: - -# /bin/echo PID1 > tasks -# /bin/echo PID2 > tasks - ... -# /bin/echo PIDn > tasks - -You can attach the current shell task by echoing 0: - -# echo 0 > tasks - -You can use the cgroup.procs file instead of the tasks file to move all -threads in a threadgroup at once. Echoing the PID of any task in a -threadgroup to cgroup.procs causes all tasks in that threadgroup to be -attached to the cgroup. Writing 0 to cgroup.procs moves all tasks -in the writing task's threadgroup. - -Note: Since every task is always a member of exactly one cgroup in each -mounted hierarchy, to remove a task from its current cgroup you must -move it into a new cgroup (possibly the root cgroup) by writing to the -new cgroup's tasks file. - -Note: Due to some restrictions enforced by some cgroup subsystems, moving -a process to another cgroup can fail. - -2.3 Mounting hierarchies by name --------------------------------- - -Passing the name= option when mounting a cgroups hierarchy -associates the given name with the hierarchy. This can be used when -mounting a pre-existing hierarchy, in order to refer to it by name -rather than by its set of active subsystems. Each hierarchy is either -nameless, or has a unique name. 
- -The name should match [\w.-]+ - -When passing a name= option for a new hierarchy, you need to -specify subsystems manually; the legacy behaviour of mounting all -subsystems when none are explicitly specified is not supported when -you give a subsystem a name. - -The name of the subsystem appears as part of the hierarchy description -in /proc/mounts and /proc//cgroups. - - -3. Kernel API -============= - -3.1 Overview ------------- - -Each kernel subsystem that wants to hook into the generic cgroup -system needs to create a cgroup_subsys object. This contains -various methods, which are callbacks from the cgroup system, along -with a subsystem ID which will be assigned by the cgroup system. - -Other fields in the cgroup_subsys object include: - -- subsys_id: a unique array index for the subsystem, indicating which - entry in cgroup->subsys[] this subsystem should be managing. - -- name: should be initialized to a unique subsystem name. Should be - no longer than MAX_CGROUP_TYPE_NAMELEN. - -- early_init: indicate if the subsystem needs early initialization - at system boot. - -Each cgroup object created by the system has an array of pointers, -indexed by subsystem ID; this pointer is entirely managed by the -subsystem; the generic cgroup code will never touch this pointer. - -3.2 Synchronization -------------------- - -There is a global mutex, cgroup_mutex, used by the cgroup -system. This should be taken by anything that wants to modify a -cgroup. It may also be taken to prevent cgroups from being -modified, but more specific locks may be more appropriate in that -situation. - -See kernel/cgroup.c for more details. - -Subsystems can take/release the cgroup_mutex via the functions -cgroup_lock()/cgroup_unlock(). 
- -Accessing a task's cgroup pointer may be done in the following ways: -- while holding cgroup_mutex -- while holding the task's alloc_lock (via task_lock()) -- inside an rcu_read_lock() section via rcu_dereference() - -3.3 Subsystem API ------------------ - -Each subsystem should: - -- add an entry in linux/cgroup_subsys.h -- define a cgroup_subsys object called _cgrp_subsys - -Each subsystem may export the following methods. The only mandatory -methods are css_alloc/free. Any others that are null are presumed to -be successful no-ops. - -struct cgroup_subsys_state *css_alloc(struct cgroup *cgrp) -(cgroup_mutex held by caller) - -Called to allocate a subsystem state object for a cgroup. The -subsystem should allocate its subsystem state object for the passed -cgroup, returning a pointer to the new object on success or a -ERR_PTR() value. On success, the subsystem pointer should point to -a structure of type cgroup_subsys_state (typically embedded in a -larger subsystem-specific object), which will be initialized by the -cgroup system. Note that this will be called at initialization to -create the root subsystem state for this subsystem; this case can be -identified by the passed cgroup object having a NULL parent (since -it's the root of the hierarchy) and may be an appropriate place for -initialization code. - -int css_online(struct cgroup *cgrp) -(cgroup_mutex held by caller) - -Called after @cgrp successfully completed all allocations and made -visible to cgroup_for_each_child/descendant_*() iterators. The -subsystem may choose to fail creation by returning -errno. This -callback can be used to implement reliable state sharing and -propagation along the hierarchy. See the comment on -cgroup_for_each_descendant_pre() for details. - -void css_offline(struct cgroup *cgrp); -(cgroup_mutex held by caller) - -This is the counterpart of css_online() and called iff css_online() -has succeeded on @cgrp. This signifies the beginning of the end of -@cgrp. 
@cgrp is being removed and the subsystem should start dropping -all references it's holding on @cgrp. When all references are dropped, -cgroup removal will proceed to the next step - css_free(). After this -callback, @cgrp should be considered dead to the subsystem. - -void css_free(struct cgroup *cgrp) -(cgroup_mutex held by caller) - -The cgroup system is about to free @cgrp; the subsystem should free -its subsystem state object. By the time this method is called, @cgrp -is completely unused; @cgrp->parent is still valid. (Note - can also -be called for a newly-created cgroup if an error occurs after this -subsystem's create() method has been called for the new cgroup). - -int can_attach(struct cgroup *cgrp, struct cgroup_taskset *tset) -(cgroup_mutex held by caller) - -Called prior to moving one or more tasks into a cgroup; if the -subsystem returns an error, this will abort the attach operation. -@tset contains the tasks to be attached and is guaranteed to have at -least one task in it. - -If there are multiple tasks in the taskset, then: - - it's guaranteed that all are from the same thread group - - @tset contains all tasks from the thread group whether or not - they're switching cgroups - - the first task is the leader - -Each @tset entry also contains the task's old cgroup and tasks which -aren't switching cgroup can be skipped easily using the -cgroup_taskset_for_each() iterator. Note that this isn't called on a -fork. If this method returns 0 (success) then this should remain valid -while the caller holds cgroup_mutex and it is ensured that either -attach() or cancel_attach() will be called in future. - -void css_reset(struct cgroup_subsys_state *css) -(cgroup_mutex held by caller) - -An optional operation which should restore @css's configuration to the -initial state. 
This is currently only used on the unified hierarchy -when a subsystem is disabled on a cgroup through -"cgroup.subtree_control" but should remain enabled because other -subsystems depend on it. cgroup core makes such a css invisible by -removing the associated interface files and invokes this callback so -that the hidden subsystem can return to the initial neutral state. -This prevents unexpected resource control from a hidden css and -ensures that the configuration is in the initial state when it is made -visible again later. - -void cancel_attach(struct cgroup *cgrp, struct cgroup_taskset *tset) -(cgroup_mutex held by caller) - -Called when a task attach operation has failed after can_attach() has succeeded. -A subsystem whose can_attach() has some side-effects should provide this -function, so that the subsystem can implement a rollback. If not, not necessary. -This will be called only about subsystems whose can_attach() operation have -succeeded. The parameters are identical to can_attach(). - -void attach(struct cgroup *cgrp, struct cgroup_taskset *tset) -(cgroup_mutex held by caller) - -Called after the task has been attached to the cgroup, to allow any -post-attachment activity that requires memory allocations or blocking. -The parameters are identical to can_attach(). - -void fork(struct task_struct *task) - -Called when a task is forked into a cgroup. - -void exit(struct task_struct *task) - -Called during task exit. - -void free(struct task_struct *task) - -Called when the task_struct is freed. - -void bind(struct cgroup *root) -(cgroup_mutex held by caller) - -Called when a cgroup subsystem is rebound to a different hierarchy -and root cgroup. Currently this will only involve movement between -the default hierarchy (which never has sub-cgroups) and a hierarchy -that is being created/destroyed (and hence has no sub-cgroups). - -4. 
Extended attribute usage -=========================== - -cgroup filesystem supports certain types of extended attributes in its -directories and files. The current supported types are: - - Trusted (XATTR_TRUSTED) - - Security (XATTR_SECURITY) - -Both require CAP_SYS_ADMIN capability to set. - -Like in tmpfs, the extended attributes in cgroup filesystem are stored -using kernel memory and it's advised to keep the usage at minimum. This -is the reason why user defined extended attributes are not supported, since -any user can do it and there's no limit in the value size. - -The current known users for this feature are SELinux to limit cgroup usage -in containers and systemd for assorted meta data like main PID in a cgroup -(systemd creates a cgroup per service). - -5. Questions -============ - -Q: what's up with this '/bin/echo' ? -A: bash's builtin 'echo' command does not check calls to write() against - errors. If you use it in the cgroup file system, you won't be - able to tell whether a command succeeded or failed. - -Q: When I attach processes, only the first of the line gets really attached ! -A: We can only return one error code per call to write(). So you should also - put only ONE PID. - diff --git a/Documentation/cgroup-v1/cpuacct.rst b/Documentation/cgroup-v1/cpuacct.rst new file mode 100644 index 000000000000..d30ed81d2ad7 --- /dev/null +++ b/Documentation/cgroup-v1/cpuacct.rst @@ -0,0 +1,50 @@ +========================= +CPU Accounting Controller +========================= + +The CPU accounting controller is used to group tasks using cgroups and +account the CPU usage of these groups of tasks. + +The CPU accounting controller supports multi-hierarchy groups. An accounting +group accumulates the CPU usage of all of its child groups and the tasks +directly present in its group. 
+ +Accounting groups can be created by first mounting the cgroup filesystem:: + + # mount -t cgroup -ocpuacct none /sys/fs/cgroup + +With the above step, the initial or the parent accounting group becomes +visible at /sys/fs/cgroup. At bootup, this group includes all the tasks in +the system. /sys/fs/cgroup/tasks lists the tasks in this cgroup. +/sys/fs/cgroup/cpuacct.usage gives the CPU time (in nanoseconds) obtained +by this group which is essentially the CPU time obtained by all the tasks +in the system. + +New accounting groups can be created under the parent group /sys/fs/cgroup:: + + # cd /sys/fs/cgroup + # mkdir g1 + # echo $$ > g1/tasks + +The above steps create a new group g1 and move the current shell +process (bash) into it. CPU time consumed by this bash and its children +can be obtained from g1/cpuacct.usage and the same is accumulated in +/sys/fs/cgroup/cpuacct.usage also. + +cpuacct.stat file lists a few statistics which further divide the +CPU time obtained by the cgroup into user and system times. Currently +the following statistics are supported: + +user: Time spent by tasks of the cgroup in user mode. +system: Time spent by tasks of the cgroup in kernel mode. + +user and system are in USER_HZ unit. + +cpuacct controller uses percpu_counter interface to collect user and +system times. This has two side effects: + +- It is theoretically possible to see wrong values for user and system times. + This is because percpu_counter_read() on 32bit systems isn't safe + against concurrent writes. +- It is possible to see slightly outdated values for user and system times + due to the batch processing nature of percpu_counter. 
diff --git a/Documentation/cgroup-v1/cpuacct.txt b/Documentation/cgroup-v1/cpuacct.txt deleted file mode 100644 index 9d73cc0cadb9..000000000000 --- a/Documentation/cgroup-v1/cpuacct.txt +++ /dev/null @@ -1,49 +0,0 @@ -CPU Accounting Controller -------------------------- - -The CPU accounting controller is used to group tasks using cgroups and -account the CPU usage of these groups of tasks. - -The CPU accounting controller supports multi-hierarchy groups. An accounting -group accumulates the CPU usage of all of its child groups and the tasks -directly present in its group. - -Accounting groups can be created by first mounting the cgroup filesystem. - -# mount -t cgroup -ocpuacct none /sys/fs/cgroup - -With the above step, the initial or the parent accounting group becomes -visible at /sys/fs/cgroup. At bootup, this group includes all the tasks in -the system. /sys/fs/cgroup/tasks lists the tasks in this cgroup. -/sys/fs/cgroup/cpuacct.usage gives the CPU time (in nanoseconds) obtained -by this group which is essentially the CPU time obtained by all the tasks -in the system. - -New accounting groups can be created under the parent group /sys/fs/cgroup. - -# cd /sys/fs/cgroup -# mkdir g1 -# echo $$ > g1/tasks - -The above steps create a new group g1 and move the current shell -process (bash) into it. CPU time consumed by this bash and its children -can be obtained from g1/cpuacct.usage and the same is accumulated in -/sys/fs/cgroup/cpuacct.usage also. - -cpuacct.stat file lists a few statistics which further divide the -CPU time obtained by the cgroup into user and system times. Currently -the following statistics are supported: - -user: Time spent by tasks of the cgroup in user mode. -system: Time spent by tasks of the cgroup in kernel mode. - -user and system are in USER_HZ unit. - -cpuacct controller uses percpu_counter interface to collect user and -system times. 
This has two side effects: - -- It is theoretically possible to see wrong values for user and system times. - This is because percpu_counter_read() on 32bit systems isn't safe - against concurrent writes. -- It is possible to see slightly outdated values for user and system times - due to the batch processing nature of percpu_counter. diff --git a/Documentation/cgroup-v1/cpusets.rst b/Documentation/cgroup-v1/cpusets.rst new file mode 100644 index 000000000000..b6a42cdea72b --- /dev/null +++ b/Documentation/cgroup-v1/cpusets.rst @@ -0,0 +1,866 @@ +======= +CPUSETS +======= + +Copyright (C) 2004 BULL SA. + +Written by Simon.Derr@bull.net + +- Portions Copyright (c) 2004-2006 Silicon Graphics, Inc. +- Modified by Paul Jackson +- Modified by Christoph Lameter +- Modified by Paul Menage +- Modified by Hidetoshi Seto + +.. CONTENTS: + + 1. Cpusets + 1.1 What are cpusets ? + 1.2 Why are cpusets needed ? + 1.3 How are cpusets implemented ? + 1.4 What are exclusive cpusets ? + 1.5 What is memory_pressure ? + 1.6 What is memory spread ? + 1.7 What is sched_load_balance ? + 1.8 What is sched_relax_domain_level ? + 1.9 How do I use cpusets ? + 2. Usage Examples and Syntax + 2.1 Basic Usage + 2.2 Adding/removing cpus + 2.3 Setting flags + 2.4 Attaching processes + 3. Questions + 4. Contact + +1. Cpusets +========== + +1.1 What are cpusets ? +---------------------- + +Cpusets provide a mechanism for assigning a set of CPUs and Memory +Nodes to a set of tasks. In this document "Memory Node" refers to +an on-line node that contains memory. + +Cpusets constrain the CPU and Memory placement of tasks to only +the resources within a task's current cpuset. They form a nested +hierarchy visible in a virtual file system. These are the essential +hooks, beyond what is already present, required to manage dynamic +job placement on large systems. + +Cpusets use the generic cgroup subsystem described in +Documentation/cgroup-v1/cgroups.rst. 
+ +Requests by a task, using the sched_setaffinity(2) system call to +include CPUs in its CPU affinity mask, and using the mbind(2) and +set_mempolicy(2) system calls to include Memory Nodes in its memory +policy, are both filtered through that task's cpuset, filtering out any +CPUs or Memory Nodes not in that cpuset. The scheduler will not +schedule a task on a CPU that is not allowed in its cpus_allowed +vector, and the kernel page allocator will not allocate a page on a +node that is not allowed in the requesting task's mems_allowed vector. + +User level code may create and destroy cpusets by name in the cgroup +virtual file system, manage the attributes and permissions of these +cpusets and which CPUs and Memory Nodes are assigned to each cpuset, +specify and query to which cpuset a task is assigned, and list the +task pids assigned to a cpuset. + + +1.2 Why are cpusets needed ? +---------------------------- + +The management of large computer systems, with many processors (CPUs), +complex memory cache hierarchies and multiple Memory Nodes having +non-uniform access times (NUMA) presents additional challenges for +the efficient scheduling and memory placement of processes. + +Frequently more modest sized systems can be operated with adequate +efficiency just by letting the operating system automatically share +the available CPU and Memory resources amongst the requesting tasks. + +But larger systems, which benefit more from careful processor and +memory placement to reduce memory access times and contention, +and which typically represent a larger investment for the customer, +can benefit from explicitly placing jobs on properly sized subsets of +the system. + +This can be especially valuable on: + + * Web Servers running multiple instances of the same web application, + * Servers running different applications (for instance, a web server + and a database), or + * NUMA systems running large HPC applications with demanding + performance characteristics. 
These subsets, or "soft partitions", must be able to be dynamically
+ +The implementation of cpusets requires a few, simple hooks +into the rest of the kernel, none in performance critical paths: + + - in init/main.c, to initialize the root cpuset at system boot. + - in fork and exit, to attach and detach a task from its cpuset. + - in sched_setaffinity, to mask the requested CPUs by what's + allowed in that task's cpuset. + - in sched.c migrate_live_tasks(), to keep migrating tasks within + the CPUs allowed by their cpuset, if possible. + - in the mbind and set_mempolicy system calls, to mask the requested + Memory Nodes by what's allowed in that task's cpuset. + - in page_alloc.c, to restrict memory to allowed nodes. + - in vmscan.c, to restrict page recovery to the current cpuset. + +You should mount the "cgroup" filesystem type in order to enable +browsing and modifying the cpusets presently known to the kernel. No +new system calls are added for cpusets - all support for querying and +modifying cpusets is via this cpuset file system. + +The /proc//status file for each task has four added lines, +displaying the task's cpus_allowed (on which CPUs it may be scheduled) +and mems_allowed (on which Memory Nodes it may obtain memory), +in the two formats seen in the following example:: + + Cpus_allowed: ffffffff,ffffffff,ffffffff,ffffffff + Cpus_allowed_list: 0-127 + Mems_allowed: ffffffff,ffffffff + Mems_allowed_list: 0-63 + +Each cpuset is represented by a directory in the cgroup file system +containing (on top of the standard cgroup files) the following +files describing that cpuset: + + - cpuset.cpus: list of CPUs in that cpuset + - cpuset.mems: list of Memory Nodes in that cpuset + - cpuset.memory_migrate flag: if set, move pages to cpusets nodes + - cpuset.cpu_exclusive flag: is cpu placement exclusive? + - cpuset.mem_exclusive flag: is memory placement exclusive? 
+ - cpuset.mem_hardwall flag: is memory allocation hardwalled + - cpuset.memory_pressure: measure of how much paging pressure in cpuset + - cpuset.memory_spread_page flag: if set, spread page cache evenly on allowed nodes + - cpuset.memory_spread_slab flag: if set, spread slab cache evenly on allowed nodes + - cpuset.sched_load_balance flag: if set, load balance within CPUs on that cpuset + - cpuset.sched_relax_domain_level: the searching range when migrating tasks + +In addition, only the root cpuset has the following file: + + - cpuset.memory_pressure_enabled flag: compute memory_pressure? + +New cpusets are created using the mkdir system call or shell +command. The properties of a cpuset, such as its flags, allowed +CPUs and Memory Nodes, and attached tasks, are modified by writing +to the appropriate file in that cpusets directory, as listed above. + +The named hierarchical structure of nested cpusets allows partitioning +a large system into nested, dynamically changeable, "soft-partitions". + +The attachment of each task, automatically inherited at fork by any +children of that task, to a cpuset allows organizing the work load +on a system into related sets of tasks such that each set is constrained +to using the CPUs and Memory Nodes of a particular cpuset. A task +may be re-attached to any other cpuset, if allowed by the permissions +on the necessary cpuset file system directories. + +Such management of a system "in the large" integrates smoothly with +the detailed placement done on individual tasks and memory regions +using the sched_setaffinity, mbind and set_mempolicy system calls. + +The following rules apply to each cpuset: + + - Its CPUs and Memory Nodes must be a subset of its parents. + - It can't be marked exclusive unless its parent is. + - If its cpu or memory is exclusive, they may not overlap any sibling. 
+ +These rules, and the natural hierarchy of cpusets, enable efficient +enforcement of the exclusive guarantee, without having to scan all +cpusets every time any of them change to ensure nothing overlaps a +exclusive cpuset. Also, the use of a Linux virtual file system (vfs) +to represent the cpuset hierarchy provides for a familiar permission +and name space for cpusets, with a minimum of additional kernel code. + +The cpus and mems files in the root (top_cpuset) cpuset are +read-only. The cpus file automatically tracks the value of +cpu_online_mask using a CPU hotplug notifier, and the mems file +automatically tracks the value of node_states[N_MEMORY]--i.e., +nodes with memory--using the cpuset_track_online_nodes() hook. + + +1.4 What are exclusive cpusets ? +-------------------------------- + +If a cpuset is cpu or mem exclusive, no other cpuset, other than +a direct ancestor or descendant, may share any of the same CPUs or +Memory Nodes. + +A cpuset that is cpuset.mem_exclusive *or* cpuset.mem_hardwall is "hardwalled", +i.e. it restricts kernel allocations for page, buffer and other data +commonly shared by the kernel across multiple users. All cpusets, +whether hardwalled or not, restrict allocations of memory for user +space. This enables configuring a system so that several independent +jobs can share common kernel data, such as file system pages, while +isolating each job's user allocation in its own cpuset. To do this, +construct a large mem_exclusive cpuset to hold all the jobs, and +construct child, non-mem_exclusive cpusets for each individual job. +Only a small amount of typical kernel memory, such as requests from +interrupt handlers, is allowed to be taken outside even a +mem_exclusive cpuset. + + +1.5 What is memory_pressure ? 
+----------------------------- +The memory_pressure of a cpuset provides a simple per-cpuset metric +of the rate that the tasks in a cpuset are attempting to free up in +use memory on the nodes of the cpuset to satisfy additional memory +requests. + +This enables batch managers monitoring jobs running in dedicated +cpusets to efficiently detect what level of memory pressure that job +is causing. + +This is useful both on tightly managed systems running a wide mix of +submitted jobs, which may choose to terminate or re-prioritize jobs that +are trying to use more memory than allowed on the nodes assigned to them, +and with tightly coupled, long running, massively parallel scientific +computing jobs that will dramatically fail to meet required performance +goals if they start to use more memory than allowed to them. + +This mechanism provides a very economical way for the batch manager +to monitor a cpuset for signs of memory pressure. It's up to the +batch manager or other user code to decide what to do about it and +take action. + +==> + Unless this feature is enabled by writing "1" to the special file + /dev/cpuset/memory_pressure_enabled, the hook in the rebalance + code of __alloc_pages() for this metric reduces to simply noticing + that the cpuset_memory_pressure_enabled flag is zero. So only + systems that enable this feature will compute the metric. + +Why a per-cpuset, running average: + + Because this meter is per-cpuset, rather than per-task or mm, + the system load imposed by a batch scheduler monitoring this + metric is sharply reduced on large systems, because a scan of + the tasklist can be avoided on each set of queries. + + Because this meter is a running average, instead of an accumulating + counter, a batch scheduler can detect memory pressure with a + single read, instead of having to read and accumulate results + for a period of time. 
+ + Because this meter is per-cpuset rather than per-task or mm, + the batch scheduler can obtain the key information, memory + pressure in a cpuset, with a single read, rather than having to + query and accumulate results over all the (dynamically changing) + set of tasks in the cpuset. + +A per-cpuset simple digital filter (requires a spinlock and 3 words +of data per-cpuset) is kept, and updated by any task attached to that +cpuset, if it enters the synchronous (direct) page reclaim code. + +A per-cpuset file provides an integer number representing the recent +(half-life of 10 seconds) rate of direct page reclaims caused by +the tasks in the cpuset, in units of reclaims attempted per second, +times 1000. + + +1.6 What is memory spread ? +--------------------------- +There are two boolean flag files per cpuset that control where the +kernel allocates pages for the file system buffers and related in +kernel data structures. They are called 'cpuset.memory_spread_page' and +'cpuset.memory_spread_slab'. + +If the per-cpuset boolean flag file 'cpuset.memory_spread_page' is set, then +the kernel will spread the file system buffers (page cache) evenly +over all the nodes that the faulting task is allowed to use, instead +of preferring to put those pages on the node where the task is running. + +If the per-cpuset boolean flag file 'cpuset.memory_spread_slab' is set, +then the kernel will spread some file system related slab caches, +such as for inodes and dentries evenly over all the nodes that the +faulting task is allowed to use, instead of preferring to put those +pages on the node where the task is running. + +The setting of these flags does not affect anonymous data segment or +stack segment pages of a task. 
+ +By default, both kinds of memory spreading are off, and memory +pages are allocated on the node local to where the task is running, +except perhaps as modified by the task's NUMA mempolicy or cpuset +configuration, so long as sufficient free memory pages are available. + +When new cpusets are created, they inherit the memory spread settings +of their parent. + +Setting memory spreading causes allocations for the affected page +or slab caches to ignore the task's NUMA mempolicy and be spread +instead. Tasks using mbind() or set_mempolicy() calls to set NUMA +mempolicies will not notice any change in these calls as a result of +their containing task's memory spread settings. If memory spreading +is turned off, then the currently specified NUMA mempolicy once again +applies to memory page allocations. + +Both 'cpuset.memory_spread_page' and 'cpuset.memory_spread_slab' are boolean flag +files. By default they contain "0", meaning that the feature is off +for that cpuset. If a "1" is written to that file, then that turns +the named feature on. + +The implementation is simple. + +Setting the flag 'cpuset.memory_spread_page' turns on a per-process flag +PFA_SPREAD_PAGE for each task that is in that cpuset or subsequently +joins that cpuset. The page allocation calls for the page cache +is modified to perform an inline check for this PFA_SPREAD_PAGE task +flag, and if set, a call to a new routine cpuset_mem_spread_node() +returns the node to prefer for the allocation. + +Similarly, setting 'cpuset.memory_spread_slab' turns on the flag +PFA_SPREAD_SLAB, and appropriately marked slab caches will allocate +pages from the node returned by cpuset_mem_spread_node(). + +The cpuset_mem_spread_node() routine is also simple. It uses the +value of a per-task rotor cpuset_mem_spread_rotor to select the next +node in the current task's mems_allowed to prefer for the allocation. + +This memory placement policy is also known (in other contexts) as +round-robin or interleave. 
+ +This policy can provide substantial improvements for jobs that need +to place thread local data on the corresponding node, but that need +to access large file system data sets that need to be spread across +the several nodes in the jobs cpuset in order to fit. Without this +policy, especially for jobs that might have one thread reading in the +data set, the memory allocation across the nodes in the jobs cpuset +can become very uneven. + +1.7 What is sched_load_balance ? +-------------------------------- + +The kernel scheduler (kernel/sched/core.c) automatically load balances +tasks. If one CPU is underutilized, kernel code running on that +CPU will look for tasks on other more overloaded CPUs and move those +tasks to itself, within the constraints of such placement mechanisms +as cpusets and sched_setaffinity. + +The algorithmic cost of load balancing and its impact on key shared +kernel data structures such as the task list increases more than +linearly with the number of CPUs being balanced. So the scheduler +has support to partition the systems CPUs into a number of sched +domains such that it only load balances within each sched domain. +Each sched domain covers some subset of the CPUs in the system; +no two sched domains overlap; some CPUs might not be in any sched +domain and hence won't be load balanced. + +Put simply, it costs less to balance between two smaller sched domains +than one big one, but doing so means that overloads in one of the +two domains won't be load balanced to the other one. + +By default, there is one sched domain covering all CPUs, including those +marked isolated using the kernel boot time "isolcpus=" argument. However, +the isolated CPUs will not participate in load balancing, and will not +have tasks running on them unless explicitly assigned. + +This default load balancing across all CPUs is not well suited for +the following two situations: + + 1) On large systems, load balancing across many CPUs is expensive. 
+ If the system is managed using cpusets to place independent jobs + on separate sets of CPUs, full load balancing is unnecessary. + 2) Systems supporting realtime on some CPUs need to minimize + system overhead on those CPUs, including avoiding task load + balancing if that is not needed. + +When the per-cpuset flag "cpuset.sched_load_balance" is enabled (the default +setting), it requests that all the CPUs in that cpuset's allowed 'cpuset.cpus' +be contained in a single sched domain, ensuring that load balancing +can move a task (not otherwise pinned, as by sched_setaffinity) +from any CPU in that cpuset to any other. + +When the per-cpuset flag "cpuset.sched_load_balance" is disabled, then the +scheduler will avoid load balancing across the CPUs in that cpuset, +--except-- in so far as is necessary because some overlapping cpuset +has "cpuset.sched_load_balance" enabled. + +So, for example, if the top cpuset has the flag "cpuset.sched_load_balance" +enabled, then the scheduler will have one sched domain covering all +CPUs, and the setting of the "cpuset.sched_load_balance" flag in any other +cpusets won't matter, as we're already fully load balancing. + +Therefore in the above two situations, the top cpuset flag +"cpuset.sched_load_balance" should be disabled, and only some of the smaller, +child cpusets have this flag enabled. + +When doing this, you don't usually want to leave any unpinned tasks in +the top cpuset that might use non-trivial amounts of CPU, as such tasks +may be artificially constrained to some subset of CPUs, depending on +the particulars of this flag setting in descendant cpusets. Even if +such a task could use spare CPU cycles in some other CPUs, the kernel +scheduler might not consider the possibility of load balancing that +task to that underused CPU. + +Of course, tasks pinned to a particular CPU can be left in a cpuset +that disables "cpuset.sched_load_balance" as those tasks aren't going anywhere +else anyway.
+ +There is an impedance mismatch here, between cpusets and sched domains. +Cpusets are hierarchical and nest. Sched domains are flat; they don't +overlap and each CPU is in at most one sched domain. + +It is necessary for sched domains to be flat because load balancing +across partially overlapping sets of CPUs would risk unstable dynamics +that would be beyond our understanding. So if each of two partially +overlapping cpusets enables the flag 'cpuset.sched_load_balance', then we +form a single sched domain that is a superset of both. We won't move +a task to a CPU outside its cpuset, but the scheduler load balancing +code might waste some compute cycles considering that possibility. + +This mismatch is why there is not a simple one-to-one relation +between which cpusets have the flag "cpuset.sched_load_balance" enabled, +and the sched domain configuration. If a cpuset enables the flag, it +will get balancing across all its CPUs, but if it disables the flag, +it will only be assured of no load balancing if no other overlapping +cpuset enables the flag. + +If two cpusets have partially overlapping 'cpuset.cpus' allowed, and only +one of them has this flag enabled, then the other may find its +tasks only partially load balanced, just on the overlapping CPUs. +This is just the general case of the top_cpuset example given a few +paragraphs above. In the general case, as in the top cpuset case, +don't leave tasks that might use non-trivial amounts of CPU in +such partially load balanced cpusets, as they may be artificially +constrained to some subset of the CPUs allowed to them, for lack of +load balancing to the other CPUs. + +CPUs in "cpuset.isolcpus" were excluded from load balancing by the +isolcpus= kernel boot option, and will never be load balanced regardless +of the value of "cpuset.sched_load_balance" in any cpuset. + +1.7.1 sched_load_balance implementation details. 
+------------------------------------------------ + +The per-cpuset flag 'cpuset.sched_load_balance' defaults to enabled (contrary +to most cpuset flags.) When enabled for a cpuset, the kernel will +ensure that it can load balance across all the CPUs in that cpuset +(makes sure that all the CPUs in the cpus_allowed of that cpuset are +in the same sched domain.) + +If two overlapping cpusets both have 'cpuset.sched_load_balance' enabled, +then they will be (must be) both in the same sched domain. + +If, as is the default, the top cpuset has 'cpuset.sched_load_balance' enabled, +then by the above that means there is a single sched domain covering +the whole system, regardless of any other cpuset settings. + +The kernel commits to user space that it will avoid load balancing +where it can. It will pick as fine a granularity partition of sched +domains as it can while still providing load balancing for any set +of CPUs allowed to a cpuset having 'cpuset.sched_load_balance' enabled. + +The internal kernel cpuset to scheduler interface passes from the +cpuset code to the scheduler code a partition of the load balanced +CPUs in the system. This partition is a set of subsets (represented +as an array of struct cpumask) of CPUs, pairwise disjoint, that cover +all the CPUs that must be load balanced. + +The cpuset code builds a new such partition and passes it to the +scheduler sched domain setup code, to have the sched domains rebuilt +as necessary, whenever: + + - the 'cpuset.sched_load_balance' flag of a cpuset with non-empty CPUs changes, + - or CPUs come or go from a cpuset with this flag enabled, + - or 'cpuset.sched_relax_domain_level' value of a cpuset with non-empty CPUs + and with this flag enabled changes, + - or a cpuset with non-empty CPUs and with this flag enabled is removed, + - or a cpu is offlined/onlined. + +This partition exactly defines what sched domains the scheduler should +setup - one sched domain for each element (struct cpumask) in the +partition. 
+ +The scheduler remembers the currently active sched domain partitions. +When the scheduler routine partition_sched_domains() is invoked from +the cpuset code to update these sched domains, it compares the new +partition requested with the current, and updates its sched domains, +removing the old and adding the new, for each change. + + +1.8 What is sched_relax_domain_level ? +-------------------------------------- + +Within a sched domain, the scheduler migrates tasks in 2 ways: periodic load +balance on tick, and at the time of some scheduling events. + +When a task is woken up, the scheduler tries to move the task to an idle CPU. +For example, if a task A running on CPU X activates another task B +on the same CPU X, and if CPU Y is X's sibling and is idle, +then the scheduler migrates task B to CPU Y so that task B can start on +CPU Y without waiting for task A on CPU X. + +And if a CPU runs out of tasks in its runqueue, the CPU tries to pull +extra tasks from other busy CPUs to help them before it is going to +be idle. + +Of course, since it takes some searching cost to find movable tasks and/or +idle CPUs, the scheduler might not search all CPUs in the domain +every time. In fact, in some architectures, the searching ranges on +events are limited to the same socket or node where the CPU is located, +while the load balance on tick searches all. + +For example, assume CPU Z is relatively far from CPU X. Even if CPU Z +is idle while CPU X and the siblings are busy, the scheduler can't migrate +woken task B from X to Z since it is out of its searching range. +As a result, task B on CPU X needs to wait for task A or wait for load balance +on the next tick. For some applications in special situations, waiting +1 tick may be too long. + +The 'cpuset.sched_relax_domain_level' file allows you to request changing +this searching range as you like. This file takes an int value which +indicates the size of the searching range in levels, ideally as follows; +otherwise the initial value -1 indicates that the cpuset has no request.
+ +====== =========================================================== + -1 no request. use system default or follow request of others. + 0 no search. + 1 search siblings (hyperthreads in a core). + 2 search cores in a package. + 3 search cpus in a node [= system wide on non-NUMA system] + 4 search nodes in a chunk of node [on NUMA system] + 5 search system wide [on NUMA system] +====== =========================================================== + +The system default is architecture dependent. The system default +can be changed using the relax_domain_level= boot parameter. + +This file is per-cpuset and affects the sched domain to which the cpuset +belongs. Therefore if the flag 'cpuset.sched_load_balance' of a cpuset +is disabled, then 'cpuset.sched_relax_domain_level' has no effect since +there is no sched domain belonging to the cpuset. + +If multiple cpusets are overlapping and hence they form a single sched +domain, the largest value among those is used. Be careful: if one +requests 0 and others are -1 then 0 is used. + +Note that modifying this file will have both good and bad effects, +and whether it is acceptable or not depends on your situation. +Don't modify this file if you are not sure. + +If your situation is: + + - The migration costs between each cpu can be assumed considerably + small (for you) due to your special application's behavior or + special hardware support for CPU cache etc. + - The searching cost doesn't have impact (for you) or you can make + the searching cost small enough by managing cpuset to compact etc. + - Low latency is required even if it sacrifices cache hit rate etc. + then increasing 'cpuset.sched_relax_domain_level' would benefit you. + + +1.9 How do I use cpusets ?
+-------------------------- + +In order to minimize the impact of cpusets on critical kernel +code, such as the scheduler, and due to the fact that the kernel +does not support one task updating the memory placement of another +task directly, the impact on a task of changing its cpuset CPU +or Memory Node placement, or of changing to which cpuset a task +is attached, is subtle. + +If a cpuset has its Memory Nodes modified, then for each task attached +to that cpuset, the next time that the kernel attempts to allocate +a page of memory for that task, the kernel will notice the change +in the task's cpuset, and update its per-task memory placement to +remain within the new cpusets memory placement. If the task was using +mempolicy MPOL_BIND, and the nodes to which it was bound overlap with +its new cpuset, then the task will continue to use whatever subset +of MPOL_BIND nodes are still allowed in the new cpuset. If the task +was using MPOL_BIND and now none of its MPOL_BIND nodes are allowed +in the new cpuset, then the task will be essentially treated as if it +was MPOL_BIND bound to the new cpuset (even though its NUMA placement, +as queried by get_mempolicy(), doesn't change). If a task is moved +from one cpuset to another, then the kernel will adjust the task's +memory placement, as above, the next time that the kernel attempts +to allocate a page of memory for that task. + +If a cpuset has its 'cpuset.cpus' modified, then each task in that cpuset +will have its allowed CPU placement changed immediately. Similarly, +if a task's pid is written to another cpuset's 'tasks' file, then its +allowed CPU placement is changed immediately. If such a task had been +bound to some subset of its cpuset using the sched_setaffinity() call, +the task will be allowed to run on any CPU allowed in its new cpuset, +negating the effect of the prior sched_setaffinity() call. 
+ +In summary, the memory placement of a task whose cpuset is changed is +updated by the kernel, on the next allocation of a page for that task, +and the processor placement is updated immediately. + +Normally, once a page is allocated (given a physical page +of main memory) then that page stays on whatever node it +was allocated on, so long as it remains allocated, even if the +cpuset's memory placement policy 'cpuset.mems' subsequently changes. +If the cpuset flag file 'cpuset.memory_migrate' is set true, then when +tasks are attached to that cpuset, any pages that task had +allocated to it on nodes in its previous cpuset are migrated +to the task's new cpuset. The relative placement of the page within +the cpuset is preserved during these migration operations if possible. +For example if the page was on the second valid node of the prior cpuset +then the page will be placed on the second valid node of the new cpuset. + +Also if 'cpuset.memory_migrate' is set true, then if that cpuset's +'cpuset.mems' file is modified, pages allocated to tasks in that +cpuset, that were on nodes in the previous setting of 'cpuset.mems', +will be moved to nodes in the new setting of 'cpuset.mems'. +Pages that were not in the task's prior cpuset, or in the cpuset's +prior 'cpuset.mems' setting, will not be moved. + +There is an exception to the above. If hotplug functionality is used +to remove all the CPUs that are currently assigned to a cpuset, +then all the tasks in that cpuset will be moved to the nearest ancestor +with non-empty cpus. But the moving of some (or all) tasks might fail if +cpuset is bound with another cgroup subsystem which has some restrictions +on task attaching. In this failing case, those tasks will stay +in the original cpuset, and the kernel will automatically update +their cpus_allowed to allow all online CPUs. When memory hotplug +functionality for removing Memory Nodes is available, a similar exception +is expected to apply there as well.
In general, the kernel prefers to +violate cpuset placement, over starving a task that has had all +its allowed CPUs or Memory Nodes taken offline. + +There is a second exception to the above. GFP_ATOMIC requests are +kernel internal allocations that must be satisfied, immediately. +The kernel may drop some request, in rare cases even panic, if a +GFP_ATOMIC alloc fails. If the request cannot be satisfied within +the current task's cpuset, then we relax the cpuset, and look for +memory anywhere we can find it. It's better to violate the cpuset +than stress the kernel. + +To start a new job that is to be contained within a cpuset, the steps are: + + 1) mkdir /sys/fs/cgroup/cpuset + 2) mount -t cgroup -ocpuset cpuset /sys/fs/cgroup/cpuset + 3) Create the new cpuset by doing mkdir's and write's (or echo's) in + the /sys/fs/cgroup/cpuset virtual file system. + 4) Start a task that will be the "founding father" of the new job. + 5) Attach that task to the new cpuset by writing its pid to the + /sys/fs/cgroup/cpuset tasks file for that cpuset. + 6) fork, exec or clone the job tasks from this founding father task. + +For example, the following sequence of commands will setup a cpuset +named "Charlie", containing just CPUs 2 and 3, and Memory Node 1, +and then start a subshell 'sh' in that cpuset:: + + mount -t cgroup -ocpuset cpuset /sys/fs/cgroup/cpuset + cd /sys/fs/cgroup/cpuset + mkdir Charlie + cd Charlie + /bin/echo 2-3 > cpuset.cpus + /bin/echo 1 > cpuset.mems + /bin/echo $$ > tasks + sh + # The subshell 'sh' is now running in cpuset Charlie + # The next line should display '/Charlie' + cat /proc/self/cpuset + +There are ways to query or modify cpusets: + + - via the cpuset file system directly, using the various cd, mkdir, echo, + cat, rmdir commands from the shell, or their equivalent from C. + - via the C library libcpuset. + - via the C library libcgroup. + (http://sourceforge.net/projects/libcg/) + - via the python application cset. 
+ (http://code.google.com/p/cpuset/) + +The sched_setaffinity calls can also be done at the shell prompt using +SGI's runon or Robert Love's taskset. The mbind and set_mempolicy +calls can be done at the shell prompt using the numactl command +(part of Andi Kleen's numa package). + +2. Usage Examples and Syntax +============================ + +2.1 Basic Usage +--------------- + +Creating, modifying, using the cpusets can be done through the cpuset +virtual filesystem. + +To mount it, type: +# mount -t cgroup -o cpuset cpuset /sys/fs/cgroup/cpuset + +Then under /sys/fs/cgroup/cpuset you can find a tree that corresponds to the +tree of the cpusets in the system. For instance, /sys/fs/cgroup/cpuset +is the cpuset that holds the whole system. + +If you want to create a new cpuset under /sys/fs/cgroup/cpuset:: + + # cd /sys/fs/cgroup/cpuset + # mkdir my_cpuset + +Now you want to do something with this cpuset:: + + # cd my_cpuset + +In this directory you can find several files:: + + # ls + cgroup.clone_children cpuset.memory_pressure + cgroup.event_control cpuset.memory_spread_page + cgroup.procs cpuset.memory_spread_slab + cpuset.cpu_exclusive cpuset.mems + cpuset.cpus cpuset.sched_load_balance + cpuset.mem_exclusive cpuset.sched_relax_domain_level + cpuset.mem_hardwall notify_on_release + cpuset.memory_migrate tasks + +Reading them will give you information about the state of this cpuset: +the CPUs and Memory Nodes it can use, the processes that are using +it, its properties. By writing to these files you can manipulate +the cpuset. 
+ +Set some flags:: + + # /bin/echo 1 > cpuset.cpu_exclusive + +Add some cpus:: + + # /bin/echo 0-7 > cpuset.cpus + +Add some mems:: + + # /bin/echo 0-7 > cpuset.mems + +Now attach your shell to this cpuset:: + + # /bin/echo $$ > tasks + +You can also create cpusets inside your cpuset by using mkdir in this +directory:: + + # mkdir my_sub_cs + +To remove a cpuset, just use rmdir:: + + # rmdir my_sub_cs + +This will fail if the cpuset is in use (has cpusets inside, or has +processes attached). + +Note that for legacy reasons, the "cpuset" filesystem exists as a +wrapper around the cgroup filesystem. + +The command:: + + mount -t cpuset X /sys/fs/cgroup/cpuset + +is equivalent to:: + + mount -t cgroup -ocpuset,noprefix X /sys/fs/cgroup/cpuset + echo "/sbin/cpuset_release_agent" > /sys/fs/cgroup/cpuset/release_agent + +2.2 Adding/removing cpus +------------------------ + +This is the syntax to use when writing in the cpus or mems files +in cpuset directories:: + + # /bin/echo 1-4 > cpuset.cpus -> set cpus list to cpus 1,2,3,4 + # /bin/echo 1,2,3,4 > cpuset.cpus -> set cpus list to cpus 1,2,3,4 + +To add a CPU to a cpuset, write the new list of CPUs including the +CPU to be added. To add 6 to the above cpuset:: + + # /bin/echo 1-4,6 > cpuset.cpus -> set cpus list to cpus 1,2,3,4,6 + +Similarly to remove a CPU from a cpuset, write the new list of CPUs +without the CPU to be removed. + +To remove all the CPUs:: + + # /bin/echo "" > cpuset.cpus -> clear cpus list + +2.3 Setting flags +----------------- + +The syntax is very simple:: + + # /bin/echo 1 > cpuset.cpu_exclusive -> set flag 'cpuset.cpu_exclusive' + # /bin/echo 0 > cpuset.cpu_exclusive -> unset flag 'cpuset.cpu_exclusive' + +2.4 Attaching processes +----------------------- + +:: + + # /bin/echo PID > tasks + +Note that it is PID, not PIDs. You can only attach ONE task at a time. 
+If you have several tasks to attach, you have to do it one after another:: + + # /bin/echo PID1 > tasks + # /bin/echo PID2 > tasks + ... + # /bin/echo PIDn > tasks + + +3. Questions +============ + +Q: + what's up with this '/bin/echo' ? + +A: + bash's builtin 'echo' command does not check calls to write() against + errors. If you use it in the cpuset file system, you won't be + able to tell whether a command succeeded or failed. + +Q: + When I attach processes, only the first of the line gets really attached ! + +A: + We can only return one error code per call to write(). So you should also + put only ONE pid. + +4. Contact +========== + +Web: http://www.bullopensource.org/cpuset diff --git a/Documentation/cgroup-v1/cpusets.txt b/Documentation/cgroup-v1/cpusets.txt deleted file mode 100644 index 8402dd6de8df..000000000000 --- a/Documentation/cgroup-v1/cpusets.txt +++ /dev/null @@ -1,839 +0,0 @@ - CPUSETS - ------- - -Copyright (C) 2004 BULL SA. -Written by Simon.Derr@bull.net - -Portions Copyright (c) 2004-2006 Silicon Graphics, Inc. -Modified by Paul Jackson -Modified by Christoph Lameter -Modified by Paul Menage -Modified by Hidetoshi Seto - -CONTENTS: -========= - -1. Cpusets - 1.1 What are cpusets ? - 1.2 Why are cpusets needed ? - 1.3 How are cpusets implemented ? - 1.4 What are exclusive cpusets ? - 1.5 What is memory_pressure ? - 1.6 What is memory spread ? - 1.7 What is sched_load_balance ? - 1.8 What is sched_relax_domain_level ? - 1.9 How do I use cpusets ? -2. Usage Examples and Syntax - 2.1 Basic Usage - 2.2 Adding/removing cpus - 2.3 Setting flags - 2.4 Attaching processes -3. Questions -4. Contact - -1. Cpusets -========== - -1.1 What are cpusets ? ----------------------- - -Cpusets provide a mechanism for assigning a set of CPUs and Memory -Nodes to a set of tasks. In this document "Memory Node" refers to -an on-line node that contains memory. 
- -Cpusets constrain the CPU and Memory placement of tasks to only -the resources within a task's current cpuset. They form a nested -hierarchy visible in a virtual file system. These are the essential -hooks, beyond what is already present, required to manage dynamic -job placement on large systems. - -Cpusets use the generic cgroup subsystem described in -Documentation/cgroup-v1/cgroups.txt. - -Requests by a task, using the sched_setaffinity(2) system call to -include CPUs in its CPU affinity mask, and using the mbind(2) and -set_mempolicy(2) system calls to include Memory Nodes in its memory -policy, are both filtered through that task's cpuset, filtering out any -CPUs or Memory Nodes not in that cpuset. The scheduler will not -schedule a task on a CPU that is not allowed in its cpus_allowed -vector, and the kernel page allocator will not allocate a page on a -node that is not allowed in the requesting task's mems_allowed vector. - -User level code may create and destroy cpusets by name in the cgroup -virtual file system, manage the attributes and permissions of these -cpusets and which CPUs and Memory Nodes are assigned to each cpuset, -specify and query to which cpuset a task is assigned, and list the -task pids assigned to a cpuset. - - -1.2 Why are cpusets needed ? ----------------------------- - -The management of large computer systems, with many processors (CPUs), -complex memory cache hierarchies and multiple Memory Nodes having -non-uniform access times (NUMA) presents additional challenges for -the efficient scheduling and memory placement of processes. - -Frequently more modest sized systems can be operated with adequate -efficiency just by letting the operating system automatically share -the available CPU and Memory resources amongst the requesting tasks. 
- -But larger systems, which benefit more from careful processor and -memory placement to reduce memory access times and contention, -and which typically represent a larger investment for the customer, -can benefit from explicitly placing jobs on properly sized subsets of -the system. - -This can be especially valuable on: - - * Web Servers running multiple instances of the same web application, - * Servers running different applications (for instance, a web server - and a database), or - * NUMA systems running large HPC applications with demanding - performance characteristics. - -These subsets, or "soft partitions" must be able to be dynamically -adjusted, as the job mix changes, without impacting other concurrently -executing jobs. The location of the running jobs pages may also be moved -when the memory locations are changed. - -The kernel cpuset patch provides the minimum essential kernel -mechanisms required to efficiently implement such subsets. It -leverages existing CPU and Memory Placement facilities in the Linux -kernel to avoid any additional impact on the critical scheduler or -memory allocator code. - - -1.3 How are cpusets implemented ? ---------------------------------- - -Cpusets provide a Linux kernel mechanism to constrain which CPUs and -Memory Nodes are used by a process or set of processes. - -The Linux kernel already has a pair of mechanisms to specify on which -CPUs a task may be scheduled (sched_setaffinity) and on which Memory -Nodes it may obtain memory (mbind, set_mempolicy). - -Cpusets extends these two mechanisms as follows: - - - Cpusets are sets of allowed CPUs and Memory Nodes, known to the - kernel. - - Each task in the system is attached to a cpuset, via a pointer - in the task structure to a reference counted cgroup structure. - - Calls to sched_setaffinity are filtered to just those CPUs - allowed in that task's cpuset. - - Calls to mbind and set_mempolicy are filtered to just - those Memory Nodes allowed in that task's cpuset. 
- - The root cpuset contains all the systems CPUs and Memory - Nodes. - - For any cpuset, one can define child cpusets containing a subset - of the parents CPU and Memory Node resources. - - The hierarchy of cpusets can be mounted at /dev/cpuset, for - browsing and manipulation from user space. - - A cpuset may be marked exclusive, which ensures that no other - cpuset (except direct ancestors and descendants) may contain - any overlapping CPUs or Memory Nodes. - - You can list all the tasks (by pid) attached to any cpuset. - -The implementation of cpusets requires a few, simple hooks -into the rest of the kernel, none in performance critical paths: - - - in init/main.c, to initialize the root cpuset at system boot. - - in fork and exit, to attach and detach a task from its cpuset. - - in sched_setaffinity, to mask the requested CPUs by what's - allowed in that task's cpuset. - - in sched.c migrate_live_tasks(), to keep migrating tasks within - the CPUs allowed by their cpuset, if possible. - - in the mbind and set_mempolicy system calls, to mask the requested - Memory Nodes by what's allowed in that task's cpuset. - - in page_alloc.c, to restrict memory to allowed nodes. - - in vmscan.c, to restrict page recovery to the current cpuset. - -You should mount the "cgroup" filesystem type in order to enable -browsing and modifying the cpusets presently known to the kernel. No -new system calls are added for cpusets - all support for querying and -modifying cpusets is via this cpuset file system. 
- -The /proc//status file for each task has four added lines, -displaying the task's cpus_allowed (on which CPUs it may be scheduled) -and mems_allowed (on which Memory Nodes it may obtain memory), -in the two formats seen in the following example: - - Cpus_allowed: ffffffff,ffffffff,ffffffff,ffffffff - Cpus_allowed_list: 0-127 - Mems_allowed: ffffffff,ffffffff - Mems_allowed_list: 0-63 - -Each cpuset is represented by a directory in the cgroup file system -containing (on top of the standard cgroup files) the following -files describing that cpuset: - - - cpuset.cpus: list of CPUs in that cpuset - - cpuset.mems: list of Memory Nodes in that cpuset - - cpuset.memory_migrate flag: if set, move pages to cpusets nodes - - cpuset.cpu_exclusive flag: is cpu placement exclusive? - - cpuset.mem_exclusive flag: is memory placement exclusive? - - cpuset.mem_hardwall flag: is memory allocation hardwalled - - cpuset.memory_pressure: measure of how much paging pressure in cpuset - - cpuset.memory_spread_page flag: if set, spread page cache evenly on allowed nodes - - cpuset.memory_spread_slab flag: if set, spread slab cache evenly on allowed nodes - - cpuset.sched_load_balance flag: if set, load balance within CPUs on that cpuset - - cpuset.sched_relax_domain_level: the searching range when migrating tasks - -In addition, only the root cpuset has the following file: - - cpuset.memory_pressure_enabled flag: compute memory_pressure? - -New cpusets are created using the mkdir system call or shell -command. The properties of a cpuset, such as its flags, allowed -CPUs and Memory Nodes, and attached tasks, are modified by writing -to the appropriate file in that cpusets directory, as listed above. - -The named hierarchical structure of nested cpusets allows partitioning -a large system into nested, dynamically changeable, "soft-partitions". 
- -The attachment of each task, automatically inherited at fork by any -children of that task, to a cpuset allows organizing the work load -on a system into related sets of tasks such that each set is constrained -to using the CPUs and Memory Nodes of a particular cpuset. A task -may be re-attached to any other cpuset, if allowed by the permissions -on the necessary cpuset file system directories. - -Such management of a system "in the large" integrates smoothly with -the detailed placement done on individual tasks and memory regions -using the sched_setaffinity, mbind and set_mempolicy system calls. - -The following rules apply to each cpuset: - - - Its CPUs and Memory Nodes must be a subset of its parents. - - It can't be marked exclusive unless its parent is. - - If its cpu or memory is exclusive, they may not overlap any sibling. - -These rules, and the natural hierarchy of cpusets, enable efficient -enforcement of the exclusive guarantee, without having to scan all -cpusets every time any of them change to ensure nothing overlaps a -exclusive cpuset. Also, the use of a Linux virtual file system (vfs) -to represent the cpuset hierarchy provides for a familiar permission -and name space for cpusets, with a minimum of additional kernel code. - -The cpus and mems files in the root (top_cpuset) cpuset are -read-only. The cpus file automatically tracks the value of -cpu_online_mask using a CPU hotplug notifier, and the mems file -automatically tracks the value of node_states[N_MEMORY]--i.e., -nodes with memory--using the cpuset_track_online_nodes() hook. - - -1.4 What are exclusive cpusets ? --------------------------------- - -If a cpuset is cpu or mem exclusive, no other cpuset, other than -a direct ancestor or descendant, may share any of the same CPUs or -Memory Nodes. - -A cpuset that is cpuset.mem_exclusive *or* cpuset.mem_hardwall is "hardwalled", -i.e. 
it restricts kernel allocations for page, buffer and other data -commonly shared by the kernel across multiple users. All cpusets, -whether hardwalled or not, restrict allocations of memory for user -space. This enables configuring a system so that several independent -jobs can share common kernel data, such as file system pages, while -isolating each job's user allocation in its own cpuset. To do this, -construct a large mem_exclusive cpuset to hold all the jobs, and -construct child, non-mem_exclusive cpusets for each individual job. -Only a small amount of typical kernel memory, such as requests from -interrupt handlers, is allowed to be taken outside even a -mem_exclusive cpuset. - - -1.5 What is memory_pressure ? ------------------------------ -The memory_pressure of a cpuset provides a simple per-cpuset metric -of the rate that the tasks in a cpuset are attempting to free up in -use memory on the nodes of the cpuset to satisfy additional memory -requests. - -This enables batch managers monitoring jobs running in dedicated -cpusets to efficiently detect what level of memory pressure that job -is causing. - -This is useful both on tightly managed systems running a wide mix of -submitted jobs, which may choose to terminate or re-prioritize jobs that -are trying to use more memory than allowed on the nodes assigned to them, -and with tightly coupled, long running, massively parallel scientific -computing jobs that will dramatically fail to meet required performance -goals if they start to use more memory than allowed to them. - -This mechanism provides a very economical way for the batch manager -to monitor a cpuset for signs of memory pressure. It's up to the -batch manager or other user code to decide what to do about it and -take action. 
- -==> Unless this feature is enabled by writing "1" to the special file - /dev/cpuset/memory_pressure_enabled, the hook in the rebalance - code of __alloc_pages() for this metric reduces to simply noticing - that the cpuset_memory_pressure_enabled flag is zero. So only - systems that enable this feature will compute the metric. - -Why a per-cpuset, running average: - - Because this meter is per-cpuset, rather than per-task or mm, - the system load imposed by a batch scheduler monitoring this - metric is sharply reduced on large systems, because a scan of - the tasklist can be avoided on each set of queries. - - Because this meter is a running average, instead of an accumulating - counter, a batch scheduler can detect memory pressure with a - single read, instead of having to read and accumulate results - for a period of time. - - Because this meter is per-cpuset rather than per-task or mm, - the batch scheduler can obtain the key information, memory - pressure in a cpuset, with a single read, rather than having to - query and accumulate results over all the (dynamically changing) - set of tasks in the cpuset. - -A per-cpuset simple digital filter (requires a spinlock and 3 words -of data per-cpuset) is kept, and updated by any task attached to that -cpuset, if it enters the synchronous (direct) page reclaim code. - -A per-cpuset file provides an integer number representing the recent -(half-life of 10 seconds) rate of direct page reclaims caused by -the tasks in the cpuset, in units of reclaims attempted per second, -times 1000. - - -1.6 What is memory spread ? ---------------------------- -There are two boolean flag files per cpuset that control where the -kernel allocates pages for the file system buffers and related in -kernel data structures. They are called 'cpuset.memory_spread_page' and -'cpuset.memory_spread_slab'. 
- -If the per-cpuset boolean flag file 'cpuset.memory_spread_page' is set, then -the kernel will spread the file system buffers (page cache) evenly -over all the nodes that the faulting task is allowed to use, instead -of preferring to put those pages on the node where the task is running. - -If the per-cpuset boolean flag file 'cpuset.memory_spread_slab' is set, -then the kernel will spread some file system related slab caches, -such as for inodes and dentries evenly over all the nodes that the -faulting task is allowed to use, instead of preferring to put those -pages on the node where the task is running. - -The setting of these flags does not affect anonymous data segment or -stack segment pages of a task. - -By default, both kinds of memory spreading are off, and memory -pages are allocated on the node local to where the task is running, -except perhaps as modified by the task's NUMA mempolicy or cpuset -configuration, so long as sufficient free memory pages are available. - -When new cpusets are created, they inherit the memory spread settings -of their parent. - -Setting memory spreading causes allocations for the affected page -or slab caches to ignore the task's NUMA mempolicy and be spread -instead. Tasks using mbind() or set_mempolicy() calls to set NUMA -mempolicies will not notice any change in these calls as a result of -their containing task's memory spread settings. If memory spreading -is turned off, then the currently specified NUMA mempolicy once again -applies to memory page allocations. - -Both 'cpuset.memory_spread_page' and 'cpuset.memory_spread_slab' are boolean flag -files. By default they contain "0", meaning that the feature is off -for that cpuset. If a "1" is written to that file, then that turns -the named feature on. - -The implementation is simple. - -Setting the flag 'cpuset.memory_spread_page' turns on a per-process flag -PFA_SPREAD_PAGE for each task that is in that cpuset or subsequently -joins that cpuset. 
The page allocation calls for the page cache -is modified to perform an inline check for this PFA_SPREAD_PAGE task -flag, and if set, a call to a new routine cpuset_mem_spread_node() -returns the node to prefer for the allocation. - -Similarly, setting 'cpuset.memory_spread_slab' turns on the flag -PFA_SPREAD_SLAB, and appropriately marked slab caches will allocate -pages from the node returned by cpuset_mem_spread_node(). - -The cpuset_mem_spread_node() routine is also simple. It uses the -value of a per-task rotor cpuset_mem_spread_rotor to select the next -node in the current task's mems_allowed to prefer for the allocation. - -This memory placement policy is also known (in other contexts) as -round-robin or interleave. - -This policy can provide substantial improvements for jobs that need -to place thread local data on the corresponding node, but that need -to access large file system data sets that need to be spread across -the several nodes in the jobs cpuset in order to fit. Without this -policy, especially for jobs that might have one thread reading in the -data set, the memory allocation across the nodes in the jobs cpuset -can become very uneven. - -1.7 What is sched_load_balance ? --------------------------------- - -The kernel scheduler (kernel/sched/core.c) automatically load balances -tasks. If one CPU is underutilized, kernel code running on that -CPU will look for tasks on other more overloaded CPUs and move those -tasks to itself, within the constraints of such placement mechanisms -as cpusets and sched_setaffinity. - -The algorithmic cost of load balancing and its impact on key shared -kernel data structures such as the task list increases more than -linearly with the number of CPUs being balanced. So the scheduler -has support to partition the systems CPUs into a number of sched -domains such that it only load balances within each sched domain. 
-Each sched domain covers some subset of the CPUs in the system; -no two sched domains overlap; some CPUs might not be in any sched -domain and hence won't be load balanced. - -Put simply, it costs less to balance between two smaller sched domains -than one big one, but doing so means that overloads in one of the -two domains won't be load balanced to the other one. - -By default, there is one sched domain covering all CPUs, including those -marked isolated using the kernel boot time "isolcpus=" argument. However, -the isolated CPUs will not participate in load balancing, and will not -have tasks running on them unless explicitly assigned. - -This default load balancing across all CPUs is not well suited for -the following two situations: - 1) On large systems, load balancing across many CPUs is expensive. - If the system is managed using cpusets to place independent jobs - on separate sets of CPUs, full load balancing is unnecessary. - 2) Systems supporting realtime on some CPUs need to minimize - system overhead on those CPUs, including avoiding task load - balancing if that is not needed. - -When the per-cpuset flag "cpuset.sched_load_balance" is enabled (the default -setting), it requests that all the CPUs in that cpusets allowed 'cpuset.cpus' -be contained in a single sched domain, ensuring that load balancing -can move a task (not otherwised pinned, as by sched_setaffinity) -from any CPU in that cpuset to any other. - -When the per-cpuset flag "cpuset.sched_load_balance" is disabled, then the -scheduler will avoid load balancing across the CPUs in that cpuset, ---except-- in so far as is necessary because some overlapping cpuset -has "sched_load_balance" enabled. - -So, for example, if the top cpuset has the flag "cpuset.sched_load_balance" -enabled, then the scheduler will have one sched domain covering all -CPUs, and the setting of the "cpuset.sched_load_balance" flag in any other -cpusets won't matter, as we're already fully load balancing. 
- -Therefore in the above two situations, the top cpuset flag -"cpuset.sched_load_balance" should be disabled, and only some of the smaller, -child cpusets have this flag enabled. - -When doing this, you don't usually want to leave any unpinned tasks in -the top cpuset that might use non-trivial amounts of CPU, as such tasks -may be artificially constrained to some subset of CPUs, depending on -the particulars of this flag setting in descendant cpusets. Even if -such a task could use spare CPU cycles in some other CPUs, the kernel -scheduler might not consider the possibility of load balancing that -task to that underused CPU. - -Of course, tasks pinned to a particular CPU can be left in a cpuset -that disables "cpuset.sched_load_balance" as those tasks aren't going anywhere -else anyway. - -There is an impedance mismatch here, between cpusets and sched domains. -Cpusets are hierarchical and nest. Sched domains are flat; they don't -overlap and each CPU is in at most one sched domain. - -It is necessary for sched domains to be flat because load balancing -across partially overlapping sets of CPUs would risk unstable dynamics -that would be beyond our understanding. So if each of two partially -overlapping cpusets enables the flag 'cpuset.sched_load_balance', then we -form a single sched domain that is a superset of both. We won't move -a task to a CPU outside its cpuset, but the scheduler load balancing -code might waste some compute cycles considering that possibility. - -This mismatch is why there is not a simple one-to-one relation -between which cpusets have the flag "cpuset.sched_load_balance" enabled, -and the sched domain configuration. If a cpuset enables the flag, it -will get balancing across all its CPUs, but if it disables the flag, -it will only be assured of no load balancing if no other overlapping -cpuset enables the flag. 
- -If two cpusets have partially overlapping 'cpuset.cpus' allowed, and only -one of them has this flag enabled, then the other may find its -tasks only partially load balanced, just on the overlapping CPUs. -This is just the general case of the top_cpuset example given a few -paragraphs above. In the general case, as in the top cpuset case, -don't leave tasks that might use non-trivial amounts of CPU in -such partially load balanced cpusets, as they may be artificially -constrained to some subset of the CPUs allowed to them, for lack of -load balancing to the other CPUs. - -CPUs in "cpuset.isolcpus" were excluded from load balancing by the -isolcpus= kernel boot option, and will never be load balanced regardless -of the value of "cpuset.sched_load_balance" in any cpuset. - -1.7.1 sched_load_balance implementation details. ------------------------------------------------- - -The per-cpuset flag 'cpuset.sched_load_balance' defaults to enabled (contrary -to most cpuset flags.) When enabled for a cpuset, the kernel will -ensure that it can load balance across all the CPUs in that cpuset -(makes sure that all the CPUs in the cpus_allowed of that cpuset are -in the same sched domain.) - -If two overlapping cpusets both have 'cpuset.sched_load_balance' enabled, -then they will be (must be) both in the same sched domain. - -If, as is the default, the top cpuset has 'cpuset.sched_load_balance' enabled, -then by the above that means there is a single sched domain covering -the whole system, regardless of any other cpuset settings. - -The kernel commits to user space that it will avoid load balancing -where it can. It will pick as fine a granularity partition of sched -domains as it can while still providing load balancing for any set -of CPUs allowed to a cpuset having 'cpuset.sched_load_balance' enabled. - -The internal kernel cpuset to scheduler interface passes from the -cpuset code to the scheduler code a partition of the load balanced -CPUs in the system. 
This partition is a set of subsets (represented -as an array of struct cpumask) of CPUs, pairwise disjoint, that cover -all the CPUs that must be load balanced. - -The cpuset code builds a new such partition and passes it to the -scheduler sched domain setup code, to have the sched domains rebuilt -as necessary, whenever: - - the 'cpuset.sched_load_balance' flag of a cpuset with non-empty CPUs changes, - - or CPUs come or go from a cpuset with this flag enabled, - - or 'cpuset.sched_relax_domain_level' value of a cpuset with non-empty CPUs - and with this flag enabled changes, - - or a cpuset with non-empty CPUs and with this flag enabled is removed, - - or a cpu is offlined/onlined. - -This partition exactly defines what sched domains the scheduler should -setup - one sched domain for each element (struct cpumask) in the -partition. - -The scheduler remembers the currently active sched domain partitions. -When the scheduler routine partition_sched_domains() is invoked from -the cpuset code to update these sched domains, it compares the new -partition requested with the current, and updates its sched domains, -removing the old and adding the new, for each change. - - -1.8 What is sched_relax_domain_level ? --------------------------------------- - -In sched domain, the scheduler migrates tasks in 2 ways; periodic load -balance on tick, and at time of some schedule events. - -When a task is woken up, scheduler try to move the task on idle CPU. -For example, if a task A running on CPU X activates another task B -on the same CPU X, and if CPU Y is X's sibling and performing idle, -then scheduler migrate task B to CPU Y so that task B can start on -CPU Y without waiting task A on CPU X. - -And if a CPU run out of tasks in its runqueue, the CPU try to pull -extra tasks from other busy CPUs to help them before it is going to -be idle. 
- -Of course it takes some searching cost to find movable tasks and/or -idle CPUs, the scheduler might not search all CPUs in the domain -every time. In fact, in some architectures, the searching ranges on -events are limited in the same socket or node where the CPU locates, -while the load balance on tick searches all. - -For example, assume CPU Z is relatively far from CPU X. Even if CPU Z -is idle while CPU X and the siblings are busy, scheduler can't migrate -woken task B from X to Z since it is out of its searching range. -As the result, task B on CPU X need to wait task A or wait load balance -on the next tick. For some applications in special situation, waiting -1 tick may be too long. - -The 'cpuset.sched_relax_domain_level' file allows you to request changing -this searching range as you like. This file takes int value which -indicates size of searching range in levels ideally as follows, -otherwise initial value -1 that indicates the cpuset has no request. - - -1 : no request. use system default or follow request of others. - 0 : no search. - 1 : search siblings (hyperthreads in a core). - 2 : search cores in a package. - 3 : search cpus in a node [= system wide on non-NUMA system] - 4 : search nodes in a chunk of node [on NUMA system] - 5 : search system wide [on NUMA system] - -The system default is architecture dependent. The system default -can be changed using the relax_domain_level= boot parameter. - -This file is per-cpuset and affect the sched domain where the cpuset -belongs to. Therefore if the flag 'cpuset.sched_load_balance' of a cpuset -is disabled, then 'cpuset.sched_relax_domain_level' have no effect since -there is no sched domain belonging the cpuset. - -If multiple cpusets are overlapping and hence they form a single sched -domain, the largest value among those is used. Be careful, if one -requests 0 and others are -1 then 0 is used. 
- -Note that modifying this file will have both good and bad effects, -and whether it is acceptable or not depends on your situation. -Don't modify this file if you are not sure. - -If your situation is: - - The migration costs between each cpu can be assumed considerably - small(for you) due to your special application's behavior or - special hardware support for CPU cache etc. - - The searching cost doesn't have impact(for you) or you can make - the searching cost enough small by managing cpuset to compact etc. - - The latency is required even it sacrifices cache hit rate etc. -then increasing 'sched_relax_domain_level' would benefit you. - - -1.9 How do I use cpusets ? --------------------------- - -In order to minimize the impact of cpusets on critical kernel -code, such as the scheduler, and due to the fact that the kernel -does not support one task updating the memory placement of another -task directly, the impact on a task of changing its cpuset CPU -or Memory Node placement, or of changing to which cpuset a task -is attached, is subtle. - -If a cpuset has its Memory Nodes modified, then for each task attached -to that cpuset, the next time that the kernel attempts to allocate -a page of memory for that task, the kernel will notice the change -in the task's cpuset, and update its per-task memory placement to -remain within the new cpusets memory placement. If the task was using -mempolicy MPOL_BIND, and the nodes to which it was bound overlap with -its new cpuset, then the task will continue to use whatever subset -of MPOL_BIND nodes are still allowed in the new cpuset. If the task -was using MPOL_BIND and now none of its MPOL_BIND nodes are allowed -in the new cpuset, then the task will be essentially treated as if it -was MPOL_BIND bound to the new cpuset (even though its NUMA placement, -as queried by get_mempolicy(), doesn't change). 
If a task is moved -from one cpuset to another, then the kernel will adjust the task's -memory placement, as above, the next time that the kernel attempts -to allocate a page of memory for that task. - -If a cpuset has its 'cpuset.cpus' modified, then each task in that cpuset -will have its allowed CPU placement changed immediately. Similarly, -if a task's pid is written to another cpuset's 'tasks' file, then its -allowed CPU placement is changed immediately. If such a task had been -bound to some subset of its cpuset using the sched_setaffinity() call, -the task will be allowed to run on any CPU allowed in its new cpuset, -negating the effect of the prior sched_setaffinity() call. - -In summary, the memory placement of a task whose cpuset is changed is -updated by the kernel, on the next allocation of a page for that task, -and the processor placement is updated immediately. - -Normally, once a page is allocated (given a physical page -of main memory) then that page stays on whatever node it -was allocated, so long as it remains allocated, even if the -cpusets memory placement policy 'cpuset.mems' subsequently changes. -If the cpuset flag file 'cpuset.memory_migrate' is set true, then when -tasks are attached to that cpuset, any pages that task had -allocated to it on nodes in its previous cpuset are migrated -to the task's new cpuset. The relative placement of the page within -the cpuset is preserved during these migration operations if possible. -For example if the page was on the second valid node of the prior cpuset -then the page will be placed on the second valid node of the new cpuset. - -Also if 'cpuset.memory_migrate' is set true, then if that cpuset's -'cpuset.mems' file is modified, pages allocated to tasks in that -cpuset, that were on nodes in the previous setting of 'cpuset.mems', -will be moved to nodes in the new setting of 'mems.' -Pages that were not in the task's prior cpuset, or in the cpuset's -prior 'cpuset.mems' setting, will not be moved. 
- -There is an exception to the above. If hotplug functionality is used -to remove all the CPUs that are currently assigned to a cpuset, -then all the tasks in that cpuset will be moved to the nearest ancestor -with non-empty cpus. But the moving of some (or all) tasks might fail if -cpuset is bound with another cgroup subsystem which has some restrictions -on task attaching. In this failing case, those tasks will stay -in the original cpuset, and the kernel will automatically update -their cpus_allowed to allow all online CPUs. When memory hotplug -functionality for removing Memory Nodes is available, a similar exception -is expected to apply there as well. In general, the kernel prefers to -violate cpuset placement, over starving a task that has had all -its allowed CPUs or Memory Nodes taken offline. - -There is a second exception to the above. GFP_ATOMIC requests are -kernel internal allocations that must be satisfied, immediately. -The kernel may drop some request, in rare cases even panic, if a -GFP_ATOMIC alloc fails. If the request cannot be satisfied within -the current task's cpuset, then we relax the cpuset, and look for -memory anywhere we can find it. It's better to violate the cpuset -than stress the kernel. - -To start a new job that is to be contained within a cpuset, the steps are: - - 1) mkdir /sys/fs/cgroup/cpuset - 2) mount -t cgroup -ocpuset cpuset /sys/fs/cgroup/cpuset - 3) Create the new cpuset by doing mkdir's and write's (or echo's) in - the /sys/fs/cgroup/cpuset virtual file system. - 4) Start a task that will be the "founding father" of the new job. - 5) Attach that task to the new cpuset by writing its pid to the - /sys/fs/cgroup/cpuset tasks file for that cpuset. - 6) fork, exec or clone the job tasks from this founding father task. 
- -For example, the following sequence of commands will setup a cpuset -named "Charlie", containing just CPUs 2 and 3, and Memory Node 1, -and then start a subshell 'sh' in that cpuset: - - mount -t cgroup -ocpuset cpuset /sys/fs/cgroup/cpuset - cd /sys/fs/cgroup/cpuset - mkdir Charlie - cd Charlie - /bin/echo 2-3 > cpuset.cpus - /bin/echo 1 > cpuset.mems - /bin/echo $$ > tasks - sh - # The subshell 'sh' is now running in cpuset Charlie - # The next line should display '/Charlie' - cat /proc/self/cpuset - -There are ways to query or modify cpusets: - - via the cpuset file system directly, using the various cd, mkdir, echo, - cat, rmdir commands from the shell, or their equivalent from C. - - via the C library libcpuset. - - via the C library libcgroup. - (http://sourceforge.net/projects/libcg/) - - via the python application cset. - (http://code.google.com/p/cpuset/) - -The sched_setaffinity calls can also be done at the shell prompt using -SGI's runon or Robert Love's taskset. The mbind and set_mempolicy -calls can be done at the shell prompt using the numactl command -(part of Andi Kleen's numa package). - -2. Usage Examples and Syntax -============================ - -2.1 Basic Usage ---------------- - -Creating, modifying, using the cpusets can be done through the cpuset -virtual filesystem. - -To mount it, type: -# mount -t cgroup -o cpuset cpuset /sys/fs/cgroup/cpuset - -Then under /sys/fs/cgroup/cpuset you can find a tree that corresponds to the -tree of the cpusets in the system. For instance, /sys/fs/cgroup/cpuset -is the cpuset that holds the whole system. - -If you want to create a new cpuset under /sys/fs/cgroup/cpuset: -# cd /sys/fs/cgroup/cpuset -# mkdir my_cpuset - -Now you want to do something with this cpuset. 
-# cd my_cpuset - -In this directory you can find several files: -# ls -cgroup.clone_children cpuset.memory_pressure -cgroup.event_control cpuset.memory_spread_page -cgroup.procs cpuset.memory_spread_slab -cpuset.cpu_exclusive cpuset.mems -cpuset.cpus cpuset.sched_load_balance -cpuset.mem_exclusive cpuset.sched_relax_domain_level -cpuset.mem_hardwall notify_on_release -cpuset.memory_migrate tasks - -Reading them will give you information about the state of this cpuset: -the CPUs and Memory Nodes it can use, the processes that are using -it, its properties. By writing to these files you can manipulate -the cpuset. - -Set some flags: -# /bin/echo 1 > cpuset.cpu_exclusive - -Add some cpus: -# /bin/echo 0-7 > cpuset.cpus - -Add some mems: -# /bin/echo 0-7 > cpuset.mems - -Now attach your shell to this cpuset: -# /bin/echo $$ > tasks - -You can also create cpusets inside your cpuset by using mkdir in this -directory. -# mkdir my_sub_cs - -To remove a cpuset, just use rmdir: -# rmdir my_sub_cs -This will fail if the cpuset is in use (has cpusets inside, or has -processes attached). - -Note that for legacy reasons, the "cpuset" filesystem exists as a -wrapper around the cgroup filesystem. - -The command - -mount -t cpuset X /sys/fs/cgroup/cpuset - -is equivalent to - -mount -t cgroup -ocpuset,noprefix X /sys/fs/cgroup/cpuset -echo "/sbin/cpuset_release_agent" > /sys/fs/cgroup/cpuset/release_agent - -2.2 Adding/removing cpus ------------------------- - -This is the syntax to use when writing in the cpus or mems files -in cpuset directories: - -# /bin/echo 1-4 > cpuset.cpus -> set cpus list to cpus 1,2,3,4 -# /bin/echo 1,2,3,4 > cpuset.cpus -> set cpus list to cpus 1,2,3,4 - -To add a CPU to a cpuset, write the new list of CPUs including the -CPU to be added. To add 6 to the above cpuset: - -# /bin/echo 1-4,6 > cpuset.cpus -> set cpus list to cpus 1,2,3,4,6 - -Similarly to remove a CPU from a cpuset, write the new list of CPUs -without the CPU to be removed. 
- -To remove all the CPUs: - -# /bin/echo "" > cpuset.cpus -> clear cpus list - -2.3 Setting flags ------------------ - -The syntax is very simple: - -# /bin/echo 1 > cpuset.cpu_exclusive -> set flag 'cpuset.cpu_exclusive' -# /bin/echo 0 > cpuset.cpu_exclusive -> unset flag 'cpuset.cpu_exclusive' - -2.4 Attaching processes ------------------------ - -# /bin/echo PID > tasks - -Note that it is PID, not PIDs. You can only attach ONE task at a time. -If you have several tasks to attach, you have to do it one after another: - -# /bin/echo PID1 > tasks -# /bin/echo PID2 > tasks - ... -# /bin/echo PIDn > tasks - - -3. Questions -============ - -Q: what's up with this '/bin/echo' ? -A: bash's builtin 'echo' command does not check calls to write() against - errors. If you use it in the cpuset file system, you won't be - able to tell whether a command succeeded or failed. - -Q: When I attach processes, only the first of the line gets really attached ! -A: We can only return one error code per call to write(). So you should also - put only ONE pid. - -4. Contact -========== - -Web: http://www.bullopensource.org/cpuset diff --git a/Documentation/cgroup-v1/devices.rst b/Documentation/cgroup-v1/devices.rst new file mode 100644 index 000000000000..e1886783961e --- /dev/null +++ b/Documentation/cgroup-v1/devices.rst @@ -0,0 +1,132 @@ +=========================== +Device Whitelist Controller +=========================== + +1. Description +============== + +Implement a cgroup to track and enforce open and mknod restrictions +on device files. A device cgroup associates a device access +whitelist with each cgroup. A whitelist entry has 4 fields. +'type' is a (all), c (char), or b (block). 'all' means it applies +to all types and all major and minor numbers. Major and minor are +either an integer or * for all. Access is a composition of r +(read), w (write), and m (mknod). + +The root device cgroup starts with rwm to 'all'. A child device +cgroup gets a copy of the parent. 
Administrators can then remove +devices from the whitelist or add new entries. A child cgroup can +never receive a device access which is denied by its parent. + +2. User Interface +================= + +An entry is added using devices.allow, and removed using +devices.deny. For instance:: + + echo 'c 1:3 mr' > /sys/fs/cgroup/1/devices.allow + +allows cgroup 1 to read and mknod the device usually known as +/dev/null. Doing:: + + echo a > /sys/fs/cgroup/1/devices.deny + +will remove the default 'a *:* rwm' entry. Doing:: + + echo a > /sys/fs/cgroup/1/devices.allow + +will add the 'a *:* rwm' entry to the whitelist. + +3. Security +=========== + +Any task can move itself between cgroups. This clearly won't +suffice, but we can decide the best way to adequately restrict +movement as people get some experience with this. We may just want +to require CAP_SYS_ADMIN, which at least is a separate bit from +CAP_MKNOD. We may want to just refuse moving to a cgroup which +isn't a descendant of the current one. Or we may want to use +CAP_MAC_ADMIN, since we really are trying to lock down root. + +CAP_SYS_ADMIN is needed to modify the whitelist or move another +task to a new cgroup. (Again we'll probably want to change that). + +A cgroup may not be granted more permissions than the cgroup's +parent has. + +4. Hierarchy +============ + +device cgroups maintain hierarchy by making sure a cgroup never has more +access permissions than its parent. Every time an entry is written to +a cgroup's devices.deny file, all its children will have that entry removed +from their whitelist and all the locally set whitelist entries will be +re-evaluated. In case one of the locally set whitelist entries would provide +more access than the cgroup's parent, it'll be removed from the whitelist. 
+ +Example:: + + A + / \ + B + + group behavior exceptions + A allow "b 8:* rwm", "c 116:1 rw" + B deny "c 1:3 rwm", "c 116:2 rwm", "b 3:* rwm" + +If a device is denied in group A:: + + # echo "c 116:* r" > A/devices.deny + +it'll propagate down and after revalidating B's entries, the whitelist entry +"c 116:2 rwm" will be removed:: + + group whitelist entries denied devices + A all "b 8:* rwm", "c 116:* rw" + B "c 1:3 rwm", "b 3:* rwm" all the rest + +In case parent's exceptions change and local exceptions are not allowed +anymore, they'll be deleted. + +Notice that new whitelist entries will not be propagated:: + + A + / \ + B + + group whitelist entries denied devices + A "c 1:3 rwm", "c 1:5 r" all the rest + B "c 1:3 rwm", "c 1:5 r" all the rest + +when adding ``c *:3 rwm``:: + + # echo "c *:3 rwm" >A/devices.allow + +the result:: + + group whitelist entries denied devices + A "c *:3 rwm", "c 1:5 r" all the rest + B "c 1:3 rwm", "c 1:5 r" all the rest + +but now it'll be possible to add new entries to B:: + + # echo "c 2:3 rwm" >B/devices.allow + # echo "c 50:3 r" >B/devices.allow + +or even:: + + # echo "c *:3 rwm" >B/devices.allow + +Allowing or denying all by writing 'a' to devices.allow or devices.deny will +not be possible once the device cgroup has children. + +4.1 Hierarchy (internal implementation) +--------------------------------------- + +device cgroups are implemented internally using a behavior (ALLOW, DENY) and a +list of exceptions. The internal state is controlled using the same user +interface to preserve compatibility with the previous whitelist-only +implementation. Removal or addition of exceptions that will reduce the access +to devices will be propagated down the hierarchy. +For every propagated exception, the effective rules will be re-evaluated based +on current parent's access rules.
diff --git a/Documentation/cgroup-v1/devices.txt b/Documentation/cgroup-v1/devices.txt deleted file mode 100644 index 3c1095ca02ea..000000000000 --- a/Documentation/cgroup-v1/devices.txt +++ /dev/null @@ -1,116 +0,0 @@ -Device Whitelist Controller - -1. Description: - -Implement a cgroup to track and enforce open and mknod restrictions -on device files. A device cgroup associates a device access -whitelist with each cgroup. A whitelist entry has 4 fields. -'type' is a (all), c (char), or b (block). 'all' means it applies -to all types and all major and minor numbers. Major and minor are -either an integer or * for all. Access is a composition of r -(read), w (write), and m (mknod). - -The root device cgroup starts with rwm to 'all'. A child device -cgroup gets a copy of the parent. Administrators can then remove -devices from the whitelist or add new entries. A child cgroup can -never receive a device access which is denied by its parent. - -2. User Interface - -An entry is added using devices.allow, and removed using -devices.deny. For instance - - echo 'c 1:3 mr' > /sys/fs/cgroup/1/devices.allow - -allows cgroup 1 to read and mknod the device usually known as -/dev/null. Doing - - echo a > /sys/fs/cgroup/1/devices.deny - -will remove the default 'a *:* rwm' entry. Doing - - echo a > /sys/fs/cgroup/1/devices.allow - -will add the 'a *:* rwm' entry to the whitelist. - -3. Security - -Any task can move itself between cgroups. This clearly won't -suffice, but we can decide the best way to adequately restrict -movement as people get some experience with this. We may just want -to require CAP_SYS_ADMIN, which at least is a separate bit from -CAP_MKNOD. We may want to just refuse moving to a cgroup which -isn't a descendant of the current one. Or we may want to use -CAP_MAC_ADMIN, since we really are trying to lock down root. - -CAP_SYS_ADMIN is needed to modify the whitelist or move another -task to a new cgroup. (Again we'll probably want to change that). 
- -A cgroup may not be granted more permissions than the cgroup's -parent has. - -4. Hierarchy - -device cgroups maintain hierarchy by making sure a cgroup never has more -access permissions than its parent. Every time an entry is written to -a cgroup's devices.deny file, all its children will have that entry removed -from their whitelist and all the locally set whitelist entries will be -re-evaluated. In case one of the locally set whitelist entries would provide -more access than the cgroup's parent, it'll be removed from the whitelist. - -Example: - A - / \ - B - - group behavior exceptions - A allow "b 8:* rwm", "c 116:1 rw" - B deny "c 1:3 rwm", "c 116:2 rwm", "b 3:* rwm" - -If a device is denied in group A: - # echo "c 116:* r" > A/devices.deny -it'll propagate down and after revalidating B's entries, the whitelist entry -"c 116:2 rwm" will be removed: - - group whitelist entries denied devices - A all "b 8:* rwm", "c 116:* rw" - B "c 1:3 rwm", "b 3:* rwm" all the rest - -In case parent's exceptions change and local exceptions are not allowed -anymore, they'll be deleted. - -Notice that new whitelist entries will not be propagated: - A - / \ - B - - group whitelist entries denied devices - A "c 1:3 rwm", "c 1:5 r" all the rest - B "c 1:3 rwm", "c 1:5 r" all the rest - -when adding "c *:3 rwm": - # echo "c *:3 rwm" >A/devices.allow - -the result: - group whitelist entries denied devices - A "c *:3 rwm", "c 1:5 r" all the rest - B "c 1:3 rwm", "c 1:5 r" all the rest - -but now it'll be possible to add new entries to B: - # echo "c 2:3 rwm" >B/devices.allow - # echo "c 50:3 r" >B/devices.allow -or even - # echo "c *:3 rwm" >B/devices.allow - -Allowing or denying all by writing 'a' to devices.allow or devices.deny will -not be possible once the device cgroups has children. - -4.1 Hierarchy (internal implementation) - -device cgroups is implemented internally using a behavior (ALLOW, DENY) and a -list of exceptions. 
The internal state is controlled using the same user -interface to preserve compatibility with the previous whitelist-only -implementation. Removal or addition of exceptions that will reduce the access -to devices will be propagated down the hierarchy. -For every propagated exception, the effective rules will be re-evaluated based -on current parent's access rules. diff --git a/Documentation/cgroup-v1/freezer-subsystem.rst b/Documentation/cgroup-v1/freezer-subsystem.rst new file mode 100644 index 000000000000..582d3427de3f --- /dev/null +++ b/Documentation/cgroup-v1/freezer-subsystem.rst @@ -0,0 +1,127 @@ +============== +Cgroup Freezer +============== + +The cgroup freezer is useful to batch job management system which start +and stop sets of tasks in order to schedule the resources of a machine +according to the desires of a system administrator. This sort of program +is often used on HPC clusters to schedule access to the cluster as a +whole. The cgroup freezer uses cgroups to describe the set of tasks to +be started/stopped by the batch job management system. It also provides +a means to start and stop the tasks composing the job. + +The cgroup freezer will also be useful for checkpointing running groups +of tasks. The freezer allows the checkpoint code to obtain a consistent +image of the tasks by attempting to force the tasks in a cgroup into a +quiescent state. Once the tasks are quiescent another task can +walk /proc or invoke a kernel interface to gather information about the +quiesced tasks. Checkpointed tasks can be restarted later should a +recoverable error occur. This also allows the checkpointed tasks to be +migrated between nodes in a cluster by copying the gathered information +to another node and restarting the tasks there. + +Sequences of SIGSTOP and SIGCONT are not always sufficient for stopping +and resuming tasks in userspace. Both of these signals are observable +from within the tasks we wish to freeze. 
While SIGSTOP cannot be caught, +blocked, or ignored it can be seen by waiting or ptracing parent tasks. +SIGCONT is especially unsuitable since it can be caught by the task. Any +programs designed to watch for SIGSTOP and SIGCONT could be broken by +attempting to use SIGSTOP and SIGCONT to stop and resume tasks. We can +demonstrate this problem using nested bash shells:: + + $ echo $$ + 16644 + $ bash + $ echo $$ + 16690 + + From a second, unrelated bash shell: + $ kill -SIGSTOP 16690 + $ kill -SIGCONT 16690 + + <at this point 16690 exits and causes 16644 to exit too> + +This happens because bash can observe both signals and choose how it +responds to them. + +Another example of a program which catches and responds to these +signals is gdb. In fact any program designed to use ptrace is likely to +have a problem with this method of stopping and resuming tasks. + +In contrast, the cgroup freezer uses the kernel freezer code to +prevent the freeze/unfreeze cycle from becoming visible to the tasks +being frozen. This allows the bash example above and gdb to run as +expected. + +The cgroup freezer is hierarchical. Freezing a cgroup freezes all +tasks belonging to the cgroup and all its descendant cgroups. Each +cgroup has its own state (self-state) and the state inherited from the +parent (parent-state). Iff both states are THAWED, the cgroup is +THAWED. + +The following cgroupfs files are created by cgroup freezer. + +* freezer.state: Read-write. + + When read, returns the effective state of the cgroup - "THAWED", + "FREEZING" or "FROZEN". This is the combined self and parent-states. + If any is freezing, the cgroup is freezing (FREEZING or FROZEN). + + FREEZING cgroup transitions into FROZEN state when all tasks + belonging to the cgroup and its descendants become frozen. Note that + a cgroup reverts to FREEZING from FROZEN after a new task is added + to the cgroup or one of its descendant cgroups until the new task is + frozen. + + When written, sets the self-state of the cgroup.
Two values are + allowed - "FROZEN" and "THAWED". If FROZEN is written, the cgroup, + if not already freezing, enters FREEZING state along with all its + descendant cgroups. + + If THAWED is written, the self-state of the cgroup is changed to + THAWED. Note that the effective state may not change to THAWED if + the parent-state is still freezing. If a cgroup's effective state + becomes THAWED, all its descendants which are freezing because of + the cgroup also leave the freezing state. + +* freezer.self_freezing: Read only. + + Shows the self-state. 0 if the self-state is THAWED; otherwise, 1. + This value is 1 iff the last write to freezer.state was "FROZEN". + +* freezer.parent_freezing: Read only. + + Shows the parent-state. 0 if none of the cgroup's ancestors is + frozen; otherwise, 1. + +The root cgroup is non-freezable and the above interface files don't +exist. + +* Examples of usage:: + + # mkdir /sys/fs/cgroup/freezer + # mount -t cgroup -ofreezer freezer /sys/fs/cgroup/freezer + # mkdir /sys/fs/cgroup/freezer/0 + # echo $some_pid > /sys/fs/cgroup/freezer/0/tasks + +to get status of the freezer subsystem:: + + # cat /sys/fs/cgroup/freezer/0/freezer.state + THAWED + +to freeze all tasks in the container:: + + # echo FROZEN > /sys/fs/cgroup/freezer/0/freezer.state + # cat /sys/fs/cgroup/freezer/0/freezer.state + FREEZING + # cat /sys/fs/cgroup/freezer/0/freezer.state + FROZEN + +to unfreeze all tasks in the container:: + + # echo THAWED > /sys/fs/cgroup/freezer/0/freezer.state + # cat /sys/fs/cgroup/freezer/0/freezer.state + THAWED + +This is the basic mechanism which should do the right thing for user space task +in a simple scenario. 
diff --git a/Documentation/cgroup-v1/freezer-subsystem.txt b/Documentation/cgroup-v1/freezer-subsystem.txt deleted file mode 100644 index e831cb2b8394..000000000000 --- a/Documentation/cgroup-v1/freezer-subsystem.txt +++ /dev/null @@ -1,123 +0,0 @@ -The cgroup freezer is useful to batch job management system which start -and stop sets of tasks in order to schedule the resources of a machine -according to the desires of a system administrator. This sort of program -is often used on HPC clusters to schedule access to the cluster as a -whole. The cgroup freezer uses cgroups to describe the set of tasks to -be started/stopped by the batch job management system. It also provides -a means to start and stop the tasks composing the job. - -The cgroup freezer will also be useful for checkpointing running groups -of tasks. The freezer allows the checkpoint code to obtain a consistent -image of the tasks by attempting to force the tasks in a cgroup into a -quiescent state. Once the tasks are quiescent another task can -walk /proc or invoke a kernel interface to gather information about the -quiesced tasks. Checkpointed tasks can be restarted later should a -recoverable error occur. This also allows the checkpointed tasks to be -migrated between nodes in a cluster by copying the gathered information -to another node and restarting the tasks there. - -Sequences of SIGSTOP and SIGCONT are not always sufficient for stopping -and resuming tasks in userspace. Both of these signals are observable -from within the tasks we wish to freeze. While SIGSTOP cannot be caught, -blocked, or ignored it can be seen by waiting or ptracing parent tasks. -SIGCONT is especially unsuitable since it can be caught by the task. Any -programs designed to watch for SIGSTOP and SIGCONT could be broken by -attempting to use SIGSTOP and SIGCONT to stop and resume tasks. 
We can -demonstrate this problem using nested bash shells: - - $ echo $$ - 16644 - $ bash - $ echo $$ - 16690 - - From a second, unrelated bash shell: - $ kill -SIGSTOP 16690 - $ kill -SIGCONT 16690 - - <at this point 16690 exits and causes 16644 to exit too> - -This happens because bash can observe both signals and choose how it -responds to them. - -Another example of a program which catches and responds to these -signals is gdb. In fact any program designed to use ptrace is likely to -have a problem with this method of stopping and resuming tasks. - -In contrast, the cgroup freezer uses the kernel freezer code to -prevent the freeze/unfreeze cycle from becoming visible to the tasks -being frozen. This allows the bash example above and gdb to run as -expected. - -The cgroup freezer is hierarchical. Freezing a cgroup freezes all -tasks belonging to the cgroup and all its descendant cgroups. Each -cgroup has its own state (self-state) and the state inherited from the -parent (parent-state). Iff both states are THAWED, the cgroup is -THAWED. - -The following cgroupfs files are created by cgroup freezer. - -* freezer.state: Read-write. - - When read, returns the effective state of the cgroup - "THAWED", - "FREEZING" or "FROZEN". This is the combined self and parent-states. - If any is freezing, the cgroup is freezing (FREEZING or FROZEN). - - FREEZING cgroup transitions into FROZEN state when all tasks - belonging to the cgroup and its descendants become frozen. Note that - a cgroup reverts to FREEZING from FROZEN after a new task is added - to the cgroup or one of its descendant cgroups until the new task is - frozen. - - When written, sets the self-state of the cgroup. Two values are - allowed - "FROZEN" and "THAWED". If FROZEN is written, the cgroup, - if not already freezing, enters FREEZING state along with all its - descendant cgroups. - - If THAWED is written, the self-state of the cgroup is changed to - THAWED. Note that the effective state may not change to THAWED if - the parent-state is still freezing.
If a cgroup's effective state - becomes THAWED, all its descendants which are freezing because of - the cgroup also leave the freezing state. - -* freezer.self_freezing: Read only. - - Shows the self-state. 0 if the self-state is THAWED; otherwise, 1. - This value is 1 iff the last write to freezer.state was "FROZEN". - -* freezer.parent_freezing: Read only. - - Shows the parent-state. 0 if none of the cgroup's ancestors is - frozen; otherwise, 1. - -The root cgroup is non-freezable and the above interface files don't -exist. - -* Examples of usage : - - # mkdir /sys/fs/cgroup/freezer - # mount -t cgroup -ofreezer freezer /sys/fs/cgroup/freezer - # mkdir /sys/fs/cgroup/freezer/0 - # echo $some_pid > /sys/fs/cgroup/freezer/0/tasks - -to get status of the freezer subsystem : - - # cat /sys/fs/cgroup/freezer/0/freezer.state - THAWED - -to freeze all tasks in the container : - - # echo FROZEN > /sys/fs/cgroup/freezer/0/freezer.state - # cat /sys/fs/cgroup/freezer/0/freezer.state - FREEZING - # cat /sys/fs/cgroup/freezer/0/freezer.state - FROZEN - -to unfreeze all tasks in the container : - - # echo THAWED > /sys/fs/cgroup/freezer/0/freezer.state - # cat /sys/fs/cgroup/freezer/0/freezer.state - THAWED - -This is the basic mechanism which should do the right thing for user space task -in a simple scenario. diff --git a/Documentation/cgroup-v1/hugetlb.rst b/Documentation/cgroup-v1/hugetlb.rst new file mode 100644 index 000000000000..a3902aa253a9 --- /dev/null +++ b/Documentation/cgroup-v1/hugetlb.rst @@ -0,0 +1,50 @@ +================== +HugeTLB Controller +================== + +The HugeTLB controller allows to limit the HugeTLB usage per control group and +enforces the controller limit during page fault. Since HugeTLB doesn't +support page reclaim, enforcing the limit at page fault time implies that, +the application will get SIGBUS signal if it tries to access HugeTLB pages +beyond its limit. 
This requires the application to know beforehand how much +HugeTLB pages it would require for its use. + +HugeTLB controller can be created by first mounting the cgroup filesystem. + +# mount -t cgroup -o hugetlb none /sys/fs/cgroup + +With the above step, the initial or the parent HugeTLB group becomes +visible at /sys/fs/cgroup. At bootup, this group includes all the tasks in +the system. /sys/fs/cgroup/tasks lists the tasks in this cgroup. + +New groups can be created under the parent group /sys/fs/cgroup:: + + # cd /sys/fs/cgroup + # mkdir g1 + # echo $$ > g1/tasks + +The above steps create a new group g1 and move the current shell +process (bash) into it. + +Brief summary of control files:: + + hugetlb.<hugepagesize>.limit_in_bytes # set/show limit of "hugepagesize" hugetlb usage + hugetlb.<hugepagesize>.max_usage_in_bytes # show max "hugepagesize" hugetlb usage recorded + hugetlb.<hugepagesize>.usage_in_bytes # show current usage for "hugepagesize" hugetlb + hugetlb.<hugepagesize>.failcnt # show the number of allocation failure due to HugeTLB limit + +For a system supporting three hugepage sizes (64k, 32M and 1G), the control +files include:: + + hugetlb.1GB.limit_in_bytes + hugetlb.1GB.max_usage_in_bytes + hugetlb.1GB.usage_in_bytes + hugetlb.1GB.failcnt + hugetlb.64KB.limit_in_bytes + hugetlb.64KB.max_usage_in_bytes + hugetlb.64KB.usage_in_bytes + hugetlb.64KB.failcnt + hugetlb.32MB.limit_in_bytes + hugetlb.32MB.max_usage_in_bytes + hugetlb.32MB.usage_in_bytes + hugetlb.32MB.failcnt diff --git a/Documentation/cgroup-v1/hugetlb.txt b/Documentation/cgroup-v1/hugetlb.txt deleted file mode 100644 index 1260e5369b9b..000000000000 --- a/Documentation/cgroup-v1/hugetlb.txt +++ /dev/null @@ -1,49 +0,0 @@ -HugeTLB Controller -------------------- - -The HugeTLB controller allows to limit the HugeTLB usage per control group and -enforces the controller limit during page fault.
Since HugeTLB doesn't -support page reclaim, enforcing the limit at page fault time implies that, -the application will get SIGBUS signal if it tries to access HugeTLB pages -beyond its limit. This requires the application to know beforehand how much -HugeTLB pages it would require for its use. - -HugeTLB controller can be created by first mounting the cgroup filesystem. - -# mount -t cgroup -o hugetlb none /sys/fs/cgroup - -With the above step, the initial or the parent HugeTLB group becomes -visible at /sys/fs/cgroup. At bootup, this group includes all the tasks in -the system. /sys/fs/cgroup/tasks lists the tasks in this cgroup. - -New groups can be created under the parent group /sys/fs/cgroup. - -# cd /sys/fs/cgroup -# mkdir g1 -# echo $$ > g1/tasks - -The above steps create a new group g1 and move the current shell -process (bash) into it. - -Brief summary of control files - - hugetlb.<hugepagesize>.limit_in_bytes # set/show limit of "hugepagesize" hugetlb usage - hugetlb.<hugepagesize>.max_usage_in_bytes # show max "hugepagesize" hugetlb usage recorded - hugetlb.<hugepagesize>.usage_in_bytes # show current usage for "hugepagesize" hugetlb - hugetlb.<hugepagesize>.failcnt # show the number of allocation failure due to HugeTLB limit - -For a system supporting three hugepage sizes (64k, 32M and 1G), the control -files include: - -hugetlb.1GB.limit_in_bytes -hugetlb.1GB.max_usage_in_bytes -hugetlb.1GB.usage_in_bytes -hugetlb.1GB.failcnt -hugetlb.64KB.limit_in_bytes -hugetlb.64KB.max_usage_in_bytes -hugetlb.64KB.usage_in_bytes -hugetlb.64KB.failcnt -hugetlb.32MB.limit_in_bytes -hugetlb.32MB.max_usage_in_bytes -hugetlb.32MB.usage_in_bytes -hugetlb.32MB.failcnt diff --git a/Documentation/cgroup-v1/index.rst b/Documentation/cgroup-v1/index.rst new file mode 100644 index 000000000000..fe76d42edc11 --- /dev/null +++ b/Documentation/cgroup-v1/index.rst @@ -0,0 +1,30 @@ +:orphan: + +======================== +Control Groups version 1 +======================== + +..
toctree:: + :maxdepth: 1 + + cgroups + + blkio-controller + cpuacct + cpusets + devices + freezer-subsystem + hugetlb + memcg_test + memory + net_cls + net_prio + pids + rdma + +.. only:: subproject and html + + Indices + ======= + + * :ref:`genindex` diff --git a/Documentation/cgroup-v1/memcg_test.rst b/Documentation/cgroup-v1/memcg_test.rst new file mode 100644 index 000000000000..91bd18c6a514 --- /dev/null +++ b/Documentation/cgroup-v1/memcg_test.rst @@ -0,0 +1,355 @@ +===================================================== +Memory Resource Controller(Memcg) Implementation Memo +===================================================== + +Last Updated: 2010/2 + +Base Kernel Version: based on 2.6.33-rc7-mm(candidate for 34). + +Because VM is getting complex (one of reasons is memcg...), memcg's behavior +is complex. This is a document for memcg's internal behavior. +Please note that implementation details can be changed. + +(*) Topics on API should be in Documentation/cgroup-v1/memory.rst) + +0. How to record usage ? +======================== + + 2 objects are used. + + page_cgroup ....an object per page. + + Allocated at boot or memory hotplug. Freed at memory hot removal. + + swap_cgroup ... an entry per swp_entry. + + Allocated at swapon(). Freed at swapoff(). + + The page_cgroup has USED bit and double count against a page_cgroup never + occurs. swap_cgroup is used only when a charged page is swapped-out. + +1. Charge +========= + + a page/swp_entry may be charged (usage += PAGE_SIZE) at + + mem_cgroup_try_charge() + +2. Uncharge +=========== + + a page/swp_entry may be uncharged (usage -= PAGE_SIZE) by + + mem_cgroup_uncharge() + Called when a page's refcount goes down to 0. + + mem_cgroup_uncharge_swap() + Called when swp_entry's refcnt goes down to 0. A charge against swap + disappears. + +3. 
charge-commit-cancel +======================= + + Memcg pages are charged in two steps: + + - mem_cgroup_try_charge() + - mem_cgroup_commit_charge() or mem_cgroup_cancel_charge() + + At try_charge(), there are no flags to say "this page is charged". + at this point, usage += PAGE_SIZE. + + At commit(), the page is associated with the memcg. + + At cancel(), simply usage -= PAGE_SIZE. + +Under below explanation, we assume CONFIG_MEM_RES_CTRL_SWAP=y. + +4. Anonymous +============ + + Anonymous page is newly allocated at + - page fault into MAP_ANONYMOUS mapping. + - Copy-On-Write. + + 4.1 Swap-in. + At swap-in, the page is taken from swap-cache. There are 2 cases. + + (a) If the SwapCache is newly allocated and read, it has no charges. + (b) If the SwapCache has been mapped by processes, it has been + charged already. + + 4.2 Swap-out. + At swap-out, typical state transition is below. + + (a) add to swap cache. (marked as SwapCache) + swp_entry's refcnt += 1. + (b) fully unmapped. + swp_entry's refcnt += # of ptes. + (c) write back to swap. + (d) delete from swap cache. (remove from SwapCache) + swp_entry's refcnt -= 1. + + + Finally, at task exit, + (e) zap_pte() is called and swp_entry's refcnt -=1 -> 0. + +5. Page Cache +============= + + Page Cache is charged at + - add_to_page_cache_locked(). + + The logic is very clear. (About migration, see below) + + Note: + __remove_from_page_cache() is called by remove_from_page_cache() + and __remove_mapping(). + +6. Shmem(tmpfs) Page Cache +=========================== + + The best way to understand shmem's page state transition is to read + mm/shmem.c. + + But brief explanation of the behavior of memcg around shmem will be + helpful to understand the logic. + + Shmem's page (just leaf page, not direct/indirect block) can be on + + - radix-tree of shmem's inode. + - SwapCache. + - Both on radix-tree and SwapCache. This happens at swap-in + and swap-out, + + It's charged when... 
+ + - A new page is added to shmem's radix-tree. + - A swp page is read. (move a charge from swap_cgroup to page_cgroup) + +7. Page Migration +================= + + mem_cgroup_migrate() + +8. LRU +====== + Each memcg has its own private LRU. Now, its handling is under global + VM's control (means that it's handled under global pgdat->lru_lock). + Almost all routines around memcg's LRU is called by global LRU's + list management functions under pgdat->lru_lock. + + A special function is mem_cgroup_isolate_pages(). This scans + memcg's private LRU and call __isolate_lru_page() to extract a page + from LRU. + + (By __isolate_lru_page(), the page is removed from both of global and + private LRU.) + + +9. Typical Tests. +================= + + Tests for racy cases. + +9.1 Small limit to memcg. +------------------------- + + When you do test to do racy case, it's good test to set memcg's limit + to be very small rather than GB. Many races found in the test under + xKB or xxMB limits. + + (Memory behavior under GB and Memory behavior under MB shows very + different situation.) + +9.2 Shmem +--------- + + Historically, memcg's shmem handling was poor and we saw some amount + of troubles here. This is because shmem is page-cache but can be + SwapCache. Test with shmem/tmpfs is always good test. + +9.3 Migration +------------- + + For NUMA, migration is an another special case. To do easy test, cpuset + is useful. Following is a sample script to do migration:: + + mount -t cgroup -o cpuset none /opt/cpuset + + mkdir /opt/cpuset/01 + echo 1 > /opt/cpuset/01/cpuset.cpus + echo 0 > /opt/cpuset/01/cpuset.mems + echo 1 > /opt/cpuset/01/cpuset.memory_migrate + mkdir /opt/cpuset/02 + echo 1 > /opt/cpuset/02/cpuset.cpus + echo 1 > /opt/cpuset/02/cpuset.mems + echo 1 > /opt/cpuset/02/cpuset.memory_migrate + + In above set, when you moves a task from 01 to 02, page migration to + node 0 to node 1 will occur. 
Following is a script to migrate all + under cpuset.:: + + -- + move_task() + { + for pid in $1 + do + /bin/echo $pid >$2/tasks 2>/dev/null + echo -n $pid + echo -n " " + done + echo END + } + + G1_TASK=`cat ${G1}/tasks` + G2_TASK=`cat ${G2}/tasks` + move_task "${G1_TASK}" ${G2} & + -- + +9.4 Memory hotplug +------------------ + + memory hotplug test is one of good test. + + to offline memory, do following:: + + # echo offline > /sys/devices/system/memory/memoryXXX/state + + (XXX is the place of memory) + + This is an easy way to test page migration, too. + +9.5 mkdir/rmdir +--------------- + + When using hierarchy, mkdir/rmdir test should be done. + Use tests like the following:: + + echo 1 >/opt/cgroup/01/memory/use_hierarchy + mkdir /opt/cgroup/01/child_a + mkdir /opt/cgroup/01/child_b + + set limit to 01. + add limit to 01/child_b + run jobs under child_a and child_b + + create/delete following groups at random while jobs are running:: + + /opt/cgroup/01/child_a/child_aa + /opt/cgroup/01/child_b/child_bb + /opt/cgroup/01/child_c + + running new jobs in new group is also good. + +9.6 Mount with other subsystems +------------------------------- + + Mounting with other subsystems is a good test because there is a + race and lock dependency with other cgroup subsystems. + + example:: + + # mount -t cgroup none /cgroup -o cpuset,memory,cpu,devices + + and do task move, mkdir, rmdir etc...under this. + +9.7 swapoff +----------- + + Besides management of swap is one of complicated parts of memcg, + call path of swap-in at swapoff is not same as usual swap-in path.. + It's worth to be tested explicitly. + + For example, test like following is good: + + (Shell-A):: + + # mount -t cgroup none /cgroup -o memory + # mkdir /cgroup/test + # echo 40M > /cgroup/test/memory.limit_in_bytes + # echo 0 > /cgroup/test/tasks + + Run malloc(100M) program under this. You'll see 60M of swaps. 
+ + (Shell-B):: + + # move all tasks in /cgroup/test to /cgroup + # /sbin/swapoff -a + # rmdir /cgroup/test + # kill malloc task. + + Of course, tmpfs v.s. swapoff test should be tested, too. + +9.8 OOM-Killer +-------------- + + Out-of-memory caused by memcg's limit will kill tasks under + the memcg. When hierarchy is used, a task under hierarchy + will be killed by the kernel. + + In this case, panic_on_oom shouldn't be invoked and tasks + in other groups shouldn't be killed. + + It's not difficult to cause OOM under memcg as following. + + Case A) when you can swapoff:: + + #swapoff -a + #echo 50M > /memory.limit_in_bytes + + run 51M of malloc + + Case B) when you use mem+swap limitation:: + + #echo 50M > memory.limit_in_bytes + #echo 50M > memory.memsw.limit_in_bytes + + run 51M of malloc + +9.9 Move charges at task migration +---------------------------------- + + Charges associated with a task can be moved along with task migration. + + (Shell-A):: + + #mkdir /cgroup/A + #echo $$ >/cgroup/A/tasks + + run some programs which uses some amount of memory in /cgroup/A. + + (Shell-B):: + + #mkdir /cgroup/B + #echo 1 >/cgroup/B/memory.move_charge_at_immigrate + #echo "pid of the program running in group A" >/cgroup/B/tasks + + You can see charges have been moved by reading ``*.usage_in_bytes`` or + memory.stat of both A and B. + + See 8.2 of Documentation/cgroup-v1/memory.rst to see what value should + be written to move_charge_at_immigrate. + +9.10 Memory thresholds +---------------------- + + Memory controller implements memory thresholds using cgroups notification + API. You can use tools/cgroup/cgroup_event_listener.c to test it. 
+ + (Shell-A) Create cgroup and run event listener:: + + # mkdir /cgroup/A + # ./cgroup_event_listener /cgroup/A/memory.usage_in_bytes 5M + + (Shell-B) Add task to cgroup and try to allocate and free memory:: + + # echo $$ >/cgroup/A/tasks + # a="$(dd if=/dev/zero bs=1M count=10)" + # a= + + You will see message from cgroup_event_listener every time you cross + the thresholds. + + Use /cgroup/A/memory.memsw.usage_in_bytes to test memsw thresholds. + + It's good idea to test root cgroup as well. diff --git a/Documentation/cgroup-v1/memcg_test.txt b/Documentation/cgroup-v1/memcg_test.txt deleted file mode 100644 index 621e29ffb358..000000000000 --- a/Documentation/cgroup-v1/memcg_test.txt +++ /dev/null @@ -1,280 +0,0 @@ -Memory Resource Controller(Memcg) Implementation Memo. -Last Updated: 2010/2 -Base Kernel Version: based on 2.6.33-rc7-mm(candidate for 34). - -Because VM is getting complex (one of reasons is memcg...), memcg's behavior -is complex. This is a document for memcg's internal behavior. -Please note that implementation details can be changed. - -(*) Topics on API should be in Documentation/cgroup-v1/memory.txt) - -0. How to record usage ? - 2 objects are used. - - page_cgroup ....an object per page. - Allocated at boot or memory hotplug. Freed at memory hot removal. - - swap_cgroup ... an entry per swp_entry. - Allocated at swapon(). Freed at swapoff(). - - The page_cgroup has USED bit and double count against a page_cgroup never - occurs. swap_cgroup is used only when a charged page is swapped-out. - -1. Charge - - a page/swp_entry may be charged (usage += PAGE_SIZE) at - - mem_cgroup_try_charge() - -2. Uncharge - a page/swp_entry may be uncharged (usage -= PAGE_SIZE) by - - mem_cgroup_uncharge() - Called when a page's refcount goes down to 0. - - mem_cgroup_uncharge_swap() - Called when swp_entry's refcnt goes down to 0. A charge against swap - disappears. - -3. 
charge-commit-cancel - Memcg pages are charged in two steps: - mem_cgroup_try_charge() - mem_cgroup_commit_charge() or mem_cgroup_cancel_charge() - - At try_charge(), there are no flags to say "this page is charged". - at this point, usage += PAGE_SIZE. - - At commit(), the page is associated with the memcg. - - At cancel(), simply usage -= PAGE_SIZE. - -Under below explanation, we assume CONFIG_MEM_RES_CTRL_SWAP=y. - -4. Anonymous - Anonymous page is newly allocated at - - page fault into MAP_ANONYMOUS mapping. - - Copy-On-Write. - - 4.1 Swap-in. - At swap-in, the page is taken from swap-cache. There are 2 cases. - - (a) If the SwapCache is newly allocated and read, it has no charges. - (b) If the SwapCache has been mapped by processes, it has been - charged already. - - 4.2 Swap-out. - At swap-out, typical state transition is below. - - (a) add to swap cache. (marked as SwapCache) - swp_entry's refcnt += 1. - (b) fully unmapped. - swp_entry's refcnt += # of ptes. - (c) write back to swap. - (d) delete from swap cache. (remove from SwapCache) - swp_entry's refcnt -= 1. - - - Finally, at task exit, - (e) zap_pte() is called and swp_entry's refcnt -=1 -> 0. - -5. Page Cache - Page Cache is charged at - - add_to_page_cache_locked(). - - The logic is very clear. (About migration, see below) - Note: __remove_from_page_cache() is called by remove_from_page_cache() - and __remove_mapping(). - -6. Shmem(tmpfs) Page Cache - The best way to understand shmem's page state transition is to read - mm/shmem.c. - But brief explanation of the behavior of memcg around shmem will be - helpful to understand the logic. - - Shmem's page (just leaf page, not direct/indirect block) can be on - - radix-tree of shmem's inode. - - SwapCache. - - Both on radix-tree and SwapCache. This happens at swap-in - and swap-out, - - It's charged when... - - A new page is added to shmem's radix-tree. - - A swp page is read. (move a charge from swap_cgroup to page_cgroup) - -7. 
Page Migration - - mem_cgroup_migrate() - -8. LRU - Each memcg has its own private LRU. Now, its handling is under global - VM's control (means that it's handled under global pgdat->lru_lock). - Almost all routines around memcg's LRU is called by global LRU's - list management functions under pgdat->lru_lock. - - A special function is mem_cgroup_isolate_pages(). This scans - memcg's private LRU and call __isolate_lru_page() to extract a page - from LRU. - (By __isolate_lru_page(), the page is removed from both of global and - private LRU.) - - -9. Typical Tests. - - Tests for racy cases. - - 9.1 Small limit to memcg. - When you do test to do racy case, it's good test to set memcg's limit - to be very small rather than GB. Many races found in the test under - xKB or xxMB limits. - (Memory behavior under GB and Memory behavior under MB shows very - different situation.) - - 9.2 Shmem - Historically, memcg's shmem handling was poor and we saw some amount - of troubles here. This is because shmem is page-cache but can be - SwapCache. Test with shmem/tmpfs is always good test. - - 9.3 Migration - For NUMA, migration is an another special case. To do easy test, cpuset - is useful. Following is a sample script to do migration. - - mount -t cgroup -o cpuset none /opt/cpuset - - mkdir /opt/cpuset/01 - echo 1 > /opt/cpuset/01/cpuset.cpus - echo 0 > /opt/cpuset/01/cpuset.mems - echo 1 > /opt/cpuset/01/cpuset.memory_migrate - mkdir /opt/cpuset/02 - echo 1 > /opt/cpuset/02/cpuset.cpus - echo 1 > /opt/cpuset/02/cpuset.mems - echo 1 > /opt/cpuset/02/cpuset.memory_migrate - - In above set, when you moves a task from 01 to 02, page migration to - node 0 to node 1 will occur. Following is a script to migrate all - under cpuset. - -- - move_task() - { - for pid in $1 - do - /bin/echo $pid >$2/tasks 2>/dev/null - echo -n $pid - echo -n " " - done - echo END - } - - G1_TASK=`cat ${G1}/tasks` - G2_TASK=`cat ${G2}/tasks` - move_task "${G1_TASK}" ${G2} & - -- - 9.4 Memory hotplug. 
- memory hotplug test is one of good test. - to offline memory, do following. - # echo offline > /sys/devices/system/memory/memoryXXX/state - (XXX is the place of memory) - This is an easy way to test page migration, too. - - 9.5 mkdir/rmdir - When using hierarchy, mkdir/rmdir test should be done. - Use tests like the following. - - echo 1 >/opt/cgroup/01/memory/use_hierarchy - mkdir /opt/cgroup/01/child_a - mkdir /opt/cgroup/01/child_b - - set limit to 01. - add limit to 01/child_b - run jobs under child_a and child_b - - create/delete following groups at random while jobs are running. - /opt/cgroup/01/child_a/child_aa - /opt/cgroup/01/child_b/child_bb - /opt/cgroup/01/child_c - - running new jobs in new group is also good. - - 9.6 Mount with other subsystems. - Mounting with other subsystems is a good test because there is a - race and lock dependency with other cgroup subsystems. - - example) - # mount -t cgroup none /cgroup -o cpuset,memory,cpu,devices - - and do task move, mkdir, rmdir etc...under this. - - 9.7 swapoff. - Besides management of swap is one of complicated parts of memcg, - call path of swap-in at swapoff is not same as usual swap-in path.. - It's worth to be tested explicitly. - - For example, test like following is good. - (Shell-A) - # mount -t cgroup none /cgroup -o memory - # mkdir /cgroup/test - # echo 40M > /cgroup/test/memory.limit_in_bytes - # echo 0 > /cgroup/test/tasks - Run malloc(100M) program under this. You'll see 60M of swaps. - (Shell-B) - # move all tasks in /cgroup/test to /cgroup - # /sbin/swapoff -a - # rmdir /cgroup/test - # kill malloc task. - - Of course, tmpfs v.s. swapoff test should be tested, too. - - 9.8 OOM-Killer - Out-of-memory caused by memcg's limit will kill tasks under - the memcg. When hierarchy is used, a task under hierarchy - will be killed by the kernel. - In this case, panic_on_oom shouldn't be invoked and tasks - in other groups shouldn't be killed. 
- - It's not difficult to cause OOM under memcg as following. - Case A) when you can swapoff - #swapoff -a - #echo 50M > /memory.limit_in_bytes - run 51M of malloc - - Case B) when you use mem+swap limitation. - #echo 50M > memory.limit_in_bytes - #echo 50M > memory.memsw.limit_in_bytes - run 51M of malloc - - 9.9 Move charges at task migration - Charges associated with a task can be moved along with task migration. - - (Shell-A) - #mkdir /cgroup/A - #echo $$ >/cgroup/A/tasks - run some programs which uses some amount of memory in /cgroup/A. - - (Shell-B) - #mkdir /cgroup/B - #echo 1 >/cgroup/B/memory.move_charge_at_immigrate - #echo "pid of the program running in group A" >/cgroup/B/tasks - - You can see charges have been moved by reading *.usage_in_bytes or - memory.stat of both A and B. - See 8.2 of Documentation/cgroup-v1/memory.txt to see what value should be - written to move_charge_at_immigrate. - - 9.10 Memory thresholds - Memory controller implements memory thresholds using cgroups notification - API. You can use tools/cgroup/cgroup_event_listener.c to test it. - - (Shell-A) Create cgroup and run event listener - # mkdir /cgroup/A - # ./cgroup_event_listener /cgroup/A/memory.usage_in_bytes 5M - - (Shell-B) Add task to cgroup and try to allocate and free memory - # echo $$ >/cgroup/A/tasks - # a="$(dd if=/dev/zero bs=1M count=10)" - # a= - - You will see message from cgroup_event_listener every time you cross - the thresholds. - - Use /cgroup/A/memory.memsw.usage_in_bytes to test memsw thresholds. - - It's good idea to test root cgroup as well. diff --git a/Documentation/cgroup-v1/memory.rst b/Documentation/cgroup-v1/memory.rst new file mode 100644 index 000000000000..41bdc038dad9 --- /dev/null +++ b/Documentation/cgroup-v1/memory.rst @@ -0,0 +1,1003 @@ +========================== +Memory Resource Controller +========================== + +NOTE: + This document is hopelessly outdated and it asks for a complete + rewrite. 
It still contains useful information so we are keeping it + here but make sure to check the current code if you need a deeper + understanding. + +NOTE: + The Memory Resource Controller has generically been referred to as the + memory controller in this document. Do not confuse memory controller + used here with the memory controller that is used in hardware. + +(For editors) In this document: + When we mention a cgroup (cgroupfs's directory) with memory controller, + we call it "memory cgroup". When you see git-log and source code, you'll + see patch's title and function names tend to use "memcg". + In this document, we avoid using it. + +Benefits and Purpose of the memory controller +============================================= + +The memory controller isolates the memory behaviour of a group of tasks +from the rest of the system. The article on LWN [12] mentions some probable +uses of the memory controller. The memory controller can be used to + +a. Isolate an application or a group of applications + Memory-hungry applications can be isolated and limited to a smaller + amount of memory. +b. Create a cgroup with a limited amount of memory; this can be used + as a good alternative to booting with mem=XXXX. +c. Virtualization solutions can control the amount of memory they want + to assign to a virtual machine instance. +d. A CD/DVD burner could control the amount of memory used by the + rest of the system to ensure that burning does not fail due to lack + of available memory. +e. There are several other use cases; find one or use the controller just + for fun (to learn and hack on the VM subsystem). + +Current Status: linux-2.6.34-mmotm(development version of 2010/April) + +Features: + + - accounting anonymous pages, file caches, swap caches usage and limiting them. + - pages are linked to per-memcg LRU exclusively, and there is no global LRU. + - optionally, memory+swap usage can be accounted and limited.
+ - hierarchical accounting + - soft limit + - moving (recharging) account at moving a task is selectable. + - usage threshold notifier + - memory pressure notifier + - oom-killer disable knob and oom-notifier + - Root cgroup has no limit controls. + + Kernel memory support is a work in progress, and the current version provides + basically functionality. (See Section 2.7) + +Brief summary of control files. + +==================================== ========================================== + tasks attach a task(thread) and show list of + threads + cgroup.procs show list of processes + cgroup.event_control an interface for event_fd() + memory.usage_in_bytes show current usage for memory + (See 5.5 for details) + memory.memsw.usage_in_bytes show current usage for memory+Swap + (See 5.5 for details) + memory.limit_in_bytes set/show limit of memory usage + memory.memsw.limit_in_bytes set/show limit of memory+Swap usage + memory.failcnt show the number of memory usage hits limits + memory.memsw.failcnt show the number of memory+Swap hits limits + memory.max_usage_in_bytes show max memory usage recorded + memory.memsw.max_usage_in_bytes show max memory+Swap usage recorded + memory.soft_limit_in_bytes set/show soft limit of memory usage + memory.stat show various statistics + memory.use_hierarchy set/show hierarchical account enabled + memory.force_empty trigger forced page reclaim + memory.pressure_level set memory pressure notifications + memory.swappiness set/show swappiness parameter of vmscan + (See sysctl's vm.swappiness) + memory.move_charge_at_immigrate set/show controls of moving charges + memory.oom_control set/show oom controls. 
+ memory.numa_stat show the number of memory usage per numa + node + + memory.kmem.limit_in_bytes set/show hard limit for kernel memory + memory.kmem.usage_in_bytes show current kernel memory allocation + memory.kmem.failcnt show the number of kernel memory usage + hits limits + memory.kmem.max_usage_in_bytes show max kernel memory usage recorded + + memory.kmem.tcp.limit_in_bytes set/show hard limit for tcp buf memory + memory.kmem.tcp.usage_in_bytes show current tcp buf memory allocation + memory.kmem.tcp.failcnt show the number of tcp buf memory usage + hits limits + memory.kmem.tcp.max_usage_in_bytes show max tcp buf memory usage recorded +==================================== ========================================== + +1. History +========== + +The memory controller has a long history. A request for comments for the memory +controller was posted by Balbir Singh [1]. At the time the RFC was posted +there were several implementations for memory control. The goal of the +RFC was to build consensus and agreement for the minimal features required +for memory control. The first RSS controller was posted by Balbir Singh[2] +in Feb 2007. Pavel Emelianov [3][4][5] has since posted three versions of the +RSS controller. At OLS, at the resource management BoF, everyone suggested +that we handle both page cache and RSS together. Another request was raised +to allow user space handling of OOM. The current memory controller is +at version 6; it combines both mapped (RSS) and unmapped Page +Cache Control [11]. + +2. Memory Control +================= + +Memory is a unique resource in the sense that it is present in a limited +amount. If a task requires a lot of CPU processing, the task can spread +its processing over a period of hours, days, months or years, but with +memory, the same physical memory needs to be reused to accomplish the task. + +The memory controller implementation has been divided into phases. These +are: + +1. Memory controller +2. mlock(2) controller +3. 
Kernel user memory accounting and slab control +4. user mappings length controller + +The memory controller is the first controller developed. + +2.1. Design +----------- + +The core of the design is a counter called the page_counter. The +page_counter tracks the current memory usage and limit of the group of +processes associated with the controller. Each cgroup has a memory controller +specific data structure (mem_cgroup) associated with it. + +2.2. Accounting +--------------- + +:: + + +--------------------+ + | mem_cgroup | + | (page_counter) | + +--------------------+ + / ^ \ + / | \ + +---------------+ | +---------------+ + | mm_struct | |.... | mm_struct | + | | | | | + +---------------+ | +---------------+ + | + + --------------+ + | + +---------------+ +------+--------+ + | page +----------> page_cgroup| + | | | | + +---------------+ +---------------+ + + (Figure 1: Hierarchy of Accounting) + + +Figure 1 shows the important aspects of the controller + +1. Accounting happens per cgroup +2. Each mm_struct knows about which cgroup it belongs to +3. Each page has a pointer to the page_cgroup, which in turn knows the + cgroup it belongs to + +The accounting is done as follows: mem_cgroup_charge_common() is invoked to +set up the necessary data structures and check if the cgroup that is being +charged is over its limit. If it is, then reclaim is invoked on the cgroup. +More details can be found in the reclaim section of this document. +If everything goes well, a page meta-data-structure called page_cgroup is +updated. page_cgroup has its own LRU on cgroup. +(*) page_cgroup structure is allocated at boot/memory-hotplug time. + +2.2.1 Accounting details +------------------------ + +All mapped anon pages (RSS) and cache pages (Page Cache) are accounted. +Some pages which are never reclaimable and will not be on the LRU +are not accounted. We just account pages under usual VM management. 
+ +RSS pages are accounted at page_fault unless they've already been accounted +for earlier. A file page will be accounted for as Page Cache when it's +inserted into inode (radix-tree). While it's mapped into the page tables of +processes, duplicate accounting is carefully avoided. + +An RSS page is unaccounted when it's fully unmapped. A PageCache page is +unaccounted when it's removed from radix-tree. Even if RSS pages are fully +unmapped (by kswapd), they may exist as SwapCache in the system until they +are really freed. Such SwapCaches are also accounted. +A swapped-in page is not accounted until it's mapped. + +Note: The kernel does swapin-readahead and reads multiple swaps at once. +This means swapped-in pages may contain pages for other tasks than a task +causing page fault. So, we avoid accounting at swap-in I/O. + +At page migration, accounting information is kept. + +Note: we just account pages-on-LRU because our purpose is to control amount +of used pages; not-on-LRU pages tend to be out-of-control from VM view. + +2.3 Shared Page Accounting +-------------------------- + +Shared pages are accounted on the basis of the first touch approach. The +cgroup that first touches a page is accounted for the page. The principle +behind this approach is that a cgroup that aggressively uses a shared +page will eventually get charged for it (once it is uncharged from +the cgroup that brought it in -- this will happen on memory pressure). + +But see section 8.2: when moving a task to another cgroup, its pages may +be recharged to the new cgroup, if move_charge_at_immigrate has been chosen. + +Exception: If CONFIG_MEMCG_SWAP is not used. +When you do swapoff and make swapped-out pages of shmem(tmpfs) to +be backed into memory in force, charges for pages are accounted against the +caller of swapoff rather than the users of shmem. + +2.4 Swap Extension (CONFIG_MEMCG_SWAP) +-------------------------------------- + +Swap Extension allows you to record charge for swap. 
A swapped-in page is +charged back to original page allocator if possible. + +When swap is accounted, following files are added. + + - memory.memsw.usage_in_bytes. + - memory.memsw.limit_in_bytes. + +memsw means memory+swap. Usage of memory+swap is limited by +memsw.limit_in_bytes. + +Example: Assume a system with 4G of swap. A task which allocates 6G of memory +(by mistake) under 2G memory limitation will use all swap. +In this case, setting memsw.limit_in_bytes=3G will prevent bad use of swap. +By using the memsw limit, you can avoid system OOM which can be caused by swap +shortage. + +**why 'memory+swap' rather than swap** + +The global LRU(kswapd) can swap out arbitrary pages. Swap-out means +to move account from memory to swap...there is no change in usage of +memory+swap. In other words, when we want to limit the usage of swap without +affecting global LRU, memory+swap limit is better than just limiting swap from +an OS point of view. + +**What happens when a cgroup hits memory.memsw.limit_in_bytes** + +When a cgroup hits memory.memsw.limit_in_bytes, it's useless to do swap-out +in this cgroup. Then, swap-out will not be done by cgroup routine and file +caches are dropped. But as mentioned above, global LRU can do swapout memory +from it for sanity of the system's memory management state. You can't forbid +it by cgroup. + +2.5 Reclaim +----------- + +Each cgroup maintains a per cgroup LRU which has the same structure as +global VM. When a cgroup goes over its limit, we first try +to reclaim memory from the cgroup so as to make space for the new +pages that the cgroup has touched. If the reclaim is unsuccessful, +an OOM routine is invoked to select and kill the bulkiest task in the +cgroup. (See 10. OOM Control below.) + +The reclaim algorithm has not been modified for cgroups, except that +pages that are selected for reclaiming come from the per-cgroup LRU +list. 
+ +NOTE: + Reclaim does not work for the root cgroup, since we cannot set any + limits on the root cgroup. + +Note2: + When panic_on_oom is set to "2", the whole system will panic. + +When oom event notifier is registered, event will be delivered. +(See oom_control section) + +2.6 Locking +----------- + + lock_page_cgroup()/unlock_page_cgroup() should not be called under + the i_pages lock. + + Other lock order is following: + + PG_locked. + mm->page_table_lock + pgdat->lru_lock + lock_page_cgroup. + + In many cases, just lock_page_cgroup() is called. + + per-zone-per-cgroup LRU (cgroup's private LRU) is just guarded by + pgdat->lru_lock, it has no lock of its own. + +2.7 Kernel Memory Extension (CONFIG_MEMCG_KMEM) +----------------------------------------------- + +With the Kernel memory extension, the Memory Controller is able to limit +the amount of kernel memory used by the system. Kernel memory is fundamentally +different than user memory, since it can't be swapped out, which makes it +possible to DoS the system by consuming too much of this precious resource. + +Kernel memory accounting is enabled for all memory cgroups by default. But +it can be disabled system-wide by passing cgroup.memory=nokmem to the kernel +at boot time. In this case, kernel memory will not be accounted at all. + +Kernel memory limits are not imposed for the root cgroup. Usage for the root +cgroup may or may not be accounted. The memory used is accumulated into +memory.kmem.usage_in_bytes, or in a separate counter when it makes sense. +(currently only for tcp). + +The main "kmem" counter is fed into the main counter, so kmem charges will +also be visible from the user counter. + +Currently no soft limit is implemented for kernel memory. It is future work +to trigger slab reclaim when those limits are reached. + +2.7.1 Current Kernel Memory resources accounted +----------------------------------------------- + +stack pages: + every process consumes some stack pages. 
By accounting into + kernel memory, we prevent new processes from being created when the kernel + memory usage is too high. + +slab pages: + pages allocated by the SLAB or SLUB allocator are tracked. A copy + of each kmem_cache is created the first time the cache is touched + from inside the memcg. The creation is done lazily, so some objects can still be + skipped while the cache is being created. All objects in a slab page should + belong to the same memcg. This only fails to hold when a task is migrated to a + different memcg during the page allocation by the cache. + +sockets memory pressure: + some sockets protocols have memory pressure + thresholds. The Memory Controller allows them to be controlled individually + per cgroup, instead of globally. + +tcp memory pressure: + sockets memory pressure for the tcp protocol. + +2.7.2 Common use cases +---------------------- + +Because the "kmem" counter is fed to the main user counter, kernel memory can +never be limited completely independently of user memory. Say "U" is the user +limit, and "K" the kernel limit. There are three possible ways limits can be +set: + +U != 0, K = unlimited: + This is the standard memcg limitation mechanism already present before kmem + accounting. Kernel memory is completely ignored. + +U != 0, K < U: + Kernel memory is a subset of the user memory. This setup is useful in + deployments where the total amount of memory per-cgroup is overcommitted. + Overcommitting kernel memory limits is definitely not recommended, since the + box can still run out of non-reclaimable memory. + In this case, the admin could set up K so that the sum of all groups is + never greater than the total memory, and freely set U at the cost of his + QoS. + +WARNING: + In the current implementation, memory reclaim will NOT be + triggered for a cgroup when it hits K while staying below U, which makes + this setup impractical.
+ +U != 0, K >= U: + Since kmem charges will also be fed to the user counter and reclaim will be + triggered for the cgroup for both kinds of memory. This setup gives the + admin a unified view of memory, and it is also useful for people who just + want to track kernel memory usage. + +3. User Interface +================= + +3.0. Configuration +------------------ + +a. Enable CONFIG_CGROUPS +b. Enable CONFIG_MEMCG +c. Enable CONFIG_MEMCG_SWAP (to use swap extension) +d. Enable CONFIG_MEMCG_KMEM (to use kmem extension) + +3.1. Prepare the cgroups (see cgroups.txt, Why are cgroups needed?) +------------------------------------------------------------------- + +:: + + # mount -t tmpfs none /sys/fs/cgroup + # mkdir /sys/fs/cgroup/memory + # mount -t cgroup none /sys/fs/cgroup/memory -o memory + +3.2. Make the new group and move bash into it:: + + # mkdir /sys/fs/cgroup/memory/0 + # echo $$ > /sys/fs/cgroup/memory/0/tasks + +Since now we're in the 0 cgroup, we can alter the memory limit:: + + # echo 4M > /sys/fs/cgroup/memory/0/memory.limit_in_bytes + +NOTE: + We can use a suffix (k, K, m, M, g or G) to indicate values in kilo, + mega or gigabytes. (Here, Kilo, Mega, Giga are Kibibytes, Mebibytes, + Gibibytes.) + +NOTE: + We can write "-1" to reset the ``*.limit_in_bytes(unlimited)``. + +NOTE: + We cannot set limits on the root cgroup any more. + +:: + + # cat /sys/fs/cgroup/memory/0/memory.limit_in_bytes + 4194304 + +We can check the usage:: + + # cat /sys/fs/cgroup/memory/0/memory.usage_in_bytes + 1216512 + +A successful write to this file does not guarantee a successful setting of +this limit to the value written into the file. This can be due to a +number of factors, such as rounding up to page boundaries or the total +availability of memory on the system. 
The user is required to re-read +this file after a write to guarantee the value committed by the kernel:: + + # echo 1 > memory.limit_in_bytes + # cat memory.limit_in_bytes + 4096 + +The memory.failcnt field gives the number of times that the cgroup limit was +exceeded. + +The memory.stat file gives accounting information. Now, the number of +caches, RSS and Active pages/Inactive pages are shown. + +4. Testing +========== + +For testing features and implementation, see memcg_test.txt. + +Performance test is also important. To see pure memory controller's overhead, +testing on tmpfs will give you good numbers of small overheads. +Example: do kernel make on tmpfs. + +Page-fault scalability is also important. At measuring parallel +page fault test, multi-process test may be better than multi-thread +test because it has noise of shared objects/status. + +But the above two are testing extreme situations. +Trying usual test under memory controller is always helpful. + +4.1 Troubleshooting +------------------- + +Sometimes a user might find that the application under a cgroup is +terminated by the OOM killer. There are several causes for this: + +1. The cgroup limit is too low (just too low to do anything useful) +2. The user is using anonymous memory and swap is turned off or too low + +A sync followed by echo 1 > /proc/sys/vm/drop_caches will help get rid of +some of the pages cached in the cgroup (page cache pages). + +To know what happens, disabling OOM_Kill as per "10. OOM Control" (below) and +seeing what happens will be helpful. + +4.2 Task migration +------------------ + +When a task migrates from one cgroup to another, its charge is not +carried forward by default. The pages allocated from the original cgroup still +remain charged to it, the charge is dropped when the page is freed or +reclaimed. + +You can move charges of a task along with task migration. +See 8. 
"Move charges at task migration" + +4.3 Removing a cgroup +--------------------- + +A cgroup can be removed by rmdir, but as discussed in sections 4.1 and 4.2, a +cgroup might have some charge associated with it, even though all +tasks have migrated away from it. (because we charge against pages, not +against tasks.) + +We move the stats to root (if use_hierarchy==0) or parent (if +use_hierarchy==1), and no change on the charge except uncharging +from the child. + +Charges recorded in swap information is not updated at removal of cgroup. +Recorded information is discarded and a cgroup which uses swap (swapcache) +will be charged as a new owner of it. + +About use_hierarchy, see Section 6. + +5. Misc. interfaces +=================== + +5.1 force_empty +--------------- + memory.force_empty interface is provided to make cgroup's memory usage empty. + When writing anything to this:: + + # echo 0 > memory.force_empty + + the cgroup will be reclaimed and as many pages reclaimed as possible. + + The typical use case for this interface is before calling rmdir(). + Though rmdir() offlines memcg, but the memcg may still stay there due to + charged file caches. Some out-of-use page caches may keep charged until + memory pressure happens. If you want to avoid that, force_empty will be useful. + + Also, note that when memory.kmem.limit_in_bytes is set the charges due to + kernel pages will still be seen. This is not considered a failure and the + write will still return success. In this case, it is expected that + memory.kmem.usage_in_bytes == memory.usage_in_bytes. + + About use_hierarchy, see Section 6. + +5.2 stat file +------------- + +memory.stat file includes following statistics + +per-memory cgroup local status +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +=============== =============================================================== +cache # of bytes of page cache memory. +rss # of bytes of anonymous and swap cache memory (includes + transparent hugepages). 
+rss_huge # of bytes of anonymous transparent hugepages. +mapped_file # of bytes of mapped file (includes tmpfs/shmem) +pgpgin # of charging events to the memory cgroup. The charging + event happens each time a page is accounted as either mapped + anon page(RSS) or cache page(Page Cache) to the cgroup. +pgpgout # of uncharging events to the memory cgroup. The uncharging + event happens each time a page is unaccounted from the cgroup. +swap # of bytes of swap usage +dirty # of bytes that are waiting to get written back to the disk. +writeback # of bytes of file/anon cache that are queued for syncing to + disk. +inactive_anon # of bytes of anonymous and swap cache memory on inactive + LRU list. +active_anon # of bytes of anonymous and swap cache memory on active + LRU list. +inactive_file # of bytes of file-backed memory on inactive LRU list. +active_file # of bytes of file-backed memory on active LRU list. +unevictable # of bytes of memory that cannot be reclaimed (mlocked etc). +=============== =============================================================== + +status considering hierarchy (see memory.use_hierarchy settings) +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +========================= =================================================== +hierarchical_memory_limit # of bytes of memory limit with regard to hierarchy + under which the memory cgroup is +hierarchical_memsw_limit # of bytes of memory+swap limit with regard to + hierarchy under which memory cgroup is. + +total_<counter> # hierarchical version of <counter>, which in + addition to the cgroup's own value includes the + sum of all hierarchical children's values of + <counter>, i.e.
total_cache +========================= =================================================== + +The following additional stats are dependent on CONFIG_DEBUG_VM +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +========================= ======================================== +recent_rotated_anon VM internal parameter. (see mm/vmscan.c) +recent_rotated_file VM internal parameter. (see mm/vmscan.c) +recent_scanned_anon VM internal parameter. (see mm/vmscan.c) +recent_scanned_file VM internal parameter. (see mm/vmscan.c) +========================= ======================================== + +Memo: + recent_rotated means recent frequency of LRU rotation. + recent_scanned means recent # of scans to LRU. + These are shown to aid debugging; please see the code for their meanings. + +Note: + Only anonymous and swap cache memory is listed as part of 'rss' stat. + This should not be confused with the true 'resident set size' or the + amount of physical memory used by the cgroup. + + 'rss + mapped_file' will give you resident set size of cgroup. + + (Note: file and shmem may be shared among other cgroups. In that case, + mapped_file is accounted only when the memory cgroup is owner of page + cache.) + +5.3 swappiness +-------------- + +Overrides /proc/sys/vm/swappiness for the particular group. The tunable +in the root cgroup corresponds to the global swappiness setting. + +Please note that unlike during the global reclaim, limit reclaim +enforces that 0 swappiness really prevents any swapping even if +there is swap storage available. This might lead to memcg OOM killer +if there are no file pages to reclaim. + +5.4 failcnt +----------- + +A memory cgroup provides memory.failcnt and memory.memsw.failcnt files. +This failcnt(== failure count) shows the number of times that a usage counter +hit its limit. When a memory cgroup hits a limit, failcnt increases and +memory under it will be reclaimed.
+ +You can reset failcnt by writing 0 to failcnt file:: + + # echo 0 > .../memory.failcnt + +5.5 usage_in_bytes +------------------ + +For efficiency, as other kernel components, memory cgroup uses some optimization +to avoid unnecessary cacheline false sharing. usage_in_bytes is affected by the +method and doesn't show 'exact' value of memory (and swap) usage, it's a fuzz +value for efficient access. (Of course, when necessary, it's synchronized.) +If you want to know more exact memory usage, you should use RSS+CACHE(+SWAP) +value in memory.stat(see 5.2). + +5.6 numa_stat +------------- + +This is similar to numa_maps but operates on a per-memcg basis. This is +useful for providing visibility into the numa locality information within +a memcg since the pages are allowed to be allocated from any physical +node. One of the use cases is evaluating application performance by +combining this information with the application's CPU allocation. + +Each memcg's numa_stat file includes "total", "file", "anon" and "unevictable" +per-node page counts including "hierarchical_<counter>" which sums up all +hierarchical children's values in addition to the memcg's own value. + +The output format of memory.numa_stat is:: + + total=<total pages> N0=<node 0 pages> N1=<node 1 pages> ... + file=<total file pages> N0=<node 0 pages> N1=<node 1 pages> ... + anon=<total anon pages> N0=<node 0 pages> N1=<node 1 pages> ... + unevictable=<total unevictable pages> N0=<node 0 pages> N1=<node 1 pages> ... + hierarchical_<counter>=<counter pages> N0=<node 0 pages> N1=<node 1 pages> ... + +The "total" count is sum of file + anon + unevictable. + +6. Hierarchy support +==================== + +The memory controller supports a deep hierarchy and hierarchical accounting. +The hierarchy is created by creating the appropriate cgroups in the +cgroup filesystem. Consider for example, the following cgroup filesystem +hierarchy:: + + root + / | \ + / | \ + a b c + | \ + | \ + d e + +In the diagram above, with hierarchical accounting enabled, all memory +usage of e, is accounted to its ancestors up until the root (i.e, c and root), +that has memory.use_hierarchy enabled.
If one of the ancestors goes over its +limit, the reclaim algorithm reclaims from the tasks in the ancestor and the +children of the ancestor. + +6.1 Enabling hierarchical accounting and reclaim +------------------------------------------------ + +A memory cgroup by default disables the hierarchy feature. Support +can be enabled by writing 1 to memory.use_hierarchy file of the root cgroup:: + + # echo 1 > memory.use_hierarchy + +The feature can be disabled by:: + + # echo 0 > memory.use_hierarchy + +NOTE1: + Enabling/disabling will fail if either the cgroup already has other + cgroups created below it, or if the parent cgroup has use_hierarchy + enabled. + +NOTE2: + When panic_on_oom is set to "2", the whole system will panic in + case of an OOM event in any cgroup. + +7. Soft limits +============== + +Soft limits allow for greater sharing of memory. The idea behind soft limits +is to allow control groups to use as much of the memory as needed, provided + +a. There is no memory contention +b. They do not exceed their hard limit + +When the system detects memory contention or low memory, control groups +are pushed back to their soft limits. If the soft limit of each control +group is very high, they are pushed back as much as possible to make +sure that one control group does not starve the others of memory. + +Please note that soft limits is a best-effort feature; it comes with +no guarantees, but it does its best to make sure that when memory is +heavily contended for, memory is allocated based on the soft limit +hints/setup. Currently soft limit based reclaim is set up such that +it gets invoked from balance_pgdat (kswapd). 
+ +7.1 Interface +------------- + +Soft limits can be set up by using the following commands (in this example we +assume a soft limit of 256 MiB):: + + # echo 256M > memory.soft_limit_in_bytes + +If we want to change this to 1G, we can at any time use:: + + # echo 1G > memory.soft_limit_in_bytes + +NOTE1: + Soft limits take effect over a long period of time, since they involve + reclaiming memory for balancing between memory cgroups +NOTE2: + It is recommended to set the soft limit always below the hard limit, + otherwise the hard limit will take precedence. + +8. Move charges at task migration +================================= + +Users can move charges associated with a task along with task migration, that +is, uncharge task's pages from the old cgroup and charge them to the new cgroup. +This feature is not supported in !CONFIG_MMU environments because of lack of +page tables. + +8.1 Interface +------------- + +This feature is disabled by default. It can be enabled (and disabled again) by +writing to memory.move_charge_at_immigrate of the destination cgroup. + +If you want to enable it:: + + # echo (some positive value) > memory.move_charge_at_immigrate + +Note: + Each bit of move_charge_at_immigrate has its own meaning about what type + of charges should be moved. See 8.2 for details. +Note: + Charges are moved only when you move mm->owner, in other words, + a leader of a thread group. +Note: + If we cannot find enough space for the task in the destination cgroup, we + try to make space by reclaiming memory. Task migration may fail if we + cannot make enough space. +Note: + It can take several seconds if you move many charges. + +And if you want to disable it again:: + + # echo 0 > memory.move_charge_at_immigrate + +8.2 Type of charges which can be moved +-------------------------------------- + +Each bit in move_charge_at_immigrate has its own meaning about what type of +charges should be moved.
But in any case, it must be noted that an account of +a page or a swap can be moved only when it is charged to the task's current +(old) memory cgroup. + ++---+--------------------------------------------------------------------------+ +|bit| what type of charges would be moved ? | ++===+==========================================================================+ +| 0 | A charge of an anonymous page (or swap of it) used by the target task. | +| | You must enable Swap Extension (see 2.4) to enable move of swap charges. | ++---+--------------------------------------------------------------------------+ +| 1 | A charge of file pages (normal file, tmpfs file (e.g. ipc shared memory) | +| | and swaps of tmpfs file) mmapped by the target task. Unlike the case of | +| | anonymous pages, file pages (and swaps) in the range mmapped by the task | +| | will be moved even if the task hasn't done page fault, i.e. they might | +| | not be the task's "RSS", but other task's "RSS" that maps the same file. | +| | And mapcount of the page is ignored (the page can be moved even if | +| | page_mapcount(page) > 1). You must enable Swap Extension (see 2.4) to | +| | enable move of swap charges. | ++---+--------------------------------------------------------------------------+ + +8.3 TODO +-------- + +- All charge-moving operations are done under cgroup_mutex. It's not good + behavior to hold the mutex too long, so we may need some trick. + +9. Memory thresholds +==================== + +Memory cgroup implements memory thresholds using the cgroups notification +API (see cgroups.txt). It allows an application to register multiple memory and memsw +thresholds and to get a notification when usage crosses one of them. + +To register a threshold, an application must: + +- create an eventfd using eventfd(2); +- open memory.usage_in_bytes or memory.memsw.usage_in_bytes; +- write string like "<event_fd> <fd of memory.usage_in_bytes> <threshold>" to + cgroup.event_control. + +Application will be notified through eventfd when memory usage crosses +threshold in any direction.
+ +It's applicable for both root and non-root cgroups. + +10. OOM Control +=============== + +memory.oom_control file is for OOM notification and other controls. + +Memory cgroup implements OOM notifier using the cgroup notification +API (See cgroups.txt). It allows an application to register multiple OOM notification +deliveries and to get a notification when OOM happens. + +To register a notifier, an application must: + + - create an eventfd using eventfd(2) + - open memory.oom_control file + - write string like "<event_fd> <fd of memory.oom_control>" to + cgroup.event_control + +The application will be notified through eventfd when OOM happens. +OOM notification doesn't work for the root cgroup. + +You can disable the OOM-killer by writing "1" to memory.oom_control file, as: + + #echo 1 > memory.oom_control + +If OOM-killer is disabled, tasks under cgroup will hang/sleep +in memory cgroup's OOM-waitqueue when they request accountable memory. + +For running them, you have to relax the memory cgroup's OOM status by + + * enlarge limit or reduce usage. + +To reduce usage, + + * kill some tasks. + * move some tasks to other group with account migration. + * remove some files (on tmpfs?) + +Then, stopped tasks will work again. + +At reading, current status of OOM is shown. + + - oom_kill_disable 0 or 1 + (if 1, oom-killer is disabled) + - under_oom 0 or 1 + (if 1, the memory cgroup is under OOM, tasks may be stopped.) + +11. Memory Pressure +=================== + +The pressure level notifications can be used to monitor the memory +allocation cost; based on the pressure, applications can implement +different strategies of managing their memory resources. The pressure +levels are defined as follows: + +The "low" level means that the system is reclaiming memory for new +allocations. Monitoring this reclaiming activity might be useful for +maintaining cache level. Upon notification, the program (typically +"Activity Manager") might analyze vmstat and act in advance (i.e. +prematurely shut down unimportant services).
+ +The "medium" level means that the system is experiencing medium memory +pressure, the system might be swapping, paging out active file caches, +etc. Upon this event applications may decide to further analyze +vmstat/zoneinfo/memcg or internal memory usage statistics and free any +resources that can be easily reconstructed or re-read from a disk. + +The "critical" level means that the system is actively thrashing, it is +about to run out of memory (OOM) or even the in-kernel OOM killer is on its +way to trigger. Applications should do whatever they can to help the +system. It might be too late to consult with vmstat or any other +statistics, so it's advisable to take an immediate action. + +By default, events are propagated upward until the event is handled, i.e. the +events are not pass-through. For example, you have three cgroups: A->B->C. Now +you set up an event listener on cgroups A, B and C, and suppose group C +experiences some pressure. In this situation, only group C will receive the +notification, i.e. groups A and B will not receive it. This is done to avoid +excessive "broadcasting" of messages, which disturbs the system and which is +especially bad if we are low on memory or thrashing. Group B will receive +notification only if there are no event listeners for group C. + +There are three optional modes that specify different propagation behavior: + + - "default": this is the default behavior specified above. This mode is the + same as omitting the optional mode parameter, preserved for backwards + compatibility. + + - "hierarchy": events always propagate up to the root, similar to the default + behavior, except that propagation continues regardless of whether there are + event listeners at each level, with the "hierarchy" mode. In the above + example, groups A, B, and C will receive notification of memory pressure. + + - "local": events are pass-through, i.e.
they only receive notifications when + memory pressure is experienced in the memcg for which the notification is + registered. In the above example, group C will receive notification if + registered for "local" notification and the group experiences memory + pressure. However, group B will never receive notification, regardless of whether + there is an event listener for group C or not, if group B is registered for + local notification. + +The level and event notification mode ("hierarchy" or "local", if necessary) are +specified by a comma-delimited string, e.g. "low,hierarchy" specifies +hierarchical, pass-through, notification for all ancestor memcgs. Notification +that is the default, non pass-through behavior, does not specify a mode. +"medium,local" specifies pass-through notification for the medium level. + +The file memory.pressure_level is only used to set up an eventfd. To +register a notification, an application must: + +- create an eventfd using eventfd(2); +- open memory.pressure_level; +- write string as "<event_fd> <fd of memory.pressure_level> <level[,mode]>" + to cgroup.event_control. + +Application will be notified through eventfd when memory pressure is at +the specific level (or higher). Read/write operations to +memory.pressure_level are not implemented. + +Test: + + Here is a small script example that makes a new cgroup, sets up a + memory limit, sets up a notification in the cgroup and then makes child + cgroup experience a critical pressure:: + + # cd /sys/fs/cgroup/memory/ + # mkdir foo + # cd foo + # cgroup_event_listener memory.pressure_level low,hierarchy & + # echo 8000000 > memory.limit_in_bytes + # echo 8000000 > memory.memsw.limit_in_bytes + # echo $$ > tasks + # dd if=/dev/zero | read x + + (Expect a bunch of notifications, and eventually, the oom-killer will + trigger.) + +12. TODO +======== + +1. Make per-cgroup scanner reclaim not-shared pages first +2. Teach controller to account for shared-pages +3.
Start reclamation in the background when the limit is + not yet hit but the usage is getting closer + +Summary +======= + +Overall, the memory controller has been a stable controller and has been +commented and discussed quite extensively in the community. + +References +========== + +1. Singh, Balbir. RFC: Memory Controller, http://lwn.net/Articles/206697/ +2. Singh, Balbir. Memory Controller (RSS Control), + http://lwn.net/Articles/222762/ +3. Emelianov, Pavel. Resource controllers based on process cgroups + http://lkml.org/lkml/2007/3/6/198 +4. Emelianov, Pavel. RSS controller based on process cgroups (v2) + http://lkml.org/lkml/2007/4/9/78 +5. Emelianov, Pavel. RSS controller based on process cgroups (v3) + http://lkml.org/lkml/2007/5/30/244 +6. Menage, Paul. Control Groups v10, http://lwn.net/Articles/236032/ +7. Vaidyanathan, Srinivasan, Control Groups: Pagecache accounting and control + subsystem (v3), http://lwn.net/Articles/235534/ +8. Singh, Balbir. RSS controller v2 test results (lmbench), + http://lkml.org/lkml/2007/5/17/232 +9. Singh, Balbir. RSS controller v2 AIM9 results + http://lkml.org/lkml/2007/5/18/1 +10. Singh, Balbir. Memory controller v6 test results, + http://lkml.org/lkml/2007/8/19/36 +11. Singh, Balbir. Memory controller introduction (v6), + http://lkml.org/lkml/2007/8/17/69 +12. Corbet, Jonathan, Controlling memory use in cgroups, + http://lwn.net/Articles/243795/ diff --git a/Documentation/cgroup-v1/memory.txt b/Documentation/cgroup-v1/memory.txt deleted file mode 100644 index a33cedf85427..000000000000 --- a/Documentation/cgroup-v1/memory.txt +++ /dev/null @@ -1,892 +0,0 @@ -Memory Resource Controller - -NOTE: This document is hopelessly outdated and it asks for a complete - rewrite. It still contains a useful information so we are keeping it - here but make sure to check the current code if you need a deeper - understanding. 
- -NOTE: The Memory Resource Controller has generically been referred to as the - memory controller in this document. Do not confuse memory controller - used here with the memory controller that is used in hardware. - -(For editors) -In this document: - When we mention a cgroup (cgroupfs's directory) with memory controller, - we call it "memory cgroup". When you see git-log and source code, you'll - see patch's title and function names tend to use "memcg". - In this document, we avoid using it. - -Benefits and Purpose of the memory controller - -The memory controller isolates the memory behaviour of a group of tasks -from the rest of the system. The article on LWN [12] mentions some probable -uses of the memory controller. The memory controller can be used to - -a. Isolate an application or a group of applications - Memory-hungry applications can be isolated and limited to a smaller - amount of memory. -b. Create a cgroup with a limited amount of memory; this can be used - as a good alternative to booting with mem=XXXX. -c. Virtualization solutions can control the amount of memory they want - to assign to a virtual machine instance. -d. A CD/DVD burner could control the amount of memory used by the - rest of the system to ensure that burning does not fail due to lack - of available memory. -e. There are several other use cases; find one or use the controller just - for fun (to learn and hack on the VM subsystem). - -Current Status: linux-2.6.34-mmotm(development version of 2010/April) - -Features: - - accounting anonymous pages, file caches, swap caches usage and limiting them. - - pages are linked to per-memcg LRU exclusively, and there is no global LRU. - - optionally, memory+swap usage can be accounted and limited. - - hierarchical accounting - - soft limit - - moving (recharging) account at moving a task is selectable. - - usage threshold notifier - - memory pressure notifier - - oom-killer disable knob and oom-notifier - - Root cgroup has no limit controls. 
- - Kernel memory support is a work in progress, and the current version provides - basically functionality. (See Section 2.7) - -Brief summary of control files. - - tasks # attach a task(thread) and show list of threads - cgroup.procs # show list of processes - cgroup.event_control # an interface for event_fd() - memory.usage_in_bytes # show current usage for memory - (See 5.5 for details) - memory.memsw.usage_in_bytes # show current usage for memory+Swap - (See 5.5 for details) - memory.limit_in_bytes # set/show limit of memory usage - memory.memsw.limit_in_bytes # set/show limit of memory+Swap usage - memory.failcnt # show the number of memory usage hits limits - memory.memsw.failcnt # show the number of memory+Swap hits limits - memory.max_usage_in_bytes # show max memory usage recorded - memory.memsw.max_usage_in_bytes # show max memory+Swap usage recorded - memory.soft_limit_in_bytes # set/show soft limit of memory usage - memory.stat # show various statistics - memory.use_hierarchy # set/show hierarchical account enabled - memory.force_empty # trigger forced page reclaim - memory.pressure_level # set memory pressure notifications - memory.swappiness # set/show swappiness parameter of vmscan - (See sysctl's vm.swappiness) - memory.move_charge_at_immigrate # set/show controls of moving charges - memory.oom_control # set/show oom controls. 
- memory.numa_stat # show the number of memory usage per numa node - - memory.kmem.limit_in_bytes # set/show hard limit for kernel memory - memory.kmem.usage_in_bytes # show current kernel memory allocation - memory.kmem.failcnt # show the number of kernel memory usage hits limits - memory.kmem.max_usage_in_bytes # show max kernel memory usage recorded - - memory.kmem.tcp.limit_in_bytes # set/show hard limit for tcp buf memory - memory.kmem.tcp.usage_in_bytes # show current tcp buf memory allocation - memory.kmem.tcp.failcnt # show the number of tcp buf memory usage hits limits - memory.kmem.tcp.max_usage_in_bytes # show max tcp buf memory usage recorded - -1. History - -The memory controller has a long history. A request for comments for the memory -controller was posted by Balbir Singh [1]. At the time the RFC was posted -there were several implementations for memory control. The goal of the -RFC was to build consensus and agreement for the minimal features required -for memory control. The first RSS controller was posted by Balbir Singh[2] -in Feb 2007. Pavel Emelianov [3][4][5] has since posted three versions of the -RSS controller. At OLS, at the resource management BoF, everyone suggested -that we handle both page cache and RSS together. Another request was raised -to allow user space handling of OOM. The current memory controller is -at version 6; it combines both mapped (RSS) and unmapped Page -Cache Control [11]. - -2. Memory Control - -Memory is a unique resource in the sense that it is present in a limited -amount. If a task requires a lot of CPU processing, the task can spread -its processing over a period of hours, days, months or years, but with -memory, the same physical memory needs to be reused to accomplish the task. - -The memory controller implementation has been divided into phases. These -are: - -1. Memory controller -2. mlock(2) controller -3. Kernel user memory accounting and slab control -4. 
user mappings length controller - -The memory controller is the first controller developed. - -2.1. Design - -The core of the design is a counter called the page_counter. The -page_counter tracks the current memory usage and limit of the group of -processes associated with the controller. Each cgroup has a memory controller -specific data structure (mem_cgroup) associated with it. - -2.2. Accounting - - +--------------------+ - | mem_cgroup | - | (page_counter) | - +--------------------+ - / ^ \ - / | \ - +---------------+ | +---------------+ - | mm_struct | |.... | mm_struct | - | | | | | - +---------------+ | +---------------+ - | - + --------------+ - | - +---------------+ +------+--------+ - | page +----------> page_cgroup| - | | | | - +---------------+ +---------------+ - - (Figure 1: Hierarchy of Accounting) - - -Figure 1 shows the important aspects of the controller - -1. Accounting happens per cgroup -2. Each mm_struct knows about which cgroup it belongs to -3. Each page has a pointer to the page_cgroup, which in turn knows the - cgroup it belongs to - -The accounting is done as follows: mem_cgroup_charge_common() is invoked to -set up the necessary data structures and check if the cgroup that is being -charged is over its limit. If it is, then reclaim is invoked on the cgroup. -More details can be found in the reclaim section of this document. -If everything goes well, a page meta-data-structure called page_cgroup is -updated. page_cgroup has its own LRU on cgroup. -(*) page_cgroup structure is allocated at boot/memory-hotplug time. - -2.2.1 Accounting details - -All mapped anon pages (RSS) and cache pages (Page Cache) are accounted. -Some pages which are never reclaimable and will not be on the LRU -are not accounted. We just account pages under usual VM management. - -RSS pages are accounted at page_fault unless they've already been accounted -for earlier. A file page will be accounted for as Page Cache when it's -inserted into inode (radix-tree). 
While it's mapped into the page tables of -processes, duplicate accounting is carefully avoided. - -An RSS page is unaccounted when it's fully unmapped. A PageCache page is -unaccounted when it's removed from radix-tree. Even if RSS pages are fully -unmapped (by kswapd), they may exist as SwapCache in the system until they -are really freed. Such SwapCaches are also accounted. -A swapped-in page is not accounted until it's mapped. - -Note: The kernel does swapin-readahead and reads multiple swaps at once. -This means swapped-in pages may contain pages for other tasks than a task -causing page fault. So, we avoid accounting at swap-in I/O. - -At page migration, accounting information is kept. - -Note: we just account pages-on-LRU because our purpose is to control amount -of used pages; not-on-LRU pages tend to be out-of-control from VM view. - -2.3 Shared Page Accounting - -Shared pages are accounted on the basis of the first touch approach. The -cgroup that first touches a page is accounted for the page. The principle -behind this approach is that a cgroup that aggressively uses a shared -page will eventually get charged for it (once it is uncharged from -the cgroup that brought it in -- this will happen on memory pressure). - -But see section 8.2: when moving a task to another cgroup, its pages may -be recharged to the new cgroup, if move_charge_at_immigrate has been chosen. - -Exception: If CONFIG_MEMCG_SWAP is not used. -When you do swapoff and make swapped-out pages of shmem(tmpfs) to -be backed into memory in force, charges for pages are accounted against the -caller of swapoff rather than the users of shmem. - -2.4 Swap Extension (CONFIG_MEMCG_SWAP) - -Swap Extension allows you to record charge for swap. A swapped-in page is -charged back to original page allocator if possible. - -When swap is accounted, following files are added. - - memory.memsw.usage_in_bytes. - - memory.memsw.limit_in_bytes. - -memsw means memory+swap. 
Usage of memory+swap is limited by -memsw.limit_in_bytes. - -Example: Assume a system with 4G of swap. A task which allocates 6G of memory -(by mistake) under 2G memory limitation will use all swap. -In this case, setting memsw.limit_in_bytes=3G will prevent bad use of swap. -By using the memsw limit, you can avoid system OOM which can be caused by swap -shortage. - -* why 'memory+swap' rather than swap. -The global LRU(kswapd) can swap out arbitrary pages. Swap-out means -to move account from memory to swap...there is no change in usage of -memory+swap. In other words, when we want to limit the usage of swap without -affecting global LRU, memory+swap limit is better than just limiting swap from -an OS point of view. - -* What happens when a cgroup hits memory.memsw.limit_in_bytes -When a cgroup hits memory.memsw.limit_in_bytes, it's useless to do swap-out -in this cgroup. Then, swap-out will not be done by cgroup routine and file -caches are dropped. But as mentioned above, global LRU can do swapout memory -from it for sanity of the system's memory management state. You can't forbid -it by cgroup. - -2.5 Reclaim - -Each cgroup maintains a per cgroup LRU which has the same structure as -global VM. When a cgroup goes over its limit, we first try -to reclaim memory from the cgroup so as to make space for the new -pages that the cgroup has touched. If the reclaim is unsuccessful, -an OOM routine is invoked to select and kill the bulkiest task in the -cgroup. (See 10. OOM Control below.) - -The reclaim algorithm has not been modified for cgroups, except that -pages that are selected for reclaiming come from the per-cgroup LRU -list. - -NOTE: Reclaim does not work for the root cgroup, since we cannot set any -limits on the root cgroup. - -Note2: When panic_on_oom is set to "2", the whole system will panic. - -When oom event notifier is registered, event will be delivered. 
-(See oom_control section) - -2.6 Locking - - lock_page_cgroup()/unlock_page_cgroup() should not be called under - the i_pages lock. - - Other lock order is following: - PG_locked. - mm->page_table_lock - pgdat->lru_lock - lock_page_cgroup. - In many cases, just lock_page_cgroup() is called. - per-zone-per-cgroup LRU (cgroup's private LRU) is just guarded by - pgdat->lru_lock, it has no lock of its own. - -2.7 Kernel Memory Extension (CONFIG_MEMCG_KMEM) - -With the Kernel memory extension, the Memory Controller is able to limit -the amount of kernel memory used by the system. Kernel memory is fundamentally -different than user memory, since it can't be swapped out, which makes it -possible to DoS the system by consuming too much of this precious resource. - -Kernel memory accounting is enabled for all memory cgroups by default. But -it can be disabled system-wide by passing cgroup.memory=nokmem to the kernel -at boot time. In this case, kernel memory will not be accounted at all. - -Kernel memory limits are not imposed for the root cgroup. Usage for the root -cgroup may or may not be accounted. The memory used is accumulated into -memory.kmem.usage_in_bytes, or in a separate counter when it makes sense. -(currently only for tcp). -The main "kmem" counter is fed into the main counter, so kmem charges will -also be visible from the user counter. - -Currently no soft limit is implemented for kernel memory. It is future work -to trigger slab reclaim when those limits are reached. - -2.7.1 Current Kernel Memory resources accounted - -* stack pages: every process consumes some stack pages. By accounting into -kernel memory, we prevent new processes from being created when the kernel -memory usage is too high. - -* slab pages: pages allocated by the SLAB or SLUB allocator are tracked. A copy -of each kmem_cache is created every time the cache is touched by the first time -from inside the memcg. 
The creation is done lazily, so some objects can still be -skipped while the cache is being created. All objects in a slab page should -belong to the same memcg. This only fails to hold when a task is migrated to a -different memcg during the page allocation by the cache. - -* sockets memory pressure: some sockets protocols have memory pressure -thresholds. The Memory Controller allows them to be controlled individually -per cgroup, instead of globally. - -* tcp memory pressure: sockets memory pressure for the tcp protocol. - -2.7.2 Common use cases - -Because the "kmem" counter is fed to the main user counter, kernel memory can -never be limited completely independently of user memory. Say "U" is the user -limit, and "K" the kernel limit. There are three possible ways limits can be -set: - - U != 0, K = unlimited: - This is the standard memcg limitation mechanism already present before kmem - accounting. Kernel memory is completely ignored. - - U != 0, K < U: - Kernel memory is a subset of the user memory. This setup is useful in - deployments where the total amount of memory per-cgroup is overcommitted. - Overcommitting kernel memory limits is definitely not recommended, since the - box can still run out of non-reclaimable memory. - In this case, the admin could set up K so that the sum of all groups is - never greater than the total memory, and freely set U at the cost of his - QoS. - WARNING: In the current implementation, memory reclaim will NOT be - triggered for a cgroup when it hits K while staying below U, which makes - this setup impractical. - - U != 0, K >= U: - Since kmem charges will also be fed to the user counter, reclaim will be - triggered for the cgroup for both kinds of memory. This setup gives the - admin a unified view of memory, and it is also useful for people who just - want to track kernel memory usage. - -3. User Interface - -3.0. Configuration - -a. Enable CONFIG_CGROUPS -b. Enable CONFIG_MEMCG -c.
Enable CONFIG_MEMCG_SWAP (to use swap extension) -d. Enable CONFIG_MEMCG_KMEM (to use kmem extension) - -3.1. Prepare the cgroups (see cgroups.txt, Why are cgroups needed?) -# mount -t tmpfs none /sys/fs/cgroup -# mkdir /sys/fs/cgroup/memory -# mount -t cgroup none /sys/fs/cgroup/memory -o memory - -3.2. Make the new group and move bash into it -# mkdir /sys/fs/cgroup/memory/0 -# echo $$ > /sys/fs/cgroup/memory/0/tasks - -Since now we're in the 0 cgroup, we can alter the memory limit: -# echo 4M > /sys/fs/cgroup/memory/0/memory.limit_in_bytes - -NOTE: We can use a suffix (k, K, m, M, g or G) to indicate values in kilo, -mega or gigabytes. (Here, Kilo, Mega, Giga are Kibibytes, Mebibytes, Gibibytes.) - -NOTE: We can write "-1" to reset the *.limit_in_bytes(unlimited). -NOTE: We cannot set limits on the root cgroup any more. - -# cat /sys/fs/cgroup/memory/0/memory.limit_in_bytes -4194304 - -We can check the usage: -# cat /sys/fs/cgroup/memory/0/memory.usage_in_bytes -1216512 - -A successful write to this file does not guarantee a successful setting of -this limit to the value written into the file. This can be due to a -number of factors, such as rounding up to page boundaries or the total -availability of memory on the system. The user is required to re-read -this file after a write to guarantee the value committed by the kernel. - -# echo 1 > memory.limit_in_bytes -# cat memory.limit_in_bytes -4096 - -The memory.failcnt field gives the number of times that the cgroup limit was -exceeded. - -The memory.stat file gives accounting information. Now, the number of -caches, RSS and Active pages/Inactive pages are shown. - -4. Testing - -For testing features and implementation, see memcg_test.txt. - -Performance test is also important. To see pure memory controller's overhead, -testing on tmpfs will give you good numbers of small overheads. -Example: do kernel make on tmpfs. - -Page-fault scalability is also important. 
At measuring parallel -page fault test, multi-process test may be better than multi-thread -test because it has noise of shared objects/status. - -But the above two are testing extreme situations. -Trying usual test under memory controller is always helpful. - -4.1 Troubleshooting - -Sometimes a user might find that the application under a cgroup is -terminated by the OOM killer. There are several causes for this: - -1. The cgroup limit is too low (just too low to do anything useful) -2. The user is using anonymous memory and swap is turned off or too low - -A sync followed by echo 1 > /proc/sys/vm/drop_caches will help get rid of -some of the pages cached in the cgroup (page cache pages). - -To know what happens, disabling OOM_Kill as per "10. OOM Control" (below) and -seeing what happens will be helpful. - -4.2 Task migration - -When a task migrates from one cgroup to another, its charge is not -carried forward by default. The pages allocated from the original cgroup still -remain charged to it, the charge is dropped when the page is freed or -reclaimed. - -You can move charges of a task along with task migration. -See 8. "Move charges at task migration" - -4.3 Removing a cgroup - -A cgroup can be removed by rmdir, but as discussed in sections 4.1 and 4.2, a -cgroup might have some charge associated with it, even though all -tasks have migrated away from it. (because we charge against pages, not -against tasks.) - -We move the stats to root (if use_hierarchy==0) or parent (if -use_hierarchy==1), and no change on the charge except uncharging -from the child. - -Charges recorded in swap information is not updated at removal of cgroup. -Recorded information is discarded and a cgroup which uses swap (swapcache) -will be charged as a new owner of it. - -About use_hierarchy, see Section 6. - -5. Misc. interfaces. - -5.1 force_empty - memory.force_empty interface is provided to make cgroup's memory usage empty. 
- When writing anything to this - - # echo 0 > memory.force_empty - - the cgroup will be reclaimed and as many pages reclaimed as possible. - - The typical use case for this interface is before calling rmdir(). - Though rmdir() offlines memcg, but the memcg may still stay there due to - charged file caches. Some out-of-use page caches may keep charged until - memory pressure happens. If you want to avoid that, force_empty will be useful. - - Also, note that when memory.kmem.limit_in_bytes is set the charges due to - kernel pages will still be seen. This is not considered a failure and the - write will still return success. In this case, it is expected that - memory.kmem.usage_in_bytes == memory.usage_in_bytes. - - About use_hierarchy, see Section 6. - -5.2 stat file - -memory.stat file includes following statistics - -# per-memory cgroup local status -cache - # of bytes of page cache memory. -rss - # of bytes of anonymous and swap cache memory (includes - transparent hugepages). -rss_huge - # of bytes of anonymous transparent hugepages. -mapped_file - # of bytes of mapped file (includes tmpfs/shmem) -pgpgin - # of charging events to the memory cgroup. The charging - event happens each time a page is accounted as either mapped - anon page(RSS) or cache page(Page Cache) to the cgroup. -pgpgout - # of uncharging events to the memory cgroup. The uncharging - event happens each time a page is unaccounted from the cgroup. -swap - # of bytes of swap usage -dirty - # of bytes that are waiting to get written back to the disk. -writeback - # of bytes of file/anon cache that are queued for syncing to - disk. -inactive_anon - # of bytes of anonymous and swap cache memory on inactive - LRU list. -active_anon - # of bytes of anonymous and swap cache memory on active - LRU list. -inactive_file - # of bytes of file-backed memory on inactive LRU list. -active_file - # of bytes of file-backed memory on active LRU list. 
-unevictable - # of bytes of memory that cannot be reclaimed (mlocked etc). - -# status considering hierarchy (see memory.use_hierarchy settings) - -hierarchical_memory_limit - # of bytes of memory limit with regard to hierarchy - under which the memory cgroup is -hierarchical_memsw_limit - # of bytes of memory+swap limit with regard to - hierarchy under which memory cgroup is. - -total_<counter> - # hierarchical version of <counter>, which in - addition to the cgroup's own value includes the - sum of all hierarchical children's values of - <counter>, e.g. total_cache - -# The following additional stats are dependent on CONFIG_DEBUG_VM. - -recent_rotated_anon - VM internal parameter. (see mm/vmscan.c) -recent_rotated_file - VM internal parameter. (see mm/vmscan.c) -recent_scanned_anon - VM internal parameter. (see mm/vmscan.c) -recent_scanned_file - VM internal parameter. (see mm/vmscan.c) - -Memo: - recent_rotated means recent frequency of LRU rotation. - recent_scanned means recent # of scans to LRU. - These are shown for easier debugging; please see the code for their meanings. - -Note: - Only anonymous and swap cache memory is listed as part of 'rss' stat. - This should not be confused with the true 'resident set size' or the - amount of physical memory used by the cgroup. - 'rss + mapped_file' will give you resident set size of cgroup. - (Note: file and shmem may be shared among other cgroups. In that case, - mapped_file is accounted only when the memory cgroup is owner of page - cache.) - -5.3 swappiness - -Overrides /proc/sys/vm/swappiness for the particular group. The tunable -in the root cgroup corresponds to the global swappiness setting. - -Please note that unlike during the global reclaim, limit reclaim -enforces that 0 swappiness really prevents any swapping even if -there is a swap storage available. This might lead to memcg OOM killer -if there are no file pages to reclaim. - -5.4 failcnt - -A memory cgroup provides memory.failcnt and memory.memsw.failcnt files.
-This failcnt(== failure count) shows the number of times that a usage counter -hit its limit. When a memory cgroup hits a limit, failcnt increases and -memory under it will be reclaimed. - -You can reset failcnt by writing 0 to failcnt file. -# echo 0 > .../memory.failcnt - -5.5 usage_in_bytes - -For efficiency, as other kernel components, memory cgroup uses some optimization -to avoid unnecessary cacheline false sharing. usage_in_bytes is affected by the -method and doesn't show 'exact' value of memory (and swap) usage, it's a fuzz -value for efficient access. (Of course, when necessary, it's synchronized.) -If you want to know more exact memory usage, you should use RSS+CACHE(+SWAP) -value in memory.stat(see 5.2). - -5.6 numa_stat - -This is similar to numa_maps but operates on a per-memcg basis. This is -useful for providing visibility into the numa locality information within -an memcg since the pages are allowed to be allocated from any physical -node. One of the use cases is evaluating application performance by -combining this information with the application's CPU allocation. - -Each memcg's numa_stat file includes "total", "file", "anon" and "unevictable" -per-node page counts including "hierarchical_" which sums up all -hierarchical children's values in addition to the memcg's own value. - -The output format of memory.numa_stat is: - -total= N0= N1= ... -file= N0= N1= ... -anon= N0= N1= ... -unevictable= N0= N1= ... -hierarchical_= N0= N1= ... - -The "total" count is sum of file + anon + unevictable. - -6. Hierarchy support - -The memory controller supports a deep hierarchy and hierarchical accounting. -The hierarchy is created by creating the appropriate cgroups in the -cgroup filesystem. 
Consider for example, the following cgroup filesystem -hierarchy - - root - / | \ - / | \ - a b c - | \ - | \ - d e - -In the diagram above, with hierarchical accounting enabled, all memory -usage of e, is accounted to its ancestors up until the root (i.e, c and root), -that has memory.use_hierarchy enabled. If one of the ancestors goes over its -limit, the reclaim algorithm reclaims from the tasks in the ancestor and the -children of the ancestor. - -6.1 Enabling hierarchical accounting and reclaim - -A memory cgroup by default disables the hierarchy feature. Support -can be enabled by writing 1 to memory.use_hierarchy file of the root cgroup - -# echo 1 > memory.use_hierarchy - -The feature can be disabled by - -# echo 0 > memory.use_hierarchy - -NOTE1: Enabling/disabling will fail if either the cgroup already has other - cgroups created below it, or if the parent cgroup has use_hierarchy - enabled. - -NOTE2: When panic_on_oom is set to "2", the whole system will panic in - case of an OOM event in any cgroup. - -7. Soft limits - -Soft limits allow for greater sharing of memory. The idea behind soft limits -is to allow control groups to use as much of the memory as needed, provided - -a. There is no memory contention -b. They do not exceed their hard limit - -When the system detects memory contention or low memory, control groups -are pushed back to their soft limits. If the soft limit of each control -group is very high, they are pushed back as much as possible to make -sure that one control group does not starve the others of memory. - -Please note that soft limits is a best-effort feature; it comes with -no guarantees, but it does its best to make sure that when memory is -heavily contended for, memory is allocated based on the soft limit -hints/setup. Currently soft limit based reclaim is set up such that -it gets invoked from balance_pgdat (kswapd). 
- -7.1 Interface - -Soft limits can be setup by using the following commands (in this example we -assume a soft limit of 256 MiB) - -# echo 256M > memory.soft_limit_in_bytes - -If we want to change this to 1G, we can at any time use - -# echo 1G > memory.soft_limit_in_bytes - -NOTE1: Soft limits take effect over a long period of time, since they involve - reclaiming memory for balancing between memory cgroups -NOTE2: It is recommended to set the soft limit always below the hard limit, - otherwise the hard limit will take precedence. - -8. Move charges at task migration - -Users can move charges associated with a task along with task migration, that -is, uncharge task's pages from the old cgroup and charge them to the new cgroup. -This feature is not supported in !CONFIG_MMU environments because of lack of -page tables. - -8.1 Interface - -This feature is disabled by default. It can be enabled (and disabled again) by -writing to memory.move_charge_at_immigrate of the destination cgroup. - -If you want to enable it: - -# echo (some positive value) > memory.move_charge_at_immigrate - -Note: Each bits of move_charge_at_immigrate has its own meaning about what type - of charges should be moved. See 8.2 for details. -Note: Charges are moved only when you move mm->owner, in other words, - a leader of a thread group. -Note: If we cannot find enough space for the task in the destination cgroup, we - try to make space by reclaiming memory. Task migration may fail if we - cannot make enough space. -Note: It can take several seconds if you move charges much. - -And if you want disable it again: - -# echo 0 > memory.move_charge_at_immigrate - -8.2 Type of charges which can be moved - -Each bit in move_charge_at_immigrate has its own meaning about what type of -charges should be moved. But in any case, it must be noted that an account of -a page or a swap can be moved only when it is charged to the task's current -(old) memory cgroup. 
- - bit | what type of charges would be moved ? - -----+------------------------------------------------------------------------ - 0 | A charge of an anonymous page (or swap of it) used by the target task. - | You must enable Swap Extension (see 2.4) to enable move of swap charges. - -----+------------------------------------------------------------------------ - 1 | A charge of file pages (normal file, tmpfs file (e.g. ipc shared memory) - | and swaps of tmpfs file) mmapped by the target task. Unlike the case of - | anonymous pages, file pages (and swaps) in the range mmapped by the task - | will be moved even if the task hasn't done page fault, i.e. they might - | not be the task's "RSS", but other task's "RSS" that maps the same file. - | And mapcount of the page is ignored (the page can be moved even if - | page_mapcount(page) > 1). You must enable Swap Extension (see 2.4) to - | enable move of swap charges. - -8.3 TODO - -- All of moving charge operations are done under cgroup_mutex. It's not good - behavior to hold the mutex too long, so we may need some trick. - -9. Memory thresholds - -Memory cgroup implements memory thresholds using the cgroups notification -API (see cgroups.txt). It allows to register multiple memory and memsw -thresholds and gets notifications when it crosses. - -To register a threshold, an application must: -- create an eventfd using eventfd(2); -- open memory.usage_in_bytes or memory.memsw.usage_in_bytes; -- write string like " " to - cgroup.event_control. - -Application will be notified through eventfd when memory usage crosses -threshold in any direction. - -It's applicable for root and non-root cgroup. - -10. OOM Control - -memory.oom_control file is for OOM notification and other controls. - -Memory cgroup implements OOM notifier using the cgroup notification -API (See cgroups.txt). It allows to register multiple OOM notification -delivery and gets notification when OOM happens. 
- -To register a notifier, an application must: - - create an eventfd using eventfd(2) - - open memory.oom_control file - - write string like " " to - cgroup.event_control - -The application will be notified through eventfd when OOM happens. -OOM notification doesn't work for the root cgroup. - -You can disable the OOM-killer by writing "1" to memory.oom_control file, as: - - #echo 1 > memory.oom_control - -If OOM-killer is disabled, tasks under cgroup will hang/sleep -in memory cgroup's OOM-waitqueue when they request accountable memory. - -For running them, you have to relax the memory cgroup's OOM status by - * enlarge limit or reduce usage. -To reduce usage, - * kill some tasks. - * move some tasks to other group with account migration. - * remove some files (on tmpfs?) - -Then, stopped tasks will work again. - -At reading, current status of OOM is shown. - oom_kill_disable 0 or 1 (if 1, oom-killer is disabled) - under_oom 0 or 1 (if 1, the memory cgroup is under OOM, tasks may - be stopped.) - -11. Memory Pressure - -The pressure level notifications can be used to monitor the memory -allocation cost; based on the pressure, applications can implement -different strategies of managing their memory resources. The pressure -levels are defined as following: - -The "low" level means that the system is reclaiming memory for new -allocations. Monitoring this reclaiming activity might be useful for -maintaining cache level. Upon notification, the program (typically -"Activity Manager") might analyze vmstat and act in advance (i.e. -prematurely shutdown unimportant services). - -The "medium" level means that the system is experiencing medium memory -pressure, the system might be making swap, paging out active file caches, -etc. Upon this event applications may decide to further analyze -vmstat/zoneinfo/memcg or internal memory usage statistics and free any -resources that can be easily reconstructed or re-read from a disk. 
- -The "critical" level means that the system is actively thrashing, it is -about to out of memory (OOM) or even the in-kernel OOM killer is on its -way to trigger. Applications should do whatever they can to help the -system. It might be too late to consult with vmstat or any other -statistics, so it's advisable to take an immediate action. - -By default, events are propagated upward until the event is handled, i.e. the -events are not pass-through. For example, you have three cgroups: A->B->C. Now -you set up an event listener on cgroups A, B and C, and suppose group C -experiences some pressure. In this situation, only group C will receive the -notification, i.e. groups A and B will not receive it. This is done to avoid -excessive "broadcasting" of messages, which disturbs the system and which is -especially bad if we are low on memory or thrashing. Group B, will receive -notification only if there are no event listers for group C. - -There are three optional modes that specify different propagation behavior: - - - "default": this is the default behavior specified above. This mode is the - same as omitting the optional mode parameter, preserved by backwards - compatibility. - - - "hierarchy": events always propagate up to the root, similar to the default - behavior, except that propagation continues regardless of whether there are - event listeners at each level, with the "hierarchy" mode. In the above - example, groups A, B, and C will receive notification of memory pressure. - - - "local": events are pass-through, i.e. they only receive notifications when - memory pressure is experienced in the memcg for which the notification is - registered. In the above example, group C will receive notification if - registered for "local" notification and the group experiences memory - pressure. However, group B will never receive notification, regardless if - there is an event listener for group C or not, if group B is registered for - local notification. 
- -The level and event notification mode ("hierarchy" or "local", if necessary) are -specified by a comma-delimited string, i.e. "low,hierarchy" specifies -hierarchical, pass-through, notification for all ancestor memcgs. Notification -that is the default, non pass-through behavior, does not specify a mode. -"medium,local" specifies pass-through notification for the medium level. - -The file memory.pressure_level is only used to setup an eventfd. To -register a notification, an application must: - -- create an eventfd using eventfd(2); -- open memory.pressure_level; -- write string as " " - to cgroup.event_control. - -Application will be notified through eventfd when memory pressure is at -the specific level (or higher). Read/write operations to -memory.pressure_level are no implemented. - -Test: - - Here is a small script example that makes a new cgroup, sets up a - memory limit, sets up a notification in the cgroup and then makes child - cgroup experience a critical pressure: - - # cd /sys/fs/cgroup/memory/ - # mkdir foo - # cd foo - # cgroup_event_listener memory.pressure_level low,hierarchy & - # echo 8000000 > memory.limit_in_bytes - # echo 8000000 > memory.memsw.limit_in_bytes - # echo $$ > tasks - # dd if=/dev/zero | read x - - (Expect a bunch of notifications, and eventually, the oom-killer will - trigger.) - -12. TODO - -1. Make per-cgroup scanner reclaim not-shared pages first -2. Teach controller to account for shared-pages -3. Start reclamation in the background when the limit is - not yet hit but the usage is getting closer - -Summary - -Overall, the memory controller has been a stable controller and has been -commented and discussed quite extensively in the community. - -References - -1. Singh, Balbir. RFC: Memory Controller, http://lwn.net/Articles/206697/ -2. Singh, Balbir. Memory Controller (RSS Control), - http://lwn.net/Articles/222762/ -3. Emelianov, Pavel. Resource controllers based on process cgroups - http://lkml.org/lkml/2007/3/6/198 -4. 
Emelianov, Pavel. RSS controller based on process cgroups (v2) - http://lkml.org/lkml/2007/4/9/78 -5. Emelianov, Pavel. RSS controller based on process cgroups (v3) - http://lkml.org/lkml/2007/5/30/244 -6. Menage, Paul. Control Groups v10, http://lwn.net/Articles/236032/ -7. Vaidyanathan, Srinivasan, Control Groups: Pagecache accounting and control - subsystem (v3), http://lwn.net/Articles/235534/ -8. Singh, Balbir. RSS controller v2 test results (lmbench), - http://lkml.org/lkml/2007/5/17/232 -9. Singh, Balbir. RSS controller v2 AIM9 results - http://lkml.org/lkml/2007/5/18/1 -10. Singh, Balbir. Memory controller v6 test results, - http://lkml.org/lkml/2007/8/19/36 -11. Singh, Balbir. Memory controller introduction (v6), - http://lkml.org/lkml/2007/8/17/69 -12. Corbet, Jonathan, Controlling memory use in cgroups, - http://lwn.net/Articles/243795/ diff --git a/Documentation/cgroup-v1/net_cls.rst b/Documentation/cgroup-v1/net_cls.rst new file mode 100644 index 000000000000..a2cf272af7a0 --- /dev/null +++ b/Documentation/cgroup-v1/net_cls.rst @@ -0,0 +1,44 @@ +========================= +Network classifier cgroup +========================= + +The Network classifier cgroup provides an interface to +tag network packets with a class identifier (classid). + +The Traffic Controller (tc) can be used to assign +different priorities to packets from different cgroups. +Also, Netfilter (iptables) can use this tag to perform +actions on such packets. + +Creating a net_cls cgroups instance creates a net_cls.classid file. +This net_cls.classid value is initialized to 0. + +You can write hexadecimal values to net_cls.classid; the format for these +values is 0xAAAABBBB; AAAA is the major handle number and BBBB +is the minor handle number. +Reading net_cls.classid yields a decimal result. 
+ +Example:: + + mkdir /sys/fs/cgroup/net_cls + mount -t cgroup -onet_cls net_cls /sys/fs/cgroup/net_cls + mkdir /sys/fs/cgroup/net_cls/0 + echo 0x100001 > /sys/fs/cgroup/net_cls/0/net_cls.classid + +- setting a 10:1 handle:: + + cat /sys/fs/cgroup/net_cls/0/net_cls.classid + 1048577 + +- configuring tc:: + + tc qdisc add dev eth0 root handle 10: htb + tc class add dev eth0 parent 10: classid 10:1 htb rate 40mbit + +- creating traffic class 10:1:: + + tc filter add dev eth0 parent 10: protocol ip prio 10 handle 1: cgroup + +configuring iptables, basic example:: + + iptables -A OUTPUT -m cgroup ! --cgroup 0x100001 -j DROP diff --git a/Documentation/cgroup-v1/net_cls.txt b/Documentation/cgroup-v1/net_cls.txt deleted file mode 100644 index ec182346dea2..000000000000 --- a/Documentation/cgroup-v1/net_cls.txt +++ /dev/null @@ -1,39 +0,0 @@ -Network classifier cgroup -------------------------- - -The Network classifier cgroup provides an interface to -tag network packets with a class identifier (classid). - -The Traffic Controller (tc) can be used to assign -different priorities to packets from different cgroups. -Also, Netfilter (iptables) can use this tag to perform -actions on such packets. - -Creating a net_cls cgroups instance creates a net_cls.classid file. -This net_cls.classid value is initialized to 0. - -You can write hexadecimal values to net_cls.classid; the format for these -values is 0xAAAABBBB; AAAA is the major handle number and BBBB -is the minor handle number. -Reading net_cls.classid yields a decimal result. - -Example: -mkdir /sys/fs/cgroup/net_cls -mount -t cgroup -onet_cls net_cls /sys/fs/cgroup/net_cls -mkdir /sys/fs/cgroup/net_cls/0 -echo 0x100001 > /sys/fs/cgroup/net_cls/0/net_cls.classid - - setting a 10:1 handle. 
- -cat /sys/fs/cgroup/net_cls/0/net_cls.classid -1048577 - -configuring tc: -tc qdisc add dev eth0 root handle 10: htb - -tc class add dev eth0 parent 10: classid 10:1 htb rate 40mbit - - creating traffic class 10:1 - -tc filter add dev eth0 parent 10: protocol ip prio 10 handle 1: cgroup - -configuring iptables, basic example: -iptables -A OUTPUT -m cgroup ! --cgroup 0x100001 -j DROP diff --git a/Documentation/cgroup-v1/net_prio.rst b/Documentation/cgroup-v1/net_prio.rst new file mode 100644 index 000000000000..b40905871c64 --- /dev/null +++ b/Documentation/cgroup-v1/net_prio.rst @@ -0,0 +1,57 @@ +======================= +Network priority cgroup +======================= + +The Network priority cgroup provides an interface to allow an administrator to +dynamically set the priority of network traffic generated by various +applications + +Nominally, an application would set the priority of its traffic via the +SO_PRIORITY socket option. This however, is not always possible because: + +1) The application may not have been coded to set this value +2) The priority of application traffic is often a site-specific administrative + decision rather than an application defined one. + +This cgroup allows an administrator to assign a process to a group which defines +the priority of egress traffic on a given interface. Network priority groups can +be created by first mounting the cgroup filesystem:: + + # mount -t cgroup -onet_prio none /sys/fs/cgroup/net_prio + +With the above step, the initial group acting as the parent accounting group +becomes visible at '/sys/fs/cgroup/net_prio'. This group includes all tasks in +the system. '/sys/fs/cgroup/net_prio/tasks' lists the tasks in this cgroup. + +Each net_prio cgroup contains two files that are subsystem specific + +net_prio.prioidx + This file is read-only, and is simply informative. It contains a unique + integer value that the kernel uses as an internal representation of this + cgroup. 
+ +net_prio.ifpriomap + This file contains a map of the priorities assigned to traffic originating + from processes in this group and egressing the system on various interfaces. + It contains a list of tuples in the form <ifname priority>. Contents of this + file can be modified by echoing a string into the file using the same tuple + format. For example:: + + echo "eth0 5" > /sys/fs/cgroup/net_prio/iscsi/net_prio.ifpriomap + +This command would force any traffic originating from processes belonging to the +iscsi net_prio cgroup and egressing on interface eth0 to have the priority of +said traffic set to the value 5. The parent accounting group also has a +writeable 'net_prio.ifpriomap' file that can be used to set a system default +priority. + +Priorities are set immediately prior to queueing a frame to the device +queueing discipline (qdisc) so priorities will be assigned prior to the hardware +queue selection being made. + +One usage for the net_prio cgroup is with mqprio qdisc allowing application +traffic to be steered to hardware/driver based traffic classes. These mappings +can then be managed by administrators or other networking protocols such as +DCBX. + +A new net_prio cgroup inherits the parent's configuration. diff --git a/Documentation/cgroup-v1/net_prio.txt b/Documentation/cgroup-v1/net_prio.txt deleted file mode 100644 index a82cbd28ea8a..000000000000 --- a/Documentation/cgroup-v1/net_prio.txt +++ /dev/null @@ -1,55 +0,0 @@ -Network priority cgroup -------------------------- - -The Network priority cgroup provides an interface to allow an administrator to -dynamically set the priority of network traffic generated by various -applications - -Nominally, an application would set the priority of its traffic via the -SO_PRIORITY socket option.
This however, is not always possible because: - -1) The application may not have been coded to set this value -2) The priority of application traffic is often a site-specific administrative - decision rather than an application defined one. - -This cgroup allows an administrator to assign a process to a group which defines -the priority of egress traffic on a given interface. Network priority groups can -be created by first mounting the cgroup filesystem. - -# mount -t cgroup -onet_prio none /sys/fs/cgroup/net_prio - -With the above step, the initial group acting as the parent accounting group -becomes visible at '/sys/fs/cgroup/net_prio'. This group includes all tasks in -the system. '/sys/fs/cgroup/net_prio/tasks' lists the tasks in this cgroup. - -Each net_prio cgroup contains two files that are subsystem specific - -net_prio.prioidx -This file is read-only, and is simply informative. It contains a unique integer -value that the kernel uses as an internal representation of this cgroup. - -net_prio.ifpriomap -This file contains a map of the priorities assigned to traffic originating from -processes in this group and egressing the system on various interfaces. It -contains a list of tuples in the form . Contents of this file -can be modified by echoing a string into the file using the same tuple format. -for example: - -echo "eth0 5" > /sys/fs/cgroups/net_prio/iscsi/net_prio.ifpriomap - -This command would force any traffic originating from processes belonging to the -iscsi net_prio cgroup and egressing on interface eth0 to have the priority of -said traffic set to the value 5. The parent accounting group also has a -writeable 'net_prio.ifpriomap' file that can be used to set a system default -priority. - -Priorities are set immediately prior to queueing a frame to the device -queueing discipline (qdisc) so priorities will be assigned prior to the hardware -queue selection being made. 
- -One usage for the net_prio cgroup is with mqprio qdisc allowing application -traffic to be steered to hardware/driver based traffic classes. These mappings -can then be managed by administrators or other networking protocols such as -DCBX. - -A new net_prio cgroup inherits the parent's configuration. diff --git a/Documentation/cgroup-v1/pids.rst b/Documentation/cgroup-v1/pids.rst new file mode 100644 index 000000000000..6acebd9e72c8 --- /dev/null +++ b/Documentation/cgroup-v1/pids.rst @@ -0,0 +1,92 @@ +========================= +Process Number Controller +========================= + +Abstract +-------- + +The process number controller is used to allow a cgroup hierarchy to stop any +new tasks from being fork()'d or clone()'d after a certain limit is reached. + +Since it is trivial to hit the task limit without hitting any kmemcg limits in +place, PIDs are a fundamental resource. As such, PID exhaustion must be +preventable in the scope of a cgroup hierarchy by allowing resource limiting of +the number of tasks in a cgroup. + +Usage +----- + +In order to use the `pids` controller, set the maximum number of tasks in +pids.max (this is not available in the root cgroup for obvious reasons). The +number of processes currently in the cgroup is given by pids.current. + +Organisational operations are not blocked by cgroup policies, so it is possible +to have pids.current > pids.max. This can be done by either setting the limit to +be smaller than pids.current, or attaching enough processes to the cgroup such +that pids.current > pids.max. However, it is not possible to violate a cgroup +policy through fork() or clone(). fork() and clone() will return -EAGAIN if the +creation of a new process would cause a cgroup policy to be violated. + +To set a cgroup to have no limit, set pids.max to "max". This is the default for +all new cgroups (N.B. that PID limits are hierarchical, so the most stringent +limit in the hierarchy is followed). 
+ +pids.current tracks all child cgroup hierarchies, so parent/pids.current is a +superset of parent/child/pids.current. + +The pids.events file contains event counters: + + - max: Number of times fork failed because limit was hit. + +Example +------- + +First, we mount the pids controller:: + + # mkdir -p /sys/fs/cgroup/pids + # mount -t cgroup -o pids none /sys/fs/cgroup/pids + +Then we create a hierarchy, set limits and attach processes to it:: + + # mkdir -p /sys/fs/cgroup/pids/parent/child + # echo 2 > /sys/fs/cgroup/pids/parent/pids.max + # echo $$ > /sys/fs/cgroup/pids/parent/cgroup.procs + # cat /sys/fs/cgroup/pids/parent/pids.current + 2 + # + +It should be noted that attempts to overcome the set limit (2 in this case) will +fail:: + + # cat /sys/fs/cgroup/pids/parent/pids.current + 2 + # ( /bin/echo "Here's some processes for you." | cat ) + sh: fork: Resource temporary unavailable + # + +Even if we migrate to a child cgroup (which doesn't have a set limit), we will +not be able to overcome the most stringent limit in the hierarchy (in this case, +parent's):: + + # echo $$ > /sys/fs/cgroup/pids/parent/child/cgroup.procs + # cat /sys/fs/cgroup/pids/parent/pids.current + 2 + # cat /sys/fs/cgroup/pids/parent/child/pids.current + 2 + # cat /sys/fs/cgroup/pids/parent/child/pids.max + max + # ( /bin/echo "Here's some processes for you." | cat ) + sh: fork: Resource temporary unavailable + # + +We can set a limit that is smaller than pids.current, which will stop any new +processes from being forked at all (note that the shell itself counts towards +pids.current):: + + # echo 1 > /sys/fs/cgroup/pids/parent/pids.max + # /bin/echo "We can't even spawn a single process now." + sh: fork: Resource temporary unavailable + # echo 0 > /sys/fs/cgroup/pids/parent/pids.max + # /bin/echo "We can't even spawn a single process now." 
+ sh: fork: Resource temporary unavailable + # diff --git a/Documentation/cgroup-v1/pids.txt b/Documentation/cgroup-v1/pids.txt deleted file mode 100644 index e105d708ccde..000000000000 --- a/Documentation/cgroup-v1/pids.txt +++ /dev/null @@ -1,88 +0,0 @@ - Process Number Controller - ========================= - -Abstract --------- - -The process number controller is used to allow a cgroup hierarchy to stop any -new tasks from being fork()'d or clone()'d after a certain limit is reached. - -Since it is trivial to hit the task limit without hitting any kmemcg limits in -place, PIDs are a fundamental resource. As such, PID exhaustion must be -preventable in the scope of a cgroup hierarchy by allowing resource limiting of -the number of tasks in a cgroup. - -Usage ------ - -In order to use the `pids` controller, set the maximum number of tasks in -pids.max (this is not available in the root cgroup for obvious reasons). The -number of processes currently in the cgroup is given by pids.current. - -Organisational operations are not blocked by cgroup policies, so it is possible -to have pids.current > pids.max. This can be done by either setting the limit to -be smaller than pids.current, or attaching enough processes to the cgroup such -that pids.current > pids.max. However, it is not possible to violate a cgroup -policy through fork() or clone(). fork() and clone() will return -EAGAIN if the -creation of a new process would cause a cgroup policy to be violated. - -To set a cgroup to have no limit, set pids.max to "max". This is the default for -all new cgroups (N.B. that PID limits are hierarchical, so the most stringent -limit in the hierarchy is followed). - -pids.current tracks all child cgroup hierarchies, so parent/pids.current is a -superset of parent/child/pids.current. - -The pids.events file contains event counters: - - max: Number of times fork failed because limit was hit. 
- -Example -------- - -First, we mount the pids controller: -# mkdir -p /sys/fs/cgroup/pids -# mount -t cgroup -o pids none /sys/fs/cgroup/pids - -Then we create a hierarchy, set limits and attach processes to it: -# mkdir -p /sys/fs/cgroup/pids/parent/child -# echo 2 > /sys/fs/cgroup/pids/parent/pids.max -# echo $$ > /sys/fs/cgroup/pids/parent/cgroup.procs -# cat /sys/fs/cgroup/pids/parent/pids.current -2 -# - -It should be noted that attempts to overcome the set limit (2 in this case) will -fail: - -# cat /sys/fs/cgroup/pids/parent/pids.current -2 -# ( /bin/echo "Here's some processes for you." | cat ) -sh: fork: Resource temporary unavailable -# - -Even if we migrate to a child cgroup (which doesn't have a set limit), we will -not be able to overcome the most stringent limit in the hierarchy (in this case, -parent's): - -# echo $$ > /sys/fs/cgroup/pids/parent/child/cgroup.procs -# cat /sys/fs/cgroup/pids/parent/pids.current -2 -# cat /sys/fs/cgroup/pids/parent/child/pids.current -2 -# cat /sys/fs/cgroup/pids/parent/child/pids.max -max -# ( /bin/echo "Here's some processes for you." | cat ) -sh: fork: Resource temporary unavailable -# - -We can set a limit that is smaller than pids.current, which will stop any new -processes from being forked at all (note that the shell itself counts towards -pids.current): - -# echo 1 > /sys/fs/cgroup/pids/parent/pids.max -# /bin/echo "We can't even spawn a single process now." -sh: fork: Resource temporary unavailable -# echo 0 > /sys/fs/cgroup/pids/parent/pids.max -# /bin/echo "We can't even spawn a single process now." -sh: fork: Resource temporary unavailable -# diff --git a/Documentation/cgroup-v1/rdma.rst b/Documentation/cgroup-v1/rdma.rst new file mode 100644 index 000000000000..2fcb0a9bf790 --- /dev/null +++ b/Documentation/cgroup-v1/rdma.rst @@ -0,0 +1,117 @@ +=============== +RDMA Controller +=============== + +.. Contents + + 1. Overview + 1-1. What is RDMA controller? + 1-2. Why RDMA controller needed? + 1-3. 
How is RDMA controller implemented? + 2. Usage Examples + +1. Overview +=========== + +1-1. What is RDMA controller? +----------------------------- + +RDMA controller allows user to limit RDMA/IB specific resources that a given +set of processes can use. These processes are grouped using RDMA controller. + +RDMA controller defines two resources which can be limited for processes of a +cgroup. + +1-2. Why RDMA controller needed? +-------------------------------- + +Currently user space applications can easily take away all the rdma verb +specific resources such as AH, CQ, QP, MR etc. Due to which other applications +in other cgroup or kernel space ULPs may not even get chance to allocate any +rdma resources. This can lead to service unavailability. + +Therefore RDMA controller is needed through which resource consumption +of processes can be limited. Through this controller different rdma +resources can be accounted. + +1-3. How is RDMA controller implemented? +---------------------------------------- + +RDMA cgroup allows limit configuration of resources. Rdma cgroup maintains +resource accounting per cgroup, per device using resource pool structure. +Each such resource pool is limited up to 64 resources in given resource pool +by rdma cgroup, which can be extended later if required. + +This resource pool object is linked to the cgroup css. Typically there +are 0 to 4 resource pool instances per cgroup, per device in most use cases. +But nothing limits to have it more. At present hundreds of RDMA devices per +single cgroup may not be handled optimally, however there is no +known use case or requirement for such configuration either. + +Since RDMA resources can be allocated from any process and can be freed by any +of the child processes which shares the address space, rdma resources are +always owned by the creator cgroup css. 
This allows process migration from one +to other cgroup without major complexity of transferring resource ownership; +because such ownership is not really present due to shared nature of +rdma resources. Linking resources around css also ensures that cgroups can be +deleted after processes migrated. This allow progress migration as well with +active resources, even though that is not a primary use case. + +Whenever RDMA resource charging occurs, owner rdma cgroup is returned to +the caller. Same rdma cgroup should be passed while uncharging the resource. +This also allows process migrated with active RDMA resource to charge +to new owner cgroup for new resource. It also allows to uncharge resource of +a process from previously charged cgroup which is migrated to new cgroup, +even though that is not a primary use case. + +Resource pool object is created in following situations. +(a) User sets the limit and no previous resource pool exist for the device +of interest for the cgroup. +(b) No resource limits were configured, but IB/RDMA stack tries to +charge the resource. So that it correctly uncharge them when applications are +running without limits and later on when limits are enforced during uncharging, +otherwise usage count will drop to negative. + +Resource pool is destroyed if all the resource limits are set to max and +it is the last resource getting deallocated. + +User should set all the limit to max value if it intents to remove/unconfigure +the resource pool for a particular device. + +IB stack honors limits enforced by the rdma controller. When application +query about maximum resource limits of IB device, it returns minimum of +what is configured by user for a given cgroup and what is supported by +IB device. + +Following resources can be accounted by rdma controller. + + ========== ============================= + hca_handle Maximum number of HCA Handles + hca_object Maximum number of HCA Objects + ========== ============================= + +2. 
Usage Examples +================= + +(a) Configure resource limit:: + + echo mlx4_0 hca_handle=2 hca_object=2000 > /sys/fs/cgroup/rdma/1/rdma.max + echo ocrdma1 hca_handle=3 > /sys/fs/cgroup/rdma/2/rdma.max + +(b) Query resource limit:: + + cat /sys/fs/cgroup/rdma/2/rdma.max + #Output: + mlx4_0 hca_handle=2 hca_object=2000 + ocrdma1 hca_handle=3 hca_object=max + +(c) Query current usage:: + + cat /sys/fs/cgroup/rdma/2/rdma.current + #Output: + mlx4_0 hca_handle=1 hca_object=20 + ocrdma1 hca_handle=1 hca_object=23 + +(d) Delete resource limit:: + + echo echo mlx4_0 hca_handle=max hca_object=max > /sys/fs/cgroup/rdma/1/rdma.max diff --git a/Documentation/cgroup-v1/rdma.txt b/Documentation/cgroup-v1/rdma.txt deleted file mode 100644 index 9bdb7fd03f83..000000000000 --- a/Documentation/cgroup-v1/rdma.txt +++ /dev/null @@ -1,109 +0,0 @@ - RDMA Controller - ---------------- - -Contents --------- - -1. Overview - 1-1. What is RDMA controller? - 1-2. Why RDMA controller needed? - 1-3. How is RDMA controller implemented? -2. Usage Examples - -1. Overview - -1-1. What is RDMA controller? ------------------------------ - -RDMA controller allows user to limit RDMA/IB specific resources that a given -set of processes can use. These processes are grouped using RDMA controller. - -RDMA controller defines two resources which can be limited for processes of a -cgroup. - -1-2. Why RDMA controller needed? --------------------------------- - -Currently user space applications can easily take away all the rdma verb -specific resources such as AH, CQ, QP, MR etc. Due to which other applications -in other cgroup or kernel space ULPs may not even get chance to allocate any -rdma resources. This can lead to service unavailability. - -Therefore RDMA controller is needed through which resource consumption -of processes can be limited. Through this controller different rdma -resources can be accounted. - -1-3. How is RDMA controller implemented? 
----------------------------------------- - -RDMA cgroup allows limit configuration of resources. Rdma cgroup maintains -resource accounting per cgroup, per device using resource pool structure. -Each such resource pool is limited up to 64 resources in given resource pool -by rdma cgroup, which can be extended later if required. - -This resource pool object is linked to the cgroup css. Typically there -are 0 to 4 resource pool instances per cgroup, per device in most use cases. -But nothing limits to have it more. At present hundreds of RDMA devices per -single cgroup may not be handled optimally, however there is no -known use case or requirement for such configuration either. - -Since RDMA resources can be allocated from any process and can be freed by any -of the child processes which shares the address space, rdma resources are -always owned by the creator cgroup css. This allows process migration from one -to other cgroup without major complexity of transferring resource ownership; -because such ownership is not really present due to shared nature of -rdma resources. Linking resources around css also ensures that cgroups can be -deleted after processes migrated. This allow progress migration as well with -active resources, even though that is not a primary use case. - -Whenever RDMA resource charging occurs, owner rdma cgroup is returned to -the caller. Same rdma cgroup should be passed while uncharging the resource. -This also allows process migrated with active RDMA resource to charge -to new owner cgroup for new resource. It also allows to uncharge resource of -a process from previously charged cgroup which is migrated to new cgroup, -even though that is not a primary use case. - -Resource pool object is created in following situations. -(a) User sets the limit and no previous resource pool exist for the device -of interest for the cgroup. -(b) No resource limits were configured, but IB/RDMA stack tries to -charge the resource. 
So that it correctly uncharge them when applications are -running without limits and later on when limits are enforced during uncharging, -otherwise usage count will drop to negative. - -Resource pool is destroyed if all the resource limits are set to max and -it is the last resource getting deallocated. - -User should set all the limit to max value if it intents to remove/unconfigure -the resource pool for a particular device. - -IB stack honors limits enforced by the rdma controller. When application -query about maximum resource limits of IB device, it returns minimum of -what is configured by user for a given cgroup and what is supported by -IB device. - -Following resources can be accounted by rdma controller. - hca_handle Maximum number of HCA Handles - hca_object Maximum number of HCA Objects - -2. Usage Examples ------------------ - -(a) Configure resource limit: -echo mlx4_0 hca_handle=2 hca_object=2000 > /sys/fs/cgroup/rdma/1/rdma.max -echo ocrdma1 hca_handle=3 > /sys/fs/cgroup/rdma/2/rdma.max - -(b) Query resource limit: -cat /sys/fs/cgroup/rdma/2/rdma.max -#Output: -mlx4_0 hca_handle=2 hca_object=2000 -ocrdma1 hca_handle=3 hca_object=max - -(c) Query current usage: -cat /sys/fs/cgroup/rdma/2/rdma.current -#Output: -mlx4_0 hca_handle=1 hca_object=20 -ocrdma1 hca_handle=1 hca_object=23 - -(d) Delete resource limit: -echo echo mlx4_0 hca_handle=max hca_object=max > /sys/fs/cgroup/rdma/1/rdma.max diff --git a/Documentation/filesystems/tmpfs.txt b/Documentation/filesystems/tmpfs.txt index d06e9a59a9f4..cad797a8a39e 100644 --- a/Documentation/filesystems/tmpfs.txt +++ b/Documentation/filesystems/tmpfs.txt @@ -98,7 +98,7 @@ A memory policy with a valid NodeList will be saved, as specified, for use at file creation time. 
When a task allocates a file in the file system, the mount option memory policy will be applied with a NodeList, if any, modified by the calling task's cpuset constraints -[See Documentation/cgroup-v1/cpusets.txt] and any optional flags, listed +[See Documentation/cgroup-v1/cpusets.rst] and any optional flags, listed below. If the resulting NodeLists is the empty set, the effective memory policy for the file will revert to "default" policy. diff --git a/Documentation/scheduler/sched-deadline.txt b/Documentation/scheduler/sched-deadline.txt index b14e03ff3528..a7514343b660 100644 --- a/Documentation/scheduler/sched-deadline.txt +++ b/Documentation/scheduler/sched-deadline.txt @@ -652,7 +652,7 @@ CONTENTS -deadline tasks cannot have an affinity mask smaller that the entire root_domain they are created on. However, affinities can be specified - through the cpuset facility (Documentation/cgroup-v1/cpusets.txt). + through the cpuset facility (Documentation/cgroup-v1/cpusets.rst). 5.1 SCHED_DEADLINE and cpusets HOWTO ------------------------------------ diff --git a/Documentation/scheduler/sched-design-CFS.txt b/Documentation/scheduler/sched-design-CFS.txt index edd861c94c1b..d1328890ef28 100644 --- a/Documentation/scheduler/sched-design-CFS.txt +++ b/Documentation/scheduler/sched-design-CFS.txt @@ -215,7 +215,7 @@ SCHED_BATCH) tasks. These options need CONFIG_CGROUPS to be defined, and let the administrator create arbitrary groups of tasks, using the "cgroup" pseudo filesystem. See - Documentation/cgroup-v1/cgroups.txt for more information about this filesystem. + Documentation/cgroup-v1/cgroups.rst for more information about this filesystem. When CONFIG_FAIR_GROUP_SCHED is defined, a "cpu.shares" file is created for each group created using the pseudo filesystem. 
See example steps below to create diff --git a/Documentation/scheduler/sched-rt-group.txt b/Documentation/scheduler/sched-rt-group.txt index d8fce3e78457..c09f7a3fee66 100644 --- a/Documentation/scheduler/sched-rt-group.txt +++ b/Documentation/scheduler/sched-rt-group.txt @@ -133,7 +133,7 @@ This uses the cgroup virtual file system and "/cpu.rt_runtime_us" to control the CPU time reserved for each control group. For more information on working with control groups, you should read -Documentation/cgroup-v1/cgroups.txt as well. +Documentation/cgroup-v1/cgroups.rst as well. Group settings are checked against the following limits in order to keep the configuration schedulable: diff --git a/Documentation/vm/numa.rst b/Documentation/vm/numa.rst index 5cae13e9a08b..0d830edae8fe 100644 --- a/Documentation/vm/numa.rst +++ b/Documentation/vm/numa.rst @@ -67,7 +67,7 @@ nodes. Each emulated node will manage a fraction of the underlying cells' physical memory. NUMA emluation is useful for testing NUMA kernel and application features on non-NUMA platforms, and as a sort of memory resource management mechanism when used together with cpusets. -[see Documentation/cgroup-v1/cpusets.txt] +[see Documentation/cgroup-v1/cpusets.rst] For each node with memory, Linux constructs an independent memory management subsystem, complete with its own free page lists, in-use page lists, usage @@ -114,7 +114,7 @@ allocation behavior using Linux NUMA memory policy. [see System administrators can restrict the CPUs and nodes' memories that a non- privileged user can specify in the scheduling or NUMA commands and functions -using control groups and CPUsets. [see Documentation/cgroup-v1/cpusets.txt] +using control groups and CPUsets. [see Documentation/cgroup-v1/cpusets.rst] On architectures that do not hide memoryless nodes, Linux will include only zones [nodes] with memory in the zonelists. 
This means that for a memoryless diff --git a/Documentation/vm/page_migration.rst b/Documentation/vm/page_migration.rst index f68d61335abb..35bba27d5fff 100644 --- a/Documentation/vm/page_migration.rst +++ b/Documentation/vm/page_migration.rst @@ -41,7 +41,7 @@ locations. Larger installations usually partition the system using cpusets into sections of nodes. Paul Jackson has equipped cpusets with the ability to move pages when a task is moved to another cpuset (See -Documentation/cgroup-v1/cpusets.txt). +Documentation/cgroup-v1/cpusets.rst). Cpusets allows the automation of process locality. If a task is moved to a new cpuset then also all its pages are moved with it so that the performance of the process does not sink dramatically. Also the pages diff --git a/Documentation/vm/unevictable-lru.rst b/Documentation/vm/unevictable-lru.rst index b8e29f977f2d..c6d94118fbcc 100644 --- a/Documentation/vm/unevictable-lru.rst +++ b/Documentation/vm/unevictable-lru.rst @@ -98,7 +98,7 @@ Memory Control Group Interaction -------------------------------- The unevictable LRU facility interacts with the memory control group [aka -memory controller; see Documentation/cgroup-v1/memory.txt] by extending the +memory controller; see Documentation/cgroup-v1/memory.rst] by extending the lru_list enum. The memory controller data structure automatically gets a per-zone unevictable diff --git a/Documentation/x86/x86_64/fake-numa-for-cpusets.rst b/Documentation/x86/x86_64/fake-numa-for-cpusets.rst index 74fbb78b3c67..a6926cd40f70 100644 --- a/Documentation/x86/x86_64/fake-numa-for-cpusets.rst +++ b/Documentation/x86/x86_64/fake-numa-for-cpusets.rst @@ -15,7 +15,7 @@ assign them to cpusets and their attached tasks. This is a way of limiting the amount of system memory that are available to a certain class of tasks. For more information on the features of cpusets, see -Documentation/cgroup-v1/cpusets.txt. +Documentation/cgroup-v1/cpusets.rst. 
There are a number of different configurations you can use for your needs. For more information on the numa=fake command line option and its various ways of configuring fake nodes, see Documentation/x86/x86_64/boot-options.txt. @@ -40,7 +40,7 @@ A machine may be split as follows with "numa=fake=4*512," as reported by dmesg:: On node 3 totalpages: 131072 Now following the instructions for mounting the cpusets filesystem from -Documentation/cgroup-v1/cpusets.txt, you can assign fake nodes (i.e. contiguous memory +Documentation/cgroup-v1/cpusets.rst, you can assign fake nodes (i.e. contiguous memory address spaces) to individual cpusets:: [root@xroads /]# mkdir exampleset diff --git a/MAINTAINERS b/MAINTAINERS index 429c6c624861..b8663911779a 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -4094,7 +4094,7 @@ W: http://www.bullopensource.org/cpuset/ W: http://oss.sgi.com/projects/cpusets/ T: git git://git.kernel.org/pub/scm/linux/kernel/git/tj/cgroup.git S: Maintained -F: Documentation/cgroup-v1/cpusets.txt +F: Documentation/cgroup-v1/cpusets.rst F: include/linux/cpuset.h F: kernel/cgroup/cpuset.c diff --git a/block/Kconfig b/block/Kconfig index 1b220101a9cb..78374cb03114 100644 --- a/block/Kconfig +++ b/block/Kconfig @@ -88,7 +88,7 @@ config BLK_DEV_THROTTLING one needs to mount and use blkio cgroup controller for creating cgroups and specifying per device IO rate policies. - See Documentation/cgroup-v1/blkio-controller.txt for more information. + See Documentation/cgroup-v1/blkio-controller.rst for more information. config BLK_DEV_THROTTLING_LOW bool "Block throttling .low limit interface support (EXPERIMENTAL)" diff --git a/include/linux/cgroup-defs.h b/include/linux/cgroup-defs.h index 1615b9c17e02..a3699d4d27e0 100644 --- a/include/linux/cgroup-defs.h +++ b/include/linux/cgroup-defs.h @@ -619,7 +619,7 @@ struct cftype { /* * Control Group subsystem type. 
- * See Documentation/cgroup-v1/cgroups.txt for details + * See Documentation/cgroup-v1/cgroups.rst for details */ struct cgroup_subsys { struct cgroup_subsys_state *(*css_alloc)(struct cgroup_subsys_state *parent_css); diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index 63e0cf66f01a..79e080ea71d8 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -783,7 +783,7 @@ union bpf_attr { * based on a user-provided identifier for all traffic coming from * the tasks belonging to the related cgroup. See also the related * kernel documentation, available from the Linux sources in file - * *Documentation/cgroup-v1/net_cls.txt*. + * *Documentation/cgroup-v1/net_cls.rst*. * * The Linux kernel has two versions for cgroups: there are * cgroups v1 and cgroups v2. Both are available to users, who can diff --git a/init/Kconfig b/init/Kconfig index 36894c9fb420..5d4bf0f676e9 100644 --- a/init/Kconfig +++ b/init/Kconfig @@ -798,7 +798,7 @@ config BLK_CGROUP CONFIG_CFQ_GROUP_IOSCHED=y; for enabling throttling policy, set CONFIG_BLK_DEV_THROTTLING=y. - See Documentation/cgroup-v1/blkio-controller.txt for more information. + See Documentation/cgroup-v1/blkio-controller.rst for more information. config DEBUG_BLK_CGROUP bool "IO controller debugging" diff --git a/kernel/cgroup/cpuset.c b/kernel/cgroup/cpuset.c index 6a1942ed781c..fc6668f9db15 100644 --- a/kernel/cgroup/cpuset.c +++ b/kernel/cgroup/cpuset.c @@ -729,7 +729,7 @@ static inline int nr_cpusets(void) * load balancing domains (sched domains) as specified by that partial * partition. * - * See "What is sched_load_balance" in Documentation/cgroup-v1/cpusets.txt + * See "What is sched_load_balance" in Documentation/cgroup-v1/cpusets.rst * for a background explanation of this. 
* * Does not return errors, on the theory that the callers of this diff --git a/security/device_cgroup.c b/security/device_cgroup.c index dc28914fa72e..c07196502577 100644 --- a/security/device_cgroup.c +++ b/security/device_cgroup.c @@ -509,7 +509,7 @@ static inline int may_allow_all(struct dev_cgroup *parent) * This is one of the three key functions for hierarchy implementation. * This function is responsible for re-evaluating all the cgroup's active * exceptions due to a parent's exception change. - * Refer to Documentation/cgroup-v1/devices.txt for more details. + * Refer to Documentation/cgroup-v1/devices.rst for more details. */ static void revalidate_active_exceptions(struct dev_cgroup *devcg) { diff --git a/tools/include/uapi/linux/bpf.h b/tools/include/uapi/linux/bpf.h index 63e0cf66f01a..79e080ea71d8 100644 --- a/tools/include/uapi/linux/bpf.h +++ b/tools/include/uapi/linux/bpf.h @@ -783,7 +783,7 @@ union bpf_attr { * based on a user-provided identifier for all traffic coming from * the tasks belonging to the related cgroup. See also the related * kernel documentation, available from the Linux sources in file - * *Documentation/cgroup-v1/net_cls.txt*. + * *Documentation/cgroup-v1/net_cls.rst*. * * The Linux kernel has two versions for cgroups: there are * cgroups v1 and cgroups v2. Both are available to users, who can -- cgit v1.2.3-71-gd317 From fb85c4a730af221339c1dde1a434b73da0dfc3ed Mon Sep 17 00:00:00 2001 From: Stanislav Fomichev Date: Wed, 12 Jun 2019 10:30:37 -0700 Subject: bpf: export bpf_sock for BPF_PROG_TYPE_CGROUP_SOCK_ADDR prog type And let it use bpf_sk_storage_{get,delete} helpers to access socket storage. Kernel context (struct bpf_sock_addr_kern) already has sk member, so I just expose it to the BPF hooks. Using PTR_TO_SOCKET instead of PTR_TO_SOCK_COMMON should be safe because the hook is called on bind/connect. 
Cc: Martin Lau Signed-off-by: Stanislav Fomichev Signed-off-by: Daniel Borkmann --- include/uapi/linux/bpf.h | 1 + net/core/filter.c | 16 ++++++++++++++++ 2 files changed, 17 insertions(+) (limited to 'include/uapi/linux') diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index ae0907d8c03a..8815fc418cde 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -3247,6 +3247,7 @@ struct bpf_sock_addr { __u32 msg_src_ip6[4]; /* Allows 1,2,4-byte read an 4-byte write. * Stored in network byte order. */ + __bpf_md_ptr(struct bpf_sock *, sk); }; /* User bpf_sock_ops struct to access socket values and specify request ops diff --git a/net/core/filter.c b/net/core/filter.c index a5e4ac7fcbe5..37c4a2fd559b 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -5922,6 +5922,10 @@ sock_addr_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) case BPF_FUNC_skc_lookup_tcp: return &bpf_sock_addr_skc_lookup_tcp_proto; #endif /* CONFIG_INET */ + case BPF_FUNC_sk_storage_get: + return &bpf_sk_storage_get_proto; + case BPF_FUNC_sk_storage_delete: + return &bpf_sk_storage_delete_proto; default: return bpf_base_func_proto(func_id); } @@ -6828,6 +6832,13 @@ static bool sock_addr_is_valid_access(int off, int size, if (size != size_default) return false; break; + case offsetof(struct bpf_sock_addr, sk): + if (type != BPF_READ) + return false; + if (size != sizeof(__u64)) + return false; + info->reg_type = PTR_TO_SOCKET; + break; default: if (type == BPF_READ) { if (size != size_default) @@ -7778,6 +7789,11 @@ static u32 sock_addr_convert_ctx_access(enum bpf_access_type type, struct bpf_sock_addr_kern, struct in6_addr, t_ctx, s6_addr32[0], BPF_SIZE(si->code), off, tmp_reg); break; + case offsetof(struct bpf_sock_addr, sk): + *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct bpf_sock_addr_kern, sk), + si->dst_reg, si->src_reg, + offsetof(struct bpf_sock_addr_kern, sk)); + break; } return insn - insn_buf; -- cgit v1.2.3-71-gd317 From 
1314ef561102e534e14cb1d37f89f5c1df0b2ea7 Mon Sep 17 00:00:00 2001 From: Stanislav Fomichev Date: Wed, 12 Jun 2019 10:30:38 -0700 Subject: bpf: export bpf_sock for BPF_PROG_TYPE_SOCK_OPS prog type And let it use bpf_sk_storage_{get,delete} helpers to access socket storage. Kernel context (struct bpf_sock_ops_kern) already has sk member, so I just expose it to the BPF hooks. I use PTR_TO_SOCKET_OR_NULL and return NULL in !is_fullsock case. I also export bpf_tcp_sock to make it possible to access tcp socket stats. Cc: Martin Lau Signed-off-by: Stanislav Fomichev Signed-off-by: Daniel Borkmann --- include/uapi/linux/bpf.h | 1 + net/core/filter.c | 26 ++++++++++++++++++++++++++ 2 files changed, 27 insertions(+) (limited to 'include/uapi/linux') diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index 8815fc418cde..d0a23476f887 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -3299,6 +3299,7 @@ struct bpf_sock_ops { __u32 sk_txhash; __u64 bytes_received; __u64 bytes_acked; + __bpf_md_ptr(struct bpf_sock *, sk); }; /* Definitions for bpf_sock_ops_cb_flags */ diff --git a/net/core/filter.c b/net/core/filter.c index 37c4a2fd559b..8c18f2781afa 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -6147,6 +6147,14 @@ sock_ops_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) return &bpf_get_local_storage_proto; case BPF_FUNC_perf_event_output: return &bpf_sockopt_event_output_proto; + case BPF_FUNC_sk_storage_get: + return &bpf_sk_storage_get_proto; + case BPF_FUNC_sk_storage_delete: + return &bpf_sk_storage_delete_proto; +#ifdef CONFIG_INET + case BPF_FUNC_tcp_sock: + return &bpf_tcp_sock_proto; +#endif /* CONFIG_INET */ default: return bpf_base_func_proto(func_id); } @@ -6882,6 +6890,11 @@ static bool sock_ops_is_valid_access(int off, int size, if (size != sizeof(__u64)) return false; break; + case offsetof(struct bpf_sock_ops, sk): + if (size != sizeof(__u64)) + return false; + info->reg_type = PTR_TO_SOCKET_OR_NULL; 
+ break; default: if (size != size_default) return false; @@ -8053,6 +8066,19 @@ static u32 sock_ops_convert_ctx_access(enum bpf_access_type type, SOCK_OPS_GET_OR_SET_FIELD(sk_txhash, sk_txhash, struct sock, type); break; + case offsetof(struct bpf_sock_ops, sk): + *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF( + struct bpf_sock_ops_kern, + is_fullsock), + si->dst_reg, si->src_reg, + offsetof(struct bpf_sock_ops_kern, + is_fullsock)); + *insn++ = BPF_JMP_IMM(BPF_JEQ, si->dst_reg, 0, 1); + *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF( + struct bpf_sock_ops_kern, sk), + si->dst_reg, si->src_reg, + offsetof(struct bpf_sock_ops_kern, sk)); + break; } return insn - insn_buf; } -- cgit v1.2.3-71-gd317 From d5470d14431e9d39ee2131323589afac2a0bfee4 Mon Sep 17 00:00:00 2001 From: Masahiro Yamada Date: Tue, 4 Jun 2019 19:14:03 +0900 Subject: kbuild: re-implement Makefile.headersinst without recursion Since commit fcc8487d477a ("uapi: export all headers under uapi directories"), the headers in uapi directories are all exported by default although exceptional cases are still allowed by the syntax 'no-export-headers'. The traditional directory descending has been kept (in a somewhat hacky way), but it is actually unneeded. Get rid of it to simplify the code. Also, handle files one by one instead of the previous per-directory processing. This will emit much more log, but I like it. 
Signed-off-by: Masahiro Yamada --- Makefile | 8 +-- include/uapi/Kbuild | 14 +++++ include/uapi/linux/Kbuild | 14 ----- scripts/Makefile.headersinst | 132 ++++++++++++++++++------------------------- 4 files changed, 73 insertions(+), 95 deletions(-) create mode 100644 include/uapi/Kbuild delete mode 100644 include/uapi/linux/Kbuild (limited to 'include/uapi/linux') diff --git a/Makefile b/Makefile index 11445a7dd820..507ce2c92065 100644 --- a/Makefile +++ b/Makefile @@ -1194,13 +1194,13 @@ PHONY += headers headers: $(version_h) scripts_unifdef uapi-asm-generic archheaders archscripts $(if $(wildcard $(srctree)/arch/$(SRCARCH)/include/uapi/asm/Kbuild),, \ $(error Headers not exportable for the $(SRCARCH) architecture)) - $(Q)$(MAKE) $(hdr-inst)=include/uapi dst=include - $(Q)$(MAKE) $(hdr-inst)=arch/$(SRCARCH)/include/uapi dst=include + $(Q)$(MAKE) $(hdr-inst)=include/uapi + $(Q)$(MAKE) $(hdr-inst)=arch/$(SRCARCH)/include/uapi PHONY += headers_check headers_check: headers - $(Q)$(MAKE) $(hdr-inst)=include/uapi dst=include HDRCHECK=1 - $(Q)$(MAKE) $(hdr-inst)=arch/$(SRCARCH)/include/uapi dst=include HDRCHECK=1 + $(Q)$(MAKE) $(hdr-inst)=include/uapi HDRCHECK=1 + $(Q)$(MAKE) $(hdr-inst)=arch/$(SRCARCH)/include/uapi HDRCHECK=1 ifdef CONFIG_HEADERS_INSTALL prepare: headers diff --git a/include/uapi/Kbuild b/include/uapi/Kbuild new file mode 100644 index 000000000000..61ee6e59c930 --- /dev/null +++ b/include/uapi/Kbuild @@ -0,0 +1,14 @@ +# SPDX-License-Identifier: GPL-2.0 +ifeq ($(wildcard $(srctree)/arch/$(SRCARCH)/include/uapi/asm/a.out.h),) +no-export-headers += linux/a.out.h +endif + +ifeq ($(wildcard $(srctree)/arch/$(SRCARCH)/include/uapi/asm/kvm.h),) +no-export-headers += linux/kvm.h +endif + +ifeq ($(wildcard $(srctree)/arch/$(SRCARCH)/include/uapi/asm/kvm_para.h),) +ifeq ($(wildcard $(objtree)/arch/$(SRCARCH)/include/generated/uapi/asm/kvm_para.h),) +no-export-headers += linux/kvm_para.h +endif +endif diff --git a/include/uapi/linux/Kbuild 
b/include/uapi/linux/Kbuild deleted file mode 100644 index 34711c5d6968..000000000000 --- a/include/uapi/linux/Kbuild +++ /dev/null @@ -1,14 +0,0 @@ -# SPDX-License-Identifier: GPL-2.0 -ifeq ($(wildcard $(srctree)/arch/$(SRCARCH)/include/uapi/asm/a.out.h),) -no-export-headers += a.out.h -endif - -ifeq ($(wildcard $(srctree)/arch/$(SRCARCH)/include/uapi/asm/kvm.h),) -no-export-headers += kvm.h -endif - -ifeq ($(wildcard $(srctree)/arch/$(SRCARCH)/include/uapi/asm/kvm_para.h),) -ifeq ($(wildcard $(objtree)/arch/$(SRCARCH)/include/generated/uapi/asm/kvm_para.h),) -no-export-headers += kvm_para.h -endif -endif diff --git a/scripts/Makefile.headersinst b/scripts/Makefile.headersinst index 1af6d0b06585..c96c4c26e240 100644 --- a/scripts/Makefile.headersinst +++ b/scripts/Makefile.headersinst @@ -14,109 +14,87 @@ __headers: include scripts/Kbuild.include -srcdir := $(srctree)/$(obj) +src := $(srctree)/$(obj) +gen := $(objtree)/$(subst include/,include/generated/,$(obj)) +dst := usr/include -# When make is run under a fakechroot environment, the function -# $(wildcard $(srcdir)/*/.) doesn't only return directories, but also regular -# files. So, we are using a combination of sort/dir/wildcard which works -# with fakechroot. 
-subdirs := $(patsubst $(srcdir)/%/,%,\ - $(filter-out $(srcdir)/,\ - $(sort $(dir $(wildcard $(srcdir)/*/))))) +-include $(src)/Kbuild -# Recursion -__headers: $(subdirs) +src-subdirs := $(patsubst $(src)/%/,%,$(wildcard $(src)/*/)) +gen-subdirs := $(patsubst $(gen)/%/,%,$(wildcard $(gen)/*/)) +all-subdirs := $(sort $(src-subdirs) $(gen-subdirs)) -PHONY += $(subdirs) -$(subdirs): - $(Q)$(MAKE) $(hdr-inst)=$(obj)/$@ dst=$(dst)/$@ +src-headers := $(if $(src-subdirs), $(shell cd $(src) && find $(src-subdirs) -name '*.h')) +src-headers := $(filter-out $(no-export-headers), $(src-headers)) +gen-headers := $(if $(gen-subdirs), $(shell cd $(gen) && find $(gen-subdirs) -name '*.h')) +gen-headers := $(filter-out $(no-export-headers), $(gen-headers)) -# Skip header install/check for include/uapi and arch/$(SRCARCH)/include/uapi. -# We have only sub-directories there. -skip-inst := $(if $(filter %/uapi,$(obj)),1) +# If the same header is exported from source and generated directories, +# the former takes precedence, but this should be warned. 
+duplicated := $(filter $(gen-headers), $(src-headers)) +$(if $(duplicated), $(warning duplicated header export: $(duplicated))) -ifeq ($(skip-inst),) +gen-headers := $(filter-out $(duplicated), $(gen-headers)) -# Kbuild file is optional -kbuild-file := $(srctree)/$(obj)/Kbuild --include $(kbuild-file) +# Add dst path prefix +all-subdirs := $(addprefix $(dst)/, $(all-subdirs)) +src-headers := $(addprefix $(dst)/, $(src-headers)) +gen-headers := $(addprefix $(dst)/, $(gen-headers)) +all-headers := $(src-headers) $(gen-headers) -installdir := usr/$(dst) -gendir := $(objtree)/$(subst include/,include/generated/,$(obj)) -header-files := $(notdir $(wildcard $(srcdir)/*.h)) -header-files := $(filter-out $(no-export-headers), $(header-files)) -genhdr-files := $(notdir $(wildcard $(gendir)/*.h)) -genhdr-files := $(filter-out $(header-files), $(genhdr-files)) +# Work out what needs to be removed +old-subdirs := $(wildcard $(all-subdirs)) +old-headers := $(if $(old-subdirs),$(shell find $(old-subdirs) -name '*.h')) +unwanted := $(filter-out $(all-headers), $(old-headers)) -# files used to track state of install/check -install-file := $(installdir)/.install -check-file := $(installdir)/.check +# Create directories +existing-dirs := $(sort $(dir $(old-headers))) +wanted-dirs := $(sort $(dir $(all-headers))) +new-dirs := $(filter-out $(existing-dirs), $(wanted-dirs)) +$(if $(new-dirs), $(shell mkdir -p $(new-dirs))) -# all headers files for this dir -all-files := $(header-files) $(genhdr-files) -output-files := $(addprefix $(installdir)/, $(all-files)) +# Rules -# Work out what needs to be removed -oldheaders := $(patsubst $(installdir)/%,%,$(wildcard $(installdir)/*.h)) -unwanted := $(filter-out $(all-files),$(oldheaders)) +ifndef HDRCHECK -# Prefix unwanted with full paths to objtree -unwanted-file := $(addprefix $(installdir)/, $(unwanted)) +quiet_cmd_install = HDRINST $@ + cmd_install = $(CONFIG_SHELL) $(srctree)/scripts/headers_install.sh $(@D) $( Date: Sat, 15 Jun 2019 
11:03:49 +0200 Subject: net: sched: remove NET_CLS_IND config option This config option makes only couple of lines optional. Two small helpers and an int in couple of cls structs. Remove the config option and always compile this in. This saves the user from unexpected surprises when he adds a filter with ingress device match which is silently ignored in case the config option is not set. Signed-off-by: Jiri Pirko Signed-off-by: David S. Miller --- arch/mips/configs/malta_defconfig | 1 - arch/mips/configs/malta_kvm_defconfig | 1 - arch/mips/configs/malta_kvm_guest_defconfig | 1 - arch/mips/configs/malta_qemu_32r6_defconfig | 1 - arch/mips/configs/maltaaprp_defconfig | 1 - arch/mips/configs/maltasmvp_defconfig | 1 - arch/mips/configs/maltasmvp_eva_defconfig | 1 - arch/mips/configs/maltaup_defconfig | 1 - arch/mips/configs/maltaup_xpa_defconfig | 1 - arch/mips/configs/rb532_defconfig | 1 - arch/powerpc/configs/ppc6xx_defconfig | 1 - arch/sh/configs/se7712_defconfig | 1 - arch/sh/configs/se7721_defconfig | 1 - arch/sh/configs/titan_defconfig | 1 - include/net/pkt_cls.h | 5 +---- include/uapi/linux/pkt_cls.h | 2 +- net/sched/Kconfig | 8 -------- net/sched/cls_flower.c | 3 +-- net/sched/cls_fw.c | 13 ------------- net/sched/cls_u32.c | 15 --------------- tools/include/uapi/linux/pkt_cls.h | 2 +- tools/testing/selftests/tc-testing/config | 1 - 22 files changed, 4 insertions(+), 59 deletions(-) (limited to 'include/uapi/linux') diff --git a/arch/mips/configs/malta_defconfig b/arch/mips/configs/malta_defconfig index 0ee5e677662e..0de92ac1ca64 100644 --- a/arch/mips/configs/malta_defconfig +++ b/arch/mips/configs/malta_defconfig @@ -210,7 +210,6 @@ CONFIG_NET_ACT_NAT=m CONFIG_NET_ACT_PEDIT=m CONFIG_NET_ACT_SIMP=m CONFIG_NET_ACT_SKBEDIT=m -CONFIG_NET_CLS_IND=y CONFIG_CFG80211=m CONFIG_MAC80211=m CONFIG_MAC80211_MESH=y diff --git a/arch/mips/configs/malta_kvm_defconfig b/arch/mips/configs/malta_kvm_defconfig index 041bffac043b..efc3abace048 100644 --- 
a/arch/mips/configs/malta_kvm_defconfig +++ b/arch/mips/configs/malta_kvm_defconfig @@ -215,7 +215,6 @@ CONFIG_NET_ACT_NAT=m CONFIG_NET_ACT_PEDIT=m CONFIG_NET_ACT_SIMP=m CONFIG_NET_ACT_SKBEDIT=m -CONFIG_NET_CLS_IND=y CONFIG_CFG80211=m CONFIG_MAC80211=m CONFIG_MAC80211_MESH=y diff --git a/arch/mips/configs/malta_kvm_guest_defconfig b/arch/mips/configs/malta_kvm_guest_defconfig index 511065e62182..c6ceeca4394d 100644 --- a/arch/mips/configs/malta_kvm_guest_defconfig +++ b/arch/mips/configs/malta_kvm_guest_defconfig @@ -212,7 +212,6 @@ CONFIG_NET_ACT_NAT=m CONFIG_NET_ACT_PEDIT=m CONFIG_NET_ACT_SIMP=m CONFIG_NET_ACT_SKBEDIT=m -CONFIG_NET_CLS_IND=y CONFIG_CFG80211=m CONFIG_MAC80211=m CONFIG_MAC80211_MESH=y diff --git a/arch/mips/configs/malta_qemu_32r6_defconfig b/arch/mips/configs/malta_qemu_32r6_defconfig index 299088043164..e6c600dc1814 100644 --- a/arch/mips/configs/malta_qemu_32r6_defconfig +++ b/arch/mips/configs/malta_qemu_32r6_defconfig @@ -74,7 +74,6 @@ CONFIG_NET_CLS_RSVP=m CONFIG_NET_CLS_RSVP6=m CONFIG_NET_CLS_ACT=y CONFIG_NET_ACT_POLICE=y -CONFIG_NET_CLS_IND=y # CONFIG_WIRELESS is not set CONFIG_DEVTMPFS=y CONFIG_BLK_DEV_LOOP=y diff --git a/arch/mips/configs/maltaaprp_defconfig b/arch/mips/configs/maltaaprp_defconfig index 2b4b3a24f637..82b44b774553 100644 --- a/arch/mips/configs/maltaaprp_defconfig +++ b/arch/mips/configs/maltaaprp_defconfig @@ -76,7 +76,6 @@ CONFIG_NET_CLS_RSVP=m CONFIG_NET_CLS_RSVP6=m CONFIG_NET_CLS_ACT=y CONFIG_NET_ACT_POLICE=y -CONFIG_NET_CLS_IND=y # CONFIG_WIRELESS is not set CONFIG_DEVTMPFS=y CONFIG_BLK_DEV_LOOP=y diff --git a/arch/mips/configs/maltasmvp_defconfig b/arch/mips/configs/maltasmvp_defconfig index 425ddfd7cd78..4190fc6189a0 100644 --- a/arch/mips/configs/maltasmvp_defconfig +++ b/arch/mips/configs/maltasmvp_defconfig @@ -77,7 +77,6 @@ CONFIG_NET_CLS_RSVP=m CONFIG_NET_CLS_RSVP6=m CONFIG_NET_CLS_ACT=y CONFIG_NET_ACT_POLICE=y -CONFIG_NET_CLS_IND=y # CONFIG_WIRELESS is not set CONFIG_DEVTMPFS=y CONFIG_BLK_DEV_LOOP=y diff --git 
a/arch/mips/configs/maltasmvp_eva_defconfig b/arch/mips/configs/maltasmvp_eva_defconfig index 8beaa7ba1e52..a13c10e910ec 100644 --- a/arch/mips/configs/maltasmvp_eva_defconfig +++ b/arch/mips/configs/maltasmvp_eva_defconfig @@ -78,7 +78,6 @@ CONFIG_NET_CLS_RSVP=m CONFIG_NET_CLS_RSVP6=m CONFIG_NET_CLS_ACT=y CONFIG_NET_ACT_POLICE=y -CONFIG_NET_CLS_IND=y # CONFIG_WIRELESS is not set CONFIG_DEVTMPFS=y CONFIG_BLK_DEV_LOOP=y diff --git a/arch/mips/configs/maltaup_defconfig b/arch/mips/configs/maltaup_defconfig index 6e8b95ceb54a..b35f1fc690fb 100644 --- a/arch/mips/configs/maltaup_defconfig +++ b/arch/mips/configs/maltaup_defconfig @@ -75,7 +75,6 @@ CONFIG_NET_CLS_RSVP=m CONFIG_NET_CLS_RSVP6=m CONFIG_NET_CLS_ACT=y CONFIG_NET_ACT_POLICE=y -CONFIG_NET_CLS_IND=y # CONFIG_WIRELESS is not set CONFIG_DEVTMPFS=y CONFIG_BLK_DEV_LOOP=y diff --git a/arch/mips/configs/maltaup_xpa_defconfig b/arch/mips/configs/maltaup_xpa_defconfig index 6c026db96ff9..56861aef2756 100644 --- a/arch/mips/configs/maltaup_xpa_defconfig +++ b/arch/mips/configs/maltaup_xpa_defconfig @@ -212,7 +212,6 @@ CONFIG_NET_ACT_NAT=m CONFIG_NET_ACT_PEDIT=m CONFIG_NET_ACT_SIMP=m CONFIG_NET_ACT_SKBEDIT=m -CONFIG_NET_CLS_IND=y CONFIG_CFG80211=m CONFIG_MAC80211=m CONFIG_MAC80211_MESH=y diff --git a/arch/mips/configs/rb532_defconfig b/arch/mips/configs/rb532_defconfig index 50632a3103dd..864c70fbe668 100644 --- a/arch/mips/configs/rb532_defconfig +++ b/arch/mips/configs/rb532_defconfig @@ -103,7 +103,6 @@ CONFIG_GACT_PROB=y CONFIG_NET_ACT_MIRRED=m CONFIG_NET_ACT_IPT=m CONFIG_NET_ACT_PEDIT=m -CONFIG_NET_CLS_IND=y CONFIG_HAMRADIO=y CONFIG_UEVENT_HELPER_PATH="/sbin/hotplug" CONFIG_MTD=y diff --git a/arch/powerpc/configs/ppc6xx_defconfig b/arch/powerpc/configs/ppc6xx_defconfig index 7c6baf6df139..aa51b9b66fa2 100644 --- a/arch/powerpc/configs/ppc6xx_defconfig +++ b/arch/powerpc/configs/ppc6xx_defconfig @@ -301,7 +301,6 @@ CONFIG_NET_ACT_NAT=m CONFIG_NET_ACT_PEDIT=m CONFIG_NET_ACT_SIMP=m CONFIG_NET_ACT_SKBEDIT=m 
-CONFIG_NET_CLS_IND=y CONFIG_IRDA=m CONFIG_IRLAN=m CONFIG_IRNET=m diff --git a/arch/sh/configs/se7712_defconfig b/arch/sh/configs/se7712_defconfig index 5a1097641247..1e116529735f 100644 --- a/arch/sh/configs/se7712_defconfig +++ b/arch/sh/configs/se7712_defconfig @@ -63,7 +63,6 @@ CONFIG_NET_SCH_NETEM=y CONFIG_NET_CLS_TCINDEX=y CONFIG_NET_CLS_ROUTE4=y CONFIG_NET_CLS_FW=y -CONFIG_NET_CLS_IND=y CONFIG_UEVENT_HELPER_PATH="/sbin/hotplug" CONFIG_MTD=y CONFIG_MTD_BLOCK=y diff --git a/arch/sh/configs/se7721_defconfig b/arch/sh/configs/se7721_defconfig index 9c0ef13bee10..c66e512719ab 100644 --- a/arch/sh/configs/se7721_defconfig +++ b/arch/sh/configs/se7721_defconfig @@ -62,7 +62,6 @@ CONFIG_NET_SCH_NETEM=y CONFIG_NET_CLS_TCINDEX=y CONFIG_NET_CLS_ROUTE4=y CONFIG_NET_CLS_FW=y -CONFIG_NET_CLS_IND=y CONFIG_UEVENT_HELPER_PATH="/sbin/hotplug" CONFIG_MTD=y CONFIG_MTD_BLOCK=y diff --git a/arch/sh/configs/titan_defconfig b/arch/sh/configs/titan_defconfig index 822fa9e96f74..171ab05ce4fc 100644 --- a/arch/sh/configs/titan_defconfig +++ b/arch/sh/configs/titan_defconfig @@ -142,7 +142,6 @@ CONFIG_GACT_PROB=y CONFIG_NET_ACT_MIRRED=m CONFIG_NET_ACT_IPT=m CONFIG_NET_ACT_PEDIT=m -CONFIG_NET_CLS_IND=y CONFIG_UEVENT_HELPER_PATH="/sbin/hotplug" CONFIG_FW_LOADER=m CONFIG_CONNECTOR=m diff --git a/include/net/pkt_cls.h b/include/net/pkt_cls.h index 514e3c80ecc1..720f2b32fc2f 100644 --- a/include/net/pkt_cls.h +++ b/include/net/pkt_cls.h @@ -7,6 +7,7 @@ #include #include #include +#include /* TC action not accessible from user space */ #define TC_ACT_REINSERT (TC_ACT_VALUE_MAX + 1) @@ -576,9 +577,6 @@ static inline int tcf_valid_offset(const struct sk_buff *skb, (ptr <= (ptr + len))); } -#ifdef CONFIG_NET_CLS_IND -#include - static inline int tcf_change_indev(struct net *net, struct nlattr *indev_tlv, struct netlink_ext_ack *extack) @@ -605,7 +603,6 @@ tcf_match_indev(struct sk_buff *skb, int ifindex) return false; return ifindex == skb->skb_iif; } -#endif /* CONFIG_NET_CLS_IND */ int 
tc_setup_flow_action(struct flow_action *flow_action, const struct tcf_exts *exts); diff --git a/include/uapi/linux/pkt_cls.h b/include/uapi/linux/pkt_cls.h index a93680fc4bfa..8cc6b6777b3c 100644 --- a/include/uapi/linux/pkt_cls.h +++ b/include/uapi/linux/pkt_cls.h @@ -295,7 +295,7 @@ enum { TCA_FW_UNSPEC, TCA_FW_CLASSID, TCA_FW_POLICE, - TCA_FW_INDEV, /* used by CONFIG_NET_CLS_IND */ + TCA_FW_INDEV, TCA_FW_ACT, /* used by CONFIG_NET_CLS_ACT */ TCA_FW_MASK, __TCA_FW_MAX diff --git a/net/sched/Kconfig b/net/sched/Kconfig index d104f7ee26c7..360fdd3eaa77 100644 --- a/net/sched/Kconfig +++ b/net/sched/Kconfig @@ -941,14 +941,6 @@ config NET_IFE_SKBTCINDEX tristate "Support to encoding decoding skb tcindex on IFE action" depends on NET_ACT_IFE -config NET_CLS_IND - bool "Incoming device classification" - depends on NET_CLS_U32 || NET_CLS_FW - ---help--- - Say Y here to extend the u32 and fw classifier to support - classification based on the incoming device. This option is - likely to disappear in favour of the metadata ematch. 
- endif # NET_SCHED config NET_SCH_FIFO diff --git a/net/sched/cls_flower.c b/net/sched/cls_flower.c index c388372df0e2..84c7f279855b 100644 --- a/net/sched/cls_flower.c +++ b/net/sched/cls_flower.c @@ -1010,7 +1010,7 @@ static int fl_set_key(struct net *net, struct nlattr **tb, { __be16 ethertype; int ret = 0; -#ifdef CONFIG_NET_CLS_IND + if (tb[TCA_FLOWER_INDEV]) { int err = tcf_change_indev(net, tb[TCA_FLOWER_INDEV], extack); if (err < 0) @@ -1018,7 +1018,6 @@ static int fl_set_key(struct net *net, struct nlattr **tb, key->indev_ifindex = err; mask->indev_ifindex = 0xffffffff; } -#endif fl_set_key_val(tb, key->eth.dst, TCA_FLOWER_KEY_ETH_DST, mask->eth.dst, TCA_FLOWER_KEY_ETH_DST_MASK, diff --git a/net/sched/cls_fw.c b/net/sched/cls_fw.c index 4dab833f66cb..c9496c920d6f 100644 --- a/net/sched/cls_fw.c +++ b/net/sched/cls_fw.c @@ -8,9 +8,6 @@ * Karlis Peisenieks : 990415 : fw_walk off by one * Karlis Peisenieks : 990415 : fw_delete killed all the filter (and kernel). * Alex : 2004xxyy: Added Action extension - * - * JHS: We should remove the CONFIG_NET_CLS_IND from here - * eventually when the meta match extension is made available */ #include @@ -37,9 +34,7 @@ struct fw_filter { struct fw_filter __rcu *next; u32 id; struct tcf_result res; -#ifdef CONFIG_NET_CLS_IND int ifindex; -#endif /* CONFIG_NET_CLS_IND */ struct tcf_exts exts; struct tcf_proto *tp; struct rcu_work rwork; @@ -67,10 +62,8 @@ static int fw_classify(struct sk_buff *skb, const struct tcf_proto *tp, f = rcu_dereference_bh(f->next)) { if (f->id == id) { *res = f->res; -#ifdef CONFIG_NET_CLS_IND if (!tcf_match_indev(skb, f->ifindex)) continue; -#endif /* CONFIG_NET_CLS_IND */ r = tcf_exts_exec(skb, &f->exts, res); if (r < 0) continue; @@ -222,7 +215,6 @@ static int fw_set_parms(struct net *net, struct tcf_proto *tp, tcf_bind_filter(tp, &f->res, base); } -#ifdef CONFIG_NET_CLS_IND if (tb[TCA_FW_INDEV]) { int ret; ret = tcf_change_indev(net, tb[TCA_FW_INDEV], extack); @@ -230,7 +222,6 @@ static int 
fw_set_parms(struct net *net, struct tcf_proto *tp, return ret; f->ifindex = ret; } -#endif /* CONFIG_NET_CLS_IND */ err = -EINVAL; if (tb[TCA_FW_MASK]) { @@ -276,9 +267,7 @@ static int fw_change(struct net *net, struct sk_buff *in_skb, fnew->id = f->id; fnew->res = f->res; -#ifdef CONFIG_NET_CLS_IND fnew->ifindex = f->ifindex; -#endif /* CONFIG_NET_CLS_IND */ fnew->tp = f->tp; err = tcf_exts_init(&fnew->exts, net, TCA_FW_ACT, @@ -405,14 +394,12 @@ static int fw_dump(struct net *net, struct tcf_proto *tp, void *fh, if (f->res.classid && nla_put_u32(skb, TCA_FW_CLASSID, f->res.classid)) goto nla_put_failure; -#ifdef CONFIG_NET_CLS_IND if (f->ifindex) { struct net_device *dev; dev = __dev_get_by_index(net, f->ifindex); if (dev && nla_put_string(skb, TCA_FW_INDEV, dev->name)) goto nla_put_failure; } -#endif /* CONFIG_NET_CLS_IND */ if (head->mask != 0xFFFFFFFF && nla_put_u32(skb, TCA_FW_MASK, head->mask)) goto nla_put_failure; diff --git a/net/sched/cls_u32.c b/net/sched/cls_u32.c index c7727de5e073..be9e46c77e8b 100644 --- a/net/sched/cls_u32.c +++ b/net/sched/cls_u32.c @@ -20,9 +20,6 @@ * pure RSVP doesn't need such a general approach and can use * much simpler (and faster) schemes, sort of cls_rsvp.c. 
* - * JHS: We should remove the CONFIG_NET_CLS_IND from here - * eventually when the meta match extension is made available - * * nfmark match added by Catalin(ux aka Dino) BOIE */ @@ -48,9 +45,7 @@ struct tc_u_knode { u32 handle; struct tc_u_hnode __rcu *ht_up; struct tcf_exts exts; -#ifdef CONFIG_NET_CLS_IND int ifindex; -#endif u8 fshift; struct tcf_result res; struct tc_u_hnode __rcu *ht_down; @@ -176,12 +171,10 @@ check_terminal: if (n->sel.flags & TC_U32_TERMINAL) { *res = n->res; -#ifdef CONFIG_NET_CLS_IND if (!tcf_match_indev(skb, n->ifindex)) { n = rcu_dereference_bh(n->next); goto next_knode; } -#endif #ifdef CONFIG_CLS_U32_PERF __this_cpu_inc(n->pf->rhit); #endif @@ -761,7 +754,6 @@ static int u32_set_parms(struct net *net, struct tcf_proto *tp, tcf_bind_filter(tp, &n->res, base); } -#ifdef CONFIG_NET_CLS_IND if (tb[TCA_U32_INDEV]) { int ret; ret = tcf_change_indev(net, tb[TCA_U32_INDEV], extack); @@ -769,7 +761,6 @@ static int u32_set_parms(struct net *net, struct tcf_proto *tp, return -EINVAL; n->ifindex = ret; } -#endif return 0; } @@ -817,9 +808,7 @@ static struct tc_u_knode *u32_init_knode(struct net *net, struct tcf_proto *tp, new->handle = n->handle; RCU_INIT_POINTER(new->ht_up, n->ht_up); -#ifdef CONFIG_NET_CLS_IND new->ifindex = n->ifindex; -#endif new->fshift = n->fshift; new->res = n->res; new->flags = n->flags; @@ -1351,14 +1340,12 @@ static int u32_dump(struct net *net, struct tcf_proto *tp, void *fh, if (tcf_exts_dump(skb, &n->exts) < 0) goto nla_put_failure; -#ifdef CONFIG_NET_CLS_IND if (n->ifindex) { struct net_device *dev; dev = __dev_get_by_index(net, n->ifindex); if (dev && nla_put_string(skb, TCA_U32_INDEV, dev->name)) goto nla_put_failure; } -#endif #ifdef CONFIG_CLS_U32_PERF gpf = kzalloc(sizeof(struct tc_u32_pcnt) + n->sel.nkeys * sizeof(u64), @@ -1422,9 +1409,7 @@ static int __init init_u32(void) #ifdef CONFIG_CLS_U32_PERF pr_info(" Performance counters on\n"); #endif -#ifdef CONFIG_NET_CLS_IND pr_info(" input device check 
on\n"); -#endif #ifdef CONFIG_NET_CLS_ACT pr_info(" Actions configured\n"); #endif diff --git a/tools/include/uapi/linux/pkt_cls.h b/tools/include/uapi/linux/pkt_cls.h index 401d0c1e612d..12153771396a 100644 --- a/tools/include/uapi/linux/pkt_cls.h +++ b/tools/include/uapi/linux/pkt_cls.h @@ -257,7 +257,7 @@ enum { TCA_FW_UNSPEC, TCA_FW_CLASSID, TCA_FW_POLICE, - TCA_FW_INDEV, /* used by CONFIG_NET_CLS_IND */ + TCA_FW_INDEV, TCA_FW_ACT, /* used by CONFIG_NET_CLS_ACT */ TCA_FW_MASK, __TCA_FW_MAX diff --git a/tools/testing/selftests/tc-testing/config b/tools/testing/selftests/tc-testing/config index b235efd55367..1adc4f9bb795 100644 --- a/tools/testing/selftests/tc-testing/config +++ b/tools/testing/selftests/tc-testing/config @@ -45,5 +45,4 @@ CONFIG_NET_ACT_TUNNEL_KEY=m CONFIG_NET_IFE_SKBMARK=m CONFIG_NET_IFE_SKBPRIO=m CONFIG_NET_IFE_SKBTCINDEX=m -CONFIG_NET_CLS_IND=y CONFIG_NET_SCH_FIFO=y -- cgit v1.2.3-71-gd317 From 857b46027d6f91150797295752581b7155b9d0e1 Mon Sep 17 00:00:00 2001 From: Stéphane Veyret Date: Sat, 25 May 2019 15:30:58 +0200 Subject: netfilter: nft_ct: add ct expectations support MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This patch allows to add, list and delete expectations via nft objref infrastructure and assigning these expectations via nft rule. This allows manual port triggering when no helper is defined to manage a specific protocol. 
For example, if I have an online game which protocol is based on initial connection to TCP port 9753 of the server, and where the server opens a connection to port 9876, I can set rules as follow: table ip filter { ct expectation mygame { protocol udp; dport 9876; timeout 2m; size 1; } chain input { type filter hook input priority 0; policy drop; tcp dport 9753 ct expectation set "mygame"; } chain output { type filter hook output priority 0; policy drop; udp dport 9876 ct status expected accept; } } Signed-off-by: Stéphane Veyret Signed-off-by: Pablo Neira Ayuso --- include/uapi/linux/netfilter/nf_tables.h | 14 +++- net/netfilter/nft_ct.c | 138 ++++++++++++++++++++++++++++++- 2 files changed, 149 insertions(+), 3 deletions(-) (limited to 'include/uapi/linux') diff --git a/include/uapi/linux/netfilter/nf_tables.h b/include/uapi/linux/netfilter/nf_tables.h index 505393c6e959..31a6b8f7ff73 100644 --- a/include/uapi/linux/netfilter/nf_tables.h +++ b/include/uapi/linux/netfilter/nf_tables.h @@ -1445,6 +1445,17 @@ enum nft_ct_timeout_timeout_attributes { }; #define NFTA_CT_TIMEOUT_MAX (__NFTA_CT_TIMEOUT_MAX - 1) +enum nft_ct_expectation_attributes { + NFTA_CT_EXPECT_UNSPEC, + NFTA_CT_EXPECT_L3PROTO, + NFTA_CT_EXPECT_L4PROTO, + NFTA_CT_EXPECT_DPORT, + NFTA_CT_EXPECT_TIMEOUT, + NFTA_CT_EXPECT_SIZE, + __NFTA_CT_EXPECT_MAX, +}; +#define NFTA_CT_EXPECT_MAX (__NFTA_CT_EXPECT_MAX - 1) + #define NFT_OBJECT_UNSPEC 0 #define NFT_OBJECT_COUNTER 1 #define NFT_OBJECT_QUOTA 2 @@ -1454,7 +1465,8 @@ enum nft_ct_timeout_timeout_attributes { #define NFT_OBJECT_TUNNEL 6 #define NFT_OBJECT_CT_TIMEOUT 7 #define NFT_OBJECT_SECMARK 8 -#define __NFT_OBJECT_MAX 9 +#define NFT_OBJECT_CT_EXPECT 9 +#define __NFT_OBJECT_MAX 10 #define NFT_OBJECT_MAX (__NFT_OBJECT_MAX - 1) /** diff --git a/net/netfilter/nft_ct.c b/net/netfilter/nft_ct.c index f043936763f3..06b52c894573 100644 --- a/net/netfilter/nft_ct.c +++ b/net/netfilter/nft_ct.c @@ -24,6 +24,7 @@ #include #include #include +#include struct nft_ct 
{ enum nft_ct_keys key:8; @@ -1156,6 +1157,131 @@ static struct nft_object_type nft_ct_helper_obj_type __read_mostly = { .owner = THIS_MODULE, }; +struct nft_ct_expect_obj { + u16 l3num; + __be16 dport; + u8 l4proto; + u8 size; + u32 timeout; +}; + +static int nft_ct_expect_obj_init(const struct nft_ctx *ctx, + const struct nlattr * const tb[], + struct nft_object *obj) +{ + struct nft_ct_expect_obj *priv = nft_obj_data(obj); + + if (!tb[NFTA_CT_EXPECT_L4PROTO] || + !tb[NFTA_CT_EXPECT_DPORT] || + !tb[NFTA_CT_EXPECT_TIMEOUT] || + !tb[NFTA_CT_EXPECT_SIZE]) + return -EINVAL; + + priv->l3num = ctx->family; + if (tb[NFTA_CT_EXPECT_L3PROTO]) + priv->l3num = ntohs(nla_get_be16(tb[NFTA_CT_EXPECT_L3PROTO])); + + priv->l4proto = nla_get_u8(tb[NFTA_CT_EXPECT_L4PROTO]); + priv->dport = nla_get_be16(tb[NFTA_CT_EXPECT_DPORT]); + priv->timeout = nla_get_u32(tb[NFTA_CT_EXPECT_TIMEOUT]); + priv->size = nla_get_u8(tb[NFTA_CT_EXPECT_SIZE]); + + return nf_ct_netns_get(ctx->net, ctx->family); +} + +static void nft_ct_expect_obj_destroy(const struct nft_ctx *ctx, + struct nft_object *obj) +{ + nf_ct_netns_put(ctx->net, ctx->family); +} + +static int nft_ct_expect_obj_dump(struct sk_buff *skb, + struct nft_object *obj, bool reset) +{ + const struct nft_ct_expect_obj *priv = nft_obj_data(obj); + + if (nla_put_be16(skb, NFTA_CT_EXPECT_L3PROTO, htons(priv->l3num)) || + nla_put_u8(skb, NFTA_CT_EXPECT_L4PROTO, priv->l4proto) || + nla_put_be16(skb, NFTA_CT_EXPECT_DPORT, priv->dport) || + nla_put_u32(skb, NFTA_CT_EXPECT_TIMEOUT, priv->timeout) || + nla_put_u8(skb, NFTA_CT_EXPECT_SIZE, priv->size)) + return -1; + + return 0; +} + +static void nft_ct_expect_obj_eval(struct nft_object *obj, + struct nft_regs *regs, + const struct nft_pktinfo *pkt) +{ + const struct nft_ct_expect_obj *priv = nft_obj_data(obj); + struct nf_conntrack_expect *exp; + enum ip_conntrack_info ctinfo; + struct nf_conn_help *help; + enum ip_conntrack_dir dir; + u16 l3num = priv->l3num; + struct nf_conn *ct; + + ct = 
nf_ct_get(pkt->skb, &ctinfo); + if (!ct || ctinfo == IP_CT_UNTRACKED) { + regs->verdict.code = NFT_BREAK; + return; + } + dir = CTINFO2DIR(ctinfo); + + help = nfct_help(ct); + if (!help) + help = nf_ct_helper_ext_add(ct, GFP_ATOMIC); + + if (help->expecting[NF_CT_EXPECT_CLASS_DEFAULT] >= priv->size) { + regs->verdict.code = NFT_BREAK; + return; + } + if (l3num == NFPROTO_INET) + l3num = nf_ct_l3num(ct); + + exp = nf_ct_expect_alloc(ct); + if (exp == NULL) { + regs->verdict.code = NF_DROP; + return; + } + nf_ct_expect_init(exp, NF_CT_EXPECT_CLASS_DEFAULT, l3num, + &ct->tuplehash[!dir].tuple.src.u3, + &ct->tuplehash[!dir].tuple.dst.u3, + priv->l4proto, NULL, &priv->dport); + exp->timeout.expires = jiffies + priv->timeout * HZ; + + if (nf_ct_expect_related(exp) != 0) + regs->verdict.code = NF_DROP; +} + +static const struct nla_policy nft_ct_expect_policy[NFTA_CT_EXPECT_MAX + 1] = { + [NFTA_CT_EXPECT_L3PROTO] = { .type = NLA_U16 }, + [NFTA_CT_EXPECT_L4PROTO] = { .type = NLA_U8 }, + [NFTA_CT_EXPECT_DPORT] = { .type = NLA_U16 }, + [NFTA_CT_EXPECT_TIMEOUT] = { .type = NLA_U32 }, + [NFTA_CT_EXPECT_SIZE] = { .type = NLA_U8 }, +}; + +static struct nft_object_type nft_ct_expect_obj_type; + +static const struct nft_object_ops nft_ct_expect_obj_ops = { + .type = &nft_ct_expect_obj_type, + .size = sizeof(struct nft_ct_expect_obj), + .eval = nft_ct_expect_obj_eval, + .init = nft_ct_expect_obj_init, + .destroy = nft_ct_expect_obj_destroy, + .dump = nft_ct_expect_obj_dump, +}; + +static struct nft_object_type nft_ct_expect_obj_type __read_mostly = { + .type = NFT_OBJECT_CT_EXPECT, + .ops = &nft_ct_expect_obj_ops, + .maxattr = NFTA_CT_EXPECT_MAX, + .policy = nft_ct_expect_policy, + .owner = THIS_MODULE, +}; + static int __init nft_ct_module_init(void) { int err; @@ -1173,17 +1299,23 @@ static int __init nft_ct_module_init(void) err = nft_register_obj(&nft_ct_helper_obj_type); if (err < 0) goto err2; + + err = nft_register_obj(&nft_ct_expect_obj_type); + if (err < 0) + goto err3; 
#ifdef CONFIG_NF_CONNTRACK_TIMEOUT err = nft_register_obj(&nft_ct_timeout_obj_type); if (err < 0) - goto err3; + goto err4; #endif return 0; #ifdef CONFIG_NF_CONNTRACK_TIMEOUT +err4: + nft_unregister_obj(&nft_ct_expect_obj_type); +#endif err3: nft_unregister_obj(&nft_ct_helper_obj_type); -#endif err2: nft_unregister_expr(&nft_notrack_type); err1: @@ -1196,6 +1328,7 @@ static void __exit nft_ct_module_exit(void) #ifdef CONFIG_NF_CONNTRACK_TIMEOUT nft_unregister_obj(&nft_ct_timeout_obj_type); #endif + nft_unregister_obj(&nft_ct_expect_obj_type); nft_unregister_obj(&nft_ct_helper_obj_type); nft_unregister_expr(&nft_notrack_type); nft_unregister_expr(&nft_ct_type); @@ -1210,3 +1343,4 @@ MODULE_ALIAS_NFT_EXPR("ct"); MODULE_ALIAS_NFT_EXPR("notrack"); MODULE_ALIAS_NFT_OBJ(NFT_OBJECT_CT_HELPER); MODULE_ALIAS_NFT_OBJ(NFT_OBJECT_CT_TIMEOUT); +MODULE_ALIAS_NFT_OBJ(NFT_OBJECT_CT_EXPECT); -- cgit v1.2.3-71-gd317 From 9911c1139fd072594ac259c2ce055b004ca92f49 Mon Sep 17 00:00:00 2001 From: Pablo Neira Ayuso Date: Fri, 7 Jun 2019 16:37:30 +0200 Subject: netfilter: xt_owner: bail out with EINVAL in case of unsupported flags Reject flags that are not supported with EINVAL. 
Signed-off-by: Pablo Neira Ayuso --- include/uapi/linux/netfilter/xt_owner.h | 5 +++++ net/netfilter/xt_owner.c | 3 +++ 2 files changed, 8 insertions(+) (limited to 'include/uapi/linux') diff --git a/include/uapi/linux/netfilter/xt_owner.h b/include/uapi/linux/netfilter/xt_owner.h index 9e98c09eda32..5108df4d0313 100644 --- a/include/uapi/linux/netfilter/xt_owner.h +++ b/include/uapi/linux/netfilter/xt_owner.h @@ -11,6 +11,11 @@ enum { XT_OWNER_SUPPL_GROUPS = 1 << 3, }; +#define XT_OWNER_MASK (XT_OWNER_UID | \ + XT_OWNER_GID | \ + XT_OWNER_SOCKET | \ + XT_OWNER_SUPPL_GROUPS) + struct xt_owner_match_info { __u32 uid_min, uid_max; __u32 gid_min, gid_max; diff --git a/net/netfilter/xt_owner.c b/net/netfilter/xt_owner.c index a8784502aca6..ee597fdc5db7 100644 --- a/net/netfilter/xt_owner.c +++ b/net/netfilter/xt_owner.c @@ -25,6 +25,9 @@ static int owner_check(const struct xt_mtchk_param *par) struct xt_owner_match_info *info = par->matchinfo; struct net *net = par->net; + if (info->match & ~XT_OWNER_MASK) + return -EINVAL; + /* Only allow the common case where the userns of the writer * matches the userns of the network namespace. */ -- cgit v1.2.3-71-gd317 From 5fcc88ecf681b64da6c2c918352e2451db6a97ec Mon Sep 17 00:00:00 2001 From: Fernando Fernandez Mancera Date: Fri, 7 Jun 2019 02:36:02 +0200 Subject: netfilter: synproxy: add common uapi for SYNPROXY infrastructure This new UAPI file is going to be used by the xt and nft common SYNPROXY infrastructure. It is needed to avoid duplicated code. 
Signed-off-by: Fernando Fernandez Mancera Signed-off-by: Pablo Neira Ayuso --- include/uapi/linux/netfilter/nf_SYNPROXY.h | 19 +++++++++++++++++++ include/uapi/linux/netfilter/xt_SYNPROXY.h | 18 +++++++----------- 2 files changed, 26 insertions(+), 11 deletions(-) create mode 100644 include/uapi/linux/netfilter/nf_SYNPROXY.h (limited to 'include/uapi/linux') diff --git a/include/uapi/linux/netfilter/nf_SYNPROXY.h b/include/uapi/linux/netfilter/nf_SYNPROXY.h new file mode 100644 index 000000000000..068d1b3a6f06 --- /dev/null +++ b/include/uapi/linux/netfilter/nf_SYNPROXY.h @@ -0,0 +1,19 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _NF_SYNPROXY_H +#define _NF_SYNPROXY_H + +#include + +#define NF_SYNPROXY_OPT_MSS 0x01 +#define NF_SYNPROXY_OPT_WSCALE 0x02 +#define NF_SYNPROXY_OPT_SACK_PERM 0x04 +#define NF_SYNPROXY_OPT_TIMESTAMP 0x08 +#define NF_SYNPROXY_OPT_ECN 0x10 + +struct nf_synproxy_info { + __u8 options; + __u8 wscale; + __u16 mss; +}; + +#endif /* _NF_SYNPROXY_H */ diff --git a/include/uapi/linux/netfilter/xt_SYNPROXY.h b/include/uapi/linux/netfilter/xt_SYNPROXY.h index ea5eba15d4c1..4d5611d647df 100644 --- a/include/uapi/linux/netfilter/xt_SYNPROXY.h +++ b/include/uapi/linux/netfilter/xt_SYNPROXY.h @@ -2,18 +2,14 @@ #ifndef _XT_SYNPROXY_H #define _XT_SYNPROXY_H -#include +#include -#define XT_SYNPROXY_OPT_MSS 0x01 -#define XT_SYNPROXY_OPT_WSCALE 0x02 -#define XT_SYNPROXY_OPT_SACK_PERM 0x04 -#define XT_SYNPROXY_OPT_TIMESTAMP 0x08 -#define XT_SYNPROXY_OPT_ECN 0x10 +#define XT_SYNPROXY_OPT_MSS NF_SYNPROXY_OPT_MSS +#define XT_SYNPROXY_OPT_WSCALE NF_SYNPROXY_OPT_WSCALE +#define XT_SYNPROXY_OPT_SACK_PERM NF_SYNPROXY_OPT_SACK_PERM +#define XT_SYNPROXY_OPT_TIMESTAMP NF_SYNPROXY_OPT_TIMESTAMP +#define XT_SYNPROXY_OPT_ECN NF_SYNPROXY_OPT_ECN -struct xt_synproxy_info { - __u8 options; - __u8 wscale; - __u16 mss; -}; +#define xt_synproxy_info nf_synproxy_info #endif /* _XT_SYNPROXY_H */ -- cgit v1.2.3-71-gd317 From 6d101f24f1dd41ef6eff3d7f175417ce27a3055a Mon Sep 
17 00:00:00 2001 From: Dmitry Torokhov Date: Mon, 10 Jun 2019 15:36:58 -0700 Subject: USB: add usbfs ioctl to retrieve the connection parameters Recently usbfs gained the ability to retrieve device speed, but there is still no way to determine the bus number or list of ports the device is connected to when using usbfs. While this information can be obtained from sysfs, not all environments allow sysfs access. In a jailed environment a program might be simply given an opened file descriptor to a usbfs device, and it is really important that all data can be gathered from said file descriptor. This patch introduces a new ioctl, USBDEVFS_CONNINFO_EX, which returns extended connection information for the device, including the bus number, address, port list and speed. The API allows the kernel to extend the amount of data returned by the ioctl, and userspace has the option of adjusting the amount of data it is willing to consume. A new capability, USBDEVFS_CAP_CONNINFO_EX, is introduced to help userspace in determining whether the kernel supports this new ioctl.
Signed-off-by: Dmitry Torokhov Acked-by: Alan Stern Signed-off-by: Greg Kroah-Hartman --- drivers/usb/core/devio.c | 42 ++++++++++++++++++++++++++++++++++++++- include/uapi/linux/usbdevice_fs.h | 26 ++++++++++++++++++++++++ 2 files changed, 67 insertions(+), 1 deletion(-) (limited to 'include/uapi/linux') diff --git a/drivers/usb/core/devio.c b/drivers/usb/core/devio.c index aa17dab6c4ea..186790b06b11 100644 --- a/drivers/usb/core/devio.c +++ b/drivers/usb/core/devio.c @@ -1308,6 +1308,39 @@ static int proc_connectinfo(struct usb_dev_state *ps, void __user *arg) return 0; } +static int proc_conninfo_ex(struct usb_dev_state *ps, + void __user *arg, size_t size) +{ + struct usbdevfs_conninfo_ex ci; + struct usb_device *udev = ps->dev; + + if (size < sizeof(ci.size)) + return -EINVAL; + + memset(&ci, 0, sizeof(ci)); + ci.size = sizeof(ci); + ci.busnum = udev->bus->busnum; + ci.devnum = udev->devnum; + ci.speed = udev->speed; + + while (udev && udev->portnum != 0) { + if (++ci.num_ports <= ARRAY_SIZE(ci.ports)) + ci.ports[ARRAY_SIZE(ci.ports) - ci.num_ports] = + udev->portnum; + udev = udev->parent; + } + + if (ci.num_ports < ARRAY_SIZE(ci.ports)) + memmove(&ci.ports[0], + &ci.ports[ARRAY_SIZE(ci.ports) - ci.num_ports], + ci.num_ports); + + if (copy_to_user(arg, &ci, min(sizeof(ci), size))) + return -EFAULT; + + return 0; +} + static int proc_resetdevice(struct usb_dev_state *ps) { struct usb_host_config *actconfig = ps->dev->actconfig; @@ -2250,7 +2283,7 @@ static int proc_get_capabilities(struct usb_dev_state *ps, void __user *arg) caps = USBDEVFS_CAP_ZERO_PACKET | USBDEVFS_CAP_NO_PACKET_SIZE_LIM | USBDEVFS_CAP_REAP_AFTER_DISCONNECT | USBDEVFS_CAP_MMAP | - USBDEVFS_CAP_DROP_PRIVILEGES; + USBDEVFS_CAP_DROP_PRIVILEGES | USBDEVFS_CAP_CONNINFO_EX; if (!ps->dev->bus->no_stop_on_short) caps |= USBDEVFS_CAP_BULK_CONTINUATION; if (ps->dev->bus->sg_tablesize) @@ -2549,6 +2582,13 @@ static long usbdev_do_ioctl(struct file *file, unsigned int cmd, break; } + /* Handle 
variable-length commands */ + switch (cmd & ~IOCSIZE_MASK) { + case USBDEVFS_CONNINFO_EX(0): + ret = proc_conninfo_ex(ps, p, _IOC_SIZE(cmd)); + break; + } + done: usb_unlock_device(dev); if (ret >= 0) diff --git a/include/uapi/linux/usbdevice_fs.h b/include/uapi/linux/usbdevice_fs.h index 964e87217be4..4b267fe3776e 100644 --- a/include/uapi/linux/usbdevice_fs.h +++ b/include/uapi/linux/usbdevice_fs.h @@ -76,6 +76,26 @@ struct usbdevfs_connectinfo { unsigned char slow; }; +struct usbdevfs_conninfo_ex { + __u32 size; /* Size of the structure from the kernel's */ + /* point of view. Can be used by userspace */ + /* to determine how much data can be */ + /* used/trusted. */ + __u32 busnum; /* USB bus number, as enumerated by the */ + /* kernel, the device is connected to. */ + __u32 devnum; /* Device address on the bus. */ + __u32 speed; /* USB_SPEED_* constants from ch9.h */ + u8 num_ports; /* Number of ports the device is connected */ + /* to on the way to the root hub. It may */ + /* be bigger than size of 'ports' array so */ + /* userspace can detect overflows. */ + u8 ports[7]; /* List of ports on the way from the root */ + /* hub to the device. Current limit in */ + /* USB specification is 7 tiers (root hub, */ + /* 5 intermediate hubs, device), which */ + /* gives at most 6 port entries. 
*/ +}; + #define USBDEVFS_URB_SHORT_NOT_OK 0x01 #define USBDEVFS_URB_ISO_ASAP 0x02 #define USBDEVFS_URB_BULK_CONTINUATION 0x04 @@ -137,6 +157,7 @@ struct usbdevfs_hub_portinfo { #define USBDEVFS_CAP_REAP_AFTER_DISCONNECT 0x10 #define USBDEVFS_CAP_MMAP 0x20 #define USBDEVFS_CAP_DROP_PRIVILEGES 0x40 +#define USBDEVFS_CAP_CONNINFO_EX 0x80 /* USBDEVFS_DISCONNECT_CLAIM flags & struct */ @@ -197,5 +218,10 @@ struct usbdevfs_streams { #define USBDEVFS_FREE_STREAMS _IOR('U', 29, struct usbdevfs_streams) #define USBDEVFS_DROP_PRIVILEGES _IOW('U', 30, __u32) #define USBDEVFS_GET_SPEED _IO('U', 31) +/* + * Returns struct usbdevfs_conninfo_ex; length is variable to allow + * extending size of the data returned. + */ +#define USBDEVFS_CONNINFO_EX(len) _IOC(_IOC_READ, 'U', 32, len) #endif /* _UAPI_LINUX_USBDEVICE_FS_H */ -- cgit v1.2.3-71-gd317 From 58b55c859ac00c6845b6aed8852c541bc204c935 Mon Sep 17 00:00:00 2001 From: Jean-Philippe Brucker Date: Wed, 12 Jun 2019 18:59:38 +0100 Subject: iommu: Add padding to struct iommu_fault Ease future extensions of struct iommu_fault_page_request and struct iommu_fault_unrecoverable by adding a few bytes of padding. That way, a new field can be added to either of these structures by simply introducing a new flag. To extend it after the size limit is reached, a new fault reporting structure will have to be negotiated with userspace. With 56 bytes of padding, the total size of iommu_fault is 64 bytes and fits in a cache line on a lot of contemporary machines, while providing 16 and 24 bytes of extension to structures iommu_fault_page_request and iommu_fault_unrecoverable respectively. 
Signed-off-by: Jean-Philippe Brucker Acked-by: Jacob Pan Reviewed-by: Eric Auger Signed-off-by: Joerg Roedel --- include/uapi/linux/iommu.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include/uapi/linux') diff --git a/include/uapi/linux/iommu.h b/include/uapi/linux/iommu.h index f45d8e9e59c3..fc00c5d4741b 100644 --- a/include/uapi/linux/iommu.h +++ b/include/uapi/linux/iommu.h @@ -106,6 +106,7 @@ struct iommu_fault_page_request { * @padding: reserved for future use (should be zero) * @event: fault event, when @type is %IOMMU_FAULT_DMA_UNRECOV * @prm: Page Request message, when @type is %IOMMU_FAULT_PAGE_REQ + * @padding2: sets the fault size to allow for future extensions */ struct iommu_fault { __u32 type; @@ -113,6 +114,7 @@ struct iommu_fault { union { struct iommu_fault_unrecoverable event; struct iommu_fault_page_request prm; + __u8 padding2[56]; }; }; -- cgit v1.2.3-71-gd317 From 75345f888f700c4ab2448287e35d48c760b202e6 Mon Sep 17 00:00:00 2001 From: Denis Kirjanov Date: Mon, 17 Jun 2019 10:53:41 +0200 Subject: ipoib: show VF broadcast address in IPoIB case we can't see a VF broadcast address for but can see for PF Before: 11: ib1: mtu 2044 qdisc pfifo_fast state UP mode DEFAULT group default qlen 256 link/infiniband 80:00:00:66:fe:80:00:00:00:00:00:00:24:8a:07:03:00:a4:3e:7c brd 00:ff:ff:ff:ff:12:40:1b:ff:ff:00:00:00:00:00:00:ff:ff:ff:ff vf 0 MAC 14:80:00:00:66:fe, spoof checking off, link-state disable, trust off, query_rss off ... 
After: 11: ib1: mtu 2044 qdisc pfifo_fast state UP mode DEFAULT group default qlen 256 link/infiniband 80:00:00:66:fe:80:00:00:00:00:00:00:24:8a:07:03:00:a4:3e:7c brd 00:ff:ff:ff:ff:12:40:1b:ff:ff:00:00:00:00:00:00:ff:ff:ff:ff vf 0 link/infiniband 80:00:00:66:fe:80:00:00:00:00:00:00:24:8a:07:03:00:a4:3e:7c brd 00:ff:ff:ff:ff:12:40:1b:ff:ff:00:00:00:00:00:00:ff:ff:ff:ff, spoof checking off, link-state disable, trust off, query_rss off v1->v2: add the IFLA_VF_BROADCAST constant v2->v3: put IFLA_VF_BROADCAST at the end to avoid KABI breakage and set NLA_REJECT dev_setlink Signed-off-by: Denis Kirjanov Acked-by: Doug Ledford Signed-off-by: David S. Miller --- include/uapi/linux/if_link.h | 5 +++++ net/core/rtnetlink.c | 5 +++++ 2 files changed, 10 insertions(+) (limited to 'include/uapi/linux') diff --git a/include/uapi/linux/if_link.h b/include/uapi/linux/if_link.h index 5b225ff63b48..6f75bda2c2d7 100644 --- a/include/uapi/linux/if_link.h +++ b/include/uapi/linux/if_link.h @@ -694,6 +694,7 @@ enum { IFLA_VF_IB_NODE_GUID, /* VF Infiniband node GUID */ IFLA_VF_IB_PORT_GUID, /* VF Infiniband port GUID */ IFLA_VF_VLAN_LIST, /* nested list of vlans, option for QinQ */ + IFLA_VF_BROADCAST, /* VF broadcast */ __IFLA_VF_MAX, }; @@ -704,6 +705,10 @@ struct ifla_vf_mac { __u8 mac[32]; /* MAX_ADDR_LEN */ }; +struct ifla_vf_broadcast { + __u8 broadcast[32]; +}; + struct ifla_vf_vlan { __u32 vf; __u32 vlan; /* 0 - 4095, 0 disables VLAN filter */ diff --git a/net/core/rtnetlink.c b/net/core/rtnetlink.c index cec60583931f..8ac81630ab5c 100644 --- a/net/core/rtnetlink.c +++ b/net/core/rtnetlink.c @@ -908,6 +908,7 @@ static inline int rtnl_vfinfo_size(const struct net_device *dev, size += num_vfs * (nla_total_size(0) + nla_total_size(sizeof(struct ifla_vf_mac)) + + nla_total_size(sizeof(struct ifla_vf_broadcast)) + nla_total_size(sizeof(struct ifla_vf_vlan)) + nla_total_size(0) + /* nest IFLA_VF_VLAN_LIST */ nla_total_size(MAX_VLAN_LIST_LEN * @@ -1197,6 +1198,7 @@ static 
noinline_for_stack int rtnl_fill_vfinfo(struct sk_buff *skb, struct ifla_vf_vlan vf_vlan; struct ifla_vf_rate vf_rate; struct ifla_vf_mac vf_mac; + struct ifla_vf_broadcast vf_broadcast; struct ifla_vf_info ivi; memset(&ivi, 0, sizeof(ivi)); @@ -1231,6 +1233,7 @@ static noinline_for_stack int rtnl_fill_vfinfo(struct sk_buff *skb, vf_trust.vf = ivi.vf; memcpy(vf_mac.mac, ivi.mac, sizeof(ivi.mac)); + memcpy(vf_broadcast.broadcast, dev->broadcast, dev->addr_len); vf_vlan.vlan = ivi.vlan; vf_vlan.qos = ivi.qos; vf_vlan_info.vlan = ivi.vlan; @@ -1247,6 +1250,7 @@ static noinline_for_stack int rtnl_fill_vfinfo(struct sk_buff *skb, if (!vf) goto nla_put_vfinfo_failure; if (nla_put(skb, IFLA_VF_MAC, sizeof(vf_mac), &vf_mac) || + nla_put(skb, IFLA_VF_BROADCAST, sizeof(vf_broadcast), &vf_broadcast) || nla_put(skb, IFLA_VF_VLAN, sizeof(vf_vlan), &vf_vlan) || nla_put(skb, IFLA_VF_RATE, sizeof(vf_rate), &vf_rate) || @@ -1753,6 +1757,7 @@ static const struct nla_policy ifla_info_policy[IFLA_INFO_MAX+1] = { static const struct nla_policy ifla_vf_policy[IFLA_VF_MAX+1] = { [IFLA_VF_MAC] = { .len = sizeof(struct ifla_vf_mac) }, + [IFLA_VF_BROADCAST] = { .type = NLA_REJECT }, [IFLA_VF_VLAN] = { .len = sizeof(struct ifla_vf_vlan) }, [IFLA_VF_VLAN_LIST] = { .type = NLA_NESTED }, [IFLA_VF_TX_RATE] = { .len = sizeof(struct ifla_vf_tx_rate) }, -- cgit v1.2.3-71-gd317 From 45e0f30c30bb131663fbe1752974d6f2e39611e2 Mon Sep 17 00:00:00 2001 From: David Howells Date: Thu, 30 May 2019 14:53:10 +0100 Subject: keys: Add capability-checking keyctl function Add a keyctl function that requests a set of capability bits to find out what features are supported. 
Signed-off-by: David Howells --- include/uapi/linux/keyctl.h | 14 ++++++++++++++ security/keys/compat.c | 3 +++ security/keys/internal.h | 2 ++ security/keys/keyctl.c | 35 +++++++++++++++++++++++++++++++++++ 4 files changed, 54 insertions(+) (limited to 'include/uapi/linux') diff --git a/include/uapi/linux/keyctl.h b/include/uapi/linux/keyctl.h index fd9fb11b312b..551b5814f53e 100644 --- a/include/uapi/linux/keyctl.h +++ b/include/uapi/linux/keyctl.h @@ -68,6 +68,7 @@ #define KEYCTL_PKEY_VERIFY 28 /* Verify a public key signature */ #define KEYCTL_RESTRICT_KEYRING 29 /* Restrict keys allowed to link to a keyring */ #define KEYCTL_MOVE 30 /* Move keys between keyrings */ +#define KEYCTL_CAPABILITIES 31 /* Find capabilities of keyrings subsystem */ /* keyctl structures */ struct keyctl_dh_params { @@ -115,4 +116,17 @@ struct keyctl_pkey_params { #define KEYCTL_MOVE_EXCL 0x00000001 /* Do not displace from the to-keyring */ +/* + * Capabilities flags. The capabilities list is an array of 8-bit integers; + * each integer can carry up to 8 flags. 
+ */ +#define KEYCTL_CAPS0_CAPABILITIES 0x01 /* KEYCTL_CAPABILITIES supported */ +#define KEYCTL_CAPS0_PERSISTENT_KEYRINGS 0x02 /* Persistent keyrings enabled */ +#define KEYCTL_CAPS0_DIFFIE_HELLMAN 0x04 /* Diffie-Hellman computation enabled */ +#define KEYCTL_CAPS0_PUBLIC_KEY 0x08 /* Public key ops enabled */ +#define KEYCTL_CAPS0_BIG_KEY 0x10 /* big_key-type enabled */ +#define KEYCTL_CAPS0_INVALIDATE 0x20 /* KEYCTL_INVALIDATE supported */ +#define KEYCTL_CAPS0_RESTRICT_KEYRING 0x40 /* KEYCTL_RESTRICT_KEYRING supported */ +#define KEYCTL_CAPS0_MOVE 0x80 /* KEYCTL_MOVE supported */ + #endif /* _LINUX_KEYCTL_H */ diff --git a/security/keys/compat.c b/security/keys/compat.c index b326bc4f84d7..a53e30da20c5 100644 --- a/security/keys/compat.c +++ b/security/keys/compat.c @@ -162,6 +162,9 @@ COMPAT_SYSCALL_DEFINE5(keyctl, u32, option, case KEYCTL_MOVE: return keyctl_keyring_move(arg2, arg3, arg4, arg5); + case KEYCTL_CAPABILITIES: + return keyctl_capabilities(compat_ptr(arg2), arg3); + default: return -EOPNOTSUPP; } diff --git a/security/keys/internal.h b/security/keys/internal.h index b54a58c025ae..d04bff631227 100644 --- a/security/keys/internal.h +++ b/security/keys/internal.h @@ -329,6 +329,8 @@ static inline long keyctl_pkey_e_d_s(int op, } #endif +extern long keyctl_capabilities(unsigned char __user *_buffer, size_t buflen); + /* * Debugging key validation */ diff --git a/security/keys/keyctl.c b/security/keys/keyctl.c index bbfe7d92d41c..9f418e66f067 100644 --- a/security/keys/keyctl.c +++ b/security/keys/keyctl.c @@ -30,6 +30,18 @@ #define KEY_MAX_DESC_SIZE 4096 +static const unsigned char keyrings_capabilities[1] = { + [0] = (KEYCTL_CAPS0_CAPABILITIES | + (IS_ENABLED(CONFIG_PERSISTENT_KEYRINGS) ? KEYCTL_CAPS0_PERSISTENT_KEYRINGS : 0) | + (IS_ENABLED(CONFIG_KEY_DH_OPERATIONS) ? KEYCTL_CAPS0_DIFFIE_HELLMAN : 0) | + (IS_ENABLED(CONFIG_ASYMMETRIC_KEY_TYPE) ? KEYCTL_CAPS0_PUBLIC_KEY : 0) | + (IS_ENABLED(CONFIG_BIG_KEYS) ? 
KEYCTL_CAPS0_BIG_KEY : 0) | + KEYCTL_CAPS0_INVALIDATE | + KEYCTL_CAPS0_RESTRICT_KEYRING | + KEYCTL_CAPS0_MOVE + ), +}; + static int key_get_type_from_user(char *type, const char __user *_type, unsigned len) @@ -1678,6 +1690,26 @@ error: return ret; } +/* + * Get keyrings subsystem capabilities. + */ +long keyctl_capabilities(unsigned char __user *_buffer, size_t buflen) +{ + size_t size = buflen; + + if (size > 0) { + if (size > sizeof(keyrings_capabilities)) + size = sizeof(keyrings_capabilities); + if (copy_to_user(_buffer, keyrings_capabilities, size) != 0) + return -EFAULT; + if (size < buflen && + clear_user(_buffer + size, buflen - size) != 0) + return -EFAULT; + } + + return sizeof(keyrings_capabilities); +} + /* * The key control system call */ @@ -1824,6 +1856,9 @@ SYSCALL_DEFINE5(keyctl, int, option, unsigned long, arg2, unsigned long, arg3, (key_serial_t)arg4, (unsigned int)arg5); + case KEYCTL_CAPABILITIES: + return keyctl_capabilities((unsigned char __user *)arg2, (size_t)arg3); + default: return -EOPNOTSUPP; } -- cgit v1.2.3-71-gd317 From 23cdf8752b26d4edbd60a6293bca492d83192d4d Mon Sep 17 00:00:00 2001 From: "David S. Miller" Date: Wed, 19 Jun 2019 10:12:58 -0400 Subject: act_ctinfo: Don't use BIT() in UAPI headers. Use _BITUL() instead. Reported-by: Stephen Rothwell Signed-off-by: David S. 
Miller --- include/uapi/linux/tc_act/tc_ctinfo.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'include/uapi/linux') diff --git a/include/uapi/linux/tc_act/tc_ctinfo.h b/include/uapi/linux/tc_act/tc_ctinfo.h index da803e05a89b..32337304fbe5 100644 --- a/include/uapi/linux/tc_act/tc_ctinfo.h +++ b/include/uapi/linux/tc_act/tc_ctinfo.h @@ -27,8 +27,8 @@ enum { #define TCA_CTINFO_MAX (__TCA_CTINFO_MAX - 1) enum { - CTINFO_MODE_DSCP = BIT(0), - CTINFO_MODE_CPMARK = BIT(1) + CTINFO_MODE_DSCP = _BITUL(0), + CTINFO_MODE_CPMARK = _BITUL(1) }; #endif -- cgit v1.2.3-71-gd317 From b119deca1e016e37614117f56f74461eac559af5 Mon Sep 17 00:00:00 2001 From: Stephen Rothwell Date: Wed, 19 Jun 2019 16:36:16 +1000 Subject: USB: fix types in uapi include Signed-off-by: Stephen Rothwell Signed-off-by: Greg Kroah-Hartman --- include/uapi/linux/usbdevice_fs.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'include/uapi/linux') diff --git a/include/uapi/linux/usbdevice_fs.h b/include/uapi/linux/usbdevice_fs.h index 4b267fe3776e..78efe870c2b7 100644 --- a/include/uapi/linux/usbdevice_fs.h +++ b/include/uapi/linux/usbdevice_fs.h @@ -85,11 +85,11 @@ struct usbdevfs_conninfo_ex { /* kernel, the device is connected to. */ __u32 devnum; /* Device address on the bus. */ __u32 speed; /* USB_SPEED_* constants from ch9.h */ - u8 num_ports; /* Number of ports the device is connected */ + __u8 num_ports; /* Number of ports the device is connected */ /* to on the way to the root hub. It may */ /* be bigger than size of 'ports' array so */ /* userspace can detect overflows. */ - u8 ports[7]; /* List of ports on the way from the root */ + __u8 ports[7]; /* List of ports on the way from the root */ /* hub to the device. 
Current limit in */ /* USB specification is 7 tiers (root hub, */ /* 5 intermediate hubs, device), which */ -- cgit v1.2.3-71-gd317 From 16e5a266f51639492ac30761d043525d7d43f4c8 Mon Sep 17 00:00:00 2001 From: Kevin Darbyshire-Bryant Date: Wed, 19 Jun 2019 18:41:10 +0100 Subject: net: sched: act_ctinfo: tidy UAPI definition Remove some enums from the UAPI definition that were only used internally and are NOT part of the UAPI. Signed-off-by: Kevin Darbyshire-Bryant Signed-off-by: David S. Miller --- include/net/tc_act/tc_ctinfo.h | 5 +++++ include/uapi/linux/tc_act/tc_ctinfo.h | 5 ----- 2 files changed, 5 insertions(+), 5 deletions(-) (limited to 'include/uapi/linux') diff --git a/include/net/tc_act/tc_ctinfo.h b/include/net/tc_act/tc_ctinfo.h index d6a688571672..f071c1d70a25 100644 --- a/include/net/tc_act/tc_ctinfo.h +++ b/include/net/tc_act/tc_ctinfo.h @@ -23,6 +23,11 @@ struct tcf_ctinfo { u64 stats_cpmark_set; }; +enum { + CTINFO_MODE_DSCP = BIT(0), + CTINFO_MODE_CPMARK = BIT(1) +}; + #define to_ctinfo(a) ((struct tcf_ctinfo *)a) #endif /* __NET_TC_CTINFO_H */ diff --git a/include/uapi/linux/tc_act/tc_ctinfo.h b/include/uapi/linux/tc_act/tc_ctinfo.h index 32337304fbe5..f5f26d95d0e7 100644 --- a/include/uapi/linux/tc_act/tc_ctinfo.h +++ b/include/uapi/linux/tc_act/tc_ctinfo.h @@ -26,9 +26,4 @@ enum { #define TCA_CTINFO_MAX (__TCA_CTINFO_MAX - 1) -enum { - CTINFO_MODE_DSCP = _BITUL(0), - CTINFO_MODE_CPMARK = _BITUL(1) -}; - #endif -- cgit v1.2.3-71-gd317 From dbb5281a1f84b2f93032d4864c211ce8a20811a7 Mon Sep 17 00:00:00 2001 From: Stephen Suryaputra Date: Thu, 20 Jun 2019 12:19:59 -0400 Subject: netfilter: nf_tables: add support for matching IPv4 options This is the kernel change for the overall changes with this description: Add capability to have rules matching IPv4 options. This is developed mainly to support dropping of IP packets with loose and/or strict source route options.
Signed-off-by: Stephen Suryaputra Signed-off-by: Pablo Neira Ayuso --- include/uapi/linux/netfilter/nf_tables.h | 2 + net/ipv4/ip_options.c | 1 + net/netfilter/nft_exthdr.c | 133 +++++++++++++++++++++++++++++++ 3 files changed, 136 insertions(+) (limited to 'include/uapi/linux') diff --git a/include/uapi/linux/netfilter/nf_tables.h b/include/uapi/linux/netfilter/nf_tables.h index 31a6b8f7ff73..c6c8ec5c7c00 100644 --- a/include/uapi/linux/netfilter/nf_tables.h +++ b/include/uapi/linux/netfilter/nf_tables.h @@ -730,10 +730,12 @@ enum nft_exthdr_flags { * * @NFT_EXTHDR_OP_IPV6: match against ipv6 extension headers * @NFT_EXTHDR_OP_TCP: match against tcp options + * @NFT_EXTHDR_OP_IPV4: match against ipv4 options */ enum nft_exthdr_op { NFT_EXTHDR_OP_IPV6, NFT_EXTHDR_OP_TCPOPT, + NFT_EXTHDR_OP_IPV4, __NFT_EXTHDR_OP_MAX }; #define NFT_EXTHDR_OP_MAX (__NFT_EXTHDR_OP_MAX - 1) diff --git a/net/ipv4/ip_options.c b/net/ipv4/ip_options.c index 3db31bb9df50..ddaa01ec2bce 100644 --- a/net/ipv4/ip_options.c +++ b/net/ipv4/ip_options.c @@ -473,6 +473,7 @@ error: *info = htonl((pp_ptr-iph)<<24); return -EINVAL; } +EXPORT_SYMBOL(__ip_options_compile); int ip_options_compile(struct net *net, struct ip_options *opt, struct sk_buff *skb) diff --git a/net/netfilter/nft_exthdr.c b/net/netfilter/nft_exthdr.c index 45c8a6c07783..8032b2937c7f 100644 --- a/net/netfilter/nft_exthdr.c +++ b/net/netfilter/nft_exthdr.c @@ -62,6 +62,103 @@ err: regs->verdict.code = NFT_BREAK; } +/* find the offset to specified option. + * + * If target header is found, its offset is set in *offset and return option + * number. Otherwise, return negative error. + * + * If the first fragment doesn't contain the End of Options it is considered + * invalid. 
+ */ +static int ipv4_find_option(struct net *net, struct sk_buff *skb, + unsigned int *offset, int target) +{ + unsigned char optbuf[sizeof(struct ip_options) + 40]; + struct ip_options *opt = (struct ip_options *)optbuf; + struct iphdr *iph, _iph; + unsigned int start; + bool found = false; + __be32 info; + int optlen; + + iph = skb_header_pointer(skb, 0, sizeof(_iph), &_iph); + if (!iph) + return -EBADMSG; + start = sizeof(struct iphdr); + + optlen = iph->ihl * 4 - (int)sizeof(struct iphdr); + if (optlen <= 0) + return -ENOENT; + + memset(opt, 0, sizeof(struct ip_options)); + /* Copy the options since __ip_options_compile() modifies + * the options. + */ + if (skb_copy_bits(skb, start, opt->__data, optlen)) + return -EBADMSG; + opt->optlen = optlen; + + if (__ip_options_compile(net, opt, NULL, &info)) + return -EBADMSG; + + switch (target) { + case IPOPT_SSRR: + case IPOPT_LSRR: + if (!opt->srr) + break; + found = target == IPOPT_SSRR ? opt->is_strictroute : + !opt->is_strictroute; + if (found) + *offset = opt->srr + start; + break; + case IPOPT_RR: + if (!opt->rr) + break; + *offset = opt->rr + start; + found = true; + break; + case IPOPT_RA: + if (!opt->router_alert) + break; + *offset = opt->router_alert + start; + found = true; + break; + default: + return -EOPNOTSUPP; + } + return found ? 
target : -ENOENT; +} + +static void nft_exthdr_ipv4_eval(const struct nft_expr *expr, + struct nft_regs *regs, + const struct nft_pktinfo *pkt) +{ + struct nft_exthdr *priv = nft_expr_priv(expr); + u32 *dest = &regs->data[priv->dreg]; + struct sk_buff *skb = pkt->skb; + unsigned int offset; + int err; + + if (skb->protocol != htons(ETH_P_IP)) + goto err; + + err = ipv4_find_option(nft_net(pkt), skb, &offset, priv->type); + if (priv->flags & NFT_EXTHDR_F_PRESENT) { + *dest = (err >= 0); + return; + } else if (err < 0) { + goto err; + } + offset += priv->offset; + + dest[priv->len / NFT_REG32_SIZE] = 0; + if (skb_copy_bits(pkt->skb, offset, dest, priv->len) < 0) + goto err; + return; +err: + regs->verdict.code = NFT_BREAK; +} + static void * nft_tcp_header_pointer(const struct nft_pktinfo *pkt, unsigned int len, void *buffer, unsigned int *tcphdr_len) @@ -315,6 +412,28 @@ static int nft_exthdr_tcp_set_init(const struct nft_ctx *ctx, return nft_validate_register_load(priv->sreg, priv->len); } +static int nft_exthdr_ipv4_init(const struct nft_ctx *ctx, + const struct nft_expr *expr, + const struct nlattr * const tb[]) +{ + struct nft_exthdr *priv = nft_expr_priv(expr); + int err = nft_exthdr_init(ctx, expr, tb); + + if (err < 0) + return err; + + switch (priv->type) { + case IPOPT_SSRR: + case IPOPT_LSRR: + case IPOPT_RR: + case IPOPT_RA: + break; + default: + return -EOPNOTSUPP; + } + return 0; +} + static int nft_exthdr_dump_common(struct sk_buff *skb, const struct nft_exthdr *priv) { if (nla_put_u8(skb, NFTA_EXTHDR_TYPE, priv->type)) @@ -361,6 +480,14 @@ static const struct nft_expr_ops nft_exthdr_ipv6_ops = { .dump = nft_exthdr_dump, }; +static const struct nft_expr_ops nft_exthdr_ipv4_ops = { + .type = &nft_exthdr_type, + .size = NFT_EXPR_SIZE(sizeof(struct nft_exthdr)), + .eval = nft_exthdr_ipv4_eval, + .init = nft_exthdr_ipv4_init, + .dump = nft_exthdr_dump, +}; + static const struct nft_expr_ops nft_exthdr_tcp_ops = { .type = &nft_exthdr_type, .size =
NFT_EXPR_SIZE(sizeof(struct nft_exthdr)), @@ -401,6 +528,12 @@ nft_exthdr_select_ops(const struct nft_ctx *ctx, if (tb[NFTA_EXTHDR_DREG]) return &nft_exthdr_ipv6_ops; break; + case NFT_EXTHDR_OP_IPV4: + if (ctx->family != NFPROTO_IPV6) { + if (tb[NFTA_EXTHDR_DREG]) + return &nft_exthdr_ipv4_ops; + } + break; } return ERR_PTR(-EOPNOTSUPP); -- cgit v1.2.3-71-gd317 From 5ca004d11bfa4f5705b2761b6de29f81914cf3fe Mon Sep 17 00:00:00 2001 From: Shuah Khan Date: Wed, 12 Jun 2019 20:56:20 -0400 Subject: media: media.h: Fix shifting signed 32-bit value by 31 bits problem Fix MEDIA_ENT_ID_FLAG_NEXT to use "U" cast to avoid shifting signed 32-bit value by 31 bits problem. This isn't a problem for kernel builds with gcc. This could be problem since this header is part of public API which could be included for builds using compilers that don't handle this condition safely resulting in undefined behavior. Signed-off-by: Shuah Khan Signed-off-by: Hans Verkuil Signed-off-by: Mauro Carvalho Chehab --- include/uapi/linux/media.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/uapi/linux') diff --git a/include/uapi/linux/media.h b/include/uapi/linux/media.h index 9aedb187bc48..383ac7b7d8f0 100644 --- a/include/uapi/linux/media.h +++ b/include/uapi/linux/media.h @@ -146,7 +146,7 @@ struct media_device_info { #define MEDIA_ENT_FL_CONNECTOR (1 << 1) /* OR with the entity id value to find the next entity */ -#define MEDIA_ENT_ID_FLAG_NEXT (1 << 31) +#define MEDIA_ENT_ID_FLAG_NEXT (1U << 31) struct media_entity_desc { __u32 id; -- cgit v1.2.3-71-gd317 From ff3c65cb81157b7259250a1f68ddf13a43923ecb Mon Sep 17 00:00:00 2001 From: Shuah Khan Date: Wed, 12 Jun 2019 20:56:52 -0400 Subject: media: videodev2.h: Fix shifting signed 32-bit value by 31 bits problem Fix v4l2_fourcc define to use "U" cast to avoid shifting signed 32-bit value by 31 bits problem. This isn't a problem for kernel builds with gcc. 
This could be a problem since this header is part of public API which could be included for builds using compilers that don't handle this condition safely resulting in undefined behavior. Signed-off-by: Shuah Khan Signed-off-by: Hans Verkuil Signed-off-by: Mauro Carvalho Chehab --- include/uapi/linux/videodev2.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/uapi/linux') diff --git a/include/uapi/linux/videodev2.h b/include/uapi/linux/videodev2.h index 1050a75fb7ef..9d9705ceda76 100644 --- a/include/uapi/linux/videodev2.h +++ b/include/uapi/linux/videodev2.h @@ -80,7 +80,7 @@ /* Four-character-code (FOURCC) */ #define v4l2_fourcc(a, b, c, d)\ ((__u32)(a) | ((__u32)(b) << 8) | ((__u32)(c) << 16) | ((__u32)(d) << 24)) -#define v4l2_fourcc_be(a, b, c, d) (v4l2_fourcc(a, b, c, d) | (1 << 31)) +#define v4l2_fourcc_be(a, b, c, d) (v4l2_fourcc(a, b, c, d) | (1U << 31)) /* * E N U M S -- cgit v1.2.3-71-gd317 From 06d2bfedd147d26af6908e4202466586133e73a7 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Thu, 13 Jun 2019 09:08:52 +0200 Subject: binfmt_flat: remove the uapi header The split between the two flat.h files is completely arbitrary, and the uapi version even contains CONFIG_ ifdefs that can't work in userspace. The only userspace program known to use the header is elf2flt, and it ships with its own version of the combined header. Use the chance to move the inclusion out of this file, as it is in no way needed for the format definition, but just for the binfmt implementation.
Signed-off-by: Christoph Hellwig Tested-by: Vladimir Murzin Reviewed-by: Vladimir Murzin Signed-off-by: Greg Ungerer --- fs/binfmt_flat.c | 1 + include/linux/flat.h | 45 ++++++++++++++++++++++++++++++++---- include/uapi/linux/flat.h | 59 ----------------------------------------------- 3 files changed, 42 insertions(+), 63 deletions(-) delete mode 100644 include/uapi/linux/flat.h (limited to 'include/uapi/linux') diff --git a/fs/binfmt_flat.c b/fs/binfmt_flat.c index a15fdd5d95ed..b63c5e63ae3f 100644 --- a/fs/binfmt_flat.c +++ b/fs/binfmt_flat.c @@ -42,6 +42,7 @@ #include #include #include +#include #ifndef flat_get_relocate_addr #define flat_get_relocate_addr(rel) (rel) diff --git a/include/linux/flat.h b/include/linux/flat.h index 569b67d64d5c..21d901ba191b 100644 --- a/include/linux/flat.h +++ b/include/linux/flat.h @@ -10,8 +10,47 @@ #ifndef _LINUX_FLAT_H #define _LINUX_FLAT_H -#include -#include +#define FLAT_VERSION 0x00000004L + +#ifdef CONFIG_BINFMT_SHARED_FLAT +#define MAX_SHARED_LIBS (4) +#else +#define MAX_SHARED_LIBS (1) +#endif + +/* + * To make everything easier to port and manage cross platform + * development, all fields are in network byte order. + */ + +struct flat_hdr { + char magic[4]; + unsigned long rev; /* version (as above) */ + unsigned long entry; /* Offset of first executable instruction + with text segment from beginning of file */ + unsigned long data_start; /* Offset of data segment from beginning of + file */ + unsigned long data_end; /* Offset of end of data segment + from beginning of file */ + unsigned long bss_end; /* Offset of end of bss segment from beginning + of file */ + + /* (It is assumed that data_end through bss_end forms the bss segment.) 
*/ + + unsigned long stack_size; /* Size of stack, in bytes */ + unsigned long reloc_start; /* Offset of relocation records from + beginning of file */ + unsigned long reloc_count; /* Number of relocation records */ + unsigned long flags; + unsigned long build_date; /* When the program/library was built */ + unsigned long filler[5]; /* Reservered, set to zero */ +}; + +#define FLAT_FLAG_RAM 0x0001 /* load program entirely into RAM */ +#define FLAT_FLAG_GOTPIC 0x0002 /* program is PIC with GOT */ +#define FLAT_FLAG_GZIP 0x0004 /* all but the header is compressed */ +#define FLAT_FLAG_GZDATA 0x0008 /* only data/relocs are compressed (for XIP) */ +#define FLAT_FLAG_KTRACE 0x0010 /* output useful kernel trace for debugging */ /* * While it would be nice to keep this header clean, users of older @@ -22,8 +61,6 @@ * with the format above, except to fix bugs with old format support. */ -#include - #define OLD_FLAT_VERSION 0x00000002L #define OLD_FLAT_RELOC_TYPE_TEXT 0 #define OLD_FLAT_RELOC_TYPE_DATA 1 diff --git a/include/uapi/linux/flat.h b/include/uapi/linux/flat.h deleted file mode 100644 index 27e595e44fb7..000000000000 --- a/include/uapi/linux/flat.h +++ /dev/null @@ -1,59 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note */ -/* - * Copyright (C) 2002-2003 David McCullough - * Copyright (C) 1998 Kenneth Albanowski - * The Silver Hammer Group, Ltd. - * - * This file provides the definitions and structures needed to - * support uClinux flat-format executables. - */ - -#ifndef _UAPI_LINUX_FLAT_H -#define _UAPI_LINUX_FLAT_H - - -#define FLAT_VERSION 0x00000004L - -#ifdef CONFIG_BINFMT_SHARED_FLAT -#define MAX_SHARED_LIBS (4) -#else -#define MAX_SHARED_LIBS (1) -#endif - -/* - * To make everything easier to port and manage cross platform - * development, all fields are in network byte order. 
- */ - -struct flat_hdr { - char magic[4]; - unsigned long rev; /* version (as above) */ - unsigned long entry; /* Offset of first executable instruction - with text segment from beginning of file */ - unsigned long data_start; /* Offset of data segment from beginning of - file */ - unsigned long data_end; /* Offset of end of data segment - from beginning of file */ - unsigned long bss_end; /* Offset of end of bss segment from beginning - of file */ - - /* (It is assumed that data_end through bss_end forms the bss segment.) */ - - unsigned long stack_size; /* Size of stack, in bytes */ - unsigned long reloc_start; /* Offset of relocation records from - beginning of file */ - unsigned long reloc_count; /* Number of relocation records */ - unsigned long flags; - unsigned long build_date; /* When the program/library was built */ - unsigned long filler[5]; /* Reservered, set to zero */ -}; - -#define FLAT_FLAG_RAM 0x0001 /* load program entirely into RAM */ -#define FLAT_FLAG_GOTPIC 0x0002 /* program is PIC with GOT */ -#define FLAT_FLAG_GZIP 0x0004 /* all but the header is compressed */ -#define FLAT_FLAG_GZDATA 0x0008 /* only data/relocs are compressed (for XIP) */ -#define FLAT_FLAG_KTRACE 0x0010 /* output useful kernel trace for debugging */ - - - -#endif /* _UAPI_LINUX_FLAT_H */ -- cgit v1.2.3-71-gd317 From 9e645e1105ca60fbbc6bddf2fd5ef7e57ed3dca8 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Fri, 10 May 2019 16:07:28 -0600 Subject: io_uring: add support for sqe links With SQE links, we can create chains of dependent SQEs. One example would be queueing an SQE that's a read from one file descriptor, with the linked SQE being a write to another with the same set of buffers. An SQE link will not stall the pipeline, it'll just ensure that dependent SQEs aren't issued before the previous link has completed. Any error at submission or completion time will break the chain of SQEs. 
For completions, this also includes short reads or writes, as the next SQE could depend on the previous one being fully completed. Any SQE in a chain that gets canceled due to any of the above errors, will get a CQE filled with -ECANCELED as the error value. Signed-off-by: Jens Axboe --- fs/io_uring.c | 241 +++++++++++++++++++++++++++++++++--------- include/uapi/linux/io_uring.h | 1 + 2 files changed, 194 insertions(+), 48 deletions(-) (limited to 'include/uapi/linux') diff --git a/fs/io_uring.c b/fs/io_uring.c index 92debd8be535..9f0ef4956f87 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -322,6 +322,7 @@ struct io_kiocb { struct io_ring_ctx *ctx; struct list_head list; + struct list_head link_list; unsigned int flags; refcount_t refs; #define REQ_F_NOWAIT 1 /* must not punt to workers */ @@ -330,8 +331,10 @@ struct io_kiocb { #define REQ_F_SEQ_PREV 8 /* sequential with previous */ #define REQ_F_IO_DRAIN 16 /* drain existing IO first */ #define REQ_F_IO_DRAINED 32 /* drain done */ +#define REQ_F_LINK 64 /* linked sqes */ +#define REQ_F_FAIL_LINK 128 /* fail rest of links */ u64 user_data; - u32 error; /* iopoll result from callback */ + u32 result; u32 sequence; struct work_struct work; @@ -583,6 +586,7 @@ static struct io_kiocb *io_get_req(struct io_ring_ctx *ctx, req->flags = 0; /* one is dropped after submission, the other at completion */ refcount_set(&req->refs, 2); + req->result = 0; return req; out: io_ring_drop_ctx_refs(ctx, 1); @@ -598,7 +602,7 @@ static void io_free_req_many(struct io_ring_ctx *ctx, void **reqs, int *nr) } } -static void io_free_req(struct io_kiocb *req) +static void __io_free_req(struct io_kiocb *req) { if (req->file && !(req->flags & REQ_F_FIXED_FILE)) fput(req->file); @@ -606,6 +610,63 @@ static void io_free_req(struct io_kiocb *req) kmem_cache_free(req_cachep, req); } +static void io_req_link_next(struct io_kiocb *req) +{ + struct io_kiocb *nxt; + + /* + * The list should never be empty when we are called here.
But could + * potentially happen if the chain is messed up, check to be on the + * safe side. + */ + nxt = list_first_entry_or_null(&req->link_list, struct io_kiocb, list); + if (nxt) { + list_del(&nxt->list); + if (!list_empty(&req->link_list)) { + INIT_LIST_HEAD(&nxt->link_list); + list_splice(&req->link_list, &nxt->link_list); + nxt->flags |= REQ_F_LINK; + } + + INIT_WORK(&nxt->work, io_sq_wq_submit_work); + queue_work(req->ctx->sqo_wq, &nxt->work); + } +} + +/* + * Called if REQ_F_LINK is set, and we fail the head request + */ +static void io_fail_links(struct io_kiocb *req) +{ + struct io_kiocb *link; + + while (!list_empty(&req->link_list)) { + link = list_first_entry(&req->link_list, struct io_kiocb, list); + list_del(&link->list); + + io_cqring_add_event(req->ctx, link->user_data, -ECANCELED); + __io_free_req(link); + } +} + +static void io_free_req(struct io_kiocb *req) +{ + /* + * If LINK is set, we have dependent requests in this chain. If we + * didn't fail this request, queue the first one up, moving any other + * dependencies to the next request. In case of failure, fail the rest + * of the chain. + */ + if (req->flags & REQ_F_LINK) { + if (req->flags & REQ_F_FAIL_LINK) + io_fail_links(req); + else + io_req_link_next(req); + } + + __io_free_req(req); +} + static void io_put_req(struct io_kiocb *req) { if (refcount_dec_and_test(&req->refs)) @@ -627,16 +688,17 @@ static void io_iopoll_complete(struct io_ring_ctx *ctx, unsigned int *nr_events, req = list_first_entry(done, struct io_kiocb, list); list_del(&req->list); - io_cqring_fill_event(ctx, req->user_data, req->error); + io_cqring_fill_event(ctx, req->user_data, req->result); (*nr_events)++; if (refcount_dec_and_test(&req->refs)) { /* If we're not using fixed files, we have to pair the * completion part with the file put. Use regular * completions for those, only batch free for fixed - * file. + * file and non-linked commands. 
*/ - if (req->flags & REQ_F_FIXED_FILE) { + if ((req->flags & (REQ_F_FIXED_FILE|REQ_F_LINK)) == + REQ_F_FIXED_FILE) { reqs[to_free++] = req; if (to_free == ARRAY_SIZE(reqs)) io_free_req_many(ctx, reqs, &to_free); @@ -775,6 +837,8 @@ static void io_complete_rw(struct kiocb *kiocb, long res, long res2) kiocb_end_write(kiocb); + if ((req->flags & REQ_F_LINK) && res != req->result) + req->flags |= REQ_F_FAIL_LINK; io_cqring_add_event(req->ctx, req->user_data, res); io_put_req(req); } @@ -785,7 +849,9 @@ static void io_complete_rw_iopoll(struct kiocb *kiocb, long res, long res2) kiocb_end_write(kiocb); - req->error = res; + if ((req->flags & REQ_F_LINK) && res != req->result) + req->flags |= REQ_F_FAIL_LINK; + req->result = res; if (res != -EAGAIN) req->flags |= REQ_F_IOPOLL_COMPLETED; } @@ -928,7 +994,6 @@ static int io_prep_rw(struct io_kiocb *req, const struct sqe_submit *s, !kiocb->ki_filp->f_op->iopoll) return -EOPNOTSUPP; - req->error = 0; kiocb->ki_flags |= IOCB_HIPRI; kiocb->ki_complete = io_complete_rw_iopoll; } else { @@ -1106,6 +1171,9 @@ static int io_read(struct io_kiocb *req, const struct sqe_submit *s, return ret; read_size = ret; + if (req->flags & REQ_F_LINK) + req->result = read_size; + iov_count = iov_iter_count(&iter); ret = rw_verify_area(READ, file, &kiocb->ki_pos, iov_count); if (!ret) { @@ -1163,6 +1231,9 @@ static int io_write(struct io_kiocb *req, const struct sqe_submit *s, if (ret < 0) return ret; + if (req->flags & REQ_F_LINK) + req->result = ret; + iov_count = iov_iter_count(&iter); ret = -EAGAIN; @@ -1266,6 +1337,8 @@ static int io_fsync(struct io_kiocb *req, const struct io_uring_sqe *sqe, end > 0 ? 
end : LLONG_MAX, fsync_flags & IORING_FSYNC_DATASYNC); + if (ret < 0 && (req->flags & REQ_F_LINK)) + req->flags |= REQ_F_FAIL_LINK; io_cqring_add_event(req->ctx, sqe->user_data, ret); io_put_req(req); return 0; @@ -1310,6 +1383,8 @@ static int io_sync_file_range(struct io_kiocb *req, ret = sync_file_range(req->rw.ki_filp, sqe_off, sqe_len, flags); + if (ret < 0 && (req->flags & REQ_F_LINK)) + req->flags |= REQ_F_FAIL_LINK; io_cqring_add_event(req->ctx, sqe->user_data, ret); io_put_req(req); return 0; @@ -1562,9 +1637,10 @@ static int __io_submit_sqe(struct io_ring_ctx *ctx, struct io_kiocb *req, { int ret, opcode; + req->user_data = READ_ONCE(s->sqe->user_data); + if (unlikely(s->index >= ctx->sq_entries)) return -EINVAL; - req->user_data = READ_ONCE(s->sqe->user_data); opcode = READ_ONCE(s->sqe->opcode); switch (opcode) { @@ -1608,7 +1684,7 @@ static int __io_submit_sqe(struct io_ring_ctx *ctx, struct io_kiocb *req, return ret; if (ctx->flags & IORING_SETUP_IOPOLL) { - if (req->error == -EAGAIN) + if (req->result == -EAGAIN) return -EAGAIN; /* workqueue context doesn't hold uring_lock, grab it now */ @@ -1834,31 +1910,11 @@ static int io_req_set_file(struct io_ring_ctx *ctx, const struct sqe_submit *s, return 0; } -static int io_submit_sqe(struct io_ring_ctx *ctx, struct sqe_submit *s, - struct io_submit_state *state) +static int io_queue_sqe(struct io_ring_ctx *ctx, struct io_kiocb *req, + struct sqe_submit *s) { - struct io_kiocb *req; int ret; - /* enforce forwards compatibility on users */ - if (unlikely(s->sqe->flags & ~(IOSQE_FIXED_FILE | IOSQE_IO_DRAIN))) - return -EINVAL; - - req = io_get_req(ctx, state); - if (unlikely(!req)) - return -EAGAIN; - - ret = io_req_set_file(ctx, s, state, req); - if (unlikely(ret)) - goto out; - - ret = io_req_defer(ctx, req, s->sqe); - if (ret) { - if (ret == -EIOCBQUEUED) - ret = 0; - return ret; - } - ret = __io_submit_sqe(ctx, req, s, true); if (ret == -EAGAIN && !(req->flags & REQ_F_NOWAIT)) { struct io_uring_sqe 
*sqe_copy; @@ -1881,24 +1937,93 @@ static int io_submit_sqe(struct io_ring_ctx *ctx, struct sqe_submit *s, /* * Queued up for async execution, worker will release - * submit reference when the iocb is actually - * submitted. + * submit reference when the iocb is actually submitted. */ return 0; } } -out: /* drop submission reference */ io_put_req(req); /* and drop final reference, if we failed */ - if (ret) + if (ret) { + io_cqring_add_event(ctx, req->user_data, ret); + if (req->flags & REQ_F_LINK) + req->flags |= REQ_F_FAIL_LINK; io_put_req(req); + } return ret; } +#define SQE_VALID_FLAGS (IOSQE_FIXED_FILE|IOSQE_IO_DRAIN|IOSQE_IO_LINK) + +static void io_submit_sqe(struct io_ring_ctx *ctx, struct sqe_submit *s, + struct io_submit_state *state, struct io_kiocb **link) +{ + struct io_uring_sqe *sqe_copy; + struct io_kiocb *req; + int ret; + + /* enforce forwards compatibility on users */ + if (unlikely(s->sqe->flags & ~SQE_VALID_FLAGS)) { + ret = -EINVAL; + goto err; + } + + req = io_get_req(ctx, state); + if (unlikely(!req)) { + ret = -EAGAIN; + goto err; + } + + ret = io_req_set_file(ctx, s, state, req); + if (unlikely(ret)) { +err_req: + io_free_req(req); +err: + io_cqring_add_event(ctx, s->sqe->user_data, ret); + return; + } + + ret = io_req_defer(ctx, req, s->sqe); + if (ret) { + if (ret != -EIOCBQUEUED) + goto err_req; + return; + } + + /* + * If we already have a head request, queue this one for async + * submittal once the head completes. If we don't have a head but + * IOSQE_IO_LINK is set in the sqe, start a new head. This one will be + * submitted sync once the chain is complete. If none of those + * conditions are true (normal request), then just queue it. 
+ */ + if (*link) { + struct io_kiocb *prev = *link; + + sqe_copy = kmemdup(s->sqe, sizeof(*sqe_copy), GFP_KERNEL); + if (!sqe_copy) { + ret = -EAGAIN; + goto err_req; + } + + s->sqe = sqe_copy; + memcpy(&req->submit, s, sizeof(*s)); + list_add_tail(&req->list, &prev->link_list); + } else if (s->sqe->flags & IOSQE_IO_LINK) { + req->flags |= REQ_F_LINK; + + memcpy(&req->submit, s, sizeof(*s)); + INIT_LIST_HEAD(&req->link_list); + *link = req; + } else { + io_queue_sqe(ctx, req, s); + } +} + /* * Batched submission is done, ensure local IO is flushed out. */ @@ -1981,7 +2106,9 @@ static int io_submit_sqes(struct io_ring_ctx *ctx, struct sqe_submit *sqes, unsigned int nr, bool has_user, bool mm_fault) { struct io_submit_state state, *statep = NULL; - int ret, i, submitted = 0; + struct io_kiocb *link = NULL; + bool prev_was_link = false; + int i, submitted = 0; if (nr > IO_PLUG_THRESHOLD) { io_submit_state_start(&state, ctx, nr); @@ -1989,22 +2116,30 @@ static int io_submit_sqes(struct io_ring_ctx *ctx, struct sqe_submit *sqes, } for (i = 0; i < nr; i++) { + /* + * If previous wasn't linked and we have a linked command, + * that's the end of the chain. Submit the previous link. 
+ */ + if (!prev_was_link && link) { + io_queue_sqe(ctx, link, &link->submit); + link = NULL; + } + prev_was_link = (sqes[i].sqe->flags & IOSQE_IO_LINK) != 0; + if (unlikely(mm_fault)) { - ret = -EFAULT; + io_cqring_add_event(ctx, sqes[i].sqe->user_data, + -EFAULT); } else { sqes[i].has_user = has_user; sqes[i].needs_lock = true; sqes[i].needs_fixed_file = true; - ret = io_submit_sqe(ctx, &sqes[i], statep); - } - if (!ret) { + io_submit_sqe(ctx, &sqes[i], statep, &link); submitted++; - continue; } - - io_cqring_add_event(ctx, sqes[i].sqe->user_data, ret); } + if (link) + io_queue_sqe(ctx, link, &link->submit); if (statep) io_submit_state_end(&state); @@ -2145,6 +2280,8 @@ static int io_sq_thread(void *data) static int io_ring_submit(struct io_ring_ctx *ctx, unsigned int to_submit) { struct io_submit_state state, *statep = NULL; + struct io_kiocb *link = NULL; + bool prev_was_link = false; int i, submit = 0; if (to_submit > IO_PLUG_THRESHOLD) { @@ -2154,22 +2291,30 @@ static int io_ring_submit(struct io_ring_ctx *ctx, unsigned int to_submit) for (i = 0; i < to_submit; i++) { struct sqe_submit s; - int ret; if (!io_get_sqring(ctx, &s)) break; + /* + * If previous wasn't linked and we have a linked command, + * that's the end of the chain. Submit the previous link. 
+ */ + if (!prev_was_link && link) { + io_queue_sqe(ctx, link, &link->submit); + link = NULL; + } + prev_was_link = (s.sqe->flags & IOSQE_IO_LINK) != 0; + s.has_user = true; s.needs_lock = false; s.needs_fixed_file = false; submit++; - - ret = io_submit_sqe(ctx, &s, statep); - if (ret) - io_cqring_add_event(ctx, s.sqe->user_data, ret); + io_submit_sqe(ctx, &s, statep, &link); } io_commit_sqring(ctx); + if (link) + io_queue_sqe(ctx, link, &link->submit); if (statep) io_submit_state_end(statep); diff --git a/include/uapi/linux/io_uring.h b/include/uapi/linux/io_uring.h index a0c460025036..10b7c45f6d57 100644 --- a/include/uapi/linux/io_uring.h +++ b/include/uapi/linux/io_uring.h @@ -40,6 +40,7 @@ struct io_uring_sqe { */ #define IOSQE_FIXED_FILE (1U << 0) /* use fixed fileset */ #define IOSQE_IO_DRAIN (1U << 1) /* issue after inflight IO */ +#define IOSQE_IO_LINK (1U << 2) /* links next sqe */ /* * io_uring_setup() flags -- cgit v1.2.3-71-gd317 From 1d6362fa0cfc8c7b243fa92924429d826599e691 Mon Sep 17 00:00:00 2001 From: Patrick Bellasi Date: Fri, 21 Jun 2019 09:42:06 +0100 Subject: sched/core: Allow sched_setattr() to use the current policy The sched_setattr() syscall mandates that a policy is always specified. This requires to always know which policy a task will have when attributes are configured and this makes it impossible to add more generic task attributes valid across different scheduling policies. Reading the policy before setting generic tasks attributes is racy since we cannot be sure it is not changed concurrently. Introduce the required support to change generic task attributes without affecting the current task policy. This is done by adding an attribute flag (SCHED_FLAG_KEEP_POLICY) to enforce the usage of the current policy. Add support for the SETPARAM_POLICY policy, which is already used by the sched_setparam() POSIX syscall, to the sched_setattr() non-POSIX syscall. 
Signed-off-by: Patrick Bellasi Signed-off-by: Peter Zijlstra (Intel) Cc: Alessio Balsini Cc: Dietmar Eggemann Cc: Joel Fernandes Cc: Juri Lelli Cc: Linus Torvalds Cc: Morten Rasmussen Cc: Paul Turner Cc: Peter Zijlstra Cc: Quentin Perret Cc: Rafael J . Wysocki Cc: Steve Muckle Cc: Suren Baghdasaryan Cc: Tejun Heo Cc: Thomas Gleixner Cc: Todd Kjos Cc: Vincent Guittot Cc: Viresh Kumar Link: https://lkml.kernel.org/r/20190621084217.8167-6-patrick.bellasi@arm.com Signed-off-by: Ingo Molnar --- include/uapi/linux/sched.h | 4 +++- kernel/sched/core.c | 2 ++ 2 files changed, 5 insertions(+), 1 deletion(-) (limited to 'include/uapi/linux') diff --git a/include/uapi/linux/sched.h b/include/uapi/linux/sched.h index ed4ee170bee2..58b2368d3634 100644 --- a/include/uapi/linux/sched.h +++ b/include/uapi/linux/sched.h @@ -51,9 +51,11 @@ #define SCHED_FLAG_RESET_ON_FORK 0x01 #define SCHED_FLAG_RECLAIM 0x02 #define SCHED_FLAG_DL_OVERRUN 0x04 +#define SCHED_FLAG_KEEP_POLICY 0x08 #define SCHED_FLAG_ALL (SCHED_FLAG_RESET_ON_FORK | \ SCHED_FLAG_RECLAIM | \ - SCHED_FLAG_DL_OVERRUN) + SCHED_FLAG_DL_OVERRUN | \ + SCHED_FLAG_KEEP_POLICY) #endif /* _UAPI_LINUX_SCHED_H */ diff --git a/kernel/sched/core.c b/kernel/sched/core.c index b74de86b68c7..6d519f3f9789 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -4897,6 +4897,8 @@ SYSCALL_DEFINE3(sched_setattr, pid_t, pid, struct sched_attr __user *, uattr, if ((int)attr.sched_policy < 0) return -EINVAL; + if (attr.sched_flags & SCHED_FLAG_KEEP_POLICY) + attr.sched_policy = SETPARAM_POLICY; rcu_read_lock(); retval = -ESRCH; -- cgit v1.2.3-71-gd317 From a509a7cd79747074a2c018a45bbbc52d1f4aed44 Mon Sep 17 00:00:00 2001 From: Patrick Bellasi Date: Fri, 21 Jun 2019 09:42:07 +0100 Subject: sched/uclamp: Extend sched_setattr() to support utilization clamping The SCHED_DEADLINE scheduling class provides an advanced and formal model to define tasks requirements that can translate into proper decisions for both task placements and frequencies 
selections. Other classes have a more simplified model based on the POSIX concept of priorities. Such a simple priority-based model, however, does not allow exploiting the most advanced features of the Linux scheduler, such as driving frequency selection via the schedutil cpufreq governor. However, even for non-SCHED_DEADLINE tasks, it is still useful to define task properties to support scheduler decisions. Utilization clamping exposes to user-space a new set of per-task attributes the scheduler can use as hints about the expected/required utilization for a task. This makes it possible to implement a "proactive" per-task frequency control policy, a more advanced policy than the current one based just on "passive" measured task utilization. For example, it's possible to boost interactive tasks (e.g. to get better performance) or cap background tasks (e.g. to be more energy/thermal efficient). Introduce a new API to set utilization clamping values for a specified task by extending sched_setattr(), a syscall which already allows defining task-specific properties for different scheduling classes. A new pair of attributes allows specifying the minimum and maximum utilization the scheduler can consider for a task. Do that by first validating the required clamp values and then applying the required changes using _the_ same pattern already in use for __setscheduler(). This ensures that the task is re-enqueued with the new clamp values. Signed-off-by: Patrick Bellasi Signed-off-by: Peter Zijlstra (Intel) Cc: Alessio Balsini Cc: Dietmar Eggemann Cc: Joel Fernandes Cc: Juri Lelli Cc: Linus Torvalds Cc: Morten Rasmussen Cc: Paul Turner Cc: Peter Zijlstra Cc: Quentin Perret Cc: Rafael J .
Wysocki Cc: Steve Muckle Cc: Suren Baghdasaryan Cc: Tejun Heo Cc: Thomas Gleixner Cc: Todd Kjos Cc: Vincent Guittot Cc: Viresh Kumar Link: https://lkml.kernel.org/r/20190621084217.8167-7-patrick.bellasi@arm.com Signed-off-by: Ingo Molnar --- include/linux/sched.h | 9 ++++ include/uapi/linux/sched.h | 12 +++++- include/uapi/linux/sched/types.h | 66 +++++++++++++++++++++++++---- kernel/sched/core.c | 91 ++++++++++++++++++++++++++++++++++++---- 4 files changed, 161 insertions(+), 17 deletions(-) (limited to 'include/uapi/linux') diff --git a/include/linux/sched.h b/include/linux/sched.h index 5485f411e8e1..1113dd4706ae 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -587,6 +587,7 @@ struct sched_dl_entity { * @value: clamp value "assigned" to a se * @bucket_id: bucket index corresponding to the "assigned" value * @active: the se is currently refcounted in a rq's bucket + * @user_defined: the requested clamp value comes from user-space * * The bucket_id is the index of the clamp bucket matching the clamp value * which is pre-computed and stored to avoid expensive integer divisions from @@ -596,11 +597,19 @@ struct sched_dl_entity { * which can be different from the clamp value "requested" from user-space. * This allows to know a task is refcounted in the rq's bucket corresponding * to the "effective" bucket_id. + * + * The user_defined bit is set whenever a task has got a task-specific clamp + * value requested from userspace, i.e. the system defaults apply to this task + * just as a restriction. This allows to relax default clamps when a less + * restrictive task-specific value has been requested, thus allowing to + * implement a "nice" semantic. For example, a task running with a 20% + * default boost can still drop its own boosting to 0%. 
*/ struct uclamp_se { unsigned int value : bits_per(SCHED_CAPACITY_SCALE); unsigned int bucket_id : bits_per(UCLAMP_BUCKETS); unsigned int active : 1; + unsigned int user_defined : 1; }; #endif /* CONFIG_UCLAMP_TASK */ diff --git a/include/uapi/linux/sched.h b/include/uapi/linux/sched.h index 58b2368d3634..617bb59aa8ba 100644 --- a/include/uapi/linux/sched.h +++ b/include/uapi/linux/sched.h @@ -52,10 +52,20 @@ #define SCHED_FLAG_RECLAIM 0x02 #define SCHED_FLAG_DL_OVERRUN 0x04 #define SCHED_FLAG_KEEP_POLICY 0x08 +#define SCHED_FLAG_KEEP_PARAMS 0x10 +#define SCHED_FLAG_UTIL_CLAMP_MIN 0x20 +#define SCHED_FLAG_UTIL_CLAMP_MAX 0x40 + +#define SCHED_FLAG_KEEP_ALL (SCHED_FLAG_KEEP_POLICY | \ + SCHED_FLAG_KEEP_PARAMS) + +#define SCHED_FLAG_UTIL_CLAMP (SCHED_FLAG_UTIL_CLAMP_MIN | \ + SCHED_FLAG_UTIL_CLAMP_MAX) #define SCHED_FLAG_ALL (SCHED_FLAG_RESET_ON_FORK | \ SCHED_FLAG_RECLAIM | \ SCHED_FLAG_DL_OVERRUN | \ - SCHED_FLAG_KEEP_POLICY) + SCHED_FLAG_KEEP_ALL | \ + SCHED_FLAG_UTIL_CLAMP) #endif /* _UAPI_LINUX_SCHED_H */ diff --git a/include/uapi/linux/sched/types.h b/include/uapi/linux/sched/types.h index 10fbb8031930..c852153ddb0d 100644 --- a/include/uapi/linux/sched/types.h +++ b/include/uapi/linux/sched/types.h @@ -9,6 +9,7 @@ struct sched_param { }; #define SCHED_ATTR_SIZE_VER0 48 /* sizeof first published struct */ +#define SCHED_ATTR_SIZE_VER1 56 /* add: util_{min,max} */ /* * Extended scheduling parameters data structure. @@ -21,8 +22,33 @@ struct sched_param { * the tasks may be useful for a wide variety of application fields, e.g., * multimedia, streaming, automation and control, and many others. * - * This variant (sched_attr) is meant at describing a so-called - * sporadic time-constrained task. In such model a task is specified by: + * This variant (sched_attr) allows to define additional attributes to + * improve the scheduler knowledge about task requirements. 
+ * + * Scheduling Class Attributes + * =========================== + * + * A subset of sched_attr attributes specifies the + * scheduling policy and relative POSIX attributes: + * + * @size size of the structure, for fwd/bwd compat. + * + * @sched_policy task's scheduling policy + * @sched_nice task's nice value (SCHED_NORMAL/BATCH) + * @sched_priority task's static priority (SCHED_FIFO/RR) + * + * Certain more advanced scheduling features can be controlled by a + * predefined set of flags via the attribute: + * + * @sched_flags for customizing the scheduler behaviour + * + * Sporadic Time-Constrained Task Attributes + * ========================================= + * + * A subset of sched_attr attributes allows to describe a so-called + * sporadic time-constrained task. + * + * In such a model a task is specified by: * - the activation period or minimum instance inter-arrival time; * - the maximum (or average, depending on the actual scheduling * discipline) computation time of all instances, a.k.a. runtime; @@ -34,14 +60,8 @@ struct sched_param { * than the runtime and must be completed by time instant t equal to * the instance activation time + the deadline. * - * This is reflected by the actual fields of the sched_attr structure: + * This is reflected by the following fields of the sched_attr structure: * - * @size size of the structure, for fwd/bwd compat. - * - * @sched_policy task's scheduling policy - * @sched_flags for customizing the scheduler behaviour - * @sched_nice task's nice value (SCHED_NORMAL/BATCH) - * @sched_priority task's static priority (SCHED_FIFO/RR) * @sched_deadline representative of the task's deadline * @sched_runtime representative of the task's runtime * @sched_period representative of the task's period @@ -53,6 +73,29 @@ struct sched_param { * As of now, the SCHED_DEADLINE policy (sched_dl scheduling class) is the * only user of this new interface. 
More information about the algorithm * available in the scheduling class file or in Documentation/. + * + * Task Utilization Attributes + * =========================== + * + * A subset of sched_attr attributes allows to specify the utilization + * expected for a task. These attributes allow to inform the scheduler about + * the utilization boundaries within which it should schedule the task. These + * boundaries are valuable hints to support scheduler decisions on both task + * placement and frequency selection. + * + * @sched_util_min represents the minimum utilization + * @sched_util_max represents the maximum utilization + * + * Utilization is a value in the range [0..SCHED_CAPACITY_SCALE]. It + * represents the percentage of CPU time used by a task when running at the + * maximum frequency on the highest capacity CPU of the system. For example, a + * 20% utilization task is a task running for 2ms every 10ms at maximum + * frequency. + * + * A task with a min utilization value bigger than 0 is more likely scheduled + * on a CPU with a capacity big enough to fit the specified value. + * A task with a max utilization value smaller than 1024 is more likely + * scheduled on a CPU with no more capacity than the specified value. 
*/ struct sched_attr { __u32 size; @@ -70,6 +113,11 @@ struct sched_attr { __u64 sched_runtime; __u64 sched_deadline; __u64 sched_period; + + /* Utilization hints */ + __u32 sched_util_min; + __u32 sched_util_max; + }; #endif /* _UAPI_LINUX_SCHED_TYPES_H */ diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 6d519f3f9789..e9a669266fa9 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -805,10 +805,12 @@ static inline unsigned int uclamp_none(int clamp_id) return SCHED_CAPACITY_SCALE; } -static inline void uclamp_se_set(struct uclamp_se *uc_se, unsigned int value) +static inline void uclamp_se_set(struct uclamp_se *uc_se, + unsigned int value, bool user_defined) { uc_se->value = value; uc_se->bucket_id = uclamp_bucket_id(value); + uc_se->user_defined = user_defined; } static inline unsigned int @@ -1016,11 +1018,11 @@ int sysctl_sched_uclamp_handler(struct ctl_table *table, int write, if (old_min != sysctl_sched_uclamp_util_min) { uclamp_se_set(&uclamp_default[UCLAMP_MIN], - sysctl_sched_uclamp_util_min); + sysctl_sched_uclamp_util_min, false); } if (old_max != sysctl_sched_uclamp_util_max) { uclamp_se_set(&uclamp_default[UCLAMP_MAX], - sysctl_sched_uclamp_util_max); + sysctl_sched_uclamp_util_max, false); } /* @@ -1038,6 +1040,42 @@ done: return result; } +static int uclamp_validate(struct task_struct *p, + const struct sched_attr *attr) +{ + unsigned int lower_bound = p->uclamp_req[UCLAMP_MIN].value; + unsigned int upper_bound = p->uclamp_req[UCLAMP_MAX].value; + + if (attr->sched_flags & SCHED_FLAG_UTIL_CLAMP_MIN) + lower_bound = attr->sched_util_min; + if (attr->sched_flags & SCHED_FLAG_UTIL_CLAMP_MAX) + upper_bound = attr->sched_util_max; + + if (lower_bound > upper_bound) + return -EINVAL; + if (upper_bound > SCHED_CAPACITY_SCALE) + return -EINVAL; + + return 0; +} + +static void __setscheduler_uclamp(struct task_struct *p, + const struct sched_attr *attr) +{ + if (likely(!(attr->sched_flags & SCHED_FLAG_UTIL_CLAMP))) + return; + + if 
(attr->sched_flags & SCHED_FLAG_UTIL_CLAMP_MIN) { + uclamp_se_set(&p->uclamp_req[UCLAMP_MIN], + attr->sched_util_min, true); + } + + if (attr->sched_flags & SCHED_FLAG_UTIL_CLAMP_MAX) { + uclamp_se_set(&p->uclamp_req[UCLAMP_MAX], + attr->sched_util_max, true); + } +} + static void uclamp_fork(struct task_struct *p) { unsigned int clamp_id; @@ -1059,11 +1097,11 @@ static void __init init_uclamp(void) for_each_clamp_id(clamp_id) { uclamp_se_set(&init_task.uclamp_req[clamp_id], - uclamp_none(clamp_id)); + uclamp_none(clamp_id), false); } /* System defaults allow max clamp values for both indexes */ - uclamp_se_set(&uc_max, uclamp_none(UCLAMP_MAX)); + uclamp_se_set(&uc_max, uclamp_none(UCLAMP_MAX), false); for_each_clamp_id(clamp_id) uclamp_default[clamp_id] = uc_max; } @@ -1071,6 +1109,13 @@ static void __init init_uclamp(void) #else /* CONFIG_UCLAMP_TASK */ static inline void uclamp_rq_inc(struct rq *rq, struct task_struct *p) { } static inline void uclamp_rq_dec(struct rq *rq, struct task_struct *p) { } +static inline int uclamp_validate(struct task_struct *p, + const struct sched_attr *attr) +{ + return -EOPNOTSUPP; +} +static void __setscheduler_uclamp(struct task_struct *p, + const struct sched_attr *attr) { } static inline void uclamp_fork(struct task_struct *p) { } static inline void init_uclamp(void) { } #endif /* CONFIG_UCLAMP_TASK */ @@ -4412,6 +4457,13 @@ static void __setscheduler_params(struct task_struct *p, static void __setscheduler(struct rq *rq, struct task_struct *p, const struct sched_attr *attr, bool keep_boost) { + /* + * If params can't change scheduling class changes aren't allowed + * either. 
+ */ + if (attr->sched_flags & SCHED_FLAG_KEEP_PARAMS) + return; + __setscheduler_params(p, attr); /* @@ -4549,6 +4601,13 @@ recheck: return retval; } + /* Update task specific "requested" clamps */ + if (attr->sched_flags & SCHED_FLAG_UTIL_CLAMP) { + retval = uclamp_validate(p, attr); + if (retval) + return retval; + } + /* * Make sure no PI-waiters arrive (or leave) while we are * changing the priority of the task: @@ -4578,6 +4637,8 @@ recheck: goto change; if (dl_policy(policy) && dl_param_changed(p, attr)) goto change; + if (attr->sched_flags & SCHED_FLAG_UTIL_CLAMP) + goto change; p->sched_reset_on_fork = reset_on_fork; task_rq_unlock(rq, p, &rf); @@ -4658,7 +4719,9 @@ change: put_prev_task(rq, p); prev_class = p->sched_class; + __setscheduler(rq, p, attr, pi); + __setscheduler_uclamp(p, attr); if (queued) { /* @@ -4834,6 +4897,10 @@ static int sched_copy_attr(struct sched_attr __user *uattr, struct sched_attr *a if (ret) return -EFAULT; + if ((attr->sched_flags & SCHED_FLAG_UTIL_CLAMP) && + size < SCHED_ATTR_SIZE_VER1) + return -EINVAL; + /* * XXX: Do we want to be lenient like existing syscalls; or do we want * to be strict and return an error on out-of-bounds values? 
@@ -4903,10 +4970,15 @@ SYSCALL_DEFINE3(sched_setattr, pid_t, pid, struct sched_attr __user *, uattr, rcu_read_lock(); retval = -ESRCH; p = find_process_by_pid(pid); - if (p != NULL) - retval = sched_setattr(p, &attr); + if (likely(p)) + get_task_struct(p); rcu_read_unlock(); + if (likely(p)) { + retval = sched_setattr(p, &attr); + put_task_struct(p); + } + return retval; } @@ -5057,6 +5129,11 @@ SYSCALL_DEFINE4(sched_getattr, pid_t, pid, struct sched_attr __user *, uattr, else attr.sched_nice = task_nice(p); +#ifdef CONFIG_UCLAMP_TASK + attr.sched_util_min = p->uclamp_req[UCLAMP_MIN].value; + attr.sched_util_max = p->uclamp_req[UCLAMP_MAX].value; +#endif + rcu_read_unlock(); retval = sched_read_attr(uattr, &attr, size); -- cgit v1.2.3-71-gd317 From b206f281d0ee14969878469816a69db22d5838e8 Mon Sep 17 00:00:00 2001 From: David Howells Date: Wed, 26 Jun 2019 21:02:32 +0100 Subject: keys: Namespace keyring names Keyring names are held in a single global list that any process can pick from by means of keyctl_join_session_keyring (provided the keyring grants Search permission). This isn't very container friendly, however. Make the following changes: (1) Make default session, process and thread keyring names begin with a '.' instead of '_'. (2) Keyrings whose names begin with a '.' aren't added to the list. Such keyrings are system specials. (3) Replace the global list with per-user_namespace lists. A keyring adds its name to the list for the user_namespace that it is currently in. (4) When a user_namespace is deleted, it just removes itself from the keyring name list. The global keyring_name_lock is retained for accessing the name lists. This allows (4) to work. This can be tested by: # keyctl newring foo @s 995906392 # unshare -U $ keyctl show ... 995906392 --alswrv 65534 65534 \_ keyring: foo ... $ keyctl session foo Joined session keyring: 935622349 As can be seen, a new session keyring was created. 
The capability bit KEYCTL_CAPS1_NS_KEYRING_NAME is set if the kernel is employing this feature. Signed-off-by: David Howells cc: Eric W. Biederman --- include/linux/key.h | 2 + include/linux/user_namespace.h | 5 +++ include/uapi/linux/keyctl.h | 1 + kernel/user.c | 3 ++ kernel/user_namespace.c | 7 +-- security/keys/keyctl.c | 3 +- security/keys/keyring.c | 99 ++++++++++++++++++------------------------ 7 files changed, 60 insertions(+), 60 deletions(-) (limited to 'include/uapi/linux') diff --git a/include/linux/key.h b/include/linux/key.h index ff102731b3db..ae1177302d70 100644 --- a/include/linux/key.h +++ b/include/linux/key.h @@ -361,6 +361,7 @@ extern void key_set_timeout(struct key *, unsigned); extern key_ref_t lookup_user_key(key_serial_t id, unsigned long flags, key_perm_t perm); +extern void key_free_user_ns(struct user_namespace *); /* * The permissions required on a key that we're looking up. @@ -434,6 +435,7 @@ extern void key_init(void); #define key_fsuid_changed(c) do { } while(0) #define key_fsgid_changed(c) do { } while(0) #define key_init() do { } while(0) +#define key_free_user_ns(ns) do { } while(0) #endif /* CONFIG_KEYS */ #endif /* __KERNEL__ */ diff --git a/include/linux/user_namespace.h b/include/linux/user_namespace.h index d6b74b91096b..90457015fa3f 100644 --- a/include/linux/user_namespace.h +++ b/include/linux/user_namespace.h @@ -64,6 +64,11 @@ struct user_namespace { struct ns_common ns; unsigned long flags; +#ifdef CONFIG_KEYS + /* List of joinable keyrings in this namespace */ + struct list_head keyring_name_list; +#endif + /* Register of per-UID persistent keyrings for this namespace */ #ifdef CONFIG_PERSISTENT_KEYRINGS struct key *persistent_keyring_register; diff --git a/include/uapi/linux/keyctl.h b/include/uapi/linux/keyctl.h index 551b5814f53e..35b405034674 100644 --- a/include/uapi/linux/keyctl.h +++ b/include/uapi/linux/keyctl.h @@ -128,5 +128,6 @@ struct keyctl_pkey_params { #define KEYCTL_CAPS0_INVALIDATE 0x20 /* 
KEYCTL_INVALIDATE supported */ #define KEYCTL_CAPS0_RESTRICT_KEYRING 0x40 /* KEYCTL_RESTRICT_KEYRING supported */ #define KEYCTL_CAPS0_MOVE 0x80 /* KEYCTL_MOVE supported */ +#define KEYCTL_CAPS1_NS_KEYRING_NAME 0x01 /* Keyring names are per-user_namespace */ #endif /* _LINUX_KEYCTL_H */ diff --git a/kernel/user.c b/kernel/user.c index 88b834f0eebc..50979fd1b7aa 100644 --- a/kernel/user.c +++ b/kernel/user.c @@ -62,6 +62,9 @@ struct user_namespace init_user_ns = { .ns.ops = &userns_operations, #endif .flags = USERNS_INIT_FLAGS, +#ifdef CONFIG_KEYS + .keyring_name_list = LIST_HEAD_INIT(init_user_ns.keyring_name_list), +#endif #ifdef CONFIG_PERSISTENT_KEYRINGS .persistent_keyring_register_sem = __RWSEM_INITIALIZER(init_user_ns.persistent_keyring_register_sem), diff --git a/kernel/user_namespace.c b/kernel/user_namespace.c index 923414a246e9..bda6e890ad88 100644 --- a/kernel/user_namespace.c +++ b/kernel/user_namespace.c @@ -133,6 +133,9 @@ int create_user_ns(struct cred *new) ns->flags = parent_ns->flags; mutex_unlock(&userns_state_mutex); +#ifdef CONFIG_KEYS + INIT_LIST_HEAD(&ns->keyring_name_list); +#endif #ifdef CONFIG_PERSISTENT_KEYRINGS init_rwsem(&ns->persistent_keyring_register_sem); #endif @@ -196,9 +199,7 @@ static void free_user_ns(struct work_struct *work) kfree(ns->projid_map.reverse); } retire_userns_sysctls(ns); -#ifdef CONFIG_PERSISTENT_KEYRINGS - key_put(ns->persistent_keyring_register); -#endif + key_free_user_ns(ns); ns_free_inum(&ns->ns); kmem_cache_free(user_ns_cachep, ns); dec_user_namespaces(ucounts); diff --git a/security/keys/keyctl.c b/security/keys/keyctl.c index 169409b611b0..8a813220f269 100644 --- a/security/keys/keyctl.c +++ b/security/keys/keyctl.c @@ -30,7 +30,7 @@ #define KEY_MAX_DESC_SIZE 4096 -static const unsigned char keyrings_capabilities[1] = { +static const unsigned char keyrings_capabilities[2] = { [0] = (KEYCTL_CAPS0_CAPABILITIES | (IS_ENABLED(CONFIG_PERSISTENT_KEYRINGS) ? 
KEYCTL_CAPS0_PERSISTENT_KEYRINGS : 0) | (IS_ENABLED(CONFIG_KEY_DH_OPERATIONS) ? KEYCTL_CAPS0_DIFFIE_HELLMAN : 0) | @@ -40,6 +40,7 @@ static const unsigned char keyrings_capabilities[1] = { KEYCTL_CAPS0_RESTRICT_KEYRING | KEYCTL_CAPS0_MOVE ), + [1] = (KEYCTL_CAPS1_NS_KEYRING_NAME), }; static int key_get_type_from_user(char *type, diff --git a/security/keys/keyring.c b/security/keys/keyring.c index 20891cd198f0..fe851292509e 100644 --- a/security/keys/keyring.c +++ b/security/keys/keyring.c @@ -16,6 +16,7 @@ #include #include #include +#include #include #include #include @@ -28,11 +29,6 @@ */ #define KEYRING_SEARCH_MAX_DEPTH 6 -/* - * We keep all named keyrings in a hash to speed looking them up. - */ -#define KEYRING_NAME_HASH_SIZE (1 << 5) - /* * We mark pointers we pass to the associative array with bit 1 set if * they're keyrings and clear otherwise. @@ -55,17 +51,20 @@ static inline void *keyring_key_to_ptr(struct key *key) return key; } -static struct list_head keyring_name_hash[KEYRING_NAME_HASH_SIZE]; static DEFINE_RWLOCK(keyring_name_lock); -static inline unsigned keyring_hash(const char *desc) +/* + * Clean up the bits of user_namespace that belong to us. + */ +void key_free_user_ns(struct user_namespace *ns) { - unsigned bucket = 0; - - for (; *desc; desc++) - bucket += (unsigned char)*desc; + write_lock(&keyring_name_lock); + list_del_init(&ns->keyring_name_list); + write_unlock(&keyring_name_lock); - return bucket & (KEYRING_NAME_HASH_SIZE - 1); +#ifdef CONFIG_PERSISTENT_KEYRINGS + key_put(ns->persistent_keyring_register); +#endif } /* @@ -104,23 +103,17 @@ static DEFINE_MUTEX(keyring_serialise_link_lock); /* * Publish the name of a keyring so that it can be found by name (if it has - * one). + * one and it doesn't begin with a dot). 
*/ static void keyring_publish_name(struct key *keyring) { - int bucket; - - if (keyring->description) { - bucket = keyring_hash(keyring->description); + struct user_namespace *ns = current_user_ns(); + if (keyring->description && + keyring->description[0] && + keyring->description[0] != '.') { write_lock(&keyring_name_lock); - - if (!keyring_name_hash[bucket].next) - INIT_LIST_HEAD(&keyring_name_hash[bucket]); - - list_add_tail(&keyring->name_link, - &keyring_name_hash[bucket]); - + list_add_tail(&keyring->name_link, &ns->keyring_name_list); write_unlock(&keyring_name_lock); } } @@ -1097,50 +1090,44 @@ found: */ struct key *find_keyring_by_name(const char *name, bool uid_keyring) { + struct user_namespace *ns = current_user_ns(); struct key *keyring; - int bucket; if (!name) return ERR_PTR(-EINVAL); - bucket = keyring_hash(name); - read_lock(&keyring_name_lock); - if (keyring_name_hash[bucket].next) { - /* search this hash bucket for a keyring with a matching name - * that's readable and that hasn't been revoked */ - list_for_each_entry(keyring, - &keyring_name_hash[bucket], - name_link - ) { - if (!kuid_has_mapping(current_user_ns(), keyring->user->uid)) - continue; - - if (test_bit(KEY_FLAG_REVOKED, &keyring->flags)) - continue; + /* Search this hash bucket for a keyring with a matching name that + * grants Search permission and that hasn't been revoked + */ + list_for_each_entry(keyring, &ns->keyring_name_list, name_link) { + if (!kuid_has_mapping(ns, keyring->user->uid)) + continue; - if (strcmp(keyring->description, name) != 0) - continue; + if (test_bit(KEY_FLAG_REVOKED, &keyring->flags)) + continue; - if (uid_keyring) { - if (!test_bit(KEY_FLAG_UID_KEYRING, - &keyring->flags)) - continue; - } else { - if (key_permission(make_key_ref(keyring, 0), - KEY_NEED_SEARCH) < 0) - continue; - } + if (strcmp(keyring->description, name) != 0) + continue; - /* we've got a match but we might end up racing with - * key_cleanup() if the keyring is currently 'dead' - * (ie. 
it has a zero usage count) */ - if (!refcount_inc_not_zero(&keyring->usage)) + if (uid_keyring) { + if (!test_bit(KEY_FLAG_UID_KEYRING, + &keyring->flags)) + continue; + } else { + if (key_permission(make_key_ref(keyring, 0), + KEY_NEED_SEARCH) < 0) continue; - keyring->last_used_at = ktime_get_real_seconds(); - goto out; } + + /* we've got a match but we might end up racing with + * key_cleanup() if the keyring is currently 'dead' + * (ie. it has a zero usage count) */ + if (!refcount_inc_not_zero(&keyring->usage)) + continue; + keyring->last_used_at = ktime_get_real_seconds(); + goto out; } keyring = ERR_PTR(-ENOKEY); -- cgit v1.2.3-71-gd317 From 3b6e4de05e9ee2e2f94e4a3fe14d945e2418d9a8 Mon Sep 17 00:00:00 2001 From: David Howells Date: Wed, 26 Jun 2019 21:02:32 +0100 Subject: keys: Include target namespace in match criteria Currently a key has a standard matching criteria of { type, description } and this is used to only allow keys with unique criteria in a keyring. This means, however, that you cannot have keys with the same type and description but a different target namespace in the same keyring. This is a potential problem for a containerised environment where, say, a container is made up of some parts of its mount space involving netfs superblocks from two different network namespaces. This is also a problem for shared system management keyrings such as the DNS records keyring or the NFS idmapper keyring that might contain keys from different network namespaces. Fix this by including a namespace component in a key's matching criteria. Keyring types are marked to indicate which, if any, namespace is relevant to keys of that type, and that namespace is set when the key is created from the current task's namespace set. The capability bit KEYCTL_CAPS1_NS_KEY_TAG is set if the kernel is employing this feature. 
Signed-off-by: David Howells --- include/linux/key.h | 10 ++++++++++ include/uapi/linux/keyctl.h | 1 + security/keys/gc.c | 2 +- security/keys/key.c | 1 + security/keys/keyctl.c | 3 ++- security/keys/keyring.c | 36 ++++++++++++++++++++++++++++++++++-- security/keys/persistent.c | 1 + 7 files changed, 50 insertions(+), 4 deletions(-) (limited to 'include/uapi/linux') diff --git a/include/linux/key.h b/include/linux/key.h index ae1177302d70..abc68555bac3 100644 --- a/include/linux/key.h +++ b/include/linux/key.h @@ -82,9 +82,16 @@ struct cred; struct key_type; struct key_owner; +struct key_tag; struct keyring_list; struct keyring_name; +struct key_tag { + struct rcu_head rcu; + refcount_t usage; + bool removed; /* T when subject removed */ +}; + struct keyring_index_key { /* [!] If this structure is altered, the union in struct key must change too! */ unsigned long hash; /* Hash value */ @@ -101,6 +108,7 @@ struct keyring_index_key { unsigned long x; }; struct key_type *type; + struct key_tag *domain_tag; /* Domain of operation */ const char *description; }; @@ -218,6 +226,7 @@ struct key { unsigned long hash; unsigned long len_desc; struct key_type *type; /* type of key */ + struct key_tag *domain_tag; /* Domain of operation */ char *description; }; }; @@ -268,6 +277,7 @@ extern struct key *key_alloc(struct key_type *type, extern void key_revoke(struct key *key); extern void key_invalidate(struct key *key); extern void key_put(struct key *key); +extern bool key_put_tag(struct key_tag *tag); static inline struct key *__key_get(struct key *key) { diff --git a/include/uapi/linux/keyctl.h b/include/uapi/linux/keyctl.h index 35b405034674..ed3d5893830d 100644 --- a/include/uapi/linux/keyctl.h +++ b/include/uapi/linux/keyctl.h @@ -129,5 +129,6 @@ struct keyctl_pkey_params { #define KEYCTL_CAPS0_RESTRICT_KEYRING 0x40 /* KEYCTL_RESTRICT_KEYRING supported */ #define KEYCTL_CAPS0_MOVE 0x80 /* KEYCTL_MOVE supported */ #define KEYCTL_CAPS1_NS_KEYRING_NAME 0x01 /* Keyring names 
are per-user_namespace */ +#define KEYCTL_CAPS1_NS_KEY_TAG 0x02 /* Key indexing can include a namespace tag */ #endif /* _LINUX_KEYCTL_H */ diff --git a/security/keys/gc.c b/security/keys/gc.c index 634e96b380e8..83d279fb7793 100644 --- a/security/keys/gc.c +++ b/security/keys/gc.c @@ -154,7 +154,7 @@ static noinline void key_gc_unused_keys(struct list_head *keys) atomic_dec(&key->user->nikeys); key_user_put(key->user); - + key_put_tag(key->domain_tag); kfree(key->description); memzero_explicit(key, sizeof(*key)); diff --git a/security/keys/key.c b/security/keys/key.c index 9d52f2472a09..85fdc2ea6c14 100644 --- a/security/keys/key.c +++ b/security/keys/key.c @@ -317,6 +317,7 @@ struct key *key_alloc(struct key_type *type, const char *desc, goto security_error; /* publish the key by giving it a serial number */ + refcount_inc(&key->domain_tag->usage); atomic_inc(&user->nkeys); key_alloc_serial(key); diff --git a/security/keys/keyctl.c b/security/keys/keyctl.c index 8a813220f269..4bb5781d3ddf 100644 --- a/security/keys/keyctl.c +++ b/security/keys/keyctl.c @@ -40,7 +40,8 @@ static const unsigned char keyrings_capabilities[2] = { KEYCTL_CAPS0_RESTRICT_KEYRING | KEYCTL_CAPS0_MOVE ), - [1] = (KEYCTL_CAPS1_NS_KEYRING_NAME), + [1] = (KEYCTL_CAPS1_NS_KEYRING_NAME | + KEYCTL_CAPS1_NS_KEY_TAG), }; static int key_get_type_from_user(char *type, diff --git a/security/keys/keyring.c b/security/keys/keyring.c index 3663e5168583..0da8fa282d56 100644 --- a/security/keys/keyring.c +++ b/security/keys/keyring.c @@ -175,6 +175,9 @@ static void hash_key_type_and_desc(struct keyring_index_key *index_key) type = (unsigned long)index_key->type; acc = mult_64x32_and_fold(type, desc_len + 13); acc = mult_64x32_and_fold(acc, 9207); + piece = (unsigned long)index_key->domain_tag; + acc = mult_64x32_and_fold(acc, piece); + acc = mult_64x32_and_fold(acc, 9207); for (;;) { n = desc_len; @@ -208,16 +211,36 @@ static void hash_key_type_and_desc(struct keyring_index_key *index_key) /* * Finalise an 
index key to include a part of the description actually in the - * index key and to add in the hash too. + * index key, to set the domain tag and to calculate the hash. */ void key_set_index_key(struct keyring_index_key *index_key) { + static struct key_tag default_domain_tag = { .usage = REFCOUNT_INIT(1), }; size_t n = min_t(size_t, index_key->desc_len, sizeof(index_key->desc)); + memcpy(index_key->desc, index_key->description, n); + index_key->domain_tag = &default_domain_tag; hash_key_type_and_desc(index_key); } +/** + * key_put_tag - Release a ref on a tag. + * @tag: The tag to release. + * + * This releases a reference the given tag and returns true if that ref was the + * last one. + */ +bool key_put_tag(struct key_tag *tag) +{ + if (refcount_dec_and_test(&tag->usage)) { + kfree_rcu(tag, rcu); + return true; + } + + return false; +} + /* * Build the next index key chunk. * @@ -238,8 +261,10 @@ static unsigned long keyring_get_key_chunk(const void *data, int level) return index_key->x; case 2: return (unsigned long)index_key->type; + case 3: + return (unsigned long)index_key->domain_tag; default: - level -= 3; + level -= 4; if (desc_len <= sizeof(index_key->desc)) return 0; @@ -268,6 +293,7 @@ static bool keyring_compare_object(const void *object, const void *data) const struct key *key = keyring_ptr_to_key(object); return key->index_key.type == index_key->type && + key->index_key.domain_tag == index_key->domain_tag && key->index_key.desc_len == index_key->desc_len && memcmp(key->index_key.description, index_key->description, index_key->desc_len) == 0; @@ -309,6 +335,12 @@ static int keyring_diff_objects(const void *object, const void *data) goto differ; level += sizeof(unsigned long); + seg_a = (unsigned long)a->domain_tag; + seg_b = (unsigned long)b->domain_tag; + if ((seg_a ^ seg_b) != 0) + goto differ; + level += sizeof(unsigned long); + i = sizeof(a->desc); if (a->desc_len <= i) goto same; diff --git a/security/keys/persistent.c 
b/security/keys/persistent.c index 90303fe4a394..9944d855a28d 100644 --- a/security/keys/persistent.c +++ b/security/keys/persistent.c @@ -84,6 +84,7 @@ static long key_get_persistent(struct user_namespace *ns, kuid_t uid, long ret; /* Look in the register if it exists */ + memset(&index_key, 0, sizeof(index_key)); index_key.type = &key_type_keyring; index_key.description = buf; index_key.desc_len = sprintf(buf, "_persistent.%u", from_kuid(ns, uid)); -- cgit v1.2.3-71-gd317 From 3ae762a09cd72a08ef620c80fbb263693c3fb204 Mon Sep 17 00:00:00 2001 From: Russell King Date: Tue, 4 Jun 2019 14:49:25 +0100 Subject: fs/adfs: correct disc record structure Fill in some padding in the disc record structure, and add GCC packed and aligned attributes to ensure that it is correctly laid out. Signed-off-by: Russell King Signed-off-by: Al Viro --- include/uapi/linux/adfs_fs.h | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'include/uapi/linux') diff --git a/include/uapi/linux/adfs_fs.h b/include/uapi/linux/adfs_fs.h index 151d93e27ed4..f1a7d67a7323 100644 --- a/include/uapi/linux/adfs_fs.h +++ b/include/uapi/linux/adfs_fs.h @@ -29,17 +29,17 @@ struct adfs_discrecord { __u8 log2sharesize:4; __u8 unused40:4; __u8 big_flag:1; - __u8 unused41:1; + __u8 unused41:7; __u8 nzones_high; + __u8 reserved43; __le32 format_version; __le32 root_size; __u8 unused52[60 - 52]; -}; +} __attribute__((packed, aligned(4))); #define ADFS_DISCRECORD (0xc00) #define ADFS_DR_OFFSET (0x1c0) #define ADFS_DR_SIZE 60 #define ADFS_DR_SIZE_BITS (ADFS_DR_SIZE << 3) - #endif /* _UAPI_ADFS_FS_H */ -- cgit v1.2.3-71-gd317 From 61caf3d109f5411a7f5b433f1eb73ead7e0789fa Mon Sep 17 00:00:00 2001 From: Linus Lüssing Date: Tue, 11 Jun 2019 22:58:40 +0200 Subject: batman-adv: mcast: detect, distribute and maintain multicast router presence MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit To be able to apply our group aware multicast optimizations to packets 
with a scope greater than link-local we need to not only keep track of multicast listeners but also multicast routers. With this patch a node detects the presence of multicast routers on its segment by checking if /proc/sys/net/ipv{4,6}/conf/<if>/mc_forwarding is set for one thing. This option is enabled by multicast routing daemons and needed for the kernel's multicast routing tables to receive and route packets. For another thing, if a bridge is configured on top of bat0 then the presence of an IPv6 multicast router behind this bridge is currently detected by checking for an IPv6 multicast "All Routers Address" (ff02::2). This should later be replaced by querying the bridge, which performs proper, RFC4286 compliant Multicast Router Discovery (our simplified approach includes more hosts than necessary, most notably not just multicast routers but also unicast ones and is not applicable for IPv4). If no multicast router is detected then this is signalled via the new BATADV_MCAST_WANT_NO_RTR4 and BATADV_MCAST_WANT_NO_RTR6 multicast tvlv flags.
Signed-off-by: Linus Lüssing Signed-off-by: Sven Eckelmann Signed-off-by: Simon Wunderlich --- include/uapi/linux/batadv_packet.h | 8 + net/batman-adv/multicast.c | 412 ++++++++++++++++++++++++++++++++----- net/batman-adv/originator.c | 4 +- net/batman-adv/types.h | 29 +++ 4 files changed, 399 insertions(+), 54 deletions(-) (limited to 'include/uapi/linux') diff --git a/include/uapi/linux/batadv_packet.h b/include/uapi/linux/batadv_packet.h index 4ebc2135e950..2a15f01c2243 100644 --- a/include/uapi/linux/batadv_packet.h +++ b/include/uapi/linux/batadv_packet.h @@ -107,12 +107,20 @@ enum batadv_icmp_packettype { * @BATADV_MCAST_WANT_ALL_UNSNOOPABLES: we want all packets destined for * 224.0.0.0/24 or ff02::1 * @BATADV_MCAST_WANT_ALL_IPV4: we want all IPv4 multicast packets + * (both link-local and routable ones) * @BATADV_MCAST_WANT_ALL_IPV6: we want all IPv6 multicast packets + * (both link-local and routable ones) + * @BATADV_MCAST_WANT_NO_RTR4: we have no IPv4 multicast router and therefore + * only need routable IPv4 multicast packets we signed up for explicitly + * @BATADV_MCAST_WANT_NO_RTR6: we have no IPv6 multicast router and therefore + * only need routable IPv6 multicast packets we signed up for explicitly */ enum batadv_mcast_flags { BATADV_MCAST_WANT_ALL_UNSNOOPABLES = 1UL << 0, BATADV_MCAST_WANT_ALL_IPV4 = 1UL << 1, BATADV_MCAST_WANT_ALL_IPV6 = 1UL << 2, + BATADV_MCAST_WANT_NO_RTR4 = 1UL << 3, + BATADV_MCAST_WANT_NO_RTR6 = 1UL << 4, }; /* tt data subtypes */ diff --git a/net/batman-adv/multicast.c b/net/batman-adv/multicast.c index d4e7474022e3..80d5f3c892cb 100644 --- a/net/batman-adv/multicast.c +++ b/net/batman-adv/multicast.c @@ -73,26 +73,200 @@ static void batadv_mcast_start_timer(struct batadv_priv *bat_priv) } /** - * batadv_mcast_has_bridge() - check whether the soft-iface is bridged - * @bat_priv: the bat priv with all the soft interface information + * batadv_mcast_get_bridge() - get the bridge on top of the softif if it exists + * 
@soft_iface: netdev struct of the mesh interface * - * Checks whether there is a bridge on top of our soft interface. + * If the given soft interface has a bridge on top then the refcount + * of the according net device is increased. * - * Return: true if there is a bridge, false otherwise. + * Return: NULL if no such bridge exists. Otherwise the net device of the + * bridge. */ -static bool batadv_mcast_has_bridge(struct batadv_priv *bat_priv) +static struct net_device *batadv_mcast_get_bridge(struct net_device *soft_iface) { - struct net_device *upper = bat_priv->soft_iface; + struct net_device *upper = soft_iface; rcu_read_lock(); do { upper = netdev_master_upper_dev_get_rcu(upper); } while (upper && !(upper->priv_flags & IFF_EBRIDGE)); + + if (upper) + dev_hold(upper); rcu_read_unlock(); return upper; } +/** + * batadv_mcast_mla_rtr_flags_softif_get_ipv4() - get mcast router flags from + * node for IPv4 + * @dev: the interface to check + * + * Checks the presence of an IPv4 multicast router on this node. + * + * Caller needs to hold rcu read lock. + * + * Return: BATADV_NO_FLAGS if present, BATADV_MCAST_WANT_NO_RTR4 otherwise. + */ +static u8 batadv_mcast_mla_rtr_flags_softif_get_ipv4(struct net_device *dev) +{ + struct in_device *in_dev = __in_dev_get_rcu(dev); + + if (in_dev && IN_DEV_MFORWARD(in_dev)) + return BATADV_NO_FLAGS; + else + return BATADV_MCAST_WANT_NO_RTR4; +} + +/** + * batadv_mcast_mla_rtr_flags_softif_get_ipv6() - get mcast router flags from + * node for IPv6 + * @dev: the interface to check + * + * Checks the presence of an IPv6 multicast router on this node. + * + * Caller needs to hold rcu read lock. + * + * Return: BATADV_NO_FLAGS if present, BATADV_MCAST_WANT_NO_RTR6 otherwise. 
+ */ +#if IS_ENABLED(CONFIG_IPV6_MROUTE) +static u8 batadv_mcast_mla_rtr_flags_softif_get_ipv6(struct net_device *dev) +{ + struct inet6_dev *in6_dev = __in6_dev_get(dev); + + if (in6_dev && in6_dev->cnf.mc_forwarding) + return BATADV_NO_FLAGS; + else + return BATADV_MCAST_WANT_NO_RTR6; +} +#else +static inline u8 +batadv_mcast_mla_rtr_flags_softif_get_ipv6(struct net_device *dev) +{ + return BATADV_MCAST_WANT_NO_RTR6; +} +#endif + +/** + * batadv_mcast_mla_rtr_flags_softif_get() - get mcast router flags from node + * @bat_priv: the bat priv with all the soft interface information + * @bridge: bridge interface on top of the soft_iface if present, + * otherwise pass NULL + * + * Checks the presence of IPv4 and IPv6 multicast routers on this + * node. + * + * Return: + * BATADV_NO_FLAGS: Both an IPv4 and IPv6 multicast router is present + * BATADV_MCAST_WANT_NO_RTR4: No IPv4 multicast router is present + * BATADV_MCAST_WANT_NO_RTR6: No IPv6 multicast router is present + * The former two OR'd: no multicast router is present + */ +static u8 batadv_mcast_mla_rtr_flags_softif_get(struct batadv_priv *bat_priv, + struct net_device *bridge) +{ + struct net_device *dev = bridge ? bridge : bat_priv->soft_iface; + u8 flags = BATADV_NO_FLAGS; + + rcu_read_lock(); + + flags |= batadv_mcast_mla_rtr_flags_softif_get_ipv4(dev); + flags |= batadv_mcast_mla_rtr_flags_softif_get_ipv6(dev); + + rcu_read_unlock(); + + return flags; +} + +/** + * batadv_mcast_mla_rtr_flags_bridge_get() - get mcast router flags from bridge + * @bat_priv: the bat priv with all the soft interface information + * @bridge: bridge interface on top of the soft_iface if present, + * otherwise pass NULL + * + * Checks the presence of IPv4 and IPv6 multicast routers behind a bridge. 
+ * + * Return: + * BATADV_NO_FLAGS: Both an IPv4 and IPv6 multicast router is present + * BATADV_MCAST_WANT_NO_RTR4: No IPv4 multicast router is present + * BATADV_MCAST_WANT_NO_RTR6: No IPv6 multicast router is present + * The former two OR'd: no multicast router is present + */ +#if IS_ENABLED(CONFIG_IPV6) +static u8 batadv_mcast_mla_rtr_flags_bridge_get(struct batadv_priv *bat_priv, + struct net_device *bridge) +{ + struct list_head bridge_mcast_list = LIST_HEAD_INIT(bridge_mcast_list); + struct net_device *dev = bat_priv->soft_iface; + struct br_ip_list *br_ip_entry, *tmp; + u8 flags = BATADV_MCAST_WANT_NO_RTR6; + int ret; + + if (!bridge) + return BATADV_MCAST_WANT_NO_RTR4 | BATADV_MCAST_WANT_NO_RTR6; + + /* TODO: ask the bridge if a multicast router is present (the bridge + * is capable of performing proper RFC4286 multicast multicast router + * discovery) instead of searching for a ff02::2 listener here + */ + ret = br_multicast_list_adjacent(dev, &bridge_mcast_list); + if (ret < 0) + return BATADV_NO_FLAGS; + + list_for_each_entry_safe(br_ip_entry, tmp, &bridge_mcast_list, list) { + /* the bridge snooping does not maintain IPv4 link-local + * addresses - therefore we won't find any IPv4 multicast router + * address here, only IPv6 ones + */ + if (br_ip_entry->addr.proto == htons(ETH_P_IPV6) && + ipv6_addr_is_ll_all_routers(&br_ip_entry->addr.u.ip6)) + flags &= ~BATADV_MCAST_WANT_NO_RTR6; + + list_del(&br_ip_entry->list); + kfree(br_ip_entry); + } + + return flags; +} +#else +static inline u8 +batadv_mcast_mla_rtr_flags_bridge_get(struct batadv_priv *bat_priv, + struct net_device *bridge) +{ + if (bridge) + return BATADV_NO_FLAGS; + else + return BATADV_MCAST_WANT_NO_RTR4 | BATADV_MCAST_WANT_NO_RTR6; +} +#endif + +/** + * batadv_mcast_mla_rtr_flags_get() - get multicast router flags + * @bat_priv: the bat priv with all the soft interface information + * @bridge: bridge interface on top of the soft_iface if present, + * otherwise pass NULL + * + * Checks the 
presence of IPv4 and IPv6 multicast routers on this + * node or behind its bridge. + * + * Return: + * BATADV_NO_FLAGS: Both an IPv4 and IPv6 multicast router is present + * BATADV_MCAST_WANT_NO_RTR4: No IPv4 multicast router is present + * BATADV_MCAST_WANT_NO_RTR6: No IPv6 multicast router is present + * The former two OR'd: no multicast router is present + */ +static u8 batadv_mcast_mla_rtr_flags_get(struct batadv_priv *bat_priv, + struct net_device *bridge) +{ + u8 flags = BATADV_MCAST_WANT_NO_RTR4 | BATADV_MCAST_WANT_NO_RTR6; + + flags &= batadv_mcast_mla_rtr_flags_softif_get(bat_priv, bridge); + flags &= batadv_mcast_mla_rtr_flags_bridge_get(bat_priv, bridge); + + return flags; +} + /** * batadv_mcast_mla_flags_get() - get the new multicast flags * @bat_priv: the bat priv with all the soft interface information @@ -106,13 +280,20 @@ batadv_mcast_mla_flags_get(struct batadv_priv *bat_priv) struct net_device *dev = bat_priv->soft_iface; struct batadv_mcast_querier_state *qr4, *qr6; struct batadv_mcast_mla_flags mla_flags; + struct net_device *bridge; + + bridge = batadv_mcast_get_bridge(dev); memset(&mla_flags, 0, sizeof(mla_flags)); mla_flags.enabled = 1; + mla_flags.tvlv_flags |= batadv_mcast_mla_rtr_flags_get(bat_priv, + bridge); - if (!batadv_mcast_has_bridge(bat_priv)) + if (!bridge) return mla_flags; + dev_put(bridge); + mla_flags.bridged = 1; qr4 = &mla_flags.querier_ipv4; qr6 = &mla_flags.querier_ipv6; @@ -137,41 +318,19 @@ batadv_mcast_mla_flags_get(struct batadv_priv *bat_priv) * In both cases, we will signalize other batman nodes that * we need all multicast traffic of the according protocol. 
*/ - if (!qr4->exists || qr4->shadowing) + if (!qr4->exists || qr4->shadowing) { mla_flags.tvlv_flags |= BATADV_MCAST_WANT_ALL_IPV4; + mla_flags.tvlv_flags &= ~BATADV_MCAST_WANT_NO_RTR4; + } - if (!qr6->exists || qr6->shadowing) + if (!qr6->exists || qr6->shadowing) { mla_flags.tvlv_flags |= BATADV_MCAST_WANT_ALL_IPV6; + mla_flags.tvlv_flags &= ~BATADV_MCAST_WANT_NO_RTR6; + } return mla_flags; } -/** - * batadv_mcast_get_bridge() - get the bridge on top of the softif if it exists - * @soft_iface: netdev struct of the mesh interface - * - * If the given soft interface has a bridge on top then the refcount - * of the according net device is increased. - * - * Return: NULL if no such bridge exists. Otherwise the net device of the - * bridge. - */ -static struct net_device *batadv_mcast_get_bridge(struct net_device *soft_iface) -{ - struct net_device *upper = soft_iface; - - rcu_read_lock(); - do { - upper = netdev_master_upper_dev_get_rcu(upper); - } while (upper && !(upper->priv_flags & IFF_EBRIDGE)); - - if (upper) - dev_hold(upper); - rcu_read_unlock(); - - return upper; -} - /** * batadv_mcast_mla_is_duplicate() - check whether an address is in a list * @mcast_addr: the multicast address to check @@ -234,6 +393,10 @@ batadv_mcast_mla_softif_get_ipv4(struct net_device *dev, ipv4_is_local_multicast(pmc->multiaddr)) continue; + if (!(flags->tvlv_flags & BATADV_MCAST_WANT_NO_RTR4) && + !ipv4_is_local_multicast(pmc->multiaddr)) + continue; + ip_eth_mc_map(pmc->multiaddr, mcast_addr); if (batadv_mcast_mla_is_duplicate(mcast_addr, mcast_list)) @@ -301,6 +464,11 @@ batadv_mcast_mla_softif_get_ipv6(struct net_device *dev, ipv6_addr_is_ll_all_nodes(&pmc6->mca_addr)) continue; + if (!(flags->tvlv_flags & BATADV_MCAST_WANT_NO_RTR6) && + IPV6_ADDR_MC_SCOPE(&pmc6->mca_addr) > + IPV6_ADDR_SCOPE_LINKLOCAL) + continue; + ipv6_eth_mc_map(&pmc6->mca_addr, mcast_addr); if (batadv_mcast_mla_is_duplicate(mcast_addr, mcast_list)) @@ -442,6 +610,10 @@ static int 
batadv_mcast_mla_bridge_get(struct net_device *dev, if (tvlv_flags & BATADV_MCAST_WANT_ALL_UNSNOOPABLES && ipv4_is_local_multicast(br_ip_entry->addr.u.ip4)) continue; + + if (!(tvlv_flags & BATADV_MCAST_WANT_NO_RTR4) && + !ipv4_is_local_multicast(br_ip_entry->addr.u.ip4)) + continue; } #if IS_ENABLED(CONFIG_IPV6) @@ -452,6 +624,11 @@ static int batadv_mcast_mla_bridge_get(struct net_device *dev, if (tvlv_flags & BATADV_MCAST_WANT_ALL_UNSNOOPABLES && ipv6_addr_is_ll_all_nodes(&br_ip_entry->addr.u.ip6)) continue; + + if (!(tvlv_flags & BATADV_MCAST_WANT_NO_RTR6) && + IPV6_ADDR_MC_SCOPE(&br_ip_entry->addr.u.ip6) > + IPV6_ADDR_SCOPE_LINKLOCAL) + continue; } #endif @@ -662,19 +839,23 @@ static void batadv_mcast_flags_log(struct batadv_priv *bat_priv, u8 flags) { bool old_enabled = bat_priv->mcast.mla_flags.enabled; u8 old_flags = bat_priv->mcast.mla_flags.tvlv_flags; - char str_old_flags[] = "[...]"; + char str_old_flags[] = "[.... . ]"; - sprintf(str_old_flags, "[%c%c%c]", + sprintf(str_old_flags, "[%c%c%c%s%s]", (old_flags & BATADV_MCAST_WANT_ALL_UNSNOOPABLES) ? 'U' : '.', (old_flags & BATADV_MCAST_WANT_ALL_IPV4) ? '4' : '.', - (old_flags & BATADV_MCAST_WANT_ALL_IPV6) ? '6' : '.'); + (old_flags & BATADV_MCAST_WANT_ALL_IPV6) ? '6' : '.', + !(old_flags & BATADV_MCAST_WANT_NO_RTR4) ? "R4" : ". ", + !(old_flags & BATADV_MCAST_WANT_NO_RTR6) ? "R6" : ". "); batadv_dbg(BATADV_DBG_MCAST, bat_priv, - "Changing multicast flags from '%s' to '[%c%c%c]'\n", + "Changing multicast flags from '%s' to '[%c%c%c%s%s]'\n", old_enabled ? str_old_flags : "", (flags & BATADV_MCAST_WANT_ALL_UNSNOOPABLES) ? 'U' : '.', (flags & BATADV_MCAST_WANT_ALL_IPV4) ? '4' : '.', - (flags & BATADV_MCAST_WANT_ALL_IPV6) ? '6' : '.'); + (flags & BATADV_MCAST_WANT_ALL_IPV6) ? '6' : '.', + !(flags & BATADV_MCAST_WANT_NO_RTR4) ? "R4" : ". ", + !(flags & BATADV_MCAST_WANT_NO_RTR6) ? "R6" : ". 
"); } /** @@ -1466,6 +1647,127 @@ static void batadv_mcast_want_ipv6_update(struct batadv_priv *bat_priv, } } +/** + * batadv_mcast_want_rtr4_update() - update want-all-rtr4 counter and list + * @bat_priv: the bat priv with all the soft interface information + * @orig: the orig_node which multicast state might have changed of + * @mcast_flags: flags indicating the new multicast state + * + * If the BATADV_MCAST_WANT_NO_RTR4 flag of this originator, orig, has + * toggled then this method updates counter and list accordingly. + * + * Caller needs to hold orig->mcast_handler_lock. + */ +static void batadv_mcast_want_rtr4_update(struct batadv_priv *bat_priv, + struct batadv_orig_node *orig, + u8 mcast_flags) +{ + struct hlist_node *node = &orig->mcast_want_all_rtr4_node; + struct hlist_head *head = &bat_priv->mcast.want_all_rtr4_list; + + lockdep_assert_held(&orig->mcast_handler_lock); + + /* switched from flag set to unset */ + if (!(mcast_flags & BATADV_MCAST_WANT_NO_RTR4) && + orig->mcast_flags & BATADV_MCAST_WANT_NO_RTR4) { + atomic_inc(&bat_priv->mcast.num_want_all_rtr4); + + spin_lock_bh(&bat_priv->mcast.want_lists_lock); + /* flag checks above + mcast_handler_lock prevents this */ + WARN_ON(!hlist_unhashed(node)); + + hlist_add_head_rcu(node, head); + spin_unlock_bh(&bat_priv->mcast.want_lists_lock); + /* switched from flag unset to set */ + } else if (mcast_flags & BATADV_MCAST_WANT_NO_RTR4 && + !(orig->mcast_flags & BATADV_MCAST_WANT_NO_RTR4)) { + atomic_dec(&bat_priv->mcast.num_want_all_rtr4); + + spin_lock_bh(&bat_priv->mcast.want_lists_lock); + /* flag checks above + mcast_handler_lock prevents this */ + WARN_ON(hlist_unhashed(node)); + + hlist_del_init_rcu(node); + spin_unlock_bh(&bat_priv->mcast.want_lists_lock); + } +} + +/** + * batadv_mcast_want_rtr6_update() - update want-all-rtr6 counter and list + * @bat_priv: the bat priv with all the soft interface information + * @orig: the orig_node which multicast state might have changed of + * @mcast_flags: 
flags indicating the new multicast state + * + * If the BATADV_MCAST_WANT_NO_RTR6 flag of this originator, orig, has + * toggled then this method updates counter and list accordingly. + * + * Caller needs to hold orig->mcast_handler_lock. + */ +static void batadv_mcast_want_rtr6_update(struct batadv_priv *bat_priv, + struct batadv_orig_node *orig, + u8 mcast_flags) +{ + struct hlist_node *node = &orig->mcast_want_all_rtr6_node; + struct hlist_head *head = &bat_priv->mcast.want_all_rtr6_list; + + lockdep_assert_held(&orig->mcast_handler_lock); + + /* switched from flag set to unset */ + if (!(mcast_flags & BATADV_MCAST_WANT_NO_RTR6) && + orig->mcast_flags & BATADV_MCAST_WANT_NO_RTR6) { + atomic_inc(&bat_priv->mcast.num_want_all_rtr6); + + spin_lock_bh(&bat_priv->mcast.want_lists_lock); + /* flag checks above + mcast_handler_lock prevents this */ + WARN_ON(!hlist_unhashed(node)); + + hlist_add_head_rcu(node, head); + spin_unlock_bh(&bat_priv->mcast.want_lists_lock); + /* switched from flag unset to set */ + } else if (mcast_flags & BATADV_MCAST_WANT_NO_RTR6 && + !(orig->mcast_flags & BATADV_MCAST_WANT_NO_RTR6)) { + atomic_dec(&bat_priv->mcast.num_want_all_rtr6); + + spin_lock_bh(&bat_priv->mcast.want_lists_lock); + /* flag checks above + mcast_handler_lock prevents this */ + WARN_ON(hlist_unhashed(node)); + + hlist_del_init_rcu(node); + spin_unlock_bh(&bat_priv->mcast.want_lists_lock); + } +} + +/** + * batadv_mcast_tvlv_flags_get() - get multicast flags from an OGM TVLV + * @enabled: whether the originator has multicast TVLV support enabled + * @tvlv_value: tvlv buffer containing the multicast flags + * @tvlv_value_len: tvlv buffer length + * + * Return: multicast flags for the given tvlv buffer + */ +static u8 +batadv_mcast_tvlv_flags_get(bool enabled, void *tvlv_value, u16 tvlv_value_len) +{ + u8 mcast_flags = BATADV_NO_FLAGS; + + if (enabled && tvlv_value && tvlv_value_len >= sizeof(mcast_flags)) + mcast_flags = *(u8 *)tvlv_value; + + if (!enabled) { + 
mcast_flags |= BATADV_MCAST_WANT_ALL_IPV4; + mcast_flags |= BATADV_MCAST_WANT_ALL_IPV6; + } + + /* remove redundant flags to avoid sending duplicate packets later */ + if (mcast_flags & BATADV_MCAST_WANT_ALL_IPV4) + mcast_flags |= BATADV_MCAST_WANT_NO_RTR4; + + if (mcast_flags & BATADV_MCAST_WANT_ALL_IPV6) + mcast_flags |= BATADV_MCAST_WANT_NO_RTR6; + + return mcast_flags; +} + /** * batadv_mcast_tvlv_ogm_handler() - process incoming multicast tvlv container * @bat_priv: the bat priv with all the soft interface information @@ -1481,16 +1783,10 @@ static void batadv_mcast_tvlv_ogm_handler(struct batadv_priv *bat_priv, u16 tvlv_value_len) { bool orig_mcast_enabled = !(flags & BATADV_TVLV_HANDLER_OGM_CIFNOTFND); - u8 mcast_flags = BATADV_NO_FLAGS; - - if (orig_mcast_enabled && tvlv_value && - tvlv_value_len >= sizeof(mcast_flags)) - mcast_flags = *(u8 *)tvlv_value; + u8 mcast_flags; - if (!orig_mcast_enabled) { - mcast_flags |= BATADV_MCAST_WANT_ALL_IPV4; - mcast_flags |= BATADV_MCAST_WANT_ALL_IPV6; - } + mcast_flags = batadv_mcast_tvlv_flags_get(orig_mcast_enabled, + tvlv_value, tvlv_value_len); spin_lock_bh(&orig->mcast_handler_lock); @@ -1507,6 +1803,8 @@ static void batadv_mcast_tvlv_ogm_handler(struct batadv_priv *bat_priv, batadv_mcast_want_unsnoop_update(bat_priv, orig, mcast_flags); batadv_mcast_want_ipv4_update(bat_priv, orig, mcast_flags); batadv_mcast_want_ipv6_update(bat_priv, orig, mcast_flags); + batadv_mcast_want_rtr4_update(bat_priv, orig, mcast_flags); + batadv_mcast_want_rtr6_update(bat_priv, orig, mcast_flags); orig->mcast_flags = mcast_flags; spin_unlock_bh(&orig->mcast_handler_lock); @@ -1556,10 +1854,12 @@ static void batadv_mcast_flags_print_header(struct batadv_priv *bat_priv, shadowing6 = '?'; } - seq_printf(seq, "Multicast flags (own flags: [%c%c%c])\n", + seq_printf(seq, "Multicast flags (own flags: [%c%c%c%s%s])\n", (flags & BATADV_MCAST_WANT_ALL_UNSNOOPABLES) ? 'U' : '.', (flags & BATADV_MCAST_WANT_ALL_IPV4) ? 
'4' : '.', - (flags & BATADV_MCAST_WANT_ALL_IPV6) ? '6' : '.'); + (flags & BATADV_MCAST_WANT_ALL_IPV6) ? '6' : '.', + !(flags & BATADV_MCAST_WANT_NO_RTR4) ? "R4" : ". ", + !(flags & BATADV_MCAST_WANT_NO_RTR6) ? "R6" : ". "); seq_printf(seq, "* Bridged [U]\t\t\t\t%c\n", bridged ? 'U' : '.'); seq_printf(seq, "* No IGMP/MLD Querier [4/6]:\t\t%c/%c\n", querier4, querier6); @@ -1613,13 +1913,17 @@ int batadv_mcast_flags_seq_print_text(struct seq_file *seq, void *offset) flags = orig_node->mcast_flags; - seq_printf(seq, "%pM [%c%c%c]\n", orig_node->orig, + seq_printf(seq, "%pM [%c%c%c%s%s]\n", orig_node->orig, (flags & BATADV_MCAST_WANT_ALL_UNSNOOPABLES) ? 'U' : '.', (flags & BATADV_MCAST_WANT_ALL_IPV4) ? '4' : '.', (flags & BATADV_MCAST_WANT_ALL_IPV6) - ? '6' : '.'); + ? '6' : '.', + !(flags & BATADV_MCAST_WANT_NO_RTR4) + ? "R4" : ". ", + !(flags & BATADV_MCAST_WANT_NO_RTR6) + ? "R6" : ". "); } rcu_read_unlock(); } @@ -1893,6 +2197,8 @@ void batadv_mcast_purge_orig(struct batadv_orig_node *orig) batadv_mcast_want_unsnoop_update(bat_priv, orig, BATADV_NO_FLAGS); batadv_mcast_want_ipv4_update(bat_priv, orig, BATADV_NO_FLAGS); batadv_mcast_want_ipv6_update(bat_priv, orig, BATADV_NO_FLAGS); + batadv_mcast_want_rtr4_update(bat_priv, orig, BATADV_NO_FLAGS); + batadv_mcast_want_rtr6_update(bat_priv, orig, BATADV_NO_FLAGS); spin_unlock_bh(&orig->mcast_handler_lock); } diff --git a/net/batman-adv/originator.c b/net/batman-adv/originator.c index 45db798a7297..38613487fb1b 100644 --- a/net/batman-adv/originator.c +++ b/net/batman-adv/originator.c @@ -27,6 +27,7 @@ #include #include #include +#include #include #include "bat_algo.h" @@ -1043,7 +1044,8 @@ struct batadv_orig_node *batadv_orig_node_new(struct batadv_priv *bat_priv, orig_node->bcast_seqno_reset = reset_time; #ifdef CONFIG_BATMAN_ADV_MCAST - orig_node->mcast_flags = BATADV_NO_FLAGS; + orig_node->mcast_flags = BATADV_MCAST_WANT_NO_RTR4; + orig_node->mcast_flags |= BATADV_MCAST_WANT_NO_RTR6; 
INIT_HLIST_NODE(&orig_node->mcast_want_all_unsnoopables_node); INIT_HLIST_NODE(&orig_node->mcast_want_all_ipv4_node); INIT_HLIST_NODE(&orig_node->mcast_want_all_ipv6_node); diff --git a/net/batman-adv/types.h b/net/batman-adv/types.h index 72f65b3769d0..c2996296b953 100644 --- a/net/batman-adv/types.h +++ b/net/batman-adv/types.h @@ -404,6 +404,17 @@ struct batadv_orig_node { * list */ struct hlist_node mcast_want_all_ipv6_node; + + /** + * @mcast_want_all_rtr4_node: a list node for the mcast.want_all_rtr4 + * list + */ + struct hlist_node mcast_want_all_rtr4_node; + /** + * @mcast_want_all_rtr6_node: a list node for the mcast.want_all_rtr6 + * list + */ + struct hlist_node mcast_want_all_rtr6_node; #endif /** @capabilities: announced capabilities of this originator */ @@ -1218,6 +1229,18 @@ struct batadv_priv_mcast { */ struct hlist_head want_all_ipv6_list; + /** + * @want_all_rtr4_list: a list of orig_nodes wanting all routable IPv4 + * multicast traffic + */ + struct hlist_head want_all_rtr4_list; + + /** + * @want_all_rtr6_list: a list of orig_nodes wanting all routable IPv6 + * multicast traffic + */ + struct hlist_head want_all_rtr6_list; + /** * @mla_flags: flags for the querier, bridge and tvlv state */ @@ -1240,6 +1263,12 @@ struct batadv_priv_mcast { /** @num_want_all_ipv6: counter for items in want_all_ipv6_list */ atomic_t num_want_all_ipv6; + /** @num_want_all_rtr4: counter for items in want_all_rtr4_list */ + atomic_t num_want_all_rtr4; + + /** @num_want_all_rtr6: counter for items in want_all_rtr6_list */ + atomic_t num_want_all_rtr6; + /** * @want_lists_lock: lock for protecting modifications to mcasts * want_all_{unsnoopables,ipv4,ipv6}_list (traversals are rcu-locked) -- cgit v1.2.3-71-gd317 From 2640d3c8123223e0a205b2a25a446df6f072b3ea Mon Sep 17 00:00:00 2001 From: Maxim Mikityanskiy Date: Wed, 26 Jun 2019 17:35:25 +0300 Subject: xsk: Add getsockopt XDP_OPTIONS MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 
8bit Make it possible for the application to determine whether the AF_XDP socket is running in zero-copy mode. To achieve this, add a new getsockopt option XDP_OPTIONS that returns flags. The only flag supported for now is the zero-copy mode indicator. Signed-off-by: Maxim Mikityanskiy Signed-off-by: Tariq Toukan Acked-by: Saeed Mahameed Acked-by: Björn Töpel Signed-off-by: Daniel Borkmann --- include/uapi/linux/if_xdp.h | 8 ++++++++ net/xdp/xsk.c | 20 ++++++++++++++++++++ tools/include/uapi/linux/if_xdp.h | 8 ++++++++ 3 files changed, 36 insertions(+) (limited to 'include/uapi/linux') diff --git a/include/uapi/linux/if_xdp.h b/include/uapi/linux/if_xdp.h index caed8b1614ff..faaa5ca2a117 100644 --- a/include/uapi/linux/if_xdp.h +++ b/include/uapi/linux/if_xdp.h @@ -46,6 +46,7 @@ struct xdp_mmap_offsets { #define XDP_UMEM_FILL_RING 5 #define XDP_UMEM_COMPLETION_RING 6 #define XDP_STATISTICS 7 +#define XDP_OPTIONS 8 struct xdp_umem_reg { __u64 addr; /* Start of packet data area */ @@ -60,6 +61,13 @@ struct xdp_statistics { __u64 tx_invalid_descs; /* Dropped due to invalid descriptor */ }; +struct xdp_options { + __u32 flags; +}; + +/* Flags for the flags field of struct xdp_options */ +#define XDP_OPTIONS_ZEROCOPY (1 << 0) + /* Pgoff for mmaping the rings */ #define XDP_PGOFF_RX_RING 0 #define XDP_PGOFF_TX_RING 0x80000000 diff --git a/net/xdp/xsk.c b/net/xdp/xsk.c index b68a380f50b3..35ca531ac74e 100644 --- a/net/xdp/xsk.c +++ b/net/xdp/xsk.c @@ -650,6 +650,26 @@ static int xsk_getsockopt(struct socket *sock, int level, int optname, return 0; } + case XDP_OPTIONS: + { + struct xdp_options opts = {}; + + if (len < sizeof(opts)) + return -EINVAL; + + mutex_lock(&xs->mutex); + if (xs->zc) + opts.flags |= XDP_OPTIONS_ZEROCOPY; + mutex_unlock(&xs->mutex); + + len = sizeof(opts); + if (copy_to_user(optval, &opts, len)) + return -EFAULT; + if (put_user(len, optlen)) + return -EFAULT; + + return 0; + } default: break; } diff --git a/tools/include/uapi/linux/if_xdp.h 
b/tools/include/uapi/linux/if_xdp.h index caed8b1614ff..faaa5ca2a117 100644 --- a/tools/include/uapi/linux/if_xdp.h +++ b/tools/include/uapi/linux/if_xdp.h @@ -46,6 +46,7 @@ struct xdp_mmap_offsets { #define XDP_UMEM_FILL_RING 5 #define XDP_UMEM_COMPLETION_RING 6 #define XDP_STATISTICS 7 +#define XDP_OPTIONS 8 struct xdp_umem_reg { __u64 addr; /* Start of packet data area */ @@ -60,6 +61,13 @@ struct xdp_statistics { __u64 tx_invalid_descs; /* Dropped due to invalid descriptor */ }; +struct xdp_options { + __u32 flags; +}; + +/* Flags for the flags field of struct xdp_options */ +#define XDP_OPTIONS_ZEROCOPY (1 << 0) + /* Pgoff for mmaping the rings */ #define XDP_PGOFF_RX_RING 0 #define XDP_PGOFF_TX_RING 0x80000000 -- cgit v1.2.3-71-gd317 From 2e12256b9a76584fa3a6da19210509d4775aee36 Mon Sep 17 00:00:00 2001 From: David Howells Date: Thu, 27 Jun 2019 23:03:07 +0100 Subject: keys: Replace uid/gid/perm permissions checking with an ACL Replace the uid/gid/perm permissions checking on a key with an ACL to allow the SETATTR and SEARCH permissions to be split. This will also allow a greater range of subjects to represented. ============ WHY DO THIS? ============ The problem is that SETATTR and SEARCH cover a slew of actions, not all of which should be grouped together. For SETATTR, this includes actions that are about controlling access to a key: (1) Changing a key's ownership. (2) Changing a key's security information. (3) Setting a keyring's restriction. And actions that are about managing a key's lifetime: (4) Setting an expiry time. (5) Revoking a key. and (proposed) managing a key as part of a cache: (6) Invalidating a key. Managing a key's lifetime doesn't really have anything to do with controlling access to that key. Expiry time is awkward since it's more about the lifetime of the content and so, in some ways goes better with WRITE permission. 
It can, however, be set unconditionally by a process with an appropriate authorisation token for instantiating a key, and can also be set by the key type driver when a key is instantiated, so lumping it with the access-controlling actions is probably okay. As for SEARCH permission, that currently covers: (1) Finding keys in a keyring tree during a search. (2) Permitting keyrings to be joined. (3) Invalidation. But these don't really belong together either, since these actions really need to be controlled separately. Finally, there are a number of special cases to do with granting the administrator special rights to invalidate or clear keys that I would like to handle with the ACL rather than key flags and special checks. =============== WHAT IS CHANGED =============== The SETATTR permission is split to create two new permissions: (1) SET_SECURITY - which allows the key's owner, group and ACL to be changed and a restriction to be placed on a keyring. (2) REVOKE - which allows a key to be revoked. The SEARCH permission is split to create: (1) SEARCH - which allows a keyring to be searched and a key to be found. (2) JOIN - which allows a keyring to be joined as a session keyring. (3) INVAL - which allows a key to be invalidated. The WRITE permission is also split to create: (1) WRITE - which allows a key's content to be altered and links to be added, removed and replaced in a keyring. (2) CLEAR - which allows a keyring to be cleared completely. This is split out to make it possible to give just this to an administrator. (3) REVOKE - see above. Keys acquire ACLs which consist of a series of ACEs, and all that apply are unioned together.
An ACE specifies a subject, such as: (*) Possessor - permitted to anyone who 'possesses' a key (*) Owner - permitted to the key owner (*) Group - permitted to the key group (*) Everyone - permitted to everyone Note that 'Other' has been replaced with 'Everyone' on the assumption that you wouldn't grant a permit to 'Other' that you wouldn't also grant to everyone else. Further subjects may be made available by later patches. The ACE also specifies a permissions mask. The set of permissions is now: VIEW Can view the key metadata READ Can read the key content WRITE Can update/modify the key content SEARCH Can find the key by searching/requesting LINK Can make a link to the key SET_SECURITY Can change owner, ACL, expiry INVAL Can invalidate REVOKE Can revoke JOIN Can join this keyring CLEAR Can clear this keyring The KEYCTL_SETPERM function is then deprecated. The KEYCTL_SET_TIMEOUT function then is permitted if SET_SECURITY is set, or if the caller has a valid instantiation auth token. The KEYCTL_INVALIDATE function then requires INVAL. The KEYCTL_REVOKE function then requires REVOKE. The KEYCTL_JOIN_SESSION_KEYRING function then requires JOIN to join an existing keyring. The JOIN permission is enabled by default for session keyrings and manually created keyrings only. ====================== BACKWARD COMPATIBILITY ====================== To maintain backward compatibility, KEYCTL_SETPERM will translate the permissions mask it is given into a new ACL for a key - unless KEYCTL_SET_ACL has been called on that key, in which case an error will be returned. It will convert possessor, owner, group and other permissions into separate ACEs, if each portion of the mask is non-zero. SETATTR permission turns on all of INVAL, REVOKE and SET_SECURITY. WRITE permission turns on WRITE, REVOKE and, if a keyring, CLEAR. JOIN is turned on if a keyring is being altered. 
The KEYCTL_DESCRIBE function translates the ACL back into a permissions mask to return depending on possessor, owner, group and everyone ACEs. It will make the following mappings: (1) INVAL, JOIN -> SEARCH (2) SET_SECURITY -> SETATTR (3) REVOKE -> WRITE if SETATTR isn't already set (4) CLEAR -> WRITE Note that the value subsequently returned by KEYCTL_DESCRIBE may not match the value set with KEYCTL_SETPERM. ======= TESTING ======= This passes the keyutils testsuite for all but a couple of tests: (1) tests/keyctl/dh_compute/badargs: The first wrong-key-type test now returns EOPNOTSUPP rather than ENOKEY as READ permission isn't removed if the type doesn't have ->read(). You still can't actually read the key. (2) tests/keyctl/permitting/valid: The view-other-permissions test doesn't work as Other has been replaced with Everyone in the ACL. Signed-off-by: David Howells --- Documentation/security/keys/core.rst | 128 ++++++++--- Documentation/security/keys/request-key.rst | 9 +- certs/blacklist.c | 7 +- certs/system_keyring.c | 12 +- drivers/md/dm-crypt.c | 2 +- drivers/nvdimm/security.c | 2 +- fs/afs/security.c | 2 +- fs/cifs/cifs_spnego.c | 25 ++- fs/cifs/cifsacl.c | 28 ++- fs/cifs/connect.c | 4 +- fs/crypto/keyinfo.c | 2 +- fs/ecryptfs/ecryptfs_kernel.h | 2 +- fs/ecryptfs/keystore.c | 2 +- fs/fscache/object-list.c | 2 +- fs/nfs/nfs4idmap.c | 30 ++- fs/ubifs/auth.c | 2 +- include/linux/key.h | 121 ++++++----- include/uapi/linux/keyctl.h | 63 ++++++ lib/digsig.c | 2 +- net/ceph/ceph_common.c | 2 +- net/dns_resolver/dns_key.c | 12 +- net/dns_resolver/dns_query.c | 15 +- net/rxrpc/key.c | 19 +- net/wireless/reg.c | 6 +- security/integrity/digsig.c | 31 ++- security/integrity/digsig_asymmetric.c | 2 +- security/integrity/evm/evm_crypto.c | 2 +- security/integrity/ima/ima_mok.c | 13 +- security/integrity/integrity.h | 6 +- .../integrity/platform_certs/platform_keyring.c | 14 +- security/keys/encrypted-keys/encrypted.c | 2 +-
security/keys/encrypted-keys/masterkey_trusted.c | 2 +- security/keys/gc.c | 2 +- security/keys/internal.h | 11 +- security/keys/key.c | 29 +-- security/keys/keyctl.c | 96 +++++--- security/keys/keyring.c | 27 ++- security/keys/permission.c | 242 ++++++++++++++++++--- security/keys/persistent.c | 27 ++- security/keys/proc.c | 22 +- security/keys/process_keys.c | 86 ++++++-- security/keys/request_key.c | 34 ++- security/keys/request_key_auth.c | 15 +- security/selinux/hooks.c | 16 +- security/smack/smack_lsm.c | 3 +- 45 files changed, 857 insertions(+), 324 deletions(-) (limited to 'include/uapi/linux') diff --git a/Documentation/security/keys/core.rst b/Documentation/security/keys/core.rst index 0e74f372e58c..1b3c907980ad 100644 --- a/Documentation/security/keys/core.rst +++ b/Documentation/security/keys/core.rst @@ -57,9 +57,9 @@ Each key has a number of attributes: type provides an operation to perform a match between the description on a key and a criterion string. - * Each key has an owner user ID, a group ID and a permissions mask. These - are used to control what a process may do to a key from userspace, and - whether a kernel service will be able to find the key. + * Each key has an owner user ID, a group ID and an ACL. These are used to + control what a process may do to a key from userspace, and whether a + kernel service will be able to find the key. * Each key can be set to expire at a specific time by the key type's instantiation function. Keys can also be immortal. @@ -198,43 +198,110 @@ The key service provides a number of features besides keys: Key Access Permissions ====================== -Keys have an owner user ID, a group access ID, and a permissions mask. The mask -has up to eight bits each for possessor, user, group and other access. Only -six of each set of eight bits are defined. These permissions granted are: +Keys have an owner user ID, a group ID and an ACL. 
The ACL is made up of a +sequence of ACEs that each contain three elements: - * View + * The type of subject. + * The subject. - This permits a key or keyring's attributes to be viewed - including key - type and description. + These two together indicate the subject to whom the permits are granted. + The type can be one of: - * Read + * ``KEY_ACE_SUBJ_STANDARD`` - This permits a key's payload to be viewed or a keyring's list of linked - keys. + The subject is a standard 'macro' type. The subject can be one of: + + * ``KEY_ACE_EVERYONE`` + + The permits are granted to everyone. It replaces the old 'other' + type on the assumption that you wouldn't grant a permission to other + that you you wouldn't grant to everyone else. + + * ``KEY_ACE_OWNER`` + + The permits are granted to the owner of the key (key->uid). + + * ``KEY_ACE_GROUP`` + + The permits are granted to the key's group (key->gid). + + * ``KEY_ACE_POSSESSOR`` + + The permits are granted to anyone who possesses the key. + + * The set of permits granted to the subject. These include: + + * ``KEY_ACE_VIEW`` + + This permits a key or keyring's attributes to be viewed - including the + key type and description. + + * ``KEY_ACE_READ`` + + This permits a key's payload to be viewed or a keyring's list of linked + keys. - * Write + * ``KEY_ACE_WRITE`` - This permits a key's payload to be instantiated or updated, or it allows a - link to be added to or removed from a keyring. + This permits a key's payload to be instantiated or updated, or it allows + a link to be added to or removed from a keyring. - * Search + * ``KEY_ACE_SEARCH`` - This permits keyrings to be searched and keys to be found. Searches can - only recurse into nested keyrings that have search permission set. + This permits keyrings to be searched and keys to be found. Searches can + only recurse into nested keyrings that have search permission set. - * Link + * ``KEY_ACE_LINK`` - This permits a key or keyring to be linked to. 
To create a link from a - keyring to a key, a process must have Write permission on the keyring and - Link permission on the key. + This permits a key or keyring to be linked to. To create a link from a + keyring to a key, a process must have Write permission on the keyring + and Link permission on the key. - * Set Attribute + * ``KEY_ACE_SET_SECURITY`` - This permits a key's UID, GID and permissions mask to be changed. + This permits a key's UID, GID and permissions mask to be changed. + + * ``KEY_ACE_INVAL`` + + This permits a key to be invalidated with KEYCTL_INVALIDATE. + + * ``KEY_ACE_REVOKE`` + + This permits a key to be revoked with KEYCTL_REVOKE. + + * ``KEY_ACE_JOIN`` + + This permits a keyring to be joined as a session by + KEYCTL_JOIN_SESSION_KEYRING or KEYCTL_SESSION_TO_PARENT. + + * ``KEY_ACE_CLEAR`` + + This permits a keyring to be cleared. For changing the ownership, group ID or permissions mask, being the owner of the key or having the sysadmin capability is sufficient. +The legacy KEYCTL_SETPERM and KEYCTL_DESCRIBE functions can only see/generate +View, Read, Write, Search, Link and SetAttr permits, and do this for each of +possessor, user, group and other permission sets as a 32-bit flag mask. These +will be approximated/inferred: + + SETPERM Permit Implied ACE Permit + =============== ======================= + Search Inval, Join + Write Revoke, Clear + Setattr Set Security, Revoke + + ACE Permit Described as + =============== ======================= + Inval Search + Join Search + Revoke Write (unless Setattr) + Clear write + Set Security Setattr + +'Other' will be approximated as/inferred from the 'Everyone' subject. + SELinux Support =============== @@ -1084,7 +1151,8 @@ payload contents" for more information. 
struct key *request_key(const struct key_type *type, const char *description, - const char *callout_info); + const char *callout_info, + struct key_acl *acl); This is used to request a key or keyring with a description that matches the description specified according to the key type's match_preparse() @@ -1099,6 +1167,8 @@ payload contents" for more information. If successful, the key will have been attached to the default keyring for implicitly obtained request-key keys, as set by KEYCTL_SET_REQKEY_KEYRING. + If a key is created, it will be given the specified ACL. + See also Documentation/security/keys/request-key.rst. @@ -1107,7 +1177,8 @@ payload contents" for more information. struct key *request_key_tag(const struct key_type *type, const char *description, struct key_tag *domain_tag, - const char *callout_info); + const char *callout_info, + struct key_acl *acl); This is identical to request_key(), except that a domain tag may be specifies that causes search algorithm to only match keys matching that @@ -1122,7 +1193,8 @@ payload contents" for more information. struct key_tag *domain_tag, const void *callout_info, size_t callout_len, - void *aux); + void *aux, + struct key_acl *acl); This is identical to request_key_tag(), except that the auxiliary data is passed to the key_type->request_key() op if it exists, and the @@ -1195,7 +1267,7 @@ payload contents" for more information. 
struct key *keyring_alloc(const char *description, uid_t uid, gid_t gid, const struct cred *cred, - key_perm_t perm, + struct key_acl *acl, struct key_restriction *restrict_link, unsigned long flags, struct key *dest); diff --git a/Documentation/security/keys/request-key.rst b/Documentation/security/keys/request-key.rst index 35f2296b704a..f356fd06c8d5 100644 --- a/Documentation/security/keys/request-key.rst +++ b/Documentation/security/keys/request-key.rst @@ -11,14 +11,16 @@ The process starts by either the kernel requesting a service by calling struct key *request_key(const struct key_type *type, const char *description, - const char *callout_info); + const char *callout_info, + struct key_acl *acl); or:: struct key *request_key_tag(const struct key_type *type, const char *description, const struct key_tag *domain_tag, - const char *callout_info); + const char *callout_info, + struct key_acl *acl); or:: @@ -27,7 +29,8 @@ or:: const struct key_tag *domain_tag, const char *callout_info, size_t callout_len, - void *aux); + void *aux, + struct key_acl *acl); or:: diff --git a/certs/blacklist.c b/certs/blacklist.c index 181cb7fa9540..39de9d68b21e 100644 --- a/certs/blacklist.c +++ b/certs/blacklist.c @@ -93,8 +93,7 @@ int mark_hash_blacklisted(const char *hash) hash, NULL, 0, - ((KEY_POS_ALL & ~KEY_POS_SETATTR) | - KEY_USR_VIEW), + &internal_key_acl, KEY_ALLOC_NOT_IN_QUOTA | KEY_ALLOC_BUILT_IN); if (IS_ERR(key)) { @@ -153,9 +152,7 @@ static int __init blacklist_init(void) keyring_alloc(".blacklist", KUIDT_INIT(0), KGIDT_INIT(0), current_cred(), - (KEY_POS_ALL & ~KEY_POS_SETATTR) | - KEY_USR_VIEW | KEY_USR_READ | - KEY_USR_SEARCH, + &internal_keyring_acl, KEY_ALLOC_NOT_IN_QUOTA | KEY_FLAG_KEEP, NULL, NULL); diff --git a/certs/system_keyring.c b/certs/system_keyring.c index c05c29ae4d5d..2873a4ce2828 100644 --- a/certs/system_keyring.c +++ b/certs/system_keyring.c @@ -103,9 +103,7 @@ static __init int system_trusted_keyring_init(void) builtin_trusted_keys = 
keyring_alloc(".builtin_trusted_keys", KUIDT_INIT(0), KGIDT_INIT(0), current_cred(), - ((KEY_POS_ALL & ~KEY_POS_SETATTR) | - KEY_USR_VIEW | KEY_USR_READ | KEY_USR_SEARCH), - KEY_ALLOC_NOT_IN_QUOTA, + &internal_key_acl, KEY_ALLOC_NOT_IN_QUOTA, NULL, NULL); if (IS_ERR(builtin_trusted_keys)) panic("Can't allocate builtin trusted keyring\n"); @@ -114,10 +112,7 @@ static __init int system_trusted_keyring_init(void) secondary_trusted_keys = keyring_alloc(".secondary_trusted_keys", KUIDT_INIT(0), KGIDT_INIT(0), current_cred(), - ((KEY_POS_ALL & ~KEY_POS_SETATTR) | - KEY_USR_VIEW | KEY_USR_READ | KEY_USR_SEARCH | - KEY_USR_WRITE), - KEY_ALLOC_NOT_IN_QUOTA, + &internal_writable_keyring_acl, KEY_ALLOC_NOT_IN_QUOTA, get_builtin_and_secondary_restriction(), NULL); if (IS_ERR(secondary_trusted_keys)) @@ -167,8 +162,7 @@ static __init int load_system_certificate_list(void) NULL, p, plen, - ((KEY_POS_ALL & ~KEY_POS_SETATTR) | - KEY_USR_VIEW | KEY_USR_READ), + &internal_key_acl, KEY_ALLOC_NOT_IN_QUOTA | KEY_ALLOC_BUILT_IN | KEY_ALLOC_BYPASS_RESTRICTION); diff --git a/drivers/md/dm-crypt.c b/drivers/md/dm-crypt.c index 1b16d34bb785..0fd3ca9bfe54 100644 --- a/drivers/md/dm-crypt.c +++ b/drivers/md/dm-crypt.c @@ -2035,7 +2035,7 @@ static int crypt_set_keyring_key(struct crypt_config *cc, const char *key_string return -ENOMEM; key = request_key(key_string[0] == 'l' ? 
&key_type_logon : &key_type_user, - key_desc + 1, NULL); + key_desc + 1, NULL, NULL); if (IS_ERR(key)) { kzfree(new_key_string); return PTR_ERR(key); diff --git a/drivers/nvdimm/security.c b/drivers/nvdimm/security.c index a570f2263a42..99a5708b37e3 100644 --- a/drivers/nvdimm/security.c +++ b/drivers/nvdimm/security.c @@ -55,7 +55,7 @@ static struct key *nvdimm_request_key(struct nvdimm *nvdimm) struct device *dev = &nvdimm->dev; sprintf(desc, "%s%s", NVDIMM_PREFIX, nvdimm->dimm_id); - key = request_key(&key_type_encrypted, desc, ""); + key = request_key(&key_type_encrypted, desc, "", NULL); if (IS_ERR(key)) { if (PTR_ERR(key) == -ENOKEY) dev_dbg(dev, "request_key() found no key\n"); diff --git a/fs/afs/security.c b/fs/afs/security.c index 5d8ece98561e..3185898237b2 100644 --- a/fs/afs/security.c +++ b/fs/afs/security.c @@ -32,7 +32,7 @@ struct key *afs_request_key(struct afs_cell *cell) _debug("key %s", cell->anonymous_key->description); key = request_key(&key_type_rxrpc, cell->anonymous_key->description, - NULL); + NULL, NULL); if (IS_ERR(key)) { if (PTR_ERR(key) != -ENOKEY) { _leave(" = %ld", PTR_ERR(key)); diff --git a/fs/cifs/cifs_spnego.c b/fs/cifs/cifs_spnego.c index 7f01c6e60791..d1b439ad0f1a 100644 --- a/fs/cifs/cifs_spnego.c +++ b/fs/cifs/cifs_spnego.c @@ -32,6 +32,25 @@ #include "cifsproto.h" static const struct cred *spnego_cred; +static struct key_acl cifs_spnego_key_acl = { + .usage = REFCOUNT_INIT(1), + .nr_ace = 2, + .possessor_viewable = true, + .aces = { + KEY_POSSESSOR_ACE(KEY_ACE_VIEW | KEY_ACE_SEARCH | KEY_ACE_READ), + KEY_OWNER_ACE(KEY_ACE_VIEW), + } +}; + +static struct key_acl cifs_spnego_keyring_acl = { + .usage = REFCOUNT_INIT(1), + .nr_ace = 2, + .aces = { + KEY_POSSESSOR_ACE(KEY_ACE_SEARCH | KEY_ACE_WRITE), + KEY_OWNER_ACE(KEY_ACE_VIEW | KEY_ACE_READ | KEY_ACE_CLEAR), + } +}; + /* create a new cifs key */ static int cifs_spnego_key_instantiate(struct key *key, struct key_preparsed_payload *prep) @@ -170,7 +189,8 @@ 
cifs_get_spnego_key(struct cifs_ses *sesInfo) cifs_dbg(FYI, "key description = %s\n", description); saved_cred = override_creds(spnego_cred); - spnego_key = request_key(&cifs_spnego_key_type, description, ""); + spnego_key = request_key(&cifs_spnego_key_type, description, "", + &cifs_spnego_key_acl); revert_creds(saved_cred); #ifdef CONFIG_CIFS_DEBUG2 @@ -207,8 +227,7 @@ init_cifs_spnego(void) keyring = keyring_alloc(".cifs_spnego", GLOBAL_ROOT_UID, GLOBAL_ROOT_GID, cred, - (KEY_POS_ALL & ~KEY_POS_SETATTR) | - KEY_USR_VIEW | KEY_USR_READ, + &cifs_spnego_keyring_acl, KEY_ALLOC_NOT_IN_QUOTA, NULL, NULL); if (IS_ERR(keyring)) { ret = PTR_ERR(keyring); diff --git a/fs/cifs/cifsacl.c b/fs/cifs/cifsacl.c index 1d377b7f2860..78eed72f3af0 100644 --- a/fs/cifs/cifsacl.c +++ b/fs/cifs/cifsacl.c @@ -33,6 +33,25 @@ #include "cifsproto.h" #include "cifs_debug.h" +static struct key_acl cifs_idmap_key_acl = { + .usage = REFCOUNT_INIT(1), + .nr_ace = 2, + .possessor_viewable = true, + .aces = { + KEY_POSSESSOR_ACE(KEY_ACE_VIEW | KEY_ACE_SEARCH | KEY_ACE_READ), + KEY_OWNER_ACE(KEY_ACE_VIEW), + } +}; + +static struct key_acl cifs_idmap_keyring_acl = { + .usage = REFCOUNT_INIT(1), + .nr_ace = 2, + .aces = { + KEY_POSSESSOR_ACE(KEY_ACE_SEARCH | KEY_ACE_WRITE), + KEY_OWNER_ACE(KEY_ACE_VIEW | KEY_ACE_READ), + } +}; + /* security id for everyone/world system group */ static const struct cifs_sid sid_everyone = { 1, 1, {0, 0, 0, 0, 0, 1}, {0} }; @@ -298,7 +317,8 @@ id_to_sid(unsigned int cid, uint sidtype, struct cifs_sid *ssid) rc = 0; saved_cred = override_creds(root_cred); - sidkey = request_key(&cifs_idmap_key_type, desc, ""); + sidkey = request_key(&cifs_idmap_key_type, desc, "", + &cifs_idmap_key_acl); if (IS_ERR(sidkey)) { rc = -EINVAL; cifs_dbg(FYI, "%s: Can't map %cid %u to a SID\n", @@ -403,7 +423,8 @@ try_upcall_to_get_id: return -ENOMEM; saved_cred = override_creds(root_cred); - sidkey = request_key(&cifs_idmap_key_type, sidstr, ""); + sidkey = 
request_key(&cifs_idmap_key_type, sidstr, "", + &cifs_idmap_key_acl); if (IS_ERR(sidkey)) { rc = -EINVAL; cifs_dbg(FYI, "%s: Can't map SID %s to a %cid\n", @@ -481,8 +502,7 @@ init_cifs_idmap(void) keyring = keyring_alloc(".cifs_idmap", GLOBAL_ROOT_UID, GLOBAL_ROOT_GID, cred, - (KEY_POS_ALL & ~KEY_POS_SETATTR) | - KEY_USR_VIEW | KEY_USR_READ, + &cifs_idmap_keyring_acl, KEY_ALLOC_NOT_IN_QUOTA, NULL, NULL); if (IS_ERR(keyring)) { ret = PTR_ERR(keyring); diff --git a/fs/cifs/connect.c b/fs/cifs/connect.c index 8c4121da624e..6e50d3e87948 100644 --- a/fs/cifs/connect.c +++ b/fs/cifs/connect.c @@ -2990,7 +2990,7 @@ cifs_set_cifscreds(struct smb_vol *vol, struct cifs_ses *ses) } cifs_dbg(FYI, "%s: desc=%s\n", __func__, desc); - key = request_key(&key_type_logon, desc, ""); + key = request_key(&key_type_logon, desc, "", NULL); if (IS_ERR(key)) { if (!ses->domainName) { cifs_dbg(FYI, "domainName is NULL\n"); @@ -3001,7 +3001,7 @@ cifs_set_cifscreds(struct smb_vol *vol, struct cifs_ses *ses) /* didn't work, try to find a domain key */ sprintf(desc, "cifs:d:%s", ses->domainName); cifs_dbg(FYI, "%s: desc=%s\n", __func__, desc); - key = request_key(&key_type_logon, desc, ""); + key = request_key(&key_type_logon, desc, "", NULL); if (IS_ERR(key)) { rc = PTR_ERR(key); goto out_err; diff --git a/fs/crypto/keyinfo.c b/fs/crypto/keyinfo.c index dcd91a3fbe49..4f85af8ab239 100644 --- a/fs/crypto/keyinfo.c +++ b/fs/crypto/keyinfo.c @@ -92,7 +92,7 @@ find_and_lock_process_key(const char *prefix, if (!description) return ERR_PTR(-ENOMEM); - key = request_key(&key_type_logon, description, NULL); + key = request_key(&key_type_logon, description, NULL, NULL); kfree(description); if (IS_ERR(key)) return key; diff --git a/fs/ecryptfs/ecryptfs_kernel.h b/fs/ecryptfs/ecryptfs_kernel.h index e74cb2a0b299..6460bd2a4e9d 100644 --- a/fs/ecryptfs/ecryptfs_kernel.h +++ b/fs/ecryptfs/ecryptfs_kernel.h @@ -105,7 +105,7 @@ ecryptfs_get_encrypted_key_payload_data(struct key *key) static inline struct key 
*ecryptfs_get_encrypted_key(char *sig) { - return request_key(&key_type_encrypted, sig, NULL); + return request_key(&key_type_encrypted, sig, NULL, NULL); } #else diff --git a/fs/ecryptfs/keystore.c b/fs/ecryptfs/keystore.c index 90fbac5d485b..923a6006ccea 100644 --- a/fs/ecryptfs/keystore.c +++ b/fs/ecryptfs/keystore.c @@ -1624,7 +1624,7 @@ int ecryptfs_keyring_auth_tok_for_sig(struct key **auth_tok_key, { int rc = 0; - (*auth_tok_key) = request_key(&key_type_user, sig, NULL); + (*auth_tok_key) = request_key(&key_type_user, sig, NULL, NULL); if (!(*auth_tok_key) || IS_ERR(*auth_tok_key)) { (*auth_tok_key) = ecryptfs_get_encrypted_key(sig); if (!(*auth_tok_key) || IS_ERR(*auth_tok_key)) { diff --git a/fs/fscache/object-list.c b/fs/fscache/object-list.c index 43e6e28c164f..6a672289e5ec 100644 --- a/fs/fscache/object-list.c +++ b/fs/fscache/object-list.c @@ -321,7 +321,7 @@ static void fscache_objlist_config(struct fscache_objlist_data *data) const char *buf; int len; - key = request_key(&key_type_user, "fscache:objlist", NULL); + key = request_key(&key_type_user, "fscache:objlist", NULL, NULL); if (IS_ERR(key)) goto no_config; diff --git a/fs/nfs/nfs4idmap.c b/fs/nfs/nfs4idmap.c index 1e7296395d71..69679f4f2e6c 100644 --- a/fs/nfs/nfs4idmap.c +++ b/fs/nfs/nfs4idmap.c @@ -72,6 +72,25 @@ struct idmap { const struct cred *cred; }; +static struct key_acl nfs_idmap_key_acl = { + .usage = REFCOUNT_INIT(1), + .nr_ace = 2, + .possessor_viewable = true, + .aces = { + KEY_POSSESSOR_ACE(KEY_ACE_VIEW | KEY_ACE_SEARCH | KEY_ACE_READ), + KEY_OWNER_ACE(KEY_ACE_VIEW), + } +}; + +static struct key_acl nfs_idmap_keyring_acl = { + .usage = REFCOUNT_INIT(1), + .nr_ace = 2, + .aces = { + KEY_POSSESSOR_ACE(KEY_ACE_SEARCH | KEY_ACE_WRITE), + KEY_OWNER_ACE(KEY_ACE_VIEW | KEY_ACE_READ), + } +}; + static struct user_namespace *idmap_userns(const struct idmap *idmap) { if (idmap && idmap->cred) @@ -208,8 +227,7 @@ int nfs_idmap_init(void) keyring = keyring_alloc(".id_resolver", 
GLOBAL_ROOT_UID, GLOBAL_ROOT_GID, cred, - (KEY_POS_ALL & ~KEY_POS_SETATTR) | - KEY_USR_VIEW | KEY_USR_READ, + &nfs_idmap_keyring_acl, KEY_ALLOC_NOT_IN_QUOTA, NULL, NULL); if (IS_ERR(keyring)) { ret = PTR_ERR(keyring); @@ -287,11 +305,13 @@ static struct key *nfs_idmap_request_key(const char *name, size_t namelen, return ERR_PTR(ret); if (!idmap->cred || idmap->cred->user_ns == &init_user_ns) - rkey = request_key(&key_type_id_resolver, desc, ""); + rkey = request_key(&key_type_id_resolver, desc, "", + &nfs_idmap_key_acl); if (IS_ERR(rkey)) { mutex_lock(&idmap->idmap_mutex); rkey = request_key_with_auxdata(&key_type_id_resolver_legacy, - desc, NULL, "", 0, idmap); + desc, NULL, "", 0, idmap, + &nfs_idmap_key_acl); mutex_unlock(&idmap->idmap_mutex); } if (!IS_ERR(rkey)) @@ -320,8 +340,6 @@ static ssize_t nfs_idmap_get_key(const char *name, size_t namelen, } rcu_read_lock(); - rkey->perm |= KEY_USR_VIEW; - ret = key_validate(rkey); if (ret < 0) goto out_up; diff --git a/fs/ubifs/auth.c b/fs/ubifs/auth.c index 60f43b93d06e..38718026ad0b 100644 --- a/fs/ubifs/auth.c +++ b/fs/ubifs/auth.c @@ -227,7 +227,7 @@ int ubifs_init_authentication(struct ubifs_info *c) snprintf(hmac_name, CRYPTO_MAX_ALG_NAME, "hmac(%s)", c->auth_hash_name); - keyring_key = request_key(&key_type_logon, c->auth_key_name, NULL); + keyring_key = request_key(&key_type_logon, c->auth_key_name, NULL, NULL); if (IS_ERR(keyring_key)) { ubifs_err(c, "Failed to request key: %ld", diff --git a/include/linux/key.h b/include/linux/key.h index 18d7f62ab6b0..bc4adfd254fe 100644 --- a/include/linux/key.h +++ b/include/linux/key.h @@ -32,49 +32,14 @@ /* key handle serial number */ typedef int32_t key_serial_t; -/* key handle permissions mask */ -typedef uint32_t key_perm_t; - struct key; struct net; #ifdef CONFIG_KEYS -#undef KEY_DEBUGGING +#include -#define KEY_POS_VIEW 0x01000000 /* possessor can view a key's attributes */ -#define KEY_POS_READ 0x02000000 /* possessor can read key payload / view keyring */ 
-#define KEY_POS_WRITE 0x04000000 /* possessor can update key payload / add link to keyring */ -#define KEY_POS_SEARCH 0x08000000 /* possessor can find a key in search / search a keyring */ -#define KEY_POS_LINK 0x10000000 /* possessor can create a link to a key/keyring */ -#define KEY_POS_SETATTR 0x20000000 /* possessor can set key attributes */ -#define KEY_POS_ALL 0x3f000000 - -#define KEY_USR_VIEW 0x00010000 /* user permissions... */ -#define KEY_USR_READ 0x00020000 -#define KEY_USR_WRITE 0x00040000 -#define KEY_USR_SEARCH 0x00080000 -#define KEY_USR_LINK 0x00100000 -#define KEY_USR_SETATTR 0x00200000 -#define KEY_USR_ALL 0x003f0000 - -#define KEY_GRP_VIEW 0x00000100 /* group permissions... */ -#define KEY_GRP_READ 0x00000200 -#define KEY_GRP_WRITE 0x00000400 -#define KEY_GRP_SEARCH 0x00000800 -#define KEY_GRP_LINK 0x00001000 -#define KEY_GRP_SETATTR 0x00002000 -#define KEY_GRP_ALL 0x00003f00 - -#define KEY_OTH_VIEW 0x00000001 /* third party permissions... */ -#define KEY_OTH_READ 0x00000002 -#define KEY_OTH_WRITE 0x00000004 -#define KEY_OTH_SEARCH 0x00000008 -#define KEY_OTH_LINK 0x00000010 -#define KEY_OTH_SETATTR 0x00000020 -#define KEY_OTH_ALL 0x0000003f - -#define KEY_PERM_UNDEF 0xffffffff +#undef KEY_DEBUGGING struct seq_file; struct user_struct; @@ -118,6 +83,36 @@ union key_payload { void *data[4]; }; +struct key_ace { + unsigned int type; + unsigned int perm; + union { + kuid_t uid; + kgid_t gid; + unsigned int subject_id; + }; +}; + +struct key_acl { + refcount_t usage; + unsigned short nr_ace; + bool possessor_viewable; + struct rcu_head rcu; + struct key_ace aces[]; +}; + +#define KEY_POSSESSOR_ACE(perms) { \ + .type = KEY_ACE_SUBJ_STANDARD, \ + .perm = perms, \ + .subject_id = KEY_ACE_POSSESSOR \ + } + +#define KEY_OWNER_ACE(perms) { \ + .type = KEY_ACE_SUBJ_STANDARD, \ + .perm = perms, \ + .subject_id = KEY_ACE_OWNER \ + } + /*****************************************************************************/ /* * key reference with possession attribute 
handling @@ -184,6 +179,7 @@ struct key { struct rw_semaphore sem; /* change vs change sem */ struct key_user *user; /* owner of this key */ void *security; /* security data for this key */ + struct key_acl __rcu *acl; union { time64_t expiry; /* time at which key expires (or 0) */ time64_t revoked_at; /* time at which key was revoked */ @@ -191,7 +187,6 @@ struct key { time64_t last_used_at; /* last time used for LRU keyring discard */ kuid_t uid; kgid_t gid; - key_perm_t perm; /* access permissions */ unsigned short quotalen; /* length added to quota */ unsigned short datalen; /* payload data length * - may not match RCU dereferenced payload @@ -215,6 +210,7 @@ struct key { #define KEY_FLAG_ROOT_CAN_INVAL 7 /* set if key can be invalidated by root without permission */ #define KEY_FLAG_KEEP 8 /* set if key should not be removed */ #define KEY_FLAG_UID_KEYRING 9 /* set if key is a user or user session keyring */ +#define KEY_FLAG_HAS_ACL 10 /* Set if KEYCTL_SETACL called on key */ /* the key type and key description string * - the desc is used to match a key against search criteria @@ -263,7 +259,7 @@ extern struct key *key_alloc(struct key_type *type, const char *desc, kuid_t uid, kgid_t gid, const struct cred *cred, - key_perm_t perm, + struct key_acl *acl, unsigned long flags, struct key_restriction *restrict_link); @@ -300,7 +296,8 @@ static inline void key_ref_put(key_ref_t key_ref) extern struct key *request_key_tag(struct key_type *type, const char *description, struct key_tag *domain_tag, - const char *callout_info); + const char *callout_info, + struct key_acl *acl); extern struct key *request_key_rcu(struct key_type *type, const char *description, @@ -311,21 +308,24 @@ extern struct key *request_key_with_auxdata(struct key_type *type, struct key_tag *domain_tag, const void *callout_info, size_t callout_len, - void *aux); + void *aux, + struct key_acl *acl); /** * request_key - Request a key and wait for construction * @type: Type of key. 
* @description: The searchable description of the key. * @callout_info: The data to pass to the instantiation upcall (or NULL). + * @acl: The ACL to attach to a new key (or NULL). * * As for request_key_tag(), but with the default global domain tag. */ static inline struct key *request_key(struct key_type *type, const char *description, - const char *callout_info) + const char *callout_info, + struct key_acl *acl) { - return request_key_tag(type, description, NULL, callout_info); + return request_key_tag(type, description, NULL, callout_info, acl); } #ifdef CONFIG_NET @@ -335,6 +335,7 @@ static inline struct key *request_key(struct key_type *type, * @description: The searchable description of the key. * @net: The network namespace that is the key's domain of operation. * @callout_info: The data to pass to the instantiation upcall (or NULL). + * @acl: The ACL to attach to a new key (or NULL). * * As for request_key() except that it does not add the returned key to a * keyring if found, new keys are always allocated in the user's quota, the @@ -344,8 +345,8 @@ static inline struct key *request_key(struct key_type *type, * Furthermore, it then works as wait_for_key_construction() to wait for the * completion of keys undergoing construction with a non-interruptible wait. 
*/ -#define request_key_net(type, description, net, callout_info) \ - request_key_tag(type, description, net->key_domain, callout_info); +#define request_key_net(type, description, net, callout_info, acl) \ + request_key_tag(type, description, net->key_domain, callout_info, acl); #endif /* CONFIG_NET */ extern int wait_for_key_construction(struct key *key, bool intr); @@ -357,7 +358,7 @@ extern key_ref_t key_create_or_update(key_ref_t keyring, const char *description, const void *payload, size_t plen, - key_perm_t perm, + struct key_acl *acl, unsigned long flags); extern int key_update(key_ref_t key, @@ -377,7 +378,7 @@ extern int key_unlink(struct key *keyring, extern struct key *keyring_alloc(const char *description, kuid_t uid, kgid_t gid, const struct cred *cred, - key_perm_t perm, + struct key_acl *acl, unsigned long flags, struct key_restriction *restrict_link, struct key *dest); @@ -410,19 +411,29 @@ static inline key_serial_t key_serial(const struct key *key) extern void key_set_timeout(struct key *, unsigned); extern key_ref_t lookup_user_key(key_serial_t id, unsigned long flags, - key_perm_t perm); + u32 desired_perm); extern void key_free_user_ns(struct user_namespace *); /* * The permissions required on a key that we're looking up. 
*/ -#define KEY_NEED_VIEW 0x01 /* Require permission to view attributes */ -#define KEY_NEED_READ 0x02 /* Require permission to read content */ -#define KEY_NEED_WRITE 0x04 /* Require permission to update / modify */ -#define KEY_NEED_SEARCH 0x08 /* Require permission to search (keyring) or find (key) */ -#define KEY_NEED_LINK 0x10 /* Require permission to link */ -#define KEY_NEED_SETATTR 0x20 /* Require permission to change attributes */ -#define KEY_NEED_ALL 0x3f /* All the above permissions */ +#define KEY_NEED_VIEW 0x001 /* Require permission to view attributes */ +#define KEY_NEED_READ 0x002 /* Require permission to read content */ +#define KEY_NEED_WRITE 0x004 /* Require permission to update / modify */ +#define KEY_NEED_SEARCH 0x008 /* Require permission to search (keyring) or find (key) */ +#define KEY_NEED_LINK 0x010 /* Require permission to link */ +#define KEY_NEED_SETSEC 0x020 /* Require permission to set owner, group, ACL */ +#define KEY_NEED_INVAL 0x040 /* Require permission to invalidate key */ +#define KEY_NEED_REVOKE 0x080 /* Require permission to revoke key */ +#define KEY_NEED_JOIN 0x100 /* Require permission to join keyring as session */ +#define KEY_NEED_CLEAR 0x200 /* Require permission to clear a keyring */ +#define KEY_NEED_ALL 0x3ff + +#define OLD_KEY_NEED_SETATTR 0x20 /* Used to be Require permission to change attributes */ + +extern struct key_acl internal_key_acl; +extern struct key_acl internal_keyring_acl; +extern struct key_acl internal_writable_keyring_acl; static inline short key_read_state(const struct key *key) { diff --git a/include/uapi/linux/keyctl.h b/include/uapi/linux/keyctl.h index ed3d5893830d..e783bf957da8 100644 --- a/include/uapi/linux/keyctl.h +++ b/include/uapi/linux/keyctl.h @@ -15,6 +15,69 @@ #include +/* + * Keyring permission grant definitions + */ +enum key_ace_subject_type { + KEY_ACE_SUBJ_STANDARD = 0, /* subject is one of key_ace_standard_subject */ + nr__key_ace_subject_type +}; + +enum 
key_ace_standard_subject { + KEY_ACE_EVERYONE = 0, /* Everyone, including owner and group */ + KEY_ACE_GROUP = 1, /* The key's group */ + KEY_ACE_OWNER = 2, /* The owner of the key */ + KEY_ACE_POSSESSOR = 3, /* Any process that possesses of the key */ + nr__key_ace_standard_subject +}; + +#define KEY_ACE_VIEW 0x00000001 /* Can describe the key */ +#define KEY_ACE_READ 0x00000002 /* Can read the key content */ +#define KEY_ACE_WRITE 0x00000004 /* Can update/modify the key content */ +#define KEY_ACE_SEARCH 0x00000008 /* Can find the key by search */ +#define KEY_ACE_LINK 0x00000010 /* Can make a link to the key */ +#define KEY_ACE_SET_SECURITY 0x00000020 /* Can set owner, group, ACL */ +#define KEY_ACE_INVAL 0x00000040 /* Can invalidate the key */ +#define KEY_ACE_REVOKE 0x00000080 /* Can revoke the key */ +#define KEY_ACE_JOIN 0x00000100 /* Can join keyring */ +#define KEY_ACE_CLEAR 0x00000200 /* Can clear keyring */ +#define KEY_ACE__PERMS 0xffffffff + +/* + * Old-style permissions mask, deprecated in favour of ACL. + */ +#define KEY_POS_VIEW 0x01000000 /* possessor can view a key's attributes */ +#define KEY_POS_READ 0x02000000 /* possessor can read key payload / view keyring */ +#define KEY_POS_WRITE 0x04000000 /* possessor can update key payload / add link to keyring */ +#define KEY_POS_SEARCH 0x08000000 /* possessor can find a key in search / search a keyring */ +#define KEY_POS_LINK 0x10000000 /* possessor can create a link to a key/keyring */ +#define KEY_POS_SETATTR 0x20000000 /* possessor can set key attributes */ +#define KEY_POS_ALL 0x3f000000 + +#define KEY_USR_VIEW 0x00010000 /* user permissions... */ +#define KEY_USR_READ 0x00020000 +#define KEY_USR_WRITE 0x00040000 +#define KEY_USR_SEARCH 0x00080000 +#define KEY_USR_LINK 0x00100000 +#define KEY_USR_SETATTR 0x00200000 +#define KEY_USR_ALL 0x003f0000 + +#define KEY_GRP_VIEW 0x00000100 /* group permissions... 
*/ +#define KEY_GRP_READ 0x00000200 +#define KEY_GRP_WRITE 0x00000400 +#define KEY_GRP_SEARCH 0x00000800 +#define KEY_GRP_LINK 0x00001000 +#define KEY_GRP_SETATTR 0x00002000 +#define KEY_GRP_ALL 0x00003f00 + +#define KEY_OTH_VIEW 0x00000001 /* third party permissions... */ +#define KEY_OTH_READ 0x00000002 +#define KEY_OTH_WRITE 0x00000004 +#define KEY_OTH_SEARCH 0x00000008 +#define KEY_OTH_LINK 0x00000010 +#define KEY_OTH_SETATTR 0x00000020 +#define KEY_OTH_ALL 0x0000003f + /* special process keyring shortcut IDs */ #define KEY_SPEC_THREAD_KEYRING -1 /* - key ID for thread-specific keyring */ #define KEY_SPEC_PROCESS_KEYRING -2 /* - key ID for process-specific keyring */ diff --git a/lib/digsig.c b/lib/digsig.c index 3782af401c68..ce87ca2e0929 100644 --- a/lib/digsig.c +++ b/lib/digsig.c @@ -227,7 +227,7 @@ int digsig_verify(struct key *keyring, const char *sig, int siglen, else key = key_ref_to_ptr(kref); } else { - key = request_key(&key_type_user, name, NULL); + key = request_key(&key_type_user, name, NULL, NULL); } if (IS_ERR(key)) { pr_err("key not found, id: %s\n", name); diff --git a/net/ceph/ceph_common.c b/net/ceph/ceph_common.c index 79eac465ec65..d4af93a35e2b 100644 --- a/net/ceph/ceph_common.c +++ b/net/ceph/ceph_common.c @@ -305,7 +305,7 @@ static int get_secret(struct ceph_crypto_key *dst, const char *name) { int err = 0; struct ceph_crypto_key *ckey; - ukey = request_key(&key_type_ceph, name, NULL); + ukey = request_key(&key_type_ceph, name, NULL, NULL); if (IS_ERR(ukey)) { /* request_key errors don't map nicely to mount(2) errors; don't even try, but still printk */ diff --git a/net/dns_resolver/dns_key.c b/net/dns_resolver/dns_key.c index 3e1a90669006..6b201531b165 100644 --- a/net/dns_resolver/dns_key.c +++ b/net/dns_resolver/dns_key.c @@ -46,6 +46,15 @@ const struct cred *dns_resolver_cache; #define DNS_ERRORNO_OPTION "dnserror" +static struct key_acl dns_keyring_acl = { + .usage = REFCOUNT_INIT(1), + .nr_ace = 2, + .aces = { + 
KEY_POSSESSOR_ACE(KEY_ACE_SEARCH | KEY_ACE_WRITE), + KEY_OWNER_ACE(KEY_ACE_VIEW | KEY_ACE_READ | KEY_ACE_CLEAR), + } +}; + /* * Preparse instantiation data for a dns_resolver key. * @@ -343,8 +352,7 @@ static int __init init_dns_resolver(void) keyring = keyring_alloc(".dns_resolver", GLOBAL_ROOT_UID, GLOBAL_ROOT_GID, cred, - (KEY_POS_ALL & ~KEY_POS_SETATTR) | - KEY_USR_VIEW | KEY_USR_READ, + &dns_keyring_acl, KEY_ALLOC_NOT_IN_QUOTA, NULL, NULL); if (IS_ERR(keyring)) { ret = PTR_ERR(keyring); diff --git a/net/dns_resolver/dns_query.c b/net/dns_resolver/dns_query.c index cab4e0df924f..236baf2bfa4c 100644 --- a/net/dns_resolver/dns_query.c +++ b/net/dns_resolver/dns_query.c @@ -47,6 +47,16 @@ #include "internal.h" +static struct key_acl dns_key_acl = { + .usage = REFCOUNT_INIT(1), + .nr_ace = 2, + .possessor_viewable = true, + .aces = { + KEY_POSSESSOR_ACE(KEY_ACE_VIEW | KEY_ACE_SEARCH | KEY_ACE_READ), + KEY_OWNER_ACE(KEY_ACE_VIEW | KEY_ACE_INVAL), + } +}; + /** * dns_query - Query the DNS * @net: The network namespace to operate in. 
@@ -125,7 +135,8 @@ int dns_query(struct net *net, * add_key() to preinstall malicious redirections */ saved_cred = override_creds(dns_resolver_cache); - rkey = request_key_net(&key_type_dns_resolver, desc, net, options); + rkey = request_key_net(&key_type_dns_resolver, desc, net, options, + &dns_key_acl); revert_creds(saved_cred); kfree(desc); if (IS_ERR(rkey)) { @@ -135,8 +146,6 @@ int dns_query(struct net *net, down_read(&rkey->sem); set_bit(KEY_FLAG_ROOT_CAN_INVAL, &rkey->flags); - rkey->perm |= KEY_USR_VIEW; - ret = key_validate(rkey); if (ret < 0) goto put; diff --git a/net/rxrpc/key.c b/net/rxrpc/key.c index 1cc6b0c6cc42..207d621d18c0 100644 --- a/net/rxrpc/key.c +++ b/net/rxrpc/key.c @@ -27,6 +27,14 @@ #include #include "ar-internal.h" +static struct key_acl rxrpc_null_key_acl = { + .usage = REFCOUNT_INIT(1), + .nr_ace = 1, + .aces = { + KEY_POSSESSOR_ACE(KEY_ACE_SEARCH | KEY_ACE_READ), + } +}; + static int rxrpc_vet_description_s(const char *); static int rxrpc_preparse(struct key_preparsed_payload *); static int rxrpc_preparse_s(struct key_preparsed_payload *); @@ -914,7 +922,8 @@ int rxrpc_request_key(struct rxrpc_sock *rx, char __user *optval, int optlen) if (IS_ERR(description)) return PTR_ERR(description); - key = request_key_net(&key_type_rxrpc, description, sock_net(&rx->sk), NULL); + key = request_key_net(&key_type_rxrpc, description, sock_net(&rx->sk), + NULL, NULL); if (IS_ERR(key)) { kfree(description); _leave(" = %ld", PTR_ERR(key)); @@ -945,7 +954,8 @@ int rxrpc_server_keyring(struct rxrpc_sock *rx, char __user *optval, if (IS_ERR(description)) return PTR_ERR(description); - key = request_key_net(&key_type_keyring, description, sock_net(&rx->sk), NULL); + key = request_key_net(&key_type_keyring, description, sock_net(&rx->sk), + NULL, NULL); if (IS_ERR(key)) { kfree(description); _leave(" = %ld", PTR_ERR(key)); @@ -978,7 +988,8 @@ int rxrpc_get_server_data_key(struct rxrpc_connection *conn, _enter(""); key = key_alloc(&key_type_rxrpc, "x", - 
GLOBAL_ROOT_UID, GLOBAL_ROOT_GID, cred, 0, + GLOBAL_ROOT_UID, GLOBAL_ROOT_GID, cred, + &internal_key_acl, KEY_ALLOC_NOT_IN_QUOTA, NULL); if (IS_ERR(key)) { _leave(" = -ENOMEM [alloc %ld]", PTR_ERR(key)); @@ -1026,7 +1037,7 @@ struct key *rxrpc_get_null_key(const char *keyname) key = key_alloc(&key_type_rxrpc, keyname, GLOBAL_ROOT_UID, GLOBAL_ROOT_GID, cred, - KEY_POS_SEARCH, KEY_ALLOC_NOT_IN_QUOTA, NULL); + &rxrpc_null_key_acl, KEY_ALLOC_NOT_IN_QUOTA, NULL); if (IS_ERR(key)) return key; diff --git a/net/wireless/reg.c b/net/wireless/reg.c index 4831ad745f91..298fe91557f7 100644 --- a/net/wireless/reg.c +++ b/net/wireless/reg.c @@ -741,8 +741,7 @@ static void __init load_keys_from_buffer(const u8 *p, unsigned int buflen) key = key_create_or_update(make_key_ref(builtin_regdb_keys, 1), "asymmetric", NULL, p, plen, - ((KEY_POS_ALL & ~KEY_POS_SETATTR) | - KEY_USR_VIEW | KEY_USR_READ), + &internal_key_acl, KEY_ALLOC_NOT_IN_QUOTA | KEY_ALLOC_BUILT_IN | KEY_ALLOC_BYPASS_RESTRICTION); @@ -768,8 +767,7 @@ static int __init load_builtin_regdb_keys(void) builtin_regdb_keys = keyring_alloc(".builtin_regdb_keys", KUIDT_INIT(0), KGIDT_INIT(0), current_cred(), - ((KEY_POS_ALL & ~KEY_POS_SETATTR) | - KEY_USR_VIEW | KEY_USR_READ | KEY_USR_SEARCH), + &internal_keyring_acl, KEY_ALLOC_NOT_IN_QUOTA, NULL, NULL); if (IS_ERR(builtin_regdb_keys)) return PTR_ERR(builtin_regdb_keys); diff --git a/security/integrity/digsig.c b/security/integrity/digsig.c index e19c2eb72c51..3bd2cc28f4f5 100644 --- a/security/integrity/digsig.c +++ b/security/integrity/digsig.c @@ -51,7 +51,8 @@ int integrity_digsig_verify(const unsigned int id, const char *sig, int siglen, if (!keyring[id]) { keyring[id] = - request_key(&key_type_keyring, keyring_name[id], NULL); + request_key(&key_type_keyring, keyring_name[id], + NULL, NULL); if (IS_ERR(keyring[id])) { int err = PTR_ERR(keyring[id]); pr_err("no %s keyring: %d\n", keyring_name[id], err); @@ -73,14 +74,14 @@ int integrity_digsig_verify(const unsigned int id, 
const char *sig, int siglen, return -EOPNOTSUPP; } -static int __integrity_init_keyring(const unsigned int id, key_perm_t perm, +static int __integrity_init_keyring(const unsigned int id, struct key_acl *acl, struct key_restriction *restriction) { const struct cred *cred = current_cred(); int err = 0; keyring[id] = keyring_alloc(keyring_name[id], KUIDT_INIT(0), - KGIDT_INIT(0), cred, perm, + KGIDT_INIT(0), cred, acl, KEY_ALLOC_NOT_IN_QUOTA, restriction, NULL); if (IS_ERR(keyring[id])) { err = PTR_ERR(keyring[id]); @@ -98,10 +99,7 @@ static int __integrity_init_keyring(const unsigned int id, key_perm_t perm, int __init integrity_init_keyring(const unsigned int id) { struct key_restriction *restriction; - key_perm_t perm; - - perm = (KEY_POS_ALL & ~KEY_POS_SETATTR) | KEY_USR_VIEW - | KEY_USR_READ | KEY_USR_SEARCH; + struct key_acl *acl = &internal_keyring_acl; if (id == INTEGRITY_KEYRING_PLATFORM) { restriction = NULL; @@ -116,14 +114,14 @@ int __init integrity_init_keyring(const unsigned int id) return -ENOMEM; restriction->check = restrict_link_to_ima; - perm |= KEY_USR_WRITE; + acl = &internal_writable_keyring_acl; out: - return __integrity_init_keyring(id, perm, restriction); + return __integrity_init_keyring(id, acl, restriction); } -int __init integrity_add_key(const unsigned int id, const void *data, - off_t size, key_perm_t perm) +static int __init integrity_add_key(const unsigned int id, const void *data, + off_t size, struct key_acl *acl) { key_ref_t key; int rc = 0; @@ -132,7 +130,7 @@ int __init integrity_add_key(const unsigned int id, const void *data, return -EINVAL; key = key_create_or_update(make_key_ref(keyring[id], 1), "asymmetric", - NULL, data, size, perm, + NULL, data, size, acl ?: &internal_key_acl, KEY_ALLOC_NOT_IN_QUOTA); if (IS_ERR(key)) { rc = PTR_ERR(key); @@ -152,7 +150,6 @@ int __init integrity_load_x509(const unsigned int id, const char *path) void *data; loff_t size; int rc; - key_perm_t perm; rc = kernel_read_file_from_path(path, 
&data, &size, 0, READING_X509_CERTIFICATE); @@ -161,21 +158,19 @@ int __init integrity_load_x509(const unsigned int id, const char *path) return rc; } - perm = (KEY_POS_ALL & ~KEY_POS_SETATTR) | KEY_USR_VIEW | KEY_USR_READ; - pr_info("Loading X.509 certificate: %s\n", path); - rc = integrity_add_key(id, (const void *)data, size, perm); + rc = integrity_add_key(id, data, size, NULL); vfree(data); return rc; } int __init integrity_load_cert(const unsigned int id, const char *source, - const void *data, size_t len, key_perm_t perm) + const void *data, size_t len, struct key_acl *acl) { if (!data) return -EINVAL; pr_info("Loading X.509 certificate: %s\n", source); - return integrity_add_key(id, data, len, perm); + return integrity_add_key(id, data, len, acl); } diff --git a/security/integrity/digsig_asymmetric.c b/security/integrity/digsig_asymmetric.c index 358f614811e8..a8bd8b2f4fce 100644 --- a/security/integrity/digsig_asymmetric.c +++ b/security/integrity/digsig_asymmetric.c @@ -57,7 +57,7 @@ static struct key *request_asymmetric_key(struct key *keyring, uint32_t keyid) else key = key_ref_to_ptr(kref); } else { - key = request_key(&key_type_asymmetric, name, NULL); + key = request_key(&key_type_asymmetric, name, NULL, NULL); } if (IS_ERR(key)) { diff --git a/security/integrity/evm/evm_crypto.c b/security/integrity/evm/evm_crypto.c index e11564eb645b..304cb0b21f7a 100644 --- a/security/integrity/evm/evm_crypto.c +++ b/security/integrity/evm/evm_crypto.c @@ -356,7 +356,7 @@ int evm_init_key(void) struct encrypted_key_payload *ekp; int rc; - evm_key = request_key(&key_type_encrypted, EVMKEY, NULL); + evm_key = request_key(&key_type_encrypted, EVMKEY, NULL, NULL); if (IS_ERR(evm_key)) return -ENOENT; diff --git a/security/integrity/ima/ima_mok.c b/security/integrity/ima/ima_mok.c index 073ddc9bce5b..ce48303cfacc 100644 --- a/security/integrity/ima/ima_mok.c +++ b/security/integrity/ima/ima_mok.c @@ -21,6 +21,15 @@ #include +static struct key_acl 
integrity_blacklist_keyring_acl = { + .usage = REFCOUNT_INIT(1), + .nr_ace = 2, + .aces = { + KEY_POSSESSOR_ACE(KEY_ACE_SEARCH | KEY_ACE_WRITE), + KEY_OWNER_ACE(KEY_ACE_VIEW | KEY_ACE_READ | KEY_ACE_WRITE | KEY_ACE_SEARCH), + } +}; + struct key *ima_blacklist_keyring; /* @@ -40,9 +49,7 @@ __init int ima_mok_init(void) ima_blacklist_keyring = keyring_alloc(".ima_blacklist", KUIDT_INIT(0), KGIDT_INIT(0), current_cred(), - (KEY_POS_ALL & ~KEY_POS_SETATTR) | - KEY_USR_VIEW | KEY_USR_READ | - KEY_USR_WRITE | KEY_USR_SEARCH, + &integrity_blacklist_keyring_acl, KEY_ALLOC_NOT_IN_QUOTA, restriction, NULL); diff --git a/security/integrity/integrity.h b/security/integrity/integrity.h index 7de59f44cba3..1c50aff6f65a 100644 --- a/security/integrity/integrity.h +++ b/security/integrity/integrity.h @@ -17,6 +17,8 @@ #include #include +struct key_acl; + /* iint action cache flags */ #define IMA_MEASURE 0x00000001 #define IMA_MEASURED 0x00000002 @@ -154,7 +156,7 @@ int integrity_digsig_verify(const unsigned int id, const char *sig, int siglen, int __init integrity_init_keyring(const unsigned int id); int __init integrity_load_x509(const unsigned int id, const char *path); int __init integrity_load_cert(const unsigned int id, const char *source, - const void *data, size_t len, key_perm_t perm); + const void *data, size_t len, struct key_acl *acl); #else static inline int integrity_digsig_verify(const unsigned int id, @@ -172,7 +174,7 @@ static inline int integrity_init_keyring(const unsigned int id) static inline int __init integrity_load_cert(const unsigned int id, const char *source, const void *data, size_t len, - key_perm_t perm) + struct key_acl *acl) { return 0; } diff --git a/security/integrity/platform_certs/platform_keyring.c b/security/integrity/platform_certs/platform_keyring.c index bcafd7387729..7646e35f2d91 100644 --- a/security/integrity/platform_certs/platform_keyring.c +++ b/security/integrity/platform_certs/platform_keyring.c @@ -14,6 +14,15 @@ #include #include 
"../integrity.h" +static struct key_acl platform_key_acl = { + .usage = REFCOUNT_INIT(1), + .nr_ace = 2, + .aces = { + KEY_POSSESSOR_ACE(KEY_ACE_SEARCH | KEY_ACE_READ), + KEY_OWNER_ACE(KEY_ACE_VIEW), + } +}; + /** * add_to_platform_keyring - Add to platform keyring without validation. * @source: Source of key @@ -26,13 +35,10 @@ void __init add_to_platform_keyring(const char *source, const void *data, size_t len) { - key_perm_t perm; int rc; - perm = (KEY_POS_ALL & ~KEY_POS_SETATTR) | KEY_USR_VIEW; - rc = integrity_load_cert(INTEGRITY_KEYRING_PLATFORM, source, data, len, - perm); + &platform_key_acl); if (rc) pr_info("Error adding keys to platform keyring %s\n", source); } diff --git a/security/keys/encrypted-keys/encrypted.c b/security/keys/encrypted-keys/encrypted.c index 1b1456b21a93..dc76c60a27a6 100644 --- a/security/keys/encrypted-keys/encrypted.c +++ b/security/keys/encrypted-keys/encrypted.c @@ -307,7 +307,7 @@ static struct key *request_user_key(const char *master_desc, const u8 **master_k const struct user_key_payload *upayload; struct key *ukey; - ukey = request_key(&key_type_user, master_desc, NULL); + ukey = request_key(&key_type_user, master_desc, NULL, NULL); if (IS_ERR(ukey)) goto error; diff --git a/security/keys/encrypted-keys/masterkey_trusted.c b/security/keys/encrypted-keys/masterkey_trusted.c index dc3d18cae642..3322e7eeafce 100644 --- a/security/keys/encrypted-keys/masterkey_trusted.c +++ b/security/keys/encrypted-keys/masterkey_trusted.c @@ -33,7 +33,7 @@ struct key *request_trusted_key(const char *trusted_desc, struct trusted_key_payload *tpayload; struct key *tkey; - tkey = request_key(&key_type_trusted, trusted_desc, NULL); + tkey = request_key(&key_type_trusted, trusted_desc, NULL, NULL); if (IS_ERR(tkey)) goto error; diff --git a/security/keys/gc.c b/security/keys/gc.c index 83d279fb7793..3b13fb62827f 100644 --- a/security/keys/gc.c +++ b/security/keys/gc.c @@ -155,6 +155,7 @@ static noinline void key_gc_unused_keys(struct list_head 
*keys) key_user_put(key->user); key_put_tag(key->domain_tag); + key_put_acl(rcu_access_pointer(key->acl)); kfree(key->description); memzero_explicit(key, sizeof(*key)); @@ -224,7 +225,6 @@ continue_scanning: if (key->type == key_gc_dead_keytype) { gc_state |= KEY_GC_FOUND_DEAD_KEY; set_bit(KEY_FLAG_DEAD, &key->flags); - key->perm = 0; goto skip_dead_key; } else if (key->type == &key_type_keyring && key->restrict_link) { diff --git a/security/keys/internal.h b/security/keys/internal.h index f1f2b076f3a1..9375d6289bb9 100644 --- a/security/keys/internal.h +++ b/security/keys/internal.h @@ -88,8 +88,11 @@ extern struct rb_root key_serial_tree; extern spinlock_t key_serial_lock; extern struct mutex key_construction_mutex; extern wait_queue_head_t request_key_conswq; +extern struct key_acl default_key_acl; +extern struct key_acl joinable_keyring_acl; extern void key_set_index_key(struct keyring_index_key *index_key); + extern struct key_type *key_type_lookup(const char *type); extern void key_type_put(struct key_type *ktype); @@ -160,6 +163,7 @@ extern struct key *request_key_and_link(struct key_type *type, const void *callout_info, size_t callout_len, void *aux, + struct key_acl *acl, struct key *dest_keyring, unsigned long flags); @@ -183,7 +187,10 @@ extern void key_gc_keytype(struct key_type *ktype); extern int key_task_permission(const key_ref_t key_ref, const struct cred *cred, - key_perm_t perm); + u32 desired_perm); +extern unsigned int key_acl_to_perm(const struct key_acl *acl); +extern long key_set_acl(struct key *key, struct key_acl *acl); +extern void key_put_acl(struct key_acl *acl); /* * Check to see whether permission is granted to use a key in the desired way. 
@@ -230,7 +237,7 @@ extern long keyctl_keyring_search(key_serial_t, const char __user *, const char __user *, key_serial_t); extern long keyctl_read_key(key_serial_t, char __user *, size_t); extern long keyctl_chown_key(key_serial_t, uid_t, gid_t); -extern long keyctl_setperm_key(key_serial_t, key_perm_t); +extern long keyctl_setperm_key(key_serial_t, unsigned int); extern long keyctl_instantiate_key(key_serial_t, const void __user *, size_t, key_serial_t); extern long keyctl_negate_key(key_serial_t, unsigned, key_serial_t); diff --git a/security/keys/key.c b/security/keys/key.c index 85fdc2ea6c14..bb96d6235ea2 100644 --- a/security/keys/key.c +++ b/security/keys/key.c @@ -199,7 +199,7 @@ serial_exists: * @uid: The owner of the new key. * @gid: The group ID for the new key's group permissions. * @cred: The credentials specifying UID namespace. - * @perm: The permissions mask of the new key. + * @acl: The ACL to attach to the new key. * @flags: Flags specifying quota properties. * @restrict_link: Optional link restriction for new keyrings. 
* @@ -227,7 +227,7 @@ serial_exists: */ struct key *key_alloc(struct key_type *type, const char *desc, kuid_t uid, kgid_t gid, const struct cred *cred, - key_perm_t perm, unsigned long flags, + struct key_acl *acl, unsigned long flags, struct key_restriction *restrict_link) { struct key_user *user = NULL; @@ -250,6 +250,9 @@ struct key *key_alloc(struct key_type *type, const char *desc, desclen = strlen(desc); quotalen = desclen + 1 + type->def_datalen; + if (!acl) + acl = &default_key_acl; + /* get hold of the key tracking for this user */ user = key_user_lookup(uid); if (!user) @@ -296,7 +299,8 @@ struct key *key_alloc(struct key_type *type, const char *desc, key->datalen = type->def_datalen; key->uid = uid; key->gid = gid; - key->perm = perm; + refcount_inc(&acl->usage); + rcu_assign_pointer(key->acl, acl); key->restrict_link = restrict_link; key->last_used_at = ktime_get_real_seconds(); @@ -791,7 +795,7 @@ error: * @description: The searchable description for the key. * @payload: The data to use to instantiate or update the key. * @plen: The length of @payload. - * @perm: The permissions mask for a new key. + * @acl: The ACL to attach if a key is created. * @flags: The quota flags for a new key. 
* * Search the destination keyring for a key of the same description and if one @@ -814,7 +818,7 @@ key_ref_t key_create_or_update(key_ref_t keyring_ref, const char *description, const void *payload, size_t plen, - key_perm_t perm, + struct key_acl *acl, unsigned long flags) { struct keyring_index_key index_key = { @@ -911,22 +915,9 @@ key_ref_t key_create_or_update(key_ref_t keyring_ref, goto found_matching_key; } - /* if the client doesn't provide, decide on the permissions we want */ - if (perm == KEY_PERM_UNDEF) { - perm = KEY_POS_VIEW | KEY_POS_SEARCH | KEY_POS_LINK | KEY_POS_SETATTR; - perm |= KEY_USR_VIEW; - - if (index_key.type->read) - perm |= KEY_POS_READ; - - if (index_key.type == &key_type_keyring || - index_key.type->update) - perm |= KEY_POS_WRITE; - } - /* allocate a new key */ key = key_alloc(index_key.type, index_key.description, - cred->fsuid, cred->fsgid, cred, perm, flags, NULL); + cred->fsuid, cred->fsgid, cred, acl, flags, NULL); if (IS_ERR(key)) { key_ref = ERR_CAST(key); goto error_link_end; diff --git a/security/keys/keyctl.c b/security/keys/keyctl.c index d2f8eabcbcf4..c8911b430e59 100644 --- a/security/keys/keyctl.c +++ b/security/keys/keyctl.c @@ -134,8 +134,7 @@ SYSCALL_DEFINE5(add_key, const char __user *, _type, /* create or update the requested key and add it to the target * keyring */ key_ref = key_create_or_update(keyring_ref, type, description, - payload, plen, KEY_PERM_UNDEF, - KEY_ALLOC_IN_QUOTA); + payload, plen, NULL, KEY_ALLOC_IN_QUOTA); if (!IS_ERR(key_ref)) { ret = key_ref_to_ptr(key_ref)->serial; key_ref_put(key_ref); @@ -225,7 +224,8 @@ SYSCALL_DEFINE4(request_key, const char __user *, _type, /* do the search */ key = request_key_and_link(ktype, description, NULL, callout_info, - callout_len, NULL, key_ref_to_ptr(dest_ref), + callout_len, NULL, NULL, + key_ref_to_ptr(dest_ref), KEY_ALLOC_IN_QUOTA); if (IS_ERR(key)) { ret = PTR_ERR(key); @@ -387,16 +387,10 @@ long keyctl_revoke_key(key_serial_t id) struct key *key; long 
ret; - key_ref = lookup_user_key(id, 0, KEY_NEED_WRITE); + key_ref = lookup_user_key(id, 0, KEY_NEED_REVOKE); if (IS_ERR(key_ref)) { ret = PTR_ERR(key_ref); - if (ret != -EACCES) - goto error; - key_ref = lookup_user_key(id, 0, KEY_NEED_SETATTR); - if (IS_ERR(key_ref)) { - ret = PTR_ERR(key_ref); - goto error; - } + goto error; } key = key_ref_to_ptr(key_ref); @@ -430,7 +424,7 @@ long keyctl_invalidate_key(key_serial_t id) kenter("%d", id); - key_ref = lookup_user_key(id, 0, KEY_NEED_SEARCH); + key_ref = lookup_user_key(id, 0, KEY_NEED_INVAL); if (IS_ERR(key_ref)) { ret = PTR_ERR(key_ref); @@ -475,7 +469,7 @@ long keyctl_keyring_clear(key_serial_t ringid) struct key *keyring; long ret; - keyring_ref = lookup_user_key(ringid, KEY_LOOKUP_CREATE, KEY_NEED_WRITE); + keyring_ref = lookup_user_key(ringid, KEY_LOOKUP_CREATE, KEY_NEED_CLEAR); if (IS_ERR(keyring_ref)) { ret = PTR_ERR(keyring_ref); @@ -650,6 +644,7 @@ long keyctl_describe_key(key_serial_t keyid, size_t buflen) { struct key *key, *instkey; + unsigned int perm; key_ref_t key_ref; char *infobuf; long ret; @@ -679,6 +674,10 @@ okay: key = key_ref_to_ptr(key_ref); desclen = strlen(key->description); + rcu_read_lock(); + perm = key_acl_to_perm(rcu_dereference(key->acl)); + rcu_read_unlock(); + /* calculate how much information we're going to return */ ret = -ENOMEM; infobuf = kasprintf(GFP_KERNEL, @@ -686,7 +685,7 @@ okay: key->type->name, from_kuid_munged(current_user_ns(), key->uid), from_kgid_munged(current_user_ns(), key->gid), - key->perm); + perm); if (!infobuf) goto error2; infolen = strlen(infobuf); @@ -903,7 +902,7 @@ long keyctl_chown_key(key_serial_t id, uid_t user, gid_t group) goto error; key_ref = lookup_user_key(id, KEY_LOOKUP_CREATE | KEY_LOOKUP_PARTIAL, - KEY_NEED_SETATTR); + KEY_NEED_SETSEC); if (IS_ERR(key_ref)) { ret = PTR_ERR(key_ref); goto error; @@ -998,18 +997,25 @@ quota_overrun: * the key need not be fully instantiated yet. 
If the caller does not have * sysadmin capability, it may only change the permission on keys that it owns. */ -long keyctl_setperm_key(key_serial_t id, key_perm_t perm) +long keyctl_setperm_key(key_serial_t id, unsigned int perm) { + struct key_acl *acl; struct key *key; key_ref_t key_ref; long ret; + int nr, i, j; - ret = -EINVAL; if (perm & ~(KEY_POS_ALL | KEY_USR_ALL | KEY_GRP_ALL | KEY_OTH_ALL)) - goto error; + return -EINVAL; + + nr = 0; + if (perm & KEY_POS_ALL) nr++; + if (perm & KEY_USR_ALL) nr++; + if (perm & KEY_GRP_ALL) nr++; + if (perm & KEY_OTH_ALL) nr++; key_ref = lookup_user_key(id, KEY_LOOKUP_CREATE | KEY_LOOKUP_PARTIAL, - KEY_NEED_SETATTR); + KEY_NEED_SETSEC); if (IS_ERR(key_ref)) { ret = PTR_ERR(key_ref); goto error; @@ -1017,17 +1023,45 @@ long keyctl_setperm_key(key_serial_t id, key_perm_t perm) key = key_ref_to_ptr(key_ref); - /* make the changes with the locks held to prevent chown/chmod races */ - ret = -EACCES; - down_write(&key->sem); + ret = -EOPNOTSUPP; + if (test_bit(KEY_FLAG_HAS_ACL, &key->flags)) + goto error_key; - /* if we're not the sysadmin, we can only change a key that we own */ - if (capable(CAP_SYS_ADMIN) || uid_eq(key->uid, current_fsuid())) { - key->perm = perm; - ret = 0; + ret = -ENOMEM; + acl = kzalloc(struct_size(acl, aces, nr), GFP_KERNEL); + if (!acl) + goto error_key; + + refcount_set(&acl->usage, 1); + acl->nr_ace = nr; + j = 0; + for (i = 0; i < 4; i++) { + struct key_ace *ace = &acl->aces[j]; + unsigned int subset = (perm >> (i * 8)) & KEY_OTH_ALL; + + if (!subset) + continue; + ace->type = KEY_ACE_SUBJ_STANDARD; + ace->subject_id = KEY_ACE_EVERYONE + i; + ace->perm = subset; + if (subset & (KEY_OTH_WRITE | KEY_OTH_SETATTR)) + ace->perm |= KEY_ACE_REVOKE; + if (subset & KEY_OTH_SEARCH) + ace->perm |= KEY_ACE_INVAL; + if (key->type == &key_type_keyring) { + if (subset & KEY_OTH_SEARCH) + ace->perm |= KEY_ACE_JOIN; + if (subset & KEY_OTH_WRITE) + ace->perm |= KEY_ACE_CLEAR; + } + j++; } + /* make the changes with the 
locks held to prevent chown/chmod races */ + down_write(&key->sem); + ret = key_set_acl(key, acl); up_write(&key->sem); +error_key: key_put(key); error: return ret; @@ -1392,7 +1426,7 @@ long keyctl_set_timeout(key_serial_t id, unsigned timeout) long ret; key_ref = lookup_user_key(id, KEY_LOOKUP_CREATE | KEY_LOOKUP_PARTIAL, - KEY_NEED_SETATTR); + KEY_NEED_SETSEC); if (IS_ERR(key_ref)) { /* setting the timeout on a key under construction is permitted * if we have the authorisation token handy */ @@ -1543,7 +1577,7 @@ long keyctl_get_security(key_serial_t keyid, * Attempt to install the calling process's session keyring on the process's * parent process. * - * The keyring must exist and must grant the caller LINK permission, and the + * The keyring must exist and must grant the caller JOIN permission, and the * parent process must be single-threaded and must have the same effective * ownership as this process and mustn't be SUID/SGID. * @@ -1560,7 +1594,7 @@ long keyctl_session_to_parent(void) struct cred *cred; int ret; - keyring_r = lookup_user_key(KEY_SPEC_SESSION_KEYRING, 0, KEY_NEED_LINK); + keyring_r = lookup_user_key(KEY_SPEC_SESSION_KEYRING, 0, KEY_NEED_JOIN); if (IS_ERR(keyring_r)) return PTR_ERR(keyring_r); @@ -1662,7 +1696,7 @@ long keyctl_restrict_keyring(key_serial_t id, const char __user *_type, char *restriction = NULL; long ret; - key_ref = lookup_user_key(id, 0, KEY_NEED_SETATTR); + key_ref = lookup_user_key(id, 0, KEY_NEED_SETSEC); if (IS_ERR(key_ref)) return PTR_ERR(key_ref); @@ -1768,7 +1802,7 @@ SYSCALL_DEFINE5(keyctl, int, option, unsigned long, arg2, unsigned long, arg3, case KEYCTL_SETPERM: return keyctl_setperm_key((key_serial_t) arg2, - (key_perm_t) arg3); + (unsigned int)arg3); case KEYCTL_INSTANTIATE: return keyctl_instantiate_key((key_serial_t) arg2, diff --git a/security/keys/keyring.c b/security/keys/keyring.c index 29c31585ed61..62fb26c61968 100644 --- a/security/keys/keyring.c +++ b/security/keys/keyring.c @@ -519,11 +519,19 @@ static 
long keyring_read(const struct key *keyring, return ret; } -/* - * Allocate a keyring and link into the destination keyring. +/** + * keyring_alloc - Allocate a keyring and link into the destination + * @description: The key description to allow the key to be searched out. + * @uid: The owner of the new key. + * @gid: The group ID for the new key's group permissions. + * @cred: The credentials specifying UID namespace. + * @acl: The ACL to attach to the new key. + * @flags: Flags specifying quota properties. + * @restrict_link: Optional link restriction for new keyrings. + * @dest: Destination keyring. */ struct key *keyring_alloc(const char *description, kuid_t uid, kgid_t gid, - const struct cred *cred, key_perm_t perm, + const struct cred *cred, struct key_acl *acl, unsigned long flags, struct key_restriction *restrict_link, struct key *dest) @@ -532,7 +540,7 @@ struct key *keyring_alloc(const char *description, kuid_t uid, kgid_t gid, int ret; keyring = key_alloc(&key_type_keyring, description, - uid, gid, cred, perm, flags, restrict_link); + uid, gid, cred, acl, flags, restrict_link); if (!IS_ERR(keyring)) { ret = key_instantiate_and_link(keyring, NULL, 0, dest, NULL); if (ret < 0) { @@ -1136,10 +1144,11 @@ found: /* * Find a keyring with the specified name. * - * Only keyrings that have nonzero refcount, are not revoked, and are owned by a - * user in the current user namespace are considered. If @uid_keyring is %true, - * the keyring additionally must have been allocated as a user or user session - * keyring; otherwise, it must grant Search permission directly to the caller. + * Only keyrings that have nonzero refcount, are not revoked, and are owned by + * a user in the current user namespace are considered. If @uid_keyring is + * %true, the keyring additionally must have been allocated as a user or user + * session keyring; otherwise, it must grant JOIN permission directly to the + * caller (ie. not through possession). 
* * Returns a pointer to the keyring with the keyring's refcount having being * incremented on success. -ENOKEY is returned if a key could not be found. @@ -1173,7 +1182,7 @@ struct key *find_keyring_by_name(const char *name, bool uid_keyring) continue; } else { if (key_permission(make_key_ref(keyring, 0), - KEY_NEED_SEARCH) < 0) + KEY_NEED_JOIN) < 0) continue; } diff --git a/security/keys/permission.c b/security/keys/permission.c index 06df9d5e7572..e3237bb2e970 100644 --- a/security/keys/permission.c +++ b/security/keys/permission.c @@ -11,13 +11,67 @@ #include #include +#include +#include #include "internal.h" +struct key_acl default_key_acl = { + .usage = REFCOUNT_INIT(1), + .nr_ace = 2, + .possessor_viewable = true, + .aces = { + KEY_POSSESSOR_ACE(KEY_ACE__PERMS & ~KEY_ACE_JOIN), + KEY_OWNER_ACE(KEY_ACE_VIEW), + } +}; +EXPORT_SYMBOL(default_key_acl); + +struct key_acl joinable_keyring_acl = { + .usage = REFCOUNT_INIT(1), + .nr_ace = 2, + .possessor_viewable = true, + .aces = { + KEY_POSSESSOR_ACE(KEY_ACE__PERMS & ~KEY_ACE_JOIN), + KEY_OWNER_ACE(KEY_ACE_VIEW | KEY_ACE_READ | KEY_ACE_LINK | KEY_ACE_JOIN), + } +}; +EXPORT_SYMBOL(joinable_keyring_acl); + +struct key_acl internal_key_acl = { + .usage = REFCOUNT_INIT(1), + .nr_ace = 2, + .aces = { + KEY_POSSESSOR_ACE(KEY_ACE_SEARCH), + KEY_OWNER_ACE(KEY_ACE_VIEW | KEY_ACE_READ | KEY_ACE_SEARCH), + } +}; +EXPORT_SYMBOL(internal_key_acl); + +struct key_acl internal_keyring_acl = { + .usage = REFCOUNT_INIT(1), + .nr_ace = 2, + .aces = { + KEY_POSSESSOR_ACE(KEY_ACE_SEARCH), + KEY_OWNER_ACE(KEY_ACE_VIEW | KEY_ACE_READ | KEY_ACE_SEARCH), + } +}; +EXPORT_SYMBOL(internal_keyring_acl); + +struct key_acl internal_writable_keyring_acl = { + .usage = REFCOUNT_INIT(1), + .nr_ace = 2, + .aces = { + KEY_POSSESSOR_ACE(KEY_ACE_SEARCH | KEY_ACE_WRITE), + KEY_OWNER_ACE(KEY_ACE_VIEW | KEY_ACE_READ | KEY_ACE_WRITE | KEY_ACE_SEARCH), + } +}; +EXPORT_SYMBOL(internal_writable_keyring_acl); + /** * key_task_permission - Check a key can be 
used * @key_ref: The key to check. * @cred: The credentials to use. - * @perm: The permissions to check for. + * @desired_perm: The permission to check for. * * Check to see whether permission is granted to use a key in the desired way, * but permit the security modules to override. @@ -28,53 +82,73 @@ * permissions bits or the LSM check. */ int key_task_permission(const key_ref_t key_ref, const struct cred *cred, - unsigned perm) + unsigned int desired_perm) { - struct key *key; - key_perm_t kperm; - int ret; + const struct key_acl *acl; + const struct key *key; + unsigned int allow = 0; + int i; + + BUILD_BUG_ON(KEY_NEED_VIEW != KEY_ACE_VIEW || + KEY_NEED_READ != KEY_ACE_READ || + KEY_NEED_WRITE != KEY_ACE_WRITE || + KEY_NEED_SEARCH != KEY_ACE_SEARCH || + KEY_NEED_LINK != KEY_ACE_LINK || + KEY_NEED_SETSEC != KEY_ACE_SET_SECURITY || + KEY_NEED_INVAL != KEY_ACE_INVAL || + KEY_NEED_REVOKE != KEY_ACE_REVOKE || + KEY_NEED_JOIN != KEY_ACE_JOIN || + KEY_NEED_CLEAR != KEY_ACE_CLEAR); key = key_ref_to_ptr(key_ref); - /* use the second 8-bits of permissions for keys the caller owns */ - if (uid_eq(key->uid, cred->fsuid)) { - kperm = key->perm >> 16; - goto use_these_perms; - } + rcu_read_lock(); - /* use the third 8-bits of permissions for keys the caller has a group - * membership in common with */ - if (gid_valid(key->gid) && key->perm & KEY_GRP_ALL) { - if (gid_eq(key->gid, cred->fsgid)) { - kperm = key->perm >> 8; - goto use_these_perms; - } + acl = rcu_dereference(key->acl); + if (!acl || acl->nr_ace == 0) + goto no_access_rcu; + + for (i = 0; i < acl->nr_ace; i++) { + const struct key_ace *ace = &acl->aces[i]; - ret = groups_search(cred->group_info, key->gid); - if (ret) { - kperm = key->perm >> 8; - goto use_these_perms; + switch (ace->type) { + case KEY_ACE_SUBJ_STANDARD: + switch (ace->subject_id) { + case KEY_ACE_POSSESSOR: + if (is_key_possessed(key_ref)) + allow |= ace->perm; + break; + case KEY_ACE_OWNER: + if (uid_eq(key->uid, cred->fsuid)) + allow |= 
ace->perm; + break; + case KEY_ACE_GROUP: + if (gid_valid(key->gid)) { + if (gid_eq(key->gid, cred->fsgid)) + allow |= ace->perm; + else if (groups_search(cred->group_info, key->gid)) + allow |= ace->perm; + } + break; + case KEY_ACE_EVERYONE: + allow |= ace->perm; + break; + } + break; } } - /* otherwise use the least-significant 8-bits */ - kperm = key->perm; - -use_these_perms: + rcu_read_unlock(); - /* use the top 8-bits of permissions for keys the caller possesses - * - possessor permissions are additive with other permissions - */ - if (is_key_possessed(key_ref)) - kperm |= key->perm >> 24; + if (!(allow & desired_perm)) + goto no_access; - kperm = kperm & perm & KEY_NEED_ALL; + return security_key_permission(key_ref, cred, desired_perm); - if (kperm != perm) - return -EACCES; - - /* let LSM be the final arbiter */ - return security_key_permission(key_ref, cred, perm); +no_access_rcu: + rcu_read_unlock(); +no_access: + return -EACCES; } EXPORT_SYMBOL(key_task_permission); @@ -108,3 +182,99 @@ int key_validate(const struct key *key) return 0; } EXPORT_SYMBOL(key_validate); + +/* + * Roughly render an ACL to an old-style permissions mask. We cannot + * accurately render what the ACL, particularly if it has ACEs that represent + * subjects outside of { poss, user, group, other }. 
+ */ +unsigned int key_acl_to_perm(const struct key_acl *acl) +{ + unsigned int perm = 0, tperm; + int i; + + BUILD_BUG_ON(KEY_OTH_VIEW != KEY_ACE_VIEW || + KEY_OTH_READ != KEY_ACE_READ || + KEY_OTH_WRITE != KEY_ACE_WRITE || + KEY_OTH_SEARCH != KEY_ACE_SEARCH || + KEY_OTH_LINK != KEY_ACE_LINK || + KEY_OTH_SETATTR != KEY_ACE_SET_SECURITY); + + if (!acl || acl->nr_ace == 0) + return 0; + + for (i = 0; i < acl->nr_ace; i++) { + const struct key_ace *ace = &acl->aces[i]; + + switch (ace->type) { + case KEY_ACE_SUBJ_STANDARD: + tperm = ace->perm & KEY_OTH_ALL; + + /* Invalidation and joining were allowed by SEARCH */ + if (ace->perm & (KEY_ACE_INVAL | KEY_ACE_JOIN)) + tperm |= KEY_OTH_SEARCH; + + /* Revocation was allowed by either SETATTR or WRITE */ + if ((ace->perm & KEY_ACE_REVOKE) && !(tperm & KEY_OTH_SETATTR)) + tperm |= KEY_OTH_WRITE; + + /* Clearing was allowed by WRITE */ + if (ace->perm & KEY_ACE_CLEAR) + tperm |= KEY_OTH_WRITE; + + switch (ace->subject_id) { + case KEY_ACE_POSSESSOR: + perm |= tperm << 24; + break; + case KEY_ACE_OWNER: + perm |= tperm << 16; + break; + case KEY_ACE_GROUP: + perm |= tperm << 8; + break; + case KEY_ACE_EVERYONE: + perm |= tperm << 0; + break; + } + } + } + + return perm; +} + +/* + * Destroy a key's ACL. + */ +void key_put_acl(struct key_acl *acl) +{ + if (acl && refcount_dec_and_test(&acl->usage)) + kfree_rcu(acl, rcu); +} + +/* + * Try to set the ACL. This either attaches or discards the proposed ACL. + */ +long key_set_acl(struct key *key, struct key_acl *acl) +{ + int i; + + /* If we're not the sysadmin, we can only change a key that we own. 
*/ + if (!capable(CAP_SYS_ADMIN) && !uid_eq(key->uid, current_fsuid())) { + key_put_acl(acl); + return -EACCES; + } + + for (i = 0; i < acl->nr_ace; i++) { + const struct key_ace *ace = &acl->aces[i]; + if (ace->type == KEY_ACE_SUBJ_STANDARD && + ace->subject_id == KEY_ACE_POSSESSOR) { + if (ace->perm & KEY_ACE_VIEW) + acl->possessor_viewable = true; + break; + } + } + + rcu_swap_protected(key->acl, acl, lockdep_is_held(&key->sem)); + key_put_acl(acl); + return 0; +} diff --git a/security/keys/persistent.c b/security/keys/persistent.c index 9944d855a28d..c4c480f630ea 100644 --- a/security/keys/persistent.c +++ b/security/keys/persistent.c @@ -16,6 +16,27 @@ unsigned persistent_keyring_expiry = 3 * 24 * 3600; /* Expire after 3 days of non-use */ +static struct key_acl persistent_register_keyring_acl = { + .usage = REFCOUNT_INIT(1), + .nr_ace = 2, + .aces = { + KEY_POSSESSOR_ACE(KEY_ACE_SEARCH | KEY_ACE_WRITE), + KEY_OWNER_ACE(KEY_ACE_VIEW | KEY_ACE_READ), + } +}; + +static struct key_acl persistent_keyring_acl = { + .usage = REFCOUNT_INIT(1), + .nr_ace = 2, + .possessor_viewable = true, + .aces = { + KEY_POSSESSOR_ACE(KEY_ACE_VIEW | KEY_ACE_READ | KEY_ACE_WRITE | + KEY_ACE_SEARCH | KEY_ACE_LINK | + KEY_ACE_CLEAR | KEY_ACE_INVAL), + KEY_OWNER_ACE(KEY_ACE_VIEW | KEY_ACE_READ), + } +}; + /* * Create the persistent keyring register for the current user namespace. 
* @@ -26,8 +47,7 @@ static int key_create_persistent_register(struct user_namespace *ns) struct key *reg = keyring_alloc(".persistent_register", KUIDT_INIT(0), KGIDT_INIT(0), current_cred(), - ((KEY_POS_ALL & ~KEY_POS_SETATTR) | - KEY_USR_VIEW | KEY_USR_READ), + &persistent_register_keyring_acl, KEY_ALLOC_NOT_IN_QUOTA, NULL, NULL); if (IS_ERR(reg)) return PTR_ERR(reg); @@ -60,8 +80,7 @@ static key_ref_t key_create_persistent(struct user_namespace *ns, kuid_t uid, persistent = keyring_alloc(index_key->description, uid, INVALID_GID, current_cred(), - ((KEY_POS_ALL & ~KEY_POS_SETATTR) | - KEY_USR_VIEW | KEY_USR_READ), + &persistent_keyring_acl, KEY_ALLOC_NOT_IN_QUOTA, NULL, ns->persistent_keyring_register); if (IS_ERR(persistent)) diff --git a/security/keys/proc.c b/security/keys/proc.c index b4f5ba56b9cb..0056fe2dc39b 100644 --- a/security/keys/proc.c +++ b/security/keys/proc.c @@ -114,11 +114,13 @@ static struct key *find_ge_key(struct seq_file *p, key_serial_t id) } static void *proc_keys_start(struct seq_file *p, loff_t *_pos) + __acquires(rcu) __acquires(key_serial_lock) { key_serial_t pos = *_pos; struct key *key; + rcu_read_lock(); spin_lock(&key_serial_lock); if (*_pos > INT_MAX) @@ -148,12 +150,15 @@ static void *proc_keys_next(struct seq_file *p, void *v, loff_t *_pos) static void proc_keys_stop(struct seq_file *p, void *v) __releases(key_serial_lock) + __releases(rcu) { spin_unlock(&key_serial_lock); + rcu_read_unlock(); } static int proc_keys_show(struct seq_file *m, void *v) { + const struct key_acl *acl; struct rb_node *_p = v; struct key *key = rb_entry(_p, struct key, serial_node); unsigned long flags; @@ -161,6 +166,7 @@ static int proc_keys_show(struct seq_file *m, void *v) time64_t now, expiry; char xbuf[16]; short state; + bool check_pos; u64 timo; int rc; @@ -174,15 +180,15 @@ static int proc_keys_show(struct seq_file *m, void *v) KEYRING_SEARCH_RECURSE), }; - key_ref = make_key_ref(key, 0); + acl = rcu_dereference(key->acl); + check_pos = 
acl->possessor_viewable; /* determine if the key is possessed by this process (a test we can * skip if the key does not indicate the possessor can view it */ - if (key->perm & KEY_POS_VIEW) { - rcu_read_lock(); + key_ref = make_key_ref(key, 0); + if (check_pos) { skey_ref = search_cred_keyrings_rcu(&ctx); - rcu_read_unlock(); if (!IS_ERR(skey_ref)) { key_ref_put(skey_ref); key_ref = make_key_ref(key, 1); @@ -192,12 +198,10 @@ static int proc_keys_show(struct seq_file *m, void *v) /* check whether the current task is allowed to view the key */ rc = key_task_permission(key_ref, ctx.cred, KEY_NEED_VIEW); if (rc < 0) - return 0; + goto out; now = ktime_get_real_seconds(); - rcu_read_lock(); - /* come up with a suitable timeout value */ expiry = READ_ONCE(key->expiry); if (expiry == 0) { @@ -236,7 +240,7 @@ static int proc_keys_show(struct seq_file *m, void *v) showflag(flags, 'i', KEY_FLAG_INVALIDATED), refcount_read(&key->usage), xbuf, - key->perm, + key_acl_to_perm(acl), from_kuid_munged(seq_user_ns(m), key->uid), from_kgid_munged(seq_user_ns(m), key->gid), key->type->name); @@ -247,7 +251,7 @@ static int proc_keys_show(struct seq_file *m, void *v) key->type->describe(key, m); seq_putc(m, '\n'); - rcu_read_unlock(); +out: return 0; } diff --git a/security/keys/process_keys.c b/security/keys/process_keys.c index f74d64215942..ddda8544630d 100644 --- a/security/keys/process_keys.c +++ b/security/keys/process_keys.c @@ -36,6 +36,47 @@ struct key_user root_key_user = { .uid = GLOBAL_ROOT_UID, }; +static struct key_acl user_reg_keyring_acl = { + .usage = REFCOUNT_INIT(1), + .possessor_viewable = true, + .nr_ace = 2, + .aces = { + KEY_POSSESSOR_ACE(KEY_ACE_WRITE | KEY_ACE_SEARCH), + KEY_OWNER_ACE(KEY_ACE_VIEW | KEY_ACE_READ), + } +}; + +static struct key_acl user_keyring_acl = { + .usage = REFCOUNT_INIT(1), + .possessor_viewable = true, + .nr_ace = 2, + .aces = { + KEY_POSSESSOR_ACE(KEY_ACE_VIEW | KEY_ACE_READ | KEY_ACE_WRITE | + KEY_ACE_SEARCH | KEY_ACE_LINK), + 
KEY_OWNER_ACE(KEY_ACE__PERMS & ~(KEY_ACE_JOIN | KEY_ACE_SET_SECURITY)), + } +}; + +static struct key_acl session_keyring_acl = { + .usage = REFCOUNT_INIT(1), + .possessor_viewable = true, + .nr_ace = 2, + .aces = { + KEY_POSSESSOR_ACE(KEY_ACE__PERMS & ~KEY_ACE_JOIN), + KEY_OWNER_ACE(KEY_ACE_VIEW | KEY_ACE_READ), + } +}; + +static struct key_acl thread_and_process_keyring_acl = { + .usage = REFCOUNT_INIT(1), + .possessor_viewable = true, + .nr_ace = 2, + .aces = { + KEY_POSSESSOR_ACE(KEY_ACE__PERMS & ~(KEY_ACE_JOIN | KEY_ACE_SET_SECURITY)), + KEY_OWNER_ACE(KEY_ACE_VIEW), + } +}; + /* * Get or create a user register keyring. */ @@ -55,11 +96,8 @@ static struct key *get_user_register(struct user_namespace *user_ns) if (!reg_keyring) { reg_keyring = keyring_alloc(".user_reg", user_ns->owner, INVALID_GID, - &init_cred, - KEY_POS_WRITE | KEY_POS_SEARCH | - KEY_USR_VIEW | KEY_USR_READ, - 0, - NULL, NULL); + &init_cred, &user_reg_keyring_acl, + 0, NULL, NULL); if (!IS_ERR(reg_keyring)) smp_store_release(&user_ns->user_keyring_register, reg_keyring); @@ -81,14 +119,11 @@ int look_up_user_keyrings(struct key **_user_keyring, const struct cred *cred = current_cred(); struct user_namespace *user_ns = current_user_ns(); struct key *reg_keyring, *uid_keyring, *session_keyring; - key_perm_t user_keyring_perm; key_ref_t uid_keyring_r, session_keyring_r; uid_t uid = from_kuid(user_ns, cred->user->uid); char buf[20]; int ret; - user_keyring_perm = (KEY_POS_ALL & ~KEY_POS_SETATTR) | KEY_USR_ALL; - kenter("%u", uid); reg_keyring = get_user_register(user_ns); @@ -108,7 +143,7 @@ int look_up_user_keyrings(struct key **_user_keyring, kdebug("_uid %p", uid_keyring_r); if (uid_keyring_r == ERR_PTR(-EAGAIN)) { uid_keyring = keyring_alloc(buf, cred->user->uid, INVALID_GID, - cred, user_keyring_perm, + cred, &user_keyring_acl, KEY_ALLOC_UID_KEYRING | KEY_ALLOC_IN_QUOTA, NULL, reg_keyring); @@ -130,7 +165,7 @@ int look_up_user_keyrings(struct key **_user_keyring, kdebug("_uid_ses %p", 
session_keyring_r); if (session_keyring_r == ERR_PTR(-EAGAIN)) { session_keyring = keyring_alloc(buf, cred->user->uid, INVALID_GID, - cred, user_keyring_perm, + cred, &user_keyring_acl, KEY_ALLOC_UID_KEYRING | KEY_ALLOC_IN_QUOTA, NULL, NULL); @@ -230,7 +265,7 @@ int install_thread_keyring_to_cred(struct cred *new) return 0; keyring = keyring_alloc("_tid", new->uid, new->gid, new, - KEY_POS_ALL | KEY_USR_VIEW, + &thread_and_process_keyring_acl, KEY_ALLOC_QUOTA_OVERRUN, NULL, NULL); if (IS_ERR(keyring)) @@ -277,7 +312,7 @@ int install_process_keyring_to_cred(struct cred *new) return 0; keyring = keyring_alloc("_pid", new->uid, new->gid, new, - KEY_POS_ALL | KEY_USR_VIEW, + &thread_and_process_keyring_acl, KEY_ALLOC_QUOTA_OVERRUN, NULL, NULL); if (IS_ERR(keyring)) @@ -332,8 +367,7 @@ int install_session_keyring_to_cred(struct cred *cred, struct key *keyring) flags = KEY_ALLOC_IN_QUOTA; keyring = keyring_alloc("_ses", cred->uid, cred->gid, cred, - KEY_POS_ALL | KEY_USR_VIEW | KEY_USR_READ, - flags, NULL, NULL); + &session_keyring_acl, flags, NULL, NULL); if (IS_ERR(keyring)) return PTR_ERR(keyring); } else { @@ -613,7 +647,7 @@ bool lookup_user_key_possessed(const struct key *key, * returned key reference. 
*/ key_ref_t lookup_user_key(key_serial_t id, unsigned long lflags, - key_perm_t perm) + unsigned int desired_perm) { struct keyring_search_context ctx = { .match_data.cmp = lookup_user_key_possessed, @@ -788,12 +822,12 @@ try_again: case -ERESTARTSYS: goto invalid_key; default: - if (perm) + if (desired_perm) goto invalid_key; case 0: break; } - } else if (perm) { + } else if (desired_perm) { ret = key_validate(key); if (ret < 0) goto invalid_key; @@ -805,9 +839,11 @@ try_again: goto invalid_key; /* check the permissions */ - ret = key_task_permission(key_ref, ctx.cred, perm); - if (ret < 0) - goto invalid_key; + if (desired_perm) { + ret = key_task_permission(key_ref, ctx.cred, desired_perm); + if (ret < 0) + goto invalid_key; + } key->last_used_at = ktime_get_real_seconds(); @@ -872,13 +908,13 @@ long join_session_keyring(const char *name) if (PTR_ERR(keyring) == -ENOKEY) { /* not found - try and create a new one */ keyring = keyring_alloc( - name, old->uid, old->gid, old, - KEY_POS_ALL | KEY_USR_VIEW | KEY_USR_READ | KEY_USR_LINK, + name, old->uid, old->gid, old, &joinable_keyring_acl, KEY_ALLOC_IN_QUOTA, NULL, NULL); if (IS_ERR(keyring)) { ret = PTR_ERR(keyring); goto error2; } + goto no_perm_test; } else if (IS_ERR(keyring)) { ret = PTR_ERR(keyring); goto error2; @@ -887,6 +923,12 @@ long join_session_keyring(const char *name) goto error3; } + ret = key_task_permission(make_key_ref(keyring, false), old, + KEY_NEED_JOIN); + if (ret < 0) + goto error3; + +no_perm_test: /* we've got a keyring - now to install it */ ret = install_session_keyring_to_cred(new, keyring); if (ret < 0) diff --git a/security/keys/request_key.c b/security/keys/request_key.c index aa589d3c90e2..64af697a9126 100644 --- a/security/keys/request_key.c +++ b/security/keys/request_key.c @@ -139,8 +139,7 @@ static int call_sbin_request_key(struct key *authkey, void *aux) cred = get_current_cred(); keyring = keyring_alloc(desc, cred->fsuid, cred->fsgid, cred, - KEY_POS_ALL | KEY_USR_VIEW | 
KEY_USR_READ, - KEY_ALLOC_QUOTA_OVERRUN, NULL, NULL); + NULL, KEY_ALLOC_QUOTA_OVERRUN, NULL, NULL); put_cred(cred); if (IS_ERR(keyring)) { ret = PTR_ERR(keyring); @@ -371,11 +370,11 @@ static int construct_alloc_key(struct keyring_search_context *ctx, struct key *dest_keyring, unsigned long flags, struct key_user *user, + struct key_acl *acl, struct key **_key) { struct assoc_array_edit *edit = NULL; struct key *key; - key_perm_t perm; key_ref_t key_ref; int ret; @@ -385,17 +384,9 @@ static int construct_alloc_key(struct keyring_search_context *ctx, *_key = NULL; mutex_lock(&user->cons_lock); - perm = KEY_POS_VIEW | KEY_POS_SEARCH | KEY_POS_LINK | KEY_POS_SETATTR; - perm |= KEY_USR_VIEW; - if (ctx->index_key.type->read) - perm |= KEY_POS_READ; - if (ctx->index_key.type == &key_type_keyring || - ctx->index_key.type->update) - perm |= KEY_POS_WRITE; - key = key_alloc(ctx->index_key.type, ctx->index_key.description, ctx->cred->fsuid, ctx->cred->fsgid, ctx->cred, - perm, flags, NULL); + acl, flags, NULL); if (IS_ERR(key)) goto alloc_failed; @@ -478,6 +469,7 @@ static struct key *construct_key_and_link(struct keyring_search_context *ctx, const char *callout_info, size_t callout_len, void *aux, + struct key_acl *acl, struct key *dest_keyring, unsigned long flags) { @@ -500,7 +492,7 @@ static struct key *construct_key_and_link(struct keyring_search_context *ctx, goto error_put_dest_keyring; } - ret = construct_alloc_key(ctx, dest_keyring, flags, user, &key); + ret = construct_alloc_key(ctx, dest_keyring, flags, user, acl, &key); key_user_put(user); if (ret == 0) { @@ -538,6 +530,7 @@ error: * @callout_info: The data to pass to the instantiation upcall (or NULL). * @callout_len: The length of callout_info. * @aux: Auxiliary data for the upcall. + * @acl: The ACL to attach if a new key is created. * @dest_keyring: Where to cache the key. * @flags: Flags to key_alloc(). 
* @@ -565,6 +558,7 @@ struct key *request_key_and_link(struct key_type *type, const void *callout_info, size_t callout_len, void *aux, + struct key_acl *acl, struct key *dest_keyring, unsigned long flags) { @@ -639,7 +633,7 @@ struct key *request_key_and_link(struct key_type *type, goto error_free; key = construct_key_and_link(&ctx, callout_info, callout_len, - aux, dest_keyring, flags); + aux, acl, dest_keyring, flags); } error_free: @@ -682,6 +676,7 @@ EXPORT_SYMBOL(wait_for_key_construction); * @description: The searchable description of the key. * @domain_tag: The domain in which the key operates. * @callout_info: The data to pass to the instantiation upcall (or NULL). + * @acl: The ACL to attach if a new key is created. * * As for request_key_and_link() except that it does not add the returned key * to a keyring if found, new keys are always allocated in the user's quota, @@ -694,7 +689,8 @@ EXPORT_SYMBOL(wait_for_key_construction); struct key *request_key_tag(struct key_type *type, const char *description, struct key_tag *domain_tag, - const char *callout_info) + const char *callout_info, + struct key_acl *acl) { struct key *key; size_t callout_len = 0; @@ -704,7 +700,7 @@ struct key *request_key_tag(struct key_type *type, callout_len = strlen(callout_info); key = request_key_and_link(type, description, domain_tag, callout_info, callout_len, - NULL, NULL, KEY_ALLOC_IN_QUOTA); + NULL, acl, NULL, KEY_ALLOC_IN_QUOTA); if (!IS_ERR(key)) { ret = wait_for_key_construction(key, false); if (ret < 0) { @@ -724,6 +720,7 @@ EXPORT_SYMBOL(request_key_tag); * @callout_info: The data to pass to the instantiation upcall (or NULL). * @callout_len: The length of callout_info. * @aux: Auxiliary data for the upcall. + * @acl: The ACL to attach if a new key is created. * * As for request_key_and_link() except that it does not add the returned key * to a keyring if found and new keys are always allocated in the user's quota. 
@@ -736,14 +733,15 @@ struct key *request_key_with_auxdata(struct key_type *type, struct key_tag *domain_tag, const void *callout_info, size_t callout_len, - void *aux) + void *aux, + struct key_acl *acl) { struct key *key; int ret; key = request_key_and_link(type, description, domain_tag, callout_info, callout_len, - aux, NULL, KEY_ALLOC_IN_QUOTA); + aux, acl, NULL, KEY_ALLOC_IN_QUOTA); if (!IS_ERR(key)) { ret = wait_for_key_construction(key, false); if (ret < 0) { diff --git a/security/keys/request_key_auth.c b/security/keys/request_key_auth.c index f613987e8a63..d9146606f54e 100644 --- a/security/keys/request_key_auth.c +++ b/security/keys/request_key_auth.c @@ -28,6 +28,17 @@ static void request_key_auth_revoke(struct key *); static void request_key_auth_destroy(struct key *); static long request_key_auth_read(const struct key *, char __user *, size_t); +static struct key_acl request_key_auth_acl = { + .usage = REFCOUNT_INIT(1), + .nr_ace = 2, + .possessor_viewable = true, + .aces = { + KEY_POSSESSOR_ACE(KEY_ACE_VIEW | KEY_ACE_READ | KEY_ACE_SEARCH | + KEY_ACE_LINK), + KEY_OWNER_ACE(KEY_ACE_VIEW), + } +}; + /* * The request-key authorisation key type definition. 
*/ @@ -214,8 +225,8 @@ struct key *request_key_auth_new(struct key *target, const char *op, authkey = key_alloc(&key_type_request_key_auth, desc, cred->fsuid, cred->fsgid, cred, - KEY_POS_VIEW | KEY_POS_READ | KEY_POS_SEARCH | KEY_POS_LINK | - KEY_USR_VIEW, KEY_ALLOC_NOT_IN_QUOTA, NULL); + &request_key_auth_acl, + KEY_ALLOC_NOT_IN_QUOTA, NULL); if (IS_ERR(authkey)) { ret = PTR_ERR(authkey); goto error_free_rka; diff --git a/security/selinux/hooks.c b/security/selinux/hooks.c index c61787b15f27..b828401dcb70 100644 --- a/security/selinux/hooks.c +++ b/security/selinux/hooks.c @@ -6481,6 +6481,7 @@ static int selinux_key_permission(key_ref_t key_ref, { struct key *key; struct key_security_struct *ksec; + unsigned oldstyle_perm; u32 sid; /* if no specific permissions are requested, we skip the @@ -6489,13 +6490,26 @@ static int selinux_key_permission(key_ref_t key_ref, if (perm == 0) return 0; + oldstyle_perm = perm & (KEY_NEED_VIEW | KEY_NEED_READ | KEY_NEED_WRITE | + KEY_NEED_SEARCH | KEY_NEED_LINK); + if (perm & KEY_NEED_SETSEC) + oldstyle_perm |= OLD_KEY_NEED_SETATTR; + if (perm & KEY_NEED_INVAL) + oldstyle_perm |= KEY_NEED_SEARCH; + if (perm & KEY_NEED_REVOKE && !(perm & OLD_KEY_NEED_SETATTR)) + oldstyle_perm |= KEY_NEED_WRITE; + if (perm & KEY_NEED_JOIN) + oldstyle_perm |= KEY_NEED_SEARCH; + if (perm & KEY_NEED_CLEAR) + oldstyle_perm |= KEY_NEED_WRITE; + sid = cred_sid(cred); key = key_ref_to_ptr(key_ref); ksec = key->security; return avc_has_perm(&selinux_state, - sid, ksec->sid, SECCLASS_KEY, perm, NULL); + sid, ksec->sid, SECCLASS_KEY, oldstyle_perm, NULL); } static int selinux_key_getsecurity(struct key *key, char **_buffer) diff --git a/security/smack/smack_lsm.c b/security/smack/smack_lsm.c index 0de725f88bed..6095dc3565a5 100644 --- a/security/smack/smack_lsm.c +++ b/security/smack/smack_lsm.c @@ -4285,7 +4285,8 @@ static int smack_key_permission(key_ref_t key_ref, #endif if (perm & (KEY_NEED_READ | KEY_NEED_SEARCH | KEY_NEED_VIEW)) request |= MAY_READ; - 
if (perm & (KEY_NEED_WRITE | KEY_NEED_LINK | KEY_NEED_SETATTR)) + if (perm & (KEY_NEED_WRITE | KEY_NEED_LINK | KEY_NEED_SETSEC | + KEY_NEED_INVAL | KEY_NEED_REVOKE | KEY_NEED_CLEAR)) request |= MAY_WRITE; rc = smk_access(tkp, keyp->security, request, &ad); rc = smk_bu_note("key access", tkp, keyp->security, request, rc); -- cgit v1.2.3-71-gd317 From 0d01da6afc5402f60325c5da31b22f7d56689b49 Mon Sep 17 00:00:00 2001 From: Stanislav Fomichev Date: Thu, 27 Jun 2019 13:38:47 -0700 Subject: bpf: implement getsockopt and setsockopt hooks Implement new BPF_PROG_TYPE_CGROUP_SOCKOPT program type and BPF_CGROUP_{G,S}ETSOCKOPT cgroup hooks. BPF_CGROUP_SETSOCKOPT can modify user setsockopt arguments before passing them down to the kernel or bypass kernel completely. BPF_CGROUP_GETSOCKOPT can inspect/modify getsockopt arguments that kernel returns. Both hooks reuse existing PTR_TO_PACKET{,_END} infrastructure. The buffer memory is pre-allocated (because I don't think there is a precedent for working with __user memory from bpf). This might be slow to do for each {s,g}etsockopt call, that's why I've added __cgroup_bpf_prog_array_is_empty that exits early if there is nothing attached to a cgroup. Note, however, that there is a race between __cgroup_bpf_prog_array_is_empty and BPF_PROG_RUN_ARRAY where cgroup program layout might have changed; this should not be a problem because in general there is a race between multiple calls to {s,g}etsockopt and user adding/removing bpf progs from a cgroup.
The return code of the BPF program is handled as follows: * 0: EPERM * 1: success, continue with next BPF program in the cgroup chain v9: * allow overwriting setsockopt arguments (Alexei Starovoitov): * use set_fs (same as kernel_setsockopt) * buffer is always kzalloc'd (no small on-stack buffer) v8: * use s32 for optlen (Andrii Nakryiko) v7: * return only 0 or 1 (Alexei Starovoitov) * always run all progs (Alexei Starovoitov) * use optval=0 as kernel bypass in setsockopt (Alexei Starovoitov) (decided to use optval=-1 instead, optval=0 might be a valid input) * call getsockopt hook after kernel handlers (Alexei Starovoitov) v6: * rework cgroup chaining; stop as soon as bpf program returns 0 or 2; see patch with the documentation for the details * drop Andrii's and Martin's Acked-by (not sure they are comfortable with the new state of things) v5: * skip copy_to_user() and put_user() when ret == 0 (Martin Lau) v4: * don't export bpf_sk_fullsock helper (Martin Lau) * size != sizeof(__u64) for uapi pointers (Martin Lau) * offsetof instead of bpf_ctx_range when checking ctx access (Martin Lau) v3: * typos in BPF_PROG_CGROUP_SOCKOPT_RUN_ARRAY comments (Andrii Nakryiko) * reverse christmas tree in BPF_PROG_CGROUP_SOCKOPT_RUN_ARRAY (Andrii Nakryiko) * use __bpf_md_ptr instead of __u32 for optval{,_end} (Martin Lau) * use BPF_FIELD_SIZEOF() for consistency (Martin Lau) * new CG_SOCKOPT_ACCESS macro to wrap repeated parts v2: * moved bpf_sockopt_kern fields around to remove a hole (Martin Lau) * aligned bpf_sockopt_kern->buf to 8 bytes (Martin Lau) * bpf_prog_array_is_empty instead of bpf_prog_array_length (Martin Lau) * added [0,2] return code check to verifier (Martin Lau) * dropped unused buf[64] from the stack (Martin Lau) * use PTR_TO_SOCKET for bpf_sockopt->sk (Martin Lau) * dropped bpf_target_off from ctx rewrites (Martin Lau) * use return code for kernel bypass (Martin Lau & Andrii Nakryiko) Cc: Andrii Nakryiko Cc: Martin Lau Signed-off-by: Stanislav Fomichev 
Signed-off-by: Alexei Starovoitov --- include/linux/bpf-cgroup.h | 45 ++++++ include/linux/bpf.h | 2 + include/linux/bpf_types.h | 1 + include/linux/filter.h | 10 ++ include/uapi/linux/bpf.h | 14 ++ kernel/bpf/cgroup.c | 333 +++++++++++++++++++++++++++++++++++++++++++++ kernel/bpf/core.c | 9 ++ kernel/bpf/syscall.c | 19 +++ kernel/bpf/verifier.c | 8 ++ net/core/filter.c | 2 +- net/socket.c | 30 ++++ 11 files changed, 472 insertions(+), 1 deletion(-) (limited to 'include/uapi/linux') diff --git a/include/linux/bpf-cgroup.h b/include/linux/bpf-cgroup.h index bd79ae32909a..169fd25f6bc2 100644 --- a/include/linux/bpf-cgroup.h +++ b/include/linux/bpf-cgroup.h @@ -124,6 +124,14 @@ int __cgroup_bpf_run_filter_sysctl(struct ctl_table_header *head, loff_t *ppos, void **new_buf, enum bpf_attach_type type); +int __cgroup_bpf_run_filter_setsockopt(struct sock *sock, int *level, + int *optname, char __user *optval, + int *optlen, char **kernel_optval); +int __cgroup_bpf_run_filter_getsockopt(struct sock *sk, int level, + int optname, char __user *optval, + int __user *optlen, int max_optlen, + int retval); + static inline enum bpf_cgroup_storage_type cgroup_storage_type( struct bpf_map *map) { @@ -286,6 +294,38 @@ int bpf_percpu_cgroup_storage_update(struct bpf_map *map, void *key, __ret; \ }) +#define BPF_CGROUP_RUN_PROG_SETSOCKOPT(sock, level, optname, optval, optlen, \ + kernel_optval) \ +({ \ + int __ret = 0; \ + if (cgroup_bpf_enabled) \ + __ret = __cgroup_bpf_run_filter_setsockopt(sock, level, \ + optname, optval, \ + optlen, \ + kernel_optval); \ + __ret; \ +}) + +#define BPF_CGROUP_GETSOCKOPT_MAX_OPTLEN(optlen) \ +({ \ + int __ret = 0; \ + if (cgroup_bpf_enabled) \ + get_user(__ret, optlen); \ + __ret; \ +}) + +#define BPF_CGROUP_RUN_PROG_GETSOCKOPT(sock, level, optname, optval, optlen, \ + max_optlen, retval) \ +({ \ + int __ret = retval; \ + if (cgroup_bpf_enabled) \ + __ret = __cgroup_bpf_run_filter_getsockopt(sock, level, \ + optname, optval, \ + optlen, max_optlen, 
\ + retval); \ + __ret; \ +}) + int cgroup_bpf_prog_attach(const union bpf_attr *attr, enum bpf_prog_type ptype, struct bpf_prog *prog); int cgroup_bpf_prog_detach(const union bpf_attr *attr, @@ -357,6 +397,11 @@ static inline int bpf_percpu_cgroup_storage_update(struct bpf_map *map, #define BPF_CGROUP_RUN_PROG_SOCK_OPS(sock_ops) ({ 0; }) #define BPF_CGROUP_RUN_PROG_DEVICE_CGROUP(type,major,minor,access) ({ 0; }) #define BPF_CGROUP_RUN_PROG_SYSCTL(head,table,write,buf,count,pos,nbuf) ({ 0; }) +#define BPF_CGROUP_GETSOCKOPT_MAX_OPTLEN(optlen) ({ 0; }) +#define BPF_CGROUP_RUN_PROG_GETSOCKOPT(sock, level, optname, optval, \ + optlen, max_optlen, retval) ({ retval; }) +#define BPF_CGROUP_RUN_PROG_SETSOCKOPT(sock, level, optname, optval, optlen, \ + kernel_optval) ({ 0; }) #define for_each_cgroup_storage_type(stype) for (; false; ) diff --git a/include/linux/bpf.h b/include/linux/bpf.h index a62e7889b0b6..18f4cc2c6acd 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -518,6 +518,7 @@ struct bpf_prog_array { struct bpf_prog_array *bpf_prog_array_alloc(u32 prog_cnt, gfp_t flags); void bpf_prog_array_free(struct bpf_prog_array *progs); int bpf_prog_array_length(struct bpf_prog_array *progs); +bool bpf_prog_array_is_empty(struct bpf_prog_array *array); int bpf_prog_array_copy_to_user(struct bpf_prog_array *progs, __u32 __user *prog_ids, u32 cnt); @@ -1051,6 +1052,7 @@ extern const struct bpf_func_proto bpf_spin_unlock_proto; extern const struct bpf_func_proto bpf_get_local_storage_proto; extern const struct bpf_func_proto bpf_strtol_proto; extern const struct bpf_func_proto bpf_strtoul_proto; +extern const struct bpf_func_proto bpf_tcp_sock_proto; /* Shared helpers among cBPF and eBPF. 
*/ void bpf_user_rnd_init_once(void); diff --git a/include/linux/bpf_types.h b/include/linux/bpf_types.h index 5a9975678d6f..eec5aeeeaf92 100644 --- a/include/linux/bpf_types.h +++ b/include/linux/bpf_types.h @@ -30,6 +30,7 @@ BPF_PROG_TYPE(BPF_PROG_TYPE_RAW_TRACEPOINT_WRITABLE, raw_tracepoint_writable) #ifdef CONFIG_CGROUP_BPF BPF_PROG_TYPE(BPF_PROG_TYPE_CGROUP_DEVICE, cg_dev) BPF_PROG_TYPE(BPF_PROG_TYPE_CGROUP_SYSCTL, cg_sysctl) +BPF_PROG_TYPE(BPF_PROG_TYPE_CGROUP_SOCKOPT, cg_sockopt) #endif #ifdef CONFIG_BPF_LIRC_MODE2 BPF_PROG_TYPE(BPF_PROG_TYPE_LIRC_MODE2, lirc_mode2) diff --git a/include/linux/filter.h b/include/linux/filter.h index 43b45d6db36d..340f7d648974 100644 --- a/include/linux/filter.h +++ b/include/linux/filter.h @@ -1199,4 +1199,14 @@ struct bpf_sysctl_kern { u64 tmp_reg; }; +struct bpf_sockopt_kern { + struct sock *sk; + u8 *optval; + u8 *optval_end; + s32 level; + s32 optname; + s32 optlen; + s32 retval; +}; + #endif /* __LINUX_FILTER_H__ */ diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index b077507efa3f..a396b516a2b2 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -170,6 +170,7 @@ enum bpf_prog_type { BPF_PROG_TYPE_FLOW_DISSECTOR, BPF_PROG_TYPE_CGROUP_SYSCTL, BPF_PROG_TYPE_RAW_TRACEPOINT_WRITABLE, + BPF_PROG_TYPE_CGROUP_SOCKOPT, }; enum bpf_attach_type { @@ -194,6 +195,8 @@ enum bpf_attach_type { BPF_CGROUP_SYSCTL, BPF_CGROUP_UDP4_RECVMSG, BPF_CGROUP_UDP6_RECVMSG, + BPF_CGROUP_GETSOCKOPT, + BPF_CGROUP_SETSOCKOPT, __MAX_BPF_ATTACH_TYPE }; @@ -3541,4 +3544,15 @@ struct bpf_sysctl { */ }; +struct bpf_sockopt { + __bpf_md_ptr(struct bpf_sock *, sk); + __bpf_md_ptr(void *, optval); + __bpf_md_ptr(void *, optval_end); + + __s32 level; + __s32 optname; + __s32 optlen; + __s32 retval; +}; + #endif /* _UAPI__LINUX_BPF_H__ */ diff --git a/kernel/bpf/cgroup.c b/kernel/bpf/cgroup.c index 077ed3a19848..76fa0076f20d 100644 --- a/kernel/bpf/cgroup.c +++ b/kernel/bpf/cgroup.c @@ -15,6 +15,7 @@ #include #include 
#include +#include #include "../cgroup/cgroup-internal.h" @@ -938,6 +939,188 @@ int __cgroup_bpf_run_filter_sysctl(struct ctl_table_header *head, } EXPORT_SYMBOL(__cgroup_bpf_run_filter_sysctl); +static bool __cgroup_bpf_prog_array_is_empty(struct cgroup *cgrp, + enum bpf_attach_type attach_type) +{ + struct bpf_prog_array *prog_array; + bool empty; + + rcu_read_lock(); + prog_array = rcu_dereference(cgrp->bpf.effective[attach_type]); + empty = bpf_prog_array_is_empty(prog_array); + rcu_read_unlock(); + + return empty; +} + +static int sockopt_alloc_buf(struct bpf_sockopt_kern *ctx, int max_optlen) +{ + if (unlikely(max_optlen > PAGE_SIZE) || max_optlen < 0) + return -EINVAL; + + ctx->optval = kzalloc(max_optlen, GFP_USER); + if (!ctx->optval) + return -ENOMEM; + + ctx->optval_end = ctx->optval + max_optlen; + ctx->optlen = max_optlen; + + return 0; +} + +static void sockopt_free_buf(struct bpf_sockopt_kern *ctx) +{ + kfree(ctx->optval); +} + +int __cgroup_bpf_run_filter_setsockopt(struct sock *sk, int *level, + int *optname, char __user *optval, + int *optlen, char **kernel_optval) +{ + struct cgroup *cgrp = sock_cgroup_ptr(&sk->sk_cgrp_data); + struct bpf_sockopt_kern ctx = { + .sk = sk, + .level = *level, + .optname = *optname, + }; + int ret; + + /* Opportunistic check to see whether we have any BPF program + * attached to the hook so we don't waste time allocating + * memory and locking the socket. 
+ */ + if (!cgroup_bpf_enabled || + __cgroup_bpf_prog_array_is_empty(cgrp, BPF_CGROUP_SETSOCKOPT)) + return 0; + + ret = sockopt_alloc_buf(&ctx, *optlen); + if (ret) + return ret; + + if (copy_from_user(ctx.optval, optval, *optlen) != 0) { + ret = -EFAULT; + goto out; + } + + lock_sock(sk); + ret = BPF_PROG_RUN_ARRAY(cgrp->bpf.effective[BPF_CGROUP_SETSOCKOPT], + &ctx, BPF_PROG_RUN); + release_sock(sk); + + if (!ret) { + ret = -EPERM; + goto out; + } + + if (ctx.optlen == -1) { + /* optlen set to -1, bypass kernel */ + ret = 1; + } else if (ctx.optlen > *optlen || ctx.optlen < -1) { + /* optlen is out of bounds */ + ret = -EFAULT; + } else { + /* optlen within bounds, run kernel handler */ + ret = 0; + + /* export any potential modifications */ + *level = ctx.level; + *optname = ctx.optname; + *optlen = ctx.optlen; + *kernel_optval = ctx.optval; + } + +out: + if (ret) + sockopt_free_buf(&ctx); + return ret; +} +EXPORT_SYMBOL(__cgroup_bpf_run_filter_setsockopt); + +int __cgroup_bpf_run_filter_getsockopt(struct sock *sk, int level, + int optname, char __user *optval, + int __user *optlen, int max_optlen, + int retval) +{ + struct cgroup *cgrp = sock_cgroup_ptr(&sk->sk_cgrp_data); + struct bpf_sockopt_kern ctx = { + .sk = sk, + .level = level, + .optname = optname, + .retval = retval, + }; + int ret; + + /* Opportunistic check to see whether we have any BPF program + * attached to the hook so we don't waste time allocating + * memory and locking the socket. + */ + if (!cgroup_bpf_enabled || + __cgroup_bpf_prog_array_is_empty(cgrp, BPF_CGROUP_GETSOCKOPT)) + return retval; + + ret = sockopt_alloc_buf(&ctx, max_optlen); + if (ret) + return ret; + + if (!retval) { + /* If kernel getsockopt finished successfully, + * copy whatever was returned to the user back + * into our temporary buffer. Set optlen to the + * one that kernel returned as well to let + * BPF programs inspect the value. 
+ */ + + if (get_user(ctx.optlen, optlen)) { + ret = -EFAULT; + goto out; + } + + if (ctx.optlen > max_optlen) + ctx.optlen = max_optlen; + + if (copy_from_user(ctx.optval, optval, ctx.optlen) != 0) { + ret = -EFAULT; + goto out; + } + } + + lock_sock(sk); + ret = BPF_PROG_RUN_ARRAY(cgrp->bpf.effective[BPF_CGROUP_GETSOCKOPT], + &ctx, BPF_PROG_RUN); + release_sock(sk); + + if (!ret) { + ret = -EPERM; + goto out; + } + + if (ctx.optlen > max_optlen) { + ret = -EFAULT; + goto out; + } + + /* BPF programs only allowed to set retval to 0, not some + * arbitrary value. + */ + if (ctx.retval != 0 && ctx.retval != retval) { + ret = -EFAULT; + goto out; + } + + if (copy_to_user(optval, ctx.optval, ctx.optlen) || + put_user(ctx.optlen, optlen)) { + ret = -EFAULT; + goto out; + } + + ret = ctx.retval; + +out: + sockopt_free_buf(&ctx); + return ret; +} +EXPORT_SYMBOL(__cgroup_bpf_run_filter_getsockopt); + static ssize_t sysctl_cpy_dir(const struct ctl_dir *dir, char **bufp, size_t *lenp) { @@ -1198,3 +1381,153 @@ const struct bpf_verifier_ops cg_sysctl_verifier_ops = { const struct bpf_prog_ops cg_sysctl_prog_ops = { }; + +static const struct bpf_func_proto * +cg_sockopt_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) +{ + switch (func_id) { + case BPF_FUNC_sk_storage_get: + return &bpf_sk_storage_get_proto; + case BPF_FUNC_sk_storage_delete: + return &bpf_sk_storage_delete_proto; +#ifdef CONFIG_INET + case BPF_FUNC_tcp_sock: + return &bpf_tcp_sock_proto; +#endif + default: + return cgroup_base_func_proto(func_id, prog); + } +} + +static bool cg_sockopt_is_valid_access(int off, int size, + enum bpf_access_type type, + const struct bpf_prog *prog, + struct bpf_insn_access_aux *info) +{ + const int size_default = sizeof(__u32); + + if (off < 0 || off >= sizeof(struct bpf_sockopt)) + return false; + + if (off % size != 0) + return false; + + if (type == BPF_WRITE) { + switch (off) { + case offsetof(struct bpf_sockopt, retval): + if (size != size_default) + 
return false; + return prog->expected_attach_type == + BPF_CGROUP_GETSOCKOPT; + case offsetof(struct bpf_sockopt, optname): + /* fallthrough */ + case offsetof(struct bpf_sockopt, level): + if (size != size_default) + return false; + return prog->expected_attach_type == + BPF_CGROUP_SETSOCKOPT; + case offsetof(struct bpf_sockopt, optlen): + return size == size_default; + default: + return false; + } + } + + switch (off) { + case offsetof(struct bpf_sockopt, sk): + if (size != sizeof(__u64)) + return false; + info->reg_type = PTR_TO_SOCKET; + break; + case offsetof(struct bpf_sockopt, optval): + if (size != sizeof(__u64)) + return false; + info->reg_type = PTR_TO_PACKET; + break; + case offsetof(struct bpf_sockopt, optval_end): + if (size != sizeof(__u64)) + return false; + info->reg_type = PTR_TO_PACKET_END; + break; + case offsetof(struct bpf_sockopt, retval): + if (size != size_default) + return false; + return prog->expected_attach_type == BPF_CGROUP_GETSOCKOPT; + default: + if (size != size_default) + return false; + break; + } + return true; +} + +#define CG_SOCKOPT_ACCESS_FIELD(T, F) \ + T(BPF_FIELD_SIZEOF(struct bpf_sockopt_kern, F), \ + si->dst_reg, si->src_reg, \ + offsetof(struct bpf_sockopt_kern, F)) + +static u32 cg_sockopt_convert_ctx_access(enum bpf_access_type type, + const struct bpf_insn *si, + struct bpf_insn *insn_buf, + struct bpf_prog *prog, + u32 *target_size) +{ + struct bpf_insn *insn = insn_buf; + + switch (si->off) { + case offsetof(struct bpf_sockopt, sk): + *insn++ = CG_SOCKOPT_ACCESS_FIELD(BPF_LDX_MEM, sk); + break; + case offsetof(struct bpf_sockopt, level): + if (type == BPF_WRITE) + *insn++ = CG_SOCKOPT_ACCESS_FIELD(BPF_STX_MEM, level); + else + *insn++ = CG_SOCKOPT_ACCESS_FIELD(BPF_LDX_MEM, level); + break; + case offsetof(struct bpf_sockopt, optname): + if (type == BPF_WRITE) + *insn++ = CG_SOCKOPT_ACCESS_FIELD(BPF_STX_MEM, optname); + else + *insn++ = CG_SOCKOPT_ACCESS_FIELD(BPF_LDX_MEM, optname); + break; + case offsetof(struct 
bpf_sockopt, optlen): + if (type == BPF_WRITE) + *insn++ = CG_SOCKOPT_ACCESS_FIELD(BPF_STX_MEM, optlen); + else + *insn++ = CG_SOCKOPT_ACCESS_FIELD(BPF_LDX_MEM, optlen); + break; + case offsetof(struct bpf_sockopt, retval): + if (type == BPF_WRITE) + *insn++ = CG_SOCKOPT_ACCESS_FIELD(BPF_STX_MEM, retval); + else + *insn++ = CG_SOCKOPT_ACCESS_FIELD(BPF_LDX_MEM, retval); + break; + case offsetof(struct bpf_sockopt, optval): + *insn++ = CG_SOCKOPT_ACCESS_FIELD(BPF_LDX_MEM, optval); + break; + case offsetof(struct bpf_sockopt, optval_end): + *insn++ = CG_SOCKOPT_ACCESS_FIELD(BPF_LDX_MEM, optval_end); + break; + } + + return insn - insn_buf; +} + +static int cg_sockopt_get_prologue(struct bpf_insn *insn_buf, + bool direct_write, + const struct bpf_prog *prog) +{ + /* Nothing to do for sockopt argument. The data is kzalloc'ated. + */ + return 0; +} + +const struct bpf_verifier_ops cg_sockopt_verifier_ops = { + .get_func_proto = cg_sockopt_func_proto, + .is_valid_access = cg_sockopt_is_valid_access, + .convert_ctx_access = cg_sockopt_convert_ctx_access, + .gen_prologue = cg_sockopt_get_prologue, +}; + +const struct bpf_prog_ops cg_sockopt_prog_ops = { +}; diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c index 561ed07d3007..e2c1b43728da 100644 --- a/kernel/bpf/core.c +++ b/kernel/bpf/core.c @@ -1809,6 +1809,15 @@ int bpf_prog_array_length(struct bpf_prog_array *array) return cnt; } +bool bpf_prog_array_is_empty(struct bpf_prog_array *array) +{ + struct bpf_prog_array_item *item; + + for (item = array->items; item->prog; item++) + if (item->prog != &dummy_bpf_prog.prog) + return false; + return true; +} static bool bpf_prog_array_copy_core(struct bpf_prog_array *array, u32 *prog_ids, diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c index 7713cf39795a..b0f545e07425 100644 --- a/kernel/bpf/syscall.c +++ b/kernel/bpf/syscall.c @@ -1590,6 +1590,14 @@ bpf_prog_load_check_attach_type(enum bpf_prog_type prog_type, default: return -EINVAL; } + case 
BPF_PROG_TYPE_CGROUP_SOCKOPT: + switch (expected_attach_type) { + case BPF_CGROUP_SETSOCKOPT: + case BPF_CGROUP_GETSOCKOPT: + return 0; + default: + return -EINVAL; + } default: return 0; } @@ -1840,6 +1848,7 @@ static int bpf_prog_attach_check_attach_type(const struct bpf_prog *prog, switch (prog->type) { case BPF_PROG_TYPE_CGROUP_SOCK: case BPF_PROG_TYPE_CGROUP_SOCK_ADDR: + case BPF_PROG_TYPE_CGROUP_SOCKOPT: return attach_type == prog->expected_attach_type ? 0 : -EINVAL; case BPF_PROG_TYPE_CGROUP_SKB: return prog->enforce_expected_attach_type && @@ -1912,6 +1921,10 @@ static int bpf_prog_attach(const union bpf_attr *attr) case BPF_CGROUP_SYSCTL: ptype = BPF_PROG_TYPE_CGROUP_SYSCTL; break; + case BPF_CGROUP_GETSOCKOPT: + case BPF_CGROUP_SETSOCKOPT: + ptype = BPF_PROG_TYPE_CGROUP_SOCKOPT; + break; default: return -EINVAL; } @@ -1995,6 +2008,10 @@ static int bpf_prog_detach(const union bpf_attr *attr) case BPF_CGROUP_SYSCTL: ptype = BPF_PROG_TYPE_CGROUP_SYSCTL; break; + case BPF_CGROUP_GETSOCKOPT: + case BPF_CGROUP_SETSOCKOPT: + ptype = BPF_PROG_TYPE_CGROUP_SOCKOPT; + break; default: return -EINVAL; } @@ -2031,6 +2048,8 @@ static int bpf_prog_query(const union bpf_attr *attr, case BPF_CGROUP_SOCK_OPS: case BPF_CGROUP_DEVICE: case BPF_CGROUP_SYSCTL: + case BPF_CGROUP_GETSOCKOPT: + case BPF_CGROUP_SETSOCKOPT: break; case BPF_LIRC_MODE2: return lirc_prog_query(attr, uattr); diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 0e079b2298f8..6b5623d320f9 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -2215,6 +2215,13 @@ static bool may_access_direct_pkt_data(struct bpf_verifier_env *env, env->seen_direct_write = true; return true; + + case BPF_PROG_TYPE_CGROUP_SOCKOPT: + if (t == BPF_WRITE) + env->seen_direct_write = true; + + return true; + default: return false; } @@ -6066,6 +6073,7 @@ static int check_return_code(struct bpf_verifier_env *env) case BPF_PROG_TYPE_SOCK_OPS: case BPF_PROG_TYPE_CGROUP_DEVICE: case BPF_PROG_TYPE_CGROUP_SYSCTL: 
+ case BPF_PROG_TYPE_CGROUP_SOCKOPT: break; default: return 0; diff --git a/net/core/filter.c b/net/core/filter.c index 2014d76e0d2a..dc8534be12fc 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -5651,7 +5651,7 @@ BPF_CALL_1(bpf_tcp_sock, struct sock *, sk) return (unsigned long)NULL; } -static const struct bpf_func_proto bpf_tcp_sock_proto = { +const struct bpf_func_proto bpf_tcp_sock_proto = { .func = bpf_tcp_sock, .gpl_only = false, .ret_type = RET_PTR_TO_TCP_SOCK_OR_NULL, diff --git a/net/socket.c b/net/socket.c index 963df5dbdd54..0ddfbfb761d9 100644 --- a/net/socket.c +++ b/net/socket.c @@ -2051,6 +2051,8 @@ SYSCALL_DEFINE4(recv, int, fd, void __user *, ubuf, size_t, size, static int __sys_setsockopt(int fd, int level, int optname, char __user *optval, int optlen) { + mm_segment_t oldfs = get_fs(); + char *kernel_optval = NULL; int err, fput_needed; struct socket *sock; @@ -2063,6 +2065,22 @@ static int __sys_setsockopt(int fd, int level, int optname, if (err) goto out_put; + err = BPF_CGROUP_RUN_PROG_SETSOCKOPT(sock->sk, &level, + &optname, optval, &optlen, + &kernel_optval); + + if (err < 0) { + goto out_put; + } else if (err > 0) { + err = 0; + goto out_put; + } + + if (kernel_optval) { + set_fs(KERNEL_DS); + optval = (char __user __force *)kernel_optval; + } + if (level == SOL_SOCKET) err = sock_setsockopt(sock, level, optname, optval, @@ -2071,6 +2089,11 @@ static int __sys_setsockopt(int fd, int level, int optname, err = sock->ops->setsockopt(sock, level, optname, optval, optlen); + + if (kernel_optval) { + set_fs(oldfs); + kfree(kernel_optval); + } out_put: fput_light(sock->file, fput_needed); } @@ -2093,6 +2116,7 @@ static int __sys_getsockopt(int fd, int level, int optname, { int err, fput_needed; struct socket *sock; + int max_optlen; sock = sockfd_lookup_light(fd, &err, &fput_needed); if (sock != NULL) { @@ -2100,6 +2124,8 @@ static int __sys_getsockopt(int fd, int level, int optname, if (err) goto out_put; + max_optlen = 
BPF_CGROUP_GETSOCKOPT_MAX_OPTLEN(optlen); + if (level == SOL_SOCKET) err = sock_getsockopt(sock, level, optname, optval, @@ -2108,6 +2134,10 @@ static int __sys_getsockopt(int fd, int level, int optname, err = sock->ops->getsockopt(sock, level, optname, optval, optlen); + + err = BPF_CGROUP_RUN_PROG_GETSOCKOPT(sock->sk, level, optname, + optval, optlen, + max_optlen, err); out_put: fput_light(sock->file, fput_needed); } -- cgit v1.2.3-71-gd317 From d2ce8d6bfcfed014fd281e06c9b1d4638ddf3f1e Mon Sep 17 00:00:00 2001 From: Jiunn Chang Date: Thu, 27 Jun 2019 00:04:26 -0500 Subject: nl80211: Fix undefined behavior in bit shift Shifting signed 32-bit value by 31 bits is undefined. Changing most significant bit to unsigned. Signed-off-by: Jiunn Chang Signed-off-by: Johannes Berg --- include/uapi/linux/nl80211.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/uapi/linux') diff --git a/include/uapi/linux/nl80211.h b/include/uapi/linux/nl80211.h index 6f09d1500960..fa7ebbc6ff27 100644 --- a/include/uapi/linux/nl80211.h +++ b/include/uapi/linux/nl80211.h @@ -5314,7 +5314,7 @@ enum nl80211_feature_flags { NL80211_FEATURE_TDLS_CHANNEL_SWITCH = 1 << 28, NL80211_FEATURE_SCAN_RANDOM_MAC_ADDR = 1 << 29, NL80211_FEATURE_SCHED_SCAN_RANDOM_MAC_ADDR = 1 << 30, - NL80211_FEATURE_ND_RANDOM_MAC_ADDR = 1 << 31, + NL80211_FEATURE_ND_RANDOM_MAC_ADDR = 1U << 31, }; /** -- cgit v1.2.3-71-gd317 From 9903c8dc734265689d5770ff28c84a7228fe5890 Mon Sep 17 00:00:00 2001 From: Vedang Patel Date: Tue, 25 Jun 2019 15:07:13 -0700 Subject: etf: Don't use BIT() in UAPI headers. The BIT() macro isn't exported as part of the UAPI interface. So, the compile-test to ensure they are self contained fails. So, use _BITUL() instead. Signed-off-by: Vedang Patel Signed-off-by: David S. 
Miller --- include/uapi/linux/pkt_sched.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'include/uapi/linux') diff --git a/include/uapi/linux/pkt_sched.h b/include/uapi/linux/pkt_sched.h index 8b2f993cbb77..f88c4e0bd9e5 100644 --- a/include/uapi/linux/pkt_sched.h +++ b/include/uapi/linux/pkt_sched.h @@ -988,8 +988,8 @@ struct tc_etf_qopt { __s32 delta; __s32 clockid; __u32 flags; -#define TC_ETF_DEADLINE_MODE_ON BIT(0) -#define TC_ETF_OFFLOAD_ON BIT(1) +#define TC_ETF_DEADLINE_MODE_ON _BITUL(0) +#define TC_ETF_OFFLOAD_ON _BITUL(1) }; enum { -- cgit v1.2.3-71-gd317 From d14d2b20680f02fa739c2cbbb59e3629e487f359 Mon Sep 17 00:00:00 2001 From: Vedang Patel Date: Tue, 25 Jun 2019 15:07:14 -0700 Subject: etf: Add skip_sock_check Currently, etf expects a socket with SO_TXTIME option set for each packet it encounters. So, it will drop all other packets. But, in the future commits we are planning to add functionality where tstamp value will be set by another qdisc. Also, some packets which are generated from within the kernel (e.g. ICMP packets) do not have any socket associated with them. So, this commit adds support for skip_sock_check. When this option is set, etf will skip checking for a socket and other associated options for all skbs. Signed-off-by: Vedang Patel Signed-off-by: David S. 
Miller --- include/uapi/linux/pkt_sched.h | 1 + net/sched/sch_etf.c | 10 ++++++++++ 2 files changed, 11 insertions(+) (limited to 'include/uapi/linux') diff --git a/include/uapi/linux/pkt_sched.h b/include/uapi/linux/pkt_sched.h index f88c4e0bd9e5..127ac6d2888c 100644 --- a/include/uapi/linux/pkt_sched.h +++ b/include/uapi/linux/pkt_sched.h @@ -990,6 +990,7 @@ struct tc_etf_qopt { __u32 flags; #define TC_ETF_DEADLINE_MODE_ON _BITUL(0) #define TC_ETF_OFFLOAD_ON _BITUL(1) +#define TC_ETF_SKIP_SOCK_CHECK _BITUL(2) }; enum { diff --git a/net/sched/sch_etf.c b/net/sched/sch_etf.c index db0c2ba1d156..cebfb65d8556 100644 --- a/net/sched/sch_etf.c +++ b/net/sched/sch_etf.c @@ -22,10 +22,12 @@ #define DEADLINE_MODE_IS_ON(x) ((x)->flags & TC_ETF_DEADLINE_MODE_ON) #define OFFLOAD_IS_ON(x) ((x)->flags & TC_ETF_OFFLOAD_ON) +#define SKIP_SOCK_CHECK_IS_SET(x) ((x)->flags & TC_ETF_SKIP_SOCK_CHECK) struct etf_sched_data { bool offload; bool deadline_mode; + bool skip_sock_check; int clockid; int queue; s32 delta; /* in ns */ @@ -77,6 +79,9 @@ static bool is_packet_valid(struct Qdisc *sch, struct sk_buff *nskb) struct sock *sk = nskb->sk; ktime_t now; + if (q->skip_sock_check) + goto skip; + if (!sk) return false; @@ -92,6 +97,7 @@ static bool is_packet_valid(struct Qdisc *sch, struct sk_buff *nskb) if (sk->sk_txtime_deadline_mode != q->deadline_mode) return false; +skip: now = q->get_time(); if (ktime_before(txtime, now) || ktime_before(txtime, q->last)) return false; @@ -385,6 +391,7 @@ static int etf_init(struct Qdisc *sch, struct nlattr *opt, q->clockid = qopt->clockid; q->offload = OFFLOAD_IS_ON(qopt); q->deadline_mode = DEADLINE_MODE_IS_ON(qopt); + q->skip_sock_check = SKIP_SOCK_CHECK_IS_SET(qopt); switch (q->clockid) { case CLOCK_REALTIME: @@ -473,6 +480,9 @@ static int etf_dump(struct Qdisc *sch, struct sk_buff *skb) if (q->deadline_mode) opt.flags |= TC_ETF_DEADLINE_MODE_ON; + if (q->skip_sock_check) + opt.flags |= TC_ETF_SKIP_SOCK_CHECK; + if (nla_put(skb, TCA_ETF_PARMS, 
sizeof(opt), &opt)) goto nla_put_failure; -- cgit v1.2.3-71-gd317 From 4cfd5779bd6efe8c76b4494aec63a063be0d2ff2 Mon Sep 17 00:00:00 2001 From: Vedang Patel Date: Tue, 25 Jun 2019 15:07:17 -0700 Subject: taprio: Add support for txtime-assist mode Currently, we are seeing non-critical packets being transmitted outside of their timeslice. We can confirm that the packets are being dequeued at the right time. So, the delay is induced in the hardware side. The most likely reason is the hardware queues are starving the lower priority queues. In order to improve the performance of taprio, we will be making use of the txtime feature provided by the ETF qdisc. For all the packets which do not have the SO_TXTIME option set, taprio will set the transmit timestamp (set in skb->tstamp) in this mode. TAPrio Qdisc will ensure that the transmit time for the packet is set to when the gate is open. If SO_TXTIME is set, the TAPrio qdisc will validate whether the timestamp (in skb->tstamp) occurs when the gate corresponding to skb's traffic class is open. Following two parameters added to support this mode: - flags: used to enable txtime-assist mode. Will also be used to enable other modes (like hardware offloading) later. - txtime-delay: This indicates the minimum time it will take for the packet to hit the wire. This is useful in determining whether we can transmit the packet in the remaining time if the gate corresponding to the packet is currently open. An example configuration for enabling txtime-assist: tc qdisc replace dev eth0 parent root handle 100 taprio \\ num_tc 3 \\ map 2 2 1 0 2 2 2 2 2 2 2 2 2 2 2 2 \\ queues 1@0 1@0 1@0 \\ base-time 1558653424279842568 \\ sched-entry S 01 300000 \\ sched-entry S 02 300000 \\ sched-entry S 04 400000 \\ flags 0x1 \\ txtime-delay 40000 \\ clockid CLOCK_TAI tc qdisc replace dev $IFACE parent 100:1 etf skip_sock_check \\ offload delta 200000 clockid CLOCK_TAI Note that all the traffic classes are mapped to the same queue. 
This is only possible in taprio when txtime-assist is enabled. Also, note that the ETF Qdisc is enabled with offload mode set. In this mode, if the packet's traffic class is open and the complete packet can be transmitted, taprio will try to transmit the packet immediately. This will be done by setting skb->tstamp to current_time + the time delta indicated in the txtime-delay parameter. This parameter indicates the time taken (in software) for packet to reach the network adapter. If the packet cannot be transmitted in the current interval or if the packet's traffic is not currently transmitting, the skb->tstamp is set to the next available timestamp value. This is tracked in the next_launchtime parameter in the struct sched_entry. The behaviour w.r.t admin and oper schedules is not changed from what is present in software mode. The transmit time is already known in advance. So, we do not need the HR timers to advance the schedule and wakeup the dequeue side of taprio. So, HR timer won't be run when this mode is enabled. Signed-off-by: Vedang Patel Signed-off-by: David S. 
Miller --- include/uapi/linux/pkt_sched.h | 4 + net/sched/sch_taprio.c | 341 +++++++++++++++++++++++++++++++++++++++-- 2 files changed, 328 insertions(+), 17 deletions(-) (limited to 'include/uapi/linux') diff --git a/include/uapi/linux/pkt_sched.h b/include/uapi/linux/pkt_sched.h index 127ac6d2888c..390efb54b2e0 100644 --- a/include/uapi/linux/pkt_sched.h +++ b/include/uapi/linux/pkt_sched.h @@ -1159,6 +1159,8 @@ enum { * [TCA_TAPRIO_ATTR_SCHED_ENTRY_INTERVAL] */ +#define TCA_TAPRIO_ATTR_FLAG_TXTIME_ASSIST 0x1 + enum { TCA_TAPRIO_ATTR_UNSPEC, TCA_TAPRIO_ATTR_PRIOMAP, /* struct tc_mqprio_qopt */ @@ -1170,6 +1172,8 @@ enum { TCA_TAPRIO_ATTR_ADMIN_SCHED, /* The admin sched, only used in dump */ TCA_TAPRIO_ATTR_SCHED_CYCLE_TIME, /* s64 */ TCA_TAPRIO_ATTR_SCHED_CYCLE_TIME_EXTENSION, /* s64 */ + TCA_TAPRIO_ATTR_FLAGS, /* u32 */ + TCA_TAPRIO_ATTR_TXTIME_DELAY, /* s32 */ __TCA_TAPRIO_ATTR_MAX, }; diff --git a/net/sched/sch_taprio.c b/net/sched/sch_taprio.c index 6ef0cc03fdb9..078230e44471 100644 --- a/net/sched/sch_taprio.c +++ b/net/sched/sch_taprio.c @@ -21,12 +21,16 @@ #include #include #include +#include static LIST_HEAD(taprio_list); static DEFINE_SPINLOCK(taprio_list_lock); #define TAPRIO_ALL_GATES_OPEN -1 +#define FLAGS_VALID(flags) (!((flags) & ~TCA_TAPRIO_ATTR_FLAG_TXTIME_ASSIST)) +#define TXTIME_ASSIST_IS_ENABLED(flags) ((flags) & TCA_TAPRIO_ATTR_FLAG_TXTIME_ASSIST) + struct sched_entry { struct list_head list; @@ -35,6 +39,7 @@ struct sched_entry { * packet leaves after this time. 
*/ ktime_t close_time; + ktime_t next_txtime; atomic_t budget; int index; u32 gate_mask; @@ -55,6 +60,7 @@ struct sched_gate_list { struct taprio_sched { struct Qdisc **qdiscs; struct Qdisc *root; + u32 flags; int clockid; atomic64_t picos_per_byte; /* Using picoseconds because for 10Gbps+ * speeds it's sub-nanoseconds per byte @@ -68,6 +74,7 @@ struct taprio_sched { ktime_t (*get_time)(void); struct hrtimer advance_timer; struct list_head taprio_list; + int txtime_delay; }; static ktime_t sched_base_time(const struct sched_gate_list *sched) @@ -108,6 +115,227 @@ static void switch_schedules(struct taprio_sched *q, *admin = NULL; } +/* Get how much time has been already elapsed in the current cycle. */ +static s32 get_cycle_time_elapsed(struct sched_gate_list *sched, ktime_t time) +{ + ktime_t time_since_sched_start; + s32 time_elapsed; + + time_since_sched_start = ktime_sub(time, sched->base_time); + div_s64_rem(time_since_sched_start, sched->cycle_time, &time_elapsed); + + return time_elapsed; +} + +static ktime_t get_interval_end_time(struct sched_gate_list *sched, + struct sched_gate_list *admin, + struct sched_entry *entry, + ktime_t intv_start) +{ + s32 cycle_elapsed = get_cycle_time_elapsed(sched, intv_start); + ktime_t intv_end, cycle_ext_end, cycle_end; + + cycle_end = ktime_add_ns(intv_start, sched->cycle_time - cycle_elapsed); + intv_end = ktime_add_ns(intv_start, entry->interval); + cycle_ext_end = ktime_add(cycle_end, sched->cycle_time_extension); + + if (ktime_before(intv_end, cycle_end)) + return intv_end; + else if (admin && admin != sched && + ktime_after(admin->base_time, cycle_end) && + ktime_before(admin->base_time, cycle_ext_end)) + return admin->base_time; + else + return cycle_end; +} + +static int length_to_duration(struct taprio_sched *q, int len) +{ + return div_u64(len * atomic64_read(&q->picos_per_byte), 1000); +} + +/* Returns the entry corresponding to next available interval. 
If + * validate_interval is set, it only validates whether the timestamp occurs + * when the gate corresponding to the skb's traffic class is open. + */ +static struct sched_entry *find_entry_to_transmit(struct sk_buff *skb, + struct Qdisc *sch, + struct sched_gate_list *sched, + struct sched_gate_list *admin, + ktime_t time, + ktime_t *interval_start, + ktime_t *interval_end, + bool validate_interval) +{ + ktime_t curr_intv_start, curr_intv_end, cycle_end, packet_transmit_time; + ktime_t earliest_txtime = KTIME_MAX, txtime, cycle, transmit_end_time; + struct sched_entry *entry = NULL, *entry_found = NULL; + struct taprio_sched *q = qdisc_priv(sch); + struct net_device *dev = qdisc_dev(sch); + bool entry_available = false; + s32 cycle_elapsed; + int tc, n; + + tc = netdev_get_prio_tc_map(dev, skb->priority); + packet_transmit_time = length_to_duration(q, qdisc_pkt_len(skb)); + + *interval_start = 0; + *interval_end = 0; + + if (!sched) + return NULL; + + cycle = sched->cycle_time; + cycle_elapsed = get_cycle_time_elapsed(sched, time); + curr_intv_end = ktime_sub_ns(time, cycle_elapsed); + cycle_end = ktime_add_ns(curr_intv_end, cycle); + + list_for_each_entry(entry, &sched->entries, list) { + curr_intv_start = curr_intv_end; + curr_intv_end = get_interval_end_time(sched, admin, entry, + curr_intv_start); + + if (ktime_after(curr_intv_start, cycle_end)) + break; + + if (!(entry->gate_mask & BIT(tc)) || + packet_transmit_time > entry->interval) + continue; + + txtime = entry->next_txtime; + + if (ktime_before(txtime, time) || validate_interval) { + transmit_end_time = ktime_add_ns(time, packet_transmit_time); + if ((ktime_before(curr_intv_start, time) && + ktime_before(transmit_end_time, curr_intv_end)) || + (ktime_after(curr_intv_start, time) && !validate_interval)) { + entry_found = entry; + *interval_start = curr_intv_start; + *interval_end = curr_intv_end; + break; + } else if (!entry_available && !validate_interval) { + /* Here, we are just trying to find out 
the + * first available interval in the next cycle. + */ + entry_available = 1; + entry_found = entry; + *interval_start = ktime_add_ns(curr_intv_start, cycle); + *interval_end = ktime_add_ns(curr_intv_end, cycle); + } + } else if (ktime_before(txtime, earliest_txtime) && + !entry_available) { + earliest_txtime = txtime; + entry_found = entry; + n = div_s64(ktime_sub(txtime, curr_intv_start), cycle); + *interval_start = ktime_add(curr_intv_start, n * cycle); + *interval_end = ktime_add(curr_intv_end, n * cycle); + } + } + + return entry_found; +} + +static bool is_valid_interval(struct sk_buff *skb, struct Qdisc *sch) +{ + struct taprio_sched *q = qdisc_priv(sch); + struct sched_gate_list *sched, *admin; + ktime_t interval_start, interval_end; + struct sched_entry *entry; + + rcu_read_lock(); + sched = rcu_dereference(q->oper_sched); + admin = rcu_dereference(q->admin_sched); + + entry = find_entry_to_transmit(skb, sch, sched, admin, skb->tstamp, + &interval_start, &interval_end, true); + rcu_read_unlock(); + + return entry; +} + +/* There are a few scenarios where we will have to modify the txtime from + * what is read from next_txtime in sched_entry. They are: + * 1. If txtime is in the past, + * a. The gate for the traffic class is currently open and packet can be + * transmitted before it closes, schedule the packet right away. + * b. If the gate corresponding to the traffic class is going to open later + * in the cycle, set the txtime of packet to the interval start. + * 2. If txtime is in the future, there are packets corresponding to the + * current traffic class waiting to be transmitted. So, the following + * possibilities exist: + * a. We can transmit the packet before the window containing the txtime + * closes. + * b. The window might close before the transmission can be completed + * successfully. So, schedule the packet in the next open window. 
+ */ +static long get_packet_txtime(struct sk_buff *skb, struct Qdisc *sch) +{ + ktime_t transmit_end_time, interval_end, interval_start; + struct taprio_sched *q = qdisc_priv(sch); + struct sched_gate_list *sched, *admin; + ktime_t minimum_time, now, txtime; + int len, packet_transmit_time; + struct sched_entry *entry; + bool sched_changed; + + now = q->get_time(); + minimum_time = ktime_add_ns(now, q->txtime_delay); + + rcu_read_lock(); + admin = rcu_dereference(q->admin_sched); + sched = rcu_dereference(q->oper_sched); + if (admin && ktime_after(minimum_time, admin->base_time)) + switch_schedules(q, &admin, &sched); + + /* Until the schedule starts, all the queues are open */ + if (!sched || ktime_before(minimum_time, sched->base_time)) { + txtime = minimum_time; + goto done; + } + + len = qdisc_pkt_len(skb); + packet_transmit_time = length_to_duration(q, len); + + do { + sched_changed = 0; + + entry = find_entry_to_transmit(skb, sch, sched, admin, + minimum_time, + &interval_start, &interval_end, + false); + if (!entry) { + txtime = 0; + goto done; + } + + txtime = entry->next_txtime; + txtime = max_t(ktime_t, txtime, minimum_time); + txtime = max_t(ktime_t, txtime, interval_start); + + if (admin && admin != sched && + ktime_after(txtime, admin->base_time)) { + sched = admin; + sched_changed = 1; + continue; + } + + transmit_end_time = ktime_add(txtime, packet_transmit_time); + minimum_time = transmit_end_time; + + /* Update the txtime of current entry to the next time it's + * interval starts. 
+ */ + if (ktime_after(transmit_end_time, interval_end)) + entry->next_txtime = ktime_add(interval_start, sched->cycle_time); + } while (sched_changed || ktime_after(transmit_end_time, interval_end)); + + entry->next_txtime = transmit_end_time; + +done: + rcu_read_unlock(); + return txtime; +} + static int taprio_enqueue(struct sk_buff *skb, struct Qdisc *sch, struct sk_buff **to_free) { @@ -121,6 +349,15 @@ static int taprio_enqueue(struct sk_buff *skb, struct Qdisc *sch, if (unlikely(!child)) return qdisc_drop(skb, sch, to_free); + if (skb->sk && sock_flag(skb->sk, SOCK_TXTIME)) { + if (!is_valid_interval(skb, sch)) + return qdisc_drop(skb, sch, to_free); + } else if (TXTIME_ASSIST_IS_ENABLED(q->flags)) { + skb->tstamp = get_packet_txtime(skb, sch); + if (!skb->tstamp) + return qdisc_drop(skb, sch, to_free); + } + qdisc_qstats_backlog_inc(sch, skb); sch->q.qlen++; @@ -156,6 +393,9 @@ static struct sk_buff *taprio_peek(struct Qdisc *sch) if (!skb) continue; + if (TXTIME_ASSIST_IS_ENABLED(q->flags)) + return skb; + prio = skb->priority; tc = netdev_get_prio_tc_map(dev, prio); @@ -168,11 +408,6 @@ static struct sk_buff *taprio_peek(struct Qdisc *sch) return NULL; } -static int length_to_duration(struct taprio_sched *q, int len) -{ - return div_u64(len * atomic64_read(&q->picos_per_byte), 1000); -} - static void taprio_set_budget(struct taprio_sched *q, struct sched_entry *entry) { atomic_set(&entry->budget, @@ -216,6 +451,13 @@ static struct sk_buff *taprio_dequeue(struct Qdisc *sch) if (unlikely(!child)) continue; + if (TXTIME_ASSIST_IS_ENABLED(q->flags)) { + skb = child->ops->dequeue(child); + if (!skb) + continue; + goto skb_found; + } + skb = child->ops->peek(child); if (!skb) continue; @@ -246,6 +488,7 @@ static struct sk_buff *taprio_dequeue(struct Qdisc *sch) if (unlikely(!skb)) goto done; +skb_found: qdisc_bstats_update(sch, skb); qdisc_qstats_backlog_dec(sch, skb); sch->q.qlen--; @@ -522,7 +765,8 @@ static int parse_taprio_schedule(struct nlattr **tb, 
static int taprio_parse_mqprio_opt(struct net_device *dev, struct tc_mqprio_qopt *qopt, - struct netlink_ext_ack *extack) + struct netlink_ext_ack *extack, + u32 taprio_flags) { int i, j; @@ -570,6 +814,9 @@ static int taprio_parse_mqprio_opt(struct net_device *dev, return -EINVAL; } + if (TXTIME_ASSIST_IS_ENABLED(taprio_flags)) + continue; + /* Verify that the offset and counts do not overlap */ for (j = i + 1; j < qopt->num_tc; j++) { if (last > qopt->offset[j]) { @@ -700,6 +947,18 @@ static int taprio_dev_notifier(struct notifier_block *nb, unsigned long event, return NOTIFY_DONE; } +static void setup_txtime(struct taprio_sched *q, + struct sched_gate_list *sched, ktime_t base) +{ + struct sched_entry *entry; + u32 interval = 0; + + list_for_each_entry(entry, &sched->entries, list) { + entry->next_txtime = ktime_add_ns(base, interval); + interval += entry->interval; + } +} + static int taprio_change(struct Qdisc *sch, struct nlattr *opt, struct netlink_ext_ack *extack) { @@ -708,6 +967,7 @@ static int taprio_change(struct Qdisc *sch, struct nlattr *opt, struct taprio_sched *q = qdisc_priv(sch); struct net_device *dev = qdisc_dev(sch); struct tc_mqprio_qopt *mqprio = NULL; + u32 taprio_flags = 0; int i, err, clockid; unsigned long flags; ktime_t start; @@ -720,7 +980,21 @@ static int taprio_change(struct Qdisc *sch, struct nlattr *opt, if (tb[TCA_TAPRIO_ATTR_PRIOMAP]) mqprio = nla_data(tb[TCA_TAPRIO_ATTR_PRIOMAP]); - err = taprio_parse_mqprio_opt(dev, mqprio, extack); + if (tb[TCA_TAPRIO_ATTR_FLAGS]) { + taprio_flags = nla_get_u32(tb[TCA_TAPRIO_ATTR_FLAGS]); + + if (q->flags != 0 && q->flags != taprio_flags) { + NL_SET_ERR_MSG_MOD(extack, "Changing 'flags' of a running schedule is not supported"); + return -EOPNOTSUPP; + } else if (!FLAGS_VALID(taprio_flags)) { + NL_SET_ERR_MSG_MOD(extack, "Specified 'flags' are not valid"); + return -EINVAL; + } + + q->flags = taprio_flags; + } + + err = taprio_parse_mqprio_opt(dev, mqprio, extack, taprio_flags); if (err < 0) 
return err; @@ -779,7 +1053,18 @@ static int taprio_change(struct Qdisc *sch, struct nlattr *opt, /* Protects against enqueue()/dequeue() */ spin_lock_bh(qdisc_lock(sch)); - if (!hrtimer_active(&q->advance_timer)) { + if (tb[TCA_TAPRIO_ATTR_TXTIME_DELAY]) { + if (!TXTIME_ASSIST_IS_ENABLED(q->flags)) { + NL_SET_ERR_MSG_MOD(extack, "txtime-delay can only be set when txtime-assist mode is enabled"); + err = -EINVAL; + goto unlock; + } + + q->txtime_delay = nla_get_s32(tb[TCA_TAPRIO_ATTR_TXTIME_DELAY]); + } + + if (!TXTIME_ASSIST_IS_ENABLED(taprio_flags) && + !hrtimer_active(&q->advance_timer)) { hrtimer_init(&q->advance_timer, q->clockid, HRTIMER_MODE_ABS); q->advance_timer.function = advance_sched; } @@ -822,20 +1107,35 @@ static int taprio_change(struct Qdisc *sch, struct nlattr *opt, goto unlock; } - setup_first_close_time(q, new_admin, start); + if (TXTIME_ASSIST_IS_ENABLED(taprio_flags)) { + setup_txtime(q, new_admin, start); - /* Protects against advance_sched() */ - spin_lock_irqsave(&q->current_entry_lock, flags); + if (!oper) { + rcu_assign_pointer(q->oper_sched, new_admin); + err = 0; + new_admin = NULL; + goto unlock; + } - taprio_start_sched(sch, start, new_admin); + rcu_assign_pointer(q->admin_sched, new_admin); + if (admin) + call_rcu(&admin->rcu, taprio_free_sched_cb); + } else { + setup_first_close_time(q, new_admin, start); - rcu_assign_pointer(q->admin_sched, new_admin); - if (admin) - call_rcu(&admin->rcu, taprio_free_sched_cb); - new_admin = NULL; + /* Protects against advance_sched() */ + spin_lock_irqsave(&q->current_entry_lock, flags); + + taprio_start_sched(sch, start, new_admin); - spin_unlock_irqrestore(&q->current_entry_lock, flags); + rcu_assign_pointer(q->admin_sched, new_admin); + if (admin) + call_rcu(&admin->rcu, taprio_free_sched_cb); + spin_unlock_irqrestore(&q->current_entry_lock, flags); + } + + new_admin = NULL; err = 0; unlock: @@ -1073,6 +1373,13 @@ static int taprio_dump(struct Qdisc *sch, struct sk_buff *skb) if 
(nla_put_s32(skb, TCA_TAPRIO_ATTR_SCHED_CLOCKID, q->clockid)) goto options_error; + if (q->flags && nla_put_u32(skb, TCA_TAPRIO_ATTR_FLAGS, q->flags)) + goto options_error; + + if (q->txtime_delay && + nla_put_s32(skb, TCA_TAPRIO_ATTR_TXTIME_DELAY, q->txtime_delay)) + goto options_error; + if (oper && dump_schedule(skb, oper)) goto options_error; -- cgit v1.2.3-71-gd317 From 43e74c0267a35d6f5127218054b2d80c7fe801f5 Mon Sep 17 00:00:00 2001 From: Toke Høiland-Jørgensen Date: Fri, 28 Jun 2019 11:12:34 +0200 Subject: bpf_xdp_redirect_map: Perform map lookup in eBPF helper MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The bpf_redirect_map() helper used by XDP programs doesn't return any indication of whether it can successfully redirect to the map index it was given. Instead, BPF programs have to track this themselves, leading to programs using duplicate maps to track which entries are populated in the devmap. This patch fixes this by moving the map lookup into the bpf_redirect_map() helper, which makes it possible to return failure to the eBPF program. The lower bits of the flags argument are used as the return code when the map lookup fails, which means that existing users who pass a '0' flag argument will get XDP_ABORTED in that case. With this, a BPF program can check the return code from the helper call and react by, for instance, substituting a different redirect. This works for any type of map used for redirect.
Signed-off-by: Toke Høiland-Jørgensen Acked-by: Jonathan Lemon Acked-by: Andrii Nakryiko Signed-off-by: Daniel Borkmann --- include/linux/filter.h | 1 + include/trace/events/xdp.h | 5 ++--- include/uapi/linux/bpf.h | 7 +++++-- net/core/filter.c | 32 ++++++++++++++++++-------------- 4 files changed, 26 insertions(+), 19 deletions(-) (limited to 'include/uapi/linux') diff --git a/include/linux/filter.h b/include/linux/filter.h index 92bd192f7786..1fe53e78c7e3 100644 --- a/include/linux/filter.h +++ b/include/linux/filter.h @@ -580,6 +580,7 @@ struct bpf_skb_data_end { struct bpf_redirect_info { u32 flags; u32 tgt_index; + void *tgt_value; struct bpf_map *map; struct bpf_map *map_to_flush; u32 kern_flags; diff --git a/include/trace/events/xdp.h b/include/trace/events/xdp.h index 81e708c4b513..68899fdc985b 100644 --- a/include/trace/events/xdp.h +++ b/include/trace/events/xdp.h @@ -175,9 +175,8 @@ struct _bpf_dtab_netdev { #endif /* __DEVMAP_OBJ_TYPE */ #define devmap_ifindex(fwd, map) \ - (!fwd ? 0 : \ - ((map->map_type == BPF_MAP_TYPE_DEVMAP) ? \ - ((struct _bpf_dtab_netdev *)fwd)->dev->ifindex : 0)) + ((map->map_type == BPF_MAP_TYPE_DEVMAP) ? \ + ((struct _bpf_dtab_netdev *)fwd)->dev->ifindex : 0) #define _trace_xdp_redirect_map(dev, xdp, fwd, map, idx) \ trace_xdp_redirect_map(dev, xdp, devmap_ifindex(fwd, map), \ diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index a396b516a2b2..cffea1826a1f 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -1571,8 +1571,11 @@ union bpf_attr { * but this is only implemented for native XDP (with driver * support) as of this writing). * - * All values for *flags* are reserved for future usage, and must - * be left at zero. + * The lower two bits of *flags* are used as the return code if + * the map lookup fails. This is so that the return value can be + * one of the XDP program return codes up to XDP_TX, as chosen by + * the caller. Any higher bits in the *flags* argument must be + * unset. 
* * When used to redirect packets to net devices, this helper * provides a high performance increase over **bpf_redirect**\ (). diff --git a/net/core/filter.c b/net/core/filter.c index b4a062379bb9..4836264f82ee 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -3605,17 +3605,13 @@ static int xdp_do_redirect_map(struct net_device *dev, struct xdp_buff *xdp, struct bpf_redirect_info *ri) { u32 index = ri->tgt_index; - void *fwd = NULL; + void *fwd = ri->tgt_value; int err; ri->tgt_index = 0; + ri->tgt_value = NULL; WRITE_ONCE(ri->map, NULL); - fwd = __xdp_map_lookup_elem(map, index); - if (unlikely(!fwd)) { - err = -EINVAL; - goto err; - } if (ri->map_to_flush && unlikely(ri->map_to_flush != map)) xdp_do_flush_map(); @@ -3652,18 +3648,13 @@ static int xdp_do_generic_redirect_map(struct net_device *dev, { struct bpf_redirect_info *ri = this_cpu_ptr(&bpf_redirect_info); u32 index = ri->tgt_index; - void *fwd = NULL; + void *fwd = ri->tgt_value; int err = 0; ri->tgt_index = 0; + ri->tgt_value = NULL; WRITE_ONCE(ri->map, NULL); - fwd = __xdp_map_lookup_elem(map, index); - if (unlikely(!fwd)) { - err = -EINVAL; - goto err; - } - if (map->map_type == BPF_MAP_TYPE_DEVMAP) { struct bpf_dtab_netdev *dst = fwd; @@ -3732,6 +3723,7 @@ BPF_CALL_2(bpf_xdp_redirect, u32, ifindex, u64, flags) ri->flags = flags; ri->tgt_index = ifindex; + ri->tgt_value = NULL; WRITE_ONCE(ri->map, NULL); return XDP_REDIRECT; @@ -3750,9 +3742,21 @@ BPF_CALL_3(bpf_xdp_redirect_map, struct bpf_map *, map, u32, ifindex, { struct bpf_redirect_info *ri = this_cpu_ptr(&bpf_redirect_info); - if (unlikely(flags)) + /* Lower bits of the flags are used as return code on lookup failure */ + if (unlikely(flags > XDP_TX)) return XDP_ABORTED; + ri->tgt_value = __xdp_map_lookup_elem(map, ifindex); + if (unlikely(!ri->tgt_value)) { + /* If the lookup fails we want to clear out the state in the + * redirect_info struct completely, so that if an eBPF program + * performs multiple lookups, the last one always 
takes + * precedence. + */ + WRITE_ONCE(ri->map, NULL); + return flags; + } + ri->flags = flags; ri->tgt_index = ifindex; WRITE_ONCE(ri->map, map); -- cgit v1.2.3-71-gd317 From 0472301a28f6cf53a6bc5783e48a2d0bbff4682f Mon Sep 17 00:00:00 2001 From: Baruch Siach Date: Fri, 28 Jun 2019 07:08:45 +0300 Subject: bpf: fix uapi bpf_prog_info fields alignment Merge commit 1c8c5a9d38f60 ("Merge git://git.kernel.org/pub/scm/linux/kernel/git/davem/net-next") undid the fix from commit 36f9814a494 ("bpf: fix uapi hole for 32 bit compat applications") by taking the gpl_compatible 1-bit field definition from commit b85fab0e67b162 ("bpf: Add gpl_compatible flag to struct bpf_prog_info") as is. That breaks architectures with 16-bit alignment like m68k. Add 31-bit pad after gpl_compatible to restore alignment of following fields. Thanks to Dmitry V. Levin for his analysis of this bug history. Signed-off-by: Baruch Siach Acked-by: Song Liu Cc: Jiri Olsa Cc: Daniel Borkmann Cc: Geert Uytterhoeven Cc: Linus Torvalds Signed-off-by: Daniel Borkmann --- include/uapi/linux/bpf.h | 1 + tools/include/uapi/linux/bpf.h | 1 + 2 files changed, 2 insertions(+) (limited to 'include/uapi/linux') diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index a8b823c30b43..29a5bc3d5c66 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -3143,6 +3143,7 @@ struct bpf_prog_info { char name[BPF_OBJ_NAME_LEN]; __u32 ifindex; __u32 gpl_compatible:1; + __u32 :31; /* alignment pad */ __u64 netns_dev; __u64 netns_ino; __u32 nr_jited_ksyms; diff --git a/tools/include/uapi/linux/bpf.h b/tools/include/uapi/linux/bpf.h index a8b823c30b43..29a5bc3d5c66 100644 --- a/tools/include/uapi/linux/bpf.h +++ b/tools/include/uapi/linux/bpf.h @@ -3143,6 +3143,7 @@ struct bpf_prog_info { char name[BPF_OBJ_NAME_LEN]; __u32 ifindex; __u32 gpl_compatible:1; + __u32 :31; /* alignment pad */ __u64 netns_dev; __u64 netns_ino; __u32 nr_jited_ksyms; -- cgit v1.2.3-71-gd317 From
5e4c7cf60ec3cad59703c203de1dfb31ea608e6e Mon Sep 17 00:00:00 2001 From: Revanth Rajashekar Date: Thu, 27 Jun 2019 16:30:02 -0600 Subject: block: sed-opal: PSID reverttper capability PSID is a 32-character password printed on the drive label, used to prove physical access to the drive. This PSID reverttper function is very useful for regaining control over the drive when it is locked and the user can no longer access it because of some failure. However, *all the data on the drive is completely erased*. This method is advisable only when the user has exhausted all other recovery methods. PSID capabilities are described in: https://trustedcomputinggroup.org/wp-content/uploads/TCG_Storage-Opal_Feature_Set_PSID_v1.00_r1.00.pdf Signed-off-by: Revanth Rajashekar Signed-off-by: Jens Axboe --- block/sed-opal.c | 33 +++++++++++++++++++++++++++++---- include/linux/sed-opal.h | 1 + include/uapi/linux/sed-opal.h | 1 + 3 files changed, 31 insertions(+), 4 deletions(-) (limited to 'include/uapi/linux') diff --git a/block/sed-opal.c b/block/sed-opal.c index a46e8d13e16d..bb8ef7963d11 100644 --- a/block/sed-opal.c +++ b/block/sed-opal.c @@ -1307,6 +1307,7 @@ static int start_generic_opal_session(struct opal_dev *dev, break; case OPAL_ADMIN1_UID: case OPAL_SID_UID: + case OPAL_PSID_UID: add_token_u8(&err, dev, OPAL_STARTNAME); add_token_u8(&err, dev, 0); /* HostChallenge */ add_token_bytestring(&err, dev, key, key_len); @@ -1367,6 +1368,16 @@ static int start_admin1LSP_opal_session(struct opal_dev *dev, void *data) key->key, key->key_len); } +static int start_PSID_opal_session(struct opal_dev *dev, void *data) +{ + const struct opal_key *okey = data; + + return start_generic_opal_session(dev, OPAL_PSID_UID, + OPAL_ADMINSP_UID, + okey->key, + okey->key_len); +} + static int start_auth_opal_session(struct opal_dev *dev, void *data) { struct opal_session_info *session = data; @@ -2030,17 +2041,28 @@ static int opal_add_user_to_lr(struct opal_dev *dev, return ret; } -static int
opal_reverttper(struct opal_dev *dev, struct opal_key *opal) +static int opal_reverttper(struct opal_dev *dev, struct opal_key *opal, bool psid) { + /* controller will terminate session */ const struct opal_step revert_steps[] = { { start_SIDASP_opal_session, opal }, - { revert_tper, } /* controller will terminate session */ + { revert_tper, } }; + const struct opal_step psid_revert_steps[] = { + { start_PSID_opal_session, opal }, + { revert_tper, } + }; + int ret; mutex_lock(&dev->dev_lock); setup_opal_dev(dev); - ret = execute_steps(dev, revert_steps, ARRAY_SIZE(revert_steps)); + if (psid) + ret = execute_steps(dev, psid_revert_steps, + ARRAY_SIZE(psid_revert_steps)); + else + ret = execute_steps(dev, revert_steps, + ARRAY_SIZE(revert_steps)); mutex_unlock(&dev->dev_lock); /* @@ -2280,7 +2302,7 @@ int sed_ioctl(struct opal_dev *dev, unsigned int cmd, void __user *arg) ret = opal_activate_user(dev, p); break; case IOC_OPAL_REVERT_TPR: - ret = opal_reverttper(dev, p); + ret = opal_reverttper(dev, p, false); break; case IOC_OPAL_LR_SETUP: ret = opal_setup_locking_range(dev, p); @@ -2297,6 +2319,9 @@ int sed_ioctl(struct opal_dev *dev, unsigned int cmd, void __user *arg) case IOC_OPAL_SECURE_ERASE_LR: ret = opal_secure_erase_locking_range(dev, p); break; + case IOC_OPAL_PSID_REVERT_TPR: + ret = opal_reverttper(dev, p, true); + break; default: break; } diff --git a/include/linux/sed-opal.h b/include/linux/sed-opal.h index 3e76b6d7d97f..f03bbffd3281 100644 --- a/include/linux/sed-opal.h +++ b/include/linux/sed-opal.h @@ -39,6 +39,7 @@ static inline bool is_sed_ioctl(unsigned int cmd) case IOC_OPAL_ENABLE_DISABLE_MBR: case IOC_OPAL_ERASE_LR: case IOC_OPAL_SECURE_ERASE_LR: + case IOC_OPAL_PSID_REVERT_TPR: return true; } return false; diff --git a/include/uapi/linux/sed-opal.h b/include/uapi/linux/sed-opal.h index 33e53b80cd1f..7a03e5b4df6e 100644 --- a/include/uapi/linux/sed-opal.h +++ b/include/uapi/linux/sed-opal.h @@ -107,5 +107,6 @@ struct opal_mbr_data { #define 
IOC_OPAL_ENABLE_DISABLE_MBR _IOW('p', 229, struct opal_mbr_data) #define IOC_OPAL_ERASE_LR _IOW('p', 230, struct opal_session_info) #define IOC_OPAL_SECURE_ERASE_LR _IOW('p', 231, struct opal_session_info) +#define IOC_OPAL_PSID_REVERT_TPR _IOW('p', 232, struct opal_key) #endif /* _UAPI_SED_OPAL_H */ -- cgit v1.2.3-71-gd317 From c9888443413e4e06013e482fc484dbb9c559c145 Mon Sep 17 00:00:00 2001 From: Jonas Rabenstein Date: Tue, 21 May 2019 22:46:44 +0200 Subject: block: sed-opal: add ioctl for done-mark of shadow mbr Enable users to mark the shadow mbr as done without completely deactivating the shadow mbr feature. This may be useful on reboots, when the power to the disk is not disconnected in between and the shadow mbr stores the required boot files. Of course, this saves also the (few) commands required to enable the feature if it is already enabled and one only wants to mark the shadow mbr as done. Co-authored-by: David Kozub Signed-off-by: Jonas Rabenstein Signed-off-by: David Kozub Reviewed-by: Christoph Hellwig Reviewed by: Scott Bauer Reviewed-by: Jon Derrick Signed-off-by: Jens Axboe --- block/sed-opal.c | 27 +++++++++++++++++++++++++++ include/linux/sed-opal.h | 1 + include/uapi/linux/sed-opal.h | 12 ++++++++++++ 3 files changed, 40 insertions(+) (limited to 'include/uapi/linux') diff --git a/block/sed-opal.c b/block/sed-opal.c index c54019c11e91..f94f359dd688 100644 --- a/block/sed-opal.c +++ b/block/sed-opal.c @@ -1989,6 +1989,30 @@ static int opal_enable_disable_shadow_mbr(struct opal_dev *dev, return ret; } +static int opal_set_mbr_done(struct opal_dev *dev, + struct opal_mbr_done *mbr_done) +{ + u8 mbr_done_tf = mbr_done->done_flag == OPAL_MBR_DONE ? 
+ OPAL_TRUE : OPAL_FALSE; + + const struct opal_step mbr_steps[] = { + { start_admin1LSP_opal_session, &mbr_done->key }, + { set_mbr_done, &mbr_done_tf }, + { end_opal_session, } + }; + int ret; + + if (mbr_done->done_flag != OPAL_MBR_DONE && + mbr_done->done_flag != OPAL_MBR_NOT_DONE) + return -EINVAL; + + mutex_lock(&dev->dev_lock); + setup_opal_dev(dev); + ret = execute_steps(dev, mbr_steps, ARRAY_SIZE(mbr_steps)); + mutex_unlock(&dev->dev_lock); + return ret; +} + static int opal_save(struct opal_dev *dev, struct opal_lock_unlock *lk_unlk) { struct opal_suspend_data *suspend; @@ -2310,6 +2334,9 @@ int sed_ioctl(struct opal_dev *dev, unsigned int cmd, void __user *arg) case IOC_OPAL_ENABLE_DISABLE_MBR: ret = opal_enable_disable_shadow_mbr(dev, p); break; + case IOC_OPAL_MBR_DONE: + ret = opal_set_mbr_done(dev, p); + break; case IOC_OPAL_ERASE_LR: ret = opal_erase_locking_range(dev, p); break; diff --git a/include/linux/sed-opal.h b/include/linux/sed-opal.h index f03bbffd3281..f834e8a1495f 100644 --- a/include/linux/sed-opal.h +++ b/include/linux/sed-opal.h @@ -40,6 +40,7 @@ static inline bool is_sed_ioctl(unsigned int cmd) case IOC_OPAL_ERASE_LR: case IOC_OPAL_SECURE_ERASE_LR: case IOC_OPAL_PSID_REVERT_TPR: + case IOC_OPAL_MBR_DONE: return true; } return false; diff --git a/include/uapi/linux/sed-opal.h b/include/uapi/linux/sed-opal.h index 7a03e5b4df6e..5681f55d334b 100644 --- a/include/uapi/linux/sed-opal.h +++ b/include/uapi/linux/sed-opal.h @@ -20,6 +20,11 @@ enum opal_mbr { OPAL_MBR_DISABLE = 0x01, }; +enum opal_mbr_done_flag { + OPAL_MBR_NOT_DONE = 0x0, + OPAL_MBR_DONE = 0x01 +}; + enum opal_user { OPAL_ADMIN1 = 0x0, OPAL_USER1 = 0x01, @@ -95,6 +100,12 @@ struct opal_mbr_data { __u8 __align[7]; }; +struct opal_mbr_done { + struct opal_key key; + __u8 done_flag; + __u8 __align[7]; +}; + #define IOC_OPAL_SAVE _IOW('p', 220, struct opal_lock_unlock) #define IOC_OPAL_LOCK_UNLOCK _IOW('p', 221, struct opal_lock_unlock) #define IOC_OPAL_TAKE_OWNERSHIP _IOW('p', 
222, struct opal_key) @@ -108,5 +119,6 @@ struct opal_mbr_data { #define IOC_OPAL_ERASE_LR _IOW('p', 230, struct opal_session_info) #define IOC_OPAL_SECURE_ERASE_LR _IOW('p', 231, struct opal_session_info) #define IOC_OPAL_PSID_REVERT_TPR _IOW('p', 232, struct opal_key) +#define IOC_OPAL_MBR_DONE _IOW('p', 233, struct opal_mbr_done) #endif /* _UAPI_SED_OPAL_H */ -- cgit v1.2.3-71-gd317 From a9b25b4cf2b76d320afc999f881ccb805fecdd84 Mon Sep 17 00:00:00 2001 From: Jonas Rabenstein Date: Tue, 21 May 2019 22:46:45 +0200 Subject: block: sed-opal: ioctl for writing to shadow mbr Allow modification of the shadow mbr. If the shadow mbr is not marked as done, this data will be presented read only as the device content. Only after marking the shadow mbr as done and unlocking a locking range the actual content is accessible. Co-authored-by: David Kozub Signed-off-by: Jonas Rabenstein Signed-off-by: David Kozub Reviewed-by: Scott Bauer Reviewed-by: Jon Derrick Signed-off-by: Jens Axboe --- block/sed-opal.c | 91 ++++++++++++++++++++++++++++++++++++++++++- include/linux/sed-opal.h | 1 + include/uapi/linux/sed-opal.h | 8 ++++ 3 files changed, 98 insertions(+), 2 deletions(-) (limited to 'include/uapi/linux') diff --git a/block/sed-opal.c b/block/sed-opal.c index f94f359dd688..b02ef2ff0d75 100644 --- a/block/sed-opal.c +++ b/block/sed-opal.c @@ -26,6 +26,9 @@ #define IO_BUFFER_LENGTH 2048 #define MAX_TOKS 64 +/* Number of bytes needed by cmd_finalize. 
*/ +#define CMD_FINALIZE_BYTES_NEEDED 7 + struct opal_step { int (*fn)(struct opal_dev *dev, void *data); void *data; @@ -523,12 +526,17 @@ static int opal_discovery0_step(struct opal_dev *dev) return execute_step(dev, &discovery0_step, 0); } +static size_t remaining_size(struct opal_dev *cmd) +{ + return IO_BUFFER_LENGTH - cmd->pos; +} + static bool can_add(int *err, struct opal_dev *cmd, size_t len) { if (*err) return false; - if (len > IO_BUFFER_LENGTH || cmd->pos > IO_BUFFER_LENGTH - len) { + if (remaining_size(cmd) < len) { pr_debug("Error adding %zu bytes: end of buffer.\n", len); *err = -ERANGE; return false; @@ -674,7 +682,11 @@ static int cmd_finalize(struct opal_dev *cmd, u32 hsn, u32 tsn) struct opal_header *hdr; int err = 0; - /* close the parameter list opened from cmd_start */ + /* + * Close the parameter list opened from cmd_start. + * The number of bytes added must be equal to + * CMD_FINALIZE_BYTES_NEEDED. + */ add_token_u8(&err, cmd, OPAL_ENDLIST); add_token_u8(&err, cmd, OPAL_ENDOFDATA); @@ -1536,6 +1548,58 @@ static int set_mbr_enable_disable(struct opal_dev *dev, void *data) return finalize_and_send(dev, parse_and_check_status); } +static int write_shadow_mbr(struct opal_dev *dev, void *data) +{ + struct opal_shadow_mbr *shadow = data; + const u8 __user *src; + u8 *dst; + size_t off = 0; + u64 len; + int err = 0; + + /* do the actual transmission(s) */ + src = (u8 __user *)(uintptr_t)shadow->data; + while (off < shadow->size) { + err = cmd_start(dev, opaluid[OPAL_MBR], opalmethod[OPAL_SET]); + add_token_u8(&err, dev, OPAL_STARTNAME); + add_token_u8(&err, dev, OPAL_WHERE); + add_token_u64(&err, dev, shadow->offset + off); + add_token_u8(&err, dev, OPAL_ENDNAME); + + add_token_u8(&err, dev, OPAL_STARTNAME); + add_token_u8(&err, dev, OPAL_VALUES); + + /* + * The bytestring header is either 1 or 2 bytes, so assume 2. + * There also needs to be enough space to accommodate the + * trailing OPAL_ENDNAME (1 byte) and tokens added by + * cmd_finalize. 
+ */ + len = min(remaining_size(dev) - (2+1+CMD_FINALIZE_BYTES_NEEDED), + (size_t)(shadow->size - off)); + pr_debug("MBR: write bytes %zu+%llu/%llu\n", + off, len, shadow->size); + + dst = add_bytestring_header(&err, dev, len); + if (!dst) + break; + if (copy_from_user(dst, src + off, len)) + err = -EFAULT; + dev->pos += len; + + add_token_u8(&err, dev, OPAL_ENDNAME); + if (err) + break; + + err = finalize_and_send(dev, parse_and_check_status); + if (err) + break; + + off += len; + } + return err; +} + static int generic_pw_cmd(u8 *key, size_t key_len, u8 *cpin_uid, struct opal_dev *dev) { @@ -2013,6 +2077,26 @@ static int opal_set_mbr_done(struct opal_dev *dev, return ret; } +static int opal_write_shadow_mbr(struct opal_dev *dev, + struct opal_shadow_mbr *info) +{ + const struct opal_step mbr_steps[] = { + { start_admin1LSP_opal_session, &info->key }, + { write_shadow_mbr, info }, + { end_opal_session, } + }; + int ret; + + if (info->size == 0) + return 0; + + mutex_lock(&dev->dev_lock); + setup_opal_dev(dev); + ret = execute_steps(dev, mbr_steps, ARRAY_SIZE(mbr_steps)); + mutex_unlock(&dev->dev_lock); + return ret; +} + static int opal_save(struct opal_dev *dev, struct opal_lock_unlock *lk_unlk) { struct opal_suspend_data *suspend; @@ -2337,6 +2421,9 @@ int sed_ioctl(struct opal_dev *dev, unsigned int cmd, void __user *arg) case IOC_OPAL_MBR_DONE: ret = opal_set_mbr_done(dev, p); break; + case IOC_OPAL_WRITE_SHADOW_MBR: + ret = opal_write_shadow_mbr(dev, p); + break; case IOC_OPAL_ERASE_LR: ret = opal_erase_locking_range(dev, p); break; diff --git a/include/linux/sed-opal.h b/include/linux/sed-opal.h index f834e8a1495f..53c28d750a45 100644 --- a/include/linux/sed-opal.h +++ b/include/linux/sed-opal.h @@ -41,6 +41,7 @@ static inline bool is_sed_ioctl(unsigned int cmd) case IOC_OPAL_SECURE_ERASE_LR: case IOC_OPAL_PSID_REVERT_TPR: case IOC_OPAL_MBR_DONE: + case IOC_OPAL_WRITE_SHADOW_MBR: return true; } return false; diff --git a/include/uapi/linux/sed-opal.h 
b/include/uapi/linux/sed-opal.h index 5681f55d334b..c6d035fa1b6c 100644 --- a/include/uapi/linux/sed-opal.h +++ b/include/uapi/linux/sed-opal.h @@ -106,6 +106,13 @@ struct opal_mbr_done { __u8 __align[7]; }; +struct opal_shadow_mbr { + struct opal_key key; + const __u64 data; + __u64 offset; + __u64 size; +}; + #define IOC_OPAL_SAVE _IOW('p', 220, struct opal_lock_unlock) #define IOC_OPAL_LOCK_UNLOCK _IOW('p', 221, struct opal_lock_unlock) #define IOC_OPAL_TAKE_OWNERSHIP _IOW('p', 222, struct opal_key) @@ -120,5 +127,6 @@ struct opal_mbr_done { #define IOC_OPAL_SECURE_ERASE_LR _IOW('p', 231, struct opal_session_info) #define IOC_OPAL_PSID_REVERT_TPR _IOW('p', 232, struct opal_key) #define IOC_OPAL_MBR_DONE _IOW('p', 233, struct opal_mbr_done) +#define IOC_OPAL_WRITE_SHADOW_MBR _IOW('p', 234, struct opal_shadow_mbr) #endif /* _UAPI_SED_OPAL_H */ -- cgit v1.2.3-71-gd317 From 79293f49677e2e703ef0d0efc9919319adacb3fb Mon Sep 17 00:00:00 2001 From: Jiunn Chang Date: Wed, 26 Jun 2019 22:25:30 -0500 Subject: packet: Fix undefined behavior in bit shift Shifting signed 32-bit value by 31 bits is undefined. Changing most significant bit to unsigned. Changes included in v2: - use subsystem specific subject lines - CC required mailing lists Signed-off-by: Jiunn Chang Signed-off-by: David S. 
Miller --- include/uapi/linux/if_packet.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/uapi/linux') diff --git a/include/uapi/linux/if_packet.h b/include/uapi/linux/if_packet.h index 467b654bd4c7..3d884d68eb30 100644 --- a/include/uapi/linux/if_packet.h +++ b/include/uapi/linux/if_packet.h @@ -123,7 +123,7 @@ struct tpacket_auxdata { /* Rx and Tx ring - header status */ #define TP_STATUS_TS_SOFTWARE (1 << 29) #define TP_STATUS_TS_SYS_HARDWARE (1 << 30) /* deprecated, never set */ -#define TP_STATUS_TS_RAW_HARDWARE (1 << 31) +#define TP_STATUS_TS_RAW_HARDWARE (1U << 31) /* Rx ring - feature request bits */ #define TP_FT_REQ_FILL_RXHASH 0x1 -- cgit v1.2.3-71-gd317 From c7369b3faea230cf6009449147ed755c45e74afd Mon Sep 17 00:00:00 2001 From: David Sterba Date: Fri, 31 May 2019 15:39:31 +0200 Subject: btrfs: add mask for all RAID1 types Preparatory patch for additional RAID1 profiles with more copies. The mask will contain 3-copy and 4-copy, most of the checks for plain RAID1 work the same for the other profiles. 
Signed-off-by: David Sterba --- fs/btrfs/extent-tree.c | 8 ++++---- fs/btrfs/scrub.c | 2 +- fs/btrfs/volumes.c | 8 ++++---- include/uapi/linux/btrfs_tree.h | 2 ++ 4 files changed, 11 insertions(+), 9 deletions(-) (limited to 'include/uapi/linux') diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c index f24ef9020323..13c17f94f15d 100644 --- a/fs/btrfs/extent-tree.c +++ b/fs/btrfs/extent-tree.c @@ -7873,7 +7873,7 @@ search: */ if (!block_group_bits(block_group, flags)) { u64 extra = BTRFS_BLOCK_GROUP_DUP | - BTRFS_BLOCK_GROUP_RAID1 | + BTRFS_BLOCK_GROUP_RAID1_MASK | BTRFS_BLOCK_GROUP_RAID5 | BTRFS_BLOCK_GROUP_RAID6 | BTRFS_BLOCK_GROUP_RAID10; @@ -9564,7 +9564,7 @@ static u64 update_block_group_flags(struct btrfs_fs_info *fs_info, u64 flags) stripped = BTRFS_BLOCK_GROUP_RAID0 | BTRFS_BLOCK_GROUP_RAID5 | BTRFS_BLOCK_GROUP_RAID6 | - BTRFS_BLOCK_GROUP_RAID1 | BTRFS_BLOCK_GROUP_RAID10; + BTRFS_BLOCK_GROUP_RAID1_MASK | BTRFS_BLOCK_GROUP_RAID10; if (num_devices == 1) { stripped |= BTRFS_BLOCK_GROUP_DUP; @@ -9575,7 +9575,7 @@ static u64 update_block_group_flags(struct btrfs_fs_info *fs_info, u64 flags) return stripped; /* turn mirroring into duplication */ - if (flags & (BTRFS_BLOCK_GROUP_RAID1 | + if (flags & (BTRFS_BLOCK_GROUP_RAID1_MASK | BTRFS_BLOCK_GROUP_RAID10)) return stripped | BTRFS_BLOCK_GROUP_DUP; } else { @@ -10445,7 +10445,7 @@ int btrfs_read_block_groups(struct btrfs_fs_info *info) list_for_each_entry_rcu(space_info, &info->space_info, list) { if (!(get_alloc_profile(info, space_info->flags) & (BTRFS_BLOCK_GROUP_RAID10 | - BTRFS_BLOCK_GROUP_RAID1 | + BTRFS_BLOCK_GROUP_RAID1_MASK | BTRFS_BLOCK_GROUP_RAID5 | BTRFS_BLOCK_GROUP_RAID6 | BTRFS_BLOCK_GROUP_DUP))) diff --git a/fs/btrfs/scrub.c b/fs/btrfs/scrub.c index 9f0297d529d4..0c99cf9fb595 100644 --- a/fs/btrfs/scrub.c +++ b/fs/btrfs/scrub.c @@ -3091,7 +3091,7 @@ static noinline_for_stack int scrub_stripe(struct scrub_ctx *sctx, offset = map->stripe_len * (num / map->sub_stripes); increment = 
map->stripe_len * factor; mirror_num = num % map->sub_stripes + 1; - } else if (map->type & BTRFS_BLOCK_GROUP_RAID1) { + } else if (map->type & BTRFS_BLOCK_GROUP_RAID1_MASK) { increment = map->stripe_len; mirror_num = num % map->num_stripes + 1; } else if (map->type & BTRFS_BLOCK_GROUP_DUP) { diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c index 776f5c7ca7c5..9e5167a0e406 100644 --- a/fs/btrfs/volumes.c +++ b/fs/btrfs/volumes.c @@ -5400,7 +5400,7 @@ int btrfs_num_copies(struct btrfs_fs_info *fs_info, u64 logical, u64 len) return 1; map = em->map_lookup; - if (map->type & (BTRFS_BLOCK_GROUP_DUP | BTRFS_BLOCK_GROUP_RAID1)) + if (map->type & (BTRFS_BLOCK_GROUP_DUP | BTRFS_BLOCK_GROUP_RAID1_MASK)) ret = map->num_stripes; else if (map->type & BTRFS_BLOCK_GROUP_RAID10) ret = map->sub_stripes; @@ -5474,7 +5474,7 @@ static int find_live_mirror(struct btrfs_fs_info *fs_info, struct btrfs_device *srcdev; ASSERT((map->type & - (BTRFS_BLOCK_GROUP_RAID1 | BTRFS_BLOCK_GROUP_RAID10))); + (BTRFS_BLOCK_GROUP_RAID1_MASK | BTRFS_BLOCK_GROUP_RAID10))); if (map->type & BTRFS_BLOCK_GROUP_RAID10) num_stripes = map->sub_stripes; @@ -5663,7 +5663,7 @@ static int __btrfs_map_block_for_discard(struct btrfs_fs_info *fs_info, &remaining_stripes); div_u64_rem(stripe_nr_end - 1, factor, &last_stripe); last_stripe *= sub_stripes; - } else if (map->type & (BTRFS_BLOCK_GROUP_RAID1 | + } else if (map->type & (BTRFS_BLOCK_GROUP_RAID1_MASK | BTRFS_BLOCK_GROUP_DUP)) { num_stripes = map->num_stripes; } else { @@ -6035,7 +6035,7 @@ static int __btrfs_map_block(struct btrfs_fs_info *fs_info, &stripe_index); if (!need_full_stripe(op)) mirror_num = 1; - } else if (map->type & BTRFS_BLOCK_GROUP_RAID1) { + } else if (map->type & BTRFS_BLOCK_GROUP_RAID1_MASK) { if (need_full_stripe(op)) num_stripes = map->num_stripes; else if (mirror_num) diff --git a/include/uapi/linux/btrfs_tree.h b/include/uapi/linux/btrfs_tree.h index 421239b98db2..34d5b34286fa 100644 --- a/include/uapi/linux/btrfs_tree.h +++ 
b/include/uapi/linux/btrfs_tree.h @@ -866,6 +866,8 @@ enum btrfs_raid_types { #define BTRFS_BLOCK_GROUP_RAID56_MASK (BTRFS_BLOCK_GROUP_RAID5 | \ BTRFS_BLOCK_GROUP_RAID6) +#define BTRFS_BLOCK_GROUP_RAID1_MASK (BTRFS_BLOCK_GROUP_RAID1) + /* * We need a bit for restriper to be able to tell when chunks of type * SINGLE are available. This "extended" profile format is used in -- cgit v1.2.3-71-gd317 From 35f2c14d2a076b063a76c5bf275c46c0743ba3a0 Mon Sep 17 00:00:00 2001 From: Srinivas Pandruvada Date: Wed, 26 Jun 2019 15:38:43 -0700 Subject: platform/x86: ISST: Add common API to register and handle ioctls Encapsulate common functions which all Intel Speed Select Technology interface drivers can use. This creates an API to register a misc device for user-kernel communication and to handle all common IOCTLs. As part of registration it accepts a callback that handles domain-specific ioctl processing. Multiple drivers, which can be built as modules, may register for services, so this driver handles contention during registration as well as during removal. Once user space has opened the misc device, the registered driver is prevented from being removed. Also, once the misc device has been opened by user space, a new client driver can't register until the misc device is closed. There are two types of client drivers: one handles the mailbox interface and the other allows direct read/write to some specific MMIO space. This common driver implements IOCTL ISST_IF_GET_PLATFORM_INFO.
Signed-off-by: Srinivas Pandruvada Signed-off-by: Andy Shevchenko --- drivers/platform/x86/Kconfig | 2 + drivers/platform/x86/Makefile | 1 + drivers/platform/x86/intel_speed_select_if/Kconfig | 17 ++ .../platform/x86/intel_speed_select_if/Makefile | 7 + .../x86/intel_speed_select_if/isst_if_common.c | 182 +++++++++++++++++++++ .../x86/intel_speed_select_if/isst_if_common.h | 60 +++++++ include/uapi/linux/isst_if.h | 41 +++++ 7 files changed, 310 insertions(+) create mode 100644 drivers/platform/x86/intel_speed_select_if/Kconfig create mode 100644 drivers/platform/x86/intel_speed_select_if/Makefile create mode 100644 drivers/platform/x86/intel_speed_select_if/isst_if_common.c create mode 100644 drivers/platform/x86/intel_speed_select_if/isst_if_common.h create mode 100644 include/uapi/linux/isst_if.h (limited to 'include/uapi/linux') diff --git a/drivers/platform/x86/Kconfig b/drivers/platform/x86/Kconfig index 58494a12a9b0..ebd44d071f7b 100644 --- a/drivers/platform/x86/Kconfig +++ b/drivers/platform/x86/Kconfig @@ -1336,6 +1336,8 @@ config PCENGINES_APU2 To compile this driver as a module, choose M here: the module will be called pcengines-apuv2. 
+source "drivers/platform/x86/intel_speed_select_if/Kconfig" + endif # X86_PLATFORM_DEVICES config PMC_ATOM diff --git a/drivers/platform/x86/Makefile b/drivers/platform/x86/Makefile index f64445d69f99..3a62157e9062 100644 --- a/drivers/platform/x86/Makefile +++ b/drivers/platform/x86/Makefile @@ -99,3 +99,4 @@ obj-$(CONFIG_INTEL_MRFLD_PWRBTN) += intel_mrfld_pwrbtn.o obj-$(CONFIG_I2C_MULTI_INSTANTIATE) += i2c-multi-instantiate.o obj-$(CONFIG_INTEL_ATOMISP2_PM) += intel_atomisp2_pm.o obj-$(CONFIG_PCENGINES_APU2) += pcengines-apuv2.o +obj-$(CONFIG_INTEL_SPEED_SELECT_INTERFACE) += intel_speed_select_if/ diff --git a/drivers/platform/x86/intel_speed_select_if/Kconfig b/drivers/platform/x86/intel_speed_select_if/Kconfig new file mode 100644 index 000000000000..ce3e3dc076d2 --- /dev/null +++ b/drivers/platform/x86/intel_speed_select_if/Kconfig @@ -0,0 +1,17 @@ +menu "Intel Speed Select Technology interface support" + depends on PCI + depends on X86_64 || COMPILE_TEST + +config INTEL_SPEED_SELECT_INTERFACE + tristate "Intel(R) Speed Select Technology interface drivers" + help + This config enables the Intel(R) Speed Select Technology interface + drivers. The Intel(R) speed select technology features are non + architectural and only supported on specific Xeon(R) servers. + These drivers provide interface to directly communicate with hardware + via MMIO and Mail boxes to enumerate and control all the speed select + features. + + Enable this config, if there is a need to enable and control the + Intel(R) Speed Select Technology features from the user space. +endmenu diff --git a/drivers/platform/x86/intel_speed_select_if/Makefile b/drivers/platform/x86/intel_speed_select_if/Makefile new file mode 100644 index 000000000000..c12687672fc9 --- /dev/null +++ b/drivers/platform/x86/intel_speed_select_if/Makefile @@ -0,0 +1,7 @@ +# SPDX-License-Identifier: GPL-2.0 +# +# Makefile - Intel Speed Select Interface drivers +# Copyright (c) 2019, Intel Corporation. 
+# + +obj-$(CONFIG_INTEL_SPEED_SELECT_INTERFACE) += isst_if_common.o diff --git a/drivers/platform/x86/intel_speed_select_if/isst_if_common.c b/drivers/platform/x86/intel_speed_select_if/isst_if_common.c new file mode 100644 index 000000000000..ab2bb4862dc8 --- /dev/null +++ b/drivers/platform/x86/intel_speed_select_if/isst_if_common.c @@ -0,0 +1,182 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Intel Speed Select Interface: Common functions + * Copyright (c) 2019, Intel Corporation. + * All rights reserved. + * + * Author: Srinivas Pandruvada + */ + +#include +#include +#include +#include +#include + +#include "isst_if_common.h" + +static struct isst_if_cmd_cb punit_callbacks[ISST_IF_DEV_MAX]; + +static int isst_if_get_platform_info(void __user *argp) +{ + struct isst_if_platform_info info; + + info.api_version = ISST_IF_API_VERSION, + info.driver_version = ISST_IF_DRIVER_VERSION, + info.max_cmds_per_ioctl = ISST_IF_CMD_LIMIT, + info.mbox_supported = punit_callbacks[ISST_IF_DEV_MBOX].registered; + info.mmio_supported = punit_callbacks[ISST_IF_DEV_MMIO].registered; + + if (copy_to_user(argp, &info, sizeof(info))) + return -EFAULT; + + return 0; +} + +static long isst_if_def_ioctl(struct file *file, unsigned int cmd, + unsigned long arg) +{ + void __user *argp = (void __user *)arg; + long ret = -ENOTTY; + + switch (cmd) { + case ISST_IF_GET_PLATFORM_INFO: + ret = isst_if_get_platform_info(argp); + break; + default: + break; + } + + return ret; +} + +static DEFINE_MUTEX(punit_misc_dev_lock); +static int misc_usage_count; +static int misc_device_ret; +static int misc_device_open; + +static int isst_if_open(struct inode *inode, struct file *file) +{ + int i, ret = 0; + + /* Fail open, if a module is going away */ + mutex_lock(&punit_misc_dev_lock); + for (i = 0; i < ISST_IF_DEV_MAX; ++i) { + struct isst_if_cmd_cb *cb = &punit_callbacks[i]; + + if (cb->registered && !try_module_get(cb->owner)) { + ret = -ENODEV; + break; + } + } + if (ret) { + int j; + + for (j = 0; j 
< i; ++j) { + struct isst_if_cmd_cb *cb; + + cb = &punit_callbacks[j]; + if (cb->registered) + module_put(cb->owner); + } + } else { + misc_device_open++; + } + mutex_unlock(&punit_misc_dev_lock); + + return ret; +} + +static int isst_if_relase(struct inode *inode, struct file *f) +{ + int i; + + mutex_lock(&punit_misc_dev_lock); + misc_device_open--; + for (i = 0; i < ISST_IF_DEV_MAX; ++i) { + struct isst_if_cmd_cb *cb = &punit_callbacks[i]; + + if (cb->registered) + module_put(cb->owner); + } + mutex_unlock(&punit_misc_dev_lock); + + return 0; +} + +static const struct file_operations isst_if_char_driver_ops = { + .open = isst_if_open, + .unlocked_ioctl = isst_if_def_ioctl, + .release = isst_if_relase, +}; + +static struct miscdevice isst_if_char_driver = { + .minor = MISC_DYNAMIC_MINOR, + .name = "isst_interface", + .fops = &isst_if_char_driver_ops, +}; + +/** + * isst_if_cdev_register() - Register callback for IOCTL + * @device_type: The device type this callback handling. + * @cb: Callback structure. + * + * This function registers a callback to device type. On very first call + * it will register a misc device, which is used for user kernel interface. + * Other calls simply increment ref count. Registry will fail, if the user + * already opened misc device for operation. Also if the misc device + * creation failed, then it will not try again and all callers will get + * failure code. + * + * Return: Return the return value from the misc creation device or -EINVAL + * for unsupported device type. 
+ */ +int isst_if_cdev_register(int device_type, struct isst_if_cmd_cb *cb) +{ + if (misc_device_ret) + return misc_device_ret; + + if (device_type >= ISST_IF_DEV_MAX) + return -EINVAL; + + mutex_lock(&punit_misc_dev_lock); + if (misc_device_open) { + mutex_unlock(&punit_misc_dev_lock); + return -EAGAIN; + } + if (!misc_usage_count) { + misc_device_ret = misc_register(&isst_if_char_driver); + if (misc_device_ret) + goto unlock_exit; + } + memcpy(&punit_callbacks[device_type], cb, sizeof(*cb)); + punit_callbacks[device_type].registered = 1; + misc_usage_count++; +unlock_exit: + mutex_unlock(&punit_misc_dev_lock); + + return misc_device_ret; +} +EXPORT_SYMBOL_GPL(isst_if_cdev_register); + +/** + * isst_if_cdev_unregister() - Unregister callback for IOCTL + * @device_type: The device type to unregister. + * + * This function unregisters the previously registered callback. If this + * is the last callback unregistering, then misc device is removed. + * + * Return: None. + */ +void isst_if_cdev_unregister(int device_type) +{ + mutex_lock(&punit_misc_dev_lock); + misc_usage_count--; + punit_callbacks[device_type].registered = 0; + if (!misc_usage_count && !misc_device_ret) + misc_deregister(&isst_if_char_driver); + mutex_unlock(&punit_misc_dev_lock); +} +EXPORT_SYMBOL_GPL(isst_if_cdev_unregister); + +MODULE_LICENSE("GPL v2"); diff --git a/drivers/platform/x86/intel_speed_select_if/isst_if_common.h b/drivers/platform/x86/intel_speed_select_if/isst_if_common.h new file mode 100644 index 000000000000..11f339226fb4 --- /dev/null +++ b/drivers/platform/x86/intel_speed_select_if/isst_if_common.h @@ -0,0 +1,60 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * Intel Speed Select Interface: Drivers Internal defines + * Copyright (c) 2019, Intel Corporation. + * All rights reserved. + * + * Author: Srinivas Pandruvada + */ + +#ifndef __ISST_IF_COMMON_H +#define __ISST_IF_COMMON_H + +/* + * Validate maximum commands in a single request. 
+ * This is enough to handle command to every core in one ioctl, or all + * possible message id to one CPU. Limit is also helpful for resonse time + * per IOCTL request, as PUNIT may take different times to process each + * request and may hold for long for too many commands. + */ +#define ISST_IF_CMD_LIMIT 64 + +#define ISST_IF_API_VERSION 0x01 +#define ISST_IF_DRIVER_VERSION 0x01 + +#define ISST_IF_DEV_MBOX 0 +#define ISST_IF_DEV_MMIO 1 +#define ISST_IF_DEV_MAX 2 + +/** + * struct isst_if_cmd_cb - Used to register a IOCTL handler + * @registered: Used by the common code to store registry. Caller don't + * to touch this field + * @cmd_size: The command size of the individual command in IOCTL + * @offset: Offset to the first valid member in command structure. + * This will be the offset of the start of the command + * after command count field + * @cmd_callback: Callback function to handle IOCTL. The callback has the + * command pointer with data for command. There is a pointer + * called write_only, which when set, will not copy the + * response to user ioctl buffer. The "resume" argument + * can be used to avoid storing the command for replay + * during system resume + * + * This structure is used to register an handler for IOCTL. To avoid + * code duplication common code handles all the IOCTL command read/write + * including handling multiple command in single IOCTL. The caller just + * need to execute a command via the registered callback. 
+ */ +struct isst_if_cmd_cb { + int registered; + int cmd_size; + int offset; + struct module *owner; + long (*cmd_callback)(u8 *ptr, int *write_only, int resume); +}; + +/* Internal interface functions */ +int isst_if_cdev_register(int type, struct isst_if_cmd_cb *cb); +void isst_if_cdev_unregister(int type); +#endif diff --git a/include/uapi/linux/isst_if.h b/include/uapi/linux/isst_if.h new file mode 100644 index 000000000000..fa94480b5f74 --- /dev/null +++ b/include/uapi/linux/isst_if.h @@ -0,0 +1,41 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * Intel Speed Select Interface: OS to hardware Interface + * Copyright (c) 2019, Intel Corporation. + * All rights reserved. + * + * Author: Srinivas Pandruvada + */ + +#ifndef __ISST_IF_H +#define __ISST_IF_H + +#include + +/** + * struct isst_if_platform_info - Define platform information + * @api_version: Version of the firmware document, which this driver + * can communicate + * @driver_version: Driver version, which will help user to send right + * commands. Even if the firmware is capable, driver may + * not be ready + * @max_cmds_per_ioctl: Returns the maximum number of commands driver will + * accept in a single ioctl + * @mbox_supported: Support of mail box interface + * @mmio_supported: Support of mmio interface for core-power feature + * + * Used to return output of IOCTL ISST_IF_GET_PLATFORM_INFO. This + * information can be used by the user space, to get the driver, firmware + * support and also number of commands to send in a single IOCTL request. 
+ */ +struct isst_if_platform_info { + __u16 api_version; + __u16 driver_version; + __u16 max_cmds_per_ioctl; + __u8 mbox_supported; + __u8 mmio_supported; +}; + +#define ISST_IF_MAGIC 0xFE +#define ISST_IF_GET_PLATFORM_INFO _IOR(ISST_IF_MAGIC, 0, struct isst_if_platform_info *) +#endif -- cgit v1.2.3-71-gd317 From fb5b36a413b9f30fba573fc2a596ab7142dfaf12 Mon Sep 17 00:00:00 2001 From: Srinivas Pandruvada Date: Wed, 26 Jun 2019 15:38:45 -0700 Subject: platform/x86: ISST: Add IOCTL to Translate Linux logical CPU to PUNIT CPU number Add processing for IOCTL command ISST_IF_GET_PHY_ID. This converts from the Linux logical CPU to PUNIT CPU numbering scheme. Signed-off-by: Srinivas Pandruvada Signed-off-by: Andy Shevchenko --- .../x86/intel_speed_select_if/isst_if_common.c | 74 ++++++++++++++++++++++ include/uapi/linux/isst_if.h | 28 ++++++++ 2 files changed, 102 insertions(+) (limited to 'include/uapi/linux') diff --git a/drivers/platform/x86/intel_speed_select_if/isst_if_common.c b/drivers/platform/x86/intel_speed_select_if/isst_if_common.c index 0e16cbf685d0..72e74d72724b 100644 --- a/drivers/platform/x86/intel_speed_select_if/isst_if_common.c +++ b/drivers/platform/x86/intel_speed_select_if/isst_if_common.c @@ -134,16 +134,90 @@ static void isst_if_cpu_info_exit(void) kfree(isst_cpu_info); }; +static long isst_if_proc_phyid_req(u8 *cmd_ptr, int *write_only, int resume) +{ + struct isst_if_cpu_map *cpu_map; + + cpu_map = (struct isst_if_cpu_map *)cmd_ptr; + if (cpu_map->logical_cpu >= nr_cpu_ids || + cpu_map->logical_cpu >= num_possible_cpus()) + return -EINVAL; + + *write_only = 0; + cpu_map->physical_cpu = isst_cpu_info[cpu_map->logical_cpu].punit_cpu_id; + + return 0; +} + +static long isst_if_exec_multi_cmd(void __user *argp, struct isst_if_cmd_cb *cb) +{ + unsigned char __user *ptr; + u32 cmd_count; + u8 *cmd_ptr; + long ret; + int i; + + /* Each multi command has u32 command count as the first field */ + if (copy_from_user(&cmd_count, argp, sizeof(cmd_count))) 
+ return -EFAULT; + + if (!cmd_count || cmd_count > ISST_IF_CMD_LIMIT) + return -EINVAL; + + cmd_ptr = kmalloc(cb->cmd_size, GFP_KERNEL); + if (!cmd_ptr) + return -ENOMEM; + + /* cb->offset points to start of the command after the command count */ + ptr = argp + cb->offset; + + for (i = 0; i < cmd_count; ++i) { + int wr_only; + + if (signal_pending(current)) { + ret = -EINTR; + break; + } + + if (copy_from_user(cmd_ptr, ptr, cb->cmd_size)) { + ret = -EFAULT; + break; + } + + ret = cb->cmd_callback(cmd_ptr, &wr_only, 0); + if (ret) + break; + + if (!wr_only && copy_to_user(ptr, cmd_ptr, cb->cmd_size)) { + ret = -EFAULT; + break; + } + + ptr += cb->cmd_size; + } + + kfree(cmd_ptr); + + return i ? i : ret; +} + static long isst_if_def_ioctl(struct file *file, unsigned int cmd, unsigned long arg) { void __user *argp = (void __user *)arg; + struct isst_if_cmd_cb cmd_cb; long ret = -ENOTTY; switch (cmd) { case ISST_IF_GET_PLATFORM_INFO: ret = isst_if_get_platform_info(argp); break; + case ISST_IF_GET_PHY_ID: + cmd_cb.cmd_size = sizeof(struct isst_if_cpu_map); + cmd_cb.offset = offsetof(struct isst_if_cpu_maps, cpu_map); + cmd_cb.cmd_callback = isst_if_proc_phyid_req; + ret = isst_if_exec_multi_cmd(argp, &cmd_cb); + break; default: break; } diff --git a/include/uapi/linux/isst_if.h b/include/uapi/linux/isst_if.h index fa94480b5f74..15d1f286a830 100644 --- a/include/uapi/linux/isst_if.h +++ b/include/uapi/linux/isst_if.h @@ -36,6 +36,34 @@ struct isst_if_platform_info { __u8 mmio_supported; }; +/** + * struct isst_if_cpu_map - CPU mapping between logical and physical CPU + * @logical_cpu: Linux logical CPU number + * @physical_cpu: PUNIT CPU number + * + * Used to convert from Linux logical CPU to PUNIT CPU numbering scheme. + * The PUNIT CPU number is different than APIC ID based CPU numbering. 
+ */ +struct isst_if_cpu_map { + __u32 logical_cpu; + __u32 physical_cpu; +}; + +/** + * struct isst_if_cpu_maps - structure for CPU map IOCTL + * @cmd_count: Number of CPU mapping command in cpu_map[] + * @cpu_map[]: Holds one or more CPU map data structure + * + * This structure used with ioctl ISST_IF_GET_PHY_ID to send + * one or more CPU mapping commands. Here IOCTL return value indicates + * number of commands sent or error number if no commands have been sent. + */ +struct isst_if_cpu_maps { + __u32 cmd_count; + struct isst_if_cpu_map cpu_map[1]; +}; + #define ISST_IF_MAGIC 0xFE #define ISST_IF_GET_PLATFORM_INFO _IOR(ISST_IF_MAGIC, 0, struct isst_if_platform_info *) +#define ISST_IF_GET_PHY_ID _IOWR(ISST_IF_MAGIC, 1, struct isst_if_cpu_map *) #endif -- cgit v1.2.3-71-gd317 From d3a23584294c1f379239a3b52bac13e03fecd147 Mon Sep 17 00:00:00 2001 From: Srinivas Pandruvada Date: Wed, 26 Jun 2019 15:38:46 -0700 Subject: platform/x86: ISST: Add Intel Speed Select mmio interface Added MMIO interface to read/write specific offsets in PUNIT PCI device which export core prioritization. This MMIO interface can be used using ioctl interface on /dev/isst_interface using IOCTL ISST_IF_IO_CMD. This MMIO interface is used by the intel-speed-select tool under tools/x86/power to enumerate and set core priority. The MMIO offsets and semantics of the message can be checked from the source code of the tool.
Signed-off-by: Srinivas Pandruvada Signed-off-by: Andy Shevchenko --- .../platform/x86/intel_speed_select_if/Makefile | 1 + .../x86/intel_speed_select_if/isst_if_common.c | 6 + .../x86/intel_speed_select_if/isst_if_common.h | 2 + .../x86/intel_speed_select_if/isst_if_mmio.c | 131 +++++++++++++++++++++ include/uapi/linux/isst_if.h | 33 ++++++ 5 files changed, 173 insertions(+) create mode 100644 drivers/platform/x86/intel_speed_select_if/isst_if_mmio.c (limited to 'include/uapi/linux') diff --git a/drivers/platform/x86/intel_speed_select_if/Makefile b/drivers/platform/x86/intel_speed_select_if/Makefile index c12687672fc9..7e94919208d3 100644 --- a/drivers/platform/x86/intel_speed_select_if/Makefile +++ b/drivers/platform/x86/intel_speed_select_if/Makefile @@ -5,3 +5,4 @@ # obj-$(CONFIG_INTEL_SPEED_SELECT_INTERFACE) += isst_if_common.o +obj-$(CONFIG_INTEL_SPEED_SELECT_INTERFACE) += isst_if_mmio.o diff --git a/drivers/platform/x86/intel_speed_select_if/isst_if_common.c b/drivers/platform/x86/intel_speed_select_if/isst_if_common.c index 72e74d72724b..3f96a3925bc6 100644 --- a/drivers/platform/x86/intel_speed_select_if/isst_if_common.c +++ b/drivers/platform/x86/intel_speed_select_if/isst_if_common.c @@ -206,6 +206,7 @@ static long isst_if_def_ioctl(struct file *file, unsigned int cmd, { void __user *argp = (void __user *)arg; struct isst_if_cmd_cb cmd_cb; + struct isst_if_cmd_cb *cb; long ret = -ENOTTY; switch (cmd) { @@ -218,6 +219,11 @@ static long isst_if_def_ioctl(struct file *file, unsigned int cmd, cmd_cb.cmd_callback = isst_if_proc_phyid_req; ret = isst_if_exec_multi_cmd(argp, &cmd_cb); break; + case ISST_IF_IO_CMD: + cb = &punit_callbacks[ISST_IF_DEV_MMIO]; + if (cb->registered) + ret = isst_if_exec_multi_cmd(argp, cb); + break; default: break; } diff --git a/drivers/platform/x86/intel_speed_select_if/isst_if_common.h b/drivers/platform/x86/intel_speed_select_if/isst_if_common.h index dade77c58b22..cdc7d019748a 100644 --- 
a/drivers/platform/x86/intel_speed_select_if/isst_if_common.h +++ b/drivers/platform/x86/intel_speed_select_if/isst_if_common.h @@ -10,6 +10,8 @@ #ifndef __ISST_IF_COMMON_H #define __ISST_IF_COMMON_H +#define INTEL_RAPL_PRIO_DEVID_0 0x3451 + /* * Validate maximum commands in a single request. * This is enough to handle command to every core in one ioctl, or all diff --git a/drivers/platform/x86/intel_speed_select_if/isst_if_mmio.c b/drivers/platform/x86/intel_speed_select_if/isst_if_mmio.c new file mode 100644 index 000000000000..1c25a1235b9e --- /dev/null +++ b/drivers/platform/x86/intel_speed_select_if/isst_if_mmio.c @@ -0,0 +1,131 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Intel Speed Select Interface: MMIO Interface + * Copyright (c) 2019, Intel Corporation. + * All rights reserved. + * + * Author: Srinivas Pandruvada + */ + +#include +#include +#include +#include +#include + +#include "isst_if_common.h" + +struct isst_if_device { + void __iomem *punit_mmio; + struct mutex mutex; +}; + +static long isst_if_mmio_rd_wr(u8 *cmd_ptr, int *write_only, int resume) +{ + struct isst_if_device *punit_dev; + struct isst_if_io_reg *io_reg; + struct pci_dev *pdev; + + io_reg = (struct isst_if_io_reg *)cmd_ptr; + if (io_reg->reg < 0x04 || io_reg->reg > 0xD0) + return -EINVAL; + + if (io_reg->read_write && !capable(CAP_SYS_ADMIN)) + return -EPERM; + + pdev = isst_if_get_pci_dev(io_reg->logical_cpu, 0, 0, 1); + if (!pdev) + return -EINVAL; + + punit_dev = pci_get_drvdata(pdev); + if (!punit_dev) + return -EINVAL; + + /* + * Ensure that operation is complete on a PCI device to avoid read + * write race by using per PCI device mutex. 
+ */ + mutex_lock(&punit_dev->mutex); + if (io_reg->read_write) { + writel(io_reg->value, punit_dev->punit_mmio+io_reg->reg); + *write_only = 1; + } else { + io_reg->value = readl(punit_dev->punit_mmio+io_reg->reg); + *write_only = 0; + } + mutex_unlock(&punit_dev->mutex); + + return 0; +} + +static const struct pci_device_id isst_if_ids[] = { + { PCI_DEVICE(PCI_VENDOR_ID_INTEL, INTEL_RAPL_PRIO_DEVID_0)}, + { 0 }, +}; +MODULE_DEVICE_TABLE(pci, isst_if_ids); + +static int isst_if_probe(struct pci_dev *pdev, const struct pci_device_id *ent) +{ + struct isst_if_device *punit_dev; + struct isst_if_cmd_cb cb; + u32 mmio_base, pcu_base; + u64 base_addr; + int ret; + + punit_dev = devm_kzalloc(&pdev->dev, sizeof(*punit_dev), GFP_KERNEL); + if (!punit_dev) + return -ENOMEM; + + ret = pcim_enable_device(pdev); + if (ret) + return ret; + + ret = pci_read_config_dword(pdev, 0xD0, &mmio_base); + if (ret) + return ret; + + ret = pci_read_config_dword(pdev, 0xFC, &pcu_base); + if (ret) + return ret; + + pcu_base &= GENMASK(10, 0); + base_addr = (u64)mmio_base << 23 | (u64) pcu_base << 12; + punit_dev->punit_mmio = devm_ioremap(&pdev->dev, base_addr, 256); + if (!punit_dev->punit_mmio) + return -ENOMEM; + + mutex_init(&punit_dev->mutex); + pci_set_drvdata(pdev, punit_dev); + + memset(&cb, 0, sizeof(cb)); + cb.cmd_size = sizeof(struct isst_if_io_reg); + cb.offset = offsetof(struct isst_if_io_regs, io_reg); + cb.cmd_callback = isst_if_mmio_rd_wr; + cb.owner = THIS_MODULE; + ret = isst_if_cdev_register(ISST_IF_DEV_MMIO, &cb); + if (ret) + mutex_destroy(&punit_dev->mutex); + + return ret; +} + +static void isst_if_remove(struct pci_dev *pdev) +{ + struct isst_if_device *punit_dev; + + punit_dev = pci_get_drvdata(pdev); + isst_if_cdev_unregister(ISST_IF_DEV_MBOX); + mutex_destroy(&punit_dev->mutex); +} + +static struct pci_driver isst_if_pci_driver = { + .name = "isst_if_pci", + .id_table = isst_if_ids, + .probe = isst_if_probe, + .remove = isst_if_remove, +}; + 
+module_pci_driver(isst_if_pci_driver); + +MODULE_LICENSE("GPL v2"); +MODULE_DESCRIPTION("Intel speed select interface mmio driver"); diff --git a/include/uapi/linux/isst_if.h b/include/uapi/linux/isst_if.h index 15d1f286a830..fe2492ade078 100644 --- a/include/uapi/linux/isst_if.h +++ b/include/uapi/linux/isst_if.h @@ -63,7 +63,40 @@ struct isst_if_cpu_maps { struct isst_if_cpu_map cpu_map[1]; }; +/** + * struct isst_if_io_reg - Read write PUNIT IO register + * @read_write: Value 0: Read, 1: Write + * @logical_cpu: Logical CPU number to get target PCI device. + * @reg: PUNIT register offset + * @value: For write operation value to write and for + * for read placeholder read value + * + * Structure to specify read/write data to PUNIT registers. + */ +struct isst_if_io_reg { + __u32 read_write; /* Read:0, Write:1 */ + __u32 logical_cpu; + __u32 reg; + __u32 value; +}; + +/** + * struct isst_if_io_regs - structure for IO register commands + * @cmd_count: Number of io reg commands in io_reg[] + * @io_reg[]: Holds one or more io_reg command structure + * + * This structure used with ioctl ISST_IF_IO_CMD to send + * one or more read/write commands to PUNIT. Here IOCTL return value + * indicates number of requests sent or error number if no requests have + * been sent. + */ +struct isst_if_io_regs { + __u32 req_count; + struct isst_if_io_reg io_reg[1]; +}; + #define ISST_IF_MAGIC 0xFE #define ISST_IF_GET_PLATFORM_INFO _IOR(ISST_IF_MAGIC, 0, struct isst_if_platform_info *) #define ISST_IF_GET_PHY_ID _IOWR(ISST_IF_MAGIC, 1, struct isst_if_cpu_map *) +#define ISST_IF_IO_CMD _IOW(ISST_IF_MAGIC, 2, struct isst_if_io_regs *) #endif -- cgit v1.2.3-71-gd317 From 31a166fe9c269af17977e650846ee4ea50361c07 Mon Sep 17 00:00:00 2001 From: Srinivas Pandruvada Date: Wed, 26 Jun 2019 15:38:47 -0700 Subject: platform/x86: ISST: Add Intel Speed Select mailbox interface via PCI Add an IOCTL to send mailbox commands to PUNIT using PUNIT PCI device. 
A limited set of mailbox commands can be sent to PUNIT. This mailbox interface is used by the intel-speed-select tool under tools/x86/power to enumerate and control Intel Speed Select features. The MBOX command ids and semantics of the message can be checked from the source code of the tool. Signed-off-by: Srinivas Pandruvada Signed-off-by: Andy Shevchenko --- .../platform/x86/intel_speed_select_if/Makefile | 1 + .../x86/intel_speed_select_if/isst_if_common.c | 85 +++++++++ .../x86/intel_speed_select_if/isst_if_common.h | 3 + .../x86/intel_speed_select_if/isst_if_mbox_pci.c | 199 +++++++++++++++++++++ include/uapi/linux/isst_if.h | 38 ++++ 5 files changed, 326 insertions(+) create mode 100644 drivers/platform/x86/intel_speed_select_if/isst_if_mbox_pci.c (limited to 'include/uapi/linux') diff --git a/drivers/platform/x86/intel_speed_select_if/Makefile b/drivers/platform/x86/intel_speed_select_if/Makefile index 7e94919208d3..8dec8c858649 100644 --- a/drivers/platform/x86/intel_speed_select_if/Makefile +++ b/drivers/platform/x86/intel_speed_select_if/Makefile @@ -6,3 +6,4 @@ obj-$(CONFIG_INTEL_SPEED_SELECT_INTERFACE) += isst_if_common.o obj-$(CONFIG_INTEL_SPEED_SELECT_INTERFACE) += isst_if_mmio.o +obj-$(CONFIG_INTEL_SPEED_SELECT_INTERFACE) += isst_if_mbox_pci.o diff --git a/drivers/platform/x86/intel_speed_select_if/isst_if_common.c b/drivers/platform/x86/intel_speed_select_if/isst_if_common.c index 3f96a3925bc6..391fc3f12161 100644 --- a/drivers/platform/x86/intel_speed_select_if/isst_if_common.c +++ b/drivers/platform/x86/intel_speed_select_if/isst_if_common.c @@ -25,6 +25,86 @@ static struct isst_if_cmd_cb punit_callbacks[ISST_IF_DEV_MAX]; +struct isst_valid_cmd_ranges { + u16 cmd; + u16 sub_cmd_beg; + u16 sub_cmd_end; +}; + +struct isst_cmd_set_req_type { + u16 cmd; + u16 sub_cmd; + u16 param; +}; + +static const struct isst_valid_cmd_ranges isst_valid_cmds[] = { + {0xD0, 0x00, 0x03}, + {0x7F, 0x00, 0x0B}, + {0x7F, 0x10, 0x12}, + {0x7F, 0x20, 0x23}, +}; + +static
const struct isst_cmd_set_req_type isst_cmd_set_reqs[] = { + {0xD0, 0x00, 0x08}, + {0xD0, 0x01, 0x08}, + {0xD0, 0x02, 0x08}, + {0xD0, 0x03, 0x08}, + {0x7F, 0x02, 0x00}, + {0x7F, 0x08, 0x00}, +}; + +/** + * isst_if_mbox_cmd_invalid() - Check invalid mailbox commands + * @cmd: Pointer to the command structure to verify. + * + * Invalid command to PUNIT to may result in instability of the platform. + * This function has a whitelist of commands, which are allowed. + * + * Return: Return true if the command is invalid, else false. + */ +bool isst_if_mbox_cmd_invalid(struct isst_if_mbox_cmd *cmd) +{ + int i; + + if (cmd->logical_cpu >= nr_cpu_ids) + return true; + + for (i = 0; i < ARRAY_SIZE(isst_valid_cmds); ++i) { + if (cmd->command == isst_valid_cmds[i].cmd && + (cmd->sub_command >= isst_valid_cmds[i].sub_cmd_beg && + cmd->sub_command <= isst_valid_cmds[i].sub_cmd_end)) { + return false; + } + } + + return true; +} +EXPORT_SYMBOL_GPL(isst_if_mbox_cmd_invalid); + +/** + * isst_if_mbox_cmd_set_req() - Check mailbox command is a set request + * @cmd: Pointer to the command structure to verify. + * + * Check if the given mail box level is set request and not a get request. + * + * Return: Return true if the command is set_req, else false. 
+ */ +bool isst_if_mbox_cmd_set_req(struct isst_if_mbox_cmd *cmd) +{ + int i; + + for (i = 0; i < ARRAY_SIZE(isst_cmd_set_reqs); ++i) { + if (cmd->command == isst_cmd_set_reqs[i].cmd && + cmd->sub_command == isst_cmd_set_reqs[i].sub_cmd && + cmd->parameter == isst_cmd_set_reqs[i].param) { + return true; + } + } + + return false; +} +EXPORT_SYMBOL_GPL(isst_if_mbox_cmd_set_req); + static int isst_if_get_platform_info(void __user *argp) { struct isst_if_platform_info info; @@ -224,6 +304,11 @@ static long isst_if_def_ioctl(struct file *file, unsigned int cmd, if (cb->registered) ret = isst_if_exec_multi_cmd(argp, cb); break; + case ISST_IF_MBOX_COMMAND: + cb = &punit_callbacks[ISST_IF_DEV_MBOX]; + if (cb->registered) + ret = isst_if_exec_multi_cmd(argp, cb); + break; default: break; } diff --git a/drivers/platform/x86/intel_speed_select_if/isst_if_common.h b/drivers/platform/x86/intel_speed_select_if/isst_if_common.h index cdc7d019748a..7c0f71221da7 100644 --- a/drivers/platform/x86/intel_speed_select_if/isst_if_common.h +++ b/drivers/platform/x86/intel_speed_select_if/isst_if_common.h @@ -11,6 +11,7 @@ #define __ISST_IF_COMMON_H #define INTEL_RAPL_PRIO_DEVID_0 0x3451 +#define INTEL_CFG_MBOX_DEVID_0 0x3459 /* * Validate maximum commands in a single request. 
@@ -60,4 +61,6 @@ struct isst_if_cmd_cb { int isst_if_cdev_register(int type, struct isst_if_cmd_cb *cb); void isst_if_cdev_unregister(int type); struct pci_dev *isst_if_get_pci_dev(int cpu, int bus, int dev, int fn); +bool isst_if_mbox_cmd_set_req(struct isst_if_mbox_cmd *mbox_cmd); +bool isst_if_mbox_cmd_invalid(struct isst_if_mbox_cmd *cmd); #endif diff --git a/drivers/platform/x86/intel_speed_select_if/isst_if_mbox_pci.c b/drivers/platform/x86/intel_speed_select_if/isst_if_mbox_pci.c new file mode 100644 index 000000000000..1c4f2893cd80 --- /dev/null +++ b/drivers/platform/x86/intel_speed_select_if/isst_if_mbox_pci.c @@ -0,0 +1,199 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Intel Speed Select Interface: Mbox via PCI Interface + * Copyright (c) 2019, Intel Corporation. + * All rights reserved. + * + * Author: Srinivas Pandruvada + */ + +#include +#include +#include +#include +#include +#include + +#include "isst_if_common.h" + +#define PUNIT_MAILBOX_DATA 0xA0 +#define PUNIT_MAILBOX_INTERFACE 0xA4 +#define PUNIT_MAILBOX_BUSY_BIT 31 + +/* + * Commands have a variable amount of processing time. Most of the commands will + * be done in 0-3 tries, but some take up to 50. + * The real processing time was observed as 25us for most of the commands + * at 2GHz. It is possible to optimize this count by taking samples on customer + * systems. 
+ */ +#define OS_MAILBOX_RETRY_COUNT 50 + +struct isst_if_device { + struct mutex mutex; +}; + +static int isst_if_mbox_cmd(struct pci_dev *pdev, + struct isst_if_mbox_cmd *mbox_cmd) +{ + u32 retries, data; + int ret; + + /* Poll for rb bit == 0 */ + retries = OS_MAILBOX_RETRY_COUNT; + do { + ret = pci_read_config_dword(pdev, PUNIT_MAILBOX_INTERFACE, + &data); + if (ret) + return ret; + + if (data & BIT_ULL(PUNIT_MAILBOX_BUSY_BIT)) { + ret = -EBUSY; + continue; + } + ret = 0; + break; + } while (--retries); + + if (ret) + return ret; + + /* Write DATA register */ + ret = pci_write_config_dword(pdev, PUNIT_MAILBOX_DATA, + mbox_cmd->req_data); + if (ret) + return ret; + + /* Write command register */ + data = BIT_ULL(PUNIT_MAILBOX_BUSY_BIT) | + (mbox_cmd->parameter & GENMASK_ULL(13, 0)) << 16 | + (mbox_cmd->sub_command << 8) | + mbox_cmd->command; + + ret = pci_write_config_dword(pdev, PUNIT_MAILBOX_INTERFACE, data); + if (ret) + return ret; + + /* Poll for rb bit == 0 */ + retries = OS_MAILBOX_RETRY_COUNT; + do { + ret = pci_read_config_dword(pdev, PUNIT_MAILBOX_INTERFACE, + &data); + if (ret) + return ret; + + if (data & BIT_ULL(PUNIT_MAILBOX_BUSY_BIT)) { + ret = -EBUSY; + continue; + } + + if (data & 0xff) + return -ENXIO; + + ret = pci_read_config_dword(pdev, PUNIT_MAILBOX_DATA, &data); + if (ret) + return ret; + + mbox_cmd->resp_data = data; + ret = 0; + break; + } while (--retries); + + return ret; +} + +static long isst_if_mbox_proc_cmd(u8 *cmd_ptr, int *write_only, int resume) +{ + struct isst_if_mbox_cmd *mbox_cmd; + struct isst_if_device *punit_dev; + struct pci_dev *pdev; + int ret; + + mbox_cmd = (struct isst_if_mbox_cmd *)cmd_ptr; + + if (isst_if_mbox_cmd_invalid(mbox_cmd)) + return -EINVAL; + + if (isst_if_mbox_cmd_set_req(mbox_cmd) && !capable(CAP_SYS_ADMIN)) + return -EPERM; + + pdev = isst_if_get_pci_dev(mbox_cmd->logical_cpu, 1, 30, 1); + if (!pdev) + return -EINVAL; + + punit_dev = pci_get_drvdata(pdev); + if (!punit_dev) + return -EINVAL; + + /* + 
* Basically we are allowing one complete mailbox transaction on + * a mapped PCI device at a time. + */ + mutex_lock(&punit_dev->mutex); + ret = isst_if_mbox_cmd(pdev, mbox_cmd); + mutex_unlock(&punit_dev->mutex); + if (ret) + return ret; + + *write_only = 0; + + return 0; +} + +static const struct pci_device_id isst_if_mbox_ids[] = { + { PCI_DEVICE(PCI_VENDOR_ID_INTEL, INTEL_CFG_MBOX_DEVID_0)}, + { 0 }, +}; +MODULE_DEVICE_TABLE(pci, isst_if_mbox_ids); + +static int isst_if_mbox_probe(struct pci_dev *pdev, + const struct pci_device_id *ent) +{ + struct isst_if_device *punit_dev; + struct isst_if_cmd_cb cb; + int ret; + + punit_dev = devm_kzalloc(&pdev->dev, sizeof(*punit_dev), GFP_KERNEL); + if (!punit_dev) + return -ENOMEM; + + ret = pcim_enable_device(pdev); + if (ret) + return ret; + + mutex_init(&punit_dev->mutex); + pci_set_drvdata(pdev, punit_dev); + + memset(&cb, 0, sizeof(cb)); + cb.cmd_size = sizeof(struct isst_if_mbox_cmd); + cb.offset = offsetof(struct isst_if_mbox_cmds, mbox_cmd); + cb.cmd_callback = isst_if_mbox_proc_cmd; + cb.owner = THIS_MODULE; + ret = isst_if_cdev_register(ISST_IF_DEV_MBOX, &cb); + + if (ret) + mutex_destroy(&punit_dev->mutex); + + return ret; +} + +static void isst_if_mbox_remove(struct pci_dev *pdev) +{ + struct isst_if_device *punit_dev; + + punit_dev = pci_get_drvdata(pdev); + isst_if_cdev_unregister(ISST_IF_DEV_MBOX); + mutex_destroy(&punit_dev->mutex); +} + +static struct pci_driver isst_if_pci_driver = { + .name = "isst_if_mbox_pci", + .id_table = isst_if_mbox_ids, + .probe = isst_if_mbox_probe, + .remove = isst_if_mbox_remove, +}; + +module_pci_driver(isst_if_pci_driver); + +MODULE_LICENSE("GPL v2"); +MODULE_DESCRIPTION("Intel speed select interface pci mailbox driver"); diff --git a/include/uapi/linux/isst_if.h b/include/uapi/linux/isst_if.h index fe2492ade078..e4b1c2ec3279 100644 --- a/include/uapi/linux/isst_if.h +++ b/include/uapi/linux/isst_if.h @@ -95,8 +95,46 @@ struct isst_if_io_regs { struct isst_if_io_reg 
io_reg[1]; }; +/** + * struct isst_if_mbox_cmd - Structure to define mail box command + * @logical_cpu: Logical CPU number to get target PCI device + * @parameter: Mailbox parameter value + * @req_data: Request data for the mailbox + * @resp_data: Response data for mailbox command response + * @command: Mailbox command value + * @sub_command: Mailbox sub command value + * @reserved: Unused, set to 0 + * + * Structure to specify mailbox command to be sent to PUNIT. + */ +struct isst_if_mbox_cmd { + __u32 logical_cpu; + __u32 parameter; + __u32 req_data; + __u32 resp_data; + __u16 command; + __u16 sub_command; + __u32 reserved; +}; + +/** + * struct isst_if_mbox_cmds - structure for mailbox commands + * @cmd_count: Number of mailbox commands in mbox_cmd[] + * @mbox_cmd[]: Holds one or more mbox commands + * + * This structure is used with ioctl ISST_IF_MBOX_COMMAND to send + * one or more mailbox commands to PUNIT. Here IOCTL return value + * indicates number of commands sent or error number if no commands have + * been sent. + */ +struct isst_if_mbox_cmds { + __u32 cmd_count; + struct isst_if_mbox_cmd mbox_cmd[1]; +}; + #define ISST_IF_MAGIC 0xFE #define ISST_IF_GET_PLATFORM_INFO _IOR(ISST_IF_MAGIC, 0, struct isst_if_platform_info *) #define ISST_IF_GET_PHY_ID _IOWR(ISST_IF_MAGIC, 1, struct isst_if_cpu_map *) #define ISST_IF_IO_CMD _IOW(ISST_IF_MAGIC, 2, struct isst_if_io_regs *) +#define ISST_IF_MBOX_COMMAND _IOWR(ISST_IF_MAGIC, 3, struct isst_if_mbox_cmds *) #endif -- cgit v1.2.3-71-gd317 From e765f37b9b8b4fa65682e9a78a2ca2b11d3d9096 Mon Sep 17 00:00:00 2001 From: Srinivas Pandruvada Date: Wed, 26 Jun 2019 15:38:49 -0700 Subject: platform/x86: ISST: Add Intel Speed Select PUNIT MSR interface While using new non-architectural features using PUNIT Mailbox and MMIO read/write interface, there is still a need to operate using MSRs to control PUNIT. User space could have used the user-space MSR interface for this, but when user space MSR access is disabled, then it can't. 
Here only limited number of MSRs are allowed using this new interface. Signed-off-by: Srinivas Pandruvada Signed-off-by: Andy Shevchenko --- .../x86/intel_speed_select_if/isst_if_common.c | 59 ++++++++++++++++++++++ include/uapi/linux/isst_if.h | 32 ++++++++++++ 2 files changed, 91 insertions(+) (limited to 'include/uapi/linux') diff --git a/drivers/platform/x86/intel_speed_select_if/isst_if_common.c b/drivers/platform/x86/intel_speed_select_if/isst_if_common.c index 391fc3f12161..de2fb5292f1c 100644 --- a/drivers/platform/x86/intel_speed_select_if/isst_if_common.c +++ b/drivers/platform/x86/intel_speed_select_if/isst_if_common.c @@ -25,6 +25,11 @@ static struct isst_if_cmd_cb punit_callbacks[ISST_IF_DEV_MAX]; +static int punit_msr_white_list[] = { + MSR_TURBO_RATIO_LIMIT, + MSR_CONFIG_TDP_CONTROL, +}; + struct isst_valid_cmd_ranges { u16 cmd; u16 sub_cmd_beg; @@ -229,6 +234,54 @@ static long isst_if_proc_phyid_req(u8 *cmd_ptr, int *write_only, int resume) return 0; } +static bool match_punit_msr_white_list(int msr) +{ + int i; + + for (i = 0; i < ARRAY_SIZE(punit_msr_white_list); ++i) { + if (punit_msr_white_list[i] == msr) + return true; + } + + return false; +} + +static long isst_if_msr_cmd_req(u8 *cmd_ptr, int *write_only, int resume) +{ + struct isst_if_msr_cmd *msr_cmd; + int ret; + + msr_cmd = (struct isst_if_msr_cmd *)cmd_ptr; + + if (!match_punit_msr_white_list(msr_cmd->msr)) + return -EINVAL; + + if (msr_cmd->logical_cpu >= nr_cpu_ids) + return -EINVAL; + + if (msr_cmd->read_write) { + if (!capable(CAP_SYS_ADMIN)) + return -EPERM; + + ret = wrmsrl_safe_on_cpu(msr_cmd->logical_cpu, + msr_cmd->msr, + msr_cmd->data); + *write_only = 1; + } else { + u64 data; + + ret = rdmsrl_safe_on_cpu(msr_cmd->logical_cpu, + msr_cmd->msr, &data); + if (!ret) { + msr_cmd->data = data; + *write_only = 0; + } + } + + + return ret; +} + static long isst_if_exec_multi_cmd(void __user *argp, struct isst_if_cmd_cb *cb) { unsigned char __user *ptr; @@ -309,6 +362,12 @@ static 
long isst_if_def_ioctl(struct file *file, unsigned int cmd, if (cb->registered) ret = isst_if_exec_multi_cmd(argp, cb); break; + case ISST_IF_MSR_COMMAND: + cmd_cb.cmd_size = sizeof(struct isst_if_msr_cmd); + cmd_cb.offset = offsetof(struct isst_if_msr_cmds, msr_cmd); + cmd_cb.cmd_callback = isst_if_msr_cmd_req; + ret = isst_if_exec_multi_cmd(argp, &cmd_cb); + break; default: break; } diff --git a/include/uapi/linux/isst_if.h b/include/uapi/linux/isst_if.h index e4b1c2ec3279..d10b832c58c5 100644 --- a/include/uapi/linux/isst_if.h +++ b/include/uapi/linux/isst_if.h @@ -132,9 +132,41 @@ struct isst_if_mbox_cmds { struct isst_if_mbox_cmd mbox_cmd[1]; }; +/** + * struct isst_if_msr_cmd - Structure to define msr command + * @read_write: Value 0: Read, 1: Write + * @logical_cpu: Logical CPU number + * @msr: MSR number + * @data: For write operation, data to write, for read + * place holder + * + * Structure to specify MSR command related to PUNIT. + */ +struct isst_if_msr_cmd { + __u32 read_write; /* Read:0, Write:1 */ + __u32 logical_cpu; + __u64 msr; + __u64 data; +}; + +/** + * struct isst_if_msr_cmds - structure for msr commands + * @cmd_count: Number of MSR commands in msr_cmd[] + * @msr_cmd[]: Holds one or more msr commands + * + * This structure is used with ioctl ISST_IF_MSR_COMMAND to send + * one or more MSR commands. IOCTL return value indicates number of + * commands sent or error number if no commands have been sent. 
+ */ +struct isst_if_msr_cmds { + __u32 cmd_count; + struct isst_if_msr_cmd msr_cmd[1]; +}; + #define ISST_IF_MAGIC 0xFE #define ISST_IF_GET_PLATFORM_INFO _IOR(ISST_IF_MAGIC, 0, struct isst_if_platform_info *) #define ISST_IF_GET_PHY_ID _IOWR(ISST_IF_MAGIC, 1, struct isst_if_cpu_map *) #define ISST_IF_IO_CMD _IOW(ISST_IF_MAGIC, 2, struct isst_if_io_regs *) #define ISST_IF_MBOX_COMMAND _IOWR(ISST_IF_MAGIC, 3, struct isst_if_mbox_cmds *) +#define ISST_IF_MSR_COMMAND _IOWR(ISST_IF_MAGIC, 4, struct isst_if_msr_cmds *) #endif -- cgit v1.2.3-71-gd317 From f85f6e7bc9682a6d8b342c010cd6aa58521fdeec Mon Sep 17 00:00:00 2001 From: Wanpeng Li Date: Tue, 11 Jun 2019 20:23:48 +0800 Subject: KVM: X86: Yield to IPI target if necessary MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit When sending a call-function IPI-many to vCPUs, yield if any of the IPI target vCPUs was preempted, we just select the first preempted target vCPU which we found since the state of target vCPUs can change underneath and to avoid race conditions. Cc: Paolo Bonzini Cc: Radim Krčmář Cc: Liran Alon Signed-off-by: Wanpeng Li Signed-off-by: Paolo Bonzini --- Documentation/virtual/kvm/hypercalls.txt | 11 +++++++++++ arch/x86/include/uapi/asm/kvm_para.h | 1 + arch/x86/kernel/kvm.c | 21 +++++++++++++++++++++ include/uapi/linux/kvm_para.h | 1 + 4 files changed, 34 insertions(+) (limited to 'include/uapi/linux') diff --git a/Documentation/virtual/kvm/hypercalls.txt b/Documentation/virtual/kvm/hypercalls.txt index da24c138c8d1..da210651f714 100644 --- a/Documentation/virtual/kvm/hypercalls.txt +++ b/Documentation/virtual/kvm/hypercalls.txt @@ -141,3 +141,14 @@ a0 corresponds to the APIC ID in the third argument (a2), bit 1 corresponds to the APIC ID a2+1, and so on. Returns the number of CPUs to which the IPIs were delivered successfully. + +7. 
KVM_HC_SCHED_YIELD +------------------------ +Architecture: x86 +Status: active +Purpose: Hypercall used to yield if the IPI target vCPU is preempted + +a0: destination APIC ID + +Usage example: When sending a call-function IPI-many to vCPUs, yield if +any of the IPI target vCPUs was preempted. diff --git a/arch/x86/include/uapi/asm/kvm_para.h b/arch/x86/include/uapi/asm/kvm_para.h index 21d5f0240595..2a8e0b6b9805 100644 --- a/arch/x86/include/uapi/asm/kvm_para.h +++ b/arch/x86/include/uapi/asm/kvm_para.h @@ -30,6 +30,7 @@ #define KVM_FEATURE_ASYNC_PF_VMEXIT 10 #define KVM_FEATURE_PV_SEND_IPI 11 #define KVM_FEATURE_POLL_CONTROL 12 +#define KVM_FEATURE_PV_SCHED_YIELD 13 #define KVM_HINTS_REALTIME 0 diff --git a/arch/x86/kernel/kvm.c b/arch/x86/kernel/kvm.c index 5169b8cc35bb..82caf01b63dd 100644 --- a/arch/x86/kernel/kvm.c +++ b/arch/x86/kernel/kvm.c @@ -527,6 +527,21 @@ static void kvm_setup_pv_ipi(void) pr_info("KVM setup pv IPIs\n"); } +static void kvm_smp_send_call_func_ipi(const struct cpumask *mask) +{ + int cpu; + + native_send_call_func_ipi(mask); + + /* Make sure other vCPUs get a chance to run if they need to. 
*/ + for_each_cpu(cpu, mask) { + if (vcpu_is_preempted(cpu)) { + kvm_hypercall1(KVM_HC_SCHED_YIELD, per_cpu(x86_cpu_to_apicid, cpu)); + break; + } + } +} + static void __init kvm_smp_prepare_cpus(unsigned int max_cpus) { native_smp_prepare_cpus(max_cpus); @@ -638,6 +653,12 @@ static void __init kvm_guest_init(void) #ifdef CONFIG_SMP smp_ops.smp_prepare_cpus = kvm_smp_prepare_cpus; smp_ops.smp_prepare_boot_cpu = kvm_smp_prepare_boot_cpu; + if (kvm_para_has_feature(KVM_FEATURE_PV_SCHED_YIELD) && + !kvm_para_has_hint(KVM_HINTS_REALTIME) && + kvm_para_has_feature(KVM_FEATURE_STEAL_TIME)) { + smp_ops.send_call_func_ipi = kvm_smp_send_call_func_ipi; + pr_info("KVM setup pv sched yield\n"); + } if (cpuhp_setup_state_nocalls(CPUHP_AP_ONLINE_DYN, "x86/kvm:online", kvm_cpu_online, kvm_cpu_down_prepare) < 0) pr_err("kvm_guest: Failed to install cpu hotplug callbacks\n"); diff --git a/include/uapi/linux/kvm_para.h b/include/uapi/linux/kvm_para.h index 6c0ce49931e5..8b86609849b9 100644 --- a/include/uapi/linux/kvm_para.h +++ b/include/uapi/linux/kvm_para.h @@ -28,6 +28,7 @@ #define KVM_HC_MIPS_CONSOLE_OUTPUT 8 #define KVM_HC_CLOCK_PAIRING 9 #define KVM_HC_SEND_IPI 10 +#define KVM_HC_SCHED_YIELD 11 /* * hypercalls use architecture specific -- cgit v1.2.3-71-gd317 From 7a1ade847596dadc94b37e49f8c03f167fd71748 Mon Sep 17 00:00:00 2001 From: David Howells Date: Thu, 27 Jun 2019 23:03:07 +0100 Subject: keys: Provide KEYCTL_GRANT_PERMISSION Provide a keyctl() operation to grant/remove permissions. The grant operation, wrapped by libkeyutils, looks like: int ret = keyctl_grant_permission(key_serial_t key, enum key_ace_subject_type type, unsigned int subject, unsigned int perm); Where key is the key to be modified, type and subject represent the subject to which permission is to be granted (or removed) and perm is the set of permissions to be granted. 0 is returned on success. SET_SECURITY permission is required for this. 
The subject type currently must be KEY_ACE_SUBJ_STANDARD for the moment (other subject types will come along later). For subject type KEY_ACE_SUBJ_STANDARD, the following subject values are available: KEY_ACE_POSSESSOR The possessor of the key KEY_ACE_OWNER The owner of the key KEY_ACE_GROUP The key's group KEY_ACE_EVERYONE Everyone perm lists the permissions to be granted: KEY_ACE_VIEW Can view the key metadata KEY_ACE_READ Can read the key content KEY_ACE_WRITE Can update/modify the key content KEY_ACE_SEARCH Can find the key by searching/requesting KEY_ACE_LINK Can make a link to the key KEY_ACE_SET_SECURITY Can set security KEY_ACE_INVAL Can invalidate KEY_ACE_REVOKE Can revoke KEY_ACE_JOIN Can join this keyring KEY_ACE_CLEAR Can clear this keyring If an ACE already exists for the subject, then the permissions mask will be overwritten; if perm is 0, it will be deleted. Currently, the internal ACL is limited to a maximum of 16 entries. For example: int ret = keyctl_grant_permission(key, KEY_ACE_SUBJ_STANDARD, KEY_ACE_OWNER, KEY_ACE_VIEW | KEY_ACE_READ); Signed-off-by: David Howells --- include/uapi/linux/keyctl.h | 2 + security/keys/compat.c | 2 + security/keys/internal.h | 5 ++ security/keys/keyctl.c | 8 ++- security/keys/permission.c | 119 ++++++++++++++++++++++++++++++++++++++++++++ 5 files changed, 135 insertions(+), 1 deletion(-) (limited to 'include/uapi/linux') diff --git a/include/uapi/linux/keyctl.h b/include/uapi/linux/keyctl.h index e783bf957da8..1f7a4e737214 100644 --- a/include/uapi/linux/keyctl.h +++ b/include/uapi/linux/keyctl.h @@ -132,6 +132,7 @@ enum key_ace_standard_subject { #define KEYCTL_RESTRICT_KEYRING 29 /* Restrict keys allowed to link to a keyring */ #define KEYCTL_MOVE 30 /* Move keys between keyrings */ #define KEYCTL_CAPABILITIES 31 /* Find capabilities of keyrings subsystem */ +#define KEYCTL_GRANT_PERMISSION 32 /* Grant a permit to a key */ /* keyctl structures */ struct keyctl_dh_params { @@ -193,5 +194,6 @@ struct 
keyctl_pkey_params { #define KEYCTL_CAPS0_MOVE 0x80 /* KEYCTL_MOVE supported */ #define KEYCTL_CAPS1_NS_KEYRING_NAME 0x01 /* Keyring names are per-user_namespace */ #define KEYCTL_CAPS1_NS_KEY_TAG 0x02 /* Key indexing can include a namespace tag */ +#define KEYCTL_CAPS1_ACL_ALTERABLE 0x04 /* Keys have internal ACL that can be altered */ #endif /* _LINUX_KEYCTL_H */ diff --git a/security/keys/compat.c b/security/keys/compat.c index a53e30da20c5..1eebb9a237b8 100644 --- a/security/keys/compat.c +++ b/security/keys/compat.c @@ -161,6 +161,8 @@ COMPAT_SYSCALL_DEFINE5(keyctl, u32, option, case KEYCTL_MOVE: return keyctl_keyring_move(arg2, arg3, arg4, arg5); + case KEYCTL_GRANT_PERMISSION: + return keyctl_grant_permission(arg2, arg3, arg4, arg5); case KEYCTL_CAPABILITIES: return keyctl_capabilities(compat_ptr(arg2), arg3); diff --git a/security/keys/internal.h b/security/keys/internal.h index 9375d6289bb9..5e27ebdf1937 100644 --- a/security/keys/internal.h +++ b/security/keys/internal.h @@ -342,6 +342,11 @@ static inline long keyctl_pkey_e_d_s(int op, extern long keyctl_capabilities(unsigned char __user *_buffer, size_t buflen); +extern long keyctl_grant_permission(key_serial_t keyid, + enum key_ace_subject_type type, + unsigned int subject, + unsigned int perm); + /* * Debugging key validation */ diff --git a/security/keys/keyctl.c b/security/keys/keyctl.c index c8911b430e59..aa096c4080b2 100644 --- a/security/keys/keyctl.c +++ b/security/keys/keyctl.c @@ -41,7 +41,8 @@ static const unsigned char keyrings_capabilities[2] = { KEYCTL_CAPS0_MOVE ), [1] = (KEYCTL_CAPS1_NS_KEYRING_NAME | - KEYCTL_CAPS1_NS_KEY_TAG), + KEYCTL_CAPS1_NS_KEY_TAG | + KEYCTL_CAPS1_ACL_ALTERABLE), }; static int key_get_type_from_user(char *type, @@ -1891,6 +1892,11 @@ SYSCALL_DEFINE5(keyctl, int, option, unsigned long, arg2, unsigned long, arg3, (key_serial_t)arg3, (key_serial_t)arg4, (unsigned int)arg5); + case KEYCTL_GRANT_PERMISSION: + return keyctl_grant_permission((key_serial_t)arg2, + (enum 
key_ace_subject_type)arg3, + (unsigned int)arg4, + (unsigned int)arg5); case KEYCTL_CAPABILITIES: return keyctl_capabilities((unsigned char __user *)arg2, (size_t)arg3); diff --git a/security/keys/permission.c b/security/keys/permission.c index e3237bb2e970..11655a827ba1 100644 --- a/security/keys/permission.c +++ b/security/keys/permission.c @@ -278,3 +278,122 @@ long key_set_acl(struct key *key, struct key_acl *acl) key_put_acl(acl); return 0; } + +/* + * Allocate a new ACL with an extra ACE slot. + */ +static struct key_acl *key_alloc_acl(const struct key_acl *old_acl, int nr, int skip) +{ + struct key_acl *acl; + int nr_ace, i, j = 0; + + nr_ace = old_acl->nr_ace + nr; + if (nr_ace > 16) + return ERR_PTR(-EINVAL); + + acl = kzalloc(struct_size(acl, aces, nr_ace), GFP_KERNEL); + if (!acl) + return ERR_PTR(-ENOMEM); + + refcount_set(&acl->usage, 1); + acl->nr_ace = nr_ace; + for (i = 0; i < old_acl->nr_ace; i++) { + if (i == skip) + continue; + acl->aces[j] = old_acl->aces[i]; + j++; + } + return acl; +} + +/* + * Generate the revised ACL. + */ +static long key_change_acl(struct key *key, struct key_ace *new_ace) +{ + struct key_acl *acl, *old; + int i; + + old = rcu_dereference_protected(key->acl, lockdep_is_held(&key->sem)); + + for (i = 0; i < old->nr_ace; i++) + if (old->aces[i].type == new_ace->type && + old->aces[i].subject_id == new_ace->subject_id) + goto found_match; + + if (new_ace->perm == 0) + return 0; /* No permissions to remove. Add deny record? 
*/ + + acl = key_alloc_acl(old, 1, -1); + if (IS_ERR(acl)) + return PTR_ERR(acl); + acl->aces[i] = *new_ace; + goto change; + +found_match: + if (new_ace->perm == 0) + goto delete_ace; + if (new_ace->perm == old->aces[i].perm) + return 0; + acl = key_alloc_acl(old, 0, -1); + if (IS_ERR(acl)) + return PTR_ERR(acl); + acl->aces[i].perm = new_ace->perm; + goto change; + +delete_ace: + acl = key_alloc_acl(old, -1, i); + if (IS_ERR(acl)) + return PTR_ERR(acl); + goto change; + +change: + return key_set_acl(key, acl); +} + +/* + * Add, alter or remove (if perm == 0) an ACE in a key's ACL. + */ +long keyctl_grant_permission(key_serial_t keyid, + enum key_ace_subject_type type, + unsigned int subject, + unsigned int perm) +{ + struct key_ace new_ace; + struct key *key; + key_ref_t key_ref; + long ret; + + new_ace.type = type; + new_ace.perm = perm; + + switch (type) { + case KEY_ACE_SUBJ_STANDARD: + if (subject >= nr__key_ace_standard_subject) + return -ENOENT; + new_ace.subject_id = subject; + break; + + default: + return -ENOENT; + } + + key_ref = lookup_user_key(keyid, KEY_LOOKUP_PARTIAL, KEY_NEED_SETSEC); + if (IS_ERR(key_ref)) { + ret = PTR_ERR(key_ref); + goto error; + } + + key = key_ref_to_ptr(key_ref); + + down_write(&key->sem); + + /* If we're not the sysadmin, we can only change a key that we own */ + ret = -EACCES; + if (capable(CAP_SYS_ADMIN) || uid_eq(key->uid, current_fsuid())) + ret = key_change_acl(key, &new_ace); + up_write(&key->sem); + key_put(key); +error: + return ret; +} -- cgit v1.2.3-71-gd317 From 23729ff23186424e54b4d6678fcd526cdacef4d3 Mon Sep 17 00:00:00 2001 From: Stanislav Fomichev Date: Tue, 2 Jul 2019 09:13:56 -0700 Subject: bpf: add BPF_CGROUP_SOCK_OPS callback that is executed on every RTT Performance impact should be minimal because it's under a new BPF_SOCK_OPS_RTT_CB_FLAG flag that has to be explicitly enabled. 
Suggested-by: Eric Dumazet Cc: Eric Dumazet Cc: Priyaranjan Jha Cc: Yuchung Cheng Cc: Soheil Hassas Yeganeh Acked-by: Soheil Hassas Yeganeh Acked-by: Yuchung Cheng Signed-off-by: Stanislav Fomichev Signed-off-by: Daniel Borkmann --- include/net/tcp.h | 8 ++++++++ include/uapi/linux/bpf.h | 6 +++++- net/ipv4/tcp_input.c | 4 ++++ 3 files changed, 17 insertions(+), 1 deletion(-) (limited to 'include/uapi/linux') diff --git a/include/net/tcp.h b/include/net/tcp.h index 9d36cc88d043..e16d8a3fd3b4 100644 --- a/include/net/tcp.h +++ b/include/net/tcp.h @@ -2221,6 +2221,14 @@ static inline bool tcp_bpf_ca_needs_ecn(struct sock *sk) return (tcp_call_bpf(sk, BPF_SOCK_OPS_NEEDS_ECN, 0, NULL) == 1); } +static inline void tcp_bpf_rtt(struct sock *sk) +{ + struct tcp_sock *tp = tcp_sk(sk); + + if (BPF_SOCK_OPS_TEST_FLAG(tp, BPF_SOCK_OPS_RTT_CB_FLAG)) + tcp_call_bpf(sk, BPF_SOCK_OPS_RTT_CB, 0, NULL); +} + #if IS_ENABLED(CONFIG_SMC) extern struct static_key_false tcp_have_smc; #endif diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index cffea1826a1f..9cdd0aaeba06 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -1770,6 +1770,7 @@ union bpf_attr { * * **BPF_SOCK_OPS_RTO_CB_FLAG** (retransmission time out) * * **BPF_SOCK_OPS_RETRANS_CB_FLAG** (retransmission) * * **BPF_SOCK_OPS_STATE_CB_FLAG** (TCP state change) + * * **BPF_SOCK_OPS_RTT_CB_FLAG** (every RTT) * * Therefore, this function can be used to clear a callback flag by * setting the appropriate bit to zero. e.g. 
to disable the RTO @@ -3314,7 +3315,8 @@ struct bpf_sock_ops { #define BPF_SOCK_OPS_RTO_CB_FLAG (1<<0) #define BPF_SOCK_OPS_RETRANS_CB_FLAG (1<<1) #define BPF_SOCK_OPS_STATE_CB_FLAG (1<<2) -#define BPF_SOCK_OPS_ALL_CB_FLAGS 0x7 /* Mask of all currently +#define BPF_SOCK_OPS_RTT_CB_FLAG (1<<3) +#define BPF_SOCK_OPS_ALL_CB_FLAGS 0xF /* Mask of all currently * supported cb flags */ @@ -3369,6 +3371,8 @@ enum { BPF_SOCK_OPS_TCP_LISTEN_CB, /* Called on listen(2), right after * socket transition to LISTEN state. */ + BPF_SOCK_OPS_RTT_CB, /* Called on every RTT. + */ }; /* List of TCP states. There is a build check in net/ipv4/tcp.c to detect diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c index b71efeb0ae5b..c21e8a22fb3b 100644 --- a/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c @@ -778,6 +778,8 @@ static void tcp_rtt_estimator(struct sock *sk, long mrtt_us) tp->rttvar_us -= (tp->rttvar_us - tp->mdev_max_us) >> 2; tp->rtt_seq = tp->snd_nxt; tp->mdev_max_us = tcp_rto_min_us(sk); + + tcp_bpf_rtt(sk); } } else { /* no previous measure. */ @@ -786,6 +788,8 @@ static void tcp_rtt_estimator(struct sock *sk, long mrtt_us) tp->rttvar_us = max(tp->mdev_us, tcp_rto_min_us(sk)); tp->mdev_max_us = tp->rttvar_us; tp->rtt_seq = tp->snd_nxt; + + tcp_bpf_rtt(sk); } tp->srtt_us = max(1U, srtt); } -- cgit v1.2.3-71-gd317 From 0357746d1e40a8226f68a42c8d7222a12d7c451f Mon Sep 17 00:00:00 2001 From: Stanislav Fomichev Date: Tue, 2 Jul 2019 09:13:58 -0700 Subject: bpf: add dsack_dups/delivered{, _ce} to bpf_tcp_sock Add more fields to bpf_tcp_sock that might be useful for debugging congestion control issues. 
Cc: Eric Dumazet Cc: Priyaranjan Jha Cc: Yuchung Cheng Cc: Soheil Hassas Yeganeh Acked-by: Soheil Hassas Yeganeh Acked-by: Yuchung Cheng Signed-off-by: Stanislav Fomichev Signed-off-by: Daniel Borkmann --- include/uapi/linux/bpf.h | 5 +++++ net/core/filter.c | 11 ++++++++++- 2 files changed, 15 insertions(+), 1 deletion(-) (limited to 'include/uapi/linux') diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index 9cdd0aaeba06..bfb0b1a76684 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -3073,6 +3073,11 @@ struct bpf_tcp_sock { * sum(delta(snd_una)), or how many bytes * were acked. */ + __u32 dsack_dups; /* RFC4898 tcpEStatsStackDSACKDups + * total number of DSACK blocks received + */ + __u32 delivered; /* Total data packets delivered incl. rexmits */ + __u32 delivered_ce; /* Like the above but only ECE marked packets */ }; struct bpf_sock_tuple { diff --git a/net/core/filter.c b/net/core/filter.c index ad908526545d..3da4b6c38b46 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -5544,7 +5544,7 @@ static const struct bpf_func_proto bpf_sock_addr_sk_lookup_udp_proto = { bool bpf_tcp_sock_is_valid_access(int off, int size, enum bpf_access_type type, struct bpf_insn_access_aux *info) { - if (off < 0 || off >= offsetofend(struct bpf_tcp_sock, bytes_acked)) + if (off < 0 || off >= offsetofend(struct bpf_tcp_sock, delivered_ce)) return false; if (off % size != 0) @@ -5652,6 +5652,15 @@ u32 bpf_tcp_sock_convert_ctx_access(enum bpf_access_type type, case offsetof(struct bpf_tcp_sock, bytes_acked): BPF_TCP_SOCK_GET_COMMON(bytes_acked); break; + case offsetof(struct bpf_tcp_sock, dsack_dups): + BPF_TCP_SOCK_GET_COMMON(dsack_dups); + break; + case offsetof(struct bpf_tcp_sock, delivered): + BPF_TCP_SOCK_GET_COMMON(delivered); + break; + case offsetof(struct bpf_tcp_sock, delivered_ce): + BPF_TCP_SOCK_GET_COMMON(delivered_ce); + break; } return insn - insn_buf; -- cgit v1.2.3-71-gd317 From c2cb5e82a720c05b707701c75dfeb356fe184787 
Mon Sep 17 00:00:00 2001 From: Stanislav Fomichev Date: Tue, 2 Jul 2019 09:13:59 -0700 Subject: bpf: add icsk_retransmits to bpf_tcp_sock Add some inet_connection_sock fields to bpf_tcp_sock that might be useful for debugging congestion control issues. Cc: Eric Dumazet Cc: Priyaranjan Jha Cc: Yuchung Cheng Cc: Soheil Hassas Yeganeh Acked-by: Soheil Hassas Yeganeh Acked-by: Yuchung Cheng Signed-off-by: Stanislav Fomichev Signed-off-by: Daniel Borkmann --- include/uapi/linux/bpf.h | 1 + net/core/filter.c | 20 +++++++++++++++++++- 2 files changed, 20 insertions(+), 1 deletion(-) (limited to 'include/uapi/linux') diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index bfb0b1a76684..ead27aebf491 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -3078,6 +3078,7 @@ struct bpf_tcp_sock { */ __u32 delivered; /* Total data packets delivered incl. rexmits */ __u32 delivered_ce; /* Like the above but only ECE marked packets */ + __u32 icsk_retransmits; /* Number of unrecovered [RTO] timeouts */ }; struct bpf_sock_tuple { diff --git a/net/core/filter.c b/net/core/filter.c index 3da4b6c38b46..089aaea0ccc6 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -5544,7 +5544,8 @@ static const struct bpf_func_proto bpf_sock_addr_sk_lookup_udp_proto = { bool bpf_tcp_sock_is_valid_access(int off, int size, enum bpf_access_type type, struct bpf_insn_access_aux *info) { - if (off < 0 || off >= offsetofend(struct bpf_tcp_sock, delivered_ce)) + if (off < 0 || off >= offsetofend(struct bpf_tcp_sock, + icsk_retransmits)) return false; if (off % size != 0) @@ -5575,6 +5576,20 @@ u32 bpf_tcp_sock_convert_ctx_access(enum bpf_access_type type, offsetof(struct tcp_sock, FIELD)); \ } while (0) +#define BPF_INET_SOCK_GET_COMMON(FIELD) \ + do { \ + BUILD_BUG_ON(FIELD_SIZEOF(struct inet_connection_sock, \ + FIELD) > \ + FIELD_SIZEOF(struct bpf_tcp_sock, FIELD)); \ + *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF( \ + struct inet_connection_sock, \ + FIELD), \ + 
si->dst_reg, si->src_reg, \ + offsetof( \ + struct inet_connection_sock, \ + FIELD)); \ + } while (0) + if (insn > insn_buf) return insn - insn_buf; @@ -5661,6 +5676,9 @@ u32 bpf_tcp_sock_convert_ctx_access(enum bpf_access_type type, case offsetof(struct bpf_tcp_sock, delivered_ce): BPF_TCP_SOCK_GET_COMMON(delivered_ce); break; + case offsetof(struct bpf_tcp_sock, icsk_retransmits): + BPF_INET_SOCK_GET_COMMON(icsk_retransmits); + break; } return insn - insn_buf; -- cgit v1.2.3-71-gd317 From ecd6bf67da3126e8ec731c2dd8cb6c2f17d9563a Mon Sep 17 00:00:00 2001 From: Mark Greer Date: Wed, 26 Jun 2019 09:05:53 -0700 Subject: serial: mpsc: Remove obsolete MPSC driver Support for the Marvell MV64x60 line of bridge chips that contained MPSC controllers has been removed and there are no other components that have that controller so remove its driver. Signed-off-by: Mark Greer Link: https://lore.kernel.org/r/20190626160553.28518-1-mgreer@animalcreek.com Signed-off-by: Greg Kroah-Hartman --- Documentation/admin-guide/devices.txt | 4 +- drivers/tty/serial/Kconfig | 14 - drivers/tty/serial/Makefile | 1 - drivers/tty/serial/mpsc.c | 2138 --------------------------------- include/linux/mv643xx.h | 46 - include/uapi/linux/serial_core.h | 2 +- 6 files changed, 3 insertions(+), 2202 deletions(-) delete mode 100644 drivers/tty/serial/mpsc.c (limited to 'include/uapi/linux') diff --git a/Documentation/admin-guide/devices.txt b/Documentation/admin-guide/devices.txt index 1649117e6087..e56e00655153 100644 --- a/Documentation/admin-guide/devices.txt +++ b/Documentation/admin-guide/devices.txt @@ -2693,8 +2693,8 @@ 41 = /dev/ttySMX0 Motorola i.MX - port 0 42 = /dev/ttySMX1 Motorola i.MX - port 1 43 = /dev/ttySMX2 Motorola i.MX - port 2 - 44 = /dev/ttyMM0 Marvell MPSC - port 0 - 45 = /dev/ttyMM1 Marvell MPSC - port 1 + 44 = /dev/ttyMM0 Marvell MPSC - port 0 (obsolete unused) + 45 = /dev/ttyMM1 Marvell MPSC - port 1 (obsolete unused) 46 = /dev/ttyCPM0 PPC CPM (SCC or SMC) - port 0 ... 
47 = /dev/ttyCPM5 PPC CPM (SCC or SMC) - port 5 diff --git a/drivers/tty/serial/Kconfig b/drivers/tty/serial/Kconfig index 0d31251e04cc..b416c7b33f49 100644 --- a/drivers/tty/serial/Kconfig +++ b/drivers/tty/serial/Kconfig @@ -457,20 +457,6 @@ config SERIAL_21285_CONSOLE your boot loader (lilo or loadlin) about how to pass options to the kernel at boot time.) -config SERIAL_MPSC - bool "Marvell MPSC serial port support" - depends on MV64X60 - select SERIAL_CORE - help - Say Y here if you want to use the Marvell MPSC serial controller. - -config SERIAL_MPSC_CONSOLE - bool "Support for console on Marvell MPSC serial port" - depends on SERIAL_MPSC - select SERIAL_CORE_CONSOLE - help - Say Y here if you want to support a serial console on a Marvell MPSC. - config SERIAL_PXA bool "PXA serial port support (DEPRECATED)" depends on ARCH_PXA || ARCH_MMP diff --git a/drivers/tty/serial/Makefile b/drivers/tty/serial/Makefile index 79c3d513db7e..7cd7cabfa6c4 100644 --- a/drivers/tty/serial/Makefile +++ b/drivers/tty/serial/Makefile @@ -46,7 +46,6 @@ obj-$(CONFIG_SERIAL_CPM) += cpm_uart/ obj-$(CONFIG_SERIAL_IMX) += imx.o obj-$(CONFIG_SERIAL_MPC52xx) += mpc52xx_uart.o obj-$(CONFIG_SERIAL_ICOM) += icom.o -obj-$(CONFIG_SERIAL_MPSC) += mpsc.o obj-$(CONFIG_SERIAL_MESON) += meson_uart.o obj-$(CONFIG_SERIAL_SB1250_DUART) += sb1250-duart.o obj-$(CONFIG_SERIAL_SCCNXP) += sccnxp.o diff --git a/drivers/tty/serial/mpsc.c b/drivers/tty/serial/mpsc.c deleted file mode 100644 index 1f60d6fe4ff2..000000000000 --- a/drivers/tty/serial/mpsc.c +++ /dev/null @@ -1,2138 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0 -/* - * Generic driver for the MPSC (UART mode) on Marvell parts (e.g., GT64240, - * GT64260, MV64340, MV64360, GT96100, ... ). - * - * Author: Mark A. Greer - * - * Based on an old MPSC driver that was in the linuxppc tree. It appears to - * have been created by Chris Zankel (formerly of MontaVista) but there - * is no proper Copyright so I'm not sure. 
Apparently, parts were also - * taken from PPCBoot (now U-Boot). Also based on drivers/serial/8250.c - * by Russell King. - * - * 2004 (c) MontaVista, Software, Inc. - */ -/* - * The MPSC interface is much like a typical network controller's interface. - * That is, you set up separate rings of descriptors for transmitting and - * receiving data. There is also a pool of buffers with (one buffer per - * descriptor) that incoming data are dma'd into or outgoing data are dma'd - * out of. - * - * The MPSC requires two other controllers to be able to work. The Baud Rate - * Generator (BRG) provides a clock at programmable frequencies which determines - * the baud rate. The Serial DMA Controller (SDMA) takes incoming data from the - * MPSC and DMA's it into memory or DMA's outgoing data and passes it to the - * MPSC. It is actually the SDMA interrupt that the driver uses to keep the - * transmit and receive "engines" going (i.e., indicate data has been - * transmitted or received). - * - * NOTES: - * - * 1) Some chips have an erratum where several regs cannot be - * read. To work around that, we keep a local copy of those regs in - * 'mpsc_port_info'. - * - * 2) Some chips have an erratum where the ctlr will hang when the SDMA ctlr - * accesses system mem with coherency enabled. For that reason, the driver - * assumes that coherency for that ctlr has been disabled. This means - * that when in a cache coherent system, the driver has to manually manage - * the data cache on the areas that it touches because the dma_* macro are - * basically no-ops. - * - * 3) There is an erratum (on PPC) where you can't use the instruction to do - * a DMA_TO_DEVICE/cache clean so DMA_BIDIRECTIONAL/flushes are used in places - * where a DMA_TO_DEVICE/clean would have [otherwise] sufficed. - * - * 4) AFAICT, hardware flow control isn't supported by the controller --MAG. 
- */ - - -#if defined(CONFIG_SERIAL_MPSC_CONSOLE) && defined(CONFIG_MAGIC_SYSRQ) -#define SUPPORT_SYSRQ -#endif - -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -#include -#include - -#define MPSC_NUM_CTLRS 2 - -/* - * Descriptors and buffers must be cache line aligned. - * Buffers lengths must be multiple of cache line size. - * Number of Tx & Rx descriptors must be powers of 2. - */ -#define MPSC_RXR_ENTRIES 32 -#define MPSC_RXRE_SIZE dma_get_cache_alignment() -#define MPSC_RXR_SIZE (MPSC_RXR_ENTRIES * MPSC_RXRE_SIZE) -#define MPSC_RXBE_SIZE dma_get_cache_alignment() -#define MPSC_RXB_SIZE (MPSC_RXR_ENTRIES * MPSC_RXBE_SIZE) - -#define MPSC_TXR_ENTRIES 32 -#define MPSC_TXRE_SIZE dma_get_cache_alignment() -#define MPSC_TXR_SIZE (MPSC_TXR_ENTRIES * MPSC_TXRE_SIZE) -#define MPSC_TXBE_SIZE dma_get_cache_alignment() -#define MPSC_TXB_SIZE (MPSC_TXR_ENTRIES * MPSC_TXBE_SIZE) - -#define MPSC_DMA_ALLOC_SIZE (MPSC_RXR_SIZE + MPSC_RXB_SIZE + MPSC_TXR_SIZE \ - + MPSC_TXB_SIZE + dma_get_cache_alignment() /* for alignment */) - -/* Rx and Tx Ring entry descriptors -- assume entry size is <= cacheline size */ -struct mpsc_rx_desc { - u16 bufsize; - u16 bytecnt; - u32 cmdstat; - u32 link; - u32 buf_ptr; -} __attribute((packed)); - -struct mpsc_tx_desc { - u16 bytecnt; - u16 shadow; - u32 cmdstat; - u32 link; - u32 buf_ptr; -} __attribute((packed)); - -/* - * Some regs that have the erratum that you can't read them are are shared - * between the two MPSC controllers. This struct contains those shared regs. 
- */ -struct mpsc_shared_regs { - phys_addr_t mpsc_routing_base_p; - phys_addr_t sdma_intr_base_p; - - void __iomem *mpsc_routing_base; - void __iomem *sdma_intr_base; - - u32 MPSC_MRR_m; - u32 MPSC_RCRR_m; - u32 MPSC_TCRR_m; - u32 SDMA_INTR_CAUSE_m; - u32 SDMA_INTR_MASK_m; -}; - -/* The main driver data structure */ -struct mpsc_port_info { - struct uart_port port; /* Overlay uart_port structure */ - - /* Internal driver state for this ctlr */ - u8 ready; - u8 rcv_data; - - /* Info passed in from platform */ - u8 mirror_regs; /* Need to mirror regs? */ - u8 cache_mgmt; /* Need manual cache mgmt? */ - u8 brg_can_tune; /* BRG has baud tuning? */ - u32 brg_clk_src; - u16 mpsc_max_idle; - int default_baud; - int default_bits; - int default_parity; - int default_flow; - - /* Physical addresses of various blocks of registers (from platform) */ - phys_addr_t mpsc_base_p; - phys_addr_t sdma_base_p; - phys_addr_t brg_base_p; - - /* Virtual addresses of various blocks of registers (from platform) */ - void __iomem *mpsc_base; - void __iomem *sdma_base; - void __iomem *brg_base; - - /* Descriptor ring and buffer allocations */ - void *dma_region; - dma_addr_t dma_region_p; - - dma_addr_t rxr; /* Rx descriptor ring */ - dma_addr_t rxr_p; /* Phys addr of rxr */ - u8 *rxb; /* Rx Ring I/O buf */ - u8 *rxb_p; /* Phys addr of rxb */ - u32 rxr_posn; /* First desc w/ Rx data */ - - dma_addr_t txr; /* Tx descriptor ring */ - dma_addr_t txr_p; /* Phys addr of txr */ - u8 *txb; /* Tx Ring I/O buf */ - u8 *txb_p; /* Phys addr of txb */ - int txr_head; /* Where new data goes */ - int txr_tail; /* Where sent data comes off */ - spinlock_t tx_lock; /* transmit lock */ - - /* Mirrored values of regs we can't read (if 'mirror_regs' set) */ - u32 MPSC_MPCR_m; - u32 MPSC_CHR_1_m; - u32 MPSC_CHR_2_m; - u32 MPSC_CHR_10_m; - u32 BRG_BCR_m; - struct mpsc_shared_regs *shared_regs; -}; - -/* Hooks to platform-specific code */ -int mpsc_platform_register_driver(void); -void 
mpsc_platform_unregister_driver(void); - -/* Hooks back in to mpsc common to be called by platform-specific code */ -struct mpsc_port_info *mpsc_device_probe(int index); -struct mpsc_port_info *mpsc_device_remove(int index); - -/* Main MPSC Configuration Register Offsets */ -#define MPSC_MMCRL 0x0000 -#define MPSC_MMCRH 0x0004 -#define MPSC_MPCR 0x0008 -#define MPSC_CHR_1 0x000c -#define MPSC_CHR_2 0x0010 -#define MPSC_CHR_3 0x0014 -#define MPSC_CHR_4 0x0018 -#define MPSC_CHR_5 0x001c -#define MPSC_CHR_6 0x0020 -#define MPSC_CHR_7 0x0024 -#define MPSC_CHR_8 0x0028 -#define MPSC_CHR_9 0x002c -#define MPSC_CHR_10 0x0030 -#define MPSC_CHR_11 0x0034 - -#define MPSC_MPCR_FRZ (1 << 9) -#define MPSC_MPCR_CL_5 0 -#define MPSC_MPCR_CL_6 1 -#define MPSC_MPCR_CL_7 2 -#define MPSC_MPCR_CL_8 3 -#define MPSC_MPCR_SBL_1 0 -#define MPSC_MPCR_SBL_2 1 - -#define MPSC_CHR_2_TEV (1<<1) -#define MPSC_CHR_2_TA (1<<7) -#define MPSC_CHR_2_TTCS (1<<9) -#define MPSC_CHR_2_REV (1<<17) -#define MPSC_CHR_2_RA (1<<23) -#define MPSC_CHR_2_CRD (1<<25) -#define MPSC_CHR_2_EH (1<<31) -#define MPSC_CHR_2_PAR_ODD 0 -#define MPSC_CHR_2_PAR_SPACE 1 -#define MPSC_CHR_2_PAR_EVEN 2 -#define MPSC_CHR_2_PAR_MARK 3 - -/* MPSC Signal Routing */ -#define MPSC_MRR 0x0000 -#define MPSC_RCRR 0x0004 -#define MPSC_TCRR 0x0008 - -/* Serial DMA Controller Interface Registers */ -#define SDMA_SDC 0x0000 -#define SDMA_SDCM 0x0008 -#define SDMA_RX_DESC 0x0800 -#define SDMA_RX_BUF_PTR 0x0808 -#define SDMA_SCRDP 0x0810 -#define SDMA_TX_DESC 0x0c00 -#define SDMA_SCTDP 0x0c10 -#define SDMA_SFTDP 0x0c14 - -#define SDMA_DESC_CMDSTAT_PE (1<<0) -#define SDMA_DESC_CMDSTAT_CDL (1<<1) -#define SDMA_DESC_CMDSTAT_FR (1<<3) -#define SDMA_DESC_CMDSTAT_OR (1<<6) -#define SDMA_DESC_CMDSTAT_BR (1<<9) -#define SDMA_DESC_CMDSTAT_MI (1<<10) -#define SDMA_DESC_CMDSTAT_A (1<<11) -#define SDMA_DESC_CMDSTAT_AM (1<<12) -#define SDMA_DESC_CMDSTAT_CT (1<<13) -#define SDMA_DESC_CMDSTAT_C (1<<14) -#define SDMA_DESC_CMDSTAT_ES (1<<15) -#define 
SDMA_DESC_CMDSTAT_L (1<<16) -#define SDMA_DESC_CMDSTAT_F (1<<17) -#define SDMA_DESC_CMDSTAT_P (1<<18) -#define SDMA_DESC_CMDSTAT_EI (1<<23) -#define SDMA_DESC_CMDSTAT_O (1<<31) - -#define SDMA_DESC_DFLT (SDMA_DESC_CMDSTAT_O \ - | SDMA_DESC_CMDSTAT_EI) - -#define SDMA_SDC_RFT (1<<0) -#define SDMA_SDC_SFM (1<<1) -#define SDMA_SDC_BLMR (1<<6) -#define SDMA_SDC_BLMT (1<<7) -#define SDMA_SDC_POVR (1<<8) -#define SDMA_SDC_RIFB (1<<9) - -#define SDMA_SDCM_ERD (1<<7) -#define SDMA_SDCM_AR (1<<15) -#define SDMA_SDCM_STD (1<<16) -#define SDMA_SDCM_TXD (1<<23) -#define SDMA_SDCM_AT (1<<31) - -#define SDMA_0_CAUSE_RXBUF (1<<0) -#define SDMA_0_CAUSE_RXERR (1<<1) -#define SDMA_0_CAUSE_TXBUF (1<<2) -#define SDMA_0_CAUSE_TXEND (1<<3) -#define SDMA_1_CAUSE_RXBUF (1<<8) -#define SDMA_1_CAUSE_RXERR (1<<9) -#define SDMA_1_CAUSE_TXBUF (1<<10) -#define SDMA_1_CAUSE_TXEND (1<<11) - -#define SDMA_CAUSE_RX_MASK (SDMA_0_CAUSE_RXBUF | SDMA_0_CAUSE_RXERR \ - | SDMA_1_CAUSE_RXBUF | SDMA_1_CAUSE_RXERR) -#define SDMA_CAUSE_TX_MASK (SDMA_0_CAUSE_TXBUF | SDMA_0_CAUSE_TXEND \ - | SDMA_1_CAUSE_TXBUF | SDMA_1_CAUSE_TXEND) - -/* SDMA Interrupt registers */ -#define SDMA_INTR_CAUSE 0x0000 -#define SDMA_INTR_MASK 0x0080 - -/* Baud Rate Generator Interface Registers */ -#define BRG_BCR 0x0000 -#define BRG_BTR 0x0004 - -/* - * Define how this driver is known to the outside (we've been assigned a - * range on the "Low-density serial ports" major). 
- */ -#define MPSC_MAJOR 204 -#define MPSC_MINOR_START 44 -#define MPSC_DRIVER_NAME "MPSC" -#define MPSC_DEV_NAME "ttyMM" -#define MPSC_VERSION "1.00" - -static struct mpsc_port_info mpsc_ports[MPSC_NUM_CTLRS]; -static struct mpsc_shared_regs mpsc_shared_regs; -static struct uart_driver mpsc_reg; - -static void mpsc_start_rx(struct mpsc_port_info *pi); -static void mpsc_free_ring_mem(struct mpsc_port_info *pi); -static void mpsc_release_port(struct uart_port *port); -/* - ****************************************************************************** - * - * Baud Rate Generator Routines (BRG) - * - ****************************************************************************** - */ -static void mpsc_brg_init(struct mpsc_port_info *pi, u32 clk_src) -{ - u32 v; - - v = (pi->mirror_regs) ? pi->BRG_BCR_m : readl(pi->brg_base + BRG_BCR); - v = (v & ~(0xf << 18)) | ((clk_src & 0xf) << 18); - - if (pi->brg_can_tune) - v &= ~(1 << 25); - - if (pi->mirror_regs) - pi->BRG_BCR_m = v; - writel(v, pi->brg_base + BRG_BCR); - - writel(readl(pi->brg_base + BRG_BTR) & 0xffff0000, - pi->brg_base + BRG_BTR); -} - -static void mpsc_brg_enable(struct mpsc_port_info *pi) -{ - u32 v; - - v = (pi->mirror_regs) ? pi->BRG_BCR_m : readl(pi->brg_base + BRG_BCR); - v |= (1 << 16); - - if (pi->mirror_regs) - pi->BRG_BCR_m = v; - writel(v, pi->brg_base + BRG_BCR); -} - -static void mpsc_brg_disable(struct mpsc_port_info *pi) -{ - u32 v; - - v = (pi->mirror_regs) ? pi->BRG_BCR_m : readl(pi->brg_base + BRG_BCR); - v &= ~(1 << 16); - - if (pi->mirror_regs) - pi->BRG_BCR_m = v; - writel(v, pi->brg_base + BRG_BCR); -} - -/* - * To set the baud, we adjust the CDV field in the BRG_BCR reg. - * From manual: Baud = clk / ((CDV+1)*2) ==> CDV = (clk / (baud*2)) - 1. - * However, the input clock is divided by 16 in the MPSC b/c of how - * 'MPSC_MMCRH' was set up so we have to divide the 'clk' used in our - * calculation by 16 to account for that. 
So the real calculation - * that accounts for the way the mpsc is set up is: - * CDV = (clk / (baud*2*16)) - 1 ==> CDV = (clk / (baud << 5)) - 1. - */ -static void mpsc_set_baudrate(struct mpsc_port_info *pi, u32 baud) -{ - u32 cdv = (pi->port.uartclk / (baud << 5)) - 1; - u32 v; - - mpsc_brg_disable(pi); - v = (pi->mirror_regs) ? pi->BRG_BCR_m : readl(pi->brg_base + BRG_BCR); - v = (v & 0xffff0000) | (cdv & 0xffff); - - if (pi->mirror_regs) - pi->BRG_BCR_m = v; - writel(v, pi->brg_base + BRG_BCR); - mpsc_brg_enable(pi); -} - -/* - ****************************************************************************** - * - * Serial DMA Routines (SDMA) - * - ****************************************************************************** - */ - -static void mpsc_sdma_burstsize(struct mpsc_port_info *pi, u32 burst_size) -{ - u32 v; - - pr_debug("mpsc_sdma_burstsize[%d]: burst_size: %d\n", - pi->port.line, burst_size); - - burst_size >>= 3; /* Divide by 8 b/c reg values are 8-byte chunks */ - - if (burst_size < 2) - v = 0x0; /* 1 64-bit word */ - else if (burst_size < 4) - v = 0x1; /* 2 64-bit words */ - else if (burst_size < 8) - v = 0x2; /* 4 64-bit words */ - else - v = 0x3; /* 8 64-bit words */ - - writel((readl(pi->sdma_base + SDMA_SDC) & (0x3 << 12)) | (v << 12), - pi->sdma_base + SDMA_SDC); -} - -static void mpsc_sdma_init(struct mpsc_port_info *pi, u32 burst_size) -{ - pr_debug("mpsc_sdma_init[%d]: burst_size: %d\n", pi->port.line, - burst_size); - - writel((readl(pi->sdma_base + SDMA_SDC) & 0x3ff) | 0x03f, - pi->sdma_base + SDMA_SDC); - mpsc_sdma_burstsize(pi, burst_size); -} - -static u32 mpsc_sdma_intr_mask(struct mpsc_port_info *pi, u32 mask) -{ - u32 old, v; - - pr_debug("mpsc_sdma_intr_mask[%d]: mask: 0x%x\n", pi->port.line, mask); - - old = v = (pi->mirror_regs) ? 
pi->shared_regs->SDMA_INTR_MASK_m : - readl(pi->shared_regs->sdma_intr_base + SDMA_INTR_MASK); - - mask &= 0xf; - if (pi->port.line) - mask <<= 8; - v &= ~mask; - - if (pi->mirror_regs) - pi->shared_regs->SDMA_INTR_MASK_m = v; - writel(v, pi->shared_regs->sdma_intr_base + SDMA_INTR_MASK); - - if (pi->port.line) - old >>= 8; - return old & 0xf; -} - -static void mpsc_sdma_intr_unmask(struct mpsc_port_info *pi, u32 mask) -{ - u32 v; - - pr_debug("mpsc_sdma_intr_unmask[%d]: mask: 0x%x\n", pi->port.line,mask); - - v = (pi->mirror_regs) ? pi->shared_regs->SDMA_INTR_MASK_m - : readl(pi->shared_regs->sdma_intr_base + SDMA_INTR_MASK); - - mask &= 0xf; - if (pi->port.line) - mask <<= 8; - v |= mask; - - if (pi->mirror_regs) - pi->shared_regs->SDMA_INTR_MASK_m = v; - writel(v, pi->shared_regs->sdma_intr_base + SDMA_INTR_MASK); -} - -static void mpsc_sdma_intr_ack(struct mpsc_port_info *pi) -{ - pr_debug("mpsc_sdma_intr_ack[%d]: Acknowledging IRQ\n", pi->port.line); - - if (pi->mirror_regs) - pi->shared_regs->SDMA_INTR_CAUSE_m = 0; - writeb(0x00, pi->shared_regs->sdma_intr_base + SDMA_INTR_CAUSE - + pi->port.line); -} - -static void mpsc_sdma_set_rx_ring(struct mpsc_port_info *pi, - struct mpsc_rx_desc *rxre_p) -{ - pr_debug("mpsc_sdma_set_rx_ring[%d]: rxre_p: 0x%x\n", - pi->port.line, (u32)rxre_p); - - writel((u32)rxre_p, pi->sdma_base + SDMA_SCRDP); -} - -static void mpsc_sdma_set_tx_ring(struct mpsc_port_info *pi, - struct mpsc_tx_desc *txre_p) -{ - writel((u32)txre_p, pi->sdma_base + SDMA_SFTDP); - writel((u32)txre_p, pi->sdma_base + SDMA_SCTDP); -} - -static void mpsc_sdma_cmd(struct mpsc_port_info *pi, u32 val) -{ - u32 v; - - v = readl(pi->sdma_base + SDMA_SDCM); - if (val) - v |= val; - else - v = 0; - wmb(); - writel(v, pi->sdma_base + SDMA_SDCM); - wmb(); -} - -static uint mpsc_sdma_tx_active(struct mpsc_port_info *pi) -{ - return readl(pi->sdma_base + SDMA_SDCM) & SDMA_SDCM_TXD; -} - -static void mpsc_sdma_start_tx(struct mpsc_port_info *pi) -{ - struct 
mpsc_tx_desc *txre, *txre_p; - - /* If tx isn't running & there's a desc ready to go, start it */ - if (!mpsc_sdma_tx_active(pi)) { - txre = (struct mpsc_tx_desc *)(pi->txr - + (pi->txr_tail * MPSC_TXRE_SIZE)); - dma_cache_sync(pi->port.dev, (void *)txre, MPSC_TXRE_SIZE, - DMA_FROM_DEVICE); -#if defined(CONFIG_PPC32) && !defined(CONFIG_NOT_COHERENT_CACHE) - if (pi->cache_mgmt) /* GT642[46]0 Res #COMM-2 */ - invalidate_dcache_range((ulong)txre, - (ulong)txre + MPSC_TXRE_SIZE); -#endif - - if (be32_to_cpu(txre->cmdstat) & SDMA_DESC_CMDSTAT_O) { - txre_p = (struct mpsc_tx_desc *) - (pi->txr_p + (pi->txr_tail * MPSC_TXRE_SIZE)); - - mpsc_sdma_set_tx_ring(pi, txre_p); - mpsc_sdma_cmd(pi, SDMA_SDCM_STD | SDMA_SDCM_TXD); - } - } -} - -static void mpsc_sdma_stop(struct mpsc_port_info *pi) -{ - pr_debug("mpsc_sdma_stop[%d]: Stopping SDMA\n", pi->port.line); - - /* Abort any SDMA transfers */ - mpsc_sdma_cmd(pi, 0); - mpsc_sdma_cmd(pi, SDMA_SDCM_AR | SDMA_SDCM_AT); - - /* Clear the SDMA current and first TX and RX pointers */ - mpsc_sdma_set_tx_ring(pi, NULL); - mpsc_sdma_set_rx_ring(pi, NULL); - - /* Disable interrupts */ - mpsc_sdma_intr_mask(pi, 0xf); - mpsc_sdma_intr_ack(pi); -} - -/* - ****************************************************************************** - * - * Multi-Protocol Serial Controller Routines (MPSC) - * - ****************************************************************************** - */ - -static void mpsc_hw_init(struct mpsc_port_info *pi) -{ - u32 v; - - pr_debug("mpsc_hw_init[%d]: Initializing hardware\n", pi->port.line); - - /* Set up clock routing */ - if (pi->mirror_regs) { - v = pi->shared_regs->MPSC_MRR_m; - v &= ~0x1c7; - pi->shared_regs->MPSC_MRR_m = v; - writel(v, pi->shared_regs->mpsc_routing_base + MPSC_MRR); - - v = pi->shared_regs->MPSC_RCRR_m; - v = (v & ~0xf0f) | 0x100; - pi->shared_regs->MPSC_RCRR_m = v; - writel(v, pi->shared_regs->mpsc_routing_base + MPSC_RCRR); - - v = pi->shared_regs->MPSC_TCRR_m; - v = (v & ~0xf0f) | 0x100; - 
pi->shared_regs->MPSC_TCRR_m = v; - writel(v, pi->shared_regs->mpsc_routing_base + MPSC_TCRR); - } else { - v = readl(pi->shared_regs->mpsc_routing_base + MPSC_MRR); - v &= ~0x1c7; - writel(v, pi->shared_regs->mpsc_routing_base + MPSC_MRR); - - v = readl(pi->shared_regs->mpsc_routing_base + MPSC_RCRR); - v = (v & ~0xf0f) | 0x100; - writel(v, pi->shared_regs->mpsc_routing_base + MPSC_RCRR); - - v = readl(pi->shared_regs->mpsc_routing_base + MPSC_TCRR); - v = (v & ~0xf0f) | 0x100; - writel(v, pi->shared_regs->mpsc_routing_base + MPSC_TCRR); - } - - /* Put MPSC in UART mode & enabel Tx/Rx egines */ - writel(0x000004c4, pi->mpsc_base + MPSC_MMCRL); - - /* No preamble, 16x divider, low-latency, */ - writel(0x04400400, pi->mpsc_base + MPSC_MMCRH); - mpsc_set_baudrate(pi, pi->default_baud); - - if (pi->mirror_regs) { - pi->MPSC_CHR_1_m = 0; - pi->MPSC_CHR_2_m = 0; - } - writel(0, pi->mpsc_base + MPSC_CHR_1); - writel(0, pi->mpsc_base + MPSC_CHR_2); - writel(pi->mpsc_max_idle, pi->mpsc_base + MPSC_CHR_3); - writel(0, pi->mpsc_base + MPSC_CHR_4); - writel(0, pi->mpsc_base + MPSC_CHR_5); - writel(0, pi->mpsc_base + MPSC_CHR_6); - writel(0, pi->mpsc_base + MPSC_CHR_7); - writel(0, pi->mpsc_base + MPSC_CHR_8); - writel(0, pi->mpsc_base + MPSC_CHR_9); - writel(0, pi->mpsc_base + MPSC_CHR_10); -} - -static void mpsc_enter_hunt(struct mpsc_port_info *pi) -{ - pr_debug("mpsc_enter_hunt[%d]: Hunting...\n", pi->port.line); - - if (pi->mirror_regs) { - writel(pi->MPSC_CHR_2_m | MPSC_CHR_2_EH, - pi->mpsc_base + MPSC_CHR_2); - /* Erratum prevents reading CHR_2 so just delay for a while */ - udelay(100); - } else { - writel(readl(pi->mpsc_base + MPSC_CHR_2) | MPSC_CHR_2_EH, - pi->mpsc_base + MPSC_CHR_2); - - while (readl(pi->mpsc_base + MPSC_CHR_2) & MPSC_CHR_2_EH) - udelay(10); - } -} - -static void mpsc_freeze(struct mpsc_port_info *pi) -{ - u32 v; - - pr_debug("mpsc_freeze[%d]: Freezing\n", pi->port.line); - - v = (pi->mirror_regs) ? 
pi->MPSC_MPCR_m : - readl(pi->mpsc_base + MPSC_MPCR); - v |= MPSC_MPCR_FRZ; - - if (pi->mirror_regs) - pi->MPSC_MPCR_m = v; - writel(v, pi->mpsc_base + MPSC_MPCR); -} - -static void mpsc_unfreeze(struct mpsc_port_info *pi) -{ - u32 v; - - v = (pi->mirror_regs) ? pi->MPSC_MPCR_m : - readl(pi->mpsc_base + MPSC_MPCR); - v &= ~MPSC_MPCR_FRZ; - - if (pi->mirror_regs) - pi->MPSC_MPCR_m = v; - writel(v, pi->mpsc_base + MPSC_MPCR); - - pr_debug("mpsc_unfreeze[%d]: Unfrozen\n", pi->port.line); -} - -static void mpsc_set_char_length(struct mpsc_port_info *pi, u32 len) -{ - u32 v; - - pr_debug("mpsc_set_char_length[%d]: char len: %d\n", pi->port.line,len); - - v = (pi->mirror_regs) ? pi->MPSC_MPCR_m : - readl(pi->mpsc_base + MPSC_MPCR); - v = (v & ~(0x3 << 12)) | ((len & 0x3) << 12); - - if (pi->mirror_regs) - pi->MPSC_MPCR_m = v; - writel(v, pi->mpsc_base + MPSC_MPCR); -} - -static void mpsc_set_stop_bit_length(struct mpsc_port_info *pi, u32 len) -{ - u32 v; - - pr_debug("mpsc_set_stop_bit_length[%d]: stop bits: %d\n", - pi->port.line, len); - - v = (pi->mirror_regs) ? pi->MPSC_MPCR_m : - readl(pi->mpsc_base + MPSC_MPCR); - - v = (v & ~(1 << 14)) | ((len & 0x1) << 14); - - if (pi->mirror_regs) - pi->MPSC_MPCR_m = v; - writel(v, pi->mpsc_base + MPSC_MPCR); -} - -static void mpsc_set_parity(struct mpsc_port_info *pi, u32 p) -{ - u32 v; - - pr_debug("mpsc_set_parity[%d]: parity bits: 0x%x\n", pi->port.line, p); - - v = (pi->mirror_regs) ? 
pi->MPSC_CHR_2_m : - readl(pi->mpsc_base + MPSC_CHR_2); - - p &= 0x3; - v = (v & ~0xc000c) | (p << 18) | (p << 2); - - if (pi->mirror_regs) - pi->MPSC_CHR_2_m = v; - writel(v, pi->mpsc_base + MPSC_CHR_2); -} - -/* - ****************************************************************************** - * - * Driver Init Routines - * - ****************************************************************************** - */ - -static void mpsc_init_hw(struct mpsc_port_info *pi) -{ - pr_debug("mpsc_init_hw[%d]: Initializing\n", pi->port.line); - - mpsc_brg_init(pi, pi->brg_clk_src); - mpsc_brg_enable(pi); - mpsc_sdma_init(pi, dma_get_cache_alignment()); /* burst a cacheline */ - mpsc_sdma_stop(pi); - mpsc_hw_init(pi); -} - -static int mpsc_alloc_ring_mem(struct mpsc_port_info *pi) -{ - int rc = 0; - - pr_debug("mpsc_alloc_ring_mem[%d]: Allocating ring mem\n", - pi->port.line); - - if (!pi->dma_region) { - if (!dma_set_mask(pi->port.dev, 0xffffffff)) { - printk(KERN_ERR "MPSC: Inadequate DMA support\n"); - rc = -ENXIO; - } else if ((pi->dma_region = dma_alloc_attrs(pi->port.dev, - MPSC_DMA_ALLOC_SIZE, - &pi->dma_region_p, GFP_KERNEL, - DMA_ATTR_NON_CONSISTENT)) - == NULL) { - printk(KERN_ERR "MPSC: Can't alloc Desc region\n"); - rc = -ENOMEM; - } - } - - return rc; -} - -static void mpsc_free_ring_mem(struct mpsc_port_info *pi) -{ - pr_debug("mpsc_free_ring_mem[%d]: Freeing ring mem\n", pi->port.line); - - if (pi->dma_region) { - dma_free_attrs(pi->port.dev, MPSC_DMA_ALLOC_SIZE, - pi->dma_region, pi->dma_region_p, - DMA_ATTR_NON_CONSISTENT); - pi->dma_region = NULL; - pi->dma_region_p = (dma_addr_t)NULL; - } -} - -static void mpsc_init_rings(struct mpsc_port_info *pi) -{ - struct mpsc_rx_desc *rxre; - struct mpsc_tx_desc *txre; - dma_addr_t dp, dp_p; - u8 *bp, *bp_p; - int i; - - pr_debug("mpsc_init_rings[%d]: Initializing rings\n", pi->port.line); - - BUG_ON(pi->dma_region == NULL); - - memset(pi->dma_region, 0, MPSC_DMA_ALLOC_SIZE); - - /* - * Descriptors & buffers are multiples 
of cacheline size and must be - * cacheline aligned. - */ - dp = ALIGN((u32)pi->dma_region, dma_get_cache_alignment()); - dp_p = ALIGN((u32)pi->dma_region_p, dma_get_cache_alignment()); - - /* - * Partition dma region into rx ring descriptor, rx buffers, - * tx ring descriptors, and tx buffers. - */ - pi->rxr = dp; - pi->rxr_p = dp_p; - dp += MPSC_RXR_SIZE; - dp_p += MPSC_RXR_SIZE; - - pi->rxb = (u8 *)dp; - pi->rxb_p = (u8 *)dp_p; - dp += MPSC_RXB_SIZE; - dp_p += MPSC_RXB_SIZE; - - pi->rxr_posn = 0; - - pi->txr = dp; - pi->txr_p = dp_p; - dp += MPSC_TXR_SIZE; - dp_p += MPSC_TXR_SIZE; - - pi->txb = (u8 *)dp; - pi->txb_p = (u8 *)dp_p; - - pi->txr_head = 0; - pi->txr_tail = 0; - - /* Init rx ring descriptors */ - dp = pi->rxr; - dp_p = pi->rxr_p; - bp = pi->rxb; - bp_p = pi->rxb_p; - - for (i = 0; i < MPSC_RXR_ENTRIES; i++) { - rxre = (struct mpsc_rx_desc *)dp; - - rxre->bufsize = cpu_to_be16(MPSC_RXBE_SIZE); - rxre->bytecnt = cpu_to_be16(0); - rxre->cmdstat = cpu_to_be32(SDMA_DESC_CMDSTAT_O - | SDMA_DESC_CMDSTAT_EI | SDMA_DESC_CMDSTAT_F - | SDMA_DESC_CMDSTAT_L); - rxre->link = cpu_to_be32(dp_p + MPSC_RXRE_SIZE); - rxre->buf_ptr = cpu_to_be32(bp_p); - - dp += MPSC_RXRE_SIZE; - dp_p += MPSC_RXRE_SIZE; - bp += MPSC_RXBE_SIZE; - bp_p += MPSC_RXBE_SIZE; - } - rxre->link = cpu_to_be32(pi->rxr_p); /* Wrap last back to first */ - - /* Init tx ring descriptors */ - dp = pi->txr; - dp_p = pi->txr_p; - bp = pi->txb; - bp_p = pi->txb_p; - - for (i = 0; i < MPSC_TXR_ENTRIES; i++) { - txre = (struct mpsc_tx_desc *)dp; - - txre->link = cpu_to_be32(dp_p + MPSC_TXRE_SIZE); - txre->buf_ptr = cpu_to_be32(bp_p); - - dp += MPSC_TXRE_SIZE; - dp_p += MPSC_TXRE_SIZE; - bp += MPSC_TXBE_SIZE; - bp_p += MPSC_TXBE_SIZE; - } - txre->link = cpu_to_be32(pi->txr_p); /* Wrap last back to first */ - - dma_cache_sync(pi->port.dev, (void *)pi->dma_region, - MPSC_DMA_ALLOC_SIZE, DMA_BIDIRECTIONAL); -#if defined(CONFIG_PPC32) && !defined(CONFIG_NOT_COHERENT_CACHE) - if (pi->cache_mgmt) /* GT642[46]0 Res 
#COMM-2 */ - flush_dcache_range((ulong)pi->dma_region, - (ulong)pi->dma_region - + MPSC_DMA_ALLOC_SIZE); -#endif - - return; -} - -static void mpsc_uninit_rings(struct mpsc_port_info *pi) -{ - pr_debug("mpsc_uninit_rings[%d]: Uninitializing rings\n",pi->port.line); - - BUG_ON(pi->dma_region == NULL); - - pi->rxr = 0; - pi->rxr_p = 0; - pi->rxb = NULL; - pi->rxb_p = NULL; - pi->rxr_posn = 0; - - pi->txr = 0; - pi->txr_p = 0; - pi->txb = NULL; - pi->txb_p = NULL; - pi->txr_head = 0; - pi->txr_tail = 0; -} - -static int mpsc_make_ready(struct mpsc_port_info *pi) -{ - int rc; - - pr_debug("mpsc_make_ready[%d]: Making cltr ready\n", pi->port.line); - - if (!pi->ready) { - mpsc_init_hw(pi); - rc = mpsc_alloc_ring_mem(pi); - if (rc) - return rc; - mpsc_init_rings(pi); - pi->ready = 1; - } - - return 0; -} - -#ifdef CONFIG_CONSOLE_POLL -static int serial_polled; -#endif - -/* - ****************************************************************************** - * - * Interrupt Handling Routines - * - ****************************************************************************** - */ - -static int mpsc_rx_intr(struct mpsc_port_info *pi, unsigned long *flags) -{ - struct mpsc_rx_desc *rxre; - struct tty_port *port = &pi->port.state->port; - u32 cmdstat, bytes_in, i; - int rc = 0; - u8 *bp; - char flag = TTY_NORMAL; - - pr_debug("mpsc_rx_intr[%d]: Handling Rx intr\n", pi->port.line); - - rxre = (struct mpsc_rx_desc *)(pi->rxr + (pi->rxr_posn*MPSC_RXRE_SIZE)); - - dma_cache_sync(pi->port.dev, (void *)rxre, MPSC_RXRE_SIZE, - DMA_FROM_DEVICE); -#if defined(CONFIG_PPC32) && !defined(CONFIG_NOT_COHERENT_CACHE) - if (pi->cache_mgmt) /* GT642[46]0 Res #COMM-2 */ - invalidate_dcache_range((ulong)rxre, - (ulong)rxre + MPSC_RXRE_SIZE); -#endif - - /* - * Loop through Rx descriptors handling ones that have been completed. 
- */ - while (!((cmdstat = be32_to_cpu(rxre->cmdstat)) - & SDMA_DESC_CMDSTAT_O)) { - bytes_in = be16_to_cpu(rxre->bytecnt); -#ifdef CONFIG_CONSOLE_POLL - if (unlikely(serial_polled)) { - serial_polled = 0; - return 0; - } -#endif - /* Following use of tty struct directly is deprecated */ - if (tty_buffer_request_room(port, bytes_in) < bytes_in) { - if (port->low_latency) { - spin_unlock_irqrestore(&pi->port.lock, *flags); - tty_flip_buffer_push(port); - spin_lock_irqsave(&pi->port.lock, *flags); - } - /* - * If this failed then we will throw away the bytes - * but must do so to clear interrupts. - */ - } - - bp = pi->rxb + (pi->rxr_posn * MPSC_RXBE_SIZE); - dma_cache_sync(pi->port.dev, (void *)bp, MPSC_RXBE_SIZE, - DMA_FROM_DEVICE); -#if defined(CONFIG_PPC32) && !defined(CONFIG_NOT_COHERENT_CACHE) - if (pi->cache_mgmt) /* GT642[46]0 Res #COMM-2 */ - invalidate_dcache_range((ulong)bp, - (ulong)bp + MPSC_RXBE_SIZE); -#endif - - /* - * Other than for parity error, the manual provides little - * info on what data will be in a frame flagged by any of - * these errors. For parity error, it is the last byte in - * the buffer that had the error. As for the rest, I guess - * we'll assume there is no data in the buffer. - * If there is...it gets lost. 
- */ - if (unlikely(cmdstat & (SDMA_DESC_CMDSTAT_BR - | SDMA_DESC_CMDSTAT_FR - | SDMA_DESC_CMDSTAT_OR))) { - - pi->port.icount.rx++; - - if (cmdstat & SDMA_DESC_CMDSTAT_BR) { /* Break */ - pi->port.icount.brk++; - - if (uart_handle_break(&pi->port)) - goto next_frame; - } else if (cmdstat & SDMA_DESC_CMDSTAT_FR) { - pi->port.icount.frame++; - } else if (cmdstat & SDMA_DESC_CMDSTAT_OR) { - pi->port.icount.overrun++; - } - - cmdstat &= pi->port.read_status_mask; - - if (cmdstat & SDMA_DESC_CMDSTAT_BR) - flag = TTY_BREAK; - else if (cmdstat & SDMA_DESC_CMDSTAT_FR) - flag = TTY_FRAME; - else if (cmdstat & SDMA_DESC_CMDSTAT_OR) - flag = TTY_OVERRUN; - else if (cmdstat & SDMA_DESC_CMDSTAT_PE) - flag = TTY_PARITY; - } - - if (uart_handle_sysrq_char(&pi->port, *bp)) { - bp++; - bytes_in--; -#ifdef CONFIG_CONSOLE_POLL - if (unlikely(serial_polled)) { - serial_polled = 0; - return 0; - } -#endif - goto next_frame; - } - - if ((unlikely(cmdstat & (SDMA_DESC_CMDSTAT_BR - | SDMA_DESC_CMDSTAT_FR - | SDMA_DESC_CMDSTAT_OR))) - && !(cmdstat & pi->port.ignore_status_mask)) { - tty_insert_flip_char(port, *bp, flag); - } else { - for (i=0; i<bytes_in; i++) - tty_insert_flip_char(port, *bp++, TTY_NORMAL); - - pi->port.icount.rx += bytes_in; - } - -next_frame: - rxre->bytecnt = cpu_to_be16(0); - wmb(); - rxre->cmdstat = cpu_to_be32(SDMA_DESC_CMDSTAT_O - | SDMA_DESC_CMDSTAT_EI | SDMA_DESC_CMDSTAT_F - | SDMA_DESC_CMDSTAT_L); - wmb(); - dma_cache_sync(pi->port.dev, (void *)rxre, MPSC_RXRE_SIZE, - DMA_BIDIRECTIONAL); -#if defined(CONFIG_PPC32) && !defined(CONFIG_NOT_COHERENT_CACHE) - if (pi->cache_mgmt) /* GT642[46]0 Res #COMM-2 */ - flush_dcache_range((ulong)rxre, - (ulong)rxre + MPSC_RXRE_SIZE); -#endif - - /* Advance to next descriptor */ - pi->rxr_posn = (pi->rxr_posn + 1) & (MPSC_RXR_ENTRIES - 1); - rxre = (struct mpsc_rx_desc *) - (pi->rxr + (pi->rxr_posn * MPSC_RXRE_SIZE)); - dma_cache_sync(pi->port.dev, (void *)rxre, MPSC_RXRE_SIZE, - DMA_FROM_DEVICE); -#if defined(CONFIG_PPC32) && !defined(CONFIG_NOT_COHERENT_CACHE) - if (pi->cache_mgmt) /* GT642[46]0 
Res #COMM-2 */ - invalidate_dcache_range((ulong)rxre, - (ulong)rxre + MPSC_RXRE_SIZE); -#endif - rc = 1; - } - - /* Restart rx engine, if its stopped */ - if ((readl(pi->sdma_base + SDMA_SDCM) & SDMA_SDCM_ERD) == 0) - mpsc_start_rx(pi); - - spin_unlock_irqrestore(&pi->port.lock, *flags); - tty_flip_buffer_push(port); - spin_lock_irqsave(&pi->port.lock, *flags); - return rc; -} - -static void mpsc_setup_tx_desc(struct mpsc_port_info *pi, u32 count, u32 intr) -{ - struct mpsc_tx_desc *txre; - - txre = (struct mpsc_tx_desc *)(pi->txr - + (pi->txr_head * MPSC_TXRE_SIZE)); - - txre->bytecnt = cpu_to_be16(count); - txre->shadow = txre->bytecnt; - wmb(); /* ensure cmdstat is last field updated */ - txre->cmdstat = cpu_to_be32(SDMA_DESC_CMDSTAT_O | SDMA_DESC_CMDSTAT_F - | SDMA_DESC_CMDSTAT_L - | ((intr) ? SDMA_DESC_CMDSTAT_EI : 0)); - wmb(); - dma_cache_sync(pi->port.dev, (void *)txre, MPSC_TXRE_SIZE, - DMA_BIDIRECTIONAL); -#if defined(CONFIG_PPC32) && !defined(CONFIG_NOT_COHERENT_CACHE) - if (pi->cache_mgmt) /* GT642[46]0 Res #COMM-2 */ - flush_dcache_range((ulong)txre, - (ulong)txre + MPSC_TXRE_SIZE); -#endif -} - -static void mpsc_copy_tx_data(struct mpsc_port_info *pi) -{ - struct circ_buf *xmit = &pi->port.state->xmit; - u8 *bp; - u32 i; - - /* Make sure the desc ring isn't full */ - while (CIRC_CNT(pi->txr_head, pi->txr_tail, MPSC_TXR_ENTRIES) - < (MPSC_TXR_ENTRIES - 1)) { - if (pi->port.x_char) { - /* - * Ideally, we should use the TCS field in - * CHR_1 to put the x_char out immediately but - * errata prevents us from being able to read - * CHR_2 to know that its safe to write to - * CHR_1. Instead, just put it in-band with - * all the other Tx data. 
- */ - bp = pi->txb + (pi->txr_head * MPSC_TXBE_SIZE); - *bp = pi->port.x_char; - pi->port.x_char = 0; - i = 1; - } else if (!uart_circ_empty(xmit) - && !uart_tx_stopped(&pi->port)) { - i = min((u32)MPSC_TXBE_SIZE, - (u32)uart_circ_chars_pending(xmit)); - i = min(i, (u32)CIRC_CNT_TO_END(xmit->head, xmit->tail, - UART_XMIT_SIZE)); - bp = pi->txb + (pi->txr_head * MPSC_TXBE_SIZE); - memcpy(bp, &xmit->buf[xmit->tail], i); - xmit->tail = (xmit->tail + i) & (UART_XMIT_SIZE - 1); - - if (uart_circ_chars_pending(xmit) < WAKEUP_CHARS) - uart_write_wakeup(&pi->port); - } else { /* All tx data copied into ring bufs */ - return; - } - - dma_cache_sync(pi->port.dev, (void *)bp, MPSC_TXBE_SIZE, - DMA_BIDIRECTIONAL); -#if defined(CONFIG_PPC32) && !defined(CONFIG_NOT_COHERENT_CACHE) - if (pi->cache_mgmt) /* GT642[46]0 Res #COMM-2 */ - flush_dcache_range((ulong)bp, - (ulong)bp + MPSC_TXBE_SIZE); -#endif - mpsc_setup_tx_desc(pi, i, 1); - - /* Advance to next descriptor */ - pi->txr_head = (pi->txr_head + 1) & (MPSC_TXR_ENTRIES - 1); - } -} - -static int mpsc_tx_intr(struct mpsc_port_info *pi) -{ - struct mpsc_tx_desc *txre; - int rc = 0; - unsigned long iflags; - - spin_lock_irqsave(&pi->tx_lock, iflags); - - if (!mpsc_sdma_tx_active(pi)) { - txre = (struct mpsc_tx_desc *)(pi->txr - + (pi->txr_tail * MPSC_TXRE_SIZE)); - - dma_cache_sync(pi->port.dev, (void *)txre, MPSC_TXRE_SIZE, - DMA_FROM_DEVICE); -#if defined(CONFIG_PPC32) && !defined(CONFIG_NOT_COHERENT_CACHE) - if (pi->cache_mgmt) /* GT642[46]0 Res #COMM-2 */ - invalidate_dcache_range((ulong)txre, - (ulong)txre + MPSC_TXRE_SIZE); -#endif - - while (!(be32_to_cpu(txre->cmdstat) & SDMA_DESC_CMDSTAT_O)) { - rc = 1; - pi->port.icount.tx += be16_to_cpu(txre->bytecnt); - pi->txr_tail = (pi->txr_tail+1) & (MPSC_TXR_ENTRIES-1); - - /* If no more data to tx, fall out of loop */ - if (pi->txr_head == pi->txr_tail) - break; - - txre = (struct mpsc_tx_desc *)(pi->txr - + (pi->txr_tail * MPSC_TXRE_SIZE)); - dma_cache_sync(pi->port.dev, 
(void *)txre, - MPSC_TXRE_SIZE, DMA_FROM_DEVICE); -#if defined(CONFIG_PPC32) && !defined(CONFIG_NOT_COHERENT_CACHE) - if (pi->cache_mgmt) /* GT642[46]0 Res #COMM-2 */ - invalidate_dcache_range((ulong)txre, - (ulong)txre + MPSC_TXRE_SIZE); -#endif - } - - mpsc_copy_tx_data(pi); - mpsc_sdma_start_tx(pi); /* start next desc if ready */ - } - - spin_unlock_irqrestore(&pi->tx_lock, iflags); - return rc; -} - -/* - * This is the driver's interrupt handler. To avoid a race, we first clear - * the interrupt, then handle any completed Rx/Tx descriptors. When done - * handling those descriptors, we restart the Rx/Tx engines if they're stopped. - */ -static irqreturn_t mpsc_sdma_intr(int irq, void *dev_id) -{ - struct mpsc_port_info *pi = dev_id; - ulong iflags; - int rc = IRQ_NONE; - - pr_debug("mpsc_sdma_intr[%d]: SDMA Interrupt Received\n",pi->port.line); - - spin_lock_irqsave(&pi->port.lock, iflags); - mpsc_sdma_intr_ack(pi); - if (mpsc_rx_intr(pi, &iflags)) - rc = IRQ_HANDLED; - if (mpsc_tx_intr(pi)) - rc = IRQ_HANDLED; - spin_unlock_irqrestore(&pi->port.lock, iflags); - - pr_debug("mpsc_sdma_intr[%d]: SDMA Interrupt Handled\n", pi->port.line); - return rc; -} - -/* - ****************************************************************************** - * - * serial_core.c Interface routines - * - ****************************************************************************** - */ -static uint mpsc_tx_empty(struct uart_port *port) -{ - struct mpsc_port_info *pi = - container_of(port, struct mpsc_port_info, port); - ulong iflags; - uint rc; - - spin_lock_irqsave(&pi->port.lock, iflags); - rc = mpsc_sdma_tx_active(pi) ? 
0 : TIOCSER_TEMT; - spin_unlock_irqrestore(&pi->port.lock, iflags); - - return rc; -} - -static void mpsc_set_mctrl(struct uart_port *port, uint mctrl) -{ - /* Have no way to set modem control lines AFAICT */ -} - -static uint mpsc_get_mctrl(struct uart_port *port) -{ - struct mpsc_port_info *pi = - container_of(port, struct mpsc_port_info, port); - u32 mflags, status; - - status = (pi->mirror_regs) ? pi->MPSC_CHR_10_m - : readl(pi->mpsc_base + MPSC_CHR_10); - - mflags = 0; - if (status & 0x1) - mflags |= TIOCM_CTS; - if (status & 0x2) - mflags |= TIOCM_CAR; - - return mflags | TIOCM_DSR; /* No way to tell if DSR asserted */ -} - -static void mpsc_stop_tx(struct uart_port *port) -{ - struct mpsc_port_info *pi = - container_of(port, struct mpsc_port_info, port); - - pr_debug("mpsc_stop_tx[%d]\n", port->line); - - mpsc_freeze(pi); -} - -static void mpsc_start_tx(struct uart_port *port) -{ - struct mpsc_port_info *pi = - container_of(port, struct mpsc_port_info, port); - unsigned long iflags; - - spin_lock_irqsave(&pi->tx_lock, iflags); - - mpsc_unfreeze(pi); - mpsc_copy_tx_data(pi); - mpsc_sdma_start_tx(pi); - - spin_unlock_irqrestore(&pi->tx_lock, iflags); - - pr_debug("mpsc_start_tx[%d]\n", port->line); -} - -static void mpsc_start_rx(struct mpsc_port_info *pi) -{ - pr_debug("mpsc_start_rx[%d]: Starting...\n", pi->port.line); - - if (pi->rcv_data) { - mpsc_enter_hunt(pi); - mpsc_sdma_cmd(pi, SDMA_SDCM_ERD); - } -} - -static void mpsc_stop_rx(struct uart_port *port) -{ - struct mpsc_port_info *pi = - container_of(port, struct mpsc_port_info, port); - - pr_debug("mpsc_stop_rx[%d]: Stopping...\n", port->line); - - if (pi->mirror_regs) { - writel(pi->MPSC_CHR_2_m | MPSC_CHR_2_RA, - pi->mpsc_base + MPSC_CHR_2); - /* Erratum prevents reading CHR_2 so just delay for a while */ - udelay(100); - } else { - writel(readl(pi->mpsc_base + MPSC_CHR_2) | MPSC_CHR_2_RA, - pi->mpsc_base + MPSC_CHR_2); - - while (readl(pi->mpsc_base + MPSC_CHR_2) & MPSC_CHR_2_RA) - udelay(10); - } - 
- mpsc_sdma_cmd(pi, SDMA_SDCM_AR); -} - -static void mpsc_break_ctl(struct uart_port *port, int ctl) -{ - struct mpsc_port_info *pi = - container_of(port, struct mpsc_port_info, port); - ulong flags; - u32 v; - - v = ctl ? 0x00ff0000 : 0; - - spin_lock_irqsave(&pi->port.lock, flags); - if (pi->mirror_regs) - pi->MPSC_CHR_1_m = v; - writel(v, pi->mpsc_base + MPSC_CHR_1); - spin_unlock_irqrestore(&pi->port.lock, flags); -} - -static int mpsc_startup(struct uart_port *port) -{ - struct mpsc_port_info *pi = - container_of(port, struct mpsc_port_info, port); - u32 flag = 0; - int rc; - - pr_debug("mpsc_startup[%d]: Starting up MPSC, irq: %d\n", - port->line, pi->port.irq); - - if ((rc = mpsc_make_ready(pi)) == 0) { - /* Setup IRQ handler */ - mpsc_sdma_intr_ack(pi); - - /* If irq's are shared, need to set flag */ - if (mpsc_ports[0].port.irq == mpsc_ports[1].port.irq) - flag = IRQF_SHARED; - - if (request_irq(pi->port.irq, mpsc_sdma_intr, flag, - "mpsc-sdma", pi)) - printk(KERN_ERR "MPSC: Can't get SDMA IRQ %d\n", - pi->port.irq); - - mpsc_sdma_intr_unmask(pi, 0xf); - mpsc_sdma_set_rx_ring(pi, (struct mpsc_rx_desc *)(pi->rxr_p - + (pi->rxr_posn * MPSC_RXRE_SIZE))); - } - - return rc; -} - -static void mpsc_shutdown(struct uart_port *port) -{ - struct mpsc_port_info *pi = - container_of(port, struct mpsc_port_info, port); - - pr_debug("mpsc_shutdown[%d]: Shutting down MPSC\n", port->line); - - mpsc_sdma_stop(pi); - free_irq(pi->port.irq, pi); -} - -static void mpsc_set_termios(struct uart_port *port, struct ktermios *termios, - struct ktermios *old) -{ - struct mpsc_port_info *pi = - container_of(port, struct mpsc_port_info, port); - u32 baud; - ulong flags; - u32 chr_bits, stop_bits, par; - - switch (termios->c_cflag & CSIZE) { - case CS5: - chr_bits = MPSC_MPCR_CL_5; - break; - case CS6: - chr_bits = MPSC_MPCR_CL_6; - break; - case CS7: - chr_bits = MPSC_MPCR_CL_7; - break; - case CS8: - default: - chr_bits = MPSC_MPCR_CL_8; - break; - } - - if (termios->c_cflag & 
CSTOPB) - stop_bits = MPSC_MPCR_SBL_2; - else - stop_bits = MPSC_MPCR_SBL_1; - - par = MPSC_CHR_2_PAR_EVEN; - if (termios->c_cflag & PARENB) - if (termios->c_cflag & PARODD) - par = MPSC_CHR_2_PAR_ODD; -#ifdef CMSPAR - if (termios->c_cflag & CMSPAR) { - if (termios->c_cflag & PARODD) - par = MPSC_CHR_2_PAR_MARK; - else - par = MPSC_CHR_2_PAR_SPACE; - } -#endif - - baud = uart_get_baud_rate(port, termios, old, 0, port->uartclk); - - spin_lock_irqsave(&pi->port.lock, flags); - - uart_update_timeout(port, termios->c_cflag, baud); - - mpsc_set_char_length(pi, chr_bits); - mpsc_set_stop_bit_length(pi, stop_bits); - mpsc_set_parity(pi, par); - mpsc_set_baudrate(pi, baud); - - /* Characters/events to read */ - pi->port.read_status_mask = SDMA_DESC_CMDSTAT_OR; - - if (termios->c_iflag & INPCK) - pi->port.read_status_mask |= SDMA_DESC_CMDSTAT_PE - | SDMA_DESC_CMDSTAT_FR; - - if (termios->c_iflag & (IGNBRK | BRKINT | PARMRK)) - pi->port.read_status_mask |= SDMA_DESC_CMDSTAT_BR; - - /* Characters/events to ignore */ - pi->port.ignore_status_mask = 0; - - if (termios->c_iflag & IGNPAR) - pi->port.ignore_status_mask |= SDMA_DESC_CMDSTAT_PE - | SDMA_DESC_CMDSTAT_FR; - - if (termios->c_iflag & IGNBRK) { - pi->port.ignore_status_mask |= SDMA_DESC_CMDSTAT_BR; - - if (termios->c_iflag & IGNPAR) - pi->port.ignore_status_mask |= SDMA_DESC_CMDSTAT_OR; - } - - if ((termios->c_cflag & CREAD)) { - if (!pi->rcv_data) { - pi->rcv_data = 1; - mpsc_start_rx(pi); - } - } else if (pi->rcv_data) { - mpsc_stop_rx(port); - pi->rcv_data = 0; - } - - spin_unlock_irqrestore(&pi->port.lock, flags); -} - -static const char *mpsc_type(struct uart_port *port) -{ - pr_debug("mpsc_type[%d]: port type: %s\n", port->line,MPSC_DRIVER_NAME); - return MPSC_DRIVER_NAME; -} - -static int mpsc_request_port(struct uart_port *port) -{ - /* Should make chip/platform specific call */ - return 0; -} - -static void mpsc_release_port(struct uart_port *port) -{ - struct mpsc_port_info *pi = - container_of(port, struct 
mpsc_port_info, port); - - if (pi->ready) { - mpsc_uninit_rings(pi); - mpsc_free_ring_mem(pi); - pi->ready = 0; - } -} - -static void mpsc_config_port(struct uart_port *port, int flags) -{ -} - -static int mpsc_verify_port(struct uart_port *port, struct serial_struct *ser) -{ - struct mpsc_port_info *pi = - container_of(port, struct mpsc_port_info, port); - int rc = 0; - - pr_debug("mpsc_verify_port[%d]: Verifying port data\n", pi->port.line); - - if (ser->type != PORT_UNKNOWN && ser->type != PORT_MPSC) - rc = -EINVAL; - else if (pi->port.irq != ser->irq) - rc = -EINVAL; - else if (ser->io_type != SERIAL_IO_MEM) - rc = -EINVAL; - else if (pi->port.uartclk / 16 != ser->baud_base) /* Not sure */ - rc = -EINVAL; - else if ((void *)pi->port.mapbase != ser->iomem_base) - rc = -EINVAL; - else if (pi->port.iobase != ser->port) - rc = -EINVAL; - else if (ser->hub6 != 0) - rc = -EINVAL; - - return rc; -} -#ifdef CONFIG_CONSOLE_POLL -/* Serial polling routines for writing and reading from the uart while - * in an interrupt or debug context. 
- */ - -static char poll_buf[2048]; -static int poll_ptr; -static int poll_cnt; -static void mpsc_put_poll_char(struct uart_port *port, - unsigned char c); - -static int mpsc_get_poll_char(struct uart_port *port) -{ - struct mpsc_port_info *pi = - container_of(port, struct mpsc_port_info, port); - struct mpsc_rx_desc *rxre; - u32 cmdstat, bytes_in, i; - u8 *bp; - - if (!serial_polled) - serial_polled = 1; - - pr_debug("mpsc_rx_intr[%d]: Handling Rx intr\n", pi->port.line); - - if (poll_cnt) { - poll_cnt--; - return poll_buf[poll_ptr++]; - } - poll_ptr = 0; - poll_cnt = 0; - - while (poll_cnt == 0) { - rxre = (struct mpsc_rx_desc *)(pi->rxr + - (pi->rxr_posn*MPSC_RXRE_SIZE)); - dma_cache_sync(pi->port.dev, (void *)rxre, - MPSC_RXRE_SIZE, DMA_FROM_DEVICE); -#if defined(CONFIG_PPC32) && !defined(CONFIG_NOT_COHERENT_CACHE) - if (pi->cache_mgmt) /* GT642[46]0 Res #COMM-2 */ - invalidate_dcache_range((ulong)rxre, - (ulong)rxre + MPSC_RXRE_SIZE); -#endif - /* - * Loop through Rx descriptors handling ones that have - * been completed. 
- */ - while (poll_cnt == 0 && - !((cmdstat = be32_to_cpu(rxre->cmdstat)) & - SDMA_DESC_CMDSTAT_O)){ - bytes_in = be16_to_cpu(rxre->bytecnt); - bp = pi->rxb + (pi->rxr_posn * MPSC_RXBE_SIZE); - dma_cache_sync(pi->port.dev, (void *) bp, - MPSC_RXBE_SIZE, DMA_FROM_DEVICE); -#if defined(CONFIG_PPC32) && !defined(CONFIG_NOT_COHERENT_CACHE) - if (pi->cache_mgmt) /* GT642[46]0 Res #COMM-2 */ - invalidate_dcache_range((ulong)bp, - (ulong)bp + MPSC_RXBE_SIZE); -#endif - if ((unlikely(cmdstat & (SDMA_DESC_CMDSTAT_BR | - SDMA_DESC_CMDSTAT_FR | SDMA_DESC_CMDSTAT_OR))) && - !(cmdstat & pi->port.ignore_status_mask)) { - poll_buf[poll_cnt] = *bp; - poll_cnt++; - } else { - for (i = 0; i < bytes_in; i++) { - poll_buf[poll_cnt] = *bp++; - poll_cnt++; - } - pi->port.icount.rx += bytes_in; - } - rxre->bytecnt = cpu_to_be16(0); - wmb(); - rxre->cmdstat = cpu_to_be32(SDMA_DESC_CMDSTAT_O | - SDMA_DESC_CMDSTAT_EI | - SDMA_DESC_CMDSTAT_F | - SDMA_DESC_CMDSTAT_L); - wmb(); - dma_cache_sync(pi->port.dev, (void *)rxre, - MPSC_RXRE_SIZE, DMA_BIDIRECTIONAL); -#if defined(CONFIG_PPC32) && !defined(CONFIG_NOT_COHERENT_CACHE) - if (pi->cache_mgmt) /* GT642[46]0 Res #COMM-2 */ - flush_dcache_range((ulong)rxre, - (ulong)rxre + MPSC_RXRE_SIZE); -#endif - - /* Advance to next descriptor */ - pi->rxr_posn = (pi->rxr_posn + 1) & - (MPSC_RXR_ENTRIES - 1); - rxre = (struct mpsc_rx_desc *)(pi->rxr + - (pi->rxr_posn * MPSC_RXRE_SIZE)); - dma_cache_sync(pi->port.dev, (void *)rxre, - MPSC_RXRE_SIZE, DMA_FROM_DEVICE); -#if defined(CONFIG_PPC32) && !defined(CONFIG_NOT_COHERENT_CACHE) - if (pi->cache_mgmt) /* GT642[46]0 Res #COMM-2 */ - invalidate_dcache_range((ulong)rxre, - (ulong)rxre + MPSC_RXRE_SIZE); -#endif - } - - /* Restart rx engine, if its stopped */ - if ((readl(pi->sdma_base + SDMA_SDCM) & SDMA_SDCM_ERD) == 0) - mpsc_start_rx(pi); - } - if (poll_cnt) { - poll_cnt--; - return poll_buf[poll_ptr++]; - } - - return 0; -} - - -static void mpsc_put_poll_char(struct uart_port *port, - unsigned char c) -{ 
- struct mpsc_port_info *pi = - container_of(port, struct mpsc_port_info, port); - u32 data; - - data = readl(pi->mpsc_base + MPSC_MPCR); - writeb(c, pi->mpsc_base + MPSC_CHR_1); - mb(); - data = readl(pi->mpsc_base + MPSC_CHR_2); - data |= MPSC_CHR_2_TTCS; - writel(data, pi->mpsc_base + MPSC_CHR_2); - mb(); - - while (readl(pi->mpsc_base + MPSC_CHR_2) & MPSC_CHR_2_TTCS); -} -#endif - -static const struct uart_ops mpsc_pops = { - .tx_empty = mpsc_tx_empty, - .set_mctrl = mpsc_set_mctrl, - .get_mctrl = mpsc_get_mctrl, - .stop_tx = mpsc_stop_tx, - .start_tx = mpsc_start_tx, - .stop_rx = mpsc_stop_rx, - .break_ctl = mpsc_break_ctl, - .startup = mpsc_startup, - .shutdown = mpsc_shutdown, - .set_termios = mpsc_set_termios, - .type = mpsc_type, - .release_port = mpsc_release_port, - .request_port = mpsc_request_port, - .config_port = mpsc_config_port, - .verify_port = mpsc_verify_port, -#ifdef CONFIG_CONSOLE_POLL - .poll_get_char = mpsc_get_poll_char, - .poll_put_char = mpsc_put_poll_char, -#endif -}; - -/* - ****************************************************************************** - * - * Console Interface Routines - * - ****************************************************************************** - */ - -#ifdef CONFIG_SERIAL_MPSC_CONSOLE -static void mpsc_console_write(struct console *co, const char *s, uint count) -{ - struct mpsc_port_info *pi = &mpsc_ports[co->index]; - u8 *bp, *dp, add_cr = 0; - int i; - unsigned long iflags; - - spin_lock_irqsave(&pi->tx_lock, iflags); - - while (pi->txr_head != pi->txr_tail) { - while (mpsc_sdma_tx_active(pi)) - udelay(100); - mpsc_sdma_intr_ack(pi); - mpsc_tx_intr(pi); - } - - while (mpsc_sdma_tx_active(pi)) - udelay(100); - - while (count > 0) { - bp = dp = pi->txb + (pi->txr_head * MPSC_TXBE_SIZE); - - for (i = 0; i < MPSC_TXBE_SIZE; i++) { - if (count == 0) - break; - - if (add_cr) { - *(dp++) = '\r'; - add_cr = 0; - } else { - *(dp++) = *s; - - if (*(s++) == '\n') { /* add '\r' after '\n' */ - add_cr = 1; - count++; - 
} - } - - count--; - } - - dma_cache_sync(pi->port.dev, (void *)bp, MPSC_TXBE_SIZE, - DMA_BIDIRECTIONAL); -#if defined(CONFIG_PPC32) && !defined(CONFIG_NOT_COHERENT_CACHE) - if (pi->cache_mgmt) /* GT642[46]0 Res #COMM-2 */ - flush_dcache_range((ulong)bp, - (ulong)bp + MPSC_TXBE_SIZE); -#endif - mpsc_setup_tx_desc(pi, i, 0); - pi->txr_head = (pi->txr_head + 1) & (MPSC_TXR_ENTRIES - 1); - mpsc_sdma_start_tx(pi); - - while (mpsc_sdma_tx_active(pi)) - udelay(100); - - pi->txr_tail = (pi->txr_tail + 1) & (MPSC_TXR_ENTRIES - 1); - } - - spin_unlock_irqrestore(&pi->tx_lock, iflags); -} - -static int __init mpsc_console_setup(struct console *co, char *options) -{ - struct mpsc_port_info *pi; - int baud, bits, parity, flow; - - pr_debug("mpsc_console_setup[%d]: options: %s\n", co->index, options); - - if (co->index >= MPSC_NUM_CTLRS) - co->index = 0; - - pi = &mpsc_ports[co->index]; - - baud = pi->default_baud; - bits = pi->default_bits; - parity = pi->default_parity; - flow = pi->default_flow; - - if (!pi->port.ops) - return -ENODEV; - - spin_lock_init(&pi->port.lock); /* Temporary fix--copied from 8250.c */ - - if (options) - uart_parse_options(options, &baud, &parity, &bits, &flow); - - return uart_set_options(&pi->port, co, baud, parity, bits, flow); -} - -static struct console mpsc_console = { - .name = MPSC_DEV_NAME, - .write = mpsc_console_write, - .device = uart_console_device, - .setup = mpsc_console_setup, - .flags = CON_PRINTBUFFER, - .index = -1, - .data = &mpsc_reg, -}; - -static int __init mpsc_late_console_init(void) -{ - pr_debug("mpsc_late_console_init: Enter\n"); - - if (!(mpsc_console.flags & CON_ENABLED)) - register_console(&mpsc_console); - return 0; -} - -late_initcall(mpsc_late_console_init); - -#define MPSC_CONSOLE &mpsc_console -#else -#define MPSC_CONSOLE NULL -#endif -/* - ****************************************************************************** - * - * Dummy Platform Driver to extract & map shared register regions - * - 
****************************************************************************** - */ -static void mpsc_resource_err(char *s) -{ - printk(KERN_WARNING "MPSC: Platform device resource error in %s\n", s); -} - -static int mpsc_shared_map_regs(struct platform_device *pd) -{ - struct resource *r; - - if ((r = platform_get_resource(pd, IORESOURCE_MEM, - MPSC_ROUTING_BASE_ORDER)) - && request_mem_region(r->start, - MPSC_ROUTING_REG_BLOCK_SIZE, - "mpsc_routing_regs")) { - mpsc_shared_regs.mpsc_routing_base = ioremap(r->start, - MPSC_ROUTING_REG_BLOCK_SIZE); - mpsc_shared_regs.mpsc_routing_base_p = r->start; - } else { - mpsc_resource_err("MPSC routing base"); - return -ENOMEM; - } - - if ((r = platform_get_resource(pd, IORESOURCE_MEM, - MPSC_SDMA_INTR_BASE_ORDER)) - && request_mem_region(r->start, - MPSC_SDMA_INTR_REG_BLOCK_SIZE, - "sdma_intr_regs")) { - mpsc_shared_regs.sdma_intr_base = ioremap(r->start, - MPSC_SDMA_INTR_REG_BLOCK_SIZE); - mpsc_shared_regs.sdma_intr_base_p = r->start; - } else { - iounmap(mpsc_shared_regs.mpsc_routing_base); - release_mem_region(mpsc_shared_regs.mpsc_routing_base_p, - MPSC_ROUTING_REG_BLOCK_SIZE); - mpsc_resource_err("SDMA intr base"); - return -ENOMEM; - } - - return 0; -} - -static void mpsc_shared_unmap_regs(void) -{ - if (mpsc_shared_regs.mpsc_routing_base) { - iounmap(mpsc_shared_regs.mpsc_routing_base); - release_mem_region(mpsc_shared_regs.mpsc_routing_base_p, - MPSC_ROUTING_REG_BLOCK_SIZE); - } - if (mpsc_shared_regs.sdma_intr_base) { - iounmap(mpsc_shared_regs.sdma_intr_base); - release_mem_region(mpsc_shared_regs.sdma_intr_base_p, - MPSC_SDMA_INTR_REG_BLOCK_SIZE); - } - - mpsc_shared_regs.mpsc_routing_base = NULL; - mpsc_shared_regs.sdma_intr_base = NULL; - - mpsc_shared_regs.mpsc_routing_base_p = 0; - mpsc_shared_regs.sdma_intr_base_p = 0; -} - -static int mpsc_shared_drv_probe(struct platform_device *dev) -{ - struct mpsc_shared_pdata *pdata; - int rc; - - if (dev->id != 0) - return -ENODEV; - - rc = mpsc_shared_map_regs(dev); 
- if (rc) - return rc; - - pdata = dev_get_platdata(&dev->dev); - - mpsc_shared_regs.MPSC_MRR_m = pdata->mrr_val; - mpsc_shared_regs.MPSC_RCRR_m= pdata->rcrr_val; - mpsc_shared_regs.MPSC_TCRR_m= pdata->tcrr_val; - mpsc_shared_regs.SDMA_INTR_CAUSE_m = pdata->intr_cause_val; - mpsc_shared_regs.SDMA_INTR_MASK_m = pdata->intr_mask_val; - - return 0; -} - -static int mpsc_shared_drv_remove(struct platform_device *dev) -{ - if (dev->id != 0) - return -ENODEV; - - mpsc_shared_unmap_regs(); - mpsc_shared_regs.MPSC_MRR_m = 0; - mpsc_shared_regs.MPSC_RCRR_m = 0; - mpsc_shared_regs.MPSC_TCRR_m = 0; - mpsc_shared_regs.SDMA_INTR_CAUSE_m = 0; - mpsc_shared_regs.SDMA_INTR_MASK_m = 0; - - return 0; -} - -static struct platform_driver mpsc_shared_driver = { - .probe = mpsc_shared_drv_probe, - .remove = mpsc_shared_drv_remove, - .driver = { - .name = MPSC_SHARED_NAME, - }, -}; - -/* - ****************************************************************************** - * - * Driver Interface Routines - * - ****************************************************************************** - */ -static struct uart_driver mpsc_reg = { - .owner = THIS_MODULE, - .driver_name = MPSC_DRIVER_NAME, - .dev_name = MPSC_DEV_NAME, - .major = MPSC_MAJOR, - .minor = MPSC_MINOR_START, - .nr = MPSC_NUM_CTLRS, - .cons = MPSC_CONSOLE, -}; - -static int mpsc_drv_map_regs(struct mpsc_port_info *pi, - struct platform_device *pd) -{ - struct resource *r; - - if ((r = platform_get_resource(pd, IORESOURCE_MEM, MPSC_BASE_ORDER)) - && request_mem_region(r->start, MPSC_REG_BLOCK_SIZE, - "mpsc_regs")) { - pi->mpsc_base = ioremap(r->start, MPSC_REG_BLOCK_SIZE); - pi->mpsc_base_p = r->start; - } else { - mpsc_resource_err("MPSC base"); - goto err; - } - - if ((r = platform_get_resource(pd, IORESOURCE_MEM, - MPSC_SDMA_BASE_ORDER)) - && request_mem_region(r->start, - MPSC_SDMA_REG_BLOCK_SIZE, "sdma_regs")) { - pi->sdma_base = ioremap(r->start,MPSC_SDMA_REG_BLOCK_SIZE); - pi->sdma_base_p = r->start; - } else { - 
mpsc_resource_err("SDMA base"); - goto err; - } - - if ((r = platform_get_resource(pd,IORESOURCE_MEM,MPSC_BRG_BASE_ORDER)) - && request_mem_region(r->start, - MPSC_BRG_REG_BLOCK_SIZE, "brg_regs")) { - pi->brg_base = ioremap(r->start, MPSC_BRG_REG_BLOCK_SIZE); - pi->brg_base_p = r->start; - } else { - mpsc_resource_err("BRG base"); - goto err; - } - return 0; - -err: - if (pi->sdma_base) { - iounmap(pi->sdma_base); - pi->sdma_base = NULL; - } - if (pi->mpsc_base) { - iounmap(pi->mpsc_base); - pi->mpsc_base = NULL; - } - return -ENOMEM; -} - -static void mpsc_drv_unmap_regs(struct mpsc_port_info *pi) -{ - if (pi->mpsc_base) { - iounmap(pi->mpsc_base); - release_mem_region(pi->mpsc_base_p, MPSC_REG_BLOCK_SIZE); - } - if (pi->sdma_base) { - iounmap(pi->sdma_base); - release_mem_region(pi->sdma_base_p, MPSC_SDMA_REG_BLOCK_SIZE); - } - if (pi->brg_base) { - iounmap(pi->brg_base); - release_mem_region(pi->brg_base_p, MPSC_BRG_REG_BLOCK_SIZE); - } - - pi->mpsc_base = NULL; - pi->sdma_base = NULL; - pi->brg_base = NULL; - - pi->mpsc_base_p = 0; - pi->sdma_base_p = 0; - pi->brg_base_p = 0; -} - -static void mpsc_drv_get_platform_data(struct mpsc_port_info *pi, - struct platform_device *pd, int num) -{ - struct mpsc_pdata *pdata; - - pdata = dev_get_platdata(&pd->dev); - - pi->port.uartclk = pdata->brg_clk_freq; - pi->port.iotype = UPIO_MEM; - pi->port.line = num; - pi->port.type = PORT_MPSC; - pi->port.fifosize = MPSC_TXBE_SIZE; - pi->port.membase = pi->mpsc_base; - pi->port.mapbase = (ulong)pi->mpsc_base; - pi->port.ops = &mpsc_pops; - - pi->mirror_regs = pdata->mirror_regs; - pi->cache_mgmt = pdata->cache_mgmt; - pi->brg_can_tune = pdata->brg_can_tune; - pi->brg_clk_src = pdata->brg_clk_src; - pi->mpsc_max_idle = pdata->max_idle; - pi->default_baud = pdata->default_baud; - pi->default_bits = pdata->default_bits; - pi->default_parity = pdata->default_parity; - pi->default_flow = pdata->default_flow; - - /* Initial values of mirrored regs */ - pi->MPSC_CHR_1_m = 
pdata->chr_1_val; - pi->MPSC_CHR_2_m = pdata->chr_2_val; - pi->MPSC_CHR_10_m = pdata->chr_10_val; - pi->MPSC_MPCR_m = pdata->mpcr_val; - pi->BRG_BCR_m = pdata->bcr_val; - - pi->shared_regs = &mpsc_shared_regs; - - pi->port.irq = platform_get_irq(pd, 0); -} - -static int mpsc_drv_probe(struct platform_device *dev) -{ - struct mpsc_port_info *pi; - int rc; - - dev_dbg(&dev->dev, "mpsc_drv_probe: Adding MPSC %d\n", dev->id); - - if (dev->id >= MPSC_NUM_CTLRS) - return -ENODEV; - - pi = &mpsc_ports[dev->id]; - - rc = mpsc_drv_map_regs(pi, dev); - if (rc) - return rc; - - mpsc_drv_get_platform_data(pi, dev, dev->id); - pi->port.dev = &dev->dev; - - rc = mpsc_make_ready(pi); - if (rc) - goto err_unmap; - - spin_lock_init(&pi->tx_lock); - rc = uart_add_one_port(&mpsc_reg, &pi->port); - if (rc) - goto err_relport; - - return 0; -err_relport: - mpsc_release_port(&pi->port); -err_unmap: - mpsc_drv_unmap_regs(pi); - return rc; -} - -static struct platform_driver mpsc_driver = { - .probe = mpsc_drv_probe, - .driver = { - .name = MPSC_CTLR_NAME, - .suppress_bind_attrs = true, - }, -}; - -static int __init mpsc_drv_init(void) -{ - int rc; - - printk(KERN_INFO "Serial: MPSC driver\n"); - - memset(mpsc_ports, 0, sizeof(mpsc_ports)); - memset(&mpsc_shared_regs, 0, sizeof(mpsc_shared_regs)); - - rc = uart_register_driver(&mpsc_reg); - if (rc) - return rc; - - rc = platform_driver_register(&mpsc_shared_driver); - if (rc) - goto err_unreg_uart; - - rc = platform_driver_register(&mpsc_driver); - if (rc) - goto err_unreg_plat; - - return 0; -err_unreg_plat: - platform_driver_unregister(&mpsc_shared_driver); -err_unreg_uart: - uart_unregister_driver(&mpsc_reg); - return rc; -} -device_initcall(mpsc_drv_init); - -/* -MODULE_AUTHOR("Mark A. 
Greer "); -MODULE_DESCRIPTION("Generic Marvell MPSC serial/UART driver"); -MODULE_LICENSE("GPL"); -*/ diff --git a/include/linux/mv643xx.h b/include/linux/mv643xx.h index 4471cf96ef69..47e5679b48e1 100644 --- a/include/linux/mv643xx.h +++ b/include/linux/mv643xx.h @@ -918,52 +918,6 @@ extern void mv64340_irq_init(unsigned int base); -/* MPSC Platform Device, Driver Data (Shared register regions) */ -#define MPSC_SHARED_NAME "mpsc_shared" - -#define MPSC_ROUTING_BASE_ORDER 0 -#define MPSC_SDMA_INTR_BASE_ORDER 1 - -#define MPSC_ROUTING_REG_BLOCK_SIZE 0x000c -#define MPSC_SDMA_INTR_REG_BLOCK_SIZE 0x0084 - -struct mpsc_shared_pdata { - u32 mrr_val; - u32 rcrr_val; - u32 tcrr_val; - u32 intr_cause_val; - u32 intr_mask_val; -}; - -/* MPSC Platform Device, Driver Data */ -#define MPSC_CTLR_NAME "mpsc" - -#define MPSC_BASE_ORDER 0 -#define MPSC_SDMA_BASE_ORDER 1 -#define MPSC_BRG_BASE_ORDER 2 - -#define MPSC_REG_BLOCK_SIZE 0x0038 -#define MPSC_SDMA_REG_BLOCK_SIZE 0x0c18 -#define MPSC_BRG_REG_BLOCK_SIZE 0x0008 - -struct mpsc_pdata { - u8 mirror_regs; - u8 cache_mgmt; - u8 max_idle; - int default_baud; - int default_bits; - int default_parity; - int default_flow; - u32 chr_1_val; - u32 chr_2_val; - u32 chr_10_val; - u32 mpcr_val; - u32 bcr_val; - u8 brg_can_tune; - u8 brg_clk_src; - u32 brg_clk_freq; -}; - /* Watchdog Platform Device, Driver Data */ #define MV64x60_WDT_NAME "mv64x60_wdt" diff --git a/include/uapi/linux/serial_core.h b/include/uapi/linux/serial_core.h index 67c4aaaa2308..5642c05e0da0 100644 --- a/include/uapi/linux/serial_core.h +++ b/include/uapi/linux/serial_core.h @@ -129,7 +129,7 @@ /* Motorola i.MX SoC */ #define PORT_IMX 62 -/* Marvell MPSC */ +/* Marvell MPSC (obsolete unused) */ #define PORT_MPSC 63 /* TXX9 type number */ -- cgit v1.2.3-71-gd317 From f0c1aab2bd1ad131d9d7528b9dcbf9253a74e5da Mon Sep 17 00:00:00 2001 From: Pablo Neira Ayuso Date: Fri, 21 Jun 2019 17:37:48 +0200 Subject: netfilter: rename nf_SYNPROXY.h to nf_synproxy.h Uppercase is a 
reminiscence from the iptables infrastructure, rename this header before this is included in stable kernels. Signed-off-by: Pablo Neira Ayuso --- include/uapi/linux/netfilter/nf_SYNPROXY.h | 19 ------------------- include/uapi/linux/netfilter/nf_synproxy.h | 19 +++++++++++++++++++ include/uapi/linux/netfilter/xt_SYNPROXY.h | 2 +- net/netfilter/nf_synproxy_core.c | 2 +- 4 files changed, 21 insertions(+), 21 deletions(-) delete mode 100644 include/uapi/linux/netfilter/nf_SYNPROXY.h create mode 100644 include/uapi/linux/netfilter/nf_synproxy.h (limited to 'include/uapi/linux') diff --git a/include/uapi/linux/netfilter/nf_SYNPROXY.h b/include/uapi/linux/netfilter/nf_SYNPROXY.h deleted file mode 100644 index 068d1b3a6f06..000000000000 --- a/include/uapi/linux/netfilter/nf_SYNPROXY.h +++ /dev/null @@ -1,19 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 */ -#ifndef _NF_SYNPROXY_H -#define _NF_SYNPROXY_H - -#include <linux/types.h> - -#define NF_SYNPROXY_OPT_MSS 0x01 -#define NF_SYNPROXY_OPT_WSCALE 0x02 -#define NF_SYNPROXY_OPT_SACK_PERM 0x04 -#define NF_SYNPROXY_OPT_TIMESTAMP 0x08 -#define NF_SYNPROXY_OPT_ECN 0x10 - -struct nf_synproxy_info { - __u8 options; - __u8 wscale; - __u16 mss; -}; - -#endif /* _NF_SYNPROXY_H */ diff --git a/include/uapi/linux/netfilter/nf_synproxy.h b/include/uapi/linux/netfilter/nf_synproxy.h new file mode 100644 index 000000000000..068d1b3a6f06 --- /dev/null +++ b/include/uapi/linux/netfilter/nf_synproxy.h @@ -0,0 +1,19 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _NF_SYNPROXY_H +#define _NF_SYNPROXY_H + +#include <linux/types.h> + +#define NF_SYNPROXY_OPT_MSS 0x01 +#define NF_SYNPROXY_OPT_WSCALE 0x02 +#define NF_SYNPROXY_OPT_SACK_PERM 0x04 +#define NF_SYNPROXY_OPT_TIMESTAMP 0x08 +#define NF_SYNPROXY_OPT_ECN 0x10 + +struct nf_synproxy_info { + __u8 options; + __u8 wscale; + __u16 mss; +}; + +#endif /* _NF_SYNPROXY_H */ diff --git a/include/uapi/linux/netfilter/xt_SYNPROXY.h b/include/uapi/linux/netfilter/xt_SYNPROXY.h index 4d5611d647df..19c04ed86172 100644 ---
a/include/uapi/linux/netfilter/xt_SYNPROXY.h +++ b/include/uapi/linux/netfilter/xt_SYNPROXY.h @@ -2,7 +2,7 @@ #ifndef _XT_SYNPROXY_H #define _XT_SYNPROXY_H -#include <linux/netfilter/nf_SYNPROXY.h> +#include <linux/netfilter/nf_synproxy.h> #define XT_SYNPROXY_OPT_MSS NF_SYNPROXY_OPT_MSS #define XT_SYNPROXY_OPT_WSCALE NF_SYNPROXY_OPT_WSCALE diff --git a/net/netfilter/nf_synproxy_core.c b/net/netfilter/nf_synproxy_core.c index 409722d23302..b101f187eda8 100644 --- a/net/netfilter/nf_synproxy_core.c +++ b/net/netfilter/nf_synproxy_core.c @@ -11,7 +11,7 @@ #include #include -#include <linux/netfilter/nf_SYNPROXY.h> +#include <linux/netfilter/nf_synproxy.h> #include #include -- cgit v1.2.3-71-gd317 From 6f7b841bc939e7c811ad32427b58d54edbcfa6ed Mon Sep 17 00:00:00 2001 From: Vadim Fedorenko Date: Mon, 1 Jul 2019 19:49:34 +0300 Subject: ipvs: allow tunneling with gre encapsulation windows real servers can handle gre tunnels, this patch allows gre encapsulation with the tunneling method, thereby letting ipvs be load balancer for windows-based services Signed-off-by: Vadim Fedorenko Acked-by: Julian Anastasov Signed-off-by: Simon Horman Signed-off-by: Pablo Neira Ayuso --- include/uapi/linux/ip_vs.h | 1 + net/netfilter/ipvs/ip_vs_ctl.c | 1 + net/netfilter/ipvs/ip_vs_xmit.c | 66 +++++++++++++++++++++++++++++++++++++++-- 3 files changed, 65 insertions(+), 3 deletions(-) (limited to 'include/uapi/linux') diff --git a/include/uapi/linux/ip_vs.h b/include/uapi/linux/ip_vs.h index e4f18061a4fd..4102ddcb4e14 100644 --- a/include/uapi/linux/ip_vs.h +++ b/include/uapi/linux/ip_vs.h @@ -128,6 +128,7 @@ enum { IP_VS_CONN_F_TUNNEL_TYPE_IPIP = 0, /* IPIP */ IP_VS_CONN_F_TUNNEL_TYPE_GUE, /* GUE */ + IP_VS_CONN_F_TUNNEL_TYPE_GRE, /* GRE */ IP_VS_CONN_F_TUNNEL_TYPE_MAX, }; diff --git a/net/netfilter/ipvs/ip_vs_ctl.c b/net/netfilter/ipvs/ip_vs_ctl.c index 84384d896e29..998353bec74f 100644 --- a/net/netfilter/ipvs/ip_vs_ctl.c +++ b/net/netfilter/ipvs/ip_vs_ctl.c @@ -525,6 +525,7 @@ static void ip_vs_rs_hash(struct netns_ipvs *ipvs, struct ip_vs_dest *dest) port = dest->tun_port; break; case
IP_VS_CONN_F_TUNNEL_TYPE_IPIP: + case IP_VS_CONN_F_TUNNEL_TYPE_GRE: port = 0; break; default: diff --git a/net/netfilter/ipvs/ip_vs_xmit.c b/net/netfilter/ipvs/ip_vs_xmit.c index 71fc6d63a67f..9c464d24beec 100644 --- a/net/netfilter/ipvs/ip_vs_xmit.c +++ b/net/netfilter/ipvs/ip_vs_xmit.c @@ -29,6 +29,7 @@ #include /* for tcphdr */ #include #include +#include #include /* for csum_tcpudp_magic */ #include #include /* for icmp_send */ @@ -388,6 +389,12 @@ __ip_vs_get_out_rt(struct netns_ipvs *ipvs, int skb_af, struct sk_buff *skb, IP_VS_TUNNEL_ENCAP_FLAG_REMCSUM) && skb->ip_summed == CHECKSUM_PARTIAL) mtu -= GUE_PLEN_REMCSUM + GUE_LEN_PRIV; + } else if (dest->tun_type == IP_VS_CONN_F_TUNNEL_TYPE_GRE) { + __be16 tflags = 0; + + if (dest->tun_flags & IP_VS_TUNNEL_ENCAP_FLAG_CSUM) + tflags |= TUNNEL_CSUM; + mtu -= gre_calc_hlen(tflags); } if (mtu < 68) { IP_VS_DBG_RL("%s(): mtu less than 68\n", __func__); @@ -548,6 +555,12 @@ __ip_vs_get_out_rt_v6(struct netns_ipvs *ipvs, int skb_af, struct sk_buff *skb, IP_VS_TUNNEL_ENCAP_FLAG_REMCSUM) && skb->ip_summed == CHECKSUM_PARTIAL) mtu -= GUE_PLEN_REMCSUM + GUE_LEN_PRIV; + } else if (dest->tun_type == IP_VS_CONN_F_TUNNEL_TYPE_GRE) { + __be16 tflags = 0; + + if (dest->tun_flags & IP_VS_TUNNEL_ENCAP_FLAG_CSUM) + tflags |= TUNNEL_CSUM; + mtu -= gre_calc_hlen(tflags); } if (mtu < IPV6_MIN_MTU) { IP_VS_DBG_RL("%s(): mtu less than %d\n", __func__, @@ -1079,6 +1092,24 @@ ipvs_gue_encap(struct net *net, struct sk_buff *skb, return 0; } +static void +ipvs_gre_encap(struct net *net, struct sk_buff *skb, + struct ip_vs_conn *cp, __u8 *next_protocol) +{ + __be16 proto = *next_protocol == IPPROTO_IPIP ? 
+ htons(ETH_P_IP) : htons(ETH_P_IPV6); + __be16 tflags = 0; + size_t hdrlen; + + if (cp->dest->tun_flags & IP_VS_TUNNEL_ENCAP_FLAG_CSUM) + tflags |= TUNNEL_CSUM; + + hdrlen = gre_calc_hlen(tflags); + gre_build_header(skb, hdrlen, tflags, proto, 0, 0); + + *next_protocol = IPPROTO_GRE; +} + /* * IP Tunneling transmitter * @@ -1151,6 +1182,15 @@ ip_vs_tunnel_xmit(struct sk_buff *skb, struct ip_vs_conn *cp, gue_hdrlen = sizeof(struct guehdr) + gue_optlen; max_headroom += sizeof(struct udphdr) + gue_hdrlen; + } else if (tun_type == IP_VS_CONN_F_TUNNEL_TYPE_GRE) { + size_t gre_hdrlen; + __be16 tflags = 0; + + if (tun_flags & IP_VS_TUNNEL_ENCAP_FLAG_CSUM) + tflags |= TUNNEL_CSUM; + gre_hdrlen = gre_calc_hlen(tflags); + + max_headroom += gre_hdrlen; } /* We only care about the df field if sysctl_pmtu_disc(ipvs) is set */ @@ -1172,6 +1212,11 @@ ip_vs_tunnel_xmit(struct sk_buff *skb, struct ip_vs_conn *cp, skb->ip_summed == CHECKSUM_PARTIAL) { gso_type |= SKB_GSO_TUNNEL_REMCSUM; } + } else if (tun_type == IP_VS_CONN_F_TUNNEL_TYPE_GRE) { + if (tun_flags & IP_VS_TUNNEL_ENCAP_FLAG_CSUM) + gso_type |= SKB_GSO_GRE_CSUM; + else + gso_type |= SKB_GSO_GRE; } if (iptunnel_handle_offloads(skb, gso_type)) @@ -1192,8 +1237,8 @@ ip_vs_tunnel_xmit(struct sk_buff *skb, struct ip_vs_conn *cp, check = true; udp_set_csum(!check, skb, saddr, cp->daddr.ip, skb->len); - } - + } else if (tun_type == IP_VS_CONN_F_TUNNEL_TYPE_GRE) + ipvs_gre_encap(net, skb, cp, &next_protocol); skb_push(skb, sizeof(struct iphdr)); skb_reset_network_header(skb); @@ -1287,6 +1332,15 @@ ip_vs_tunnel_xmit_v6(struct sk_buff *skb, struct ip_vs_conn *cp, gue_hdrlen = sizeof(struct guehdr) + gue_optlen; max_headroom += sizeof(struct udphdr) + gue_hdrlen; + } else if (tun_type == IP_VS_CONN_F_TUNNEL_TYPE_GRE) { + size_t gre_hdrlen; + __be16 tflags = 0; + + if (tun_flags & IP_VS_TUNNEL_ENCAP_FLAG_CSUM) + tflags |= TUNNEL_CSUM; + gre_hdrlen = gre_calc_hlen(tflags); + + max_headroom += gre_hdrlen; } skb = 
ip_vs_prepare_tunneled_skb(skb, cp->af, max_headroom, @@ -1306,6 +1360,11 @@ ip_vs_tunnel_xmit_v6(struct sk_buff *skb, struct ip_vs_conn *cp, skb->ip_summed == CHECKSUM_PARTIAL) { gso_type |= SKB_GSO_TUNNEL_REMCSUM; } + } else if (tun_type == IP_VS_CONN_F_TUNNEL_TYPE_GRE) { + if (tun_flags & IP_VS_TUNNEL_ENCAP_FLAG_CSUM) + gso_type |= SKB_GSO_GRE_CSUM; + else + gso_type |= SKB_GSO_GRE; } if (iptunnel_handle_offloads(skb, gso_type)) @@ -1326,7 +1385,8 @@ ip_vs_tunnel_xmit_v6(struct sk_buff *skb, struct ip_vs_conn *cp, check = true; udp6_set_csum(!check, skb, &saddr, &cp->daddr.in6, skb->len); - } + } else if (tun_type == IP_VS_CONN_F_TUNNEL_TYPE_GRE) + ipvs_gre_encap(net, skb, cp, &next_protocol); skb_push(skb, sizeof(struct ipv6hdr)); skb_reset_network_header(skb); -- cgit v1.2.3-71-gd317 From 07a4ddec3ce9b0a533b5f90f582f1057390d5e63 Mon Sep 17 00:00:00 2001 From: Vincent Bernat Date: Tue, 2 Jul 2019 19:43:54 +0200 Subject: bonding: add an option to specify a delay between peer notifications Currently, gratuitous ARP/ND packets are sent every `miimon' milliseconds. This commit allows a user to specify a custom delay through a new option, `peer_notif_delay'. Like for `updelay' and `downdelay', this delay should be a multiple of `miimon' to avoid managing an additional work queue. The configuration logic is copied from `updelay' and `downdelay'. However, the default value cannot be set using a module parameter: Netlink or sysfs should be used to configure this feature. When setting `miimon' to 100 and `peer_notif_delay' to 500, we can observe the 500 ms delay is respected: 20:30:19.354693 ARP, Request who-has 203.0.113.10 tell 203.0.113.10, length 28 20:30:19.874892 ARP, Request who-has 203.0.113.10 tell 203.0.113.10, length 28 20:30:20.394919 ARP, Request who-has 203.0.113.10 tell 203.0.113.10, length 28 20:30:20.914963 ARP, Request who-has 203.0.113.10 tell 203.0.113.10, length 28 In bond_mii_monitor(), I have tried to keep the lock logic readable. 
The change is due to the fact we cannot rely on a notification to lower the value of `bond->send_peer_notif' as `NETDEV_NOTIFY_PEERS' is only triggered once every N times, while we need to decrement the counter each time. iproute2 also needs to be updated to be able to specify this new attribute through `ip link'. Signed-off-by: Vincent Bernat Signed-off-by: David S. Miller --- drivers/net/bonding/bond_main.c | 31 +++++++++++------ drivers/net/bonding/bond_netlink.c | 14 ++++++++ drivers/net/bonding/bond_options.c | 71 ++++++++++++++++++++++++-------------- drivers/net/bonding/bond_procfs.c | 2 ++ drivers/net/bonding/bond_sysfs.c | 13 +++++++ include/net/bond_options.h | 1 + include/net/bonding.h | 1 + include/uapi/linux/if_link.h | 1 + tools/include/uapi/linux/if_link.h | 1 + 9 files changed, 98 insertions(+), 37 deletions(-) (limited to 'include/uapi/linux') diff --git a/drivers/net/bonding/bond_main.c b/drivers/net/bonding/bond_main.c index 84168455aded..302499ae05e6 100644 --- a/drivers/net/bonding/bond_main.c +++ b/drivers/net/bonding/bond_main.c @@ -796,6 +796,8 @@ static bool bond_should_notify_peers(struct bonding *bond) slave ? 
slave->dev->name : "NULL"); if (!slave || !bond->send_peer_notif || + bond->send_peer_notif % + max(1, bond->params.peer_notif_delay) != 0 || !netif_carrier_ok(bond->dev) || test_bit(__LINK_STATE_LINKWATCH_PENDING, &slave->dev->state)) return false; @@ -886,15 +888,18 @@ void bond_change_active_slave(struct bonding *bond, struct slave *new_active) if (netif_running(bond->dev)) { bond->send_peer_notif = - bond->params.num_peer_notif; + bond->params.num_peer_notif * + max(1, bond->params.peer_notif_delay); should_notify_peers = bond_should_notify_peers(bond); } call_netdevice_notifiers(NETDEV_BONDING_FAILOVER, bond->dev); - if (should_notify_peers) + if (should_notify_peers) { + bond->send_peer_notif--; call_netdevice_notifiers(NETDEV_NOTIFY_PEERS, bond->dev); + } } } @@ -2279,6 +2284,7 @@ static void bond_mii_monitor(struct work_struct *work) struct bonding *bond = container_of(work, struct bonding, mii_work.work); bool should_notify_peers = false; + bool commit; unsigned long delay; struct slave *slave; struct list_head *iter; @@ -2289,12 +2295,19 @@ static void bond_mii_monitor(struct work_struct *work) goto re_arm; rcu_read_lock(); - should_notify_peers = bond_should_notify_peers(bond); - - if (bond_miimon_inspect(bond)) { + commit = !!bond_miimon_inspect(bond); + if (bond->send_peer_notif) { + rcu_read_unlock(); + if (rtnl_trylock()) { + bond->send_peer_notif--; + rtnl_unlock(); + } + } else { rcu_read_unlock(); + } + if (commit) { /* Race avoidance with bond_close cancel of workqueue */ if (!rtnl_trylock()) { delay = 1; @@ -2308,8 +2321,7 @@ static void bond_mii_monitor(struct work_struct *work) bond_miimon_commit(bond); rtnl_unlock(); /* might sleep, hold no other locks */ - } else - rcu_read_unlock(); + } re_arm: if (bond->params.miimon) @@ -3065,10 +3077,6 @@ static int bond_master_netdev_event(unsigned long event, case NETDEV_REGISTER: bond_create_proc_entry(event_bond); break; - case NETDEV_NOTIFY_PEERS: - if (event_bond->send_peer_notif) - 
event_bond->send_peer_notif--; - break; default: break; } @@ -4691,6 +4699,7 @@ static int bond_check_params(struct bond_params *params) params->arp_all_targets = arp_all_targets_value; params->updelay = updelay; params->downdelay = downdelay; + params->peer_notif_delay = 0; params->use_carrier = use_carrier; params->lacp_fast = lacp_fast; params->primary[0] = 0; diff --git a/drivers/net/bonding/bond_netlink.c b/drivers/net/bonding/bond_netlink.c index b24cce48ae35..a259860a7208 100644 --- a/drivers/net/bonding/bond_netlink.c +++ b/drivers/net/bonding/bond_netlink.c @@ -108,6 +108,7 @@ static const struct nla_policy bond_policy[IFLA_BOND_MAX + 1] = { [IFLA_BOND_AD_ACTOR_SYSTEM] = { .type = NLA_BINARY, .len = ETH_ALEN }, [IFLA_BOND_TLB_DYNAMIC_LB] = { .type = NLA_U8 }, + [IFLA_BOND_PEER_NOTIF_DELAY] = { .type = NLA_U32 }, }; static const struct nla_policy bond_slave_policy[IFLA_BOND_SLAVE_MAX + 1] = { @@ -215,6 +216,14 @@ static int bond_changelink(struct net_device *bond_dev, struct nlattr *tb[], if (err) return err; } + if (data[IFLA_BOND_PEER_NOTIF_DELAY]) { + int delay = nla_get_u32(data[IFLA_BOND_PEER_NOTIF_DELAY]); + + bond_opt_initval(&newval, delay); + err = __bond_opt_set(bond, BOND_OPT_PEER_NOTIF_DELAY, &newval); + if (err) + return err; + } if (data[IFLA_BOND_USE_CARRIER]) { int use_carrier = nla_get_u8(data[IFLA_BOND_USE_CARRIER]); @@ -494,6 +503,7 @@ static size_t bond_get_size(const struct net_device *bond_dev) nla_total_size(sizeof(u16)) + /* IFLA_BOND_AD_USER_PORT_KEY */ nla_total_size(ETH_ALEN) + /* IFLA_BOND_AD_ACTOR_SYSTEM */ nla_total_size(sizeof(u8)) + /* IFLA_BOND_TLB_DYNAMIC_LB */ + nla_total_size(sizeof(u32)) + /* IFLA_BOND_PEER_NOTIF_DELAY */ 0; } @@ -536,6 +546,10 @@ static int bond_fill_info(struct sk_buff *skb, bond->params.downdelay * bond->params.miimon)) goto nla_put_failure; + if (nla_put_u32(skb, IFLA_BOND_PEER_NOTIF_DELAY, + bond->params.peer_notif_delay * bond->params.miimon)) + goto nla_put_failure; + if (nla_put_u8(skb, 
IFLA_BOND_USE_CARRIER, bond->params.use_carrier)) goto nla_put_failure; diff --git a/drivers/net/bonding/bond_options.c b/drivers/net/bonding/bond_options.c index 0d852fe9da7c..ddb3916d3506 100644 --- a/drivers/net/bonding/bond_options.c +++ b/drivers/net/bonding/bond_options.c @@ -24,6 +24,8 @@ static int bond_option_updelay_set(struct bonding *bond, const struct bond_opt_value *newval); static int bond_option_downdelay_set(struct bonding *bond, const struct bond_opt_value *newval); +static int bond_option_peer_notif_delay_set(struct bonding *bond, + const struct bond_opt_value *newval); static int bond_option_use_carrier_set(struct bonding *bond, const struct bond_opt_value *newval); static int bond_option_arp_interval_set(struct bonding *bond, @@ -424,6 +426,13 @@ static const struct bond_option bond_opts[BOND_OPT_LAST] = { .desc = "Number of peer notifications to send on failover event", .values = bond_num_peer_notif_tbl, .set = bond_option_num_peer_notif_set + }, + [BOND_OPT_PEER_NOTIF_DELAY] = { + .id = BOND_OPT_PEER_NOTIF_DELAY, + .name = "peer_notif_delay", + .desc = "Delay between each peer notification on failover event, in milliseconds", + .values = bond_intmax_tbl, + .set = bond_option_peer_notif_delay_set } }; @@ -841,6 +850,9 @@ static int bond_option_miimon_set(struct bonding *bond, if (bond->params.downdelay) netdev_dbg(bond->dev, "Note: Updating downdelay (to %d) since it is a multiple of the miimon value\n", bond->params.downdelay * bond->params.miimon); + if (bond->params.peer_notif_delay) + netdev_dbg(bond->dev, "Note: Updating peer_notif_delay (to %d) since it is a multiple of the miimon value\n", + bond->params.peer_notif_delay * bond->params.miimon); if (newval->value && bond->params.arp_interval) { netdev_dbg(bond->dev, "MII monitoring cannot be used with ARP monitoring - disabling ARP monitoring...\n"); bond->params.arp_interval = 0; @@ -864,52 +876,59 @@ static int bond_option_miimon_set(struct bonding *bond, return 0; } -/* Set up and 
down delays. These must be multiples of the - * MII monitoring value, and are stored internally as the multiplier. - * Thus, we must translate to MS for the real world. +/* Set up, down and peer notification delays. These must be multiples + * of the MII monitoring value, and are stored internally as the + * multiplier. Thus, we must translate to MS for the real world. */ -static int bond_option_updelay_set(struct bonding *bond, - const struct bond_opt_value *newval) +static int _bond_option_delay_set(struct bonding *bond, + const struct bond_opt_value *newval, + const char *name, + int *target) { int value = newval->value; if (!bond->params.miimon) { - netdev_err(bond->dev, "Unable to set up delay as MII monitoring is disabled\n"); + netdev_err(bond->dev, "Unable to set %s as MII monitoring is disabled\n", + name); return -EPERM; } if ((value % bond->params.miimon) != 0) { - netdev_warn(bond->dev, "up delay (%d) is not a multiple of miimon (%d), updelay rounded to %d ms\n", + netdev_warn(bond->dev, + "%s (%d) is not a multiple of miimon (%d), value rounded to %d ms\n", + name, value, bond->params.miimon, (value / bond->params.miimon) * bond->params.miimon); } - bond->params.updelay = value / bond->params.miimon; - netdev_dbg(bond->dev, "Setting up delay to %d\n", - bond->params.updelay * bond->params.miimon); + *target = value / bond->params.miimon; + netdev_dbg(bond->dev, "Setting %s to %d\n", + name, + *target * bond->params.miimon); return 0; } +static int bond_option_updelay_set(struct bonding *bond, + const struct bond_opt_value *newval) +{ + return _bond_option_delay_set(bond, newval, "up delay", + &bond->params.updelay); +} + static int bond_option_downdelay_set(struct bonding *bond, const struct bond_opt_value *newval) { - int value = newval->value; - - if (!bond->params.miimon) { - netdev_err(bond->dev, "Unable to set down delay as MII monitoring is disabled\n"); - return -EPERM; - } - if ((value % bond->params.miimon) != 0) { - netdev_warn(bond->dev, 
"down delay (%d) is not a multiple of miimon (%d), delay rounded to %d ms\n", - value, bond->params.miimon, - (value / bond->params.miimon) * - bond->params.miimon); - } - bond->params.downdelay = value / bond->params.miimon; - netdev_dbg(bond->dev, "Setting down delay to %d\n", - bond->params.downdelay * bond->params.miimon); + return _bond_option_delay_set(bond, newval, "down delay", + &bond->params.downdelay); +} - return 0; +static int bond_option_peer_notif_delay_set(struct bonding *bond, + const struct bond_opt_value *newval) +{ + int ret = _bond_option_delay_set(bond, newval, + "peer notification delay", + &bond->params.peer_notif_delay); + return ret; } static int bond_option_use_carrier_set(struct bonding *bond, diff --git a/drivers/net/bonding/bond_procfs.c b/drivers/net/bonding/bond_procfs.c index 9f7d83e827c3..fd5c9cbe45b1 100644 --- a/drivers/net/bonding/bond_procfs.c +++ b/drivers/net/bonding/bond_procfs.c @@ -104,6 +104,8 @@ static void bond_info_show_master(struct seq_file *seq) bond->params.updelay * bond->params.miimon); seq_printf(seq, "Down Delay (ms): %d\n", bond->params.downdelay * bond->params.miimon); + seq_printf(seq, "Peer Notification Delay (ms): %d\n", + bond->params.peer_notif_delay * bond->params.miimon); /* ARP information */ diff --git a/drivers/net/bonding/bond_sysfs.c b/drivers/net/bonding/bond_sysfs.c index 94214eaf53c5..2d615a93685e 100644 --- a/drivers/net/bonding/bond_sysfs.c +++ b/drivers/net/bonding/bond_sysfs.c @@ -327,6 +327,18 @@ static ssize_t bonding_show_updelay(struct device *d, static DEVICE_ATTR(updelay, 0644, bonding_show_updelay, bonding_sysfs_store_option); +static ssize_t bonding_show_peer_notif_delay(struct device *d, + struct device_attribute *attr, + char *buf) +{ + struct bonding *bond = to_bond(d); + + return sprintf(buf, "%d\n", + bond->params.peer_notif_delay * bond->params.miimon); +} +static DEVICE_ATTR(peer_notif_delay, 0644, + bonding_show_peer_notif_delay, bonding_sysfs_store_option); + /* Show the 
LACP interval. */ static ssize_t bonding_show_lacp(struct device *d, struct device_attribute *attr, @@ -718,6 +730,7 @@ static struct attribute *per_bond_attrs[] = { &dev_attr_arp_ip_target.attr, &dev_attr_downdelay.attr, &dev_attr_updelay.attr, + &dev_attr_peer_notif_delay.attr, &dev_attr_lacp_rate.attr, &dev_attr_ad_select.attr, &dev_attr_xmit_hash_policy.attr, diff --git a/include/net/bond_options.h b/include/net/bond_options.h index 2a05cc349018..9d382f2f0bc5 100644 --- a/include/net/bond_options.h +++ b/include/net/bond_options.h @@ -63,6 +63,7 @@ enum { BOND_OPT_AD_ACTOR_SYSTEM, BOND_OPT_AD_USER_PORT_KEY, BOND_OPT_NUM_PEER_NOTIF_ALIAS, + BOND_OPT_PEER_NOTIF_DELAY, BOND_OPT_LAST }; diff --git a/include/net/bonding.h b/include/net/bonding.h index 676e7fae05a3..f7fe45689142 100644 --- a/include/net/bonding.h +++ b/include/net/bonding.h @@ -123,6 +123,7 @@ struct bond_params { int fail_over_mac; int updelay; int downdelay; + int peer_notif_delay; int lacp_fast; unsigned int min_links; int ad_select; diff --git a/include/uapi/linux/if_link.h b/include/uapi/linux/if_link.h index 6f75bda2c2d7..4a8c02cafa9a 100644 --- a/include/uapi/linux/if_link.h +++ b/include/uapi/linux/if_link.h @@ -636,6 +636,7 @@ enum { IFLA_BOND_AD_USER_PORT_KEY, IFLA_BOND_AD_ACTOR_SYSTEM, IFLA_BOND_TLB_DYNAMIC_LB, + IFLA_BOND_PEER_NOTIF_DELAY, __IFLA_BOND_MAX, }; diff --git a/tools/include/uapi/linux/if_link.h b/tools/include/uapi/linux/if_link.h index 5b225ff63b48..7d113a9602f0 100644 --- a/tools/include/uapi/linux/if_link.h +++ b/tools/include/uapi/linux/if_link.h @@ -636,6 +636,7 @@ enum { IFLA_BOND_AD_USER_PORT_KEY, IFLA_BOND_AD_ACTOR_SYSTEM, IFLA_BOND_TLB_DYNAMIC_LB, + IFLA_BOND_PEER_NOTIF_DELAY, __IFLA_BOND_MAX, }; -- cgit v1.2.3-71-gd317 From ca95c7bf3d29716916baccdc77c3c2284b703069 Mon Sep 17 00:00:00 2001 From: Takashi Iwai Date: Thu, 4 Jul 2019 16:31:12 +0200 Subject: ALSA: usb-audio: Fix parse of UAC2 Extension Units Extension Unit (XU) is used to have a compatible layout with 
Processing Unit (PU) on UAC1, and the usb-audio driver code assumed it for parsing the descriptors. Meanwhile, on UAC2, XU became slightly incompatible with PU; namely, XU has a one-byte bmControls bitmap while PU has two bytes bmControls bitmap. This incompatibility results in the read of a wrong address for the last iExtension field, which ended up with an incorrect string for the mixer element name, as recently reported for Focusrite Scarlett 18i20 device. This patch corrects this misalignment by introducing a couple of new macros and calling them depending on the descriptor type. Fixes: 23caaf19b11e ("ALSA: usb-mixer: Add support for Audio Class v2.0") Reported-by: Stefan Sauer Cc: Signed-off-by: Takashi Iwai --- include/uapi/linux/usb/audio.h | 37 +++++++++++++++++++++++++++++++++++++ sound/usb/mixer.c | 16 ++++++++++------ 2 files changed, 47 insertions(+), 6 deletions(-) (limited to 'include/uapi/linux') diff --git a/include/uapi/linux/usb/audio.h b/include/uapi/linux/usb/audio.h index ddc5396800aa..76b7c3f6cd0d 100644 --- a/include/uapi/linux/usb/audio.h +++ b/include/uapi/linux/usb/audio.h @@ -450,6 +450,43 @@ static inline __u8 *uac_processing_unit_specific(struct uac_processing_unit_desc } } +/* + * Extension Unit (XU) has almost compatible layout with Processing Unit, but + * on UAC2, it has a different bmControls size (bControlSize); it's 1 byte for + * XU while 2 bytes for PU. The last iExtension field is a one-byte index as + * well as iProcessing field of PU. 
+ */ +static inline __u8 uac_extension_unit_bControlSize(struct uac_processing_unit_descriptor *desc, + int protocol) +{ + switch (protocol) { + case UAC_VERSION_1: + return desc->baSourceID[desc->bNrInPins + 4]; + case UAC_VERSION_2: + return 1; /* in UAC2, this value is constant */ + case UAC_VERSION_3: + return 4; /* in UAC3, this value is constant */ + default: + return 1; + } +} + +static inline __u8 uac_extension_unit_iExtension(struct uac_processing_unit_descriptor *desc, + int protocol) +{ + __u8 control_size = uac_extension_unit_bControlSize(desc, protocol); + + switch (protocol) { + case UAC_VERSION_1: + case UAC_VERSION_2: + default: + return *(uac_processing_unit_bmControls(desc, protocol) + + control_size); + case UAC_VERSION_3: + return 0; /* UAC3 does not have this field */ + } +} + /* 4.5.2 Class-Specific AS Interface Descriptor */ struct uac1_as_header_descriptor { __u8 bLength; /* in bytes: 7 */ diff --git a/sound/usb/mixer.c b/sound/usb/mixer.c index e003b5e7b01a..ac121b10c51c 100644 --- a/sound/usb/mixer.c +++ b/sound/usb/mixer.c @@ -2318,7 +2318,7 @@ static struct procunit_info extunits[] = { */ static int build_audio_procunit(struct mixer_build *state, int unitid, void *raw_desc, struct procunit_info *list, - char *name) + bool extension_unit) { struct uac_processing_unit_descriptor *desc = raw_desc; int num_ins; @@ -2335,6 +2335,8 @@ static int build_audio_procunit(struct mixer_build *state, int unitid, static struct procunit_info default_info = { 0, NULL, default_value_info }; + const char *name = extension_unit ? 
+ "Extension Unit" : "Processing Unit"; if (desc->bLength < 13) { usb_audio_err(state->chip, "invalid %s descriptor (id %d)\n", name, unitid); @@ -2448,7 +2450,10 @@ static int build_audio_procunit(struct mixer_build *state, int unitid, } else if (info->name) { strlcpy(kctl->id.name, info->name, sizeof(kctl->id.name)); } else { - nameid = uac_processing_unit_iProcessing(desc, state->mixer->protocol); + if (extension_unit) + nameid = uac_extension_unit_iExtension(desc, state->mixer->protocol); + else + nameid = uac_processing_unit_iProcessing(desc, state->mixer->protocol); len = 0; if (nameid) len = snd_usb_copy_string_desc(state->chip, @@ -2481,10 +2486,10 @@ static int parse_audio_processing_unit(struct mixer_build *state, int unitid, case UAC_VERSION_2: default: return build_audio_procunit(state, unitid, raw_desc, - procunits, "Processing Unit"); + procunits, false); case UAC_VERSION_3: return build_audio_procunit(state, unitid, raw_desc, - uac3_procunits, "Processing Unit"); + uac3_procunits, false); } } @@ -2495,8 +2500,7 @@ static int parse_audio_extension_unit(struct mixer_build *state, int unitid, * Note that we parse extension units with processing unit descriptors. * That's ok as the layout is the same. */ - return build_audio_procunit(state, unitid, raw_desc, - extunits, "Extension Unit"); + return build_audio_procunit(state, unitid, raw_desc, extunits, true); } /* -- cgit v1.2.3-71-gd317 From ad49d86e07a497e834cb06f2b151dccd75f8e148 Mon Sep 17 00:00:00 2001 From: Fernando Fernandez Mancera Date: Wed, 26 Jun 2019 12:59:19 +0200 Subject: netfilter: nf_tables: Add synproxy support Add synproxy support for nf_tables. This behaves like the iptables synproxy target but it is structured in a way that allows us to propose improvements in the future. 
Signed-off-by: Fernando Fernandez Mancera Signed-off-by: Pablo Neira Ayuso --- include/net/netfilter/nf_conntrack_synproxy.h | 1 + include/net/netfilter/nf_synproxy.h | 5 + include/uapi/linux/netfilter/nf_synproxy.h | 4 + include/uapi/linux/netfilter/nf_tables.h | 16 ++ net/netfilter/Kconfig | 11 + net/netfilter/Makefile | 1 + net/netfilter/nft_synproxy.c | 287 ++++++++++++++++++++++++++ 7 files changed, 325 insertions(+) create mode 100644 net/netfilter/nft_synproxy.c (limited to 'include/uapi/linux') diff --git a/include/net/netfilter/nf_conntrack_synproxy.h b/include/net/netfilter/nf_conntrack_synproxy.h index c5659dcf5b1a..8f00125b06f4 100644 --- a/include/net/netfilter/nf_conntrack_synproxy.h +++ b/include/net/netfilter/nf_conntrack_synproxy.h @@ -2,6 +2,7 @@ #ifndef _NF_CONNTRACK_SYNPROXY_H #define _NF_CONNTRACK_SYNPROXY_H +#include #include struct nf_conn_synproxy { diff --git a/include/net/netfilter/nf_synproxy.h b/include/net/netfilter/nf_synproxy.h index 3e8b3f03b687..87d73fb5279d 100644 --- a/include/net/netfilter/nf_synproxy.h +++ b/include/net/netfilter/nf_synproxy.h @@ -39,6 +39,11 @@ unsigned int ipv6_synproxy_hook(void *priv, struct sk_buff *skb, const struct nf_hook_state *nhs); int nf_synproxy_ipv6_init(struct synproxy_net *snet, struct net *net); void nf_synproxy_ipv6_fini(struct synproxy_net *snet, struct net *net); +#else +static inline int +nf_synproxy_ipv6_init(struct synproxy_net *snet, struct net *net) { return 0; } +static inline void +nf_synproxy_ipv6_fini(struct synproxy_net *snet, struct net *net) {}; #endif /* CONFIG_IPV6 */ #endif /* _NF_SYNPROXY_SHARED_H */ diff --git a/include/uapi/linux/netfilter/nf_synproxy.h b/include/uapi/linux/netfilter/nf_synproxy.h index 068d1b3a6f06..6f3791c8946f 100644 --- a/include/uapi/linux/netfilter/nf_synproxy.h +++ b/include/uapi/linux/netfilter/nf_synproxy.h @@ -9,6 +9,10 @@ #define NF_SYNPROXY_OPT_SACK_PERM 0x04 #define NF_SYNPROXY_OPT_TIMESTAMP 0x08 #define NF_SYNPROXY_OPT_ECN 0x10 +#define 
NF_SYNPROXY_OPT_MASK (NF_SYNPROXY_OPT_MSS | \ + NF_SYNPROXY_OPT_WSCALE | \ + NF_SYNPROXY_OPT_SACK_PERM | \ + NF_SYNPROXY_OPT_TIMESTAMP) struct nf_synproxy_info { __u8 options; diff --git a/include/uapi/linux/netfilter/nf_tables.h b/include/uapi/linux/netfilter/nf_tables.h index c6c8ec5c7c00..c53d581643fe 100644 --- a/include/uapi/linux/netfilter/nf_tables.h +++ b/include/uapi/linux/netfilter/nf_tables.h @@ -1551,6 +1551,22 @@ enum nft_osf_flags { NFT_OSF_F_VERSION = (1 << 0), }; +/** + * enum nft_synproxy_attributes - nf_tables synproxy expression netlink attributes + * + * @NFTA_SYNPROXY_MSS: mss value sent to the backend (NLA_U16) + * @NFTA_SYNPROXY_WSCALE: wscale value sent to the backend (NLA_U8) + * @NFTA_SYNPROXY_FLAGS: flags (NLA_U32) + */ +enum nft_synproxy_attributes { + NFTA_SYNPROXY_UNSPEC, + NFTA_SYNPROXY_MSS, + NFTA_SYNPROXY_WSCALE, + NFTA_SYNPROXY_FLAGS, + __NFTA_SYNPROXY_MAX, +}; +#define NFTA_SYNPROXY_MAX (__NFTA_SYNPROXY_MAX - 1) + /** * enum nft_device_attributes - nf_tables device netlink attributes * diff --git a/net/netfilter/Kconfig b/net/netfilter/Kconfig index 21025c2c605b..d59742408d9b 100644 --- a/net/netfilter/Kconfig +++ b/net/netfilter/Kconfig @@ -651,6 +651,17 @@ config NFT_TPROXY help This makes transparent proxy support available in nftables. +config NFT_SYNPROXY + tristate "Netfilter nf_tables SYNPROXY expression support" + depends on NF_CONNTRACK && NETFILTER_ADVANCED + select NETFILTER_SYNPROXY + select SYN_COOKIES + help + The SYNPROXY expression allows you to intercept TCP connections and + establish them using syncookies before they are passed on to the + server. This allows to avoid conntrack and server resource usage + during SYN-flood attacks. 
+ if NF_TABLES_NETDEV config NF_DUP_NETDEV diff --git a/net/netfilter/Makefile b/net/netfilter/Makefile index 72cca6b48960..deada20975ff 100644 --- a/net/netfilter/Makefile +++ b/net/netfilter/Makefile @@ -110,6 +110,7 @@ obj-$(CONFIG_NFT_SOCKET) += nft_socket.o obj-$(CONFIG_NFT_OSF) += nft_osf.o obj-$(CONFIG_NFT_TPROXY) += nft_tproxy.o obj-$(CONFIG_NFT_XFRM) += nft_xfrm.o +obj-$(CONFIG_NFT_SYNPROXY) += nft_synproxy.o obj-$(CONFIG_NFT_NAT) += nft_chain_nat.o diff --git a/net/netfilter/nft_synproxy.c b/net/netfilter/nft_synproxy.c new file mode 100644 index 000000000000..80060ade8a5b --- /dev/null +++ b/net/netfilter/nft_synproxy.c @@ -0,0 +1,287 @@ +// SPDX-License-Identifier: GPL-2.0 +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +struct nft_synproxy { + struct nf_synproxy_info info; +}; + +static const struct nla_policy nft_synproxy_policy[NFTA_SYNPROXY_MAX + 1] = { + [NFTA_SYNPROXY_MSS] = { .type = NLA_U16 }, + [NFTA_SYNPROXY_WSCALE] = { .type = NLA_U8 }, + [NFTA_SYNPROXY_FLAGS] = { .type = NLA_U32 }, +}; + +static void nft_synproxy_tcp_options(struct synproxy_options *opts, + const struct tcphdr *tcp, + struct synproxy_net *snet, + struct nf_synproxy_info *info, + struct nft_synproxy *priv) +{ + this_cpu_inc(snet->stats->syn_received); + if (tcp->ece && tcp->cwr) + opts->options |= NF_SYNPROXY_OPT_ECN; + + opts->options &= priv->info.options; + if (opts->options & NF_SYNPROXY_OPT_TIMESTAMP) + synproxy_init_timestamp_cookie(info, opts); + else + opts->options &= ~(NF_SYNPROXY_OPT_WSCALE | + NF_SYNPROXY_OPT_SACK_PERM | + NF_SYNPROXY_OPT_ECN); +} + +static void nft_synproxy_eval_v4(const struct nft_expr *expr, + struct nft_regs *regs, + const struct nft_pktinfo *pkt, + const struct tcphdr *tcp, + struct tcphdr *_tcph, + struct synproxy_options *opts) +{ + struct nft_synproxy *priv = nft_expr_priv(expr); + struct nf_synproxy_info info = priv->info; + struct net *net = nft_net(pkt); + struct synproxy_net *snet = 
synproxy_pernet(net); + struct sk_buff *skb = pkt->skb; + + if (tcp->syn) { + /* Initial SYN from client */ + nft_synproxy_tcp_options(opts, tcp, snet, &info, priv); + synproxy_send_client_synack(net, skb, tcp, opts); + consume_skb(skb); + regs->verdict.code = NF_STOLEN; + } else if (tcp->ack) { + /* ACK from client */ + if (synproxy_recv_client_ack(net, skb, tcp, opts, + ntohl(tcp->seq))) { + consume_skb(skb); + regs->verdict.code = NF_STOLEN; + } else { + regs->verdict.code = NF_DROP; + } + } +} + +#if IS_ENABLED(CONFIG_NF_TABLES_IPV6) +static void nft_synproxy_eval_v6(const struct nft_expr *expr, + struct nft_regs *regs, + const struct nft_pktinfo *pkt, + const struct tcphdr *tcp, + struct tcphdr *_tcph, + struct synproxy_options *opts) +{ + struct nft_synproxy *priv = nft_expr_priv(expr); + struct nf_synproxy_info info = priv->info; + struct net *net = nft_net(pkt); + struct synproxy_net *snet = synproxy_pernet(net); + struct sk_buff *skb = pkt->skb; + + if (tcp->syn) { + /* Initial SYN from client */ + nft_synproxy_tcp_options(opts, tcp, snet, &info, priv); + synproxy_send_client_synack_ipv6(net, skb, tcp, opts); + consume_skb(skb); + regs->verdict.code = NF_STOLEN; + } else if (tcp->ack) { + /* ACK from client */ + if (synproxy_recv_client_ack_ipv6(net, skb, tcp, opts, + ntohl(tcp->seq))) { + consume_skb(skb); + regs->verdict.code = NF_STOLEN; + } else { + regs->verdict.code = NF_DROP; + } + } +} +#endif /* CONFIG_NF_TABLES_IPV6*/ + +static void nft_synproxy_eval(const struct nft_expr *expr, + struct nft_regs *regs, + const struct nft_pktinfo *pkt) +{ + struct synproxy_options opts = {}; + struct sk_buff *skb = pkt->skb; + int thoff = pkt->xt.thoff; + const struct tcphdr *tcp; + struct tcphdr _tcph; + + if (pkt->tprot != IPPROTO_TCP) { + regs->verdict.code = NFT_BREAK; + return; + } + + if (nf_ip_checksum(skb, nft_hook(pkt), thoff, IPPROTO_TCP)) { + regs->verdict.code = NF_DROP; + return; + } + + tcp = skb_header_pointer(skb, pkt->xt.thoff, + sizeof(struct 
tcphdr), + &_tcph); + if (!tcp) { + regs->verdict.code = NF_DROP; + return; + } + + if (!synproxy_parse_options(skb, thoff, tcp, &opts)) { + regs->verdict.code = NF_DROP; + return; + } + + switch (skb->protocol) { + case htons(ETH_P_IP): + nft_synproxy_eval_v4(expr, regs, pkt, tcp, &_tcph, &opts); + return; +#if IS_ENABLED(CONFIG_NF_TABLES_IPV6) + case htons(ETH_P_IPV6): + nft_synproxy_eval_v6(expr, regs, pkt, tcp, &_tcph, &opts); + return; +#endif + } + regs->verdict.code = NFT_BREAK; +} + +static int nft_synproxy_init(const struct nft_ctx *ctx, + const struct nft_expr *expr, + const struct nlattr * const tb[]) +{ + struct synproxy_net *snet = synproxy_pernet(ctx->net); + struct nft_synproxy *priv = nft_expr_priv(expr); + u32 flags; + int err; + + if (tb[NFTA_SYNPROXY_MSS]) + priv->info.mss = ntohs(nla_get_be16(tb[NFTA_SYNPROXY_MSS])); + if (tb[NFTA_SYNPROXY_WSCALE]) + priv->info.wscale = nla_get_u8(tb[NFTA_SYNPROXY_WSCALE]); + if (tb[NFTA_SYNPROXY_FLAGS]) { + flags = ntohl(nla_get_be32(tb[NFTA_SYNPROXY_FLAGS])); + if (flags & ~NF_SYNPROXY_OPT_MASK) + return -EOPNOTSUPP; + priv->info.options = flags; + } + + err = nf_ct_netns_get(ctx->net, ctx->family); + if (err) + return err; + + switch (ctx->family) { + case NFPROTO_IPV4: + err = nf_synproxy_ipv4_init(snet, ctx->net); + if (err) + goto nf_ct_failure; + break; +#if IS_ENABLED(CONFIG_NF_TABLES_IPV6) + case NFPROTO_IPV6: + err = nf_synproxy_ipv6_init(snet, ctx->net); + if (err) + goto nf_ct_failure; + break; +#endif + case NFPROTO_INET: + case NFPROTO_BRIDGE: + err = nf_synproxy_ipv4_init(snet, ctx->net); + if (err) + goto nf_ct_failure; + err = nf_synproxy_ipv6_init(snet, ctx->net); + if (err) + goto nf_ct_failure; + break; + } + + return 0; + +nf_ct_failure: + nf_ct_netns_put(ctx->net, ctx->family); + return err; +} + +static void nft_synproxy_destroy(const struct nft_ctx *ctx, + const struct nft_expr *expr) +{ + struct synproxy_net *snet = synproxy_pernet(ctx->net); + + switch (ctx->family) { + case 
NFPROTO_IPV4: + nf_synproxy_ipv4_fini(snet, ctx->net); + break; +#if IS_ENABLED(CONFIG_NF_TABLES_IPV6) + case NFPROTO_IPV6: + nf_synproxy_ipv6_fini(snet, ctx->net); + break; +#endif + case NFPROTO_INET: + case NFPROTO_BRIDGE: + nf_synproxy_ipv4_fini(snet, ctx->net); + nf_synproxy_ipv6_fini(snet, ctx->net); + break; + } + nf_ct_netns_put(ctx->net, ctx->family); +} + +static int nft_synproxy_dump(struct sk_buff *skb, const struct nft_expr *expr) +{ + const struct nft_synproxy *priv = nft_expr_priv(expr); + + if (nla_put_be16(skb, NFTA_SYNPROXY_MSS, htons(priv->info.mss)) || + nla_put_u8(skb, NFTA_SYNPROXY_WSCALE, priv->info.wscale) || + nla_put_be32(skb, NFTA_SYNPROXY_FLAGS, htonl(priv->info.options))) + goto nla_put_failure; + + return 0; + +nla_put_failure: + return -1; +} + +static int nft_synproxy_validate(const struct nft_ctx *ctx, + const struct nft_expr *expr, + const struct nft_data **data) +{ + return nft_chain_validate_hooks(ctx->chain, (1 << NF_INET_LOCAL_IN) | + (1 << NF_INET_FORWARD)); +} + +static struct nft_expr_type nft_synproxy_type; +static const struct nft_expr_ops nft_synproxy_ops = { + .eval = nft_synproxy_eval, + .size = NFT_EXPR_SIZE(sizeof(struct nft_synproxy)), + .init = nft_synproxy_init, + .destroy = nft_synproxy_destroy, + .dump = nft_synproxy_dump, + .type = &nft_synproxy_type, + .validate = nft_synproxy_validate, +}; + +static struct nft_expr_type nft_synproxy_type __read_mostly = { + .ops = &nft_synproxy_ops, + .name = "synproxy", + .owner = THIS_MODULE, + .policy = nft_synproxy_policy, + .maxattr = NFTA_SYNPROXY_MAX, +}; + +static int __init nft_synproxy_module_init(void) +{ + return nft_register_expr(&nft_synproxy_type); +} + +static void __exit nft_synproxy_module_exit(void) +{ + return nft_unregister_expr(&nft_synproxy_type); +} + +module_init(nft_synproxy_module_init); +module_exit(nft_synproxy_module_exit); + +MODULE_LICENSE("GPL"); +MODULE_AUTHOR("Fernando Fernandez "); +MODULE_ALIAS_NFT_EXPR("synproxy"); -- cgit v1.2.3-71-gd317 
From c54c7c685494fc0f1662091d4d0c4fc26e810471 Mon Sep 17 00:00:00 2001 From: wenxu Date: Fri, 5 Jul 2019 21:16:35 +0800 Subject: netfilter: nft_meta_bridge: add NFT_META_BRI_IIFPVID support This patch allows you to match on the bridge port pvid, eg. nft add rule bridge firewall zones counter meta ibrpvid 10 Signed-off-by: wenxu Reviewed-by: Nikolay Aleksandrov Signed-off-by: Pablo Neira Ayuso --- include/uapi/linux/netfilter/nf_tables.h | 2 ++ net/bridge/netfilter/nft_meta_bridge.c | 15 +++++++++++++++ 2 files changed, 17 insertions(+) (limited to 'include/uapi/linux') diff --git a/include/uapi/linux/netfilter/nf_tables.h b/include/uapi/linux/netfilter/nf_tables.h index c53d581643fe..87474920615a 100644 --- a/include/uapi/linux/netfilter/nf_tables.h +++ b/include/uapi/linux/netfilter/nf_tables.h @@ -795,6 +795,7 @@ enum nft_exthdr_attributes { * @NFT_META_SECPATH: boolean, secpath_exists (!!skb->sp) * @NFT_META_IIFKIND: packet input interface kind name (dev->rtnl_link_ops->kind) * @NFT_META_OIFKIND: packet output interface kind name (dev->rtnl_link_ops->kind) + * @NFT_META_BRI_IIFPVID: packet input bridge port pvid */ enum nft_meta_keys { NFT_META_LEN, @@ -825,6 +826,7 @@ enum nft_meta_keys { NFT_META_SECPATH, NFT_META_IIFKIND, NFT_META_OIFKIND, + NFT_META_BRI_IIFPVID, }; /** diff --git a/net/bridge/netfilter/nft_meta_bridge.c b/net/bridge/netfilter/nft_meta_bridge.c index 2ea8acb4bc4a..9487d42f657a 100644 --- a/net/bridge/netfilter/nft_meta_bridge.c +++ b/net/bridge/netfilter/nft_meta_bridge.c @@ -7,6 +7,7 @@ #include #include #include +#include static const struct net_device * nft_meta_get_bridge(const struct net_device *dev) @@ -37,6 +38,17 @@ static void nft_meta_bridge_get_eval(const struct nft_expr *expr, if (!br_dev) goto err; break; + case NFT_META_BRI_IIFPVID: { + u16 p_pvid; + + br_dev = nft_meta_get_bridge(in); + if (!br_dev || !br_vlan_enabled(br_dev)) + goto err; + + br_vlan_get_pvid_rcu(in, &p_pvid); + nft_reg_store16(dest, p_pvid); + return; + } 
default: goto out; } @@ -62,6 +74,9 @@ static int nft_meta_bridge_get_init(const struct nft_ctx *ctx, case NFT_META_BRI_OIFNAME: len = IFNAMSIZ; break; + case NFT_META_BRI_IIFPVID: + len = sizeof(u16); + break; default: return nft_meta_get_init(ctx, expr, tb); } -- cgit v1.2.3-71-gd317 From 2a3a93ef0ba5166e8b5766bb232f216fd412d40b Mon Sep 17 00:00:00 2001 From: wenxu Date: Fri, 5 Jul 2019 21:16:37 +0800 Subject: netfilter: nft_meta_bridge: Add NFT_META_BRI_IIFVPROTO support This patch allows you to match on bridge vlan protocol, eg. nft add rule bridge firewall zones counter meta ibrvproto 0x8100 Signed-off-by: wenxu Reviewed-by: Nikolay Aleksandrov Signed-off-by: Pablo Neira Ayuso --- include/uapi/linux/netfilter/nf_tables.h | 2 ++ net/bridge/netfilter/nft_meta_bridge.c | 12 ++++++++++++ 2 files changed, 14 insertions(+) (limited to 'include/uapi/linux') diff --git a/include/uapi/linux/netfilter/nf_tables.h b/include/uapi/linux/netfilter/nf_tables.h index 87474920615a..0e3462dfb182 100644 --- a/include/uapi/linux/netfilter/nf_tables.h +++ b/include/uapi/linux/netfilter/nf_tables.h @@ -796,6 +796,7 @@ enum nft_exthdr_attributes { * @NFT_META_IIFKIND: packet input interface kind name (dev->rtnl_link_ops->kind) * @NFT_META_OIFKIND: packet output interface kind name (dev->rtnl_link_ops->kind) * @NFT_META_BRI_IIFPVID: packet input bridge port pvid + * @NFT_META_BRI_IIFVPROTO: packet input bridge vlan proto */ enum nft_meta_keys { NFT_META_LEN, @@ -827,6 +828,7 @@ enum nft_meta_keys { NFT_META_IIFKIND, NFT_META_OIFKIND, NFT_META_BRI_IIFPVID, + NFT_META_BRI_IIFVPROTO, }; /** diff --git a/net/bridge/netfilter/nft_meta_bridge.c b/net/bridge/netfilter/nft_meta_bridge.c index 9487d42f657a..bed66f536b34 100644 --- a/net/bridge/netfilter/nft_meta_bridge.c +++ b/net/bridge/netfilter/nft_meta_bridge.c @@ -49,6 +49,17 @@ static void nft_meta_bridge_get_eval(const struct nft_expr *expr, nft_reg_store16(dest, p_pvid); return; } + case NFT_META_BRI_IIFVPROTO: { + u16 p_proto; + + 
br_dev = nft_meta_get_bridge(in); + if (!br_dev || !br_vlan_enabled(br_dev)) + goto err; + + br_vlan_get_proto(br_dev, &p_proto); + nft_reg_store16(dest, p_proto); + return; + } default: goto out; } @@ -75,6 +86,7 @@ static int nft_meta_bridge_get_init(const struct nft_ctx *ctx, len = IFNAMSIZ; break; case NFT_META_BRI_IIFPVID: + case NFT_META_BRI_IIFVPROTO: len = sizeof(u16); break; default: -- cgit v1.2.3-71-gd317 From 6e84200c0a2994b991259d19450eee561029bf70 Mon Sep 17 00:00:00 2001 From: Pankaj Gupta Date: Fri, 5 Jul 2019 19:33:23 +0530 Subject: virtio-pmem: Add virtio pmem driver This patch adds virtio-pmem driver for KVM guest. Guest reads the persistent memory range information from Qemu over VIRTIO and registers it on nvdimm_bus. It also creates a nd_region object with the persistent memory range information so that existing 'nvdimm/pmem' driver can reserve this into system memory map. This way 'virtio-pmem' driver uses existing functionality of pmem driver to register persistent memory compatible for DAX capable filesystems. This also provides function to perform guest flush over VIRTIO from 'pmem' driver when userspace performs flush on DAX memory range. Signed-off-by: Pankaj Gupta Reviewed-by: Yuval Shaia Acked-by: Michael S. 
Tsirkin Acked-by: Jakub Staron Tested-by: Jakub Staron Reviewed-by: Cornelia Huck Signed-off-by: Dan Williams --- drivers/nvdimm/Makefile | 1 + drivers/nvdimm/nd_virtio.c | 125 +++++++++++++++++++++++++++++++++++++++ drivers/nvdimm/virtio_pmem.c | 122 ++++++++++++++++++++++++++++++++++++++ drivers/nvdimm/virtio_pmem.h | 55 +++++++++++++++++ drivers/virtio/Kconfig | 11 ++++ include/uapi/linux/virtio_ids.h | 1 + include/uapi/linux/virtio_pmem.h | 34 +++++++++++ 7 files changed, 349 insertions(+) create mode 100644 drivers/nvdimm/nd_virtio.c create mode 100644 drivers/nvdimm/virtio_pmem.c create mode 100644 drivers/nvdimm/virtio_pmem.h create mode 100644 include/uapi/linux/virtio_pmem.h (limited to 'include/uapi/linux') diff --git a/drivers/nvdimm/Makefile b/drivers/nvdimm/Makefile index 6f2a088afad6..cefe233e0b52 100644 --- a/drivers/nvdimm/Makefile +++ b/drivers/nvdimm/Makefile @@ -5,6 +5,7 @@ obj-$(CONFIG_ND_BTT) += nd_btt.o obj-$(CONFIG_ND_BLK) += nd_blk.o obj-$(CONFIG_X86_PMEM_LEGACY) += nd_e820.o obj-$(CONFIG_OF_PMEM) += of_pmem.o +obj-$(CONFIG_VIRTIO_PMEM) += virtio_pmem.o nd_virtio.o nd_pmem-y := pmem.o diff --git a/drivers/nvdimm/nd_virtio.c b/drivers/nvdimm/nd_virtio.c new file mode 100644 index 000000000000..8645275c08c2 --- /dev/null +++ b/drivers/nvdimm/nd_virtio.c @@ -0,0 +1,125 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * virtio_pmem.c: Virtio pmem Driver + * + * Discovers persistent memory range information + * from host and provides a virtio based flushing + * interface. 
+ */ +#include "virtio_pmem.h" +#include "nd.h" + + /* The interrupt handler */ +void virtio_pmem_host_ack(struct virtqueue *vq) +{ + struct virtio_pmem *vpmem = vq->vdev->priv; + struct virtio_pmem_request *req_data, *req_buf; + unsigned long flags; + unsigned int len; + + spin_lock_irqsave(&vpmem->pmem_lock, flags); + while ((req_data = virtqueue_get_buf(vq, &len)) != NULL) { + req_data->done = true; + wake_up(&req_data->host_acked); + + if (!list_empty(&vpmem->req_list)) { + req_buf = list_first_entry(&vpmem->req_list, + struct virtio_pmem_request, list); + req_buf->wq_buf_avail = true; + wake_up(&req_buf->wq_buf); + list_del(&req_buf->list); + } + } + spin_unlock_irqrestore(&vpmem->pmem_lock, flags); +} +EXPORT_SYMBOL_GPL(virtio_pmem_host_ack); + + /* The request submission function */ +static int virtio_pmem_flush(struct nd_region *nd_region) +{ + struct virtio_device *vdev = nd_region->provider_data; + struct virtio_pmem *vpmem = vdev->priv; + struct virtio_pmem_request *req_data; + struct scatterlist *sgs[2], sg, ret; + unsigned long flags; + int err, err1; + + might_sleep(); + req_data = kmalloc(sizeof(*req_data), GFP_KERNEL); + if (!req_data) + return -ENOMEM; + + req_data->done = false; + init_waitqueue_head(&req_data->host_acked); + init_waitqueue_head(&req_data->wq_buf); + INIT_LIST_HEAD(&req_data->list); + req_data->req.type = cpu_to_virtio32(vdev, VIRTIO_PMEM_REQ_TYPE_FLUSH); + sg_init_one(&sg, &req_data->req, sizeof(req_data->req)); + sgs[0] = &sg; + sg_init_one(&ret, &req_data->resp.ret, sizeof(req_data->resp)); + sgs[1] = &ret; + + spin_lock_irqsave(&vpmem->pmem_lock, flags); + /* + * If virtqueue_add_sgs returns -ENOSPC then req_vq virtual + * queue does not have free descriptor. We add the request + * to req_list and wait for host_ack to wake us up when free + * slots are available. 
+ */ + while ((err = virtqueue_add_sgs(vpmem->req_vq, sgs, 1, 1, req_data, + GFP_ATOMIC)) == -ENOSPC) { + + dev_info(&vdev->dev, "failed to send command to virtio pmem device, no free slots in the virtqueue\n"); + req_data->wq_buf_avail = false; + list_add_tail(&req_data->list, &vpmem->req_list); + spin_unlock_irqrestore(&vpmem->pmem_lock, flags); + + /* A host response results in "host_ack" getting called */ + wait_event(req_data->wq_buf, req_data->wq_buf_avail); + spin_lock_irqsave(&vpmem->pmem_lock, flags); + } + err1 = virtqueue_kick(vpmem->req_vq); + spin_unlock_irqrestore(&vpmem->pmem_lock, flags); + /* + * virtqueue_add_sgs failed with error different than -ENOSPC, we can't + * do anything about that. + */ + if (err || !err1) { + dev_info(&vdev->dev, "failed to send command to virtio pmem device\n"); + err = -EIO; + } else { + /* A host response results in "host_ack" getting called */ + wait_event(req_data->host_acked, req_data->done); + err = virtio32_to_cpu(vdev, req_data->resp.ret); + } + + kfree(req_data); + return err; +}; + +/* The asynchronous flush callback function */ +int async_pmem_flush(struct nd_region *nd_region, struct bio *bio) +{ + /* + * Create child bio for asynchronous flush and chain with + * parent bio. Otherwise directly call nd_region flush.
+ */ + if (bio && bio->bi_iter.bi_sector != -1) { + struct bio *child = bio_alloc(GFP_ATOMIC, 0); + + if (!child) + return -ENOMEM; + bio_copy_dev(child, bio); + child->bi_opf = REQ_PREFLUSH; + child->bi_iter.bi_sector = -1; + bio_chain(child, bio); + submit_bio(child); + return 0; + } + if (virtio_pmem_flush(nd_region)) + return -EIO; + + return 0; +}; +EXPORT_SYMBOL_GPL(async_pmem_flush); +MODULE_LICENSE("GPL"); diff --git a/drivers/nvdimm/virtio_pmem.c b/drivers/nvdimm/virtio_pmem.c new file mode 100644 index 000000000000..5e3d07b47e0c --- /dev/null +++ b/drivers/nvdimm/virtio_pmem.c @@ -0,0 +1,122 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * virtio_pmem.c: Virtio pmem Driver + * + * Discovers persistent memory range information + * from host and registers the virtual pmem device + * with libnvdimm core. + */ +#include "virtio_pmem.h" +#include "nd.h" + +static struct virtio_device_id id_table[] = { + { VIRTIO_ID_PMEM, VIRTIO_DEV_ANY_ID }, + { 0 }, +}; + + /* Initialize virt queue */ +static int init_vq(struct virtio_pmem *vpmem) +{ + /* single vq */ + vpmem->req_vq = virtio_find_single_vq(vpmem->vdev, + virtio_pmem_host_ack, "flush_queue"); + if (IS_ERR(vpmem->req_vq)) + return PTR_ERR(vpmem->req_vq); + + spin_lock_init(&vpmem->pmem_lock); + INIT_LIST_HEAD(&vpmem->req_list); + + return 0; +}; + +static int virtio_pmem_probe(struct virtio_device *vdev) +{ + struct nd_region_desc ndr_desc = {}; + int nid = dev_to_node(&vdev->dev); + struct nd_region *nd_region; + struct virtio_pmem *vpmem; + struct resource res; + int err = 0; + + if (!vdev->config->get) { + dev_err(&vdev->dev, "%s failure: config access disabled\n", + __func__); + return -EINVAL; + } + + vpmem = devm_kzalloc(&vdev->dev, sizeof(*vpmem), GFP_KERNEL); + if (!vpmem) { + err = -ENOMEM; + goto out_err; + } + + vpmem->vdev = vdev; + vdev->priv = vpmem; + err = init_vq(vpmem); + if (err) { + dev_err(&vdev->dev, "failed to initialize virtio pmem vq's\n"); + goto out_err; + } + + 
virtio_cread(vpmem->vdev, struct virtio_pmem_config, + start, &vpmem->start); + virtio_cread(vpmem->vdev, struct virtio_pmem_config, + size, &vpmem->size); + + res.start = vpmem->start; + res.end = vpmem->start + vpmem->size - 1; + vpmem->nd_desc.provider_name = "virtio-pmem"; + vpmem->nd_desc.module = THIS_MODULE; + + vpmem->nvdimm_bus = nvdimm_bus_register(&vdev->dev, + &vpmem->nd_desc); + if (!vpmem->nvdimm_bus) { + dev_err(&vdev->dev, "failed to register device with nvdimm_bus\n"); + err = -ENXIO; + goto out_vq; + } + + dev_set_drvdata(&vdev->dev, vpmem->nvdimm_bus); + + ndr_desc.res = &res; + ndr_desc.numa_node = nid; + ndr_desc.flush = async_pmem_flush; + set_bit(ND_REGION_PAGEMAP, &ndr_desc.flags); + set_bit(ND_REGION_ASYNC, &ndr_desc.flags); + nd_region = nvdimm_pmem_region_create(vpmem->nvdimm_bus, &ndr_desc); + if (!nd_region) { + dev_err(&vdev->dev, "failed to create nvdimm region\n"); + err = -ENXIO; + goto out_nd; + } + nd_region->provider_data = dev_to_virtio(nd_region->dev.parent->parent); + return 0; +out_nd: + nvdimm_bus_unregister(vpmem->nvdimm_bus); +out_vq: + vdev->config->del_vqs(vdev); +out_err: + return err; +} + +static void virtio_pmem_remove(struct virtio_device *vdev) +{ + struct nvdimm_bus *nvdimm_bus = dev_get_drvdata(&vdev->dev); + + nvdimm_bus_unregister(nvdimm_bus); + vdev->config->del_vqs(vdev); + vdev->config->reset(vdev); +} + +static struct virtio_driver virtio_pmem_driver = { + .driver.name = KBUILD_MODNAME, + .driver.owner = THIS_MODULE, + .id_table = id_table, + .probe = virtio_pmem_probe, + .remove = virtio_pmem_remove, +}; + +module_virtio_driver(virtio_pmem_driver); +MODULE_DEVICE_TABLE(virtio, id_table); +MODULE_DESCRIPTION("Virtio pmem driver"); +MODULE_LICENSE("GPL"); diff --git a/drivers/nvdimm/virtio_pmem.h b/drivers/nvdimm/virtio_pmem.h new file mode 100644 index 000000000000..0dddefe594c4 --- /dev/null +++ b/drivers/nvdimm/virtio_pmem.h @@ -0,0 +1,55 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * virtio_pmem.h: 
virtio pmem Driver + * + * Discovers persistent memory range information + * from host and provides a virtio based flushing + * interface. + **/ + +#ifndef _LINUX_VIRTIO_PMEM_H +#define _LINUX_VIRTIO_PMEM_H + +#include +#include +#include +#include + +struct virtio_pmem_request { + struct virtio_pmem_req req; + struct virtio_pmem_resp resp; + + /* Wait queue to process deferred work after ack from host */ + wait_queue_head_t host_acked; + bool done; + + /* Wait queue to process deferred work after virt queue buffer avail */ + wait_queue_head_t wq_buf; + bool wq_buf_avail; + struct list_head list; +}; + +struct virtio_pmem { + struct virtio_device *vdev; + + /* Virtio pmem request queue */ + struct virtqueue *req_vq; + + /* nvdimm bus registers virtio pmem device */ + struct nvdimm_bus *nvdimm_bus; + struct nvdimm_bus_descriptor nd_desc; + + /* List to store deferred work if virtqueue is full */ + struct list_head req_list; + + /* Synchronize virtqueue data */ + spinlock_t pmem_lock; + + /* Memory region information */ + __u64 start; + __u64 size; +}; + +void virtio_pmem_host_ack(struct virtqueue *vq); +int async_pmem_flush(struct nd_region *nd_region, struct bio *bio); +#endif diff --git a/drivers/virtio/Kconfig b/drivers/virtio/Kconfig index 023fc3bc01c6..078615cf2afc 100644 --- a/drivers/virtio/Kconfig +++ b/drivers/virtio/Kconfig @@ -43,6 +43,17 @@ config VIRTIO_PCI_LEGACY If unsure, say Y. +config VIRTIO_PMEM + tristate "Support for virtio pmem driver" + depends on VIRTIO + depends on LIBNVDIMM + help + This driver provides access to virtio-pmem devices, storage devices + that are mapped into the physical address space - similar to NVDIMMs + - with a virtio-based flushing interface. + + If unsure, say Y. 
+ config VIRTIO_BALLOON tristate "Virtio balloon driver" depends on VIRTIO diff --git a/include/uapi/linux/virtio_ids.h b/include/uapi/linux/virtio_ids.h index 6d5c3b2d4f4d..32b2f94d1f58 100644 --- a/include/uapi/linux/virtio_ids.h +++ b/include/uapi/linux/virtio_ids.h @@ -43,5 +43,6 @@ #define VIRTIO_ID_INPUT 18 /* virtio input */ #define VIRTIO_ID_VSOCK 19 /* virtio vsock transport */ #define VIRTIO_ID_CRYPTO 20 /* virtio crypto */ +#define VIRTIO_ID_PMEM 27 /* virtio pmem */ #endif /* _LINUX_VIRTIO_IDS_H */ diff --git a/include/uapi/linux/virtio_pmem.h b/include/uapi/linux/virtio_pmem.h new file mode 100644 index 000000000000..efcd72f2d20d --- /dev/null +++ b/include/uapi/linux/virtio_pmem.h @@ -0,0 +1,34 @@ +/* SPDX-License-Identifier: GPL-2.0 OR BSD-3-Clause */ +/* + * Definitions for virtio-pmem devices. + * + * Copyright (C) 2019 Red Hat, Inc. + * + * Author(s): Pankaj Gupta + */ + +#ifndef _UAPI_LINUX_VIRTIO_PMEM_H +#define _UAPI_LINUX_VIRTIO_PMEM_H + +#include +#include +#include + +struct virtio_pmem_config { + __u64 start; + __u64 size; +}; + +#define VIRTIO_PMEM_REQ_TYPE_FLUSH 0 + +struct virtio_pmem_resp { + /* Host return status corresponding to flush request */ + __u32 ret; +}; + +struct virtio_pmem_req { + /* command type */ + __u32 type; +}; + +#endif -- cgit v1.2.3-71-gd317 From 600c70bad6594cb124c641ed05355ca134650ea4 Mon Sep 17 00:00:00 2001 From: Stanislav Fomichev Date: Mon, 1 Jul 2019 10:38:39 -0700 Subject: bpf: allow wide (u64) aligned stores for some fields of bpf_sock_addr Since commit cd17d7770578 ("bpf/tools: sync bpf.h") clang decided that it can do a single u64 store into user_ip6[2] instead of two separate u32 ones: # 17: (18) r2 = 0x100000000000000 # ; ctx->user_ip6[2] = bpf_htonl(DST_REWRITE_IP6_2); # 19: (7b) *(u64 *)(r1 +16) = r2 # invalid bpf_context access off=16 size=8 From the compiler point of view it does look like a correct thing to do, so let's support it on the kernel side.
Credit to Andrii Nakryiko for a proper implementation of bpf_ctx_wide_store_ok. Cc: Andrii Nakryiko Cc: Yonghong Song Fixes: cd17d7770578 ("bpf/tools: sync bpf.h") Reported-by: kernel test robot Acked-by: Yonghong Song Acked-by: Andrii Nakryiko Signed-off-by: Stanislav Fomichev Signed-off-by: Daniel Borkmann --- include/linux/filter.h | 6 ++++++ include/uapi/linux/bpf.h | 6 +++--- net/core/filter.c | 22 ++++++++++++++-------- 3 files changed, 23 insertions(+), 11 deletions(-) (limited to 'include/uapi/linux') diff --git a/include/linux/filter.h b/include/linux/filter.h index 1fe53e78c7e3..6d944369ca87 100644 --- a/include/linux/filter.h +++ b/include/linux/filter.h @@ -747,6 +747,12 @@ bpf_ctx_narrow_access_ok(u32 off, u32 size, u32 size_default) return size <= size_default && (size & (size - 1)) == 0; } +#define bpf_ctx_wide_store_ok(off, size, type, field) \ + (size == sizeof(__u64) && \ + off >= offsetof(type, field) && \ + off + sizeof(__u64) <= offsetofend(type, field) && \ + off % sizeof(__u64) == 0) + #define bpf_classic_proglen(fprog) (fprog->len * sizeof(fprog->filter[0])) static inline void bpf_prog_lock_ro(struct bpf_prog *fp) diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index ead27aebf491..c318385aba51 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -3247,7 +3247,7 @@ struct bpf_sock_addr { __u32 user_ip4; /* Allows 1,2,4-byte read and 4-byte write. * Stored in network byte order. */ - __u32 user_ip6[4]; /* Allows 1,2,4-byte read an 4-byte write. + __u32 user_ip6[4]; /* Allows 1,2,4-byte read and 4,8-byte write. * Stored in network byte order. */ __u32 user_port; /* Allows 4-byte read and write. @@ -3256,10 +3256,10 @@ struct bpf_sock_addr { __u32 family; /* Allows 4-byte read, but no write */ __u32 type; /* Allows 4-byte read, but no write */ __u32 protocol; /* Allows 4-byte read, but no write */ - __u32 msg_src_ip4; /* Allows 1,2,4-byte read an 4-byte write. 
+ __u32 msg_src_ip4; /* Allows 1,2,4-byte read and 4-byte write. * Stored in network byte order. */ - __u32 msg_src_ip6[4]; /* Allows 1,2,4-byte read an 4-byte write. + __u32 msg_src_ip6[4]; /* Allows 1,2,4-byte read and 4,8-byte write. * Stored in network byte order. */ __bpf_md_ptr(struct bpf_sock *, sk); diff --git a/net/core/filter.c b/net/core/filter.c index 089aaea0ccc6..4481e950f020 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -6890,6 +6890,16 @@ static bool sock_addr_is_valid_access(int off, int size, if (!bpf_ctx_narrow_access_ok(off, size, size_default)) return false; } else { + if (bpf_ctx_wide_store_ok(off, size, + struct bpf_sock_addr, + user_ip6)) + return true; + + if (bpf_ctx_wide_store_ok(off, size, + struct bpf_sock_addr, + msg_src_ip6)) + return true; + if (size != size_default) return false; } @@ -7730,9 +7740,6 @@ static u32 xdp_convert_ctx_access(enum bpf_access_type type, /* SOCK_ADDR_STORE_NESTED_FIELD_OFF() has semantic similar to * SOCK_ADDR_LOAD_NESTED_FIELD_SIZE_OFF() but for store operation. * - * It doesn't support SIZE argument though since narrow stores are not - * supported for now. - * * In addition it uses Temporary Field TF (member of struct S) as the 3rd * "register" since two registers available in convert_ctx_access are not * enough: we can't override neither SRC, since it contains value to store, nor @@ -7740,7 +7747,7 @@ static u32 xdp_convert_ctx_access(enum bpf_access_type type, * instructions. But we need a temporary place to save pointer to nested * structure whose field we want to store to. 
*/ -#define SOCK_ADDR_STORE_NESTED_FIELD_OFF(S, NS, F, NF, OFF, TF) \ +#define SOCK_ADDR_STORE_NESTED_FIELD_OFF(S, NS, F, NF, SIZE, OFF, TF) \ do { \ int tmp_reg = BPF_REG_9; \ if (si->src_reg == tmp_reg || si->dst_reg == tmp_reg) \ @@ -7751,8 +7758,7 @@ static u32 xdp_convert_ctx_access(enum bpf_access_type type, offsetof(S, TF)); \ *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(S, F), tmp_reg, \ si->dst_reg, offsetof(S, F)); \ - *insn++ = BPF_STX_MEM( \ - BPF_FIELD_SIZEOF(NS, NF), tmp_reg, si->src_reg, \ + *insn++ = BPF_STX_MEM(SIZE, tmp_reg, si->src_reg, \ bpf_target_off(NS, NF, FIELD_SIZEOF(NS, NF), \ target_size) \ + OFF); \ @@ -7764,8 +7770,8 @@ static u32 xdp_convert_ctx_access(enum bpf_access_type type, TF) \ do { \ if (type == BPF_WRITE) { \ - SOCK_ADDR_STORE_NESTED_FIELD_OFF(S, NS, F, NF, OFF, \ - TF); \ + SOCK_ADDR_STORE_NESTED_FIELD_OFF(S, NS, F, NF, SIZE, \ + OFF, TF); \ } else { \ SOCK_ADDR_LOAD_NESTED_FIELD_SIZE_OFF( \ S, NS, F, NF, SIZE, OFF); \ -- cgit v1.2.3-71-gd317 From 2a2ea50870baa3fb4de0872c5b60828138654ca7 Mon Sep 17 00:00:00 2001 From: John Hurley Date: Sun, 7 Jul 2019 15:01:57 +0100 Subject: net: sched: add mpls manipulation actions to TC Currently, TC offers the ability to match on the MPLS fields of a packet through the use of the flow_dissector_key_mpls struct. However, as yet, TC actions do not allow the modification or manipulation of such fields. Add a new module that registers TC action ops to allow manipulation of MPLS. This includes the ability to push and pop headers as well as modify the contents of new or existing headers. A further action to decrement the TTL field of an MPLS header is also provided with a new helper added to support this. 
Examples of the usage of the new action with flower rules to push and pop MPLS labels are: tc filter add dev eth0 protocol ip parent ffff: flower \ action mpls push protocol mpls_uc label 123 \ action mirred egress redirect dev eth1 tc filter add dev eth0 protocol mpls_uc parent ffff: flower \ action mpls pop protocol ipv4 \ action mirred egress redirect dev eth1 Signed-off-by: John Hurley Reviewed-by: Jakub Kicinski Reviewed-by: Simon Horman Reviewed-by: Willem de Bruijn Acked-by: Cong Wang Signed-off-by: David S. Miller --- include/linux/skbuff.h | 1 + include/net/tc_act/tc_mpls.h | 30 +++ include/uapi/linux/pkt_cls.h | 3 +- include/uapi/linux/tc_act/tc_mpls.h | 33 +++ net/core/skbuff.c | 30 +++ net/sched/Kconfig | 11 + net/sched/Makefile | 1 + net/sched/act_mpls.c | 406 ++++++++++++++++++++++++++++++++++++ 8 files changed, 514 insertions(+), 1 deletion(-) create mode 100644 include/net/tc_act/tc_mpls.h create mode 100644 include/uapi/linux/tc_act/tc_mpls.h create mode 100644 net/sched/act_mpls.c (limited to 'include/uapi/linux') diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h index 9f7e01f2be83..9d7a2c28ea35 100644 --- a/include/linux/skbuff.h +++ b/include/linux/skbuff.h @@ -3450,6 +3450,7 @@ int skb_vlan_push(struct sk_buff *skb, __be16 vlan_proto, u16 vlan_tci); int skb_mpls_push(struct sk_buff *skb, __be32 mpls_lse, __be16 mpls_proto); int skb_mpls_pop(struct sk_buff *skb, __be16 next_proto); int skb_mpls_update_lse(struct sk_buff *skb, __be32 mpls_lse); +int skb_mpls_dec_ttl(struct sk_buff *skb); struct sk_buff *pskb_extract(struct sk_buff *skb, int off, int to_copy, gfp_t gfp); diff --git a/include/net/tc_act/tc_mpls.h b/include/net/tc_act/tc_mpls.h new file mode 100644 index 000000000000..4bc3d9250ef0 --- /dev/null +++ b/include/net/tc_act/tc_mpls.h @@ -0,0 +1,30 @@ +/* SPDX-License-Identifier: (GPL-2.0-only OR BSD-2-Clause) */ +/* Copyright (C) 2019 Netronome Systems, Inc. 
*/ + +#ifndef __NET_TC_MPLS_H +#define __NET_TC_MPLS_H + +#include +#include + +struct tcf_mpls_params { + int tcfm_action; + u32 tcfm_label; + u8 tcfm_tc; + u8 tcfm_ttl; + u8 tcfm_bos; + __be16 tcfm_proto; + struct rcu_head rcu; +}; + +#define ACT_MPLS_TC_NOT_SET 0xff +#define ACT_MPLS_BOS_NOT_SET 0xff +#define ACT_MPLS_LABEL_NOT_SET 0xffffffff + +struct tcf_mpls { + struct tc_action common; + struct tcf_mpls_params __rcu *mpls_p; +}; +#define to_mpls(a) ((struct tcf_mpls *)a) + +#endif /* __NET_TC_MPLS_H */ diff --git a/include/uapi/linux/pkt_cls.h b/include/uapi/linux/pkt_cls.h index 8cc6b6777b3c..e22ef4a940bc 100644 --- a/include/uapi/linux/pkt_cls.h +++ b/include/uapi/linux/pkt_cls.h @@ -104,8 +104,9 @@ enum tca_id { TCA_ID_SIMP = TCA_ACT_SIMP, TCA_ID_IFE = TCA_ACT_IFE, TCA_ID_SAMPLE = TCA_ACT_SAMPLE, - /* other actions go here */ TCA_ID_CTINFO, + TCA_ID_MPLS, + /* other actions go here */ __TCA_ID_MAX = 255 }; diff --git a/include/uapi/linux/tc_act/tc_mpls.h b/include/uapi/linux/tc_act/tc_mpls.h new file mode 100644 index 000000000000..9360e95273c7 --- /dev/null +++ b/include/uapi/linux/tc_act/tc_mpls.h @@ -0,0 +1,33 @@ +/* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note */ +/* Copyright (C) 2019 Netronome Systems, Inc. */ + +#ifndef __LINUX_TC_MPLS_H +#define __LINUX_TC_MPLS_H + +#include + +#define TCA_MPLS_ACT_POP 1 +#define TCA_MPLS_ACT_PUSH 2 +#define TCA_MPLS_ACT_MODIFY 3 +#define TCA_MPLS_ACT_DEC_TTL 4 + +struct tc_mpls { + tc_gen; /* generic TC action fields. */ + int m_action; /* action of type TCA_MPLS_ACT_*. */ +}; + +enum { + TCA_MPLS_UNSPEC, + TCA_MPLS_TM, /* struct tcf_t; time values associated with action. */ + TCA_MPLS_PARMS, /* struct tc_mpls; action type and general TC fields. */ + TCA_MPLS_PAD, + TCA_MPLS_PROTO, /* be16; eth_type of pushed or next (for pop) header. */ + TCA_MPLS_LABEL, /* u32; MPLS label. Lower 20 bits are used. */ + TCA_MPLS_TC, /* u8; MPLS TC field. Lower 3 bits are used. */ + TCA_MPLS_TTL, /* u8; MPLS TTL field. 
Must not be 0. */ + TCA_MPLS_BOS, /* u8; MPLS BOS field. Either 1 or 0. */ + __TCA_MPLS_MAX, +}; +#define TCA_MPLS_MAX (__TCA_MPLS_MAX - 1) + +#endif diff --git a/net/core/skbuff.c b/net/core/skbuff.c index 93443a01ab39..6f1e31f674a3 100644 --- a/net/core/skbuff.c +++ b/net/core/skbuff.c @@ -59,6 +59,7 @@ #include #include #include +#include #include #include @@ -5564,6 +5565,35 @@ int skb_mpls_update_lse(struct sk_buff *skb, __be32 mpls_lse) } EXPORT_SYMBOL_GPL(skb_mpls_update_lse); +/** + * skb_mpls_dec_ttl() - decrement the TTL of the outermost MPLS header + * + * @skb: buffer + * + * Expects skb->data at mac header. + * + * Returns 0 on success, -errno otherwise. + */ +int skb_mpls_dec_ttl(struct sk_buff *skb) +{ + u32 lse; + u8 ttl; + + if (unlikely(!eth_p_mpls(skb->protocol))) + return -EINVAL; + + lse = be32_to_cpu(mpls_hdr(skb)->label_stack_entry); + ttl = (lse & MPLS_LS_TTL_MASK) >> MPLS_LS_TTL_SHIFT; + if (!--ttl) + return -EINVAL; + + lse &= ~MPLS_LS_TTL_MASK; + lse |= ttl << MPLS_LS_TTL_SHIFT; + + return skb_mpls_update_lse(skb, cpu_to_be32(lse)); +} +EXPORT_SYMBOL_GPL(skb_mpls_dec_ttl); + /** * alloc_skb_with_frags - allocate skb with page frags * diff --git a/net/sched/Kconfig b/net/sched/Kconfig index 360fdd3eaa77..731f5fbc2a3c 100644 --- a/net/sched/Kconfig +++ b/net/sched/Kconfig @@ -842,6 +842,17 @@ config NET_ACT_CSUM To compile this code as a module, choose M here: the module will be called act_csum. +config NET_ACT_MPLS + tristate "MPLS manipulation" + depends on NET_CLS_ACT + help + Say Y here to push or pop MPLS headers. + + If unsure, say N. + + To compile this code as a module, choose M here: the + module will be called act_mpls. 
+ config NET_ACT_VLAN tristate "Vlan manipulation" depends on NET_CLS_ACT diff --git a/net/sched/Makefile b/net/sched/Makefile index d54bfcbd7981..c26603606c22 100644 --- a/net/sched/Makefile +++ b/net/sched/Makefile @@ -18,6 +18,7 @@ obj-$(CONFIG_NET_ACT_PEDIT) += act_pedit.o obj-$(CONFIG_NET_ACT_SIMP) += act_simple.o obj-$(CONFIG_NET_ACT_SKBEDIT) += act_skbedit.o obj-$(CONFIG_NET_ACT_CSUM) += act_csum.o +obj-$(CONFIG_NET_ACT_MPLS) += act_mpls.o obj-$(CONFIG_NET_ACT_VLAN) += act_vlan.o obj-$(CONFIG_NET_ACT_BPF) += act_bpf.o obj-$(CONFIG_NET_ACT_CONNMARK) += act_connmark.o diff --git a/net/sched/act_mpls.c b/net/sched/act_mpls.c new file mode 100644 index 000000000000..ca2597ce4ac9 --- /dev/null +++ b/net/sched/act_mpls.c @@ -0,0 +1,406 @@ +// SPDX-License-Identifier: (GPL-2.0-only OR BSD-2-Clause) +/* Copyright (C) 2019 Netronome Systems, Inc. */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +static unsigned int mpls_net_id; +static struct tc_action_ops act_mpls_ops; + +#define ACT_MPLS_TTL_DEFAULT 255 + +static __be32 tcf_mpls_get_lse(struct mpls_shim_hdr *lse, + struct tcf_mpls_params *p, bool set_bos) +{ + u32 new_lse = 0; + + if (lse) + new_lse = be32_to_cpu(lse->label_stack_entry); + + if (p->tcfm_label != ACT_MPLS_LABEL_NOT_SET) { + new_lse &= ~MPLS_LS_LABEL_MASK; + new_lse |= p->tcfm_label << MPLS_LS_LABEL_SHIFT; + } + if (p->tcfm_ttl) { + new_lse &= ~MPLS_LS_TTL_MASK; + new_lse |= p->tcfm_ttl << MPLS_LS_TTL_SHIFT; + } + if (p->tcfm_tc != ACT_MPLS_TC_NOT_SET) { + new_lse &= ~MPLS_LS_TC_MASK; + new_lse |= p->tcfm_tc << MPLS_LS_TC_SHIFT; + } + if (p->tcfm_bos != ACT_MPLS_BOS_NOT_SET) { + new_lse &= ~MPLS_LS_S_MASK; + new_lse |= p->tcfm_bos << MPLS_LS_S_SHIFT; + } else if (set_bos) { + new_lse |= 1 << MPLS_LS_S_SHIFT; + } + + return cpu_to_be32(new_lse); +} + +static int tcf_mpls_act(struct sk_buff *skb, const struct tc_action *a, + struct tcf_result *res) +{ + struct tcf_mpls *m = 
to_mpls(a); + struct tcf_mpls_params *p; + __be32 new_lse; + int ret; + + tcf_lastuse_update(&m->tcf_tm); + bstats_cpu_update(this_cpu_ptr(m->common.cpu_bstats), skb); + + /* Ensure 'data' points at mac_header prior calling mpls manipulating + * functions. + */ + if (skb_at_tc_ingress(skb)) + skb_push_rcsum(skb, skb->mac_len); + + ret = READ_ONCE(m->tcf_action); + + p = rcu_dereference_bh(m->mpls_p); + + switch (p->tcfm_action) { + case TCA_MPLS_ACT_POP: + if (skb_mpls_pop(skb, p->tcfm_proto)) + goto drop; + break; + case TCA_MPLS_ACT_PUSH: + new_lse = tcf_mpls_get_lse(NULL, p, !eth_p_mpls(skb->protocol)); + if (skb_mpls_push(skb, new_lse, p->tcfm_proto)) + goto drop; + break; + case TCA_MPLS_ACT_MODIFY: + new_lse = tcf_mpls_get_lse(mpls_hdr(skb), p, false); + if (skb_mpls_update_lse(skb, new_lse)) + goto drop; + break; + case TCA_MPLS_ACT_DEC_TTL: + if (skb_mpls_dec_ttl(skb)) + goto drop; + break; + } + + if (skb_at_tc_ingress(skb)) + skb_pull_rcsum(skb, skb->mac_len); + + return ret; + +drop: + qstats_drop_inc(this_cpu_ptr(m->common.cpu_qstats)); + return TC_ACT_SHOT; +} + +static int valid_label(const struct nlattr *attr, + struct netlink_ext_ack *extack) +{ + const u32 *label = nla_data(attr); + + if (*label & ~MPLS_LABEL_MASK || *label == MPLS_LABEL_IMPLNULL) { + NL_SET_ERR_MSG_MOD(extack, "MPLS label out of range"); + return -EINVAL; + } + + return 0; +} + +static const struct nla_policy mpls_policy[TCA_MPLS_MAX + 1] = { + [TCA_MPLS_UNSPEC] = { .strict_start_type = TCA_MPLS_UNSPEC + 1 }, + [TCA_MPLS_PARMS] = NLA_POLICY_EXACT_LEN(sizeof(struct tc_mpls)), + [TCA_MPLS_PROTO] = { .type = NLA_U16 }, + [TCA_MPLS_LABEL] = NLA_POLICY_VALIDATE_FN(NLA_U32, valid_label), + [TCA_MPLS_TC] = NLA_POLICY_RANGE(NLA_U8, 0, 7), + [TCA_MPLS_TTL] = NLA_POLICY_MIN(NLA_U8, 1), + [TCA_MPLS_BOS] = NLA_POLICY_RANGE(NLA_U8, 0, 1), +}; + +static int tcf_mpls_init(struct net *net, struct nlattr *nla, + struct nlattr *est, struct tc_action **a, + int ovr, int bind, bool rtnl_held, + 
struct tcf_proto *tp, struct netlink_ext_ack *extack) +{ + struct tc_action_net *tn = net_generic(net, mpls_net_id); + struct nlattr *tb[TCA_MPLS_MAX + 1]; + struct tcf_chain *goto_ch = NULL; + struct tcf_mpls_params *p; + struct tc_mpls *parm; + bool exists = false; + struct tcf_mpls *m; + int ret = 0, err; + u8 mpls_ttl = 0; + + if (!nla) { + NL_SET_ERR_MSG_MOD(extack, "Missing netlink attributes"); + return -EINVAL; + } + + err = nla_parse_nested(tb, TCA_MPLS_MAX, nla, mpls_policy, extack); + if (err < 0) + return err; + + if (!tb[TCA_MPLS_PARMS]) { + NL_SET_ERR_MSG_MOD(extack, "No MPLS params"); + return -EINVAL; + } + parm = nla_data(tb[TCA_MPLS_PARMS]); + + /* Verify parameters against action type. */ + switch (parm->m_action) { + case TCA_MPLS_ACT_POP: + if (!tb[TCA_MPLS_PROTO]) { + NL_SET_ERR_MSG_MOD(extack, "Protocol must be set for MPLS pop"); + return -EINVAL; + } + if (!eth_proto_is_802_3(nla_get_be16(tb[TCA_MPLS_PROTO]))) { + NL_SET_ERR_MSG_MOD(extack, "Invalid protocol type for MPLS pop"); + return -EINVAL; + } + if (tb[TCA_MPLS_LABEL] || tb[TCA_MPLS_TTL] || tb[TCA_MPLS_TC] || + tb[TCA_MPLS_BOS]) { + NL_SET_ERR_MSG_MOD(extack, "Label, TTL, TC or BOS cannot be used with MPLS pop"); + return -EINVAL; + } + break; + case TCA_MPLS_ACT_DEC_TTL: + if (tb[TCA_MPLS_PROTO] || tb[TCA_MPLS_LABEL] || + tb[TCA_MPLS_TTL] || tb[TCA_MPLS_TC] || tb[TCA_MPLS_BOS]) { + NL_SET_ERR_MSG_MOD(extack, "Label, TTL, TC, BOS or protocol cannot be used with MPLS dec_ttl"); + return -EINVAL; + } + break; + case TCA_MPLS_ACT_PUSH: + if (!tb[TCA_MPLS_LABEL]) { + NL_SET_ERR_MSG_MOD(extack, "Label is required for MPLS push"); + return -EINVAL; + } + if (tb[TCA_MPLS_PROTO] && + !eth_p_mpls(nla_get_be16(tb[TCA_MPLS_PROTO]))) { + NL_SET_ERR_MSG_MOD(extack, "Protocol must be an MPLS type for MPLS push"); + return -EPROTONOSUPPORT; + } + /* Push needs a TTL - if not specified, set a default value. 
*/ + if (!tb[TCA_MPLS_TTL]) { +#if IS_ENABLED(CONFIG_MPLS) + mpls_ttl = net->mpls.default_ttl ? + net->mpls.default_ttl : ACT_MPLS_TTL_DEFAULT; +#else + mpls_ttl = ACT_MPLS_TTL_DEFAULT; +#endif + } + break; + case TCA_MPLS_ACT_MODIFY: + if (tb[TCA_MPLS_PROTO]) { + NL_SET_ERR_MSG_MOD(extack, "Protocol cannot be used with MPLS modify"); + return -EINVAL; + } + break; + default: + NL_SET_ERR_MSG_MOD(extack, "Unknown MPLS action"); + return -EINVAL; + } + + err = tcf_idr_check_alloc(tn, &parm->index, a, bind); + if (err < 0) + return err; + exists = err; + if (exists && bind) + return 0; + + if (!exists) { + ret = tcf_idr_create(tn, parm->index, est, a, + &act_mpls_ops, bind, true); + if (ret) { + tcf_idr_cleanup(tn, parm->index); + return ret; + } + + ret = ACT_P_CREATED; + } else if (!ovr) { + tcf_idr_release(*a, bind); + return -EEXIST; + } + + err = tcf_action_check_ctrlact(parm->action, tp, &goto_ch, extack); + if (err < 0) + goto release_idr; + + m = to_mpls(*a); + + p = kzalloc(sizeof(*p), GFP_KERNEL); + if (!p) { + err = -ENOMEM; + goto put_chain; + } + + p->tcfm_action = parm->m_action; + p->tcfm_label = tb[TCA_MPLS_LABEL] ? nla_get_u32(tb[TCA_MPLS_LABEL]) : + ACT_MPLS_LABEL_NOT_SET; + p->tcfm_tc = tb[TCA_MPLS_TC] ? nla_get_u8(tb[TCA_MPLS_TC]) : + ACT_MPLS_TC_NOT_SET; + p->tcfm_ttl = tb[TCA_MPLS_TTL] ? nla_get_u8(tb[TCA_MPLS_TTL]) : + mpls_ttl; + p->tcfm_bos = tb[TCA_MPLS_BOS] ? nla_get_u8(tb[TCA_MPLS_BOS]) : + ACT_MPLS_BOS_NOT_SET; + p->tcfm_proto = tb[TCA_MPLS_PROTO] ? 
nla_get_be16(tb[TCA_MPLS_PROTO]) : + htons(ETH_P_MPLS_UC); + + spin_lock_bh(&m->tcf_lock); + goto_ch = tcf_action_set_ctrlact(*a, parm->action, goto_ch); + rcu_swap_protected(m->mpls_p, p, lockdep_is_held(&m->tcf_lock)); + spin_unlock_bh(&m->tcf_lock); + + if (goto_ch) + tcf_chain_put_by_act(goto_ch); + if (p) + kfree_rcu(p, rcu); + + if (ret == ACT_P_CREATED) + tcf_idr_insert(tn, *a); + return ret; +put_chain: + if (goto_ch) + tcf_chain_put_by_act(goto_ch); +release_idr: + tcf_idr_release(*a, bind); + return err; +} + +static void tcf_mpls_cleanup(struct tc_action *a) +{ + struct tcf_mpls *m = to_mpls(a); + struct tcf_mpls_params *p; + + p = rcu_dereference_protected(m->mpls_p, 1); + if (p) + kfree_rcu(p, rcu); +} + +static int tcf_mpls_dump(struct sk_buff *skb, struct tc_action *a, + int bind, int ref) +{ + unsigned char *b = skb_tail_pointer(skb); + struct tcf_mpls *m = to_mpls(a); + struct tcf_mpls_params *p; + struct tc_mpls opt = { + .index = m->tcf_index, + .refcnt = refcount_read(&m->tcf_refcnt) - ref, + .bindcnt = atomic_read(&m->tcf_bindcnt) - bind, + }; + struct tcf_t t; + + spin_lock_bh(&m->tcf_lock); + opt.action = m->tcf_action; + p = rcu_dereference_protected(m->mpls_p, lockdep_is_held(&m->tcf_lock)); + opt.m_action = p->tcfm_action; + + if (nla_put(skb, TCA_MPLS_PARMS, sizeof(opt), &opt)) + goto nla_put_failure; + + if (p->tcfm_label != ACT_MPLS_LABEL_NOT_SET && + nla_put_u32(skb, TCA_MPLS_LABEL, p->tcfm_label)) + goto nla_put_failure; + + if (p->tcfm_tc != ACT_MPLS_TC_NOT_SET && + nla_put_u8(skb, TCA_MPLS_TC, p->tcfm_tc)) + goto nla_put_failure; + + if (p->tcfm_ttl && nla_put_u8(skb, TCA_MPLS_TTL, p->tcfm_ttl)) + goto nla_put_failure; + + if (p->tcfm_bos != ACT_MPLS_BOS_NOT_SET && + nla_put_u8(skb, TCA_MPLS_BOS, p->tcfm_bos)) + goto nla_put_failure; + + if (nla_put_be16(skb, TCA_MPLS_PROTO, p->tcfm_proto)) + goto nla_put_failure; + + tcf_tm_dump(&t, &m->tcf_tm); + + if (nla_put_64bit(skb, TCA_MPLS_TM, sizeof(t), &t, TCA_MPLS_PAD)) + goto 
nla_put_failure; + + spin_unlock_bh(&m->tcf_lock); + + return skb->len; + +nla_put_failure: + spin_unlock_bh(&m->tcf_lock); + nlmsg_trim(skb, b); + return -EMSGSIZE; +} + +static int tcf_mpls_walker(struct net *net, struct sk_buff *skb, + struct netlink_callback *cb, int type, + const struct tc_action_ops *ops, + struct netlink_ext_ack *extack) +{ + struct tc_action_net *tn = net_generic(net, mpls_net_id); + + return tcf_generic_walker(tn, skb, cb, type, ops, extack); +} + +static int tcf_mpls_search(struct net *net, struct tc_action **a, u32 index) +{ + struct tc_action_net *tn = net_generic(net, mpls_net_id); + + return tcf_idr_search(tn, a, index); +} + +static struct tc_action_ops act_mpls_ops = { + .kind = "mpls", + .id = TCA_ID_MPLS, + .owner = THIS_MODULE, + .act = tcf_mpls_act, + .dump = tcf_mpls_dump, + .init = tcf_mpls_init, + .cleanup = tcf_mpls_cleanup, + .walk = tcf_mpls_walker, + .lookup = tcf_mpls_search, + .size = sizeof(struct tcf_mpls), +}; + +static __net_init int mpls_init_net(struct net *net) +{ + struct tc_action_net *tn = net_generic(net, mpls_net_id); + + return tc_action_net_init(tn, &act_mpls_ops); +} + +static void __net_exit mpls_exit_net(struct list_head *net_list) +{ + tc_action_net_exit(net_list, mpls_net_id); +} + +static struct pernet_operations mpls_net_ops = { + .init = mpls_init_net, + .exit_batch = mpls_exit_net, + .id = &mpls_net_id, + .size = sizeof(struct tc_action_net), +}; + +static int __init mpls_init_module(void) +{ + return tcf_register_action(&act_mpls_ops, &mpls_net_ops); +} + +static void __exit mpls_cleanup_module(void) +{ + tcf_unregister_action(&act_mpls_ops, &mpls_net_ops); +} + +module_init(mpls_init_module); +module_exit(mpls_cleanup_module); + +MODULE_AUTHOR("Netronome Systems "); +MODULE_LICENSE("GPL"); +MODULE_DESCRIPTION("MPLS manipulation actions"); -- cgit v1.2.3-71-gd317 From 98fd2d6563fe4a799934a2a74d632601cd089beb Mon Sep 17 00:00:00 2001 From: Parav Pandit Date: Mon, 8 Jul 2019 23:17:37 -0500 Subject: 
devlink: Introduce PCI PF port flavour and port attribute In an eswitch, PCI PF may have port which is normally represented using a representor netdevice. To have better visibility of eswitch port, its association with PF and a representor netdevice, introduce a PCI PF port flavour and port attribute. When devlink port flavour is PCI PF, fill up PCI PF attributes of the port. Extend port name creation using PCI PF number on best effort basis. So that vendor drivers can skip defining their own scheme. $ devlink port show pci/0000:05:00.0/0: type eth netdev eth0 flavour pcipf pfnum 0 Acked-by: Jiri Pirko Signed-off-by: Parav Pandit Signed-off-by: David S. Miller --- include/net/devlink.h | 8 ++++++++ include/uapi/linux/devlink.h | 5 +++++ net/core/devlink.c | 34 ++++++++++++++++++++++++++++++++++ 3 files changed, 47 insertions(+) (limited to 'include/uapi/linux') diff --git a/include/net/devlink.h b/include/net/devlink.h index 4538c80fe293..97cef896e4d0 100644 --- a/include/net/devlink.h +++ b/include/net/devlink.h @@ -46,6 +46,10 @@ struct devlink_port_phys_attrs { u32 split_subport_number; }; +struct devlink_port_pci_pf_attrs { + u16 pf; /* Associated PCI PF for this port. 
*/ +}; + struct devlink_port_attrs { u8 set:1, split:1, @@ -54,6 +58,7 @@ struct devlink_port_attrs { struct netdev_phys_item_id switch_id; union { struct devlink_port_phys_attrs phys; + struct devlink_port_pci_pf_attrs pci_pf; }; }; @@ -599,6 +604,9 @@ void devlink_port_attrs_set(struct devlink_port *devlink_port, u32 split_subport_number, const unsigned char *switch_id, unsigned char switch_id_len); +void devlink_port_attrs_pci_pf_set(struct devlink_port *devlink_port, + const unsigned char *switch_id, + unsigned char switch_id_len, u16 pf); int devlink_sb_register(struct devlink *devlink, unsigned int sb_index, u32 size, u16 ingress_pools_count, u16 egress_pools_count, u16 ingress_tc_count, diff --git a/include/uapi/linux/devlink.h b/include/uapi/linux/devlink.h index 5287b42c181f..f7323884c3fe 100644 --- a/include/uapi/linux/devlink.h +++ b/include/uapi/linux/devlink.h @@ -169,6 +169,10 @@ enum devlink_port_flavour { DEVLINK_PORT_FLAVOUR_DSA, /* Distributed switch architecture * interconnect port. */ + DEVLINK_PORT_FLAVOUR_PCI_PF, /* Represents eswitch port for + * the PCI PF. It is an internal + * port that faces the PCI PF. 
+ */ }; enum devlink_param_cmode { @@ -337,6 +341,7 @@ enum devlink_attr { DEVLINK_ATTR_FLASH_UPDATE_STATUS_DONE, /* u64 */ DEVLINK_ATTR_FLASH_UPDATE_STATUS_TOTAL, /* u64 */ + DEVLINK_ATTR_PORT_PCI_PF_NUMBER, /* u16 */ /* add new attributes above here, update the policy in devlink.c */ __DEVLINK_ATTR_MAX, diff --git a/net/core/devlink.c b/net/core/devlink.c index a9c4e5d8a99c..d362652a5cc7 100644 --- a/net/core/devlink.c +++ b/net/core/devlink.c @@ -515,6 +515,11 @@ static int devlink_nl_port_attrs_put(struct sk_buff *msg, return 0; if (nla_put_u16(msg, DEVLINK_ATTR_PORT_FLAVOUR, attrs->flavour)) return -EMSGSIZE; + if (devlink_port->attrs.flavour == DEVLINK_PORT_FLAVOUR_PCI_PF) { + if (nla_put_u16(msg, DEVLINK_ATTR_PORT_PCI_PF_NUMBER, + attrs->pci_pf.pf)) + return -EMSGSIZE; + } if (devlink_port->attrs.flavour != DEVLINK_PORT_FLAVOUR_PHYSICAL && devlink_port->attrs.flavour != DEVLINK_PORT_FLAVOUR_CPU && devlink_port->attrs.flavour != DEVLINK_PORT_FLAVOUR_DSA) @@ -5801,6 +5806,32 @@ void devlink_port_attrs_set(struct devlink_port *devlink_port, } EXPORT_SYMBOL_GPL(devlink_port_attrs_set); +/** + * devlink_port_attrs_pci_pf_set - Set PCI PF port attributes + * + * @devlink_port: devlink port + * @pf: associated PF for the devlink port instance + * @switch_id: if the port is part of switch, this is buffer with ID, + * otherwise this is NULL + * @switch_id_len: length of the switch_id buffer + */ +void devlink_port_attrs_pci_pf_set(struct devlink_port *devlink_port, + const unsigned char *switch_id, + unsigned char switch_id_len, u16 pf) +{ + struct devlink_port_attrs *attrs = &devlink_port->attrs; + int ret; + + ret = __devlink_port_attrs_set(devlink_port, + DEVLINK_PORT_FLAVOUR_PCI_PF, + switch_id, switch_id_len); + if (ret) + return; + + attrs->pci_pf.pf = pf; +} +EXPORT_SYMBOL_GPL(devlink_port_attrs_pci_pf_set); + static int __devlink_port_phys_port_name_get(struct devlink_port *devlink_port, char *name, size_t len) { @@ -5826,6 +5857,9 @@ static int 
__devlink_port_phys_port_name_get(struct devlink_port *devlink_port, */ WARN_ON(1); return -EINVAL; + case DEVLINK_PORT_FLAVOUR_PCI_PF: + n = snprintf(name, len, "pf%u", attrs->pci_pf.pf); + break; } if (n >= len) -- cgit v1.2.3-71-gd317 From e41b6bf3cdd474dc9c587cb55906b0256835bf6d Mon Sep 17 00:00:00 2001 From: Parav Pandit Date: Mon, 8 Jul 2019 23:17:38 -0500 Subject: devlink: Introduce PCI VF port flavour and port attribute In an eswitch, PCI VF may have port which is normally represented using a representor netdevice. To have better visibility of eswitch port, its association with VF, and its representor netdevice, introduce a PCI VF port flavour. When devlink port flavour is PCI VF, fill up PCI VF attributes of the port. Extend port name creation using PCI PF and VF number scheme on best effort basis, so that vendor drivers can skip defining their own scheme. $ devlink port show pci/0000:05:00.0/0: type eth netdev eth0 flavour pcipf pfnum 0 pci/0000:05:00.0/1: type eth netdev eth1 flavour pcivf pfnum 0 vfnum 0 pci/0000:05:00.0/2: type eth netdev eth2 flavour pcivf pfnum 0 vfnum 1 Acked-by: Jiri Pirko Signed-off-by: Parav Pandit Signed-off-by: David S. Miller --- include/net/devlink.h | 10 ++++++++++ include/uapi/linux/devlink.h | 6 ++++++ net/core/devlink.c | 38 ++++++++++++++++++++++++++++++++++++++ 3 files changed, 54 insertions(+) (limited to 'include/uapi/linux') diff --git a/include/net/devlink.h b/include/net/devlink.h index 97cef896e4d0..bc36f942a7d5 100644 --- a/include/net/devlink.h +++ b/include/net/devlink.h @@ -50,6 +50,11 @@ struct devlink_port_pci_pf_attrs { u16 pf; /* Associated PCI PF for this port. */ }; +struct devlink_port_pci_vf_attrs { + u16 pf; /* Associated PCI PF for this port. */ + u16 vf; /* Associated PCI VF of the PCI PF for this port. 
*/ +}; + struct devlink_port_attrs { u8 set:1, split:1, @@ -59,6 +64,7 @@ struct devlink_port_attrs { union { struct devlink_port_phys_attrs phys; struct devlink_port_pci_pf_attrs pci_pf; + struct devlink_port_pci_vf_attrs pci_vf; }; }; @@ -607,6 +613,10 @@ void devlink_port_attrs_set(struct devlink_port *devlink_port, void devlink_port_attrs_pci_pf_set(struct devlink_port *devlink_port, const unsigned char *switch_id, unsigned char switch_id_len, u16 pf); +void devlink_port_attrs_pci_vf_set(struct devlink_port *devlink_port, + const unsigned char *switch_id, + unsigned char switch_id_len, + u16 pf, u16 vf); int devlink_sb_register(struct devlink *devlink, unsigned int sb_index, u32 size, u16 ingress_pools_count, u16 egress_pools_count, u16 ingress_tc_count, diff --git a/include/uapi/linux/devlink.h b/include/uapi/linux/devlink.h index f7323884c3fe..ffc993256527 100644 --- a/include/uapi/linux/devlink.h +++ b/include/uapi/linux/devlink.h @@ -173,6 +173,10 @@ enum devlink_port_flavour { * the PCI PF. It is an internal * port that faces the PCI PF. */ + DEVLINK_PORT_FLAVOUR_PCI_VF, /* Represents eswitch port + * for the PCI VF. It is an internal + * port that faces the PCI VF. 
+ */ }; enum devlink_param_cmode { @@ -342,6 +346,8 @@ enum devlink_attr { DEVLINK_ATTR_FLASH_UPDATE_STATUS_TOTAL, /* u64 */ DEVLINK_ATTR_PORT_PCI_PF_NUMBER, /* u16 */ + DEVLINK_ATTR_PORT_PCI_VF_NUMBER, /* u16 */ + /* add new attributes above here, update the policy in devlink.c */ __DEVLINK_ATTR_MAX, diff --git a/net/core/devlink.c b/net/core/devlink.c index d362652a5cc7..4f40aeace902 100644 --- a/net/core/devlink.c +++ b/net/core/devlink.c @@ -519,6 +519,12 @@ static int devlink_nl_port_attrs_put(struct sk_buff *msg, if (nla_put_u16(msg, DEVLINK_ATTR_PORT_PCI_PF_NUMBER, attrs->pci_pf.pf)) return -EMSGSIZE; + } else if (devlink_port->attrs.flavour == DEVLINK_PORT_FLAVOUR_PCI_VF) { + if (nla_put_u16(msg, DEVLINK_ATTR_PORT_PCI_PF_NUMBER, + attrs->pci_vf.pf) || + nla_put_u16(msg, DEVLINK_ATTR_PORT_PCI_VF_NUMBER, + attrs->pci_vf.vf)) + return -EMSGSIZE; } if (devlink_port->attrs.flavour != DEVLINK_PORT_FLAVOUR_PHYSICAL && devlink_port->attrs.flavour != DEVLINK_PORT_FLAVOUR_CPU && @@ -5832,6 +5838,34 @@ void devlink_port_attrs_pci_pf_set(struct devlink_port *devlink_port, } EXPORT_SYMBOL_GPL(devlink_port_attrs_pci_pf_set); +/** + * devlink_port_attrs_pci_vf_set - Set PCI VF port attributes + * + * @devlink_port: devlink port + * @pf: associated PF for the devlink port instance + * @vf: associated VF of a PF for the devlink port instance + * @switch_id: if the port is part of switch, this is buffer with ID, + * otherwise this is NULL + * @switch_id_len: length of the switch_id buffer + */ +void devlink_port_attrs_pci_vf_set(struct devlink_port *devlink_port, + const unsigned char *switch_id, + unsigned char switch_id_len, + u16 pf, u16 vf) +{ + struct devlink_port_attrs *attrs = &devlink_port->attrs; + int ret; + + ret = __devlink_port_attrs_set(devlink_port, + DEVLINK_PORT_FLAVOUR_PCI_VF, + switch_id, switch_id_len); + if (ret) + return; + attrs->pci_vf.pf = pf; + attrs->pci_vf.vf = vf; +} +EXPORT_SYMBOL_GPL(devlink_port_attrs_pci_vf_set); + static int 
__devlink_port_phys_port_name_get(struct devlink_port *devlink_port, char *name, size_t len) { @@ -5860,6 +5894,10 @@ static int __devlink_port_phys_port_name_get(struct devlink_port *devlink_port, case DEVLINK_PORT_FLAVOUR_PCI_PF: n = snprintf(name, len, "pf%u", attrs->pci_pf.pf); break; + case DEVLINK_PORT_FLAVOUR_PCI_VF: + n = snprintf(name, len, "pf%uvf%u", + attrs->pci_vf.pf, attrs->pci_vf.vf); + break; } if (n >= len) -- cgit v1.2.3-71-gd317 From b57dc7c13ea90e09ae15f821d2583fa0231b4935 Mon Sep 17 00:00:00 2001 From: Paul Blakey Date: Tue, 9 Jul 2019 10:30:48 +0300 Subject: net/sched: Introduce action ct Allow sending a packet to conntrack module for connection tracking. The packet will be marked with conntrack connection's state, and any metadata such as conntrack mark and label. This state metadata can later be matched against with tc classifiers, for example with the flower classifier as below. In addition to committing new connections the user can optionally specify a zone to track within, set a mark/label and configure nat with an address range and port range. 
Usage is as follows: $ tc qdisc add dev ens1f0_0 ingress $ tc qdisc add dev ens1f0_1 ingress $ tc filter add dev ens1f0_0 ingress \ prio 1 chain 0 proto ip \ flower ip_proto tcp ct_state -trk \ action ct zone 2 pipe \ action goto chain 2 $ tc filter add dev ens1f0_0 ingress \ prio 1 chain 2 proto ip \ flower ct_state +trk+new \ action ct zone 2 commit mark 0xbb nat src addr 5.5.5.7 pipe \ action mirred egress redirect dev ens1f0_1 $ tc filter add dev ens1f0_0 ingress \ prio 1 chain 2 proto ip \ flower ct_zone 2 ct_mark 0xbb ct_state +trk+est \ action ct nat pipe \ action mirred egress redirect dev ens1f0_1 $ tc filter add dev ens1f0_1 ingress \ prio 1 chain 0 proto ip \ flower ip_proto tcp ct_state -trk \ action ct zone 2 pipe \ action goto chain 1 $ tc filter add dev ens1f0_1 ingress \ prio 1 chain 1 proto ip \ flower ct_zone 2 ct_mark 0xbb ct_state +trk+est \ action ct nat pipe \ action mirred egress redirect dev ens1f0_0 Signed-off-by: Paul Blakey Signed-off-by: Marcelo Ricardo Leitner Signed-off-by: Yossi Kuperman Acked-by: Jiri Pirko Changelog: V5->V6: Added CONFIG_NF_DEFRAG_IPV6 in handle fragments ipv6 case V4->V5: Reordered nf_conntrack_put() in tcf_ct_skb_nfct_cached() V3->V4: Added strict_start_type for act_ct policy V2->V3: Fixed David's comments: Removed extra newline after rcu in tcf_ct_params, and indent of break in act_ct.c V1->V2: Fixed parsing of ranges TCA_CT_NAT_IPV6_MAX as 'else' case overwritten ipv4 max Refactored NAT_PORT_MIN_MAX range handling as well Added ipv4/ipv6 defragmentation Removed extra skb pull push of nw offset in execute nat Refactored tcf_ct_skb_network_trim after pull Removed TCA_ACT_CT define Signed-off-by: David S. 
Miller --- include/net/flow_offload.h | 5 + include/net/tc_act/tc_ct.h | 63 +++ include/uapi/linux/pkt_cls.h | 1 + include/uapi/linux/tc_act/tc_ct.h | 41 ++ net/sched/Kconfig | 11 + net/sched/Makefile | 1 + net/sched/act_ct.c | 984 ++++++++++++++++++++++++++++++++++++++ net/sched/cls_api.c | 5 + 8 files changed, 1111 insertions(+) create mode 100644 include/net/tc_act/tc_ct.h create mode 100644 include/uapi/linux/tc_act/tc_ct.h create mode 100644 net/sched/act_ct.c (limited to 'include/uapi/linux') diff --git a/include/net/flow_offload.h b/include/net/flow_offload.h index 36127c1858a4..a09e256d2b27 100644 --- a/include/net/flow_offload.h +++ b/include/net/flow_offload.h @@ -129,6 +129,7 @@ enum flow_action_id { FLOW_ACTION_QUEUE, FLOW_ACTION_SAMPLE, FLOW_ACTION_POLICE, + FLOW_ACTION_CT, }; /* This is mirroring enum pedit_header_type definition for easy mapping between @@ -178,6 +179,10 @@ struct flow_action_entry { s64 burst; u64 rate_bytes_ps; } police; + struct { /* FLOW_ACTION_CT */ + int action; + u16 zone; + } ct; }; }; diff --git a/include/net/tc_act/tc_ct.h b/include/net/tc_act/tc_ct.h new file mode 100644 index 000000000000..bdc20ab3b88d --- /dev/null +++ b/include/net/tc_act/tc_ct.h @@ -0,0 +1,63 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef __NET_TC_CT_H +#define __NET_TC_CT_H + +#include +#include + +#if IS_ENABLED(CONFIG_NF_CONNTRACK) +#include +#include + +struct tcf_ct_params { + struct nf_conn *tmpl; + u16 zone; + + u32 mark; + u32 mark_mask; + + u32 labels[NF_CT_LABELS_MAX_SIZE / sizeof(u32)]; + u32 labels_mask[NF_CT_LABELS_MAX_SIZE / sizeof(u32)]; + + struct nf_nat_range2 range; + bool ipv4_range; + + u16 ct_action; + + struct rcu_head rcu; +}; + +struct tcf_ct { + struct tc_action common; + struct tcf_ct_params __rcu *params; +}; + +#define to_ct(a) ((struct tcf_ct *)a) +#define to_ct_params(a) ((struct tcf_ct_params *) \ + rtnl_dereference((to_ct(a)->params))) + +static inline uint16_t tcf_ct_zone(const struct tc_action *a) +{ + return 
to_ct_params(a)->zone; +} + +static inline int tcf_ct_action(const struct tc_action *a) +{ + return to_ct_params(a)->ct_action; +} + +#else +static inline uint16_t tcf_ct_zone(const struct tc_action *a) { return 0; } +static inline int tcf_ct_action(const struct tc_action *a) { return 0; } +#endif /* CONFIG_NF_CONNTRACK */ + +static inline bool is_tcf_ct(const struct tc_action *a) +{ +#if defined(CONFIG_NET_CLS_ACT) && IS_ENABLED(CONFIG_NF_CONNTRACK) + if (a->ops && a->ops->id == TCA_ID_CT) + return true; +#endif + return false; +} + +#endif /* __NET_TC_CT_H */ diff --git a/include/uapi/linux/pkt_cls.h b/include/uapi/linux/pkt_cls.h index e22ef4a940bc..31db5589b7ca 100644 --- a/include/uapi/linux/pkt_cls.h +++ b/include/uapi/linux/pkt_cls.h @@ -106,6 +106,7 @@ enum tca_id { TCA_ID_SAMPLE = TCA_ACT_SAMPLE, TCA_ID_CTINFO, TCA_ID_MPLS, + TCA_ID_CT, /* other actions go here */ __TCA_ID_MAX = 255 }; diff --git a/include/uapi/linux/tc_act/tc_ct.h b/include/uapi/linux/tc_act/tc_ct.h new file mode 100644 index 000000000000..5fb1d7ac1027 --- /dev/null +++ b/include/uapi/linux/tc_act/tc_ct.h @@ -0,0 +1,41 @@ +/* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note */ +#ifndef __UAPI_TC_CT_H +#define __UAPI_TC_CT_H + +#include +#include + +enum { + TCA_CT_UNSPEC, + TCA_CT_PARMS, + TCA_CT_TM, + TCA_CT_ACTION, /* u16 */ + TCA_CT_ZONE, /* u16 */ + TCA_CT_MARK, /* u32 */ + TCA_CT_MARK_MASK, /* u32 */ + TCA_CT_LABELS, /* u128 */ + TCA_CT_LABELS_MASK, /* u128 */ + TCA_CT_NAT_IPV4_MIN, /* be32 */ + TCA_CT_NAT_IPV4_MAX, /* be32 */ + TCA_CT_NAT_IPV6_MIN, /* struct in6_addr */ + TCA_CT_NAT_IPV6_MAX, /* struct in6_addr */ + TCA_CT_NAT_PORT_MIN, /* be16 */ + TCA_CT_NAT_PORT_MAX, /* be16 */ + TCA_CT_PAD, + __TCA_CT_MAX +}; + +#define TCA_CT_MAX (__TCA_CT_MAX - 1) + +#define TCA_CT_ACT_COMMIT (1 << 0) +#define TCA_CT_ACT_FORCE (1 << 1) +#define TCA_CT_ACT_CLEAR (1 << 2) +#define TCA_CT_ACT_NAT (1 << 3) +#define TCA_CT_ACT_NAT_SRC (1 << 4) +#define TCA_CT_ACT_NAT_DST (1 << 5) + +struct 
tc_ct { + tc_gen; +}; + +#endif /* __UAPI_TC_CT_H */ diff --git a/net/sched/Kconfig b/net/sched/Kconfig index 731f5fbc2a3c..dd55b9ac3a66 100644 --- a/net/sched/Kconfig +++ b/net/sched/Kconfig @@ -940,6 +940,17 @@ config NET_ACT_TUNNEL_KEY To compile this code as a module, choose M here: the module will be called act_tunnel_key. +config NET_ACT_CT + tristate "connection tracking tc action" + depends on NET_CLS_ACT && NF_CONNTRACK + help + Say Y here to allow sending the packets to conntrack module. + + If unsure, say N. + + To compile this code as a module, choose M here: the + module will be called act_ct. + config NET_IFE_SKBMARK tristate "Support to encoding decoding skb mark on IFE action" depends on NET_ACT_IFE diff --git a/net/sched/Makefile b/net/sched/Makefile index c26603606c22..415d1e1f237e 100644 --- a/net/sched/Makefile +++ b/net/sched/Makefile @@ -29,6 +29,7 @@ obj-$(CONFIG_NET_IFE_SKBMARK) += act_meta_mark.o obj-$(CONFIG_NET_IFE_SKBPRIO) += act_meta_skbprio.o obj-$(CONFIG_NET_IFE_SKBTCINDEX) += act_meta_skbtcindex.o obj-$(CONFIG_NET_ACT_TUNNEL_KEY)+= act_tunnel_key.o +obj-$(CONFIG_NET_ACT_CT) += act_ct.o obj-$(CONFIG_NET_SCH_FIFO) += sch_fifo.o obj-$(CONFIG_NET_SCH_CBQ) += sch_cbq.o obj-$(CONFIG_NET_SCH_HTB) += sch_htb.o diff --git a/net/sched/act_ct.c b/net/sched/act_ct.c new file mode 100644 index 000000000000..b501ce0cf116 --- /dev/null +++ b/net/sched/act_ct.c @@ -0,0 +1,984 @@ +// SPDX-License-Identifier: GPL-2.0 OR Linux-OpenIB +/* - + * net/sched/act_ct.c Connection Tracking action + * + * Authors: Paul Blakey + * Yossi Kuperman + * Marcelo Ricardo Leitner + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include +#include + +static struct tc_action_ops act_ct_ops; +static unsigned int ct_net_id; + +struct tc_ct_action_net { + struct tc_action_net tn; /* Must be first */ + bool labels; 
+}; + +/* Determine whether skb->_nfct is equal to the result of conntrack lookup. */ +static bool tcf_ct_skb_nfct_cached(struct net *net, struct sk_buff *skb, + u16 zone_id, bool force) +{ + enum ip_conntrack_info ctinfo; + struct nf_conn *ct; + + ct = nf_ct_get(skb, &ctinfo); + if (!ct) + return false; + if (!net_eq(net, read_pnet(&ct->ct_net))) + return false; + if (nf_ct_zone(ct)->id != zone_id) + return false; + + /* Force conntrack entry direction. */ + if (force && CTINFO2DIR(ctinfo) != IP_CT_DIR_ORIGINAL) { + if (nf_ct_is_confirmed(ct)) + nf_ct_kill(ct); + + nf_conntrack_put(&ct->ct_general); + nf_ct_set(skb, NULL, IP_CT_UNTRACKED); + + return false; + } + + return true; +} + +/* Trim the skb to the length specified by the IP/IPv6 header, + * removing any trailing lower-layer padding. This prepares the skb + * for higher-layer processing that assumes skb->len excludes padding + * (such as nf_ip_checksum). The caller needs to pull the skb to the + * network header, and ensure ip_hdr/ipv6_hdr points to valid data. 
+ */ +static int tcf_ct_skb_network_trim(struct sk_buff *skb, int family) +{ + unsigned int len; + int err; + + switch (family) { + case NFPROTO_IPV4: + len = ntohs(ip_hdr(skb)->tot_len); + break; + case NFPROTO_IPV6: + len = sizeof(struct ipv6hdr) + + ntohs(ipv6_hdr(skb)->payload_len); + break; + default: + len = skb->len; + } + + err = pskb_trim_rcsum(skb, len); + + return err; +} + +static u8 tcf_ct_skb_nf_family(struct sk_buff *skb) +{ + u8 family = NFPROTO_UNSPEC; + + switch (skb->protocol) { + case htons(ETH_P_IP): + family = NFPROTO_IPV4; + break; + case htons(ETH_P_IPV6): + family = NFPROTO_IPV6; + break; + default: + break; + } + + return family; +} + +static int tcf_ct_ipv4_is_fragment(struct sk_buff *skb, bool *frag) +{ + unsigned int len; + + len = skb_network_offset(skb) + sizeof(struct iphdr); + if (unlikely(skb->len < len)) + return -EINVAL; + if (unlikely(!pskb_may_pull(skb, len))) + return -ENOMEM; + + *frag = ip_is_fragment(ip_hdr(skb)); + return 0; +} + +static int tcf_ct_ipv6_is_fragment(struct sk_buff *skb, bool *frag) +{ + unsigned int flags = 0, len, payload_ofs = 0; + unsigned short frag_off; + int nexthdr; + + len = skb_network_offset(skb) + sizeof(struct ipv6hdr); + if (unlikely(skb->len < len)) + return -EINVAL; + if (unlikely(!pskb_may_pull(skb, len))) + return -ENOMEM; + + nexthdr = ipv6_find_hdr(skb, &payload_ofs, -1, &frag_off, &flags); + if (unlikely(nexthdr < 0)) + return -EPROTO; + + *frag = flags & IP6_FH_F_FRAG; + return 0; +} + +static int tcf_ct_handle_fragments(struct net *net, struct sk_buff *skb, + u8 family, u16 zone) +{ + enum ip_conntrack_info ctinfo; + struct nf_conn *ct; + int err = 0; + bool frag; + + /* Previously seen (loopback)? Ignore. 
*/ + ct = nf_ct_get(skb, &ctinfo); + if ((ct && !nf_ct_is_template(ct)) || ctinfo == IP_CT_UNTRACKED) + return 0; + + if (family == NFPROTO_IPV4) + err = tcf_ct_ipv4_is_fragment(skb, &frag); + else + err = tcf_ct_ipv6_is_fragment(skb, &frag); + if (err || !frag) + return err; + + skb_get(skb); + + if (family == NFPROTO_IPV4) { + enum ip_defrag_users user = IP_DEFRAG_CONNTRACK_IN + zone; + + memset(IPCB(skb), 0, sizeof(struct inet_skb_parm)); + local_bh_disable(); + err = ip_defrag(net, skb, user); + local_bh_enable(); + if (err && err != -EINPROGRESS) + goto out_free; + } else { /* NFPROTO_IPV6 */ +#if IS_ENABLED(CONFIG_NF_DEFRAG_IPV6) + enum ip6_defrag_users user = IP6_DEFRAG_CONNTRACK_IN + zone; + + memset(IP6CB(skb), 0, sizeof(struct inet6_skb_parm)); + err = nf_ct_frag6_gather(net, skb, user); + if (err && err != -EINPROGRESS) + goto out_free; +#else + err = -EOPNOTSUPP; + goto out_free; +#endif + } + + skb_clear_hash(skb); + skb->ignore_df = 1; + return err; + +out_free: + kfree_skb(skb); + return err; +} + +static void tcf_ct_params_free(struct rcu_head *head) +{ + struct tcf_ct_params *params = container_of(head, + struct tcf_ct_params, rcu); + + if (params->tmpl) + nf_conntrack_put(&params->tmpl->ct_general); + kfree(params); +} + +#if IS_ENABLED(CONFIG_NF_NAT) +/* Modelled after nf_nat_ipv[46]_fn(). + * range is only used for new, uninitialized NAT state. + * Returns either NF_ACCEPT or NF_DROP. + */ +static int ct_nat_execute(struct sk_buff *skb, struct nf_conn *ct, + enum ip_conntrack_info ctinfo, + const struct nf_nat_range2 *range, + enum nf_nat_manip_type maniptype) +{ + int hooknum, err = NF_ACCEPT; + + /* See HOOK2MANIP(). 
*/ + if (maniptype == NF_NAT_MANIP_SRC) + hooknum = NF_INET_LOCAL_IN; /* Source NAT */ + else + hooknum = NF_INET_LOCAL_OUT; /* Destination NAT */ + + switch (ctinfo) { + case IP_CT_RELATED: + case IP_CT_RELATED_REPLY: + if (skb->protocol == htons(ETH_P_IP) && + ip_hdr(skb)->protocol == IPPROTO_ICMP) { + if (!nf_nat_icmp_reply_translation(skb, ct, ctinfo, + hooknum)) + err = NF_DROP; + goto out; + } else if (IS_ENABLED(CONFIG_IPV6) && + skb->protocol == htons(ETH_P_IPV6)) { + __be16 frag_off; + u8 nexthdr = ipv6_hdr(skb)->nexthdr; + int hdrlen = ipv6_skip_exthdr(skb, + sizeof(struct ipv6hdr), + &nexthdr, &frag_off); + + if (hdrlen >= 0 && nexthdr == IPPROTO_ICMPV6) { + if (!nf_nat_icmpv6_reply_translation(skb, ct, + ctinfo, + hooknum, + hdrlen)) + err = NF_DROP; + goto out; + } + } + /* Non-ICMP, fall thru to initialize if needed. */ + /* fall through */ + case IP_CT_NEW: + /* Seen it before? This can happen for loopback, retrans, + * or local packets. + */ + if (!nf_nat_initialized(ct, maniptype)) { + /* Initialize according to the NAT action. */ + err = (range && range->flags & NF_NAT_RANGE_MAP_IPS) + /* Action is set up to establish a new + * mapping. + */ + ? 
nf_nat_setup_info(ct, range, maniptype) + : nf_nat_alloc_null_binding(ct, hooknum); + if (err != NF_ACCEPT) + goto out; + } + break; + + case IP_CT_ESTABLISHED: + case IP_CT_ESTABLISHED_REPLY: + break; + + default: + err = NF_DROP; + goto out; + } + + err = nf_nat_packet(ct, ctinfo, hooknum, skb); +out: + return err; +} +#endif /* CONFIG_NF_NAT */ + +static void tcf_ct_act_set_mark(struct nf_conn *ct, u32 mark, u32 mask) +{ +#if IS_ENABLED(CONFIG_NF_CONNTRACK_MARK) + u32 new_mark; + + if (!mask) + return; + + new_mark = mark | (ct->mark & ~(mask)); + if (ct->mark != new_mark) { + ct->mark = new_mark; + if (nf_ct_is_confirmed(ct)) + nf_conntrack_event_cache(IPCT_MARK, ct); + } +#endif +} + +static void tcf_ct_act_set_labels(struct nf_conn *ct, + u32 *labels, + u32 *labels_m) +{ +#if IS_ENABLED(CONFIG_NF_CONNTRACK_LABELS) + size_t labels_sz = FIELD_SIZEOF(struct tcf_ct_params, labels); + + if (!memchr_inv(labels_m, 0, labels_sz)) + return; + + nf_connlabels_replace(ct, labels, labels_m, 4); +#endif +} + +static int tcf_ct_act_nat(struct sk_buff *skb, + struct nf_conn *ct, + enum ip_conntrack_info ctinfo, + int ct_action, + struct nf_nat_range2 *range, + bool commit) +{ +#if IS_ENABLED(CONFIG_NF_NAT) + enum nf_nat_manip_type maniptype; + + if (!(ct_action & TCA_CT_ACT_NAT)) + return NF_ACCEPT; + + /* Add NAT extension if not confirmed yet. */ + if (!nf_ct_is_confirmed(ct) && !nf_ct_nat_ext_add(ct)) + return NF_DROP; /* Can't NAT. */ + + if (ctinfo != IP_CT_NEW && (ct->status & IPS_NAT_MASK) && + (ctinfo != IP_CT_RELATED || commit)) { + /* NAT an established or related connection like before. */ + if (CTINFO2DIR(ctinfo) == IP_CT_DIR_REPLY) + /* This is the REPLY direction for a connection + * for which NAT was applied in the forward + * direction. Do the reverse NAT. + */ + maniptype = ct->status & IPS_SRC_NAT + ? NF_NAT_MANIP_DST : NF_NAT_MANIP_SRC; + else + maniptype = ct->status & IPS_SRC_NAT + ? 
NF_NAT_MANIP_SRC : NF_NAT_MANIP_DST; + } else if (ct_action & TCA_CT_ACT_NAT_SRC) { + maniptype = NF_NAT_MANIP_SRC; + } else if (ct_action & TCA_CT_ACT_NAT_DST) { + maniptype = NF_NAT_MANIP_DST; + } else { + return NF_ACCEPT; + } + + return ct_nat_execute(skb, ct, ctinfo, range, maniptype); +#else + return NF_ACCEPT; +#endif +} + +static int tcf_ct_act(struct sk_buff *skb, const struct tc_action *a, + struct tcf_result *res) +{ + struct net *net = dev_net(skb->dev); + bool cached, commit, clear, force; + enum ip_conntrack_info ctinfo; + struct tcf_ct *c = to_ct(a); + struct nf_conn *tmpl = NULL; + struct nf_hook_state state; + int nh_ofs, err, retval; + struct tcf_ct_params *p; + struct nf_conn *ct; + u8 family; + + p = rcu_dereference_bh(c->params); + + retval = READ_ONCE(c->tcf_action); + commit = p->ct_action & TCA_CT_ACT_COMMIT; + clear = p->ct_action & TCA_CT_ACT_CLEAR; + force = p->ct_action & TCA_CT_ACT_FORCE; + tmpl = p->tmpl; + + if (clear) { + ct = nf_ct_get(skb, &ctinfo); + if (ct) { + nf_conntrack_put(&ct->ct_general); + nf_ct_set(skb, NULL, IP_CT_UNTRACKED); + } + + goto out; + } + + family = tcf_ct_skb_nf_family(skb); + if (family == NFPROTO_UNSPEC) + goto drop; + + /* The conntrack module expects to be working at L3. + * We also try to pull the IPv4/6 header to linear area + */ + nh_ofs = skb_network_offset(skb); + skb_pull_rcsum(skb, nh_ofs); + err = tcf_ct_handle_fragments(net, skb, family, p->zone); + if (err == -EINPROGRESS) { + retval = TC_ACT_STOLEN; + goto out; + } + if (err) + goto drop; + + err = tcf_ct_skb_network_trim(skb, family); + if (err) + goto drop; + + /* If we are recirculating packets to match on ct fields and + * committing with a separate ct action, then we don't need to + * actually run the packet through conntrack twice unless it's for a + * different zone. + */ + cached = tcf_ct_skb_nfct_cached(net, skb, p->zone, force); + if (!cached) { + /* Associate skb with specified zone. 
*/ + if (tmpl) { + ct = nf_ct_get(skb, &ctinfo); + if (skb_nfct(skb)) + nf_conntrack_put(skb_nfct(skb)); + nf_conntrack_get(&tmpl->ct_general); + nf_ct_set(skb, tmpl, IP_CT_NEW); + } + + state.hook = NF_INET_PRE_ROUTING; + state.net = net; + state.pf = family; + err = nf_conntrack_in(skb, &state); + if (err != NF_ACCEPT) + goto out_push; + } + + ct = nf_ct_get(skb, &ctinfo); + if (!ct) + goto out_push; + nf_ct_deliver_cached_events(ct); + + err = tcf_ct_act_nat(skb, ct, ctinfo, p->ct_action, &p->range, commit); + if (err != NF_ACCEPT) + goto drop; + + if (commit) { + tcf_ct_act_set_mark(ct, p->mark, p->mark_mask); + tcf_ct_act_set_labels(ct, p->labels, p->labels_mask); + + /* This will take care of sending queued events + * even if the connection is already confirmed. + */ + nf_conntrack_confirm(skb); + } + +out_push: + skb_push_rcsum(skb, nh_ofs); + +out: + bstats_cpu_update(this_cpu_ptr(a->cpu_bstats), skb); + return retval; + +drop: + qstats_drop_inc(this_cpu_ptr(a->cpu_qstats)); + return TC_ACT_SHOT; +} + +static const struct nla_policy ct_policy[TCA_CT_MAX + 1] = { + [TCA_CT_UNSPEC] = { .strict_start_type = TCA_CT_UNSPEC + 1 }, + [TCA_CT_ACTION] = { .type = NLA_U16 }, + [TCA_CT_PARMS] = { .type = NLA_EXACT_LEN, .len = sizeof(struct tc_ct) }, + [TCA_CT_ZONE] = { .type = NLA_U16 }, + [TCA_CT_MARK] = { .type = NLA_U32 }, + [TCA_CT_MARK_MASK] = { .type = NLA_U32 }, + [TCA_CT_LABELS] = { .type = NLA_BINARY, + .len = 128 / BITS_PER_BYTE }, + [TCA_CT_LABELS_MASK] = { .type = NLA_BINARY, + .len = 128 / BITS_PER_BYTE }, + [TCA_CT_NAT_IPV4_MIN] = { .type = NLA_U32 }, + [TCA_CT_NAT_IPV4_MAX] = { .type = NLA_U32 }, + [TCA_CT_NAT_IPV6_MIN] = { .type = NLA_EXACT_LEN, + .len = sizeof(struct in6_addr) }, + [TCA_CT_NAT_IPV6_MAX] = { .type = NLA_EXACT_LEN, + .len = sizeof(struct in6_addr) }, + [TCA_CT_NAT_PORT_MIN] = { .type = NLA_U16 }, + [TCA_CT_NAT_PORT_MAX] = { .type = NLA_U16 }, +}; + +static int tcf_ct_fill_params_nat(struct tcf_ct_params *p, + struct tc_ct *parm, + 
struct nlattr **tb, + struct netlink_ext_ack *extack) +{ + struct nf_nat_range2 *range; + + if (!(p->ct_action & TCA_CT_ACT_NAT)) + return 0; + + if (!IS_ENABLED(CONFIG_NF_NAT)) { + NL_SET_ERR_MSG_MOD(extack, "Netfilter nat isn't enabled in kernel"); + return -EOPNOTSUPP; + } + + if (!(p->ct_action & (TCA_CT_ACT_NAT_SRC | TCA_CT_ACT_NAT_DST))) + return 0; + + if ((p->ct_action & TCA_CT_ACT_NAT_SRC) && + (p->ct_action & TCA_CT_ACT_NAT_DST)) { + NL_SET_ERR_MSG_MOD(extack, "dnat and snat can't be enabled at the same time"); + return -EOPNOTSUPP; + } + + range = &p->range; + if (tb[TCA_CT_NAT_IPV4_MIN]) { + struct nlattr *max_attr = tb[TCA_CT_NAT_IPV4_MAX]; + + p->ipv4_range = true; + range->flags |= NF_NAT_RANGE_MAP_IPS; + range->min_addr.ip = + nla_get_in_addr(tb[TCA_CT_NAT_IPV4_MIN]); + + range->max_addr.ip = max_attr ? + nla_get_in_addr(max_attr) : + range->min_addr.ip; + } else if (tb[TCA_CT_NAT_IPV6_MIN]) { + struct nlattr *max_attr = tb[TCA_CT_NAT_IPV6_MAX]; + + p->ipv4_range = false; + range->flags |= NF_NAT_RANGE_MAP_IPS; + range->min_addr.in6 = + nla_get_in6_addr(tb[TCA_CT_NAT_IPV6_MIN]); + + range->max_addr.in6 = max_attr ? + nla_get_in6_addr(max_attr) : + range->min_addr.in6; + } + + if (tb[TCA_CT_NAT_PORT_MIN]) { + range->flags |= NF_NAT_RANGE_PROTO_SPECIFIED; + range->min_proto.all = nla_get_be16(tb[TCA_CT_NAT_PORT_MIN]); + + range->max_proto.all = tb[TCA_CT_NAT_PORT_MAX] ? 
+ nla_get_be16(tb[TCA_CT_NAT_PORT_MAX]) : + range->min_proto.all; + } + + return 0; +} + +static void tcf_ct_set_key_val(struct nlattr **tb, + void *val, int val_type, + void *mask, int mask_type, + int len) +{ + if (!tb[val_type]) + return; + nla_memcpy(val, tb[val_type], len); + + if (!mask) + return; + + if (mask_type == TCA_CT_UNSPEC || !tb[mask_type]) + memset(mask, 0xff, len); + else + nla_memcpy(mask, tb[mask_type], len); +} + +static int tcf_ct_fill_params(struct net *net, + struct tcf_ct_params *p, + struct tc_ct *parm, + struct nlattr **tb, + struct netlink_ext_ack *extack) +{ + struct tc_ct_action_net *tn = net_generic(net, ct_net_id); + struct nf_conntrack_zone zone; + struct nf_conn *tmpl; + int err; + + p->zone = NF_CT_DEFAULT_ZONE_ID; + + tcf_ct_set_key_val(tb, + &p->ct_action, TCA_CT_ACTION, + NULL, TCA_CT_UNSPEC, + sizeof(p->ct_action)); + + if (p->ct_action & TCA_CT_ACT_CLEAR) + return 0; + + err = tcf_ct_fill_params_nat(p, parm, tb, extack); + if (err) + return err; + + if (tb[TCA_CT_MARK]) { + if (!IS_ENABLED(CONFIG_NF_CONNTRACK_MARK)) { + NL_SET_ERR_MSG_MOD(extack, "Conntrack mark isn't enabled."); + return -EOPNOTSUPP; + } + tcf_ct_set_key_val(tb, + &p->mark, TCA_CT_MARK, + &p->mark_mask, TCA_CT_MARK_MASK, + sizeof(p->mark)); + } + + if (tb[TCA_CT_LABELS]) { + if (!IS_ENABLED(CONFIG_NF_CONNTRACK_LABELS)) { + NL_SET_ERR_MSG_MOD(extack, "Conntrack labels isn't enabled."); + return -EOPNOTSUPP; + } + + if (!tn->labels) { + NL_SET_ERR_MSG_MOD(extack, "Failed to set connlabel length"); + return -EOPNOTSUPP; + } + tcf_ct_set_key_val(tb, + p->labels, TCA_CT_LABELS, + p->labels_mask, TCA_CT_LABELS_MASK, + sizeof(p->labels)); + } + + if (tb[TCA_CT_ZONE]) { + if (!IS_ENABLED(CONFIG_NF_CONNTRACK_ZONES)) { + NL_SET_ERR_MSG_MOD(extack, "Conntrack zones isn't enabled."); + return -EOPNOTSUPP; + } + + tcf_ct_set_key_val(tb, + &p->zone, TCA_CT_ZONE, + NULL, TCA_CT_UNSPEC, + sizeof(p->zone)); + } + + if (p->zone == NF_CT_DEFAULT_ZONE_ID) + return 0; + + 
nf_ct_zone_init(&zone, p->zone, NF_CT_DEFAULT_ZONE_DIR, 0); + tmpl = nf_ct_tmpl_alloc(net, &zone, GFP_KERNEL); + if (!tmpl) { + NL_SET_ERR_MSG_MOD(extack, "Failed to allocate conntrack template"); + return -ENOMEM; + } + __set_bit(IPS_CONFIRMED_BIT, &tmpl->status); + nf_conntrack_get(&tmpl->ct_general); + p->tmpl = tmpl; + + return 0; +} + +static int tcf_ct_init(struct net *net, struct nlattr *nla, + struct nlattr *est, struct tc_action **a, + int replace, int bind, bool rtnl_held, + struct tcf_proto *tp, + struct netlink_ext_ack *extack) +{ + struct tc_action_net *tn = net_generic(net, ct_net_id); + struct tcf_ct_params *params = NULL; + struct nlattr *tb[TCA_CT_MAX + 1]; + struct tcf_chain *goto_ch = NULL; + struct tc_ct *parm; + struct tcf_ct *c; + int err, res = 0; + + if (!nla) { + NL_SET_ERR_MSG_MOD(extack, "Ct requires attributes to be passed"); + return -EINVAL; + } + + err = nla_parse_nested(tb, TCA_CT_MAX, nla, ct_policy, extack); + if (err < 0) + return err; + + if (!tb[TCA_CT_PARMS]) { + NL_SET_ERR_MSG_MOD(extack, "Missing required ct parameters"); + return -EINVAL; + } + parm = nla_data(tb[TCA_CT_PARMS]); + + err = tcf_idr_check_alloc(tn, &parm->index, a, bind); + if (err < 0) + return err; + + if (!err) { + err = tcf_idr_create(tn, parm->index, est, a, + &act_ct_ops, bind, true); + if (err) { + tcf_idr_cleanup(tn, parm->index); + return err; + } + res = ACT_P_CREATED; + } else { + if (bind) + return 0; + + if (!replace) { + tcf_idr_release(*a, bind); + return -EEXIST; + } + } + err = tcf_action_check_ctrlact(parm->action, tp, &goto_ch, extack); + if (err < 0) + goto cleanup; + + c = to_ct(*a); + + params = kzalloc(sizeof(*params), GFP_KERNEL); + if (unlikely(!params)) { + err = -ENOMEM; + goto cleanup; + } + + err = tcf_ct_fill_params(net, params, parm, tb, extack); + if (err) + goto cleanup; + + spin_lock_bh(&c->tcf_lock); + goto_ch = tcf_action_set_ctrlact(*a, parm->action, goto_ch); + rcu_swap_protected(c->params, params, 
lockdep_is_held(&c->tcf_lock)); + spin_unlock_bh(&c->tcf_lock); + + if (goto_ch) + tcf_chain_put_by_act(goto_ch); + if (params) + kfree_rcu(params, rcu); + if (res == ACT_P_CREATED) + tcf_idr_insert(tn, *a); + + return res; + +cleanup: + if (goto_ch) + tcf_chain_put_by_act(goto_ch); + kfree(params); + tcf_idr_release(*a, bind); + return err; +} + +static void tcf_ct_cleanup(struct tc_action *a) +{ + struct tcf_ct_params *params; + struct tcf_ct *c = to_ct(a); + + params = rcu_dereference_protected(c->params, 1); + if (params) + call_rcu(&params->rcu, tcf_ct_params_free); +} + +static int tcf_ct_dump_key_val(struct sk_buff *skb, + void *val, int val_type, + void *mask, int mask_type, + int len) +{ + int err; + + if (mask && !memchr_inv(mask, 0, len)) + return 0; + + err = nla_put(skb, val_type, len, val); + if (err) + return err; + + if (mask_type != TCA_CT_UNSPEC) { + err = nla_put(skb, mask_type, len, mask); + if (err) + return err; + } + + return 0; +} + +static int tcf_ct_dump_nat(struct sk_buff *skb, struct tcf_ct_params *p) +{ + struct nf_nat_range2 *range = &p->range; + + if (!(p->ct_action & TCA_CT_ACT_NAT)) + return 0; + + if (!(p->ct_action & (TCA_CT_ACT_NAT_SRC | TCA_CT_ACT_NAT_DST))) + return 0; + + if (range->flags & NF_NAT_RANGE_MAP_IPS) { + if (p->ipv4_range) { + if (nla_put_in_addr(skb, TCA_CT_NAT_IPV4_MIN, + range->min_addr.ip)) + return -1; + if (nla_put_in_addr(skb, TCA_CT_NAT_IPV4_MAX, + range->max_addr.ip)) + return -1; + } else { + if (nla_put_in6_addr(skb, TCA_CT_NAT_IPV6_MIN, + &range->min_addr.in6)) + return -1; + if (nla_put_in6_addr(skb, TCA_CT_NAT_IPV6_MAX, + &range->max_addr.in6)) + return -1; + } + } + + if (range->flags & NF_NAT_RANGE_PROTO_SPECIFIED) { + if (nla_put_be16(skb, TCA_CT_NAT_PORT_MIN, + range->min_proto.all)) + return -1; + if (nla_put_be16(skb, TCA_CT_NAT_PORT_MAX, + range->max_proto.all)) + return -1; + } + + return 0; +} + +static inline int tcf_ct_dump(struct sk_buff *skb, struct tc_action *a, + int bind, int ref) +{ + 
unsigned char *b = skb_tail_pointer(skb); + struct tcf_ct *c = to_ct(a); + struct tcf_ct_params *p; + + struct tc_ct opt = { + .index = c->tcf_index, + .refcnt = refcount_read(&c->tcf_refcnt) - ref, + .bindcnt = atomic_read(&c->tcf_bindcnt) - bind, + }; + struct tcf_t t; + + spin_lock_bh(&c->tcf_lock); + p = rcu_dereference_protected(c->params, + lockdep_is_held(&c->tcf_lock)); + opt.action = c->tcf_action; + + if (tcf_ct_dump_key_val(skb, + &p->ct_action, TCA_CT_ACTION, + NULL, TCA_CT_UNSPEC, + sizeof(p->ct_action))) + goto nla_put_failure; + + if (p->ct_action & TCA_CT_ACT_CLEAR) + goto skip_dump; + + if (IS_ENABLED(CONFIG_NF_CONNTRACK_MARK) && + tcf_ct_dump_key_val(skb, + &p->mark, TCA_CT_MARK, + &p->mark_mask, TCA_CT_MARK_MASK, + sizeof(p->mark))) + goto nla_put_failure; + + if (IS_ENABLED(CONFIG_NF_CONNTRACK_LABELS) && + tcf_ct_dump_key_val(skb, + p->labels, TCA_CT_LABELS, + p->labels_mask, TCA_CT_LABELS_MASK, + sizeof(p->labels))) + goto nla_put_failure; + + if (IS_ENABLED(CONFIG_NF_CONNTRACK_ZONES) && + tcf_ct_dump_key_val(skb, + &p->zone, TCA_CT_ZONE, + NULL, TCA_CT_UNSPEC, + sizeof(p->zone))) + goto nla_put_failure; + + if (tcf_ct_dump_nat(skb, p)) + goto nla_put_failure; + +skip_dump: + if (nla_put(skb, TCA_CT_PARMS, sizeof(opt), &opt)) + goto nla_put_failure; + + tcf_tm_dump(&t, &c->tcf_tm); + if (nla_put_64bit(skb, TCA_CT_TM, sizeof(t), &t, TCA_CT_PAD)) + goto nla_put_failure; + spin_unlock_bh(&c->tcf_lock); + + return skb->len; +nla_put_failure: + spin_unlock_bh(&c->tcf_lock); + nlmsg_trim(skb, b); + return -1; +} + +static int tcf_ct_walker(struct net *net, struct sk_buff *skb, + struct netlink_callback *cb, int type, + const struct tc_action_ops *ops, + struct netlink_ext_ack *extack) +{ + struct tc_action_net *tn = net_generic(net, ct_net_id); + + return tcf_generic_walker(tn, skb, cb, type, ops, extack); +} + +static int tcf_ct_search(struct net *net, struct tc_action **a, u32 index) +{ + struct tc_action_net *tn = net_generic(net, ct_net_id); + + 
return tcf_idr_search(tn, a, index); +} + +static void tcf_stats_update(struct tc_action *a, u64 bytes, u32 packets, + u64 lastuse, bool hw) +{ + struct tcf_ct *c = to_ct(a); + + _bstats_cpu_update(this_cpu_ptr(a->cpu_bstats), bytes, packets); + + if (hw) + _bstats_cpu_update(this_cpu_ptr(a->cpu_bstats_hw), + bytes, packets); + c->tcf_tm.lastuse = max_t(u64, c->tcf_tm.lastuse, lastuse); +} + +static struct tc_action_ops act_ct_ops = { + .kind = "ct", + .id = TCA_ID_CT, + .owner = THIS_MODULE, + .act = tcf_ct_act, + .dump = tcf_ct_dump, + .init = tcf_ct_init, + .cleanup = tcf_ct_cleanup, + .walk = tcf_ct_walker, + .lookup = tcf_ct_search, + .stats_update = tcf_stats_update, + .size = sizeof(struct tcf_ct), +}; + +static __net_init int ct_init_net(struct net *net) +{ + unsigned int n_bits = FIELD_SIZEOF(struct tcf_ct_params, labels) * 8; + struct tc_ct_action_net *tn = net_generic(net, ct_net_id); + + if (nf_connlabels_get(net, n_bits - 1)) { + tn->labels = false; + pr_err("act_ct: Failed to set connlabels length"); + } else { + tn->labels = true; + } + + return tc_action_net_init(&tn->tn, &act_ct_ops); +} + +static void __net_exit ct_exit_net(struct list_head *net_list) +{ + struct net *net; + + rtnl_lock(); + list_for_each_entry(net, net_list, exit_list) { + struct tc_ct_action_net *tn = net_generic(net, ct_net_id); + + if (tn->labels) + nf_connlabels_put(net); + } + rtnl_unlock(); + + tc_action_net_exit(net_list, ct_net_id); +} + +static struct pernet_operations ct_net_ops = { + .init = ct_init_net, + .exit_batch = ct_exit_net, + .id = &ct_net_id, + .size = sizeof(struct tc_ct_action_net), +}; + +static int __init ct_init_module(void) +{ + return tcf_register_action(&act_ct_ops, &ct_net_ops); +} + +static void __exit ct_cleanup_module(void) +{ + tcf_unregister_action(&act_ct_ops, &ct_net_ops); +} + +module_init(ct_init_module); +module_exit(ct_cleanup_module); +MODULE_AUTHOR("Paul Blakey "); +MODULE_AUTHOR("Yossi Kuperman "); +MODULE_AUTHOR("Marcelo Ricardo 
Leitner "); +MODULE_DESCRIPTION("Connection tracking action"); +MODULE_LICENSE("GPL v2"); + diff --git a/net/sched/cls_api.c b/net/sched/cls_api.c index ad36bbcc583e..4a7331ce830d 100644 --- a/net/sched/cls_api.c +++ b/net/sched/cls_api.c @@ -35,6 +35,7 @@ #include #include #include +#include extern const struct nla_policy rtm_tca_policy[TCA_MAX + 1]; @@ -3266,6 +3267,10 @@ int tc_setup_flow_action(struct flow_action *flow_action, entry->police.burst = tcf_police_tcfp_burst(act); entry->police.rate_bytes_ps = tcf_police_rate_bytes_ps(act); + } else if (is_tcf_ct(act)) { + entry->id = FLOW_ACTION_CT; + entry->ct.action = tcf_ct_action(act); + entry->ct.zone = tcf_ct_zone(act); } else { goto err_out; } -- cgit v1.2.3-71-gd317 From e0ace68af2acfe474bc89a3d9a2e24d700bf245d Mon Sep 17 00:00:00 2001 From: Paul Blakey Date: Tue, 9 Jul 2019 10:30:50 +0300 Subject: net/sched: cls_flower: Add matching on conntrack info New matches for conntrack mark, label, zone, and state. Signed-off-by: Paul Blakey Signed-off-by: Marcelo Ricardo Leitner Signed-off-by: Yossi Kuperman Acked-by: Jiri Pirko Signed-off-by: David S. 
Miller --- include/uapi/linux/pkt_cls.h | 16 ++++++ net/sched/cls_flower.c | 127 +++++++++++++++++++++++++++++++++++++++++-- 2 files changed, 138 insertions(+), 5 deletions(-) (limited to 'include/uapi/linux') diff --git a/include/uapi/linux/pkt_cls.h b/include/uapi/linux/pkt_cls.h index 31db5589b7ca..b057aeeb6338 100644 --- a/include/uapi/linux/pkt_cls.h +++ b/include/uapi/linux/pkt_cls.h @@ -537,11 +537,27 @@ enum { TCA_FLOWER_KEY_PORT_DST_MIN, /* be16 */ TCA_FLOWER_KEY_PORT_DST_MAX, /* be16 */ + TCA_FLOWER_KEY_CT_STATE, /* u16 */ + TCA_FLOWER_KEY_CT_STATE_MASK, /* u16 */ + TCA_FLOWER_KEY_CT_ZONE, /* u16 */ + TCA_FLOWER_KEY_CT_ZONE_MASK, /* u16 */ + TCA_FLOWER_KEY_CT_MARK, /* u32 */ + TCA_FLOWER_KEY_CT_MARK_MASK, /* u32 */ + TCA_FLOWER_KEY_CT_LABELS, /* u128 */ + TCA_FLOWER_KEY_CT_LABELS_MASK, /* u128 */ + __TCA_FLOWER_MAX, }; #define TCA_FLOWER_MAX (__TCA_FLOWER_MAX - 1) +enum { + TCA_FLOWER_KEY_CT_FLAGS_NEW = 1 << 0, /* Beginning of a new connection. */ + TCA_FLOWER_KEY_CT_FLAGS_ESTABLISHED = 1 << 1, /* Part of an existing connection. */ + TCA_FLOWER_KEY_CT_FLAGS_RELATED = 1 << 2, /* Related to an established connection. */ + TCA_FLOWER_KEY_CT_FLAGS_TRACKED = 1 << 3, /* Conntrack has occurred. */ +}; + enum { TCA_FLOWER_KEY_ENC_OPTS_UNSPEC, TCA_FLOWER_KEY_ENC_OPTS_GENEVE, /* Nested diff --git a/net/sched/cls_flower.c b/net/sched/cls_flower.c index 5d4935b51e6f..bec37e16347f 100644 --- a/net/sched/cls_flower.c +++ b/net/sched/cls_flower.c @@ -26,6 +26,8 @@ #include #include +#include + struct fl_flow_key { struct flow_dissector_key_meta meta; struct flow_dissector_key_control control; @@ -54,6 +56,7 @@ struct fl_flow_key { struct flow_dissector_key_enc_opts enc_opts; struct flow_dissector_key_ports tp_min; struct flow_dissector_key_ports tp_max; + struct flow_dissector_key_ct ct; } __aligned(BITS_PER_LONG / 8); /* Ensure that we can do comparisons as longs. 
*/ struct fl_flow_mask_range { @@ -272,14 +275,27 @@ static struct cls_fl_filter *fl_lookup(struct fl_flow_mask *mask, return __fl_lookup(mask, mkey); } +static u16 fl_ct_info_to_flower_map[] = { + [IP_CT_ESTABLISHED] = TCA_FLOWER_KEY_CT_FLAGS_TRACKED | + TCA_FLOWER_KEY_CT_FLAGS_ESTABLISHED, + [IP_CT_RELATED] = TCA_FLOWER_KEY_CT_FLAGS_TRACKED | + TCA_FLOWER_KEY_CT_FLAGS_RELATED, + [IP_CT_ESTABLISHED_REPLY] = TCA_FLOWER_KEY_CT_FLAGS_TRACKED | + TCA_FLOWER_KEY_CT_FLAGS_ESTABLISHED, + [IP_CT_RELATED_REPLY] = TCA_FLOWER_KEY_CT_FLAGS_TRACKED | + TCA_FLOWER_KEY_CT_FLAGS_RELATED, + [IP_CT_NEW] = TCA_FLOWER_KEY_CT_FLAGS_TRACKED | + TCA_FLOWER_KEY_CT_FLAGS_NEW, +}; + static int fl_classify(struct sk_buff *skb, const struct tcf_proto *tp, struct tcf_result *res) { struct cls_fl_head *head = rcu_dereference_bh(tp->root); - struct cls_fl_filter *f; - struct fl_flow_mask *mask; - struct fl_flow_key skb_key; struct fl_flow_key skb_mkey; + struct fl_flow_key skb_key; + struct fl_flow_mask *mask; + struct cls_fl_filter *f; list_for_each_entry_rcu(mask, &head->masks, list) { fl_clear_masked_range(&skb_key, mask); @@ -290,6 +306,9 @@ static int fl_classify(struct sk_buff *skb, const struct tcf_proto *tp, */ skb_key.basic.n_proto = skb->protocol; skb_flow_dissect_tunnel_info(skb, &mask->dissector, &skb_key); + skb_flow_dissect_ct(skb, &mask->dissector, &skb_key, + fl_ct_info_to_flower_map, + ARRAY_SIZE(fl_ct_info_to_flower_map)); skb_flow_dissect(skb, &mask->dissector, &skb_key, 0); fl_set_masked_key(&skb_mkey, &skb_key, mask); @@ -686,6 +705,16 @@ static const struct nla_policy fl_policy[TCA_FLOWER_MAX + 1] = { [TCA_FLOWER_KEY_ENC_IP_TTL_MASK] = { .type = NLA_U8 }, [TCA_FLOWER_KEY_ENC_OPTS] = { .type = NLA_NESTED }, [TCA_FLOWER_KEY_ENC_OPTS_MASK] = { .type = NLA_NESTED }, + [TCA_FLOWER_KEY_CT_STATE] = { .type = NLA_U16 }, + [TCA_FLOWER_KEY_CT_STATE_MASK] = { .type = NLA_U16 }, + [TCA_FLOWER_KEY_CT_ZONE] = { .type = NLA_U16 }, + [TCA_FLOWER_KEY_CT_ZONE_MASK] = { .type = NLA_U16 }, + 
[TCA_FLOWER_KEY_CT_MARK] = { .type = NLA_U32 }, + [TCA_FLOWER_KEY_CT_MARK_MASK] = { .type = NLA_U32 }, + [TCA_FLOWER_KEY_CT_LABELS] = { .type = NLA_BINARY, + .len = 128 / BITS_PER_BYTE }, + [TCA_FLOWER_KEY_CT_LABELS_MASK] = { .type = NLA_BINARY, + .len = 128 / BITS_PER_BYTE }, }; static const struct nla_policy @@ -707,11 +736,11 @@ static void fl_set_key_val(struct nlattr **tb, { if (!tb[val_type]) return; - memcpy(val, nla_data(tb[val_type]), len); + nla_memcpy(val, tb[val_type], len); if (mask_type == TCA_FLOWER_UNSPEC || !tb[mask_type]) memset(mask, 0xff, len); else - memcpy(mask, nla_data(tb[mask_type]), len); + nla_memcpy(mask, tb[mask_type], len); } static int fl_set_key_port_range(struct nlattr **tb, struct fl_flow_key *key, @@ -997,6 +1026,51 @@ static int fl_set_enc_opt(struct nlattr **tb, struct fl_flow_key *key, return 0; } +static int fl_set_key_ct(struct nlattr **tb, + struct flow_dissector_key_ct *key, + struct flow_dissector_key_ct *mask, + struct netlink_ext_ack *extack) +{ + if (tb[TCA_FLOWER_KEY_CT_STATE]) { + if (!IS_ENABLED(CONFIG_NF_CONNTRACK)) { + NL_SET_ERR_MSG(extack, "Conntrack isn't enabled"); + return -EOPNOTSUPP; + } + fl_set_key_val(tb, &key->ct_state, TCA_FLOWER_KEY_CT_STATE, + &mask->ct_state, TCA_FLOWER_KEY_CT_STATE_MASK, + sizeof(key->ct_state)); + } + if (tb[TCA_FLOWER_KEY_CT_ZONE]) { + if (!IS_ENABLED(CONFIG_NF_CONNTRACK_ZONES)) { + NL_SET_ERR_MSG(extack, "Conntrack zones isn't enabled"); + return -EOPNOTSUPP; + } + fl_set_key_val(tb, &key->ct_zone, TCA_FLOWER_KEY_CT_ZONE, + &mask->ct_zone, TCA_FLOWER_KEY_CT_ZONE_MASK, + sizeof(key->ct_zone)); + } + if (tb[TCA_FLOWER_KEY_CT_MARK]) { + if (!IS_ENABLED(CONFIG_NF_CONNTRACK_MARK)) { + NL_SET_ERR_MSG(extack, "Conntrack mark isn't enabled"); + return -EOPNOTSUPP; + } + fl_set_key_val(tb, &key->ct_mark, TCA_FLOWER_KEY_CT_MARK, + &mask->ct_mark, TCA_FLOWER_KEY_CT_MARK_MASK, + sizeof(key->ct_mark)); + } + if (tb[TCA_FLOWER_KEY_CT_LABELS]) { + if (!IS_ENABLED(CONFIG_NF_CONNTRACK_LABELS)) { 
+ NL_SET_ERR_MSG(extack, "Conntrack labels aren't enabled"); + return -EOPNOTSUPP; + } + fl_set_key_val(tb, key->ct_labels, TCA_FLOWER_KEY_CT_LABELS, + mask->ct_labels, TCA_FLOWER_KEY_CT_LABELS_MASK, + sizeof(key->ct_labels)); + } + + return 0; +} + static int fl_set_key(struct net *net, struct nlattr **tb, struct fl_flow_key *key, struct fl_flow_key *mask, struct netlink_ext_ack *extack) @@ -1206,6 +1280,10 @@ static int fl_set_key(struct net *net, struct nlattr **tb, return ret; } + ret = fl_set_key_ct(tb, &key->ct, &mask->ct, extack); + if (ret) + return ret; + if (tb[TCA_FLOWER_KEY_FLAGS]) ret = fl_set_key_flags(tb, &key->control.flags, &mask->control.flags); @@ -1306,6 +1384,8 @@ static void fl_init_dissector(struct flow_dissector *dissector, FLOW_DISSECTOR_KEY_ENC_IP, enc_ip); FL_KEY_SET_IF_MASKED(mask, keys, cnt, FLOW_DISSECTOR_KEY_ENC_OPTS, enc_opts); + FL_KEY_SET_IF_MASKED(mask, keys, cnt, + FLOW_DISSECTOR_KEY_CT, ct); skb_flow_dissector_init(dissector, keys, cnt); } @@ -2065,6 +2145,40 @@ nla_put_failure: return -EMSGSIZE; } +static int fl_dump_key_ct(struct sk_buff *skb, + struct flow_dissector_key_ct *key, + struct flow_dissector_key_ct *mask) +{ + if (IS_ENABLED(CONFIG_NF_CONNTRACK) && + fl_dump_key_val(skb, &key->ct_state, TCA_FLOWER_KEY_CT_STATE, + &mask->ct_state, TCA_FLOWER_KEY_CT_STATE_MASK, + sizeof(key->ct_state))) + goto nla_put_failure; + + if (IS_ENABLED(CONFIG_NF_CONNTRACK_ZONES) && + fl_dump_key_val(skb, &key->ct_zone, TCA_FLOWER_KEY_CT_ZONE, + &mask->ct_zone, TCA_FLOWER_KEY_CT_ZONE_MASK, + sizeof(key->ct_zone))) + goto nla_put_failure; + + if (IS_ENABLED(CONFIG_NF_CONNTRACK_MARK) && + fl_dump_key_val(skb, &key->ct_mark, TCA_FLOWER_KEY_CT_MARK, + &mask->ct_mark, TCA_FLOWER_KEY_CT_MARK_MASK, + sizeof(key->ct_mark))) + goto nla_put_failure; + + if (IS_ENABLED(CONFIG_NF_CONNTRACK_LABELS) && + fl_dump_key_val(skb, &key->ct_labels, TCA_FLOWER_KEY_CT_LABELS, + &mask->ct_labels, TCA_FLOWER_KEY_CT_LABELS_MASK, + sizeof(key->ct_labels))) + goto 
nla_put_failure; + + return 0; + +nla_put_failure: + return -EMSGSIZE; +} + static int fl_dump_key_options(struct sk_buff *skb, int enc_opt_type, struct flow_dissector_key_enc_opts *enc_opts) { @@ -2298,6 +2412,9 @@ static int fl_dump_key(struct sk_buff *skb, struct net *net, fl_dump_key_enc_opt(skb, &key->enc_opts, &mask->enc_opts)) goto nla_put_failure; + if (fl_dump_key_ct(skb, &key->ct, &mask->ct)) + goto nla_put_failure; + if (fl_dump_key_flags(skb, key->control.flags, mask->control.flags)) goto nla_put_failure; -- cgit v1.2.3-71-gd317 From 0fa03c624d8fc9932d0f27c39a9deca6a37e0e17 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Fri, 19 Apr 2019 13:34:07 -0600 Subject: io_uring: add support for sendmsg() This is done through IORING_OP_SENDMSG. There's a new sqe->msg_flags for the flags argument, and the msghdr struct is passed in the sqe->addr field. We use MSG_DONTWAIT to force an inline fast path if sendmsg() doesn't block, and punt to async execution if it would have. Acked-by: David S. 
Miller Signed-off-by: Jens Axboe --- fs/io_uring.c | 40 ++++++++++++++++++++++++++++++++++++++++ include/linux/socket.h | 4 ++++ include/uapi/linux/io_uring.h | 2 ++ net/socket.c | 7 +++++++ 4 files changed, 53 insertions(+) (limited to 'include/uapi/linux') diff --git a/fs/io_uring.c b/fs/io_uring.c index 9f0ef4956f87..5d4cd8c4132d 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -1390,6 +1390,43 @@ static int io_sync_file_range(struct io_kiocb *req, return 0; } +static int io_sendmsg(struct io_kiocb *req, const struct io_uring_sqe *sqe, + bool force_nonblock) +{ +#if defined(CONFIG_NET) + struct socket *sock; + int ret; + + if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL)) + return -EINVAL; + + sock = sock_from_file(req->file, &ret); + if (sock) { + struct user_msghdr __user *msg; + unsigned flags; + + flags = READ_ONCE(sqe->msg_flags); + if (flags & MSG_DONTWAIT) + req->flags |= REQ_F_NOWAIT; + else if (force_nonblock) + flags |= MSG_DONTWAIT; + + msg = (struct user_msghdr __user *) (unsigned long) + READ_ONCE(sqe->addr); + + ret = __sys_sendmsg_sock(sock, msg, flags); + if (force_nonblock && ret == -EAGAIN) + return ret; + } + + io_cqring_add_event(req->ctx, sqe->user_data, ret); + io_put_req(req); + return 0; +#else + return -EOPNOTSUPP; +#endif +} + static void io_poll_remove_one(struct io_kiocb *req) { struct io_poll_iocb *poll = &req->poll; @@ -1675,6 +1712,9 @@ static int __io_submit_sqe(struct io_ring_ctx *ctx, struct io_kiocb *req, case IORING_OP_SYNC_FILE_RANGE: ret = io_sync_file_range(req, s->sqe, force_nonblock); break; + case IORING_OP_SENDMSG: + ret = io_sendmsg(req, s->sqe, force_nonblock); + break; default: ret = -EINVAL; break; diff --git a/include/linux/socket.h b/include/linux/socket.h index b57cd8bf96e2..9d770ef3ced5 100644 --- a/include/linux/socket.h +++ b/include/linux/socket.h @@ -12,6 +12,7 @@ struct pid; struct cred; +struct socket; #define __sockaddr_check_size(size) \ BUILD_BUG_ON(((size) > sizeof(struct 
__kernel_sockaddr_storage))) @@ -374,6 +375,9 @@ extern int __sys_recvmmsg(int fd, struct mmsghdr __user *mmsg, extern int __sys_sendmmsg(int fd, struct mmsghdr __user *mmsg, unsigned int vlen, unsigned int flags, bool forbid_cmsg_compat); +extern long __sys_sendmsg_sock(struct socket *sock, + struct user_msghdr __user *msg, + unsigned int flags); /* helpers which do the actual work for syscalls */ extern int __sys_recvfrom(int fd, void __user *ubuf, size_t size, diff --git a/include/uapi/linux/io_uring.h b/include/uapi/linux/io_uring.h index 10b7c45f6d57..d74742d6269f 100644 --- a/include/uapi/linux/io_uring.h +++ b/include/uapi/linux/io_uring.h @@ -27,6 +27,7 @@ struct io_uring_sqe { __u32 fsync_flags; __u16 poll_events; __u32 sync_range_flags; + __u32 msg_flags; }; __u64 user_data; /* data to be passed back at completion time */ union { @@ -58,6 +59,7 @@ struct io_uring_sqe { #define IORING_OP_POLL_ADD 6 #define IORING_OP_POLL_REMOVE 7 #define IORING_OP_SYNC_FILE_RANGE 8 +#define IORING_OP_SENDMSG 9 /* * sqe->fsync_flags diff --git a/net/socket.c b/net/socket.c index bffec466b4f1..b9536940255e 100644 --- a/net/socket.c +++ b/net/socket.c @@ -2313,6 +2313,13 @@ out_freeiov: /* * BSD sendmsg interface */ +long __sys_sendmsg_sock(struct socket *sock, struct user_msghdr __user *msg, + unsigned int flags) +{ + struct msghdr msg_sys; + + return ___sys_sendmsg(sock, msg, &msg_sys, flags, NULL, 0); +} long __sys_sendmsg(int fd, struct user_msghdr __user *msg, unsigned int flags, bool forbid_cmsg_compat) -- cgit v1.2.3-71-gd317 From aa1fa28fc73ea6b740ee7b62bf3b07141883dbb8 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Fri, 19 Apr 2019 13:38:09 -0600 Subject: io_uring: add support for recvmsg() This is done through IORING_OP_RECVMSG. This opcode uses the same sqe->msg_flags that IORING_OP_SENDMSG added, and we pass in the msghdr struct in the sqe->addr field as well. 
We use MSG_DONTWAIT to force an inline fast path if recvmsg() doesn't block, and punt to async execution if it would have. Acked-by: David S. Miller Signed-off-by: Jens Axboe --- fs/io_uring.c | 31 +++++++++++++++++++++++++++---- include/linux/socket.h | 3 +++ include/uapi/linux/io_uring.h | 1 + net/socket.c | 8 ++++++++ 4 files changed, 39 insertions(+), 4 deletions(-) (limited to 'include/uapi/linux') diff --git a/fs/io_uring.c b/fs/io_uring.c index 5d4cd8c4132d..8d86e31b0762 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -1390,10 +1390,12 @@ static int io_sync_file_range(struct io_kiocb *req, return 0; } -static int io_sendmsg(struct io_kiocb *req, const struct io_uring_sqe *sqe, - bool force_nonblock) -{ #if defined(CONFIG_NET) +static int io_send_recvmsg(struct io_kiocb *req, const struct io_uring_sqe *sqe, + bool force_nonblock, + long (*fn)(struct socket *, struct user_msghdr __user *, + unsigned int)) +{ struct socket *sock; int ret; @@ -1414,7 +1416,7 @@ static int io_sendmsg(struct io_kiocb *req, const struct io_uring_sqe *sqe, msg = (struct user_msghdr __user *) (unsigned long) READ_ONCE(sqe->addr); - ret = __sys_sendmsg_sock(sock, msg, flags); + ret = fn(sock, msg, flags); if (force_nonblock && ret == -EAGAIN) return ret; } @@ -1422,6 +1424,24 @@ static int io_sendmsg(struct io_kiocb *req, const struct io_uring_sqe *sqe, io_cqring_add_event(req->ctx, sqe->user_data, ret); io_put_req(req); return 0; +} +#endif + +static int io_sendmsg(struct io_kiocb *req, const struct io_uring_sqe *sqe, + bool force_nonblock) +{ +#if defined(CONFIG_NET) + return io_send_recvmsg(req, sqe, force_nonblock, __sys_sendmsg_sock); +#else + return -EOPNOTSUPP; +#endif +} + +static int io_recvmsg(struct io_kiocb *req, const struct io_uring_sqe *sqe, + bool force_nonblock) +{ +#if defined(CONFIG_NET) + return io_send_recvmsg(req, sqe, force_nonblock, __sys_recvmsg_sock); #else return -EOPNOTSUPP; #endif @@ -1715,6 +1735,9 @@ static int __io_submit_sqe(struct io_ring_ctx *ctx, 
struct io_kiocb *req, case IORING_OP_SENDMSG: ret = io_sendmsg(req, s->sqe, force_nonblock); break; + case IORING_OP_RECVMSG: + ret = io_recvmsg(req, s->sqe, force_nonblock); + break; default: ret = -EINVAL; break; diff --git a/include/linux/socket.h b/include/linux/socket.h index 9d770ef3ced5..97523818cb14 100644 --- a/include/linux/socket.h +++ b/include/linux/socket.h @@ -378,6 +378,9 @@ extern int __sys_sendmmsg(int fd, struct mmsghdr __user *mmsg, extern long __sys_sendmsg_sock(struct socket *sock, struct user_msghdr __user *msg, unsigned int flags); +extern long __sys_recvmsg_sock(struct socket *sock, + struct user_msghdr __user *msg, + unsigned int flags); /* helpers which do the actual work for syscalls */ extern int __sys_recvfrom(int fd, void __user *ubuf, size_t size, diff --git a/include/uapi/linux/io_uring.h b/include/uapi/linux/io_uring.h index d74742d6269f..1e1652f25cc1 100644 --- a/include/uapi/linux/io_uring.h +++ b/include/uapi/linux/io_uring.h @@ -60,6 +60,7 @@ struct io_uring_sqe { #define IORING_OP_POLL_REMOVE 7 #define IORING_OP_SYNC_FILE_RANGE 8 #define IORING_OP_SENDMSG 9 +#define IORING_OP_RECVMSG 10 /* * sqe->fsync_flags diff --git a/net/socket.c b/net/socket.c index b9536940255e..98354cc18840 100644 --- a/net/socket.c +++ b/net/socket.c @@ -2494,6 +2494,14 @@ out_freeiov: * BSD recvmsg interface */ +long __sys_recvmsg_sock(struct socket *sock, struct user_msghdr __user *msg, + unsigned int flags) +{ + struct msghdr msg_sys; + + return ___sys_recvmsg(sock, msg, &msg_sys, flags, 0); +} + long __sys_recvmsg(int fd, struct user_msghdr __user *msg, unsigned int flags, bool forbid_cmsg_compat) { -- cgit v1.2.3-71-gd317 From c9626a2cbdb20e26587b3fad99960520a023432b Mon Sep 17 00:00:00 2001 From: Pablo Neira Ayuso Date: Tue, 9 Jul 2019 23:00:43 +0200 Subject: netfilter: nf_tables: add hardware offload support This patch adds hardware offload support for nftables through the existing netdev_ops->ndo_setup_tc() interface, the TC_SETUP_CLSFLOWER 
classifier and the flow rule API. This hardware offload support is available for the NFPROTO_NETDEV family and the ingress hook. Each nftables expression has a new ->offload interface, that is used to populate the flow rule object that is attached to the transaction object. There is a new per-table NFT_TABLE_F_HW flag, that is set to offload an entire table, including all of its chains. This patch adds support for basic metadata (layer 3 and 4 protocol numbers), 5-tuple payload matching and the accept/drop actions; this also includes basechain hardware offload only. Signed-off-by: Pablo Neira Ayuso Signed-off-by: David S. Miller --- include/net/netfilter/nf_tables.h | 14 ++ include/net/netfilter/nf_tables_offload.h | 76 +++++++++ include/uapi/linux/netfilter/nf_tables.h | 2 + net/netfilter/Makefile | 2 +- net/netfilter/nf_tables_api.c | 39 ++++- net/netfilter/nf_tables_offload.c | 267 ++++++++++++++++++++++++++++++ net/netfilter/nft_cmp.c | 53 ++++++ net/netfilter/nft_immediate.c | 31 ++++ net/netfilter/nft_meta.c | 27 +++ net/netfilter/nft_payload.c | 187 +++++++++++++++++++++ 10 files changed, 691 insertions(+), 7 deletions(-) create mode 100644 include/net/netfilter/nf_tables_offload.h create mode 100644 net/netfilter/nf_tables_offload.c (limited to 'include/uapi/linux') diff --git a/include/net/netfilter/nf_tables.h b/include/net/netfilter/nf_tables.h index 9e8493aad49d..35dfdd9f69b3 100644 --- a/include/net/netfilter/nf_tables.h +++ b/include/net/netfilter/nf_tables.h @@ -161,6 +161,7 @@ struct nft_ctx { const struct nlattr * const *nla; u32 portid; u32 seq; + u16 flags; u8 family; u8 level; bool report; @@ -735,6 +736,9 @@ enum nft_trans_phase { NFT_TRANS_RELEASE }; +struct nft_flow_rule; +struct nft_offload_ctx; + /** * struct nft_expr_ops - nf_tables expression operations * @@ -777,6 +781,10 @@ struct nft_expr_ops { const struct nft_data **data); bool (*gc)(struct net *net, const struct nft_expr *expr); + int (*offload)(struct nft_offload_ctx *ctx, + struct 
nft_flow_rule *flow, + const struct nft_expr *expr); + u32 offload_flags; const struct nft_expr_type *type; void *data; }; @@ -859,6 +867,7 @@ static inline struct nft_userdata *nft_userdata(const struct nft_rule *rule) enum nft_chain_flags { NFT_BASE_CHAIN = 0x1, + NFT_CHAIN_HW_OFFLOAD = 0x2, }; /** @@ -942,6 +951,7 @@ struct nft_stats { * @stats: per-cpu chain stats * @chain: the chain * @dev_name: device name that this base chain is attached to (if any) + * @cb_list: list of flow block callbacks (for hardware offload) */ struct nft_base_chain { struct nf_hook_ops ops; @@ -951,6 +961,7 @@ struct nft_base_chain { struct nft_stats __percpu *stats; struct nft_chain chain; char dev_name[IFNAMSIZ]; + struct list_head cb_list; }; static inline struct nft_base_chain *nft_base_chain(const struct nft_chain *chain) @@ -1322,11 +1333,14 @@ struct nft_trans { struct nft_trans_rule { struct nft_rule *rule; + struct nft_flow_rule *flow; u32 rule_id; }; #define nft_trans_rule(trans) \ (((struct nft_trans_rule *)trans->data)->rule) +#define nft_trans_flow_rule(trans) \ + (((struct nft_trans_rule *)trans->data)->flow) #define nft_trans_rule_id(trans) \ (((struct nft_trans_rule *)trans->data)->rule_id) diff --git a/include/net/netfilter/nf_tables_offload.h b/include/net/netfilter/nf_tables_offload.h new file mode 100644 index 000000000000..3196663a10e3 --- /dev/null +++ b/include/net/netfilter/nf_tables_offload.h @@ -0,0 +1,76 @@ +#ifndef _NET_NF_TABLES_OFFLOAD_H +#define _NET_NF_TABLES_OFFLOAD_H + +#include +#include + +struct nft_offload_reg { + u32 key; + u32 len; + u32 base_offset; + u32 offset; + struct nft_data mask; +}; + +enum nft_offload_dep_type { + NFT_OFFLOAD_DEP_UNSPEC = 0, + NFT_OFFLOAD_DEP_NETWORK, + NFT_OFFLOAD_DEP_TRANSPORT, +}; + +struct nft_offload_ctx { + struct { + enum nft_offload_dep_type type; + __be16 l3num; + u8 protonum; + } dep; + unsigned int num_actions; + struct nft_offload_reg regs[NFT_REG32_15 + 1]; +}; + +void nft_offload_set_dependency(struct 
nft_offload_ctx *ctx, + enum nft_offload_dep_type type); +void nft_offload_update_dependency(struct nft_offload_ctx *ctx, + const void *data, u32 len); + +struct nft_flow_key { + struct flow_dissector_key_basic basic; + union { + struct flow_dissector_key_ipv4_addrs ipv4; + struct flow_dissector_key_ipv6_addrs ipv6; + }; + struct flow_dissector_key_ports tp; + struct flow_dissector_key_ip ip; + struct flow_dissector_key_vlan vlan; + struct flow_dissector_key_eth_addrs eth_addrs; +} __aligned(BITS_PER_LONG / 8); /* Ensure that we can do comparisons as longs. */ + +struct nft_flow_match { + struct flow_dissector dissector; + struct nft_flow_key key; + struct nft_flow_key mask; +}; + +struct nft_flow_rule { + __be16 proto; + struct nft_flow_match match; + struct flow_rule *rule; +}; + +#define NFT_OFFLOAD_F_ACTION (1 << 0) + +struct nft_rule; +struct nft_flow_rule *nft_flow_rule_create(const struct nft_rule *rule); +void nft_flow_rule_destroy(struct nft_flow_rule *flow); +int nft_flow_rule_offload_commit(struct net *net); + +#define NFT_OFFLOAD_MATCH(__key, __base, __field, __len, __reg) \ + (__reg)->base_offset = \ + offsetof(struct nft_flow_key, __base); \ + (__reg)->offset = \ + offsetof(struct nft_flow_key, __base.__field); \ + (__reg)->len = __len; \ + (__reg)->key = __key; \ + memset(&(__reg)->mask, 0xff, (__reg)->len); + +#endif diff --git a/include/uapi/linux/netfilter/nf_tables.h b/include/uapi/linux/netfilter/nf_tables.h index 0e3462dfb182..82abaa183fc3 100644 --- a/include/uapi/linux/netfilter/nf_tables.h +++ b/include/uapi/linux/netfilter/nf_tables.h @@ -192,6 +192,7 @@ enum nft_table_attributes { * @NFTA_CHAIN_USE: number of references to this chain (NLA_U32) * @NFTA_CHAIN_TYPE: type name of the string (NLA_NUL_STRING) * @NFTA_CHAIN_COUNTERS: counter specification of the chain (NLA_NESTED: nft_counter_attributes) + * @NFTA_CHAIN_FLAGS: chain flags */ enum nft_chain_attributes { NFTA_CHAIN_UNSPEC, @@ -204,6 +205,7 @@ enum nft_chain_attributes { 
NFTA_CHAIN_TYPE, NFTA_CHAIN_COUNTERS, NFTA_CHAIN_PAD, + NFTA_CHAIN_FLAGS, __NFTA_CHAIN_MAX }; #define NFTA_CHAIN_MAX (__NFTA_CHAIN_MAX - 1) diff --git a/net/netfilter/Makefile b/net/netfilter/Makefile index deada20975ff..9270a7fae484 100644 --- a/net/netfilter/Makefile +++ b/net/netfilter/Makefile @@ -78,7 +78,7 @@ nf_tables-objs := nf_tables_core.o nf_tables_api.o nft_chain_filter.o \ nf_tables_trace.o nft_immediate.o nft_cmp.o nft_range.o \ nft_bitwise.o nft_byteorder.o nft_payload.o nft_lookup.o \ nft_dynset.o nft_meta.o nft_rt.o nft_exthdr.o \ - nft_chain_route.o + nft_chain_route.o nf_tables_offload.o nf_tables_set-objs := nf_tables_set_core.o \ nft_set_hash.o nft_set_bitmap.o nft_set_rbtree.o diff --git a/net/netfilter/nf_tables_api.c b/net/netfilter/nf_tables_api.c index d22d00ca78c1..ed17a7c29b86 100644 --- a/net/netfilter/nf_tables_api.c +++ b/net/netfilter/nf_tables_api.c @@ -18,6 +18,7 @@ #include #include #include +#include #include #include @@ -97,6 +98,7 @@ static void nft_ctx_init(struct nft_ctx *ctx, ctx->nla = nla; ctx->portid = NETLINK_CB(skb).portid; ctx->report = nlmsg_report(nlh); + ctx->flags = nlh->nlmsg_flags; ctx->seq = nlh->nlmsg_seq; } @@ -1169,6 +1171,7 @@ static const struct nla_policy nft_chain_policy[NFTA_CHAIN_MAX + 1] = { [NFTA_CHAIN_POLICY] = { .type = NLA_U32 }, [NFTA_CHAIN_TYPE] = { .type = NLA_STRING }, [NFTA_CHAIN_COUNTERS] = { .type = NLA_NESTED }, + [NFTA_CHAIN_FLAGS] = { .type = NLA_U32 }, }; static const struct nla_policy nft_hook_policy[NFTA_HOOK_MAX + 1] = { @@ -1603,7 +1606,7 @@ static struct nft_rule **nf_tables_chain_alloc_rules(const struct nft_chain *cha } static int nf_tables_addchain(struct nft_ctx *ctx, u8 family, u8 genmask, - u8 policy) + u8 policy, u32 flags) { const struct nlattr * const *nla = ctx->nla; struct nft_table *table = ctx->table; @@ -1657,8 +1660,9 @@ static int nf_tables_addchain(struct nft_ctx *ctx, u8 family, u8 genmask, ops->hook = hook.type->hooks[ops->hooknum]; ops->dev = hook.dev; - 
chain->flags |= NFT_BASE_CHAIN; + chain->flags |= NFT_BASE_CHAIN | flags; basechain->policy = NF_ACCEPT; + INIT_LIST_HEAD(&basechain->cb_list); } else { chain = kzalloc(sizeof(*chain), GFP_KERNEL); if (chain == NULL) @@ -1718,7 +1722,8 @@ err1: return err; } -static int nf_tables_updchain(struct nft_ctx *ctx, u8 genmask, u8 policy) +static int nf_tables_updchain(struct nft_ctx *ctx, u8 genmask, u8 policy, + u32 flags) { const struct nlattr * const *nla = ctx->nla; struct nft_table *table = ctx->table; @@ -1730,6 +1735,9 @@ static int nf_tables_updchain(struct nft_ctx *ctx, u8 genmask, u8 policy) struct nft_trans *trans; int err; + if (chain->flags ^ flags) + return -EOPNOTSUPP; + if (nla[NFTA_CHAIN_HOOK]) { if (!nft_is_base_chain(chain)) return -EBUSY; @@ -1835,6 +1843,7 @@ static int nf_tables_newchain(struct net *net, struct sock *nlsk, u8 policy = NF_ACCEPT; struct nft_ctx ctx; u64 handle = 0; + u32 flags = 0; lockdep_assert_held(&net->nft.commit_mutex); @@ -1889,6 +1898,9 @@ static int nf_tables_newchain(struct net *net, struct sock *nlsk, } } + if (nla[NFTA_CHAIN_FLAGS]) + flags = ntohl(nla_get_be32(nla[NFTA_CHAIN_FLAGS])); + nft_ctx_init(&ctx, net, skb, nlh, family, table, chain, nla); if (chain != NULL) { @@ -1899,10 +1911,10 @@ static int nf_tables_newchain(struct net *net, struct sock *nlsk, if (nlh->nlmsg_flags & NLM_F_REPLACE) return -EOPNOTSUPP; - return nf_tables_updchain(&ctx, genmask, policy); + return nf_tables_updchain(&ctx, genmask, policy, flags); } - return nf_tables_addchain(&ctx, family, genmask, policy); + return nf_tables_addchain(&ctx, family, genmask, policy, flags); } static int nf_tables_delchain(struct net *net, struct sock *nlsk, @@ -2658,6 +2670,7 @@ static int nf_tables_newrule(struct net *net, struct sock *nlsk, u8 genmask = nft_genmask_next(net); struct nft_expr_info *info = NULL; int family = nfmsg->nfgen_family; + struct nft_flow_rule *flow; struct nft_table *table; struct nft_chain *chain; struct nft_rule *rule, *old_rule = 
NULL; @@ -2804,7 +2817,8 @@ static int nf_tables_newrule(struct net *net, struct sock *nlsk, list_add_tail_rcu(&rule->list, &old_rule->list); } else { - if (nft_trans_rule_add(&ctx, NFT_MSG_NEWRULE, rule) == NULL) { + trans = nft_trans_rule_add(&ctx, NFT_MSG_NEWRULE, rule); + if (!trans) { err = -ENOMEM; goto err2; } @@ -2827,6 +2841,14 @@ static int nf_tables_newrule(struct net *net, struct sock *nlsk, if (net->nft.validate_state == NFT_VALIDATE_DO) return nft_table_validate(net, table); + if (chain->flags & NFT_CHAIN_HW_OFFLOAD) { + flow = nft_flow_rule_create(rule); + if (IS_ERR(flow)) + return PTR_ERR(flow); + + nft_trans_flow_rule(trans) = flow; + } + return 0; err2: nf_tables_rule_release(&ctx, rule); @@ -6624,6 +6646,7 @@ static int nf_tables_commit(struct net *net, struct sk_buff *skb) struct nft_trans_elem *te; struct nft_chain *chain; struct nft_table *table; + int err; if (list_empty(&net->nft.commit_list)) { mutex_unlock(&net->nft.commit_mutex); @@ -6634,6 +6657,10 @@ static int nf_tables_commit(struct net *net, struct sk_buff *skb) if (nf_tables_validate(net) < 0) return -EAGAIN; + err = nft_flow_rule_offload_commit(net); + if (err < 0) + return err; + /* 1. 
Allocate space for next generation rules_gen_X[] */ list_for_each_entry_safe(trans, next, &net->nft.commit_list, list) { int ret; diff --git a/net/netfilter/nf_tables_offload.c b/net/netfilter/nf_tables_offload.c new file mode 100644 index 000000000000..2c3302845f67 --- /dev/null +++ b/net/netfilter/nf_tables_offload.c @@ -0,0 +1,267 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#include +#include +#include +#include +#include +#include +#include + +static struct nft_flow_rule *nft_flow_rule_alloc(int num_actions) +{ + struct nft_flow_rule *flow; + + flow = kzalloc(sizeof(struct nft_flow_rule), GFP_KERNEL); + if (!flow) + return NULL; + + flow->rule = flow_rule_alloc(num_actions); + if (!flow->rule) { + kfree(flow); + return NULL; + } + + flow->rule->match.dissector = &flow->match.dissector; + flow->rule->match.mask = &flow->match.mask; + flow->rule->match.key = &flow->match.key; + + return flow; +} + +struct nft_flow_rule *nft_flow_rule_create(const struct nft_rule *rule) +{ + struct nft_offload_ctx ctx = { + .dep = { + .type = NFT_OFFLOAD_DEP_UNSPEC, + }, + }; + struct nft_flow_rule *flow; + int num_actions = 0, err; + struct nft_expr *expr; + + expr = nft_expr_first(rule); + while (expr->ops && expr != nft_expr_last(rule)) { + if (expr->ops->offload_flags & NFT_OFFLOAD_F_ACTION) + num_actions++; + + expr = nft_expr_next(expr); + } + + flow = nft_flow_rule_alloc(num_actions); + if (!flow) + return ERR_PTR(-ENOMEM); + + expr = nft_expr_first(rule); + while (expr->ops && expr != nft_expr_last(rule)) { + if (!expr->ops->offload) { + err = -EOPNOTSUPP; + goto err_out; + } + err = expr->ops->offload(&ctx, flow, expr); + if (err < 0) + goto err_out; + + expr = nft_expr_next(expr); + } + flow->proto = ctx.dep.l3num; + + return flow; +err_out: + nft_flow_rule_destroy(flow); + + return ERR_PTR(err); +} + +void nft_flow_rule_destroy(struct nft_flow_rule *flow) +{ + kfree(flow->rule); + kfree(flow); +} + +void nft_offload_set_dependency(struct nft_offload_ctx *ctx, + enum 
nft_offload_dep_type type) +{ + ctx->dep.type = type; +} + +void nft_offload_update_dependency(struct nft_offload_ctx *ctx, + const void *data, u32 len) +{ + switch (ctx->dep.type) { + case NFT_OFFLOAD_DEP_NETWORK: + WARN_ON(len != sizeof(__u16)); + memcpy(&ctx->dep.l3num, data, sizeof(__u16)); + break; + case NFT_OFFLOAD_DEP_TRANSPORT: + WARN_ON(len != sizeof(__u8)); + memcpy(&ctx->dep.protonum, data, sizeof(__u8)); + break; + default: + break; + } + ctx->dep.type = NFT_OFFLOAD_DEP_UNSPEC; +} + +static void nft_flow_offload_common_init(struct flow_cls_common_offload *common, + __be16 proto, + struct netlink_ext_ack *extack) +{ + common->protocol = proto; + common->extack = extack; +} + +static int nft_setup_cb_call(struct nft_base_chain *basechain, + enum tc_setup_type type, void *type_data) +{ + struct flow_block_cb *block_cb; + int err; + + list_for_each_entry(block_cb, &basechain->cb_list, list) { + err = block_cb->cb(type, type_data, block_cb->cb_priv); + if (err < 0) + return err; + } + return 0; +} + +static int nft_flow_offload_rule(struct nft_trans *trans, + enum flow_cls_command command) +{ + struct nft_flow_rule *flow = nft_trans_flow_rule(trans); + struct nft_rule *rule = nft_trans_rule(trans); + struct flow_cls_offload cls_flow = {}; + struct nft_base_chain *basechain; + struct netlink_ext_ack extack; + __be16 proto = ETH_P_ALL; + + if (!nft_is_base_chain(trans->ctx.chain)) + return -EOPNOTSUPP; + + basechain = nft_base_chain(trans->ctx.chain); + + if (flow) + proto = flow->proto; + + nft_flow_offload_common_init(&cls_flow.common, proto, &extack); + cls_flow.command = command; + cls_flow.cookie = (unsigned long) rule; + if (flow) + cls_flow.rule = flow->rule; + + return nft_setup_cb_call(basechain, TC_SETUP_CLSFLOWER, &cls_flow); +} + +static int nft_flow_offload_bind(struct flow_block_offload *bo, + struct nft_base_chain *basechain) +{ + list_splice(&bo->cb_list, &basechain->cb_list); + return 0; +} + +static int nft_flow_offload_unbind(struct 
flow_block_offload *bo, + struct nft_base_chain *basechain) +{ + struct flow_block_cb *block_cb, *next; + + list_for_each_entry_safe(block_cb, next, &bo->cb_list, list) { + list_del(&block_cb->list); + flow_block_cb_free(block_cb); + } + + return 0; +} + +#define FLOW_SETUP_BLOCK TC_SETUP_BLOCK + +static int nft_flow_offload_chain(struct nft_trans *trans, + enum flow_block_command cmd) +{ + struct nft_chain *chain = trans->ctx.chain; + struct netlink_ext_ack extack = {}; + struct flow_block_offload bo = {}; + struct nft_base_chain *basechain; + struct net_device *dev; + int err; + + if (!nft_is_base_chain(chain)) + return -EOPNOTSUPP; + + basechain = nft_base_chain(chain); + dev = basechain->ops.dev; + if (!dev || !dev->netdev_ops->ndo_setup_tc) + return -EOPNOTSUPP; + + /* Only default policy to accept is supported for now. */ + if (cmd == FLOW_BLOCK_BIND && + nft_trans_chain_policy(trans) != -1 && + nft_trans_chain_policy(trans) != NF_ACCEPT) + return -EOPNOTSUPP; + + bo.command = cmd; + bo.binder_type = FLOW_BLOCK_BINDER_TYPE_CLSACT_INGRESS; + bo.extack = &extack; + INIT_LIST_HEAD(&bo.cb_list); + + err = dev->netdev_ops->ndo_setup_tc(dev, FLOW_SETUP_BLOCK, &bo); + if (err < 0) + return err; + + switch (cmd) { + case FLOW_BLOCK_BIND: + err = nft_flow_offload_bind(&bo, basechain); + break; + case FLOW_BLOCK_UNBIND: + err = nft_flow_offload_unbind(&bo, basechain); + break; + } + + return err; +} + +int nft_flow_rule_offload_commit(struct net *net) +{ + struct nft_trans *trans; + int err = 0; + + list_for_each_entry(trans, &net->nft.commit_list, list) { + if (trans->ctx.family != NFPROTO_NETDEV) + continue; + + switch (trans->msg_type) { + case NFT_MSG_NEWCHAIN: + if (!(trans->ctx.chain->flags & NFT_CHAIN_HW_OFFLOAD)) + continue; + + err = nft_flow_offload_chain(trans, FLOW_BLOCK_BIND); + break; + case NFT_MSG_DELCHAIN: + if (!(trans->ctx.chain->flags & NFT_CHAIN_HW_OFFLOAD)) + continue; + + err = nft_flow_offload_chain(trans, FLOW_BLOCK_UNBIND); + break; + case 
NFT_MSG_NEWRULE: + if (!(trans->ctx.chain->flags & NFT_CHAIN_HW_OFFLOAD)) + continue; + + if (trans->ctx.flags & NLM_F_REPLACE || + !(trans->ctx.flags & NLM_F_APPEND)) + return -EOPNOTSUPP; + + err = nft_flow_offload_rule(trans, FLOW_CLS_REPLACE); + nft_flow_rule_destroy(nft_trans_flow_rule(trans)); + break; + case NFT_MSG_DELRULE: + if (!(trans->ctx.chain->flags & NFT_CHAIN_HW_OFFLOAD)) + continue; + + err = nft_flow_offload_rule(trans, FLOW_CLS_DESTROY); + break; + } + + if (err) + return err; + } + + return err; +} diff --git a/net/netfilter/nft_cmp.c b/net/netfilter/nft_cmp.c index 411c0cf741e3..bd173b1824c6 100644 --- a/net/netfilter/nft_cmp.c +++ b/net/netfilter/nft_cmp.c @@ -12,6 +12,7 @@ #include #include #include +#include #include struct nft_cmp_expr { @@ -107,12 +108,44 @@ nla_put_failure: return -1; } +static int __nft_cmp_offload(struct nft_offload_ctx *ctx, + struct nft_flow_rule *flow, + const struct nft_cmp_expr *priv) +{ + struct nft_offload_reg *reg = &ctx->regs[priv->sreg]; + u8 *mask = (u8 *)&flow->match.mask; + u8 *key = (u8 *)&flow->match.key; + + if (priv->op != NFT_CMP_EQ) + return -EOPNOTSUPP; + + memcpy(key + reg->offset, &priv->data, priv->len); + memcpy(mask + reg->offset, &reg->mask, priv->len); + + flow->match.dissector.used_keys |= BIT(reg->key); + flow->match.dissector.offset[reg->key] = reg->base_offset; + + nft_offload_update_dependency(ctx, &priv->data, priv->len); + + return 0; +} + +static int nft_cmp_offload(struct nft_offload_ctx *ctx, + struct nft_flow_rule *flow, + const struct nft_expr *expr) +{ + const struct nft_cmp_expr *priv = nft_expr_priv(expr); + + return __nft_cmp_offload(ctx, flow, priv); +} + static const struct nft_expr_ops nft_cmp_ops = { .type = &nft_cmp_type, .size = NFT_EXPR_SIZE(sizeof(struct nft_cmp_expr)), .eval = nft_cmp_eval, .init = nft_cmp_init, .dump = nft_cmp_dump, + .offload = nft_cmp_offload, }; static int nft_cmp_fast_init(const struct nft_ctx *ctx, @@ -143,6 +176,25 @@ static int 
nft_cmp_fast_init(const struct nft_ctx *ctx, return 0; } +static int nft_cmp_fast_offload(struct nft_offload_ctx *ctx, + struct nft_flow_rule *flow, + const struct nft_expr *expr) +{ + const struct nft_cmp_fast_expr *priv = nft_expr_priv(expr); + struct nft_cmp_expr cmp = { + .data = { + .data = { + [0] = priv->data, + }, + }, + .sreg = priv->sreg, + .len = priv->len / BITS_PER_BYTE, + .op = NFT_CMP_EQ, + }; + + return __nft_cmp_offload(ctx, flow, &cmp); +} + static int nft_cmp_fast_dump(struct sk_buff *skb, const struct nft_expr *expr) { const struct nft_cmp_fast_expr *priv = nft_expr_priv(expr); @@ -169,6 +221,7 @@ const struct nft_expr_ops nft_cmp_fast_ops = { .eval = NULL, /* inlined */ .init = nft_cmp_fast_init, .dump = nft_cmp_fast_dump, + .offload = nft_cmp_fast_offload, }; static const struct nft_expr_ops * diff --git a/net/netfilter/nft_immediate.c b/net/netfilter/nft_immediate.c index cb8547f97220..ca2ae4b95a8d 100644 --- a/net/netfilter/nft_immediate.c +++ b/net/netfilter/nft_immediate.c @@ -13,6 +13,7 @@ #include #include #include +#include void nft_immediate_eval(const struct nft_expr *expr, struct nft_regs *regs, @@ -124,6 +125,34 @@ static int nft_immediate_validate(const struct nft_ctx *ctx, return 0; } +static int nft_immediate_offload(struct nft_offload_ctx *ctx, + struct nft_flow_rule *flow, + const struct nft_expr *expr) +{ + const struct nft_immediate_expr *priv = nft_expr_priv(expr); + struct flow_action_entry *entry; + const struct nft_data *data; + + if (priv->dreg != NFT_REG_VERDICT) + return -EOPNOTSUPP; + + entry = &flow->rule->action.entries[ctx->num_actions++]; + + data = &priv->data; + switch (data->verdict.code) { + case NF_ACCEPT: + entry->id = FLOW_ACTION_ACCEPT; + break; + case NF_DROP: + entry->id = FLOW_ACTION_DROP; + break; + default: + return -EOPNOTSUPP; + } + + return 0; +} + static const struct nft_expr_ops nft_imm_ops = { .type = &nft_imm_type, .size = NFT_EXPR_SIZE(sizeof(struct nft_immediate_expr)), @@ -133,6 +162,8 @@ 
static const struct nft_expr_ops nft_imm_ops = { .deactivate = nft_immediate_deactivate, .dump = nft_immediate_dump, .validate = nft_immediate_validate, + .offload = nft_immediate_offload, + .offload_flags = NFT_OFFLOAD_F_ACTION, }; struct nft_expr_type nft_imm_type __read_mostly = { diff --git a/net/netfilter/nft_meta.c b/net/netfilter/nft_meta.c index 417f8d32e9a3..76866f77e343 100644 --- a/net/netfilter/nft_meta.c +++ b/net/netfilter/nft_meta.c @@ -22,6 +22,7 @@ #include #include #include +#include #include /* NF_BR_PRE_ROUTING */ @@ -490,6 +491,31 @@ void nft_meta_set_destroy(const struct nft_ctx *ctx, } EXPORT_SYMBOL_GPL(nft_meta_set_destroy); +static int nft_meta_get_offload(struct nft_offload_ctx *ctx, + struct nft_flow_rule *flow, + const struct nft_expr *expr) +{ + const struct nft_meta *priv = nft_expr_priv(expr); + struct nft_offload_reg *reg = &ctx->regs[priv->dreg]; + + switch (priv->key) { + case NFT_META_PROTOCOL: + NFT_OFFLOAD_MATCH(FLOW_DISSECTOR_KEY_BASIC, basic, n_proto, + sizeof(__u16), reg); + nft_offload_set_dependency(ctx, NFT_OFFLOAD_DEP_NETWORK); + break; + case NFT_META_L4PROTO: + NFT_OFFLOAD_MATCH(FLOW_DISSECTOR_KEY_BASIC, basic, ip_proto, + sizeof(__u8), reg); + nft_offload_set_dependency(ctx, NFT_OFFLOAD_DEP_TRANSPORT); + break; + default: + return -EOPNOTSUPP; + } + + return 0; +} + static const struct nft_expr_ops nft_meta_get_ops = { .type = &nft_meta_type, .size = NFT_EXPR_SIZE(sizeof(struct nft_meta)), @@ -497,6 +523,7 @@ static const struct nft_expr_ops nft_meta_get_ops = { .init = nft_meta_get_init, .dump = nft_meta_get_dump, .validate = nft_meta_get_validate, + .offload = nft_meta_get_offload, }; static const struct nft_expr_ops nft_meta_set_ops = { diff --git a/net/netfilter/nft_payload.c b/net/netfilter/nft_payload.c index 1260f78a034d..22a80eb60222 100644 --- a/net/netfilter/nft_payload.c +++ b/net/netfilter/nft_payload.c @@ -15,10 +15,13 @@ #include #include #include +#include /* For layer 4 checksum field offset. 
*/ #include #include #include +#include +#include /* add vlan header into the user buffer for if tag was removed by offloads */ static bool @@ -150,12 +153,195 @@ nla_put_failure: return -1; } +static int nft_payload_offload_ll(struct nft_offload_ctx *ctx, + struct nft_flow_rule *flow, + const struct nft_payload *priv) +{ + struct nft_offload_reg *reg = &ctx->regs[priv->dreg]; + + switch (priv->offset) { + case offsetof(struct ethhdr, h_source): + NFT_OFFLOAD_MATCH(FLOW_DISSECTOR_KEY_ETH_ADDRS, eth_addrs, + src, ETH_ALEN, reg); + break; + case offsetof(struct ethhdr, h_dest): + NFT_OFFLOAD_MATCH(FLOW_DISSECTOR_KEY_ETH_ADDRS, eth_addrs, + dst, ETH_ALEN, reg); + break; + } + + return 0; +} + +static int nft_payload_offload_ip(struct nft_offload_ctx *ctx, + struct nft_flow_rule *flow, + const struct nft_payload *priv) +{ + struct nft_offload_reg *reg = &ctx->regs[priv->dreg]; + + switch (priv->offset) { + case offsetof(struct iphdr, saddr): + NFT_OFFLOAD_MATCH(FLOW_DISSECTOR_KEY_IPV4_ADDRS, ipv4, src, + sizeof(struct in_addr), reg); + break; + case offsetof(struct iphdr, daddr): + NFT_OFFLOAD_MATCH(FLOW_DISSECTOR_KEY_IPV4_ADDRS, ipv4, dst, + sizeof(struct in_addr), reg); + break; + case offsetof(struct iphdr, protocol): + NFT_OFFLOAD_MATCH(FLOW_DISSECTOR_KEY_BASIC, basic, ip_proto, + sizeof(__u8), reg); + nft_offload_set_dependency(ctx, NFT_OFFLOAD_DEP_TRANSPORT); + break; + default: + return -EOPNOTSUPP; + } + + return 0; +} + +static int nft_payload_offload_ip6(struct nft_offload_ctx *ctx, + struct nft_flow_rule *flow, + const struct nft_payload *priv) +{ + struct nft_offload_reg *reg = &ctx->regs[priv->dreg]; + + switch (priv->offset) { + case offsetof(struct ipv6hdr, saddr): + NFT_OFFLOAD_MATCH(FLOW_DISSECTOR_KEY_IPV6_ADDRS, ipv6, src, + sizeof(struct in6_addr), reg); + break; + case offsetof(struct ipv6hdr, daddr): + NFT_OFFLOAD_MATCH(FLOW_DISSECTOR_KEY_IPV6_ADDRS, ipv6, dst, + sizeof(struct in6_addr), reg); + break; + case offsetof(struct ipv6hdr, nexthdr): + 
NFT_OFFLOAD_MATCH(FLOW_DISSECTOR_KEY_BASIC, basic, ip_proto, + sizeof(__u8), reg); + nft_offload_set_dependency(ctx, NFT_OFFLOAD_DEP_TRANSPORT); + break; + default: + return -EOPNOTSUPP; + } + + return 0; +} + +static int nft_payload_offload_nh(struct nft_offload_ctx *ctx, + struct nft_flow_rule *flow, + const struct nft_payload *priv) +{ + int err; + + switch (ctx->dep.l3num) { + case htons(ETH_P_IP): + err = nft_payload_offload_ip(ctx, flow, priv); + break; + case htons(ETH_P_IPV6): + err = nft_payload_offload_ip6(ctx, flow, priv); + break; + default: + return -EOPNOTSUPP; + } + + return err; +} + +static int nft_payload_offload_tcp(struct nft_offload_ctx *ctx, + struct nft_flow_rule *flow, + const struct nft_payload *priv) +{ + struct nft_offload_reg *reg = &ctx->regs[priv->dreg]; + + switch (priv->offset) { + case offsetof(struct tcphdr, source): + NFT_OFFLOAD_MATCH(FLOW_DISSECTOR_KEY_PORTS, tp, src, + sizeof(__be16), reg); + break; + case offsetof(struct tcphdr, dest): + NFT_OFFLOAD_MATCH(FLOW_DISSECTOR_KEY_PORTS, tp, dst, + sizeof(__be16), reg); + break; + default: + return -EOPNOTSUPP; + } + + return 0; +} + +static int nft_payload_offload_udp(struct nft_offload_ctx *ctx, + struct nft_flow_rule *flow, + const struct nft_payload *priv) +{ + struct nft_offload_reg *reg = &ctx->regs[priv->dreg]; + + switch (priv->offset) { + case offsetof(struct udphdr, source): + NFT_OFFLOAD_MATCH(FLOW_DISSECTOR_KEY_PORTS, tp, src, + sizeof(__be16), reg); + break; + case offsetof(struct udphdr, dest): + NFT_OFFLOAD_MATCH(FLOW_DISSECTOR_KEY_PORTS, tp, dst, + sizeof(__be16), reg); + break; + default: + return -EOPNOTSUPP; + } + + return 0; +} + +static int nft_payload_offload_th(struct nft_offload_ctx *ctx, + struct nft_flow_rule *flow, + const struct nft_payload *priv) +{ + int err; + + switch (ctx->dep.protonum) { + case IPPROTO_TCP: + err = nft_payload_offload_tcp(ctx, flow, priv); + break; + case IPPROTO_UDP: + err = nft_payload_offload_udp(ctx, flow, priv); + break; + 
default: + return -EOPNOTSUPP; + } + + return err; +} + +static int nft_payload_offload(struct nft_offload_ctx *ctx, + struct nft_flow_rule *flow, + const struct nft_expr *expr) +{ + const struct nft_payload *priv = nft_expr_priv(expr); + int err; + + switch (priv->base) { + case NFT_PAYLOAD_LL_HEADER: + err = nft_payload_offload_ll(ctx, flow, priv); + break; + case NFT_PAYLOAD_NETWORK_HEADER: + err = nft_payload_offload_nh(ctx, flow, priv); + break; + case NFT_PAYLOAD_TRANSPORT_HEADER: + err = nft_payload_offload_th(ctx, flow, priv); + break; + default: + err = -EOPNOTSUPP; + break; + } + return err; +} + static const struct nft_expr_ops nft_payload_ops = { .type = &nft_payload_type, .size = NFT_EXPR_SIZE(sizeof(struct nft_payload)), .eval = nft_payload_eval, .init = nft_payload_init, .dump = nft_payload_dump, + .offload = nft_payload_offload, }; const struct nft_expr_ops nft_payload_fast_ops = { @@ -164,6 +350,7 @@ const struct nft_expr_ops nft_payload_fast_ops = { .eval = nft_payload_eval, .init = nft_payload_init, .dump = nft_payload_dump, + .offload = nft_payload_offload, }; static inline void nft_csum_replace(__sum16 *sum, __wsum fsum, __wsum tsum) -- cgit v1.2.3-71-gd317 From fbc697796e358d1ed8ed25758b19bdb3a1f8e9f9 Mon Sep 17 00:00:00 2001 From: David Ahern Date: Tue, 9 Jul 2019 14:45:17 -0700 Subject: pkt_sched: Include const.h Commit 9903c8dc7342 changed TC_ETF defines to use _BITUL instead of BIT but did not add the dependency on linux/const.h. As a consequence, importing the uapi headers into iproute2 causes builds to fail. Add the dependency. Fixes: 9903c8dc7342 ("etf: Don't use BIT() in UAPI headers.") Cc: Vedang Patel Signed-off-by: David Ahern Signed-off-by: David S. 
Miller --- include/uapi/linux/pkt_sched.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/uapi/linux') diff --git a/include/uapi/linux/pkt_sched.h b/include/uapi/linux/pkt_sched.h index 390efb54b2e0..1f623252abe8 100644 --- a/include/uapi/linux/pkt_sched.h +++ b/include/uapi/linux/pkt_sched.h @@ -2,6 +2,7 @@ #ifndef __LINUX_PKT_SCHED_H #define __LINUX_PKT_SCHED_H +#include #include /* Logical priority bands not depending on specific packet scheduler. -- cgit v1.2.3-71-gd317 From 028db3e290f15ac509084c0fc3b9d021f668f877 Mon Sep 17 00:00:00 2001 From: Linus Torvalds Date: Wed, 10 Jul 2019 18:43:43 -0700 Subject: Revert "Merge tag 'keys-acl-20190703' of git://git.kernel.org/pub/scm/linux/kernel/git/dhowells/linux-fs" This reverts merge 0f75ef6a9cff49ff612f7ce0578bced9d0b38325 (and thus effectively commits 7a1ade847596 ("keys: Provide KEYCTL_GRANT_PERMISSION") 2e12256b9a76 ("keys: Replace uid/gid/perm permissions checking with an ACL") that the merge brought in). It turns out that it breaks booting with an encrypted volume, and Eric Biggers reports that it also breaks the fscrypt tests [1] and loading of in-kernel X.509 certificates [2]. The root cause of all the breakage is likely the same, but David Howells is off email so rather than try to work it out it's getting reverted in order to not impact the rest of the merge window. 
[1] https://lore.kernel.org/lkml/20190710011559.GA7973@sol.localdomain/ [2] https://lore.kernel.org/lkml/20190710013225.GB7973@sol.localdomain/ Link: https://lore.kernel.org/lkml/CAHk-=wjxoeMJfeBahnWH=9zShKp2bsVy527vo3_y8HfOdhwAAw@mail.gmail.com/ Reported-by: Eric Biggers Cc: David Howells Cc: James Morris Signed-off-by: Linus Torvalds --- Documentation/security/keys/core.rst | 128 ++------ Documentation/security/keys/request-key.rst | 9 +- certs/blacklist.c | 7 +- certs/system_keyring.c | 12 +- drivers/md/dm-crypt.c | 2 +- drivers/nvdimm/security.c | 2 +- fs/afs/security.c | 2 +- fs/cifs/cifs_spnego.c | 25 +- fs/cifs/cifsacl.c | 28 +- fs/cifs/connect.c | 4 +- fs/crypto/keyinfo.c | 2 +- fs/ecryptfs/ecryptfs_kernel.h | 2 +- fs/ecryptfs/keystore.c | 2 +- fs/fscache/object-list.c | 2 +- fs/nfs/nfs4idmap.c | 30 +- fs/ubifs/auth.c | 2 +- include/linux/key.h | 121 ++++--- include/uapi/linux/keyctl.h | 65 ---- lib/digsig.c | 2 +- net/ceph/ceph_common.c | 2 +- net/dns_resolver/dns_key.c | 12 +- net/dns_resolver/dns_query.c | 15 +- net/rxrpc/key.c | 19 +- net/wireless/reg.c | 6 +- security/integrity/digsig.c | 31 +- security/integrity/digsig_asymmetric.c | 2 +- security/integrity/evm/evm_crypto.c | 2 +- security/integrity/ima/ima_mok.c | 13 +- security/integrity/integrity.h | 6 +- .../integrity/platform_certs/platform_keyring.c | 14 +- security/keys/compat.c | 2 - security/keys/encrypted-keys/encrypted.c | 2 +- security/keys/encrypted-keys/masterkey_trusted.c | 2 +- security/keys/gc.c | 2 +- security/keys/internal.h | 16 +- security/keys/key.c | 29 +- security/keys/keyctl.c | 104 ++---- security/keys/keyring.c | 27 +- security/keys/permission.c | 361 ++------------------- security/keys/persistent.c | 27 +- security/keys/proc.c | 22 +- security/keys/process_keys.c | 86 ++--- security/keys/request_key.c | 34 +- security/keys/request_key_auth.c | 15 +- security/selinux/hooks.c | 16 +- security/smack/smack_lsm.c | 3 +- 46 files changed, 325 insertions(+), 992 deletions(-) 
(limited to 'include/uapi/linux') diff --git a/Documentation/security/keys/core.rst b/Documentation/security/keys/core.rst index bc561ca95c86..d6d8b0b756b6 100644 --- a/Documentation/security/keys/core.rst +++ b/Documentation/security/keys/core.rst @@ -57,9 +57,9 @@ Each key has a number of attributes: type provides an operation to perform a match between the description on a key and a criterion string. - * Each key has an owner user ID, a group ID and an ACL. These are used to - control what a process may do to a key from userspace, and whether a - kernel service will be able to find the key. + * Each key has an owner user ID, a group ID and a permissions mask. These + are used to control what a process may do to a key from userspace, and + whether a kernel service will be able to find the key. * Each key can be set to expire at a specific time by the key type's instantiation function. Keys can also be immortal. @@ -198,110 +198,43 @@ The key service provides a number of features besides keys: Key Access Permissions ====================== -Keys have an owner user ID, a group ID and an ACL. The ACL is made up of a -sequence of ACEs that each contain three elements: +Keys have an owner user ID, a group access ID, and a permissions mask. The mask +has up to eight bits each for possessor, user, group and other access. Only +six of each set of eight bits are defined. These permissions granted are: - * The type of subject. - * The subject. + * View - These two together indicate the subject to whom the permits are granted. - The type can be one of: + This permits a key or keyring's attributes to be viewed - including key + type and description. - * ``KEY_ACE_SUBJ_STANDARD`` + * Read - The subject is a standard 'macro' type. The subject can be one of: - - * ``KEY_ACE_EVERYONE`` - - The permits are granted to everyone. It replaces the old 'other' - type on the assumption that you wouldn't grant a permission to other - that you you wouldn't grant to everyone else. 
- - * ``KEY_ACE_OWNER`` - - The permits are granted to the owner of the key (key->uid). - - * ``KEY_ACE_GROUP`` - - The permits are granted to the key's group (key->gid). - - * ``KEY_ACE_POSSESSOR`` - - The permits are granted to anyone who possesses the key. - - * The set of permits granted to the subject. These include: - - * ``KEY_ACE_VIEW`` - - This permits a key or keyring's attributes to be viewed - including the - key type and description. - - * ``KEY_ACE_READ`` - - This permits a key's payload to be viewed or a keyring's list of linked - keys. - - * ``KEY_ACE_WRITE`` - - This permits a key's payload to be instantiated or updated, or it allows - a link to be added to or removed from a keyring. - - * ``KEY_ACE_SEARCH`` - - This permits keyrings to be searched and keys to be found. Searches can - only recurse into nested keyrings that have search permission set. - - * ``KEY_ACE_LINK`` - - This permits a key or keyring to be linked to. To create a link from a - keyring to a key, a process must have Write permission on the keyring - and Link permission on the key. - - * ``KEY_ACE_SET_SECURITY`` - - This permits a key's UID, GID and permissions mask to be changed. + This permits a key's payload to be viewed or a keyring's list of linked + keys. - * ``KEY_ACE_INVAL`` + * Write - This permits a key to be invalidated with KEYCTL_INVALIDATE. + This permits a key's payload to be instantiated or updated, or it allows a + link to be added to or removed from a keyring. - * ``KEY_ACE_REVOKE`` + * Search - This permits a key to be revoked with KEYCTL_REVOKE. + This permits keyrings to be searched and keys to be found. Searches can + only recurse into nested keyrings that have search permission set. - * ``KEY_ACE_JOIN`` + * Link - This permits a keyring to be joined as a session by - KEYCTL_JOIN_SESSION_KEYRING or KEYCTL_SESSION_TO_PARENT. + This permits a key or keyring to be linked to. 
To create a link from a + keyring to a key, a process must have Write permission on the keyring and + Link permission on the key. - * ``KEY_ACE_CLEAR`` + * Set Attribute - This permits a keyring to be cleared. + This permits a key's UID, GID and permissions mask to be changed. For changing the ownership, group ID or permissions mask, being the owner of the key or having the sysadmin capability is sufficient. -The legacy KEYCTL_SETPERM and KEYCTL_DESCRIBE functions can only see/generate -View, Read, Write, Search, Link and SetAttr permits, and do this for each of -possessor, user, group and other permission sets as a 32-bit flag mask. These -will be approximated/inferred: - - SETPERM Permit Implied ACE Permit - =============== ======================= - Search Inval, Join - Write Revoke, Clear - Setattr Set Security, Revoke - - ACE Permit Described as - =============== ======================= - Inval Search - Join Search - Revoke Write (unless Setattr) - Clear write - Set Security Setattr - -'Other' will be approximated as/inferred from the 'Everyone' subject. - SELinux Support =============== @@ -1151,8 +1084,7 @@ payload contents" for more information. struct key *request_key(const struct key_type *type, const char *description, - const char *callout_info, - struct key_acl *acl); + const char *callout_info); This is used to request a key or keyring with a description that matches the description specified according to the key type's match_preparse() @@ -1167,8 +1099,6 @@ payload contents" for more information. If successful, the key will have been attached to the default keyring for implicitly obtained request-key keys, as set by KEYCTL_SET_REQKEY_KEYRING. - If a key is created, it will be given the specified ACL. - See also Documentation/security/keys/request-key.rst. @@ -1177,8 +1107,7 @@ payload contents" for more information. 
struct key *request_key_tag(const struct key_type *type, const char *description, struct key_tag *domain_tag, - const char *callout_info, - struct key_acl *acl); + const char *callout_info); This is identical to request_key(), except that a domain tag may be specifies that causes search algorithm to only match keys matching that @@ -1193,8 +1122,7 @@ payload contents" for more information. struct key_tag *domain_tag, const void *callout_info, size_t callout_len, - void *aux, - struct key_acl *acl); + void *aux); This is identical to request_key_tag(), except that the auxiliary data is passed to the key_type->request_key() op if it exists, and the @@ -1267,7 +1195,7 @@ payload contents" for more information. struct key *keyring_alloc(const char *description, uid_t uid, gid_t gid, const struct cred *cred, - struct key_acl *acl, + key_perm_t perm, struct key_restriction *restrict_link, unsigned long flags, struct key *dest); diff --git a/Documentation/security/keys/request-key.rst b/Documentation/security/keys/request-key.rst index f356fd06c8d5..35f2296b704a 100644 --- a/Documentation/security/keys/request-key.rst +++ b/Documentation/security/keys/request-key.rst @@ -11,16 +11,14 @@ The process starts by either the kernel requesting a service by calling struct key *request_key(const struct key_type *type, const char *description, - const char *callout_info, - struct key_acl *acl); + const char *callout_info); or:: struct key *request_key_tag(const struct key_type *type, const char *description, const struct key_tag *domain_tag, - const char *callout_info, - struct key_acl *acl); + const char *callout_info); or:: @@ -29,8 +27,7 @@ or:: const struct key_tag *domain_tag, const char *callout_info, size_t callout_len, - void *aux, - struct key_acl *acl); + void *aux); or:: diff --git a/certs/blacklist.c b/certs/blacklist.c index 93d70b885f8e..ec00bf337eb6 100644 --- a/certs/blacklist.c +++ b/certs/blacklist.c @@ -89,7 +89,8 @@ int mark_hash_blacklisted(const char *hash) 
hash, NULL, 0, - &internal_key_acl, + ((KEY_POS_ALL & ~KEY_POS_SETATTR) | + KEY_USR_VIEW), KEY_ALLOC_NOT_IN_QUOTA | KEY_ALLOC_BUILT_IN); if (IS_ERR(key)) { @@ -148,7 +149,9 @@ static int __init blacklist_init(void) keyring_alloc(".blacklist", KUIDT_INIT(0), KGIDT_INIT(0), current_cred(), - &internal_keyring_acl, + (KEY_POS_ALL & ~KEY_POS_SETATTR) | + KEY_USR_VIEW | KEY_USR_READ | + KEY_USR_SEARCH, KEY_ALLOC_NOT_IN_QUOTA | KEY_FLAG_KEEP, NULL, NULL); diff --git a/certs/system_keyring.c b/certs/system_keyring.c index 57be78b5fdfc..1eba08a1af82 100644 --- a/certs/system_keyring.c +++ b/certs/system_keyring.c @@ -99,7 +99,9 @@ static __init int system_trusted_keyring_init(void) builtin_trusted_keys = keyring_alloc(".builtin_trusted_keys", KUIDT_INIT(0), KGIDT_INIT(0), current_cred(), - &internal_key_acl, KEY_ALLOC_NOT_IN_QUOTA, + ((KEY_POS_ALL & ~KEY_POS_SETATTR) | + KEY_USR_VIEW | KEY_USR_READ | KEY_USR_SEARCH), + KEY_ALLOC_NOT_IN_QUOTA, NULL, NULL); if (IS_ERR(builtin_trusted_keys)) panic("Can't allocate builtin trusted keyring\n"); @@ -108,7 +110,10 @@ static __init int system_trusted_keyring_init(void) secondary_trusted_keys = keyring_alloc(".secondary_trusted_keys", KUIDT_INIT(0), KGIDT_INIT(0), current_cred(), - &internal_writable_keyring_acl, KEY_ALLOC_NOT_IN_QUOTA, + ((KEY_POS_ALL & ~KEY_POS_SETATTR) | + KEY_USR_VIEW | KEY_USR_READ | KEY_USR_SEARCH | + KEY_USR_WRITE), + KEY_ALLOC_NOT_IN_QUOTA, get_builtin_and_secondary_restriction(), NULL); if (IS_ERR(secondary_trusted_keys)) @@ -158,7 +163,8 @@ static __init int load_system_certificate_list(void) NULL, p, plen, - &internal_key_acl, + ((KEY_POS_ALL & ~KEY_POS_SETATTR) | + KEY_USR_VIEW | KEY_USR_READ), KEY_ALLOC_NOT_IN_QUOTA | KEY_ALLOC_BUILT_IN | KEY_ALLOC_BYPASS_RESTRICTION); diff --git a/drivers/md/dm-crypt.c b/drivers/md/dm-crypt.c index 0fd3ca9bfe54..1b16d34bb785 100644 --- a/drivers/md/dm-crypt.c +++ b/drivers/md/dm-crypt.c @@ -2035,7 +2035,7 @@ static int crypt_set_keyring_key(struct crypt_config *cc, 
const char *key_string return -ENOMEM; key = request_key(key_string[0] == 'l' ? &key_type_logon : &key_type_user, - key_desc + 1, NULL, NULL); + key_desc + 1, NULL); if (IS_ERR(key)) { kzfree(new_key_string); return PTR_ERR(key); diff --git a/drivers/nvdimm/security.c b/drivers/nvdimm/security.c index 99a5708b37e3..a570f2263a42 100644 --- a/drivers/nvdimm/security.c +++ b/drivers/nvdimm/security.c @@ -55,7 +55,7 @@ static struct key *nvdimm_request_key(struct nvdimm *nvdimm) struct device *dev = &nvdimm->dev; sprintf(desc, "%s%s", NVDIMM_PREFIX, nvdimm->dimm_id); - key = request_key(&key_type_encrypted, desc, "", NULL); + key = request_key(&key_type_encrypted, desc, ""); if (IS_ERR(key)) { if (PTR_ERR(key) == -ENOKEY) dev_dbg(dev, "request_key() found no key\n"); diff --git a/fs/afs/security.c b/fs/afs/security.c index 8866703b2e6c..71e71c07568f 100644 --- a/fs/afs/security.c +++ b/fs/afs/security.c @@ -28,7 +28,7 @@ struct key *afs_request_key(struct afs_cell *cell) _debug("key %s", cell->anonymous_key->description); key = request_key(&key_type_rxrpc, cell->anonymous_key->description, - NULL, NULL); + NULL); if (IS_ERR(key)) { if (PTR_ERR(key) != -ENOKEY) { _leave(" = %ld", PTR_ERR(key)); diff --git a/fs/cifs/cifs_spnego.c b/fs/cifs/cifs_spnego.c index d1b439ad0f1a..7f01c6e60791 100644 --- a/fs/cifs/cifs_spnego.c +++ b/fs/cifs/cifs_spnego.c @@ -32,25 +32,6 @@ #include "cifsproto.h" static const struct cred *spnego_cred; -static struct key_acl cifs_spnego_key_acl = { - .usage = REFCOUNT_INIT(1), - .nr_ace = 2, - .possessor_viewable = true, - .aces = { - KEY_POSSESSOR_ACE(KEY_ACE_VIEW | KEY_ACE_SEARCH | KEY_ACE_READ), - KEY_OWNER_ACE(KEY_ACE_VIEW), - } -}; - -static struct key_acl cifs_spnego_keyring_acl = { - .usage = REFCOUNT_INIT(1), - .nr_ace = 2, - .aces = { - KEY_POSSESSOR_ACE(KEY_ACE_SEARCH | KEY_ACE_WRITE), - KEY_OWNER_ACE(KEY_ACE_VIEW | KEY_ACE_READ | KEY_ACE_CLEAR), - } -}; - /* create a new cifs key */ static int cifs_spnego_key_instantiate(struct key 
*key, struct key_preparsed_payload *prep) @@ -189,8 +170,7 @@ cifs_get_spnego_key(struct cifs_ses *sesInfo) cifs_dbg(FYI, "key description = %s\n", description); saved_cred = override_creds(spnego_cred); - spnego_key = request_key(&cifs_spnego_key_type, description, "", - &cifs_spnego_key_acl); + spnego_key = request_key(&cifs_spnego_key_type, description, ""); revert_creds(saved_cred); #ifdef CONFIG_CIFS_DEBUG2 @@ -227,7 +207,8 @@ init_cifs_spnego(void) keyring = keyring_alloc(".cifs_spnego", GLOBAL_ROOT_UID, GLOBAL_ROOT_GID, cred, - &cifs_spnego_keyring_acl, + (KEY_POS_ALL & ~KEY_POS_SETATTR) | + KEY_USR_VIEW | KEY_USR_READ, KEY_ALLOC_NOT_IN_QUOTA, NULL, NULL); if (IS_ERR(keyring)) { ret = PTR_ERR(keyring); diff --git a/fs/cifs/cifsacl.c b/fs/cifs/cifsacl.c index 78eed72f3af0..1d377b7f2860 100644 --- a/fs/cifs/cifsacl.c +++ b/fs/cifs/cifsacl.c @@ -33,25 +33,6 @@ #include "cifsproto.h" #include "cifs_debug.h" -static struct key_acl cifs_idmap_key_acl = { - .usage = REFCOUNT_INIT(1), - .nr_ace = 2, - .possessor_viewable = true, - .aces = { - KEY_POSSESSOR_ACE(KEY_ACE_VIEW | KEY_ACE_SEARCH | KEY_ACE_READ), - KEY_OWNER_ACE(KEY_ACE_VIEW), - } -}; - -static struct key_acl cifs_idmap_keyring_acl = { - .usage = REFCOUNT_INIT(1), - .nr_ace = 2, - .aces = { - KEY_POSSESSOR_ACE(KEY_ACE_SEARCH | KEY_ACE_WRITE), - KEY_OWNER_ACE(KEY_ACE_VIEW | KEY_ACE_READ), - } -}; - /* security id for everyone/world system group */ static const struct cifs_sid sid_everyone = { 1, 1, {0, 0, 0, 0, 0, 1}, {0} }; @@ -317,8 +298,7 @@ id_to_sid(unsigned int cid, uint sidtype, struct cifs_sid *ssid) rc = 0; saved_cred = override_creds(root_cred); - sidkey = request_key(&cifs_idmap_key_type, desc, "", - &cifs_idmap_key_acl); + sidkey = request_key(&cifs_idmap_key_type, desc, ""); if (IS_ERR(sidkey)) { rc = -EINVAL; cifs_dbg(FYI, "%s: Can't map %cid %u to a SID\n", @@ -423,8 +403,7 @@ try_upcall_to_get_id: return -ENOMEM; saved_cred = override_creds(root_cred); - sidkey = 
request_key(&cifs_idmap_key_type, sidstr, "", - &cifs_idmap_key_acl); + sidkey = request_key(&cifs_idmap_key_type, sidstr, ""); if (IS_ERR(sidkey)) { rc = -EINVAL; cifs_dbg(FYI, "%s: Can't map SID %s to a %cid\n", @@ -502,7 +481,8 @@ init_cifs_idmap(void) keyring = keyring_alloc(".cifs_idmap", GLOBAL_ROOT_UID, GLOBAL_ROOT_GID, cred, - &cifs_idmap_keyring_acl, + (KEY_POS_ALL & ~KEY_POS_SETATTR) | + KEY_USR_VIEW | KEY_USR_READ, KEY_ALLOC_NOT_IN_QUOTA, NULL, NULL); if (IS_ERR(keyring)) { ret = PTR_ERR(keyring); diff --git a/fs/cifs/connect.c b/fs/cifs/connect.c index ae6bae2ecb5d..714a359c7c8d 100644 --- a/fs/cifs/connect.c +++ b/fs/cifs/connect.c @@ -2992,7 +2992,7 @@ cifs_set_cifscreds(struct smb_vol *vol, struct cifs_ses *ses) } cifs_dbg(FYI, "%s: desc=%s\n", __func__, desc); - key = request_key(&key_type_logon, desc, "", NULL); + key = request_key(&key_type_logon, desc, ""); if (IS_ERR(key)) { if (!ses->domainName) { cifs_dbg(FYI, "domainName is NULL\n"); @@ -3003,7 +3003,7 @@ cifs_set_cifscreds(struct smb_vol *vol, struct cifs_ses *ses) /* didn't work, try to find a domain key */ sprintf(desc, "cifs:d:%s", ses->domainName); cifs_dbg(FYI, "%s: desc=%s\n", __func__, desc); - key = request_key(&key_type_logon, desc, "", NULL); + key = request_key(&key_type_logon, desc, ""); if (IS_ERR(key)) { rc = PTR_ERR(key); goto out_err; diff --git a/fs/crypto/keyinfo.c b/fs/crypto/keyinfo.c index 4f85af8ab239..dcd91a3fbe49 100644 --- a/fs/crypto/keyinfo.c +++ b/fs/crypto/keyinfo.c @@ -92,7 +92,7 @@ find_and_lock_process_key(const char *prefix, if (!description) return ERR_PTR(-ENOMEM); - key = request_key(&key_type_logon, description, NULL, NULL); + key = request_key(&key_type_logon, description, NULL); kfree(description); if (IS_ERR(key)) return key; diff --git a/fs/ecryptfs/ecryptfs_kernel.h b/fs/ecryptfs/ecryptfs_kernel.h index 67844fe41a61..1c1a56be7ea2 100644 --- a/fs/ecryptfs/ecryptfs_kernel.h +++ b/fs/ecryptfs/ecryptfs_kernel.h @@ -91,7 +91,7 @@ 
ecryptfs_get_encrypted_key_payload_data(struct key *key) static inline struct key *ecryptfs_get_encrypted_key(char *sig) { - return request_key(&key_type_encrypted, sig, NULL, NULL); + return request_key(&key_type_encrypted, sig, NULL); } #else diff --git a/fs/ecryptfs/keystore.c b/fs/ecryptfs/keystore.c index ba382f135918..9536e592e25a 100644 --- a/fs/ecryptfs/keystore.c +++ b/fs/ecryptfs/keystore.c @@ -1610,7 +1610,7 @@ int ecryptfs_keyring_auth_tok_for_sig(struct key **auth_tok_key, { int rc = 0; - (*auth_tok_key) = request_key(&key_type_user, sig, NULL, NULL); + (*auth_tok_key) = request_key(&key_type_user, sig, NULL); if (!(*auth_tok_key) || IS_ERR(*auth_tok_key)) { (*auth_tok_key) = ecryptfs_get_encrypted_key(sig); if (!(*auth_tok_key) || IS_ERR(*auth_tok_key)) { diff --git a/fs/fscache/object-list.c b/fs/fscache/object-list.c index 67b7bda5647a..72ebfe578f40 100644 --- a/fs/fscache/object-list.c +++ b/fs/fscache/object-list.c @@ -317,7 +317,7 @@ static void fscache_objlist_config(struct fscache_objlist_data *data) const char *buf; int len; - key = request_key(&key_type_user, "fscache:objlist", NULL, NULL); + key = request_key(&key_type_user, "fscache:objlist", NULL); if (IS_ERR(key)) goto no_config; diff --git a/fs/nfs/nfs4idmap.c b/fs/nfs/nfs4idmap.c index 69679f4f2e6c..1e7296395d71 100644 --- a/fs/nfs/nfs4idmap.c +++ b/fs/nfs/nfs4idmap.c @@ -72,25 +72,6 @@ struct idmap { const struct cred *cred; }; -static struct key_acl nfs_idmap_key_acl = { - .usage = REFCOUNT_INIT(1), - .nr_ace = 2, - .possessor_viewable = true, - .aces = { - KEY_POSSESSOR_ACE(KEY_ACE_VIEW | KEY_ACE_SEARCH | KEY_ACE_READ), - KEY_OWNER_ACE(KEY_ACE_VIEW), - } -}; - -static struct key_acl nfs_idmap_keyring_acl = { - .usage = REFCOUNT_INIT(1), - .nr_ace = 2, - .aces = { - KEY_POSSESSOR_ACE(KEY_ACE_SEARCH | KEY_ACE_WRITE), - KEY_OWNER_ACE(KEY_ACE_VIEW | KEY_ACE_READ), - } -}; - static struct user_namespace *idmap_userns(const struct idmap *idmap) { if (idmap && idmap->cred) @@ -227,7 +208,8 
@@ int nfs_idmap_init(void) keyring = keyring_alloc(".id_resolver", GLOBAL_ROOT_UID, GLOBAL_ROOT_GID, cred, - &nfs_idmap_keyring_acl, + (KEY_POS_ALL & ~KEY_POS_SETATTR) | + KEY_USR_VIEW | KEY_USR_READ, KEY_ALLOC_NOT_IN_QUOTA, NULL, NULL); if (IS_ERR(keyring)) { ret = PTR_ERR(keyring); @@ -305,13 +287,11 @@ static struct key *nfs_idmap_request_key(const char *name, size_t namelen, return ERR_PTR(ret); if (!idmap->cred || idmap->cred->user_ns == &init_user_ns) - rkey = request_key(&key_type_id_resolver, desc, "", - &nfs_idmap_key_acl); + rkey = request_key(&key_type_id_resolver, desc, ""); if (IS_ERR(rkey)) { mutex_lock(&idmap->idmap_mutex); rkey = request_key_with_auxdata(&key_type_id_resolver_legacy, - desc, NULL, "", 0, idmap, - &nfs_idmap_key_acl); + desc, NULL, "", 0, idmap); mutex_unlock(&idmap->idmap_mutex); } if (!IS_ERR(rkey)) @@ -340,6 +320,8 @@ static ssize_t nfs_idmap_get_key(const char *name, size_t namelen, } rcu_read_lock(); + rkey->perm |= KEY_USR_VIEW; + ret = key_validate(rkey); if (ret < 0) goto out_up; diff --git a/fs/ubifs/auth.c b/fs/ubifs/auth.c index 38718026ad0b..60f43b93d06e 100644 --- a/fs/ubifs/auth.c +++ b/fs/ubifs/auth.c @@ -227,7 +227,7 @@ int ubifs_init_authentication(struct ubifs_info *c) snprintf(hmac_name, CRYPTO_MAX_ALG_NAME, "hmac(%s)", c->auth_hash_name); - keyring_key = request_key(&key_type_logon, c->auth_key_name, NULL, NULL); + keyring_key = request_key(&key_type_logon, c->auth_key_name, NULL); if (IS_ERR(keyring_key)) { ubifs_err(c, "Failed to request key: %ld", diff --git a/include/linux/key.h b/include/linux/key.h index 6fef6684501f..91f391cd272e 100644 --- a/include/linux/key.h +++ b/include/linux/key.h @@ -27,15 +27,50 @@ /* key handle serial number */ typedef int32_t key_serial_t; +/* key handle permissions mask */ +typedef uint32_t key_perm_t; + struct key; struct net; #ifdef CONFIG_KEYS -#include - #undef KEY_DEBUGGING +#define KEY_POS_VIEW 0x01000000 /* possessor can view a key's attributes */ +#define KEY_POS_READ 
0x02000000 /* possessor can read key payload / view keyring */ +#define KEY_POS_WRITE 0x04000000 /* possessor can update key payload / add link to keyring */ +#define KEY_POS_SEARCH 0x08000000 /* possessor can find a key in search / search a keyring */ +#define KEY_POS_LINK 0x10000000 /* possessor can create a link to a key/keyring */ +#define KEY_POS_SETATTR 0x20000000 /* possessor can set key attributes */ +#define KEY_POS_ALL 0x3f000000 + +#define KEY_USR_VIEW 0x00010000 /* user permissions... */ +#define KEY_USR_READ 0x00020000 +#define KEY_USR_WRITE 0x00040000 +#define KEY_USR_SEARCH 0x00080000 +#define KEY_USR_LINK 0x00100000 +#define KEY_USR_SETATTR 0x00200000 +#define KEY_USR_ALL 0x003f0000 + +#define KEY_GRP_VIEW 0x00000100 /* group permissions... */ +#define KEY_GRP_READ 0x00000200 +#define KEY_GRP_WRITE 0x00000400 +#define KEY_GRP_SEARCH 0x00000800 +#define KEY_GRP_LINK 0x00001000 +#define KEY_GRP_SETATTR 0x00002000 +#define KEY_GRP_ALL 0x00003f00 + +#define KEY_OTH_VIEW 0x00000001 /* third party permissions... 
*/ +#define KEY_OTH_READ 0x00000002 +#define KEY_OTH_WRITE 0x00000004 +#define KEY_OTH_SEARCH 0x00000008 +#define KEY_OTH_LINK 0x00000010 +#define KEY_OTH_SETATTR 0x00000020 +#define KEY_OTH_ALL 0x0000003f + +#define KEY_PERM_UNDEF 0xffffffff + struct seq_file; struct user_struct; struct signal_struct; @@ -78,36 +113,6 @@ union key_payload { void *data[4]; }; -struct key_ace { - unsigned int type; - unsigned int perm; - union { - kuid_t uid; - kgid_t gid; - unsigned int subject_id; - }; -}; - -struct key_acl { - refcount_t usage; - unsigned short nr_ace; - bool possessor_viewable; - struct rcu_head rcu; - struct key_ace aces[]; -}; - -#define KEY_POSSESSOR_ACE(perms) { \ - .type = KEY_ACE_SUBJ_STANDARD, \ - .perm = perms, \ - .subject_id = KEY_ACE_POSSESSOR \ - } - -#define KEY_OWNER_ACE(perms) { \ - .type = KEY_ACE_SUBJ_STANDARD, \ - .perm = perms, \ - .subject_id = KEY_ACE_OWNER \ - } - /*****************************************************************************/ /* * key reference with possession attribute handling @@ -174,7 +179,6 @@ struct key { struct rw_semaphore sem; /* change vs change sem */ struct key_user *user; /* owner of this key */ void *security; /* security data for this key */ - struct key_acl __rcu *acl; union { time64_t expiry; /* time at which key expires (or 0) */ time64_t revoked_at; /* time at which key was revoked */ @@ -182,6 +186,7 @@ struct key { time64_t last_used_at; /* last time used for LRU keyring discard */ kuid_t uid; kgid_t gid; + key_perm_t perm; /* access permissions */ unsigned short quotalen; /* length added to quota */ unsigned short datalen; /* payload data length * - may not match RCU dereferenced payload @@ -205,7 +210,6 @@ struct key { #define KEY_FLAG_ROOT_CAN_INVAL 7 /* set if key can be invalidated by root without permission */ #define KEY_FLAG_KEEP 8 /* set if key should not be removed */ #define KEY_FLAG_UID_KEYRING 9 /* set if key is a user or user session keyring */ -#define KEY_FLAG_HAS_ACL 10 /* Set if 
KEYCTL_SETACL called on key */ /* the key type and key description string * - the desc is used to match a key against search criteria @@ -254,7 +258,7 @@ extern struct key *key_alloc(struct key_type *type, const char *desc, kuid_t uid, kgid_t gid, const struct cred *cred, - struct key_acl *acl, + key_perm_t perm, unsigned long flags, struct key_restriction *restrict_link); @@ -291,8 +295,7 @@ static inline void key_ref_put(key_ref_t key_ref) extern struct key *request_key_tag(struct key_type *type, const char *description, struct key_tag *domain_tag, - const char *callout_info, - struct key_acl *acl); + const char *callout_info); extern struct key *request_key_rcu(struct key_type *type, const char *description, @@ -303,24 +306,21 @@ extern struct key *request_key_with_auxdata(struct key_type *type, struct key_tag *domain_tag, const void *callout_info, size_t callout_len, - void *aux, - struct key_acl *acl); + void *aux); /** * request_key - Request a key and wait for construction * @type: Type of key. * @description: The searchable description of the key. * @callout_info: The data to pass to the instantiation upcall (or NULL). - * @acl: The ACL to attach to a new key (or NULL). * * As for request_key_tag(), but with the default global domain tag. */ static inline struct key *request_key(struct key_type *type, const char *description, - const char *callout_info, - struct key_acl *acl) + const char *callout_info) { - return request_key_tag(type, description, NULL, callout_info, acl); + return request_key_tag(type, description, NULL, callout_info); } #ifdef CONFIG_NET @@ -330,7 +330,6 @@ static inline struct key *request_key(struct key_type *type, * @description: The searchable description of the key. * @net: The network namespace that is the key's domain of operation. * @callout_info: The data to pass to the instantiation upcall (or NULL). - * @acl: The ACL to attach to a new key (or NULL). 
* * As for request_key() except that it does not add the returned key to a * keyring if found, new keys are always allocated in the user's quota, the @@ -340,8 +339,8 @@ static inline struct key *request_key(struct key_type *type, * Furthermore, it then works as wait_for_key_construction() to wait for the * completion of keys undergoing construction with a non-interruptible wait. */ -#define request_key_net(type, description, net, callout_info, acl) \ - request_key_tag(type, description, net->key_domain, callout_info, acl); +#define request_key_net(type, description, net, callout_info) \ + request_key_tag(type, description, net->key_domain, callout_info); #endif /* CONFIG_NET */ extern int wait_for_key_construction(struct key *key, bool intr); @@ -353,7 +352,7 @@ extern key_ref_t key_create_or_update(key_ref_t keyring, const char *description, const void *payload, size_t plen, - struct key_acl *acl, + key_perm_t perm, unsigned long flags); extern int key_update(key_ref_t key, @@ -373,7 +372,7 @@ extern int key_unlink(struct key *keyring, extern struct key *keyring_alloc(const char *description, kuid_t uid, kgid_t gid, const struct cred *cred, - struct key_acl *acl, + key_perm_t perm, unsigned long flags, struct key_restriction *restrict_link, struct key *dest); @@ -406,29 +405,19 @@ static inline key_serial_t key_serial(const struct key *key) extern void key_set_timeout(struct key *, unsigned); extern key_ref_t lookup_user_key(key_serial_t id, unsigned long flags, - u32 desired_perm); + key_perm_t perm); extern void key_free_user_ns(struct user_namespace *); /* * The permissions required on a key that we're looking up. 
*/ -#define KEY_NEED_VIEW 0x001 /* Require permission to view attributes */ -#define KEY_NEED_READ 0x002 /* Require permission to read content */ -#define KEY_NEED_WRITE 0x004 /* Require permission to update / modify */ -#define KEY_NEED_SEARCH 0x008 /* Require permission to search (keyring) or find (key) */ -#define KEY_NEED_LINK 0x010 /* Require permission to link */ -#define KEY_NEED_SETSEC 0x020 /* Require permission to set owner, group, ACL */ -#define KEY_NEED_INVAL 0x040 /* Require permission to invalidate key */ -#define KEY_NEED_REVOKE 0x080 /* Require permission to revoke key */ -#define KEY_NEED_JOIN 0x100 /* Require permission to join keyring as session */ -#define KEY_NEED_CLEAR 0x200 /* Require permission to clear a keyring */ -#define KEY_NEED_ALL 0x3ff - -#define OLD_KEY_NEED_SETATTR 0x20 /* Used to be Require permission to change attributes */ - -extern struct key_acl internal_key_acl; -extern struct key_acl internal_keyring_acl; -extern struct key_acl internal_writable_keyring_acl; +#define KEY_NEED_VIEW 0x01 /* Require permission to view attributes */ +#define KEY_NEED_READ 0x02 /* Require permission to read content */ +#define KEY_NEED_WRITE 0x04 /* Require permission to update / modify */ +#define KEY_NEED_SEARCH 0x08 /* Require permission to search (keyring) or find (key) */ +#define KEY_NEED_LINK 0x10 /* Require permission to link */ +#define KEY_NEED_SETATTR 0x20 /* Require permission to change attributes */ +#define KEY_NEED_ALL 0x3f /* All the above permissions */ static inline short key_read_state(const struct key *key) { diff --git a/include/uapi/linux/keyctl.h b/include/uapi/linux/keyctl.h index 1f7a4e737214..ed3d5893830d 100644 --- a/include/uapi/linux/keyctl.h +++ b/include/uapi/linux/keyctl.h @@ -15,69 +15,6 @@ #include -/* - * Keyring permission grant definitions - */ -enum key_ace_subject_type { - KEY_ACE_SUBJ_STANDARD = 0, /* subject is one of key_ace_standard_subject */ - nr__key_ace_subject_type -}; - -enum 
key_ace_standard_subject { - KEY_ACE_EVERYONE = 0, /* Everyone, including owner and group */ - KEY_ACE_GROUP = 1, /* The key's group */ - KEY_ACE_OWNER = 2, /* The owner of the key */ - KEY_ACE_POSSESSOR = 3, /* Any process that possesses of the key */ - nr__key_ace_standard_subject -}; - -#define KEY_ACE_VIEW 0x00000001 /* Can describe the key */ -#define KEY_ACE_READ 0x00000002 /* Can read the key content */ -#define KEY_ACE_WRITE 0x00000004 /* Can update/modify the key content */ -#define KEY_ACE_SEARCH 0x00000008 /* Can find the key by search */ -#define KEY_ACE_LINK 0x00000010 /* Can make a link to the key */ -#define KEY_ACE_SET_SECURITY 0x00000020 /* Can set owner, group, ACL */ -#define KEY_ACE_INVAL 0x00000040 /* Can invalidate the key */ -#define KEY_ACE_REVOKE 0x00000080 /* Can revoke the key */ -#define KEY_ACE_JOIN 0x00000100 /* Can join keyring */ -#define KEY_ACE_CLEAR 0x00000200 /* Can clear keyring */ -#define KEY_ACE__PERMS 0xffffffff - -/* - * Old-style permissions mask, deprecated in favour of ACL. - */ -#define KEY_POS_VIEW 0x01000000 /* possessor can view a key's attributes */ -#define KEY_POS_READ 0x02000000 /* possessor can read key payload / view keyring */ -#define KEY_POS_WRITE 0x04000000 /* possessor can update key payload / add link to keyring */ -#define KEY_POS_SEARCH 0x08000000 /* possessor can find a key in search / search a keyring */ -#define KEY_POS_LINK 0x10000000 /* possessor can create a link to a key/keyring */ -#define KEY_POS_SETATTR 0x20000000 /* possessor can set key attributes */ -#define KEY_POS_ALL 0x3f000000 - -#define KEY_USR_VIEW 0x00010000 /* user permissions... */ -#define KEY_USR_READ 0x00020000 -#define KEY_USR_WRITE 0x00040000 -#define KEY_USR_SEARCH 0x00080000 -#define KEY_USR_LINK 0x00100000 -#define KEY_USR_SETATTR 0x00200000 -#define KEY_USR_ALL 0x003f0000 - -#define KEY_GRP_VIEW 0x00000100 /* group permissions... 
*/ -#define KEY_GRP_READ 0x00000200 -#define KEY_GRP_WRITE 0x00000400 -#define KEY_GRP_SEARCH 0x00000800 -#define KEY_GRP_LINK 0x00001000 -#define KEY_GRP_SETATTR 0x00002000 -#define KEY_GRP_ALL 0x00003f00 - -#define KEY_OTH_VIEW 0x00000001 /* third party permissions... */ -#define KEY_OTH_READ 0x00000002 -#define KEY_OTH_WRITE 0x00000004 -#define KEY_OTH_SEARCH 0x00000008 -#define KEY_OTH_LINK 0x00000010 -#define KEY_OTH_SETATTR 0x00000020 -#define KEY_OTH_ALL 0x0000003f - /* special process keyring shortcut IDs */ #define KEY_SPEC_THREAD_KEYRING -1 /* - key ID for thread-specific keyring */ #define KEY_SPEC_PROCESS_KEYRING -2 /* - key ID for process-specific keyring */ @@ -132,7 +69,6 @@ enum key_ace_standard_subject { #define KEYCTL_RESTRICT_KEYRING 29 /* Restrict keys allowed to link to a keyring */ #define KEYCTL_MOVE 30 /* Move keys between keyrings */ #define KEYCTL_CAPABILITIES 31 /* Find capabilities of keyrings subsystem */ -#define KEYCTL_GRANT_PERMISSION 32 /* Grant a permit to a key */ /* keyctl structures */ struct keyctl_dh_params { @@ -194,6 +130,5 @@ struct keyctl_pkey_params { #define KEYCTL_CAPS0_MOVE 0x80 /* KEYCTL_MOVE supported */ #define KEYCTL_CAPS1_NS_KEYRING_NAME 0x01 /* Keyring names are per-user_namespace */ #define KEYCTL_CAPS1_NS_KEY_TAG 0x02 /* Key indexing can include a namespace tag */ -#define KEYCTL_CAPS1_ACL_ALTERABLE 0x04 /* Keys have internal ACL that can be altered */ #endif /* _LINUX_KEYCTL_H */ diff --git a/lib/digsig.c b/lib/digsig.c index ab0800f98eaf..e0627c3e53b2 100644 --- a/lib/digsig.c +++ b/lib/digsig.c @@ -224,7 +224,7 @@ int digsig_verify(struct key *keyring, const char *sig, int siglen, else key = key_ref_to_ptr(kref); } else { - key = request_key(&key_type_user, name, NULL, NULL); + key = request_key(&key_type_user, name, NULL); } if (IS_ERR(key)) { pr_err("key not found, id: %s\n", name); diff --git a/net/ceph/ceph_common.c b/net/ceph/ceph_common.c index 38de80d01aae..1c811c74bfc0 100644 --- 
a/net/ceph/ceph_common.c +++ b/net/ceph/ceph_common.c @@ -306,7 +306,7 @@ static int get_secret(struct ceph_crypto_key *dst, const char *name) { int err = 0; struct ceph_crypto_key *ckey; - ukey = request_key(&key_type_ceph, name, NULL, NULL); + ukey = request_key(&key_type_ceph, name, NULL); if (IS_ERR(ukey)) { /* request_key errors don't map nicely to mount(2) errors; don't even try, but still printk */ diff --git a/net/dns_resolver/dns_key.c b/net/dns_resolver/dns_key.c index 6b201531b165..3e1a90669006 100644 --- a/net/dns_resolver/dns_key.c +++ b/net/dns_resolver/dns_key.c @@ -46,15 +46,6 @@ const struct cred *dns_resolver_cache; #define DNS_ERRORNO_OPTION "dnserror" -static struct key_acl dns_keyring_acl = { - .usage = REFCOUNT_INIT(1), - .nr_ace = 2, - .aces = { - KEY_POSSESSOR_ACE(KEY_ACE_SEARCH | KEY_ACE_WRITE), - KEY_OWNER_ACE(KEY_ACE_VIEW | KEY_ACE_READ | KEY_ACE_CLEAR), - } -}; - /* * Preparse instantiation data for a dns_resolver key. * @@ -352,7 +343,8 @@ static int __init init_dns_resolver(void) keyring = keyring_alloc(".dns_resolver", GLOBAL_ROOT_UID, GLOBAL_ROOT_GID, cred, - &dns_keyring_acl, + (KEY_POS_ALL & ~KEY_POS_SETATTR) | + KEY_USR_VIEW | KEY_USR_READ, KEY_ALLOC_NOT_IN_QUOTA, NULL, NULL); if (IS_ERR(keyring)) { ret = PTR_ERR(keyring); diff --git a/net/dns_resolver/dns_query.c b/net/dns_resolver/dns_query.c index 236baf2bfa4c..cab4e0df924f 100644 --- a/net/dns_resolver/dns_query.c +++ b/net/dns_resolver/dns_query.c @@ -47,16 +47,6 @@ #include "internal.h" -static struct key_acl dns_key_acl = { - .usage = REFCOUNT_INIT(1), - .nr_ace = 2, - .possessor_viewable = true, - .aces = { - KEY_POSSESSOR_ACE(KEY_ACE_VIEW | KEY_ACE_SEARCH | KEY_ACE_READ), - KEY_OWNER_ACE(KEY_ACE_VIEW | KEY_ACE_INVAL), - } -}; - /** * dns_query - Query the DNS * @net: The network namespace to operate in. 
@@ -135,8 +125,7 @@ int dns_query(struct net *net, * add_key() to preinstall malicious redirections */ saved_cred = override_creds(dns_resolver_cache); - rkey = request_key_net(&key_type_dns_resolver, desc, net, options, - &dns_key_acl); + rkey = request_key_net(&key_type_dns_resolver, desc, net, options); revert_creds(saved_cred); kfree(desc); if (IS_ERR(rkey)) { @@ -146,6 +135,8 @@ int dns_query(struct net *net, down_read(&rkey->sem); set_bit(KEY_FLAG_ROOT_CAN_INVAL, &rkey->flags); + rkey->perm |= KEY_USR_VIEW; + ret = key_validate(rkey); if (ret < 0) goto put; diff --git a/net/rxrpc/key.c b/net/rxrpc/key.c index 2032f6a8225e..6c3f35fac42d 100644 --- a/net/rxrpc/key.c +++ b/net/rxrpc/key.c @@ -23,14 +23,6 @@ #include #include "ar-internal.h" -static struct key_acl rxrpc_null_key_acl = { - .usage = REFCOUNT_INIT(1), - .nr_ace = 1, - .aces = { - KEY_POSSESSOR_ACE(KEY_ACE_SEARCH | KEY_ACE_READ), - } -}; - static int rxrpc_vet_description_s(const char *); static int rxrpc_preparse(struct key_preparsed_payload *); static int rxrpc_preparse_s(struct key_preparsed_payload *); @@ -918,8 +910,7 @@ int rxrpc_request_key(struct rxrpc_sock *rx, char __user *optval, int optlen) if (IS_ERR(description)) return PTR_ERR(description); - key = request_key_net(&key_type_rxrpc, description, sock_net(&rx->sk), - NULL, NULL); + key = request_key_net(&key_type_rxrpc, description, sock_net(&rx->sk), NULL); if (IS_ERR(key)) { kfree(description); _leave(" = %ld", PTR_ERR(key)); @@ -950,8 +941,7 @@ int rxrpc_server_keyring(struct rxrpc_sock *rx, char __user *optval, if (IS_ERR(description)) return PTR_ERR(description); - key = request_key_net(&key_type_keyring, description, sock_net(&rx->sk), - NULL, NULL); + key = request_key_net(&key_type_keyring, description, sock_net(&rx->sk), NULL); if (IS_ERR(key)) { kfree(description); _leave(" = %ld", PTR_ERR(key)); @@ -984,8 +974,7 @@ int rxrpc_get_server_data_key(struct rxrpc_connection *conn, _enter(""); key = key_alloc(&key_type_rxrpc, "x", - 
GLOBAL_ROOT_UID, GLOBAL_ROOT_GID, cred, - &internal_key_acl, + GLOBAL_ROOT_UID, GLOBAL_ROOT_GID, cred, 0, KEY_ALLOC_NOT_IN_QUOTA, NULL); if (IS_ERR(key)) { _leave(" = -ENOMEM [alloc %ld]", PTR_ERR(key)); @@ -1033,7 +1022,7 @@ struct key *rxrpc_get_null_key(const char *keyname) key = key_alloc(&key_type_rxrpc, keyname, GLOBAL_ROOT_UID, GLOBAL_ROOT_GID, cred, - &rxrpc_null_key_acl, KEY_ALLOC_NOT_IN_QUOTA, NULL); + KEY_POS_SEARCH, KEY_ALLOC_NOT_IN_QUOTA, NULL); if (IS_ERR(key)) return key; diff --git a/net/wireless/reg.c b/net/wireless/reg.c index 298fe91557f7..4831ad745f91 100644 --- a/net/wireless/reg.c +++ b/net/wireless/reg.c @@ -741,7 +741,8 @@ static void __init load_keys_from_buffer(const u8 *p, unsigned int buflen) key = key_create_or_update(make_key_ref(builtin_regdb_keys, 1), "asymmetric", NULL, p, plen, - &internal_key_acl, + ((KEY_POS_ALL & ~KEY_POS_SETATTR) | + KEY_USR_VIEW | KEY_USR_READ), KEY_ALLOC_NOT_IN_QUOTA | KEY_ALLOC_BUILT_IN | KEY_ALLOC_BYPASS_RESTRICTION); @@ -767,7 +768,8 @@ static int __init load_builtin_regdb_keys(void) builtin_regdb_keys = keyring_alloc(".builtin_regdb_keys", KUIDT_INIT(0), KGIDT_INIT(0), current_cred(), - &internal_keyring_acl, + ((KEY_POS_ALL & ~KEY_POS_SETATTR) | + KEY_USR_VIEW | KEY_USR_READ | KEY_USR_SEARCH), KEY_ALLOC_NOT_IN_QUOTA, NULL, NULL); if (IS_ERR(builtin_regdb_keys)) return PTR_ERR(builtin_regdb_keys); diff --git a/security/integrity/digsig.c b/security/integrity/digsig.c index f9f3c8ffe786..868ade3e8970 100644 --- a/security/integrity/digsig.c +++ b/security/integrity/digsig.c @@ -47,8 +47,7 @@ int integrity_digsig_verify(const unsigned int id, const char *sig, int siglen, if (!keyring[id]) { keyring[id] = - request_key(&key_type_keyring, keyring_name[id], - NULL, NULL); + request_key(&key_type_keyring, keyring_name[id], NULL); if (IS_ERR(keyring[id])) { int err = PTR_ERR(keyring[id]); pr_err("no %s keyring: %d\n", keyring_name[id], err); @@ -71,14 +70,14 @@ int integrity_digsig_verify(const unsigned int id, 
const char *sig, int siglen, } static int __init __integrity_init_keyring(const unsigned int id, - struct key_acl *acl, + key_perm_t perm, struct key_restriction *restriction) { const struct cred *cred = current_cred(); int err = 0; keyring[id] = keyring_alloc(keyring_name[id], KUIDT_INIT(0), - KGIDT_INIT(0), cred, acl, + KGIDT_INIT(0), cred, perm, KEY_ALLOC_NOT_IN_QUOTA, restriction, NULL); if (IS_ERR(keyring[id])) { err = PTR_ERR(keyring[id]); @@ -96,7 +95,10 @@ static int __init __integrity_init_keyring(const unsigned int id, int __init integrity_init_keyring(const unsigned int id) { struct key_restriction *restriction; - struct key_acl *acl = &internal_keyring_acl; + key_perm_t perm; + + perm = (KEY_POS_ALL & ~KEY_POS_SETATTR) | KEY_USR_VIEW + | KEY_USR_READ | KEY_USR_SEARCH; if (id == INTEGRITY_KEYRING_PLATFORM) { restriction = NULL; @@ -111,14 +113,14 @@ int __init integrity_init_keyring(const unsigned int id) return -ENOMEM; restriction->check = restrict_link_to_ima; - acl = &internal_writable_keyring_acl; + perm |= KEY_USR_WRITE; out: - return __integrity_init_keyring(id, acl, restriction); + return __integrity_init_keyring(id, perm, restriction); } -static int __init integrity_add_key(const unsigned int id, const void *data, - off_t size, struct key_acl *acl) +int __init integrity_add_key(const unsigned int id, const void *data, + off_t size, key_perm_t perm) { key_ref_t key; int rc = 0; @@ -127,7 +129,7 @@ static int __init integrity_add_key(const unsigned int id, const void *data, return -EINVAL; key = key_create_or_update(make_key_ref(keyring[id], 1), "asymmetric", - NULL, data, size, acl ?: &internal_key_acl, + NULL, data, size, perm, KEY_ALLOC_NOT_IN_QUOTA); if (IS_ERR(key)) { rc = PTR_ERR(key); @@ -147,6 +149,7 @@ int __init integrity_load_x509(const unsigned int id, const char *path) void *data; loff_t size; int rc; + key_perm_t perm; rc = kernel_read_file_from_path(path, &data, &size, 0, READING_X509_CERTIFICATE); @@ -155,19 +158,21 @@ int __init 
integrity_load_x509(const unsigned int id, const char *path) return rc; } + perm = (KEY_POS_ALL & ~KEY_POS_SETATTR) | KEY_USR_VIEW | KEY_USR_READ; + pr_info("Loading X.509 certificate: %s\n", path); - rc = integrity_add_key(id, data, size, NULL); + rc = integrity_add_key(id, (const void *)data, size, perm); vfree(data); return rc; } int __init integrity_load_cert(const unsigned int id, const char *source, - const void *data, size_t len, struct key_acl *acl) + const void *data, size_t len, key_perm_t perm) { if (!data) return -EINVAL; pr_info("Loading X.509 certificate: %s\n", source); - return integrity_add_key(id, data, len, acl); + return integrity_add_key(id, data, len, perm); } diff --git a/security/integrity/digsig_asymmetric.c b/security/integrity/digsig_asymmetric.c index a29df775fdd8..55aec161d0e1 100644 --- a/security/integrity/digsig_asymmetric.c +++ b/security/integrity/digsig_asymmetric.c @@ -53,7 +53,7 @@ static struct key *request_asymmetric_key(struct key *keyring, uint32_t keyid) else key = key_ref_to_ptr(kref); } else { - key = request_key(&key_type_asymmetric, name, NULL, NULL); + key = request_key(&key_type_asymmetric, name, NULL); } if (IS_ERR(key)) { diff --git a/security/integrity/evm/evm_crypto.c b/security/integrity/evm/evm_crypto.c index 466eebd3b4aa..d485f6fc908e 100644 --- a/security/integrity/evm/evm_crypto.c +++ b/security/integrity/evm/evm_crypto.c @@ -356,7 +356,7 @@ int evm_init_key(void) struct encrypted_key_payload *ekp; int rc; - evm_key = request_key(&key_type_encrypted, EVMKEY, NULL, NULL); + evm_key = request_key(&key_type_encrypted, EVMKEY, NULL); if (IS_ERR(evm_key)) return -ENOENT; diff --git a/security/integrity/ima/ima_mok.c b/security/integrity/ima/ima_mok.c index b52ae1476ec3..36cadadbfba4 100644 --- a/security/integrity/ima/ima_mok.c +++ b/security/integrity/ima/ima_mok.c @@ -16,15 +16,6 @@ #include -static struct key_acl integrity_blacklist_keyring_acl = { - .usage = REFCOUNT_INIT(1), - .nr_ace = 2, - .aces = { - 
KEY_POSSESSOR_ACE(KEY_ACE_SEARCH | KEY_ACE_WRITE), - KEY_OWNER_ACE(KEY_ACE_VIEW | KEY_ACE_READ | KEY_ACE_WRITE | KEY_ACE_SEARCH), - } -}; - struct key *ima_blacklist_keyring; /* @@ -44,7 +35,9 @@ __init int ima_mok_init(void) ima_blacklist_keyring = keyring_alloc(".ima_blacklist", KUIDT_INIT(0), KGIDT_INIT(0), current_cred(), - &integrity_blacklist_keyring_acl, + (KEY_POS_ALL & ~KEY_POS_SETATTR) | + KEY_USR_VIEW | KEY_USR_READ | + KEY_USR_WRITE | KEY_USR_SEARCH, KEY_ALLOC_NOT_IN_QUOTA, restriction, NULL); diff --git a/security/integrity/integrity.h b/security/integrity/integrity.h index 875c6a7a5af1..ed12d8e13d04 100644 --- a/security/integrity/integrity.h +++ b/security/integrity/integrity.h @@ -12,8 +12,6 @@ #include #include -struct key_acl; - /* iint action cache flags */ #define IMA_MEASURE 0x00000001 #define IMA_MEASURED 0x00000002 @@ -157,7 +155,7 @@ int integrity_digsig_verify(const unsigned int id, const char *sig, int siglen, int __init integrity_init_keyring(const unsigned int id); int __init integrity_load_x509(const unsigned int id, const char *path); int __init integrity_load_cert(const unsigned int id, const char *source, - const void *data, size_t len, struct key_acl *acl); + const void *data, size_t len, key_perm_t perm); #else static inline int integrity_digsig_verify(const unsigned int id, @@ -175,7 +173,7 @@ static inline int integrity_init_keyring(const unsigned int id) static inline int __init integrity_load_cert(const unsigned int id, const char *source, const void *data, size_t len, - struct key_acl *acl) + key_perm_t perm) { return 0; } diff --git a/security/integrity/platform_certs/platform_keyring.c b/security/integrity/platform_certs/platform_keyring.c index 7646e35f2d91..bcafd7387729 100644 --- a/security/integrity/platform_certs/platform_keyring.c +++ b/security/integrity/platform_certs/platform_keyring.c @@ -14,15 +14,6 @@ #include #include "../integrity.h" -static struct key_acl platform_key_acl = { - .usage = REFCOUNT_INIT(1), - 
.nr_ace = 2, - .aces = { - KEY_POSSESSOR_ACE(KEY_ACE_SEARCH | KEY_ACE_READ), - KEY_OWNER_ACE(KEY_ACE_VIEW), - } -}; - /** * add_to_platform_keyring - Add to platform keyring without validation. * @source: Source of key @@ -35,10 +26,13 @@ static struct key_acl platform_key_acl = { void __init add_to_platform_keyring(const char *source, const void *data, size_t len) { + key_perm_t perm; int rc; + perm = (KEY_POS_ALL & ~KEY_POS_SETATTR) | KEY_USR_VIEW; + rc = integrity_load_cert(INTEGRITY_KEYRING_PLATFORM, source, data, len, - &platform_key_acl); + perm); if (rc) pr_info("Error adding keys to platform keyring %s\n", source); } diff --git a/security/keys/compat.c b/security/keys/compat.c index b0e59546e7bd..9bcc404131aa 100644 --- a/security/keys/compat.c +++ b/security/keys/compat.c @@ -157,8 +157,6 @@ COMPAT_SYSCALL_DEFINE5(keyctl, u32, option, case KEYCTL_MOVE: return keyctl_keyring_move(arg2, arg3, arg4, arg5); - case KEYCTL_GRANT_PERMISSION: - return keyctl_grant_permission(arg2, arg3, arg4, arg5); case KEYCTL_CAPABILITIES: return keyctl_capabilities(compat_ptr(arg2), arg3); diff --git a/security/keys/encrypted-keys/encrypted.c b/security/keys/encrypted-keys/encrypted.c index 9df560e477c2..60720f58cbe0 100644 --- a/security/keys/encrypted-keys/encrypted.c +++ b/security/keys/encrypted-keys/encrypted.c @@ -304,7 +304,7 @@ static struct key *request_user_key(const char *master_desc, const u8 **master_k const struct user_key_payload *upayload; struct key *ukey; - ukey = request_key(&key_type_user, master_desc, NULL, NULL); + ukey = request_key(&key_type_user, master_desc, NULL); if (IS_ERR(ukey)) goto error; diff --git a/security/keys/encrypted-keys/masterkey_trusted.c b/security/keys/encrypted-keys/masterkey_trusted.c index d649f2f29475..c68528aa49c6 100644 --- a/security/keys/encrypted-keys/masterkey_trusted.c +++ b/security/keys/encrypted-keys/masterkey_trusted.c @@ -30,7 +30,7 @@ struct key *request_trusted_key(const char *trusted_desc, struct 
trusted_key_payload *tpayload; struct key *tkey; - tkey = request_key(&key_type_trusted, trusted_desc, NULL, NULL); + tkey = request_key(&key_type_trusted, trusted_desc, NULL); if (IS_ERR(tkey)) goto error; diff --git a/security/keys/gc.c b/security/keys/gc.c index 48c3e124c272..671dd730ecfc 100644 --- a/security/keys/gc.c +++ b/security/keys/gc.c @@ -151,7 +151,6 @@ static noinline void key_gc_unused_keys(struct list_head *keys) key_user_put(key->user); key_put_tag(key->domain_tag); - key_put_acl(rcu_access_pointer(key->acl)); kfree(key->description); memzero_explicit(key, sizeof(*key)); @@ -221,6 +220,7 @@ continue_scanning: if (key->type == key_gc_dead_keytype) { gc_state |= KEY_GC_FOUND_DEAD_KEY; set_bit(KEY_FLAG_DEAD, &key->flags); + key->perm = 0; goto skip_dead_key; } else if (key->type == &key_type_keyring && key->restrict_link) { diff --git a/security/keys/internal.h b/security/keys/internal.h index e0c5bb8b1685..c039373488bd 100644 --- a/security/keys/internal.h +++ b/security/keys/internal.h @@ -84,11 +84,8 @@ extern struct rb_root key_serial_tree; extern spinlock_t key_serial_lock; extern struct mutex key_construction_mutex; extern wait_queue_head_t request_key_conswq; -extern struct key_acl default_key_acl; -extern struct key_acl joinable_keyring_acl; extern void key_set_index_key(struct keyring_index_key *index_key); - extern struct key_type *key_type_lookup(const char *type); extern void key_type_put(struct key_type *ktype); @@ -159,7 +156,6 @@ extern struct key *request_key_and_link(struct key_type *type, const void *callout_info, size_t callout_len, void *aux, - struct key_acl *acl, struct key *dest_keyring, unsigned long flags); @@ -183,10 +179,7 @@ extern void key_gc_keytype(struct key_type *ktype); extern int key_task_permission(const key_ref_t key_ref, const struct cred *cred, - u32 desired_perm); -extern unsigned int key_acl_to_perm(const struct key_acl *acl); -extern long key_set_acl(struct key *key, struct key_acl *acl); -extern void 
key_put_acl(struct key_acl *acl); + key_perm_t perm); /* * Check to see whether permission is granted to use a key in the desired way. @@ -233,7 +226,7 @@ extern long keyctl_keyring_search(key_serial_t, const char __user *, const char __user *, key_serial_t); extern long keyctl_read_key(key_serial_t, char __user *, size_t); extern long keyctl_chown_key(key_serial_t, uid_t, gid_t); -extern long keyctl_setperm_key(key_serial_t, unsigned int); +extern long keyctl_setperm_key(key_serial_t, key_perm_t); extern long keyctl_instantiate_key(key_serial_t, const void __user *, size_t, key_serial_t); extern long keyctl_negate_key(key_serial_t, unsigned, key_serial_t); @@ -338,11 +331,6 @@ static inline long keyctl_pkey_e_d_s(int op, extern long keyctl_capabilities(unsigned char __user *_buffer, size_t buflen); -extern long keyctl_grant_permission(key_serial_t keyid, - enum key_ace_subject_type type, - unsigned int subject, - unsigned int perm); - /* * Debugging key validation */ diff --git a/security/keys/key.c b/security/keys/key.c index 519211a996e7..764f4c57913e 100644 --- a/security/keys/key.c +++ b/security/keys/key.c @@ -195,7 +195,7 @@ serial_exists: * @uid: The owner of the new key. * @gid: The group ID for the new key's group permissions. * @cred: The credentials specifying UID namespace. - * @acl: The ACL to attach to the new key. + * @perm: The permissions mask of the new key. * @flags: Flags specifying quota properties. * @restrict_link: Optional link restriction for new keyrings. 
* @@ -223,7 +223,7 @@ serial_exists: */ struct key *key_alloc(struct key_type *type, const char *desc, kuid_t uid, kgid_t gid, const struct cred *cred, - struct key_acl *acl, unsigned long flags, + key_perm_t perm, unsigned long flags, struct key_restriction *restrict_link) { struct key_user *user = NULL; @@ -246,9 +246,6 @@ struct key *key_alloc(struct key_type *type, const char *desc, desclen = strlen(desc); quotalen = desclen + 1 + type->def_datalen; - if (!acl) - acl = &default_key_acl; - /* get hold of the key tracking for this user */ user = key_user_lookup(uid); if (!user) @@ -295,8 +292,7 @@ struct key *key_alloc(struct key_type *type, const char *desc, key->datalen = type->def_datalen; key->uid = uid; key->gid = gid; - refcount_inc(&acl->usage); - rcu_assign_pointer(key->acl, acl); + key->perm = perm; key->restrict_link = restrict_link; key->last_used_at = ktime_get_real_seconds(); @@ -791,7 +787,7 @@ error: * @description: The searchable description for the key. * @payload: The data to use to instantiate or update the key. * @plen: The length of @payload. - * @acl: The ACL to attach if a key is created. + * @perm: The permissions mask for a new key. * @flags: The quota flags for a new key. 
* * Search the destination keyring for a key of the same description and if one @@ -814,7 +810,7 @@ key_ref_t key_create_or_update(key_ref_t keyring_ref, const char *description, const void *payload, size_t plen, - struct key_acl *acl, + key_perm_t perm, unsigned long flags) { struct keyring_index_key index_key = { @@ -911,9 +907,22 @@ key_ref_t key_create_or_update(key_ref_t keyring_ref, goto found_matching_key; } + /* if the client doesn't provide, decide on the permissions we want */ + if (perm == KEY_PERM_UNDEF) { + perm = KEY_POS_VIEW | KEY_POS_SEARCH | KEY_POS_LINK | KEY_POS_SETATTR; + perm |= KEY_USR_VIEW; + + if (index_key.type->read) + perm |= KEY_POS_READ; + + if (index_key.type == &key_type_keyring || + index_key.type->update) + perm |= KEY_POS_WRITE; + } + /* allocate a new key */ key = key_alloc(index_key.type, index_key.description, - cred->fsuid, cred->fsgid, cred, acl, flags, NULL); + cred->fsuid, cred->fsgid, cred, perm, flags, NULL); if (IS_ERR(key)) { key_ref = ERR_CAST(key); goto error_link_end; diff --git a/security/keys/keyctl.c b/security/keys/keyctl.c index c2dd66d556d4..9b898c969558 100644 --- a/security/keys/keyctl.c +++ b/security/keys/keyctl.c @@ -37,8 +37,7 @@ static const unsigned char keyrings_capabilities[2] = { KEYCTL_CAPS0_MOVE ), [1] = (KEYCTL_CAPS1_NS_KEYRING_NAME | - KEYCTL_CAPS1_NS_KEY_TAG | - KEYCTL_CAPS1_ACL_ALTERABLE), + KEYCTL_CAPS1_NS_KEY_TAG), }; static int key_get_type_from_user(char *type, @@ -131,7 +130,8 @@ SYSCALL_DEFINE5(add_key, const char __user *, _type, /* create or update the requested key and add it to the target * keyring */ key_ref = key_create_or_update(keyring_ref, type, description, - payload, plen, NULL, KEY_ALLOC_IN_QUOTA); + payload, plen, KEY_PERM_UNDEF, + KEY_ALLOC_IN_QUOTA); if (!IS_ERR(key_ref)) { ret = key_ref_to_ptr(key_ref)->serial; key_ref_put(key_ref); @@ -221,8 +221,7 @@ SYSCALL_DEFINE4(request_key, const char __user *, _type, /* do the search */ key = request_key_and_link(ktype, description, 
NULL, callout_info, - callout_len, NULL, NULL, - key_ref_to_ptr(dest_ref), + callout_len, NULL, key_ref_to_ptr(dest_ref), KEY_ALLOC_IN_QUOTA); if (IS_ERR(key)) { ret = PTR_ERR(key); @@ -384,10 +383,16 @@ long keyctl_revoke_key(key_serial_t id) struct key *key; long ret; - key_ref = lookup_user_key(id, 0, KEY_NEED_REVOKE); + key_ref = lookup_user_key(id, 0, KEY_NEED_WRITE); if (IS_ERR(key_ref)) { ret = PTR_ERR(key_ref); - goto error; + if (ret != -EACCES) + goto error; + key_ref = lookup_user_key(id, 0, KEY_NEED_SETATTR); + if (IS_ERR(key_ref)) { + ret = PTR_ERR(key_ref); + goto error; + } } key = key_ref_to_ptr(key_ref); @@ -421,7 +426,7 @@ long keyctl_invalidate_key(key_serial_t id) kenter("%d", id); - key_ref = lookup_user_key(id, 0, KEY_NEED_INVAL); + key_ref = lookup_user_key(id, 0, KEY_NEED_SEARCH); if (IS_ERR(key_ref)) { ret = PTR_ERR(key_ref); @@ -466,7 +471,7 @@ long keyctl_keyring_clear(key_serial_t ringid) struct key *keyring; long ret; - keyring_ref = lookup_user_key(ringid, KEY_LOOKUP_CREATE, KEY_NEED_CLEAR); + keyring_ref = lookup_user_key(ringid, KEY_LOOKUP_CREATE, KEY_NEED_WRITE); if (IS_ERR(keyring_ref)) { ret = PTR_ERR(keyring_ref); @@ -641,7 +646,6 @@ long keyctl_describe_key(key_serial_t keyid, size_t buflen) { struct key *key, *instkey; - unsigned int perm; key_ref_t key_ref; char *infobuf; long ret; @@ -671,10 +675,6 @@ okay: key = key_ref_to_ptr(key_ref); desclen = strlen(key->description); - rcu_read_lock(); - perm = key_acl_to_perm(rcu_dereference(key->acl)); - rcu_read_unlock(); - /* calculate how much information we're going to return */ ret = -ENOMEM; infobuf = kasprintf(GFP_KERNEL, @@ -682,7 +682,7 @@ okay: key->type->name, from_kuid_munged(current_user_ns(), key->uid), from_kgid_munged(current_user_ns(), key->gid), - perm); + key->perm); if (!infobuf) goto error2; infolen = strlen(infobuf); @@ -899,7 +899,7 @@ long keyctl_chown_key(key_serial_t id, uid_t user, gid_t group) goto error; key_ref = lookup_user_key(id, KEY_LOOKUP_CREATE | 
KEY_LOOKUP_PARTIAL, - KEY_NEED_SETSEC); + KEY_NEED_SETATTR); if (IS_ERR(key_ref)) { ret = PTR_ERR(key_ref); goto error; @@ -994,25 +994,18 @@ quota_overrun: * the key need not be fully instantiated yet. If the caller does not have * sysadmin capability, it may only change the permission on keys that it owns. */ -long keyctl_setperm_key(key_serial_t id, unsigned int perm) +long keyctl_setperm_key(key_serial_t id, key_perm_t perm) { - struct key_acl *acl; struct key *key; key_ref_t key_ref; long ret; - int nr, i, j; + ret = -EINVAL; if (perm & ~(KEY_POS_ALL | KEY_USR_ALL | KEY_GRP_ALL | KEY_OTH_ALL)) - return -EINVAL; - - nr = 0; - if (perm & KEY_POS_ALL) nr++; - if (perm & KEY_USR_ALL) nr++; - if (perm & KEY_GRP_ALL) nr++; - if (perm & KEY_OTH_ALL) nr++; + goto error; key_ref = lookup_user_key(id, KEY_LOOKUP_CREATE | KEY_LOOKUP_PARTIAL, - KEY_NEED_SETSEC); + KEY_NEED_SETATTR); if (IS_ERR(key_ref)) { ret = PTR_ERR(key_ref); goto error; @@ -1020,45 +1013,17 @@ long keyctl_setperm_key(key_serial_t id, unsigned int perm) key = key_ref_to_ptr(key_ref); - ret = -EOPNOTSUPP; - if (test_bit(KEY_FLAG_HAS_ACL, &key->flags)) - goto error_key; + /* make the changes with the locks held to prevent chown/chmod races */ + ret = -EACCES; + down_write(&key->sem); - ret = -ENOMEM; - acl = kzalloc(struct_size(acl, aces, nr), GFP_KERNEL); - if (!acl) - goto error_key; - - refcount_set(&acl->usage, 1); - acl->nr_ace = nr; - j = 0; - for (i = 0; i < 4; i++) { - struct key_ace *ace = &acl->aces[j]; - unsigned int subset = (perm >> (i * 8)) & KEY_OTH_ALL; - - if (!subset) - continue; - ace->type = KEY_ACE_SUBJ_STANDARD; - ace->subject_id = KEY_ACE_EVERYONE + i; - ace->perm = subset; - if (subset & (KEY_OTH_WRITE | KEY_OTH_SETATTR)) - ace->perm |= KEY_ACE_REVOKE; - if (subset & KEY_OTH_SEARCH) - ace->perm |= KEY_ACE_INVAL; - if (key->type == &key_type_keyring) { - if (subset & KEY_OTH_SEARCH) - ace->perm |= KEY_ACE_JOIN; - if (subset & KEY_OTH_WRITE) - ace->perm |= KEY_ACE_CLEAR; - } - j++; 
+ /* if we're not the sysadmin, we can only change a key that we own */ + if (capable(CAP_SYS_ADMIN) || uid_eq(key->uid, current_fsuid())) { + key->perm = perm; + ret = 0; } - /* make the changes with the locks held to prevent chown/chmod races */ - down_write(&key->sem); - ret = key_set_acl(key, acl); up_write(&key->sem); -error_key: key_put(key); error: return ret; @@ -1423,7 +1388,7 @@ long keyctl_set_timeout(key_serial_t id, unsigned timeout) long ret; key_ref = lookup_user_key(id, KEY_LOOKUP_CREATE | KEY_LOOKUP_PARTIAL, - KEY_NEED_SETSEC); + KEY_NEED_SETATTR); if (IS_ERR(key_ref)) { /* setting the timeout on a key under construction is permitted * if we have the authorisation token handy */ @@ -1574,7 +1539,7 @@ long keyctl_get_security(key_serial_t keyid, * Attempt to install the calling process's session keyring on the process's * parent process. * - * The keyring must exist and must grant the caller JOIN permission, and the + * The keyring must exist and must grant the caller LINK permission, and the * parent process must be single-threaded and must have the same effective * ownership as this process and mustn't be SUID/SGID. 
* @@ -1591,7 +1556,7 @@ long keyctl_session_to_parent(void) struct cred *cred; int ret; - keyring_r = lookup_user_key(KEY_SPEC_SESSION_KEYRING, 0, KEY_NEED_JOIN); + keyring_r = lookup_user_key(KEY_SPEC_SESSION_KEYRING, 0, KEY_NEED_LINK); if (IS_ERR(keyring_r)) return PTR_ERR(keyring_r); @@ -1693,7 +1658,7 @@ long keyctl_restrict_keyring(key_serial_t id, const char __user *_type, char *restriction = NULL; long ret; - key_ref = lookup_user_key(id, 0, KEY_NEED_SETSEC); + key_ref = lookup_user_key(id, 0, KEY_NEED_SETATTR); if (IS_ERR(key_ref)) return PTR_ERR(key_ref); @@ -1799,7 +1764,7 @@ SYSCALL_DEFINE5(keyctl, int, option, unsigned long, arg2, unsigned long, arg3, case KEYCTL_SETPERM: return keyctl_setperm_key((key_serial_t) arg2, - (unsigned int)arg3); + (key_perm_t) arg3); case KEYCTL_INSTANTIATE: return keyctl_instantiate_key((key_serial_t) arg2, @@ -1888,11 +1853,6 @@ SYSCALL_DEFINE5(keyctl, int, option, unsigned long, arg2, unsigned long, arg3, (key_serial_t)arg3, (key_serial_t)arg4, (unsigned int)arg5); - case KEYCTL_GRANT_PERMISSION: - return keyctl_grant_permission((key_serial_t)arg2, - (enum key_ace_subject_type)arg3, - (unsigned int)arg4, - (unsigned int)arg5); case KEYCTL_CAPABILITIES: return keyctl_capabilities((unsigned char __user *)arg2, (size_t)arg3); diff --git a/security/keys/keyring.c b/security/keys/keyring.c index 3b5458f23a95..febf36c6ddc5 100644 --- a/security/keys/keyring.c +++ b/security/keys/keyring.c @@ -515,19 +515,11 @@ static long keyring_read(const struct key *keyring, return ret; } -/** - * keyring_alloc - Allocate a keyring and link into the destination - * @description: The key description to allow the key to be searched out. - * @uid: The owner of the new key. - * @gid: The group ID for the new key's group permissions. - * @cred: The credentials specifying UID namespace. - * @acl: The ACL to attach to the new key. - * @flags: Flags specifying quota properties. - * @restrict_link: Optional link restriction for new keyrings. 
- * @dest: Destination keyring. +/* + * Allocate a keyring and link into the destination keyring. */ struct key *keyring_alloc(const char *description, kuid_t uid, kgid_t gid, - const struct cred *cred, struct key_acl *acl, + const struct cred *cred, key_perm_t perm, unsigned long flags, struct key_restriction *restrict_link, struct key *dest) @@ -536,7 +528,7 @@ struct key *keyring_alloc(const char *description, kuid_t uid, kgid_t gid, int ret; keyring = key_alloc(&key_type_keyring, description, - uid, gid, cred, acl, flags, restrict_link); + uid, gid, cred, perm, flags, restrict_link); if (!IS_ERR(keyring)) { ret = key_instantiate_and_link(keyring, NULL, 0, dest, NULL); if (ret < 0) { @@ -1140,11 +1132,10 @@ found: /* * Find a keyring with the specified name. * - * Only keyrings that have nonzero refcount, are not revoked, and are owned by - * a user in the current user namespace are considered. If @uid_keyring is - * %true, the keyring additionally must have been allocated as a user or user - * session keyring; otherwise, it must grant JOIN permission directly to the - * caller (ie. not through possession). + * Only keyrings that have nonzero refcount, are not revoked, and are owned by a + * user in the current user namespace are considered. If @uid_keyring is %true, + * the keyring additionally must have been allocated as a user or user session + * keyring; otherwise, it must grant Search permission directly to the caller. * * Returns a pointer to the keyring with the keyring's refcount having being * incremented on success. -ENOKEY is returned if a key could not be found. 
@@ -1178,7 +1169,7 @@ struct key *find_keyring_by_name(const char *name, bool uid_keyring) continue; } else { if (key_permission(make_key_ref(keyring, 0), - KEY_NEED_JOIN) < 0) + KEY_NEED_SEARCH) < 0) continue; } diff --git a/security/keys/permission.c b/security/keys/permission.c index fd8a5dc6910a..085f907b64ac 100644 --- a/security/keys/permission.c +++ b/security/keys/permission.c @@ -7,67 +7,13 @@ #include #include -#include -#include #include "internal.h" -struct key_acl default_key_acl = { - .usage = REFCOUNT_INIT(1), - .nr_ace = 2, - .possessor_viewable = true, - .aces = { - KEY_POSSESSOR_ACE(KEY_ACE__PERMS & ~KEY_ACE_JOIN), - KEY_OWNER_ACE(KEY_ACE_VIEW), - } -}; -EXPORT_SYMBOL(default_key_acl); - -struct key_acl joinable_keyring_acl = { - .usage = REFCOUNT_INIT(1), - .nr_ace = 2, - .possessor_viewable = true, - .aces = { - KEY_POSSESSOR_ACE(KEY_ACE__PERMS & ~KEY_ACE_JOIN), - KEY_OWNER_ACE(KEY_ACE_VIEW | KEY_ACE_READ | KEY_ACE_LINK | KEY_ACE_JOIN), - } -}; -EXPORT_SYMBOL(joinable_keyring_acl); - -struct key_acl internal_key_acl = { - .usage = REFCOUNT_INIT(1), - .nr_ace = 2, - .aces = { - KEY_POSSESSOR_ACE(KEY_ACE_SEARCH), - KEY_OWNER_ACE(KEY_ACE_VIEW | KEY_ACE_READ | KEY_ACE_SEARCH), - } -}; -EXPORT_SYMBOL(internal_key_acl); - -struct key_acl internal_keyring_acl = { - .usage = REFCOUNT_INIT(1), - .nr_ace = 2, - .aces = { - KEY_POSSESSOR_ACE(KEY_ACE_SEARCH), - KEY_OWNER_ACE(KEY_ACE_VIEW | KEY_ACE_READ | KEY_ACE_SEARCH), - } -}; -EXPORT_SYMBOL(internal_keyring_acl); - -struct key_acl internal_writable_keyring_acl = { - .usage = REFCOUNT_INIT(1), - .nr_ace = 2, - .aces = { - KEY_POSSESSOR_ACE(KEY_ACE_SEARCH | KEY_ACE_WRITE), - KEY_OWNER_ACE(KEY_ACE_VIEW | KEY_ACE_READ | KEY_ACE_WRITE | KEY_ACE_SEARCH), - } -}; -EXPORT_SYMBOL(internal_writable_keyring_acl); - /** * key_task_permission - Check a key can be used * @key_ref: The key to check. * @cred: The credentials to use. - * @desired_perm: The permission to check for. + * @perm: The permissions to check for. 
* * Check to see whether permission is granted to use a key in the desired way, * but permit the security modules to override. @@ -78,73 +24,53 @@ EXPORT_SYMBOL(internal_writable_keyring_acl); * permissions bits or the LSM check. */ int key_task_permission(const key_ref_t key_ref, const struct cred *cred, - unsigned int desired_perm) + unsigned perm) { - const struct key_acl *acl; - const struct key *key; - unsigned int allow = 0; - int i; - - BUILD_BUG_ON(KEY_NEED_VIEW != KEY_ACE_VIEW || - KEY_NEED_READ != KEY_ACE_READ || - KEY_NEED_WRITE != KEY_ACE_WRITE || - KEY_NEED_SEARCH != KEY_ACE_SEARCH || - KEY_NEED_LINK != KEY_ACE_LINK || - KEY_NEED_SETSEC != KEY_ACE_SET_SECURITY || - KEY_NEED_INVAL != KEY_ACE_INVAL || - KEY_NEED_REVOKE != KEY_ACE_REVOKE || - KEY_NEED_JOIN != KEY_ACE_JOIN || - KEY_NEED_CLEAR != KEY_ACE_CLEAR); + struct key *key; + key_perm_t kperm; + int ret; key = key_ref_to_ptr(key_ref); - rcu_read_lock(); - - acl = rcu_dereference(key->acl); - if (!acl || acl->nr_ace == 0) - goto no_access_rcu; + /* use the second 8-bits of permissions for keys the caller owns */ + if (uid_eq(key->uid, cred->fsuid)) { + kperm = key->perm >> 16; + goto use_these_perms; + } - for (i = 0; i < acl->nr_ace; i++) { - const struct key_ace *ace = &acl->aces[i]; + /* use the third 8-bits of permissions for keys the caller has a group + * membership in common with */ + if (gid_valid(key->gid) && key->perm & KEY_GRP_ALL) { + if (gid_eq(key->gid, cred->fsgid)) { + kperm = key->perm >> 8; + goto use_these_perms; + } - switch (ace->type) { - case KEY_ACE_SUBJ_STANDARD: - switch (ace->subject_id) { - case KEY_ACE_POSSESSOR: - if (is_key_possessed(key_ref)) - allow |= ace->perm; - break; - case KEY_ACE_OWNER: - if (uid_eq(key->uid, cred->fsuid)) - allow |= ace->perm; - break; - case KEY_ACE_GROUP: - if (gid_valid(key->gid)) { - if (gid_eq(key->gid, cred->fsgid)) - allow |= ace->perm; - else if (groups_search(cred->group_info, key->gid)) - allow |= ace->perm; - } - break; - case 
KEY_ACE_EVERYONE: - allow |= ace->perm; - break; - } - break; + ret = groups_search(cred->group_info, key->gid); + if (ret) { + kperm = key->perm >> 8; + goto use_these_perms; } } - rcu_read_unlock(); + /* otherwise use the least-significant 8-bits */ + kperm = key->perm; + +use_these_perms: - if (!(allow & desired_perm)) - goto no_access; + /* use the top 8-bits of permissions for keys the caller possesses + * - possessor permissions are additive with other permissions + */ + if (is_key_possessed(key_ref)) + kperm |= key->perm >> 24; - return security_key_permission(key_ref, cred, desired_perm); + kperm = kperm & perm & KEY_NEED_ALL; -no_access_rcu: - rcu_read_unlock(); -no_access: - return -EACCES; + if (kperm != perm) + return -EACCES; + + /* let LSM be the final arbiter */ + return security_key_permission(key_ref, cred, perm); } EXPORT_SYMBOL(key_task_permission); @@ -178,218 +104,3 @@ int key_validate(const struct key *key) return 0; } EXPORT_SYMBOL(key_validate); - -/* - * Roughly render an ACL to an old-style permissions mask. We cannot - * accurately render what the ACL, particularly if it has ACEs that represent - * subjects outside of { poss, user, group, other }. 
- */ -unsigned int key_acl_to_perm(const struct key_acl *acl) -{ - unsigned int perm = 0, tperm; - int i; - - BUILD_BUG_ON(KEY_OTH_VIEW != KEY_ACE_VIEW || - KEY_OTH_READ != KEY_ACE_READ || - KEY_OTH_WRITE != KEY_ACE_WRITE || - KEY_OTH_SEARCH != KEY_ACE_SEARCH || - KEY_OTH_LINK != KEY_ACE_LINK || - KEY_OTH_SETATTR != KEY_ACE_SET_SECURITY); - - if (!acl || acl->nr_ace == 0) - return 0; - - for (i = 0; i < acl->nr_ace; i++) { - const struct key_ace *ace = &acl->aces[i]; - - switch (ace->type) { - case KEY_ACE_SUBJ_STANDARD: - tperm = ace->perm & KEY_OTH_ALL; - - /* Invalidation and joining were allowed by SEARCH */ - if (ace->perm & (KEY_ACE_INVAL | KEY_ACE_JOIN)) - tperm |= KEY_OTH_SEARCH; - - /* Revocation was allowed by either SETATTR or WRITE */ - if ((ace->perm & KEY_ACE_REVOKE) && !(tperm & KEY_OTH_SETATTR)) - tperm |= KEY_OTH_WRITE; - - /* Clearing was allowed by WRITE */ - if (ace->perm & KEY_ACE_CLEAR) - tperm |= KEY_OTH_WRITE; - - switch (ace->subject_id) { - case KEY_ACE_POSSESSOR: - perm |= tperm << 24; - break; - case KEY_ACE_OWNER: - perm |= tperm << 16; - break; - case KEY_ACE_GROUP: - perm |= tperm << 8; - break; - case KEY_ACE_EVERYONE: - perm |= tperm << 0; - break; - } - } - } - - return perm; -} - -/* - * Destroy a key's ACL. - */ -void key_put_acl(struct key_acl *acl) -{ - if (acl && refcount_dec_and_test(&acl->usage)) - kfree_rcu(acl, rcu); -} - -/* - * Try to set the ACL. This either attaches or discards the proposed ACL. - */ -long key_set_acl(struct key *key, struct key_acl *acl) -{ - int i; - - /* If we're not the sysadmin, we can only change a key that we own. 
*/ - if (!capable(CAP_SYS_ADMIN) && !uid_eq(key->uid, current_fsuid())) { - key_put_acl(acl); - return -EACCES; - } - - for (i = 0; i < acl->nr_ace; i++) { - const struct key_ace *ace = &acl->aces[i]; - if (ace->type == KEY_ACE_SUBJ_STANDARD && - ace->subject_id == KEY_ACE_POSSESSOR) { - if (ace->perm & KEY_ACE_VIEW) - acl->possessor_viewable = true; - break; - } - } - - rcu_swap_protected(key->acl, acl, lockdep_is_held(&key->sem)); - key_put_acl(acl); - return 0; -} - -/* - * Allocate a new ACL with an extra ACE slot. - */ -static struct key_acl *key_alloc_acl(const struct key_acl *old_acl, int nr, int skip) -{ - struct key_acl *acl; - int nr_ace, i, j = 0; - - nr_ace = old_acl->nr_ace + nr; - if (nr_ace > 16) - return ERR_PTR(-EINVAL); - - acl = kzalloc(struct_size(acl, aces, nr_ace), GFP_KERNEL); - if (!acl) - return ERR_PTR(-ENOMEM); - - refcount_set(&acl->usage, 1); - acl->nr_ace = nr_ace; - for (i = 0; i < old_acl->nr_ace; i++) { - if (i == skip) - continue; - acl->aces[j] = old_acl->aces[i]; - j++; - } - return acl; -} - -/* - * Generate the revised ACL. - */ -static long key_change_acl(struct key *key, struct key_ace *new_ace) -{ - struct key_acl *acl, *old; - int i; - - old = rcu_dereference_protected(key->acl, lockdep_is_held(&key->sem)); - - for (i = 0; i < old->nr_ace; i++) - if (old->aces[i].type == new_ace->type && - old->aces[i].subject_id == new_ace->subject_id) - goto found_match; - - if (new_ace->perm == 0) - return 0; /* No permissions to remove. Add deny record? 
*/ - - acl = key_alloc_acl(old, 1, -1); - if (IS_ERR(acl)) - return PTR_ERR(acl); - acl->aces[i] = *new_ace; - goto change; - -found_match: - if (new_ace->perm == 0) - goto delete_ace; - if (new_ace->perm == old->aces[i].perm) - return 0; - acl = key_alloc_acl(old, 0, -1); - if (IS_ERR(acl)) - return PTR_ERR(acl); - acl->aces[i].perm = new_ace->perm; - goto change; - -delete_ace: - acl = key_alloc_acl(old, -1, i); - if (IS_ERR(acl)) - return PTR_ERR(acl); - goto change; - -change: - return key_set_acl(key, acl); -} - -/* - * Add, alter or remove (if perm == 0) an ACE in a key's ACL. - */ -long keyctl_grant_permission(key_serial_t keyid, - enum key_ace_subject_type type, - unsigned int subject, - unsigned int perm) -{ - struct key_ace new_ace; - struct key *key; - key_ref_t key_ref; - long ret; - - new_ace.type = type; - new_ace.perm = perm; - - switch (type) { - case KEY_ACE_SUBJ_STANDARD: - if (subject >= nr__key_ace_standard_subject) - return -ENOENT; - new_ace.subject_id = subject; - break; - - default: - return -ENOENT; - } - - key_ref = lookup_user_key(keyid, KEY_LOOKUP_PARTIAL, KEY_NEED_SETSEC); - if (IS_ERR(key_ref)) { - ret = PTR_ERR(key_ref); - goto error; - } - - key = key_ref_to_ptr(key_ref); - - down_write(&key->sem); - - /* If we're not the sysadmin, we can only change a key that we own */ - ret = -EACCES; - if (capable(CAP_SYS_ADMIN) || uid_eq(key->uid, current_fsuid())) - ret = key_change_acl(key, &new_ace); - up_write(&key->sem); - key_put(key); -error: - return ret; -} diff --git a/security/keys/persistent.c b/security/keys/persistent.c index 8171c90d4c9a..97af230aa4b2 100644 --- a/security/keys/persistent.c +++ b/security/keys/persistent.c @@ -12,27 +12,6 @@ unsigned persistent_keyring_expiry = 3 * 24 * 3600; /* Expire after 3 days of non-use */ -static struct key_acl persistent_register_keyring_acl = { - .usage = REFCOUNT_INIT(1), - .nr_ace = 2, - .aces = { - KEY_POSSESSOR_ACE(KEY_ACE_SEARCH | KEY_ACE_WRITE), - KEY_OWNER_ACE(KEY_ACE_VIEW | 
KEY_ACE_READ), - } -}; - -static struct key_acl persistent_keyring_acl = { - .usage = REFCOUNT_INIT(1), - .nr_ace = 2, - .possessor_viewable = true, - .aces = { - KEY_POSSESSOR_ACE(KEY_ACE_VIEW | KEY_ACE_READ | KEY_ACE_WRITE | - KEY_ACE_SEARCH | KEY_ACE_LINK | - KEY_ACE_CLEAR | KEY_ACE_INVAL), - KEY_OWNER_ACE(KEY_ACE_VIEW | KEY_ACE_READ), - } -}; - /* * Create the persistent keyring register for the current user namespace. * @@ -43,7 +22,8 @@ static int key_create_persistent_register(struct user_namespace *ns) struct key *reg = keyring_alloc(".persistent_register", KUIDT_INIT(0), KGIDT_INIT(0), current_cred(), - &persistent_register_keyring_acl, + ((KEY_POS_ALL & ~KEY_POS_SETATTR) | + KEY_USR_VIEW | KEY_USR_READ), KEY_ALLOC_NOT_IN_QUOTA, NULL, NULL); if (IS_ERR(reg)) return PTR_ERR(reg); @@ -76,7 +56,8 @@ static key_ref_t key_create_persistent(struct user_namespace *ns, kuid_t uid, persistent = keyring_alloc(index_key->description, uid, INVALID_GID, current_cred(), - &persistent_keyring_acl, + ((KEY_POS_ALL & ~KEY_POS_SETATTR) | + KEY_USR_VIEW | KEY_USR_READ), KEY_ALLOC_NOT_IN_QUOTA, NULL, ns->persistent_keyring_register); if (IS_ERR(persistent)) diff --git a/security/keys/proc.c b/security/keys/proc.c index b394ad1e874b..415f3f1c2da0 100644 --- a/security/keys/proc.c +++ b/security/keys/proc.c @@ -110,13 +110,11 @@ static struct key *find_ge_key(struct seq_file *p, key_serial_t id) } static void *proc_keys_start(struct seq_file *p, loff_t *_pos) - __acquires(rcu) __acquires(key_serial_lock) { key_serial_t pos = *_pos; struct key *key; - rcu_read_lock(); spin_lock(&key_serial_lock); if (*_pos > INT_MAX) @@ -146,15 +144,12 @@ static void *proc_keys_next(struct seq_file *p, void *v, loff_t *_pos) static void proc_keys_stop(struct seq_file *p, void *v) __releases(key_serial_lock) - __releases(rcu) { spin_unlock(&key_serial_lock); - rcu_read_unlock(); } static int proc_keys_show(struct seq_file *m, void *v) { - const struct key_acl *acl; struct rb_node *_p = v; struct 
key *key = rb_entry(_p, struct key, serial_node); unsigned long flags; @@ -162,7 +157,6 @@ static int proc_keys_show(struct seq_file *m, void *v) time64_t now, expiry; char xbuf[16]; short state; - bool check_pos; u64 timo; int rc; @@ -176,15 +170,15 @@ static int proc_keys_show(struct seq_file *m, void *v) KEYRING_SEARCH_RECURSE), }; - acl = rcu_dereference(key->acl); - check_pos = acl->possessor_viewable; + key_ref = make_key_ref(key, 0); /* determine if the key is possessed by this process (a test we can * skip if the key does not indicate the possessor can view it */ - key_ref = make_key_ref(key, 0); - if (check_pos) { + if (key->perm & KEY_POS_VIEW) { + rcu_read_lock(); skey_ref = search_cred_keyrings_rcu(&ctx); + rcu_read_unlock(); if (!IS_ERR(skey_ref)) { key_ref_put(skey_ref); key_ref = make_key_ref(key, 1); @@ -194,10 +188,12 @@ static int proc_keys_show(struct seq_file *m, void *v) /* check whether the current task is allowed to view the key */ rc = key_task_permission(key_ref, ctx.cred, KEY_NEED_VIEW); if (rc < 0) - goto out; + return 0; now = ktime_get_real_seconds(); + rcu_read_lock(); + /* come up with a suitable timeout value */ expiry = READ_ONCE(key->expiry); if (expiry == 0) { @@ -236,7 +232,7 @@ static int proc_keys_show(struct seq_file *m, void *v) showflag(flags, 'i', KEY_FLAG_INVALIDATED), refcount_read(&key->usage), xbuf, - key_acl_to_perm(acl), + key->perm, from_kuid_munged(seq_user_ns(m), key->uid), from_kgid_munged(seq_user_ns(m), key->gid), key->type->name); @@ -247,7 +243,7 @@ static int proc_keys_show(struct seq_file *m, void *v) key->type->describe(key, m); seq_putc(m, '\n'); -out: + rcu_read_unlock(); return 0; } diff --git a/security/keys/process_keys.c b/security/keys/process_keys.c index aa3bfcadbc66..09541de31f2f 100644 --- a/security/keys/process_keys.c +++ b/security/keys/process_keys.c @@ -32,47 +32,6 @@ struct key_user root_key_user = { .uid = GLOBAL_ROOT_UID, }; -static struct key_acl user_reg_keyring_acl = { - .usage = 
REFCOUNT_INIT(1), - .possessor_viewable = true, - .nr_ace = 2, - .aces = { - KEY_POSSESSOR_ACE(KEY_ACE_WRITE | KEY_ACE_SEARCH), - KEY_OWNER_ACE(KEY_ACE_VIEW | KEY_ACE_READ), - } -}; - -static struct key_acl user_keyring_acl = { - .usage = REFCOUNT_INIT(1), - .possessor_viewable = true, - .nr_ace = 2, - .aces = { - KEY_POSSESSOR_ACE(KEY_ACE_VIEW | KEY_ACE_READ | KEY_ACE_WRITE | - KEY_ACE_SEARCH | KEY_ACE_LINK), - KEY_OWNER_ACE(KEY_ACE__PERMS & ~(KEY_ACE_JOIN | KEY_ACE_SET_SECURITY)), - } -}; - -static struct key_acl session_keyring_acl = { - .usage = REFCOUNT_INIT(1), - .possessor_viewable = true, - .nr_ace = 2, - .aces = { - KEY_POSSESSOR_ACE(KEY_ACE__PERMS & ~KEY_ACE_JOIN), - KEY_OWNER_ACE(KEY_ACE_VIEW | KEY_ACE_READ), - } -}; - -static struct key_acl thread_and_process_keyring_acl = { - .usage = REFCOUNT_INIT(1), - .possessor_viewable = true, - .nr_ace = 2, - .aces = { - KEY_POSSESSOR_ACE(KEY_ACE__PERMS & ~(KEY_ACE_JOIN | KEY_ACE_SET_SECURITY)), - KEY_OWNER_ACE(KEY_ACE_VIEW), - } -}; - /* * Get or create a user register keyring. 
*/ @@ -92,8 +51,11 @@ static struct key *get_user_register(struct user_namespace *user_ns) if (!reg_keyring) { reg_keyring = keyring_alloc(".user_reg", user_ns->owner, INVALID_GID, - &init_cred, &user_reg_keyring_acl, - 0, NULL, NULL); + &init_cred, + KEY_POS_WRITE | KEY_POS_SEARCH | + KEY_USR_VIEW | KEY_USR_READ, + 0, + NULL, NULL); if (!IS_ERR(reg_keyring)) smp_store_release(&user_ns->user_keyring_register, reg_keyring); @@ -115,11 +77,14 @@ int look_up_user_keyrings(struct key **_user_keyring, const struct cred *cred = current_cred(); struct user_namespace *user_ns = current_user_ns(); struct key *reg_keyring, *uid_keyring, *session_keyring; + key_perm_t user_keyring_perm; key_ref_t uid_keyring_r, session_keyring_r; uid_t uid = from_kuid(user_ns, cred->user->uid); char buf[20]; int ret; + user_keyring_perm = (KEY_POS_ALL & ~KEY_POS_SETATTR) | KEY_USR_ALL; + kenter("%u", uid); reg_keyring = get_user_register(user_ns); @@ -139,7 +104,7 @@ int look_up_user_keyrings(struct key **_user_keyring, kdebug("_uid %p", uid_keyring_r); if (uid_keyring_r == ERR_PTR(-EAGAIN)) { uid_keyring = keyring_alloc(buf, cred->user->uid, INVALID_GID, - cred, &user_keyring_acl, + cred, user_keyring_perm, KEY_ALLOC_UID_KEYRING | KEY_ALLOC_IN_QUOTA, NULL, reg_keyring); @@ -161,7 +126,7 @@ int look_up_user_keyrings(struct key **_user_keyring, kdebug("_uid_ses %p", session_keyring_r); if (session_keyring_r == ERR_PTR(-EAGAIN)) { session_keyring = keyring_alloc(buf, cred->user->uid, INVALID_GID, - cred, &user_keyring_acl, + cred, user_keyring_perm, KEY_ALLOC_UID_KEYRING | KEY_ALLOC_IN_QUOTA, NULL, NULL); @@ -261,7 +226,7 @@ int install_thread_keyring_to_cred(struct cred *new) return 0; keyring = keyring_alloc("_tid", new->uid, new->gid, new, - &thread_and_process_keyring_acl, + KEY_POS_ALL | KEY_USR_VIEW, KEY_ALLOC_QUOTA_OVERRUN, NULL, NULL); if (IS_ERR(keyring)) @@ -308,7 +273,7 @@ int install_process_keyring_to_cred(struct cred *new) return 0; keyring = keyring_alloc("_pid", new->uid, 
new->gid, new, - &thread_and_process_keyring_acl, + KEY_POS_ALL | KEY_USR_VIEW, KEY_ALLOC_QUOTA_OVERRUN, NULL, NULL); if (IS_ERR(keyring)) @@ -363,7 +328,8 @@ int install_session_keyring_to_cred(struct cred *cred, struct key *keyring) flags = KEY_ALLOC_IN_QUOTA; keyring = keyring_alloc("_ses", cred->uid, cred->gid, cred, - &session_keyring_acl, flags, NULL, NULL); + KEY_POS_ALL | KEY_USR_VIEW | KEY_USR_READ, + flags, NULL, NULL); if (IS_ERR(keyring)) return PTR_ERR(keyring); } else { @@ -643,7 +609,7 @@ bool lookup_user_key_possessed(const struct key *key, * returned key reference. */ key_ref_t lookup_user_key(key_serial_t id, unsigned long lflags, - unsigned int desired_perm) + key_perm_t perm) { struct keyring_search_context ctx = { .match_data.cmp = lookup_user_key_possessed, @@ -818,12 +784,12 @@ try_again: case -ERESTARTSYS: goto invalid_key; default: - if (desired_perm) + if (perm) goto invalid_key; case 0: break; } - } else if (desired_perm) { + } else if (perm) { ret = key_validate(key); if (ret < 0) goto invalid_key; @@ -835,11 +801,9 @@ try_again: goto invalid_key; /* check the permissions */ - if (desired_perm) { - ret = key_task_permission(key_ref, ctx.cred, desired_perm); - if (ret < 0) - goto invalid_key; - } + ret = key_task_permission(key_ref, ctx.cred, perm); + if (ret < 0) + goto invalid_key; key->last_used_at = ktime_get_real_seconds(); @@ -904,13 +868,13 @@ long join_session_keyring(const char *name) if (PTR_ERR(keyring) == -ENOKEY) { /* not found - try and create a new one */ keyring = keyring_alloc( - name, old->uid, old->gid, old, &joinable_keyring_acl, + name, old->uid, old->gid, old, + KEY_POS_ALL | KEY_USR_VIEW | KEY_USR_READ | KEY_USR_LINK, KEY_ALLOC_IN_QUOTA, NULL, NULL); if (IS_ERR(keyring)) { ret = PTR_ERR(keyring); goto error2; } - goto no_perm_test; } else if (IS_ERR(keyring)) { ret = PTR_ERR(keyring); goto error2; @@ -919,12 +883,6 @@ long join_session_keyring(const char *name) goto error3; } - ret = 
key_task_permission(make_key_ref(keyring, false), old, - KEY_NEED_JOIN); - if (ret < 0) - goto error3; - -no_perm_test: /* we've got a keyring - now to install it */ ret = install_session_keyring_to_cred(new, keyring); if (ret < 0) diff --git a/security/keys/request_key.c b/security/keys/request_key.c index 46c5187ce03f..7325f382dbf4 100644 --- a/security/keys/request_key.c +++ b/security/keys/request_key.c @@ -135,7 +135,8 @@ static int call_sbin_request_key(struct key *authkey, void *aux) cred = get_current_cred(); keyring = keyring_alloc(desc, cred->fsuid, cred->fsgid, cred, - NULL, KEY_ALLOC_QUOTA_OVERRUN, NULL, NULL); + KEY_POS_ALL | KEY_USR_VIEW | KEY_USR_READ, + KEY_ALLOC_QUOTA_OVERRUN, NULL, NULL); put_cred(cred); if (IS_ERR(keyring)) { ret = PTR_ERR(keyring); @@ -366,11 +367,11 @@ static int construct_alloc_key(struct keyring_search_context *ctx, struct key *dest_keyring, unsigned long flags, struct key_user *user, - struct key_acl *acl, struct key **_key) { struct assoc_array_edit *edit = NULL; struct key *key; + key_perm_t perm; key_ref_t key_ref; int ret; @@ -380,9 +381,17 @@ static int construct_alloc_key(struct keyring_search_context *ctx, *_key = NULL; mutex_lock(&user->cons_lock); + perm = KEY_POS_VIEW | KEY_POS_SEARCH | KEY_POS_LINK | KEY_POS_SETATTR; + perm |= KEY_USR_VIEW; + if (ctx->index_key.type->read) + perm |= KEY_POS_READ; + if (ctx->index_key.type == &key_type_keyring || + ctx->index_key.type->update) + perm |= KEY_POS_WRITE; + key = key_alloc(ctx->index_key.type, ctx->index_key.description, ctx->cred->fsuid, ctx->cred->fsgid, ctx->cred, - acl, flags, NULL); + perm, flags, NULL); if (IS_ERR(key)) goto alloc_failed; @@ -465,7 +474,6 @@ static struct key *construct_key_and_link(struct keyring_search_context *ctx, const char *callout_info, size_t callout_len, void *aux, - struct key_acl *acl, struct key *dest_keyring, unsigned long flags) { @@ -488,7 +496,7 @@ static struct key *construct_key_and_link(struct keyring_search_context *ctx, goto 
error_put_dest_keyring; } - ret = construct_alloc_key(ctx, dest_keyring, flags, user, acl, &key); + ret = construct_alloc_key(ctx, dest_keyring, flags, user, &key); key_user_put(user); if (ret == 0) { @@ -526,7 +534,6 @@ error: * @callout_info: The data to pass to the instantiation upcall (or NULL). * @callout_len: The length of callout_info. * @aux: Auxiliary data for the upcall. - * @acl: The ACL to attach if a new key is created. * @dest_keyring: Where to cache the key. * @flags: Flags to key_alloc(). * @@ -554,7 +561,6 @@ struct key *request_key_and_link(struct key_type *type, const void *callout_info, size_t callout_len, void *aux, - struct key_acl *acl, struct key *dest_keyring, unsigned long flags) { @@ -629,7 +635,7 @@ struct key *request_key_and_link(struct key_type *type, goto error_free; key = construct_key_and_link(&ctx, callout_info, callout_len, - aux, acl, dest_keyring, flags); + aux, dest_keyring, flags); } error_free: @@ -672,7 +678,6 @@ EXPORT_SYMBOL(wait_for_key_construction); * @description: The searchable description of the key. * @domain_tag: The domain in which the key operates. * @callout_info: The data to pass to the instantiation upcall (or NULL). - * @acl: The ACL to attach if a new key is created. 
* * As for request_key_and_link() except that it does not add the returned key * to a keyring if found, new keys are always allocated in the user's quota, @@ -685,8 +690,7 @@ EXPORT_SYMBOL(wait_for_key_construction); struct key *request_key_tag(struct key_type *type, const char *description, struct key_tag *domain_tag, - const char *callout_info, - struct key_acl *acl) + const char *callout_info) { struct key *key; size_t callout_len = 0; @@ -696,7 +700,7 @@ struct key *request_key_tag(struct key_type *type, callout_len = strlen(callout_info); key = request_key_and_link(type, description, domain_tag, callout_info, callout_len, - NULL, acl, NULL, KEY_ALLOC_IN_QUOTA); + NULL, NULL, KEY_ALLOC_IN_QUOTA); if (!IS_ERR(key)) { ret = wait_for_key_construction(key, false); if (ret < 0) { @@ -716,7 +720,6 @@ EXPORT_SYMBOL(request_key_tag); * @callout_info: The data to pass to the instantiation upcall (or NULL). * @callout_len: The length of callout_info. * @aux: Auxiliary data for the upcall. - * @acl: The ACL to attach if a new key is created. * * As for request_key_and_link() except that it does not add the returned key * to a keyring if found and new keys are always allocated in the user's quota. 
@@ -729,15 +732,14 @@ struct key *request_key_with_auxdata(struct key_type *type, struct key_tag *domain_tag, const void *callout_info, size_t callout_len, - void *aux, - struct key_acl *acl) + void *aux) { struct key *key; int ret; key = request_key_and_link(type, description, domain_tag, callout_info, callout_len, - aux, acl, NULL, KEY_ALLOC_IN_QUOTA); + aux, NULL, KEY_ALLOC_IN_QUOTA); if (!IS_ERR(key)) { ret = wait_for_key_construction(key, false); if (ret < 0) { diff --git a/security/keys/request_key_auth.c b/security/keys/request_key_auth.c index 27e437d94b81..e73ec040e250 100644 --- a/security/keys/request_key_auth.c +++ b/security/keys/request_key_auth.c @@ -24,17 +24,6 @@ static void request_key_auth_revoke(struct key *); static void request_key_auth_destroy(struct key *); static long request_key_auth_read(const struct key *, char __user *, size_t); -static struct key_acl request_key_auth_acl = { - .usage = REFCOUNT_INIT(1), - .nr_ace = 2, - .possessor_viewable = true, - .aces = { - KEY_POSSESSOR_ACE(KEY_ACE_VIEW | KEY_ACE_READ | KEY_ACE_SEARCH | - KEY_ACE_LINK), - KEY_OWNER_ACE(KEY_ACE_VIEW), - } -}; - /* * The request-key authorisation key type definition. 
*/ @@ -221,8 +210,8 @@ struct key *request_key_auth_new(struct key *target, const char *op, authkey = key_alloc(&key_type_request_key_auth, desc, cred->fsuid, cred->fsgid, cred, - &request_key_auth_acl, - KEY_ALLOC_NOT_IN_QUOTA, NULL); + KEY_POS_VIEW | KEY_POS_READ | KEY_POS_SEARCH | KEY_POS_LINK | + KEY_USR_VIEW, KEY_ALLOC_NOT_IN_QUOTA, NULL); if (IS_ERR(authkey)) { ret = PTR_ERR(authkey); goto error_free_rka; diff --git a/security/selinux/hooks.c b/security/selinux/hooks.c index 4bef86ed463b..74dd46de01b6 100644 --- a/security/selinux/hooks.c +++ b/security/selinux/hooks.c @@ -6502,7 +6502,6 @@ static int selinux_key_permission(key_ref_t key_ref, { struct key *key; struct key_security_struct *ksec; - unsigned oldstyle_perm; u32 sid; /* if no specific permissions are requested, we skip the @@ -6511,26 +6510,13 @@ static int selinux_key_permission(key_ref_t key_ref, if (perm == 0) return 0; - oldstyle_perm = perm & (KEY_NEED_VIEW | KEY_NEED_READ | KEY_NEED_WRITE | - KEY_NEED_SEARCH | KEY_NEED_LINK); - if (perm & KEY_NEED_SETSEC) - oldstyle_perm |= OLD_KEY_NEED_SETATTR; - if (perm & KEY_NEED_INVAL) - oldstyle_perm |= KEY_NEED_SEARCH; - if (perm & KEY_NEED_REVOKE && !(perm & OLD_KEY_NEED_SETATTR)) - oldstyle_perm |= KEY_NEED_WRITE; - if (perm & KEY_NEED_JOIN) - oldstyle_perm |= KEY_NEED_SEARCH; - if (perm & KEY_NEED_CLEAR) - oldstyle_perm |= KEY_NEED_WRITE; - sid = cred_sid(cred); key = key_ref_to_ptr(key_ref); ksec = key->security; return avc_has_perm(&selinux_state, - sid, ksec->sid, SECCLASS_KEY, oldstyle_perm, NULL); + sid, ksec->sid, SECCLASS_KEY, perm, NULL); } static int selinux_key_getsecurity(struct key *key, char **_buffer) diff --git a/security/smack/smack_lsm.c b/security/smack/smack_lsm.c index 50c536cad85b..4c5e5a438f8b 100644 --- a/security/smack/smack_lsm.c +++ b/security/smack/smack_lsm.c @@ -4284,8 +4284,7 @@ static int smack_key_permission(key_ref_t key_ref, #endif if (perm & (KEY_NEED_READ | KEY_NEED_SEARCH | KEY_NEED_VIEW)) request |= MAY_READ; - 
if (perm & (KEY_NEED_WRITE | KEY_NEED_LINK | KEY_NEED_SETSEC | - KEY_NEED_INVAL | KEY_NEED_REVOKE | KEY_NEED_CLEAR)) + if (perm & (KEY_NEED_WRITE | KEY_NEED_LINK | KEY_NEED_SETATTR)) request |= MAY_WRITE; rc = smk_access(tkp, keyp->security, request, &ad); rc = smk_bu_note("key access", tkp, keyp->security, request, rc); -- cgit v1.2.3-71-gd317 From 22be8233b34f4f468934c5fefcbe6151766fb8f2 Mon Sep 17 00:00:00 2001 From: Hans Verkuil Date: Thu, 11 Jul 2019 04:53:25 -0400 Subject: media: videodev2.h: change V4L2_PIX_FMT_BGRA444 define: fourcc was already in use The V4L2_PIX_FMT_BGRA444 define clashed with the pre-existing V4L2_PIX_FMT_SGRBG12 which strangely enough used the same fourcc, even though that fourcc made no sense for a Bayer format. In any case, you can't have duplicates, so change the fourcc of V4L2_PIX_FMT_BGRA444. Signed-off-by: Hans Verkuil Cc: # for v5.2 and up Fixes: 6c84f9b1d2900 ("media: v4l: Add definitions for missing 16-bit RGB4444 formats") Reviewed-by: Laurent Pinchart Reviewed-by: Kieran Bingham Signed-off-by: Mauro Carvalho Chehab --- include/uapi/linux/videodev2.h | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) (limited to 'include/uapi/linux') diff --git a/include/uapi/linux/videodev2.h b/include/uapi/linux/videodev2.h index 9d9705ceda76..2427bc4d8eba 100644 --- a/include/uapi/linux/videodev2.h +++ b/include/uapi/linux/videodev2.h @@ -518,7 +518,13 @@ struct v4l2_pix_format { #define V4L2_PIX_FMT_RGBX444 v4l2_fourcc('R', 'X', '1', '2') /* 16 rrrrgggg bbbbxxxx */ #define V4L2_PIX_FMT_ABGR444 v4l2_fourcc('A', 'B', '1', '2') /* 16 aaaabbbb ggggrrrr */ #define V4L2_PIX_FMT_XBGR444 v4l2_fourcc('X', 'B', '1', '2') /* 16 xxxxbbbb ggggrrrr */ -#define V4L2_PIX_FMT_BGRA444 v4l2_fourcc('B', 'A', '1', '2') /* 16 bbbbgggg rrrraaaa */ + +/* + * Originally this had 'BA12' as fourcc, but this clashed with the older + * V4L2_PIX_FMT_SGRBG12 which inexplicably used that same fourcc. + * So use 'GA12' instead for V4L2_PIX_FMT_BGRA444. 
+ */ +#define V4L2_PIX_FMT_BGRA444 v4l2_fourcc('G', 'A', '1', '2') /* 16 bbbbgggg rrrraaaa */ #define V4L2_PIX_FMT_BGRX444 v4l2_fourcc('B', 'X', '1', '2') /* 16 bbbbgggg rrrrxxxx */ #define V4L2_PIX_FMT_RGB555 v4l2_fourcc('R', 'G', 'B', 'O') /* 16 RGB-5-5-5 */ #define V4L2_PIX_FMT_ARGB555 v4l2_fourcc('A', 'R', '1', '5') /* 16 ARGB-1-5-5-5 */ -- cgit v1.2.3-71-gd317 From 66bb8a065f5aedd4551d8d3fbce582972f65c2e1 Mon Sep 17 00:00:00 2001 From: Eric Hankland Date: Wed, 10 Jul 2019 18:25:15 -0700 Subject: KVM: x86: PMU Event Filter Some events can provide a guest with information about other guests or the host (e.g. L3 cache stats); providing the capability to restrict access to a "safe" set of events would limit the potential for the PMU to be used in any side channel attacks. This change introduces a new VM ioctl that sets an event filter. If the guest attempts to program a counter for any blacklisted or non-whitelisted event, the kernel counter won't be created, so any RDPMC/RDMSR will show 0 instances of that event. Signed-off-by: Eric Hankland [Lots of changes. All remaining bugs are probably mine. - Paolo] Signed-off-by: Paolo Bonzini --- Documentation/virtual/kvm/api.txt | 26 ++++++++++++++++ arch/x86/include/asm/kvm_host.h | 2 ++ arch/x86/include/uapi/asm/kvm.h | 10 +++++++ arch/x86/kvm/pmu.c | 63 +++++++++++++++++++++++++++++++++++++++ arch/x86/kvm/pmu.h | 1 + arch/x86/kvm/x86.c | 5 ++++ include/uapi/linux/kvm.h | 3 ++ 7 files changed, 110 insertions(+) (limited to 'include/uapi/linux') diff --git a/Documentation/virtual/kvm/api.txt b/Documentation/virtual/kvm/api.txt index 91fd86fcc49f..38b0d4451a24 100644 --- a/Documentation/virtual/kvm/api.txt +++ b/Documentation/virtual/kvm/api.txt @@ -4065,6 +4065,32 @@ KVM_ARM_VCPU_FINALIZE call. See KVM_ARM_VCPU_INIT for details of vcpu features that require finalization using this ioctl. 
+4.120 KVM_SET_PMU_EVENT_FILTER + +Capability: KVM_CAP_PMU_EVENT_FILTER +Architectures: x86 +Type: vm ioctl +Parameters: struct kvm_pmu_event_filter (in) +Returns: 0 on success, -1 on error + +struct kvm_pmu_event_filter { + __u32 action; + __u32 nevents; + __u64 events[0]; +}; + +This ioctl restricts the set of PMU events that the guest can program. +The argument holds a list of events which will be allowed or denied. +The eventsel+umask of each event the guest attempts to program is compared +against the events field to determine whether the guest should have access. +This only affects general purpose counters; fixed purpose counters can +be disabled by changing the perfmon CPUID leaf. + +Valid values for 'action': +#define KVM_PMU_EVENT_ALLOW 0 +#define KVM_PMU_EVENT_DENY 1 + + 5. The kvm_run structure ------------------------ diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index f46a12a5cf2e..34d017bd1d1b 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h @@ -933,6 +933,8 @@ struct kvm_arch { bool guest_can_read_msr_platform_info; bool exception_payload_enabled; + + struct kvm_pmu_event_filter *pmu_event_filter; }; struct kvm_vm_stat { diff --git a/arch/x86/include/uapi/asm/kvm.h b/arch/x86/include/uapi/asm/kvm.h index f9b021e16ebc..46588f5d6283 100644 --- a/arch/x86/include/uapi/asm/kvm.h +++ b/arch/x86/include/uapi/asm/kvm.h @@ -422,4 +422,14 @@ struct kvm_nested_state { __u8 data[0]; }; +/* for KVM_CAP_PMU_EVENT_FILTER */ +struct kvm_pmu_event_filter { + __u32 action; + __u32 nevents; + __u64 events[0]; +}; + +#define KVM_PMU_EVENT_ALLOW 0 +#define KVM_PMU_EVENT_DENY 1 + #endif /* _ASM_X86_KVM_H */ diff --git a/arch/x86/kvm/pmu.c b/arch/x86/kvm/pmu.c index dd745b58ffd8..9d92c4d3cd44 100644 --- a/arch/x86/kvm/pmu.c +++ b/arch/x86/kvm/pmu.c @@ -22,6 +22,9 @@ #include "lapic.h" #include "pmu.h" +/* This keeps the total size of the filter under 4k. 
*/ +#define KVM_PMU_EVENT_FILTER_MAX_EVENTS 63 + /* NOTE: * - Each perf counter is defined as "struct kvm_pmc"; * - There are two types of perf counters: general purpose (gp) and fixed. @@ -144,6 +147,10 @@ void reprogram_gp_counter(struct kvm_pmc *pmc, u64 eventsel) { unsigned config, type = PERF_TYPE_RAW; u8 event_select, unit_mask; + struct kvm *kvm = pmc->vcpu->kvm; + struct kvm_pmu_event_filter *filter; + int i; + bool allow_event = true; if (eventsel & ARCH_PERFMON_EVENTSEL_PIN_CONTROL) printk_once("kvm pmu: pin control bit is ignored\n"); @@ -155,6 +162,22 @@ void reprogram_gp_counter(struct kvm_pmc *pmc, u64 eventsel) if (!(eventsel & ARCH_PERFMON_EVENTSEL_ENABLE) || !pmc_is_enabled(pmc)) return; + filter = srcu_dereference(kvm->arch.pmu_event_filter, &kvm->srcu); + if (filter) { + for (i = 0; i < filter->nevents; i++) + if (filter->events[i] == + (eventsel & AMD64_RAW_EVENT_MASK_NB)) + break; + if (filter->action == KVM_PMU_EVENT_ALLOW && + i == filter->nevents) + allow_event = false; + if (filter->action == KVM_PMU_EVENT_DENY && + i < filter->nevents) + allow_event = false; + } + if (!allow_event) + return; + event_select = eventsel & ARCH_PERFMON_EVENTSEL_EVENT; unit_mask = (eventsel & ARCH_PERFMON_EVENTSEL_UMASK) >> 8; @@ -351,3 +374,43 @@ void kvm_pmu_destroy(struct kvm_vcpu *vcpu) { kvm_pmu_reset(vcpu); } + +int kvm_vm_ioctl_set_pmu_event_filter(struct kvm *kvm, void __user *argp) +{ + struct kvm_pmu_event_filter tmp, *filter; + size_t size; + int r; + + if (copy_from_user(&tmp, argp, sizeof(tmp))) + return -EFAULT; + + if (tmp.action != KVM_PMU_EVENT_ALLOW && + tmp.action != KVM_PMU_EVENT_DENY) + return -EINVAL; + + if (tmp.nevents > KVM_PMU_EVENT_FILTER_MAX_EVENTS) + return -E2BIG; + + size = struct_size(filter, events, tmp.nevents); + filter = kmalloc(size, GFP_KERNEL_ACCOUNT); + if (!filter) + return -ENOMEM; + + r = -EFAULT; + if (copy_from_user(filter, argp, size)) + goto cleanup; + + /* Ensure nevents can't be changed between the user copies. 
*/ + *filter = tmp; + + mutex_lock(&kvm->lock); + rcu_swap_protected(kvm->arch.pmu_event_filter, filter, + mutex_is_locked(&kvm->lock)); + mutex_unlock(&kvm->lock); + + synchronize_srcu_expedited(&kvm->srcu); + r = 0; +cleanup: + kfree(filter); + return r; +} diff --git a/arch/x86/kvm/pmu.h b/arch/x86/kvm/pmu.h index 22dff661145a..58265f761c3b 100644 --- a/arch/x86/kvm/pmu.h +++ b/arch/x86/kvm/pmu.h @@ -118,6 +118,7 @@ void kvm_pmu_refresh(struct kvm_vcpu *vcpu); void kvm_pmu_reset(struct kvm_vcpu *vcpu); void kvm_pmu_init(struct kvm_vcpu *vcpu); void kvm_pmu_destroy(struct kvm_vcpu *vcpu); +int kvm_vm_ioctl_set_pmu_event_filter(struct kvm *kvm, void __user *argp); bool is_vmware_backdoor_pmc(u32 pmc_idx); diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 2e302e977dac..81faceba8cec 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -3132,6 +3132,7 @@ int kvm_vm_ioctl_check_extension(struct kvm *kvm, long ext) case KVM_CAP_SET_BOOT_CPU_ID: case KVM_CAP_SPLIT_IRQCHIP: case KVM_CAP_IMMEDIATE_EXIT: + case KVM_CAP_PMU_EVENT_FILTER: case KVM_CAP_GET_MSR_FEATURES: case KVM_CAP_MSR_PLATFORM_INFO: case KVM_CAP_EXCEPTION_PAYLOAD: @@ -4978,6 +4979,9 @@ set_identity_unlock: r = kvm_vm_ioctl_hv_eventfd(kvm, &hvevfd); break; } + case KVM_SET_PMU_EVENT_FILTER: + r = kvm_vm_ioctl_set_pmu_event_filter(kvm, argp); + break; default: r = -ENOTTY; } @@ -9428,6 +9432,7 @@ void kvm_arch_destroy_vm(struct kvm *kvm) kvm_ioapic_destroy(kvm); kvm_free_vcpus(kvm); kvfree(rcu_dereference_check(kvm->arch.apic_map, 1)); + kfree(srcu_dereference_check(kvm->arch.pmu_event_filter, &kvm->srcu, 1)); kvm_mmu_uninit_vm(kvm); kvm_page_track_cleanup(kvm); kvm_hv_destroy_vm(kvm); diff --git a/include/uapi/linux/kvm.h b/include/uapi/linux/kvm.h index c2152f3dd02d..a7c19540ce21 100644 --- a/include/uapi/linux/kvm.h +++ b/include/uapi/linux/kvm.h @@ -995,6 +995,7 @@ struct kvm_ppc_resize_hpt { #define KVM_CAP_ARM_SVE 170 #define KVM_CAP_ARM_PTRAUTH_ADDRESS 171 #define 
KVM_CAP_ARM_PTRAUTH_GENERIC 172 +#define KVM_CAP_PMU_EVENT_FILTER 173 #ifdef KVM_CAP_IRQ_ROUTING @@ -1329,6 +1330,8 @@ struct kvm_s390_ucas_mapping { #define KVM_PPC_GET_RMMU_INFO _IOW(KVMIO, 0xb0, struct kvm_ppc_rmmu_info) /* Available with KVM_CAP_PPC_GET_CPU_CHAR */ #define KVM_PPC_GET_CPU_CHAR _IOR(KVMIO, 0xb1, struct kvm_ppc_cpu_char) +/* Available with KVM_CAP_PMU_EVENT_FILTER */ +#define KVM_SET_PMU_EVENT_FILTER _IOW(KVMIO, 0xb2, struct kvm_pmu_event_filter) /* ioctl for vm fd */ #define KVM_CREATE_DEVICE _IOWR(KVMIO, 0xe0, struct kvm_create_device) -- cgit v1.2.3-71-gd317 From c32cc30c0544f13982ee0185d55f4910319b1a79 Mon Sep 17 00:00:00 2001 From: Masahiro Yamada Date: Thu, 11 Jul 2019 20:52:18 -0700 Subject: nilfs2: do not use unexported cpu_to_le32()/le32_to_cpu() in uapi header cpu_to_le32/le32_to_cpu is defined in include/linux/byteorder/generic.h, which is not exported to user-space. UAPI headers must use the ones prefixed with double-underscore. Detected by compile-testing exported headers: include/linux/nilfs2_ondisk.h: In function `nilfs_checkpoint_set_snapshot': include/linux/nilfs2_ondisk.h:536:17: error: implicit declaration of function `cpu_to_le32' [-Werror=implicit-function-declaration] cp->cp_flags = cpu_to_le32(le32_to_cpu(cp->cp_flags) | \ ^ include/linux/nilfs2_ondisk.h:552:1: note: in expansion of macro `NILFS_CHECKPOINT_FNS' NILFS_CHECKPOINT_FNS(SNAPSHOT, snapshot) ^~~~~~~~~~~~~~~~~~~~ include/linux/nilfs2_ondisk.h:536:29: error: implicit declaration of function `le32_to_cpu' [-Werror=implicit-function-declaration] cp->cp_flags = cpu_to_le32(le32_to_cpu(cp->cp_flags) | \ ^ include/linux/nilfs2_ondisk.h:552:1: note: in expansion of macro `NILFS_CHECKPOINT_FNS' NILFS_CHECKPOINT_FNS(SNAPSHOT, snapshot) ^~~~~~~~~~~~~~~~~~~~ include/linux/nilfs2_ondisk.h: In function `nilfs_segment_usage_set_clean': include/linux/nilfs2_ondisk.h:622:19: error: implicit declaration of function `cpu_to_le64' [-Werror=implicit-function-declaration] 
su->su_lastmod = cpu_to_le64(0); ^~~~~~~~~~~ Link: http://lkml.kernel.org/r/20190605053006.14332-1-yamada.masahiro@socionext.com Fixes: e63e88bc53ba ("nilfs2: move ioctl interface and disk layout to uapi separately") Signed-off-by: Masahiro Yamada Acked-by: Ryusuke Konishi Cc: Arnd Bergmann Cc: Greg KH Cc: Joe Perches Cc: <stable@vger.kernel.org> [4.9+] Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/uapi/linux/nilfs2_ondisk.h | 24 ++++++++++++------------ 1 file changed, 12 insertions(+), 12 deletions(-) (limited to 'include/uapi/linux') diff --git a/include/uapi/linux/nilfs2_ondisk.h b/include/uapi/linux/nilfs2_ondisk.h index a7e66ab11d1d..c23f91ae5fe8 100644 --- a/include/uapi/linux/nilfs2_ondisk.h +++ b/include/uapi/linux/nilfs2_ondisk.h @@ -29,7 +29,7 @@ #include <linux/types.h> #include <linux/magic.h> - +#include <asm/byteorder.h> #define NILFS_INODE_BMAP_SIZE 7 @@ -533,19 +533,19 @@ enum { static inline void \ nilfs_checkpoint_set_##name(struct nilfs_checkpoint *cp) \ { \ - cp->cp_flags = cpu_to_le32(le32_to_cpu(cp->cp_flags) | \ - (1UL << NILFS_CHECKPOINT_##flag)); \ + cp->cp_flags = __cpu_to_le32(__le32_to_cpu(cp->cp_flags) | \ + (1UL << NILFS_CHECKPOINT_##flag)); \ } \ static inline void \ nilfs_checkpoint_clear_##name(struct nilfs_checkpoint *cp) \ { \ - cp->cp_flags = cpu_to_le32(le32_to_cpu(cp->cp_flags) & \ + cp->cp_flags = __cpu_to_le32(__le32_to_cpu(cp->cp_flags) & \ ~(1UL << NILFS_CHECKPOINT_##flag)); \ } \ static inline int \ nilfs_checkpoint_##name(const struct nilfs_checkpoint *cp) \ { \ - return !!(le32_to_cpu(cp->cp_flags) & \ + return !!(__le32_to_cpu(cp->cp_flags) & \ (1UL << NILFS_CHECKPOINT_##flag)); \ } @@ -595,20 +595,20 @@ enum { static inline void \ nilfs_segment_usage_set_##name(struct nilfs_segment_usage *su) \ { \ - su->su_flags = cpu_to_le32(le32_to_cpu(su->su_flags) | \ + su->su_flags = __cpu_to_le32(__le32_to_cpu(su->su_flags) | \ (1UL << NILFS_SEGMENT_USAGE_##flag));\ } \ static inline void \ nilfs_segment_usage_clear_##name(struct nilfs_segment_usage *su) \ { \ su->su_flags = \
- cpu_to_le32(le32_to_cpu(su->su_flags) & \ + __cpu_to_le32(__le32_to_cpu(su->su_flags) & \ ~(1UL << NILFS_SEGMENT_USAGE_##flag)); \ } \ static inline int \ nilfs_segment_usage_##name(const struct nilfs_segment_usage *su) \ { \ - return !!(le32_to_cpu(su->su_flags) & \ + return !!(__le32_to_cpu(su->su_flags) & \ (1UL << NILFS_SEGMENT_USAGE_##flag)); \ } @@ -619,15 +619,15 @@ NILFS_SEGMENT_USAGE_FNS(ERROR, error) static inline void nilfs_segment_usage_set_clean(struct nilfs_segment_usage *su) { - su->su_lastmod = cpu_to_le64(0); - su->su_nblocks = cpu_to_le32(0); - su->su_flags = cpu_to_le32(0); + su->su_lastmod = __cpu_to_le64(0); + su->su_nblocks = __cpu_to_le32(0); + su->su_flags = __cpu_to_le32(0); } static inline int nilfs_segment_usage_clean(const struct nilfs_segment_usage *su) { - return !le32_to_cpu(su->su_flags); + return !__le32_to_cpu(su->su_flags); } /** -- cgit v1.2.3-71-gd317 From da82c92f1150f66afabf78d2c85ef9ac18dc6d38 Mon Sep 17 00:00:00 2001 From: Mauro Carvalho Chehab Date: Thu, 27 Jun 2019 13:08:35 -0300 Subject: docs: cgroup-v1: add it to the admin-guide book Those files belong to the admin guide, so add them. 
Signed-off-by: Mauro Carvalho Chehab --- .../admin-guide/cgroup-v1/blkio-controller.rst | 302 ++++++ Documentation/admin-guide/cgroup-v1/cgroups.rst | 695 ++++++++++++++ Documentation/admin-guide/cgroup-v1/cpuacct.rst | 50 + Documentation/admin-guide/cgroup-v1/cpusets.rst | 866 +++++++++++++++++ Documentation/admin-guide/cgroup-v1/devices.rst | 132 +++ .../admin-guide/cgroup-v1/freezer-subsystem.rst | 127 +++ Documentation/admin-guide/cgroup-v1/hugetlb.rst | 50 + Documentation/admin-guide/cgroup-v1/index.rst | 28 + Documentation/admin-guide/cgroup-v1/memcg_test.rst | 355 +++++++ Documentation/admin-guide/cgroup-v1/memory.rst | 1003 ++++++++++++++++++++ Documentation/admin-guide/cgroup-v1/net_cls.rst | 44 + Documentation/admin-guide/cgroup-v1/net_prio.rst | 57 ++ Documentation/admin-guide/cgroup-v1/pids.rst | 92 ++ Documentation/admin-guide/cgroup-v1/rdma.rst | 117 +++ Documentation/admin-guide/cgroup-v2.rst | 2 +- Documentation/admin-guide/index.rst | 1 + Documentation/admin-guide/kernel-parameters.txt | 4 +- .../admin-guide/mm/numa_memory_policy.rst | 2 +- Documentation/block/bfq-iosched.rst | 2 +- Documentation/cgroup-v1/blkio-controller.rst | 302 ------ Documentation/cgroup-v1/cgroups.rst | 695 -------------- Documentation/cgroup-v1/cpuacct.rst | 50 - Documentation/cgroup-v1/cpusets.rst | 866 ----------------- Documentation/cgroup-v1/devices.rst | 132 --- Documentation/cgroup-v1/freezer-subsystem.rst | 127 --- Documentation/cgroup-v1/hugetlb.rst | 50 - Documentation/cgroup-v1/index.rst | 30 - Documentation/cgroup-v1/memcg_test.rst | 355 ------- Documentation/cgroup-v1/memory.rst | 1003 -------------------- Documentation/cgroup-v1/net_cls.rst | 44 - Documentation/cgroup-v1/net_prio.rst | 57 -- Documentation/cgroup-v1/pids.rst | 92 -- Documentation/cgroup-v1/rdma.rst | 117 --- Documentation/filesystems/tmpfs.txt | 2 +- Documentation/kernel-per-CPU-kthreads.txt | 2 +- Documentation/scheduler/sched-deadline.rst | 2 +- Documentation/scheduler/sched-design-CFS.rst | 2 
+- Documentation/scheduler/sched-rt-group.rst | 2 +- Documentation/vm/numa.rst | 4 +- Documentation/vm/page_migration.rst | 2 +- Documentation/vm/unevictable-lru.rst | 2 +- Documentation/x86/x86_64/fake-numa-for-cpusets.rst | 4 +- MAINTAINERS | 4 +- block/Kconfig | 2 +- include/linux/cgroup-defs.h | 2 +- include/uapi/linux/bpf.h | 2 +- init/Kconfig | 4 +- kernel/cgroup/cpuset.c | 2 +- security/device_cgroup.c | 2 +- tools/include/uapi/linux/bpf.h | 2 +- 50 files changed, 3945 insertions(+), 3946 deletions(-) create mode 100644 Documentation/admin-guide/cgroup-v1/blkio-controller.rst create mode 100644 Documentation/admin-guide/cgroup-v1/cgroups.rst create mode 100644 Documentation/admin-guide/cgroup-v1/cpuacct.rst create mode 100644 Documentation/admin-guide/cgroup-v1/cpusets.rst create mode 100644 Documentation/admin-guide/cgroup-v1/devices.rst create mode 100644 Documentation/admin-guide/cgroup-v1/freezer-subsystem.rst create mode 100644 Documentation/admin-guide/cgroup-v1/hugetlb.rst create mode 100644 Documentation/admin-guide/cgroup-v1/index.rst create mode 100644 Documentation/admin-guide/cgroup-v1/memcg_test.rst create mode 100644 Documentation/admin-guide/cgroup-v1/memory.rst create mode 100644 Documentation/admin-guide/cgroup-v1/net_cls.rst create mode 100644 Documentation/admin-guide/cgroup-v1/net_prio.rst create mode 100644 Documentation/admin-guide/cgroup-v1/pids.rst create mode 100644 Documentation/admin-guide/cgroup-v1/rdma.rst delete mode 100644 Documentation/cgroup-v1/blkio-controller.rst delete mode 100644 Documentation/cgroup-v1/cgroups.rst delete mode 100644 Documentation/cgroup-v1/cpuacct.rst delete mode 100644 Documentation/cgroup-v1/cpusets.rst delete mode 100644 Documentation/cgroup-v1/devices.rst delete mode 100644 Documentation/cgroup-v1/freezer-subsystem.rst delete mode 100644 Documentation/cgroup-v1/hugetlb.rst delete mode 100644 Documentation/cgroup-v1/index.rst delete mode 100644 Documentation/cgroup-v1/memcg_test.rst delete mode 100644 
Documentation/cgroup-v1/memory.rst delete mode 100644 Documentation/cgroup-v1/net_cls.rst delete mode 100644 Documentation/cgroup-v1/net_prio.rst delete mode 100644 Documentation/cgroup-v1/pids.rst delete mode 100644 Documentation/cgroup-v1/rdma.rst (limited to 'include/uapi/linux') diff --git a/Documentation/admin-guide/cgroup-v1/blkio-controller.rst b/Documentation/admin-guide/cgroup-v1/blkio-controller.rst new file mode 100644 index 000000000000..1d7d962933be --- /dev/null +++ b/Documentation/admin-guide/cgroup-v1/blkio-controller.rst @@ -0,0 +1,302 @@ +=================== +Block IO Controller +=================== + +Overview +======== +cgroup subsys "blkio" implements the block io controller. There seems to be +a need of various kinds of IO control policies (like proportional BW, max BW) +both at leaf nodes as well as at intermediate nodes in a storage hierarchy. +Plan is to use the same cgroup based management interface for blkio controller +and based on user options switch IO policies in the background. + +One IO control policy is throttling policy which can be used to +specify upper IO rate limits on devices. This policy is implemented in +generic block layer and can be used on leaf nodes as well as higher +level logical devices like device mapper. + +HOWTO +===== +Throttling/Upper Limit policy +----------------------------- +- Enable Block IO controller:: + + CONFIG_BLK_CGROUP=y + +- Enable throttling in block layer:: + + CONFIG_BLK_DEV_THROTTLING=y + +- Mount blkio controller (see cgroups.txt, Why are cgroups needed?):: + + mount -t cgroup -o blkio none /sys/fs/cgroup/blkio + +- Specify a bandwidth rate on particular device for root group. The format + for policy is "<major>:<minor> <bytes_per_second>":: + + echo "8:16 1048576" > /sys/fs/cgroup/blkio/blkio.throttle.read_bps_device + + Above will put a limit of 1MB/second on reads happening for root group + on device having major/minor number 8:16.
+ +- Run dd to read a file and see if rate is throttled to 1MB/s or not:: + + # dd iflag=direct if=/mnt/common/zerofile of=/dev/null bs=4K count=1024 + 1024+0 records in + 1024+0 records out + 4194304 bytes (4.2 MB) copied, 4.0001 s, 1.0 MB/s + + Limits for writes can be put using blkio.throttle.write_bps_device file. + +Hierarchical Cgroups +==================== + +Throttling implements hierarchy support; however, +throttling's hierarchy support is enabled iff "sane_behavior" is +enabled from cgroup side, which currently is a development option and +not publicly available. + +If somebody created a hierarchy like as follows:: + + root + / \ + test1 test2 + | + test3 + +Throttling with "sane_behavior" will handle the +hierarchy correctly. For throttling, all limits apply +to the whole subtree while all statistics are local to the IOs +directly generated by tasks in that cgroup. + +Throttling without "sane_behavior" enabled from cgroup side will +practically treat all groups at same level as if it looks like the +following:: + + pivot + / / \ \ + root test1 test2 test3 + +Various user visible config options +=================================== +CONFIG_BLK_CGROUP + - Block IO controller. + +CONFIG_BFQ_CGROUP_DEBUG + - Debug help. Right now some additional stats file show up in cgroup + if this option is enabled. + +CONFIG_BLK_DEV_THROTTLING + - Enable block device throttling support in block layer. + +Details of cgroup files +======================= +Proportional weight policy files +-------------------------------- +- blkio.weight + - Specifies per cgroup weight. This is default weight of the group + on all the devices until and unless overridden by per device rule. + (See blkio.weight_device). + Currently allowed range of weights is from 10 to 1000. + +- blkio.weight_device + - One can specify per cgroup per device rules using this interface. + These rules override the default value of group weight as specified + by blkio.weight. 
+ + Following is the format:: + + # echo dev_maj:dev_minor weight > blkio.weight_device + + Configure weight=300 on /dev/sdb (8:16) in this cgroup:: + + # echo 8:16 300 > blkio.weight_device + # cat blkio.weight_device + dev weight + 8:16 300 + + Configure weight=500 on /dev/sda (8:0) in this cgroup:: + + # echo 8:0 500 > blkio.weight_device + # cat blkio.weight_device + dev weight + 8:0 500 + 8:16 300 + + Remove specific weight for /dev/sda in this cgroup:: + + # echo 8:0 0 > blkio.weight_device + # cat blkio.weight_device + dev weight + 8:16 300 + +- blkio.leaf_weight[_device] + - Equivalents of blkio.weight[_device] for the purpose of + deciding how much weight tasks in the given cgroup has while + competing with the cgroup's child cgroups. For details, + please refer to Documentation/block/cfq-iosched.txt. + +- blkio.time + - disk time allocated to cgroup per device in milliseconds. First + two fields specify the major and minor number of the device and + third field specifies the disk time allocated to group in + milliseconds. + +- blkio.sectors + - number of sectors transferred to/from disk by the group. First + two fields specify the major and minor number of the device and + third field specifies the number of sectors transferred by the + group to/from the device. + +- blkio.io_service_bytes + - Number of bytes transferred to/from the disk by the group. These + are further divided by the type of operation - read or write, sync + or async. First two fields specify the major and minor number of the + device, third field specifies the operation type and the fourth field + specifies the number of bytes. + +- blkio.io_serviced + - Number of IOs (bio) issued to the disk by the group. These + are further divided by the type of operation - read or write, sync + or async. First two fields specify the major and minor number of the + device, third field specifies the operation type and the fourth field + specifies the number of IOs. 
+ +- blkio.io_service_time + - Total amount of time between request dispatch and request completion + for the IOs done by this cgroup. This is in nanoseconds to make it + meaningful for flash devices too. For devices with queue depth of 1, + this time represents the actual service time. When queue_depth > 1, + that is no longer true as requests may be served out of order. This + may cause the service time for a given IO to include the service time + of multiple IOs when served out of order which may result in total + io_service_time > actual time elapsed. This time is further divided by + the type of operation - read or write, sync or async. First two fields + specify the major and minor number of the device, third field + specifies the operation type and the fourth field specifies the + io_service_time in ns. + +- blkio.io_wait_time + - Total amount of time the IOs for this cgroup spent waiting in the + scheduler queues for service. This can be greater than the total time + elapsed since it is cumulative io_wait_time for all IOs. It is not a + measure of total time the cgroup spent waiting but rather a measure of + the wait_time for its individual IOs. For devices with queue_depth > 1 + this metric does not include the time spent waiting for service once + the IO is dispatched to the device but till it actually gets serviced + (there might be a time lag here due to re-ordering of requests by the + device). This is in nanoseconds to make it meaningful for flash + devices too. This time is further divided by the type of operation - + read or write, sync or async. First two fields specify the major and + minor number of the device, third field specifies the operation type + and the fourth field specifies the io_wait_time in ns. + +- blkio.io_merged + - Total number of bios/requests merged into requests belonging to this + cgroup. This is further divided by the type of operation - read or + write, sync or async. 
+ +- blkio.io_queued + - Total number of requests queued up at any given instant for this + cgroup. This is further divided by the type of operation - read or + write, sync or async. + +- blkio.avg_queue_size + - Debugging aid only enabled if CONFIG_BFQ_CGROUP_DEBUG=y. + The average queue size for this cgroup over the entire time of this + cgroup's existence. Queue size samples are taken each time one of the + queues of this cgroup gets a timeslice. + +- blkio.group_wait_time + - Debugging aid only enabled if CONFIG_BFQ_CGROUP_DEBUG=y. + This is the amount of time the cgroup had to wait since it became busy + (i.e., went from 0 to 1 request queued) to get a timeslice for one of + its queues. This is different from the io_wait_time which is the + cumulative total of the amount of time spent by each IO in that cgroup + waiting in the scheduler queue. This is in nanoseconds. If this is + read when the cgroup is in a waiting (for timeslice) state, the stat + will only report the group_wait_time accumulated till the last time it + got a timeslice and will not include the current delta. + +- blkio.empty_time + - Debugging aid only enabled if CONFIG_BFQ_CGROUP_DEBUG=y. + This is the amount of time a cgroup spends without any pending + requests when not being served, i.e., it does not include any time + spent idling for one of the queues of the cgroup. This is in + nanoseconds. If this is read when the cgroup is in an empty state, + the stat will only report the empty_time accumulated till the last + time it had a pending request and will not include the current delta. + +- blkio.idle_time + - Debugging aid only enabled if CONFIG_BFQ_CGROUP_DEBUG=y. + This is the amount of time spent by the IO scheduler idling for a + given cgroup in anticipation of a better request than the existing ones + from other queues/cgroups. This is in nanoseconds. 
If this is read + when the cgroup is in an idling state, the stat will only report the + idle_time accumulated till the last idle period and will not include + the current delta. + +- blkio.dequeue + - Debugging aid only enabled if CONFIG_BFQ_CGROUP_DEBUG=y. This + gives the statistics about how many times a group was dequeued + from service tree of the device. First two fields specify the major + and minor number of the device and third field specifies the number + of times a group was dequeued from a particular device. + +- blkio.*_recursive + - Recursive version of various stats. These files show the + same information as their non-recursive counterparts but + include stats from all the descendant cgroups. + +Throttling/Upper limit policy files +----------------------------------- +- blkio.throttle.read_bps_device + - Specifies upper limit on READ rate from the device. IO rate is + specified in bytes per second. Rules are per device. Following is + the format:: + + echo "<major>:<minor> <rate_bytes_per_second>" > /cgrp/blkio.throttle.read_bps_device + +- blkio.throttle.write_bps_device + - Specifies upper limit on WRITE rate to the device. IO rate is + specified in bytes per second. Rules are per device. Following is + the format:: + + echo "<major>:<minor> <rate_bytes_per_second>" > /cgrp/blkio.throttle.write_bps_device + +- blkio.throttle.read_iops_device + - Specifies upper limit on READ rate from the device. IO rate is + specified in IO per second. Rules are per device. Following is + the format:: + + echo "<major>:<minor> <rate_io_per_second>" > /cgrp/blkio.throttle.read_iops_device + +- blkio.throttle.write_iops_device + - Specifies upper limit on WRITE rate to the device. IO rate is + specified in io per second. Rules are per device. Following is + the format:: + + echo "<major>:<minor> <rate_io_per_second>" > /cgrp/blkio.throttle.write_iops_device + +Note: If both BW and IOPS rules are specified for a device, then IO is + subjected to both the constraints. + +- blkio.throttle.io_serviced + - Number of IOs (bio) issued to the disk by the group.
These + are further divided by the type of operation - read or write, sync + or async. First two fields specify the major and minor number of the + device, third field specifies the operation type and the fourth field + specifies the number of IOs. + +- blkio.throttle.io_service_bytes + - Number of bytes transferred to/from the disk by the group. These + are further divided by the type of operation - read or write, sync + or async. First two fields specify the major and minor number of the + device, third field specifies the operation type and the fourth field + specifies the number of bytes. + +Common files among various policies +----------------------------------- +- blkio.reset_stats + - Writing an int to this file will result in resetting all the stats + for that cgroup. diff --git a/Documentation/admin-guide/cgroup-v1/cgroups.rst b/Documentation/admin-guide/cgroup-v1/cgroups.rst new file mode 100644 index 000000000000..b0688011ed06 --- /dev/null +++ b/Documentation/admin-guide/cgroup-v1/cgroups.rst @@ -0,0 +1,695 @@ +============== +Control Groups +============== + +Written by Paul Menage based on +Documentation/admin-guide/cgroup-v1/cpusets.rst + +Original copyright statements from cpusets.txt: + +Portions Copyright (C) 2004 BULL SA. + +Portions Copyright (c) 2004-2006 Silicon Graphics, Inc. + +Modified by Paul Jackson + +Modified by Christoph Lameter + +.. CONTENTS: + + 1. Control Groups + 1.1 What are cgroups ? + 1.2 Why are cgroups needed ? + 1.3 How are cgroups implemented ? + 1.4 What does notify_on_release do ? + 1.5 What does clone_children do ? + 1.6 How do I use cgroups ? + 2. Usage Examples and Syntax + 2.1 Basic Usage + 2.2 Attaching processes + 2.3 Mounting hierarchies by name + 3. Kernel API + 3.1 Overview + 3.2 Synchronization + 3.3 Subsystem API + 4. Extended attributes usage + 5. Questions + +1. Control Groups +================= + +1.1 What are cgroups ? 
+---------------------- + +Control Groups provide a mechanism for aggregating/partitioning sets of +tasks, and all their future children, into hierarchical groups with +specialized behaviour. + +Definitions: + +A *cgroup* associates a set of tasks with a set of parameters for one +or more subsystems. + +A *subsystem* is a module that makes use of the task grouping +facilities provided by cgroups to treat groups of tasks in +particular ways. A subsystem is typically a "resource controller" that +schedules a resource or applies per-cgroup limits, but it may be +anything that wants to act on a group of processes, e.g. a +virtualization subsystem. + +A *hierarchy* is a set of cgroups arranged in a tree, such that +every task in the system is in exactly one of the cgroups in the +hierarchy, and a set of subsystems; each subsystem has system-specific +state attached to each cgroup in the hierarchy. Each hierarchy has +an instance of the cgroup virtual filesystem associated with it. + +At any one time there may be multiple active hierarchies of task +cgroups. Each hierarchy is a partition of all tasks in the system. + +User-level code may create and destroy cgroups by name in an +instance of the cgroup virtual file system, specify and query to +which cgroup a task is assigned, and list the task PIDs assigned to +a cgroup. Those creations and assignments only affect the hierarchy +associated with that instance of the cgroup file system. + +On their own, the only use for cgroups is for simple job +tracking. The intention is that other subsystems hook into the generic +cgroup support to provide new attributes for cgroups, such as +accounting/limiting the resources which processes in a cgroup can +access. For example, cpusets (see Documentation/admin-guide/cgroup-v1/cpusets.rst) allow +you to associate a set of CPUs and a set of memory nodes with the +tasks in each cgroup. + +1.2 Why are cgroups needed ? 
+---------------------------- + +There are multiple efforts to provide process aggregations in the +Linux kernel, mainly for resource-tracking purposes. Such efforts +include cpusets, CKRM/ResGroups, UserBeanCounters, and virtual server +namespaces. These all require the basic notion of a +grouping/partitioning of processes, with newly forked processes ending +up in the same group (cgroup) as their parent process. + +The kernel cgroup patch provides the minimum essential kernel +mechanisms required to efficiently implement such groups. It has +minimal impact on the system fast paths, and provides hooks for +specific subsystems such as cpusets to provide additional behaviour as +desired. + +Multiple hierarchy support is provided to allow for situations where +the division of tasks into cgroups is distinctly different for +different subsystems - having parallel hierarchies allows each +hierarchy to be a natural division of tasks, without having to handle +complex combinations of tasks that would be present if several +unrelated subsystems needed to be forced into the same tree of +cgroups. + +At one extreme, each resource controller or subsystem could be in a +separate hierarchy; at the other extreme, all subsystems +would be attached to the same hierarchy. + +As an example of a scenario (originally proposed by vatsa@in.ibm.com) +that can benefit from multiple hierarchies, consider a large +university server with various users - students, professors, system +tasks etc. 
The resource planning for this server could be along the +following lines:: + + CPU : "Top cpuset" + / \ + CPUSet1 CPUSet2 + | | + (Professors) (Students) + + In addition (system tasks) are attached to topcpuset (so + that they can run anywhere) with a limit of 20% + + Memory : Professors (50%), Students (30%), system (20%) + + Disk : Professors (50%), Students (30%), system (20%) + + Network : WWW browsing (20%), Network File System (60%), others (20%) + / \ + Professors (15%) students (5%) + +Browsers like Firefox/Lynx go into the WWW network class, while (k)nfsd goes +into the NFS network class. + +At the same time Firefox/Lynx will share an appropriate CPU/Memory class +depending on who launched it (prof/student). + +With the ability to classify tasks differently for different resources +(by putting those resource subsystems in different hierarchies), +the admin can easily set up a script which receives exec notifications +and depending on who is launching the browser he can:: + + # echo browser_pid > /sys/fs/cgroup/<restype>/<userclass>/tasks + +With only a single hierarchy, he now would potentially have to create +a separate cgroup for every browser launched and associate it with +appropriate network and other resource class. This may lead to +proliferation of such cgroups. + +Also let's say that the administrator would like to give enhanced network +access temporarily to a student's browser (since it is night and the user +wants to do online gaming :)) OR give one of the student's simulation +apps enhanced CPU power. + +With ability to write PIDs directly to resource classes, it's just a +matter of:: + + # echo pid > /sys/fs/cgroup/network/<new_class>/tasks + (after some time) + # echo pid > /sys/fs/cgroup/network/<orig_class>/tasks + +Without this ability, the administrator would have to split the cgroup into +multiple separate ones and then associate the new cgroups with the +new resource classes. + + + +1.3 How are cgroups implemented ?
+--------------------------------- + +Control Groups extends the kernel as follows: + + - Each task in the system has a reference-counted pointer to a + css_set. + + - A css_set contains a set of reference-counted pointers to + cgroup_subsys_state objects, one for each cgroup subsystem + registered in the system. There is no direct link from a task to + the cgroup of which it's a member in each hierarchy, but this + can be determined by following pointers through the + cgroup_subsys_state objects. This is because accessing the + subsystem state is something that's expected to happen frequently + and in performance-critical code, whereas operations that require a + task's actual cgroup assignments (in particular, moving between + cgroups) are less common. A linked list runs through the cg_list + field of each task_struct using the css_set, anchored at + css_set->tasks. + + - A cgroup hierarchy filesystem can be mounted for browsing and + manipulation from user space. + + - You can list all the tasks (by PID) attached to any cgroup. + +The implementation of cgroups requires a few, simple hooks +into the rest of the kernel, none in performance-critical paths: + + - in init/main.c, to initialize the root cgroups and initial + css_set at system boot. + + - in fork and exit, to attach and detach a task from its css_set. + +In addition, a new file system of type "cgroup" may be mounted, to +enable browsing and modifying the cgroups presently known to the +kernel. When mounting a cgroup hierarchy, you may specify a +comma-separated list of subsystems to mount as the filesystem mount +options. By default, mounting the cgroup filesystem attempts to +mount a hierarchy containing all registered subsystems. + +If an active hierarchy with exactly the same set of subsystems already +exists, it will be reused for the new mount. If no existing hierarchy +matches, and any of the requested subsystems are in use in an existing +hierarchy, the mount will fail with -EBUSY. 
Otherwise, a new hierarchy +is activated, associated with the requested subsystems. + +It's not currently possible to bind a new subsystem to an active +cgroup hierarchy, or to unbind a subsystem from an active cgroup +hierarchy. This may be possible in future, but is fraught with nasty +error-recovery issues. + +When a cgroup filesystem is unmounted, if there are any +child cgroups created below the top-level cgroup, that hierarchy +will remain active even though unmounted; if there are no +child cgroups then the hierarchy will be deactivated. + +No new system calls are added for cgroups - all support for +querying and modifying cgroups is via this cgroup file system. + +Each task under /proc has an added file named 'cgroup' displaying, +for each active hierarchy, the subsystem names and the cgroup name +as the path relative to the root of the cgroup file system. + +Each cgroup is represented by a directory in the cgroup file system +containing the following files describing that cgroup: + + - tasks: list of tasks (by PID) attached to that cgroup. This list + is not guaranteed to be sorted. Writing a thread ID into this file + moves the thread into this cgroup. + - cgroup.procs: list of thread group IDs in the cgroup. This list is + not guaranteed to be sorted or free of duplicate TGIDs, and userspace + should sort/uniquify the list if this property is required. + Writing a thread group ID into this file moves all threads in that + group into this cgroup. + - notify_on_release flag: run the release agent on exit? + - release_agent: the path to use for release notifications (this file + exists in the top cgroup only) + +Other subsystems such as cpusets may add additional files in each +cgroup dir. + +New cgroups are created using the mkdir system call or shell +command. The properties of a cgroup, such as its flags, are +modified by writing to the appropriate file in that cgroups +directory, as listed above. 
+ +The named hierarchical structure of nested cgroups allows partitioning +a large system into nested, dynamically changeable, "soft-partitions". + +The attachment of each task, automatically inherited at fork by any +children of that task, to a cgroup allows organizing the work load +on a system into related sets of tasks. A task may be re-attached to +any other cgroup, if allowed by the permissions on the necessary +cgroup file system directories. + +When a task is moved from one cgroup to another, it gets a new +css_set pointer - if there's an already existing css_set with the +desired collection of cgroups then that group is reused, otherwise a new +css_set is allocated. The appropriate existing css_set is located by +looking into a hash table. + +To allow access from a cgroup to the css_sets (and hence tasks) +that comprise it, a set of cg_cgroup_link objects form a lattice; +each cg_cgroup_link is linked into a list of cg_cgroup_links for +a single cgroup on its cgrp_link_list field, and a list of +cg_cgroup_links for a single css_set on its cg_link_list. + +Thus the set of tasks in a cgroup can be listed by iterating over +each css_set that references the cgroup, and sub-iterating over +each css_set's task set. + +The use of a Linux virtual file system (vfs) to represent the +cgroup hierarchy provides for a familiar permission and name space +for cgroups, with a minimum of additional kernel code. + +1.4 What does notify_on_release do ? +------------------------------------ + +If the notify_on_release flag is enabled (1) in a cgroup, then +whenever the last task in the cgroup leaves (exits or attaches to +some other cgroup) and the last child cgroup of that cgroup +is removed, then the kernel runs the command specified by the contents +of the "release_agent" file in that hierarchy's root directory, +supplying the pathname (relative to the mount point of the cgroup +file system) of the abandoned cgroup. This enables automatic +removal of abandoned cgroups. 
The default value of +notify_on_release in the root cgroup at system boot is disabled +(0). The default value of other cgroups at creation is the current +value of their parents' notify_on_release settings. The default value of +a cgroup hierarchy's release_agent path is empty. + +1.5 What does clone_children do ? +--------------------------------- + +This flag only affects the cpuset controller. If the clone_children +flag is enabled (1) in a cgroup, a new cpuset cgroup will copy its +configuration from the parent during initialization. + +1.6 How do I use cgroups ? +-------------------------- + +To start a new job that is to be contained within a cgroup, using +the "cpuset" cgroup subsystem, the steps are something like:: + + 1) mount -t tmpfs cgroup_root /sys/fs/cgroup + 2) mkdir /sys/fs/cgroup/cpuset + 3) mount -t cgroup -ocpuset cpuset /sys/fs/cgroup/cpuset + 4) Create the new cgroup by doing mkdir's and write's (or echo's) in + the /sys/fs/cgroup/cpuset virtual file system. + 5) Start a task that will be the "founding father" of the new job. + 6) Attach that task to the new cgroup by writing its PID to the + /sys/fs/cgroup/cpuset tasks file for that cgroup. + 7) fork, exec or clone the job tasks from this founding father task. + +For example, the following sequence of commands will setup a cgroup +named "Charlie", containing just CPUs 2 and 3, and Memory Node 1, +and then start a subshell 'sh' in that cgroup:: + + mount -t tmpfs cgroup_root /sys/fs/cgroup + mkdir /sys/fs/cgroup/cpuset + mount -t cgroup cpuset -ocpuset /sys/fs/cgroup/cpuset + cd /sys/fs/cgroup/cpuset + mkdir Charlie + cd Charlie + /bin/echo 2-3 > cpuset.cpus + /bin/echo 1 > cpuset.mems + /bin/echo $$ > tasks + sh + # The subshell 'sh' is now running in cgroup Charlie + # The next line should display '/Charlie' + cat /proc/self/cgroup + +2. 
Usage Examples and Syntax +============================ + +2.1 Basic Usage +--------------- + +Creating, modifying, using cgroups can be done through the cgroup +virtual filesystem. + +To mount a cgroup hierarchy with all available subsystems, type:: + + # mount -t cgroup xxx /sys/fs/cgroup + +The "xxx" is not interpreted by the cgroup code, but will appear in +/proc/mounts so may be any useful identifying string that you like. + +Note: Some subsystems do not work without some user input first. For instance, +if cpusets are enabled the user will have to populate the cpus and mems files +for each new cgroup created before that group can be used. + +As explained in section `1.2 Why are cgroups needed?` you should create +different hierarchies of cgroups for each single resource or group of +resources you want to control. Therefore, you should mount a tmpfs on +/sys/fs/cgroup and create directories for each cgroup resource or resource +group:: + + # mount -t tmpfs cgroup_root /sys/fs/cgroup + # mkdir /sys/fs/cgroup/rg1 + +To mount a cgroup hierarchy with just the cpuset and memory +subsystems, type:: + + # mount -t cgroup -o cpuset,memory hier1 /sys/fs/cgroup/rg1 + +While remounting cgroups is currently supported, it is not recommended +to use it. Remounting allows changing bound subsystems and +release_agent. Rebinding is hardly useful as it only works when the +hierarchy is empty and release_agent itself should be replaced with +conventional fsnotify. The support for remounting will be removed in +the future. + +To specify a hierarchy's release_agent:: + + # mount -t cgroup -o cpuset,release_agent="/sbin/cpuset_release_agent" \ + xxx /sys/fs/cgroup/rg1 + +Note that specifying 'release_agent' more than once will return failure. + +Note that changing the set of subsystems is currently only supported +when the hierarchy consists of a single (root) cgroup.
Supporting +the ability to arbitrarily bind/unbind subsystems from an existing +cgroup hierarchy is intended to be implemented in the future. + +Then under /sys/fs/cgroup/rg1 you can find a tree that corresponds to the +tree of the cgroups in the system. For instance, /sys/fs/cgroup/rg1 +is the cgroup that holds the whole system. + +If you want to change the value of release_agent:: + + # echo "/sbin/new_release_agent" > /sys/fs/cgroup/rg1/release_agent + +It can also be changed via remount. + +If you want to create a new cgroup under /sys/fs/cgroup/rg1:: + + # cd /sys/fs/cgroup/rg1 + # mkdir my_cgroup + +Now you want to do something with this cgroup:: + + # cd my_cgroup + +In this directory you can find several files:: + + # ls + cgroup.procs notify_on_release tasks + (plus whatever files added by the attached subsystems) + +Now attach your shell to this cgroup:: + + # /bin/echo $$ > tasks + +You can also create cgroups inside your cgroup by using mkdir in this +directory:: + + # mkdir my_sub_cs + +To remove a cgroup, just use rmdir:: + + # rmdir my_sub_cs + +This will fail if the cgroup is in use (has cgroups inside, or +has processes attached, or is held alive by other subsystem-specific +reference). + +2.2 Attaching processes +----------------------- + +:: + + # /bin/echo PID > tasks + +Note that it is PID, not PIDs. You can only attach ONE task at a time. +If you have several tasks to attach, you have to do it one after another:: + + # /bin/echo PID1 > tasks + # /bin/echo PID2 > tasks + ... + # /bin/echo PIDn > tasks + +You can attach the current shell task by echoing 0:: + + # echo 0 > tasks + +You can use the cgroup.procs file instead of the tasks file to move all +threads in a threadgroup at once. Echoing the PID of any task in a +threadgroup to cgroup.procs causes all tasks in that threadgroup to be +attached to the cgroup. Writing 0 to cgroup.procs moves all tasks +in the writing task's threadgroup.
+ +Note: Since every task is always a member of exactly one cgroup in each +mounted hierarchy, to remove a task from its current cgroup you must +move it into a new cgroup (possibly the root cgroup) by writing to the +new cgroup's tasks file. + +Note: Due to some restrictions enforced by some cgroup subsystems, moving +a process to another cgroup can fail. + +2.3 Mounting hierarchies by name +-------------------------------- + +Passing the name=<name> option when mounting a cgroups hierarchy +associates the given name with the hierarchy. This can be used when +mounting a pre-existing hierarchy, in order to refer to it by name +rather than by its set of active subsystems. Each hierarchy is either +nameless, or has a unique name. + +The name should match [\w.-]+ + +When passing a name=<name> option for a new hierarchy, you need to +specify subsystems manually; the legacy behaviour of mounting all +subsystems when none are explicitly specified is not supported when +you give a subsystem a name. + +The name of the subsystem appears as part of the hierarchy description +in /proc/mounts and /proc/<pid>/cgroups. + + +3. Kernel API +============= + +3.1 Overview +------------ + +Each kernel subsystem that wants to hook into the generic cgroup +system needs to create a cgroup_subsys object. This contains +various methods, which are callbacks from the cgroup system, along +with a subsystem ID which will be assigned by the cgroup system. + +Other fields in the cgroup_subsys object include: + +- subsys_id: a unique array index for the subsystem, indicating which + entry in cgroup->subsys[] this subsystem should be managing. + +- name: should be initialized to a unique subsystem name. Should be + no longer than MAX_CGROUP_TYPE_NAMELEN. + +- early_init: indicate if the subsystem needs early initialization + at system boot.
+ +Each cgroup object created by the system has an array of pointers, +indexed by subsystem ID; this pointer is entirely managed by the +subsystem; the generic cgroup code will never touch this pointer. + +3.2 Synchronization +------------------- + +There is a global mutex, cgroup_mutex, used by the cgroup +system. This should be taken by anything that wants to modify a +cgroup. It may also be taken to prevent cgroups from being +modified, but more specific locks may be more appropriate in that +situation. + +See kernel/cgroup.c for more details. + +Subsystems can take/release the cgroup_mutex via the functions +cgroup_lock()/cgroup_unlock(). + +Accessing a task's cgroup pointer may be done in the following ways: +- while holding cgroup_mutex +- while holding the task's alloc_lock (via task_lock()) +- inside an rcu_read_lock() section via rcu_dereference() + +3.3 Subsystem API +----------------- + +Each subsystem should: + +- add an entry in linux/cgroup_subsys.h +- define a cgroup_subsys object called <name>_cgrp_subsys + +Each subsystem may export the following methods. The only mandatory +methods are css_alloc/free. Any others that are null are presumed to +be successful no-ops. + +``struct cgroup_subsys_state *css_alloc(struct cgroup *cgrp)`` +(cgroup_mutex held by caller) + +Called to allocate a subsystem state object for a cgroup. The +subsystem should allocate its subsystem state object for the passed +cgroup, returning a pointer to the new object on success or an +ERR_PTR() value. On success, the subsystem pointer should point to +a structure of type cgroup_subsys_state (typically embedded in a +larger subsystem-specific object), which will be initialized by the +cgroup system. Note that this will be called at initialization to +create the root subsystem state for this subsystem; this case can be +identified by the passed cgroup object having a NULL parent (since +it's the root of the hierarchy) and may be an appropriate place for +initialization code.
+ +``int css_online(struct cgroup *cgrp)`` +(cgroup_mutex held by caller) + +Called after @cgrp successfully completed all allocations and made +visible to cgroup_for_each_child/descendant_*() iterators. The +subsystem may choose to fail creation by returning -errno. This +callback can be used to implement reliable state sharing and +propagation along the hierarchy. See the comment on +cgroup_for_each_descendant_pre() for details. + +``void css_offline(struct cgroup *cgrp);`` +(cgroup_mutex held by caller) + +This is the counterpart of css_online() and called iff css_online() +has succeeded on @cgrp. This signifies the beginning of the end of +@cgrp. @cgrp is being removed and the subsystem should start dropping +all references it's holding on @cgrp. When all references are dropped, +cgroup removal will proceed to the next step - css_free(). After this +callback, @cgrp should be considered dead to the subsystem. + +``void css_free(struct cgroup *cgrp)`` +(cgroup_mutex held by caller) + +The cgroup system is about to free @cgrp; the subsystem should free +its subsystem state object. By the time this method is called, @cgrp +is completely unused; @cgrp->parent is still valid. (Note - can also +be called for a newly-created cgroup if an error occurs after this +subsystem's create() method has been called for the new cgroup). + +``int can_attach(struct cgroup *cgrp, struct cgroup_taskset *tset)`` +(cgroup_mutex held by caller) + +Called prior to moving one or more tasks into a cgroup; if the +subsystem returns an error, this will abort the attach operation. +@tset contains the tasks to be attached and is guaranteed to have at +least one task in it. 
+ +If there are multiple tasks in the taskset, then: + - it's guaranteed that all are from the same thread group + - @tset contains all tasks from the thread group whether or not + they're switching cgroups + - the first task is the leader + +Each @tset entry also contains the task's old cgroup and tasks which +aren't switching cgroup can be skipped easily using the +cgroup_taskset_for_each() iterator. Note that this isn't called on a +fork. If this method returns 0 (success) then this should remain valid +while the caller holds cgroup_mutex and it is ensured that either +attach() or cancel_attach() will be called in future. + +``void css_reset(struct cgroup_subsys_state *css)`` +(cgroup_mutex held by caller) + +An optional operation which should restore @css's configuration to the +initial state. This is currently only used on the unified hierarchy +when a subsystem is disabled on a cgroup through +"cgroup.subtree_control" but should remain enabled because other +subsystems depend on it. cgroup core makes such a css invisible by +removing the associated interface files and invokes this callback so +that the hidden subsystem can return to the initial neutral state. +This prevents unexpected resource control from a hidden css and +ensures that the configuration is in the initial state when it is made +visible again later. + +``void cancel_attach(struct cgroup *cgrp, struct cgroup_taskset *tset)`` +(cgroup_mutex held by caller) + +Called when a task attach operation has failed after can_attach() has succeeded. +A subsystem whose can_attach() has some side-effects should provide this +function, so that the subsystem can implement a rollback. If not, not necessary. +This will be called only about subsystems whose can_attach() operation have +succeeded. The parameters are identical to can_attach(). 
+ +``void attach(struct cgroup *cgrp, struct cgroup_taskset *tset)`` +(cgroup_mutex held by caller) + +Called after the task has been attached to the cgroup, to allow any +post-attachment activity that requires memory allocations or blocking. +The parameters are identical to can_attach(). + +``void fork(struct task_struct *task)`` + +Called when a task is forked into a cgroup. + +``void exit(struct task_struct *task)`` + +Called during task exit. + +``void free(struct task_struct *task)`` + +Called when the task_struct is freed. + +``void bind(struct cgroup *root)`` +(cgroup_mutex held by caller) + +Called when a cgroup subsystem is rebound to a different hierarchy +and root cgroup. Currently this will only involve movement between +the default hierarchy (which never has sub-cgroups) and a hierarchy +that is being created/destroyed (and hence has no sub-cgroups). + +4. Extended attribute usage +=========================== + +cgroup filesystem supports certain types of extended attributes in its +directories and files. The current supported types are: + + - Trusted (XATTR_TRUSTED) + - Security (XATTR_SECURITY) + +Both require CAP_SYS_ADMIN capability to set. + +Like in tmpfs, the extended attributes in cgroup filesystem are stored +using kernel memory and it's advised to keep the usage at minimum. This +is the reason why user defined extended attributes are not supported, since +any user can do it and there's no limit in the value size. + +The current known users for this feature are SELinux to limit cgroup usage +in containers and systemd for assorted meta data like main PID in a cgroup +(systemd creates a cgroup per service). + +5. Questions +============ + +:: + + Q: what's up with this '/bin/echo' ? + A: bash's builtin 'echo' command does not check calls to write() against + errors. If you use it in the cgroup file system, you won't be + able to tell whether a command succeeded or failed. 
+ + Q: When I attach processes, only the first of the line gets really attached ! + A: We can only return one error code per call to write(). So you should also + put only ONE PID. diff --git a/Documentation/admin-guide/cgroup-v1/cpuacct.rst b/Documentation/admin-guide/cgroup-v1/cpuacct.rst new file mode 100644 index 000000000000..d30ed81d2ad7 --- /dev/null +++ b/Documentation/admin-guide/cgroup-v1/cpuacct.rst @@ -0,0 +1,50 @@ +========================= +CPU Accounting Controller +========================= + +The CPU accounting controller is used to group tasks using cgroups and +account the CPU usage of these groups of tasks. + +The CPU accounting controller supports multi-hierarchy groups. An accounting +group accumulates the CPU usage of all of its child groups and the tasks +directly present in its group. + +Accounting groups can be created by first mounting the cgroup filesystem:: + + # mount -t cgroup -ocpuacct none /sys/fs/cgroup + +With the above step, the initial or the parent accounting group becomes +visible at /sys/fs/cgroup. At bootup, this group includes all the tasks in +the system. /sys/fs/cgroup/tasks lists the tasks in this cgroup. +/sys/fs/cgroup/cpuacct.usage gives the CPU time (in nanoseconds) obtained +by this group which is essentially the CPU time obtained by all the tasks +in the system. + +New accounting groups can be created under the parent group /sys/fs/cgroup:: + + # cd /sys/fs/cgroup + # mkdir g1 + # echo $$ > g1/tasks + +The above steps create a new group g1 and move the current shell +process (bash) into it. CPU time consumed by this bash and its children +can be obtained from g1/cpuacct.usage and the same is accumulated in +/sys/fs/cgroup/cpuacct.usage also. + +cpuacct.stat file lists a few statistics which further divide the +CPU time obtained by the cgroup into user and system times. Currently +the following statistics are supported: + +user: Time spent by tasks of the cgroup in user mode. 
+system: Time spent by tasks of the cgroup in kernel mode. + +user and system are in USER_HZ unit. + +cpuacct controller uses percpu_counter interface to collect user and +system times. This has two side effects: + +- It is theoretically possible to see wrong values for user and system times. + This is because percpu_counter_read() on 32bit systems isn't safe + against concurrent writes. +- It is possible to see slightly outdated values for user and system times + due to the batch processing nature of percpu_counter. diff --git a/Documentation/admin-guide/cgroup-v1/cpusets.rst b/Documentation/admin-guide/cgroup-v1/cpusets.rst new file mode 100644 index 000000000000..86a6ae995d54 --- /dev/null +++ b/Documentation/admin-guide/cgroup-v1/cpusets.rst @@ -0,0 +1,866 @@ +======= +CPUSETS +======= + +Copyright (C) 2004 BULL SA. + +Written by Simon.Derr@bull.net + +- Portions Copyright (c) 2004-2006 Silicon Graphics, Inc. +- Modified by Paul Jackson +- Modified by Christoph Lameter +- Modified by Paul Menage +- Modified by Hidetoshi Seto + +.. CONTENTS: + + 1. Cpusets + 1.1 What are cpusets ? + 1.2 Why are cpusets needed ? + 1.3 How are cpusets implemented ? + 1.4 What are exclusive cpusets ? + 1.5 What is memory_pressure ? + 1.6 What is memory spread ? + 1.7 What is sched_load_balance ? + 1.8 What is sched_relax_domain_level ? + 1.9 How do I use cpusets ? + 2. Usage Examples and Syntax + 2.1 Basic Usage + 2.2 Adding/removing cpus + 2.3 Setting flags + 2.4 Attaching processes + 3. Questions + 4. Contact + +1. Cpusets +========== + +1.1 What are cpusets ? +---------------------- + +Cpusets provide a mechanism for assigning a set of CPUs and Memory +Nodes to a set of tasks. In this document "Memory Node" refers to +an on-line node that contains memory. + +Cpusets constrain the CPU and Memory placement of tasks to only +the resources within a task's current cpuset. They form a nested +hierarchy visible in a virtual file system. 
These are the essential +hooks, beyond what is already present, required to manage dynamic +job placement on large systems. + +Cpusets use the generic cgroup subsystem described in +Documentation/admin-guide/cgroup-v1/cgroups.rst. + +Requests by a task, using the sched_setaffinity(2) system call to +include CPUs in its CPU affinity mask, and using the mbind(2) and +set_mempolicy(2) system calls to include Memory Nodes in its memory +policy, are both filtered through that task's cpuset, filtering out any +CPUs or Memory Nodes not in that cpuset. The scheduler will not +schedule a task on a CPU that is not allowed in its cpus_allowed +vector, and the kernel page allocator will not allocate a page on a +node that is not allowed in the requesting task's mems_allowed vector. + +User level code may create and destroy cpusets by name in the cgroup +virtual file system, manage the attributes and permissions of these +cpusets and which CPUs and Memory Nodes are assigned to each cpuset, +specify and query to which cpuset a task is assigned, and list the +task pids assigned to a cpuset. + + +1.2 Why are cpusets needed ? +---------------------------- + +The management of large computer systems, with many processors (CPUs), +complex memory cache hierarchies and multiple Memory Nodes having +non-uniform access times (NUMA) presents additional challenges for +the efficient scheduling and memory placement of processes. + +Frequently more modest sized systems can be operated with adequate +efficiency just by letting the operating system automatically share +the available CPU and Memory resources amongst the requesting tasks. + +But larger systems, which benefit more from careful processor and +memory placement to reduce memory access times and contention, +and which typically represent a larger investment for the customer, +can benefit from explicitly placing jobs on properly sized subsets of +the system. 
+ +This can be especially valuable on: + + * Web Servers running multiple instances of the same web application, + * Servers running different applications (for instance, a web server + and a database), or + * NUMA systems running large HPC applications with demanding + performance characteristics. + +These subsets, or "soft partitions" must be able to be dynamically +adjusted, as the job mix changes, without impacting other concurrently +executing jobs. The location of the running jobs pages may also be moved +when the memory locations are changed. + +The kernel cpuset patch provides the minimum essential kernel +mechanisms required to efficiently implement such subsets. It +leverages existing CPU and Memory Placement facilities in the Linux +kernel to avoid any additional impact on the critical scheduler or +memory allocator code. + + +1.3 How are cpusets implemented ? +--------------------------------- + +Cpusets provide a Linux kernel mechanism to constrain which CPUs and +Memory Nodes are used by a process or set of processes. + +The Linux kernel already has a pair of mechanisms to specify on which +CPUs a task may be scheduled (sched_setaffinity) and on which Memory +Nodes it may obtain memory (mbind, set_mempolicy). + +Cpusets extends these two mechanisms as follows: + + - Cpusets are sets of allowed CPUs and Memory Nodes, known to the + kernel. + - Each task in the system is attached to a cpuset, via a pointer + in the task structure to a reference counted cgroup structure. + - Calls to sched_setaffinity are filtered to just those CPUs + allowed in that task's cpuset. + - Calls to mbind and set_mempolicy are filtered to just + those Memory Nodes allowed in that task's cpuset. + - The root cpuset contains all the systems CPUs and Memory + Nodes. + - For any cpuset, one can define child cpusets containing a subset + of the parents CPU and Memory Node resources. 
+ - The hierarchy of cpusets can be mounted at /dev/cpuset, for + browsing and manipulation from user space. + - A cpuset may be marked exclusive, which ensures that no other + cpuset (except direct ancestors and descendants) may contain + any overlapping CPUs or Memory Nodes. + - You can list all the tasks (by pid) attached to any cpuset. + +The implementation of cpusets requires a few, simple hooks +into the rest of the kernel, none in performance critical paths: + + - in init/main.c, to initialize the root cpuset at system boot. + - in fork and exit, to attach and detach a task from its cpuset. + - in sched_setaffinity, to mask the requested CPUs by what's + allowed in that task's cpuset. + - in sched.c migrate_live_tasks(), to keep migrating tasks within + the CPUs allowed by their cpuset, if possible. + - in the mbind and set_mempolicy system calls, to mask the requested + Memory Nodes by what's allowed in that task's cpuset. + - in page_alloc.c, to restrict memory to allowed nodes. + - in vmscan.c, to restrict page recovery to the current cpuset. + +You should mount the "cgroup" filesystem type in order to enable +browsing and modifying the cpusets presently known to the kernel. No +new system calls are added for cpusets - all support for querying and +modifying cpusets is via this cpuset file system. 
+ +The /proc/<pid>/status file for each task has four added lines, +displaying the task's cpus_allowed (on which CPUs it may be scheduled) +and mems_allowed (on which Memory Nodes it may obtain memory), +in the two formats seen in the following example:: + + Cpus_allowed: ffffffff,ffffffff,ffffffff,ffffffff + Cpus_allowed_list: 0-127 + Mems_allowed: ffffffff,ffffffff + Mems_allowed_list: 0-63 + +Each cpuset is represented by a directory in the cgroup file system +containing (on top of the standard cgroup files) the following +files describing that cpuset: + + - cpuset.cpus: list of CPUs in that cpuset + - cpuset.mems: list of Memory Nodes in that cpuset + - cpuset.memory_migrate flag: if set, move pages to cpusets nodes + - cpuset.cpu_exclusive flag: is cpu placement exclusive? + - cpuset.mem_exclusive flag: is memory placement exclusive? + - cpuset.mem_hardwall flag: is memory allocation hardwalled + - cpuset.memory_pressure: measure of how much paging pressure in cpuset + - cpuset.memory_spread_page flag: if set, spread page cache evenly on allowed nodes + - cpuset.memory_spread_slab flag: if set, spread slab cache evenly on allowed nodes + - cpuset.sched_load_balance flag: if set, load balance within CPUs on that cpuset + - cpuset.sched_relax_domain_level: the searching range when migrating tasks + +In addition, only the root cpuset has the following file: + + - cpuset.memory_pressure_enabled flag: compute memory_pressure? + +New cpusets are created using the mkdir system call or shell +command. The properties of a cpuset, such as its flags, allowed +CPUs and Memory Nodes, and attached tasks, are modified by writing +to the appropriate file in that cpuset's directory, as listed above. + +The named hierarchical structure of nested cpusets allows partitioning +a large system into nested, dynamically changeable, "soft-partitions".
+ +The attachment of each task, automatically inherited at fork by any +children of that task, to a cpuset allows organizing the work load +on a system into related sets of tasks such that each set is constrained +to using the CPUs and Memory Nodes of a particular cpuset. A task +may be re-attached to any other cpuset, if allowed by the permissions +on the necessary cpuset file system directories. + +Such management of a system "in the large" integrates smoothly with +the detailed placement done on individual tasks and memory regions +using the sched_setaffinity, mbind and set_mempolicy system calls. + +The following rules apply to each cpuset: + + - Its CPUs and Memory Nodes must be a subset of its parent's. + - It can't be marked exclusive unless its parent is. + - If its cpu or memory is exclusive, they may not overlap any sibling. + +These rules, and the natural hierarchy of cpusets, enable efficient +enforcement of the exclusive guarantee, without having to scan all +cpusets every time any of them changes to ensure nothing overlaps an +exclusive cpuset. Also, the use of a Linux virtual file system (vfs) +to represent the cpuset hierarchy provides for a familiar permission +and name space for cpusets, with a minimum of additional kernel code. + +The cpus and mems files in the root (top_cpuset) cpuset are +read-only. The cpus file automatically tracks the value of +cpu_online_mask using a CPU hotplug notifier, and the mems file +automatically tracks the value of node_states[N_MEMORY]--i.e., +nodes with memory--using the cpuset_track_online_nodes() hook. + + +1.4 What are exclusive cpusets ? +-------------------------------- + +If a cpuset is cpu or mem exclusive, no other cpuset, other than +a direct ancestor or descendant, may share any of the same CPUs or +Memory Nodes. + +A cpuset that is cpuset.mem_exclusive *or* cpuset.mem_hardwall is "hardwalled", +i.e.
it restricts kernel allocations for page, buffer and other data +commonly shared by the kernel across multiple users. All cpusets, +whether hardwalled or not, restrict allocations of memory for user +space. This enables configuring a system so that several independent +jobs can share common kernel data, such as file system pages, while +isolating each job's user allocation in its own cpuset. To do this, +construct a large mem_exclusive cpuset to hold all the jobs, and +construct child, non-mem_exclusive cpusets for each individual job. +Only a small amount of typical kernel memory, such as requests from +interrupt handlers, is allowed to be taken outside even a +mem_exclusive cpuset. + + +1.5 What is memory_pressure ? +----------------------------- +The memory_pressure of a cpuset provides a simple per-cpuset metric +of the rate that the tasks in a cpuset are attempting to free up in +use memory on the nodes of the cpuset to satisfy additional memory +requests. + +This enables batch managers monitoring jobs running in dedicated +cpusets to efficiently detect what level of memory pressure that job +is causing. + +This is useful both on tightly managed systems running a wide mix of +submitted jobs, which may choose to terminate or re-prioritize jobs that +are trying to use more memory than allowed on the nodes assigned to them, +and with tightly coupled, long running, massively parallel scientific +computing jobs that will dramatically fail to meet required performance +goals if they start to use more memory than allowed to them. + +This mechanism provides a very economical way for the batch manager +to monitor a cpuset for signs of memory pressure. It's up to the +batch manager or other user code to decide what to do about it and +take action. 
+ +==> + Unless this feature is enabled by writing "1" to the special file + /dev/cpuset/memory_pressure_enabled, the hook in the rebalance + code of __alloc_pages() for this metric reduces to simply noticing + that the cpuset_memory_pressure_enabled flag is zero. So only + systems that enable this feature will compute the metric. + +Why a per-cpuset, running average: + + Because this meter is per-cpuset, rather than per-task or mm, + the system load imposed by a batch scheduler monitoring this + metric is sharply reduced on large systems, because a scan of + the tasklist can be avoided on each set of queries. + + Because this meter is a running average, instead of an accumulating + counter, a batch scheduler can detect memory pressure with a + single read, instead of having to read and accumulate results + for a period of time. + + Because this meter is per-cpuset rather than per-task or mm, + the batch scheduler can obtain the key information, memory + pressure in a cpuset, with a single read, rather than having to + query and accumulate results over all the (dynamically changing) + set of tasks in the cpuset. + +A per-cpuset simple digital filter (requires a spinlock and 3 words +of data per-cpuset) is kept, and updated by any task attached to that +cpuset, if it enters the synchronous (direct) page reclaim code. + +A per-cpuset file provides an integer number representing the recent +(half-life of 10 seconds) rate of direct page reclaims caused by +the tasks in the cpuset, in units of reclaims attempted per second, +times 1000. + + +1.6 What is memory spread ? +--------------------------- +There are two boolean flag files per cpuset that control where the +kernel allocates pages for the file system buffers and related in +kernel data structures. They are called 'cpuset.memory_spread_page' and +'cpuset.memory_spread_slab'. 
+ +If the per-cpuset boolean flag file 'cpuset.memory_spread_page' is set, then +the kernel will spread the file system buffers (page cache) evenly +over all the nodes that the faulting task is allowed to use, instead +of preferring to put those pages on the node where the task is running. + +If the per-cpuset boolean flag file 'cpuset.memory_spread_slab' is set, +then the kernel will spread some file system related slab caches, +such as for inodes and dentries evenly over all the nodes that the +faulting task is allowed to use, instead of preferring to put those +pages on the node where the task is running. + +The setting of these flags does not affect anonymous data segment or +stack segment pages of a task. + +By default, both kinds of memory spreading are off, and memory +pages are allocated on the node local to where the task is running, +except perhaps as modified by the task's NUMA mempolicy or cpuset +configuration, so long as sufficient free memory pages are available. + +When new cpusets are created, they inherit the memory spread settings +of their parent. + +Setting memory spreading causes allocations for the affected page +or slab caches to ignore the task's NUMA mempolicy and be spread +instead. Tasks using mbind() or set_mempolicy() calls to set NUMA +mempolicies will not notice any change in these calls as a result of +their containing task's memory spread settings. If memory spreading +is turned off, then the currently specified NUMA mempolicy once again +applies to memory page allocations. + +Both 'cpuset.memory_spread_page' and 'cpuset.memory_spread_slab' are boolean flag +files. By default they contain "0", meaning that the feature is off +for that cpuset. If a "1" is written to that file, then that turns +the named feature on. + +The implementation is simple. + +Setting the flag 'cpuset.memory_spread_page' turns on a per-process flag +PFA_SPREAD_PAGE for each task that is in that cpuset or subsequently +joins that cpuset. 
The page allocation calls for the page cache +are modified to perform an inline check for this PFA_SPREAD_PAGE task +flag, and if set, a call to a new routine cpuset_mem_spread_node() +returns the node to prefer for the allocation. + +Similarly, setting 'cpuset.memory_spread_slab' turns on the flag +PFA_SPREAD_SLAB, and appropriately marked slab caches will allocate +pages from the node returned by cpuset_mem_spread_node(). + +The cpuset_mem_spread_node() routine is also simple. It uses the +value of a per-task rotor cpuset_mem_spread_rotor to select the next +node in the current task's mems_allowed to prefer for the allocation. + +This memory placement policy is also known (in other contexts) as +round-robin or interleave. + +This policy can provide substantial improvements for jobs that need +to place thread local data on the corresponding node, but that need +to access large file system data sets that need to be spread across +the several nodes in the job's cpuset in order to fit. Without this +policy, especially for jobs that might have one thread reading in the +data set, the memory allocation across the nodes in the job's cpuset +can become very uneven. + +1.7 What is sched_load_balance ? +-------------------------------- + +The kernel scheduler (kernel/sched/core.c) automatically load balances +tasks. If one CPU is underutilized, kernel code running on that +CPU will look for tasks on other more overloaded CPUs and move those +tasks to itself, within the constraints of such placement mechanisms +as cpusets and sched_setaffinity. + +The algorithmic cost of load balancing and its impact on key shared +kernel data structures such as the task list increases more than +linearly with the number of CPUs being balanced. So the scheduler +has support to partition the system's CPUs into a number of sched +domains such that it only load balances within each sched domain.
+Each sched domain covers some subset of the CPUs in the system; +no two sched domains overlap; some CPUs might not be in any sched +domain and hence won't be load balanced. + +Put simply, it costs less to balance between two smaller sched domains +than one big one, but doing so means that overloads in one of the +two domains won't be load balanced to the other one. + +By default, there is one sched domain covering all CPUs, including those +marked isolated using the kernel boot time "isolcpus=" argument. However, +the isolated CPUs will not participate in load balancing, and will not +have tasks running on them unless explicitly assigned. + +This default load balancing across all CPUs is not well suited for +the following two situations: + + 1) On large systems, load balancing across many CPUs is expensive. + If the system is managed using cpusets to place independent jobs + on separate sets of CPUs, full load balancing is unnecessary. + 2) Systems supporting realtime on some CPUs need to minimize + system overhead on those CPUs, including avoiding task load + balancing if that is not needed. + +When the per-cpuset flag "cpuset.sched_load_balance" is enabled (the default +setting), it requests that all the CPUs in that cpuset's allowed 'cpuset.cpus' +be contained in a single sched domain, ensuring that load balancing +can move a task (not otherwise pinned, as by sched_setaffinity) +from any CPU in that cpuset to any other. + +When the per-cpuset flag "cpuset.sched_load_balance" is disabled, then the +scheduler will avoid load balancing across the CPUs in that cpuset, +--except-- in so far as is necessary because some overlapping cpuset +has "sched_load_balance" enabled. + +So, for example, if the top cpuset has the flag "cpuset.sched_load_balance" +enabled, then the scheduler will have one sched domain covering all +CPUs, and the setting of the "cpuset.sched_load_balance" flag in any other +cpusets won't matter, as we're already fully load balancing.
+ +Therefore in the above two situations, the top cpuset flag +"cpuset.sched_load_balance" should be disabled, and only some of the smaller, +child cpusets have this flag enabled. + +When doing this, you don't usually want to leave any unpinned tasks in +the top cpuset that might use non-trivial amounts of CPU, as such tasks +may be artificially constrained to some subset of CPUs, depending on +the particulars of this flag setting in descendant cpusets. Even if +such a task could use spare CPU cycles in some other CPUs, the kernel +scheduler might not consider the possibility of load balancing that +task to that underused CPU. + +Of course, tasks pinned to a particular CPU can be left in a cpuset +that disables "cpuset.sched_load_balance" as those tasks aren't going anywhere +else anyway. + +There is an impedance mismatch here, between cpusets and sched domains. +Cpusets are hierarchical and nest. Sched domains are flat; they don't +overlap and each CPU is in at most one sched domain. + +It is necessary for sched domains to be flat because load balancing +across partially overlapping sets of CPUs would risk unstable dynamics +that would be beyond our understanding. So if each of two partially +overlapping cpusets enables the flag 'cpuset.sched_load_balance', then we +form a single sched domain that is a superset of both. We won't move +a task to a CPU outside its cpuset, but the scheduler load balancing +code might waste some compute cycles considering that possibility. + +This mismatch is why there is not a simple one-to-one relation +between which cpusets have the flag "cpuset.sched_load_balance" enabled, +and the sched domain configuration. If a cpuset enables the flag, it +will get balancing across all its CPUs, but if it disables the flag, +it will only be assured of no load balancing if no other overlapping +cpuset enables the flag. 
+ +If two cpusets have partially overlapping 'cpuset.cpus' allowed, and only +one of them has this flag enabled, then the other may find its +tasks only partially load balanced, just on the overlapping CPUs. +This is just the general case of the top_cpuset example given a few +paragraphs above. In the general case, as in the top cpuset case, +don't leave tasks that might use non-trivial amounts of CPU in +such partially load balanced cpusets, as they may be artificially +constrained to some subset of the CPUs allowed to them, for lack of +load balancing to the other CPUs. + +CPUs in "cpuset.isolcpus" were excluded from load balancing by the +isolcpus= kernel boot option, and will never be load balanced regardless +of the value of "cpuset.sched_load_balance" in any cpuset. + +1.7.1 sched_load_balance implementation details. +------------------------------------------------ + +The per-cpuset flag 'cpuset.sched_load_balance' defaults to enabled (contrary +to most cpuset flags.) When enabled for a cpuset, the kernel will +ensure that it can load balance across all the CPUs in that cpuset +(makes sure that all the CPUs in the cpus_allowed of that cpuset are +in the same sched domain.) + +If two overlapping cpusets both have 'cpuset.sched_load_balance' enabled, +then they will be (must be) both in the same sched domain. + +If, as is the default, the top cpuset has 'cpuset.sched_load_balance' enabled, +then by the above that means there is a single sched domain covering +the whole system, regardless of any other cpuset settings. + +The kernel commits to user space that it will avoid load balancing +where it can. It will pick as fine a granularity partition of sched +domains as it can while still providing load balancing for any set +of CPUs allowed to a cpuset having 'cpuset.sched_load_balance' enabled. + +The internal kernel cpuset to scheduler interface passes from the +cpuset code to the scheduler code a partition of the load balanced +CPUs in the system. 
This partition is a set of subsets (represented +as an array of struct cpumask) of CPUs, pairwise disjoint, that cover +all the CPUs that must be load balanced. + +The cpuset code builds a new such partition and passes it to the +scheduler sched domain setup code, to have the sched domains rebuilt +as necessary, whenever: + + - the 'cpuset.sched_load_balance' flag of a cpuset with non-empty CPUs changes, + - or CPUs come or go from a cpuset with this flag enabled, + - or 'cpuset.sched_relax_domain_level' value of a cpuset with non-empty CPUs + and with this flag enabled changes, + - or a cpuset with non-empty CPUs and with this flag enabled is removed, + - or a cpu is offlined/onlined. + +This partition exactly defines what sched domains the scheduler should +set up - one sched domain for each element (struct cpumask) in the +partition. + +The scheduler remembers the currently active sched domain partitions. +When the scheduler routine partition_sched_domains() is invoked from +the cpuset code to update these sched domains, it compares the new +partition requested with the current, and updates its sched domains, +removing the old and adding the new, for each change. + + +1.8 What is sched_relax_domain_level ? +-------------------------------------- + +Within a sched domain, the scheduler migrates tasks in 2 ways: periodic load +balancing on tick, and at the time of some scheduling events. + +When a task is woken up, the scheduler tries to move the task to an idle CPU. +For example, if a task A running on CPU X activates another task B +on the same CPU X, and if CPU Y is X's sibling and is idle, +then the scheduler migrates task B to CPU Y so that task B can start on +CPU Y without waiting for task A on CPU X. + +And if a CPU runs out of tasks in its runqueue, the CPU tries to pull +extra tasks from other busy CPUs to help them before it is going to +be idle.
+ +Of course it takes some searching cost to find movable tasks and/or +idle CPUs, so the scheduler might not search all CPUs in the domain +every time. In fact, in some architectures, the searching ranges on +events are limited to the same socket or node where the CPU is located, +while the load balance on tick searches all. + +For example, assume CPU Z is relatively far from CPU X. Even if CPU Z +is idle while CPU X and the siblings are busy, the scheduler can't migrate +woken task B from X to Z since it is out of its searching range. +As a result, task B on CPU X needs to wait for task A or wait for load balancing +on the next tick. For some applications in special situations, waiting +1 tick may be too long. + +The 'cpuset.sched_relax_domain_level' file allows you to request changing +this searching range as you like. This file takes an int value which +indicates the size of the searching range in levels, ideally as follows, +otherwise the initial value -1, which indicates that the cpuset has no request. + +====== =========================================================== + -1 no request. use system default or follow request of others. + 0 no search. + 1 search siblings (hyperthreads in a core). + 2 search cores in a package. + 3 search cpus in a node [= system wide on non-NUMA system] + 4 search nodes in a chunk of node [on NUMA system] + 5 search system wide [on NUMA system] +====== =========================================================== + +The system default is architecture dependent. The system default +can be changed using the relax_domain_level= boot parameter. + +This file is per-cpuset and affects the sched domain to which the cpuset +belongs. Therefore if the flag 'cpuset.sched_load_balance' of a cpuset +is disabled, then 'cpuset.sched_relax_domain_level' has no effect since +there is no sched domain belonging to the cpuset. + +If multiple cpusets are overlapping and hence they form a single sched +domain, the largest value among those is used.
Be careful, if one +requests 0 and others are -1 then 0 is used. + +Note that modifying this file will have both good and bad effects, +and whether it is acceptable or not depends on your situation. +Don't modify this file if you are not sure. + +If your situation is: + + - The migration costs between each cpu can be assumed considerably + small (for you) due to your special application's behavior or + special hardware support for CPU cache etc. + - The searching cost doesn't have impact (for you) or you can make + the searching cost small enough by managing cpuset to compact etc. + - The latency is required even if it sacrifices cache hit rate etc. + then increasing 'sched_relax_domain_level' would benefit you. + + +1.9 How do I use cpusets ? +-------------------------- + +In order to minimize the impact of cpusets on critical kernel +code, such as the scheduler, and due to the fact that the kernel +does not support one task updating the memory placement of another +task directly, the impact on a task of changing its cpuset CPU +or Memory Node placement, or of changing to which cpuset a task +is attached, is subtle. + +If a cpuset has its Memory Nodes modified, then for each task attached +to that cpuset, the next time that the kernel attempts to allocate +a page of memory for that task, the kernel will notice the change +in the task's cpuset, and update its per-task memory placement to +remain within the new cpuset's memory placement. If the task was using +mempolicy MPOL_BIND, and the nodes to which it was bound overlap with +its new cpuset, then the task will continue to use whatever subset +of MPOL_BIND nodes are still allowed in the new cpuset. If the task +was using MPOL_BIND and now none of its MPOL_BIND nodes are allowed +in the new cpuset, then the task will be essentially treated as if it +was MPOL_BIND bound to the new cpuset (even though its NUMA placement, +as queried by get_mempolicy(), doesn't change).
If a task is moved +from one cpuset to another, then the kernel will adjust the task's +memory placement, as above, the next time that the kernel attempts +to allocate a page of memory for that task. + +If a cpuset has its 'cpuset.cpus' modified, then each task in that cpuset +will have its allowed CPU placement changed immediately. Similarly, +if a task's pid is written to another cpuset's 'tasks' file, then its +allowed CPU placement is changed immediately. If such a task had been +bound to some subset of its cpuset using the sched_setaffinity() call, +the task will be allowed to run on any CPU allowed in its new cpuset, +negating the effect of the prior sched_setaffinity() call. + +In summary, the memory placement of a task whose cpuset is changed is +updated by the kernel, on the next allocation of a page for that task, +and the processor placement is updated immediately. + +Normally, once a page is allocated (given a physical page +of main memory) then that page stays on whatever node it +was allocated, so long as it remains allocated, even if the +cpuset's memory placement policy 'cpuset.mems' subsequently changes. +If the cpuset flag file 'cpuset.memory_migrate' is set true, then when +tasks are attached to that cpuset, any pages that task had +allocated to it on nodes in its previous cpuset are migrated +to the task's new cpuset. The relative placement of the page within +the cpuset is preserved during these migration operations if possible. +For example if the page was on the second valid node of the prior cpuset +then the page will be placed on the second valid node of the new cpuset. + +Also if 'cpuset.memory_migrate' is set true, then if that cpuset's +'cpuset.mems' file is modified, pages allocated to tasks in that +cpuset, that were on nodes in the previous setting of 'cpuset.mems', +will be moved to nodes in the new setting of 'cpuset.mems'. +Pages that were not in the task's prior cpuset, or in the cpuset's +prior 'cpuset.mems' setting, will not be moved.
+ +There is an exception to the above. If hotplug functionality is used +to remove all the CPUs that are currently assigned to a cpuset, +then all the tasks in that cpuset will be moved to the nearest ancestor +with non-empty cpus. But the moving of some (or all) tasks might fail if +cpuset is bound with another cgroup subsystem which has some restrictions +on task attaching. In this failing case, those tasks will stay +in the original cpuset, and the kernel will automatically update +their cpus_allowed to allow all online CPUs. When memory hotplug +functionality for removing Memory Nodes is available, a similar exception +is expected to apply there as well. In general, the kernel prefers to +violate cpuset placement, over starving a task that has had all +its allowed CPUs or Memory Nodes taken offline. + +There is a second exception to the above. GFP_ATOMIC requests are +kernel internal allocations that must be satisfied, immediately. +The kernel may drop some request, in rare cases even panic, if a +GFP_ATOMIC alloc fails. If the request cannot be satisfied within +the current task's cpuset, then we relax the cpuset, and look for +memory anywhere we can find it. It's better to violate the cpuset +than stress the kernel. + +To start a new job that is to be contained within a cpuset, the steps are: + + 1) mkdir /sys/fs/cgroup/cpuset + 2) mount -t cgroup -ocpuset cpuset /sys/fs/cgroup/cpuset + 3) Create the new cpuset by doing mkdir's and write's (or echo's) in + the /sys/fs/cgroup/cpuset virtual file system. + 4) Start a task that will be the "founding father" of the new job. + 5) Attach that task to the new cpuset by writing its pid to the + /sys/fs/cgroup/cpuset tasks file for that cpuset. + 6) fork, exec or clone the job tasks from this founding father task. 
+ +For example, the following sequence of commands will setup a cpuset +named "Charlie", containing just CPUs 2 and 3, and Memory Node 1, +and then start a subshell 'sh' in that cpuset:: + + mount -t cgroup -ocpuset cpuset /sys/fs/cgroup/cpuset + cd /sys/fs/cgroup/cpuset + mkdir Charlie + cd Charlie + /bin/echo 2-3 > cpuset.cpus + /bin/echo 1 > cpuset.mems + /bin/echo $$ > tasks + sh + # The subshell 'sh' is now running in cpuset Charlie + # The next line should display '/Charlie' + cat /proc/self/cpuset + +There are ways to query or modify cpusets: + + - via the cpuset file system directly, using the various cd, mkdir, echo, + cat, rmdir commands from the shell, or their equivalent from C. + - via the C library libcpuset. + - via the C library libcgroup. + (http://sourceforge.net/projects/libcg/) + - via the python application cset. + (http://code.google.com/p/cpuset/) + +The sched_setaffinity calls can also be done at the shell prompt using +SGI's runon or Robert Love's taskset. The mbind and set_mempolicy +calls can be done at the shell prompt using the numactl command +(part of Andi Kleen's numa package). + +2. Usage Examples and Syntax +============================ + +2.1 Basic Usage +--------------- + +Creating, modifying, using the cpusets can be done through the cpuset +virtual filesystem. + +To mount it, type: +# mount -t cgroup -o cpuset cpuset /sys/fs/cgroup/cpuset + +Then under /sys/fs/cgroup/cpuset you can find a tree that corresponds to the +tree of the cpusets in the system. For instance, /sys/fs/cgroup/cpuset +is the cpuset that holds the whole system. 
+ +If you want to create a new cpuset under /sys/fs/cgroup/cpuset:: + + # cd /sys/fs/cgroup/cpuset + # mkdir my_cpuset + +Now you want to do something with this cpuset:: + + # cd my_cpuset + +In this directory you can find several files:: + + # ls + cgroup.clone_children cpuset.memory_pressure + cgroup.event_control cpuset.memory_spread_page + cgroup.procs cpuset.memory_spread_slab + cpuset.cpu_exclusive cpuset.mems + cpuset.cpus cpuset.sched_load_balance + cpuset.mem_exclusive cpuset.sched_relax_domain_level + cpuset.mem_hardwall notify_on_release + cpuset.memory_migrate tasks + +Reading them will give you information about the state of this cpuset: +the CPUs and Memory Nodes it can use, the processes that are using +it, its properties. By writing to these files you can manipulate +the cpuset. + +Set some flags:: + + # /bin/echo 1 > cpuset.cpu_exclusive + +Add some cpus:: + + # /bin/echo 0-7 > cpuset.cpus + +Add some mems:: + + # /bin/echo 0-7 > cpuset.mems + +Now attach your shell to this cpuset:: + + # /bin/echo $$ > tasks + +You can also create cpusets inside your cpuset by using mkdir in this +directory:: + + # mkdir my_sub_cs + +To remove a cpuset, just use rmdir:: + + # rmdir my_sub_cs + +This will fail if the cpuset is in use (has cpusets inside, or has +processes attached). + +Note that for legacy reasons, the "cpuset" filesystem exists as a +wrapper around the cgroup filesystem. 
+ +The command:: + + mount -t cpuset X /sys/fs/cgroup/cpuset + +is equivalent to:: + + mount -t cgroup -ocpuset,noprefix X /sys/fs/cgroup/cpuset + echo "/sbin/cpuset_release_agent" > /sys/fs/cgroup/cpuset/release_agent + +2.2 Adding/removing cpus +------------------------ + +This is the syntax to use when writing in the cpus or mems files +in cpuset directories:: + + # /bin/echo 1-4 > cpuset.cpus -> set cpus list to cpus 1,2,3,4 + # /bin/echo 1,2,3,4 > cpuset.cpus -> set cpus list to cpus 1,2,3,4 + +To add a CPU to a cpuset, write the new list of CPUs including the +CPU to be added. To add 6 to the above cpuset:: + + # /bin/echo 1-4,6 > cpuset.cpus -> set cpus list to cpus 1,2,3,4,6 + +Similarly to remove a CPU from a cpuset, write the new list of CPUs +without the CPU to be removed. + +To remove all the CPUs:: + + # /bin/echo "" > cpuset.cpus -> clear cpus list + +2.3 Setting flags +----------------- + +The syntax is very simple:: + + # /bin/echo 1 > cpuset.cpu_exclusive -> set flag 'cpuset.cpu_exclusive' + # /bin/echo 0 > cpuset.cpu_exclusive -> unset flag 'cpuset.cpu_exclusive' + +2.4 Attaching processes +----------------------- + +:: + + # /bin/echo PID > tasks + +Note that it is PID, not PIDs. You can only attach ONE task at a time. +If you have several tasks to attach, you have to do it one after another:: + + # /bin/echo PID1 > tasks + # /bin/echo PID2 > tasks + ... + # /bin/echo PIDn > tasks + + +3. Questions +============ + +Q: + what's up with this '/bin/echo' ? + +A: + bash's builtin 'echo' command does not check calls to write() against + errors. If you use it in the cpuset file system, you won't be + able to tell whether a command succeeded or failed. + +Q: + When I attach processes, only the first of the line gets really attached ! + +A: + We can only return one error code per call to write(). So you should also + put only ONE pid. + +4. 
Contact +========== + +Web: http://www.bullopensource.org/cpuset diff --git a/Documentation/admin-guide/cgroup-v1/devices.rst b/Documentation/admin-guide/cgroup-v1/devices.rst new file mode 100644 index 000000000000..e1886783961e --- /dev/null +++ b/Documentation/admin-guide/cgroup-v1/devices.rst @@ -0,0 +1,132 @@ +=========================== +Device Whitelist Controller +=========================== + +1. Description +============== + +Implement a cgroup to track and enforce open and mknod restrictions +on device files. A device cgroup associates a device access +whitelist with each cgroup. A whitelist entry has 4 fields. +'type' is a (all), c (char), or b (block). 'all' means it applies +to all types and all major and minor numbers. Major and minor are +either an integer or * for all. Access is a composition of r +(read), w (write), and m (mknod). + +The root device cgroup starts with rwm to 'all'. A child device +cgroup gets a copy of the parent. Administrators can then remove +devices from the whitelist or add new entries. A child cgroup can +never receive a device access which is denied by its parent. + +2. User Interface +================= + +An entry is added using devices.allow, and removed using +devices.deny. For instance:: + + echo 'c 1:3 mr' > /sys/fs/cgroup/1/devices.allow + +allows cgroup 1 to read and mknod the device usually known as +/dev/null. Doing:: + + echo a > /sys/fs/cgroup/1/devices.deny + +will remove the default 'a *:* rwm' entry. Doing:: + + echo a > /sys/fs/cgroup/1/devices.allow + +will add the 'a *:* rwm' entry to the whitelist. + +3. Security +=========== + +Any task can move itself between cgroups. This clearly won't +suffice, but we can decide the best way to adequately restrict +movement as people get some experience with this. We may just want +to require CAP_SYS_ADMIN, which at least is a separate bit from +CAP_MKNOD. We may want to just refuse moving to a cgroup which +isn't a descendant of the current one. 
Or we may want to use +CAP_MAC_ADMIN, since we really are trying to lock down root. + +CAP_SYS_ADMIN is needed to modify the whitelist or move another +task to a new cgroup. (Again we'll probably want to change that). + +A cgroup may not be granted more permissions than the cgroup's +parent has. + +4. Hierarchy +============ + +device cgroups maintain hierarchy by making sure a cgroup never has more +access permissions than its parent. Every time an entry is written to +a cgroup's devices.deny file, all its children will have that entry removed +from their whitelist and all the locally set whitelist entries will be +re-evaluated. In case one of the locally set whitelist entries would provide +more access than the cgroup's parent, it'll be removed from the whitelist. + +Example:: + + A + / \ + B + + group behavior exceptions + A allow "b 8:* rwm", "c 116:1 rw" + B deny "c 1:3 rwm", "c 116:2 rwm", "b 3:* rwm" + +If a device is denied in group A:: + + # echo "c 116:* r" > A/devices.deny + +it'll propagate down and after revalidating B's entries, the whitelist entry +"c 116:2 rwm" will be removed:: + + group whitelist entries denied devices + A all "b 8:* rwm", "c 116:* rw" + B "c 1:3 rwm", "b 3:* rwm" all the rest + +In case parent's exceptions change and local exceptions are not allowed +anymore, they'll be deleted. 
+ +Notice that new whitelist entries will not be propagated:: + + A + / \ + B + + group whitelist entries denied devices + A "c 1:3 rwm", "c 1:5 r" all the rest + B "c 1:3 rwm", "c 1:5 r" all the rest + +when adding ``c *:3 rwm``:: + + # echo "c *:3 rwm" >A/devices.allow + +the result:: + + group whitelist entries denied devices + A "c *:3 rwm", "c 1:5 r" all the rest + B "c 1:3 rwm", "c 1:5 r" all the rest + +but now it'll be possible to add new entries to B:: + + # echo "c 2:3 rwm" >B/devices.allow + # echo "c 50:3 r" >B/devices.allow + +or even:: + + # echo "c *:3 rwm" >B/devices.allow + +Allowing or denying all by writing 'a' to devices.allow or devices.deny will +not be possible once the device cgroups has children. + +4.1 Hierarchy (internal implementation) +--------------------------------------- + +device cgroups is implemented internally using a behavior (ALLOW, DENY) and a +list of exceptions. The internal state is controlled using the same user +interface to preserve compatibility with the previous whitelist-only +implementation. Removal or addition of exceptions that will reduce the access +to devices will be propagated down the hierarchy. +For every propagated exception, the effective rules will be re-evaluated based +on current parent's access rules. diff --git a/Documentation/admin-guide/cgroup-v1/freezer-subsystem.rst b/Documentation/admin-guide/cgroup-v1/freezer-subsystem.rst new file mode 100644 index 000000000000..582d3427de3f --- /dev/null +++ b/Documentation/admin-guide/cgroup-v1/freezer-subsystem.rst @@ -0,0 +1,127 @@ +============== +Cgroup Freezer +============== + +The cgroup freezer is useful to batch job management system which start +and stop sets of tasks in order to schedule the resources of a machine +according to the desires of a system administrator. This sort of program +is often used on HPC clusters to schedule access to the cluster as a +whole. 
The cgroup freezer uses cgroups to describe the set of tasks to +be started/stopped by the batch job management system. It also provides +a means to start and stop the tasks composing the job. + +The cgroup freezer will also be useful for checkpointing running groups +of tasks. The freezer allows the checkpoint code to obtain a consistent +image of the tasks by attempting to force the tasks in a cgroup into a +quiescent state. Once the tasks are quiescent another task can +walk /proc or invoke a kernel interface to gather information about the +quiesced tasks. Checkpointed tasks can be restarted later should a +recoverable error occur. This also allows the checkpointed tasks to be +migrated between nodes in a cluster by copying the gathered information +to another node and restarting the tasks there. + +Sequences of SIGSTOP and SIGCONT are not always sufficient for stopping +and resuming tasks in userspace. Both of these signals are observable +from within the tasks we wish to freeze. While SIGSTOP cannot be caught, +blocked, or ignored it can be seen by waiting or ptracing parent tasks. +SIGCONT is especially unsuitable since it can be caught by the task. Any +programs designed to watch for SIGSTOP and SIGCONT could be broken by +attempting to use SIGSTOP and SIGCONT to stop and resume tasks. We can +demonstrate this problem using nested bash shells:: + + $ echo $$ + 16644 + $ bash + $ echo $$ + 16690 + + From a second, unrelated bash shell: + $ kill -SIGSTOP 16690 + $ kill -SIGCONT 16690 + + + +This happens because bash can observe both signals and choose how it +responds to them. + +Another example of a program which catches and responds to these +signals is gdb. In fact any program designed to use ptrace is likely to +have a problem with this method of stopping and resuming tasks. + +In contrast, the cgroup freezer uses the kernel freezer code to +prevent the freeze/unfreeze cycle from becoming visible to the tasks +being frozen. 
This allows the bash example above and gdb to run as +expected. + +The cgroup freezer is hierarchical. Freezing a cgroup freezes all +tasks belonging to the cgroup and all its descendant cgroups. Each +cgroup has its own state (self-state) and the state inherited from the +parent (parent-state). Iff both states are THAWED, the cgroup is +THAWED. + +The following cgroupfs files are created by cgroup freezer. + +* freezer.state: Read-write. + + When read, returns the effective state of the cgroup - "THAWED", + "FREEZING" or "FROZEN". This is the combined self and parent-states. + If any is freezing, the cgroup is freezing (FREEZING or FROZEN). + + FREEZING cgroup transitions into FROZEN state when all tasks + belonging to the cgroup and its descendants become frozen. Note that + a cgroup reverts to FREEZING from FROZEN after a new task is added + to the cgroup or one of its descendant cgroups until the new task is + frozen. + + When written, sets the self-state of the cgroup. Two values are + allowed - "FROZEN" and "THAWED". If FROZEN is written, the cgroup, + if not already freezing, enters FREEZING state along with all its + descendant cgroups. + + If THAWED is written, the self-state of the cgroup is changed to + THAWED. Note that the effective state may not change to THAWED if + the parent-state is still freezing. If a cgroup's effective state + becomes THAWED, all its descendants which are freezing because of + the cgroup also leave the freezing state. + +* freezer.self_freezing: Read only. + + Shows the self-state. 0 if the self-state is THAWED; otherwise, 1. + This value is 1 iff the last write to freezer.state was "FROZEN". + +* freezer.parent_freezing: Read only. + + Shows the parent-state. 0 if none of the cgroup's ancestors is + frozen; otherwise, 1. + +The root cgroup is non-freezable and the above interface files don't +exist. 
+ +* Examples of usage:: + + # mkdir /sys/fs/cgroup/freezer + # mount -t cgroup -ofreezer freezer /sys/fs/cgroup/freezer + # mkdir /sys/fs/cgroup/freezer/0 + # echo $some_pid > /sys/fs/cgroup/freezer/0/tasks + +to get status of the freezer subsystem:: + + # cat /sys/fs/cgroup/freezer/0/freezer.state + THAWED + +to freeze all tasks in the container:: + + # echo FROZEN > /sys/fs/cgroup/freezer/0/freezer.state + # cat /sys/fs/cgroup/freezer/0/freezer.state + FREEZING + # cat /sys/fs/cgroup/freezer/0/freezer.state + FROZEN + +to unfreeze all tasks in the container:: + + # echo THAWED > /sys/fs/cgroup/freezer/0/freezer.state + # cat /sys/fs/cgroup/freezer/0/freezer.state + THAWED + +This is the basic mechanism which should do the right thing for user space task +in a simple scenario. diff --git a/Documentation/admin-guide/cgroup-v1/hugetlb.rst b/Documentation/admin-guide/cgroup-v1/hugetlb.rst new file mode 100644 index 000000000000..a3902aa253a9 --- /dev/null +++ b/Documentation/admin-guide/cgroup-v1/hugetlb.rst @@ -0,0 +1,50 @@ +================== +HugeTLB Controller +================== + +The HugeTLB controller allows to limit the HugeTLB usage per control group and +enforces the controller limit during page fault. Since HugeTLB doesn't +support page reclaim, enforcing the limit at page fault time implies that, +the application will get SIGBUS signal if it tries to access HugeTLB pages +beyond its limit. This requires the application to know beforehand how much +HugeTLB pages it would require for its use. + +HugeTLB controller can be created by first mounting the cgroup filesystem. + +# mount -t cgroup -o hugetlb none /sys/fs/cgroup + +With the above step, the initial or the parent HugeTLB group becomes +visible at /sys/fs/cgroup. At bootup, this group includes all the tasks in +the system. /sys/fs/cgroup/tasks lists the tasks in this cgroup. 
+ +New groups can be created under the parent group /sys/fs/cgroup:: + + # cd /sys/fs/cgroup + # mkdir g1 + # echo $$ > g1/tasks + +The above steps create a new group g1 and move the current shell +process (bash) into it. + +Brief summary of control files:: + + hugetlb..limit_in_bytes # set/show limit of "hugepagesize" hugetlb usage + hugetlb..max_usage_in_bytes # show max "hugepagesize" hugetlb usage recorded + hugetlb..usage_in_bytes # show current usage for "hugepagesize" hugetlb + hugetlb..failcnt # show the number of allocation failure due to HugeTLB limit + +For a system supporting three hugepage sizes (64k, 32M and 1G), the control +files include:: + + hugetlb.1GB.limit_in_bytes + hugetlb.1GB.max_usage_in_bytes + hugetlb.1GB.usage_in_bytes + hugetlb.1GB.failcnt + hugetlb.64KB.limit_in_bytes + hugetlb.64KB.max_usage_in_bytes + hugetlb.64KB.usage_in_bytes + hugetlb.64KB.failcnt + hugetlb.32MB.limit_in_bytes + hugetlb.32MB.max_usage_in_bytes + hugetlb.32MB.usage_in_bytes + hugetlb.32MB.failcnt diff --git a/Documentation/admin-guide/cgroup-v1/index.rst b/Documentation/admin-guide/cgroup-v1/index.rst new file mode 100644 index 000000000000..10bf48bae0b0 --- /dev/null +++ b/Documentation/admin-guide/cgroup-v1/index.rst @@ -0,0 +1,28 @@ +======================== +Control Groups version 1 +======================== + +.. toctree:: + :maxdepth: 1 + + cgroups + + blkio-controller + cpuacct + cpusets + devices + freezer-subsystem + hugetlb + memcg_test + memory + net_cls + net_prio + pids + rdma + +.. 
only:: subproject and html + + Indices + ======= + + * :ref:`genindex` diff --git a/Documentation/admin-guide/cgroup-v1/memcg_test.rst b/Documentation/admin-guide/cgroup-v1/memcg_test.rst new file mode 100644 index 000000000000..3f7115e07b5d --- /dev/null +++ b/Documentation/admin-guide/cgroup-v1/memcg_test.rst @@ -0,0 +1,355 @@ +===================================================== +Memory Resource Controller(Memcg) Implementation Memo +===================================================== + +Last Updated: 2010/2 + +Base Kernel Version: based on 2.6.33-rc7-mm(candidate for 34). + +Because VM is getting complex (one of reasons is memcg...), memcg's behavior +is complex. This is a document for memcg's internal behavior. +Please note that implementation details can be changed. + +(*) Topics on API should be in Documentation/admin-guide/cgroup-v1/memory.rst) + +0. How to record usage ? +======================== + + 2 objects are used. + + page_cgroup ....an object per page. + + Allocated at boot or memory hotplug. Freed at memory hot removal. + + swap_cgroup ... an entry per swp_entry. + + Allocated at swapon(). Freed at swapoff(). + + The page_cgroup has USED bit and double count against a page_cgroup never + occurs. swap_cgroup is used only when a charged page is swapped-out. + +1. Charge +========= + + a page/swp_entry may be charged (usage += PAGE_SIZE) at + + mem_cgroup_try_charge() + +2. Uncharge +=========== + + a page/swp_entry may be uncharged (usage -= PAGE_SIZE) by + + mem_cgroup_uncharge() + Called when a page's refcount goes down to 0. + + mem_cgroup_uncharge_swap() + Called when swp_entry's refcnt goes down to 0. A charge against swap + disappears. + +3. charge-commit-cancel +======================= + + Memcg pages are charged in two steps: + + - mem_cgroup_try_charge() + - mem_cgroup_commit_charge() or mem_cgroup_cancel_charge() + + At try_charge(), there are no flags to say "this page is charged". + at this point, usage += PAGE_SIZE. 
+ + At commit(), the page is associated with the memcg. + + At cancel(), simply usage -= PAGE_SIZE. + +Under below explanation, we assume CONFIG_MEM_RES_CTRL_SWAP=y. + +4. Anonymous +============ + + Anonymous page is newly allocated at + - page fault into MAP_ANONYMOUS mapping. + - Copy-On-Write. + + 4.1 Swap-in. + At swap-in, the page is taken from swap-cache. There are 2 cases. + + (a) If the SwapCache is newly allocated and read, it has no charges. + (b) If the SwapCache has been mapped by processes, it has been + charged already. + + 4.2 Swap-out. + At swap-out, typical state transition is below. + + (a) add to swap cache. (marked as SwapCache) + swp_entry's refcnt += 1. + (b) fully unmapped. + swp_entry's refcnt += # of ptes. + (c) write back to swap. + (d) delete from swap cache. (remove from SwapCache) + swp_entry's refcnt -= 1. + + + Finally, at task exit, + (e) zap_pte() is called and swp_entry's refcnt -=1 -> 0. + +5. Page Cache +============= + + Page Cache is charged at + - add_to_page_cache_locked(). + + The logic is very clear. (About migration, see below) + + Note: + __remove_from_page_cache() is called by remove_from_page_cache() + and __remove_mapping(). + +6. Shmem(tmpfs) Page Cache +=========================== + + The best way to understand shmem's page state transition is to read + mm/shmem.c. + + But brief explanation of the behavior of memcg around shmem will be + helpful to understand the logic. + + Shmem's page (just leaf page, not direct/indirect block) can be on + + - radix-tree of shmem's inode. + - SwapCache. + - Both on radix-tree and SwapCache. This happens at swap-in + and swap-out, + + It's charged when... + + - A new page is added to shmem's radix-tree. + - A swp page is read. (move a charge from swap_cgroup to page_cgroup) + +7. Page Migration +================= + + mem_cgroup_migrate() + +8. LRU +====== + Each memcg has its own private LRU. 
Now, its handling is under global + VM's control (means that it's handled under global pgdat->lru_lock). + Almost all routines around memcg's LRU is called by global LRU's + list management functions under pgdat->lru_lock. + + A special function is mem_cgroup_isolate_pages(). This scans + memcg's private LRU and call __isolate_lru_page() to extract a page + from LRU. + + (By __isolate_lru_page(), the page is removed from both of global and + private LRU.) + + +9. Typical Tests. +================= + + Tests for racy cases. + +9.1 Small limit to memcg. +------------------------- + + When you do test to do racy case, it's good test to set memcg's limit + to be very small rather than GB. Many races found in the test under + xKB or xxMB limits. + + (Memory behavior under GB and Memory behavior under MB shows very + different situation.) + +9.2 Shmem +--------- + + Historically, memcg's shmem handling was poor and we saw some amount + of troubles here. This is because shmem is page-cache but can be + SwapCache. Test with shmem/tmpfs is always good test. + +9.3 Migration +------------- + + For NUMA, migration is an another special case. To do easy test, cpuset + is useful. Following is a sample script to do migration:: + + mount -t cgroup -o cpuset none /opt/cpuset + + mkdir /opt/cpuset/01 + echo 1 > /opt/cpuset/01/cpuset.cpus + echo 0 > /opt/cpuset/01/cpuset.mems + echo 1 > /opt/cpuset/01/cpuset.memory_migrate + mkdir /opt/cpuset/02 + echo 1 > /opt/cpuset/02/cpuset.cpus + echo 1 > /opt/cpuset/02/cpuset.mems + echo 1 > /opt/cpuset/02/cpuset.memory_migrate + + In above set, when you moves a task from 01 to 02, page migration to + node 0 to node 1 will occur. 
Following is a script to migrate all + under cpuset.:: + + -- + move_task() + { + for pid in $1 + do + /bin/echo $pid >$2/tasks 2>/dev/null + echo -n $pid + echo -n " " + done + echo END + } + + G1_TASK=`cat ${G1}/tasks` + G2_TASK=`cat ${G2}/tasks` + move_task "${G1_TASK}" ${G2} & + -- + +9.4 Memory hotplug +------------------ + + memory hotplug test is one of good test. + + to offline memory, do following:: + + # echo offline > /sys/devices/system/memory/memoryXXX/state + + (XXX is the place of memory) + + This is an easy way to test page migration, too. + +9.5 mkdir/rmdir +--------------- + + When using hierarchy, mkdir/rmdir test should be done. + Use tests like the following:: + + echo 1 >/opt/cgroup/01/memory/use_hierarchy + mkdir /opt/cgroup/01/child_a + mkdir /opt/cgroup/01/child_b + + set limit to 01. + add limit to 01/child_b + run jobs under child_a and child_b + + create/delete following groups at random while jobs are running:: + + /opt/cgroup/01/child_a/child_aa + /opt/cgroup/01/child_b/child_bb + /opt/cgroup/01/child_c + + running new jobs in new group is also good. + +9.6 Mount with other subsystems +------------------------------- + + Mounting with other subsystems is a good test because there is a + race and lock dependency with other cgroup subsystems. + + example:: + + # mount -t cgroup none /cgroup -o cpuset,memory,cpu,devices + + and do task move, mkdir, rmdir etc...under this. + +9.7 swapoff +----------- + + Besides management of swap is one of complicated parts of memcg, + call path of swap-in at swapoff is not same as usual swap-in path.. + It's worth to be tested explicitly. + + For example, test like following is good: + + (Shell-A):: + + # mount -t cgroup none /cgroup -o memory + # mkdir /cgroup/test + # echo 40M > /cgroup/test/memory.limit_in_bytes + # echo 0 > /cgroup/test/tasks + + Run malloc(100M) program under this. You'll see 60M of swaps. 
+ + (Shell-B):: + + # move all tasks in /cgroup/test to /cgroup + # /sbin/swapoff -a + # rmdir /cgroup/test + # kill malloc task. + + Of course, tmpfs v.s. swapoff test should be tested, too. + +9.8 OOM-Killer +-------------- + + Out-of-memory caused by memcg's limit will kill tasks under + the memcg. When hierarchy is used, a task under hierarchy + will be killed by the kernel. + + In this case, panic_on_oom shouldn't be invoked and tasks + in other groups shouldn't be killed. + + It's not difficult to cause OOM under memcg as following. + + Case A) when you can swapoff:: + + #swapoff -a + #echo 50M > /memory.limit_in_bytes + + run 51M of malloc + + Case B) when you use mem+swap limitation:: + + #echo 50M > memory.limit_in_bytes + #echo 50M > memory.memsw.limit_in_bytes + + run 51M of malloc + +9.9 Move charges at task migration +---------------------------------- + + Charges associated with a task can be moved along with task migration. + + (Shell-A):: + + #mkdir /cgroup/A + #echo $$ >/cgroup/A/tasks + + run some programs which uses some amount of memory in /cgroup/A. + + (Shell-B):: + + #mkdir /cgroup/B + #echo 1 >/cgroup/B/memory.move_charge_at_immigrate + #echo "pid of the program running in group A" >/cgroup/B/tasks + + You can see charges have been moved by reading ``*.usage_in_bytes`` or + memory.stat of both A and B. + + See 8.2 of Documentation/admin-guide/cgroup-v1/memory.rst to see what value should + be written to move_charge_at_immigrate. + +9.10 Memory thresholds +---------------------- + + Memory controller implements memory thresholds using cgroups notification + API. You can use tools/cgroup/cgroup_event_listener.c to test it. 
+ + (Shell-A) Create cgroup and run event listener:: + + # mkdir /cgroup/A + # ./cgroup_event_listener /cgroup/A/memory.usage_in_bytes 5M + + (Shell-B) Add task to cgroup and try to allocate and free memory:: + + # echo $$ >/cgroup/A/tasks + # a="$(dd if=/dev/zero bs=1M count=10)" + # a= + + You will see message from cgroup_event_listener every time you cross + the thresholds. + + Use /cgroup/A/memory.memsw.usage_in_bytes to test memsw thresholds. + + It's good idea to test root cgroup as well. diff --git a/Documentation/admin-guide/cgroup-v1/memory.rst b/Documentation/admin-guide/cgroup-v1/memory.rst new file mode 100644 index 000000000000..41bdc038dad9 --- /dev/null +++ b/Documentation/admin-guide/cgroup-v1/memory.rst @@ -0,0 +1,1003 @@ +========================== +Memory Resource Controller +========================== + +NOTE: + This document is hopelessly outdated and it asks for a complete + rewrite. It still contains a useful information so we are keeping it + here but make sure to check the current code if you need a deeper + understanding. + +NOTE: + The Memory Resource Controller has generically been referred to as the + memory controller in this document. Do not confuse memory controller + used here with the memory controller that is used in hardware. + +(For editors) In this document: + When we mention a cgroup (cgroupfs's directory) with memory controller, + we call it "memory cgroup". When you see git-log and source code, you'll + see patch's title and function names tend to use "memcg". + In this document, we avoid using it. + +Benefits and Purpose of the memory controller +============================================= + +The memory controller isolates the memory behaviour of a group of tasks +from the rest of the system. The article on LWN [12] mentions some probable +uses of the memory controller. The memory controller can be used to + +a. 
Isolate an application or a group of applications + Memory-hungry applications can be isolated and limited to a smaller + amount of memory. +b. Create a cgroup with a limited amount of memory; this can be used + as a good alternative to booting with mem=XXXX. +c. Virtualization solutions can control the amount of memory they want + to assign to a virtual machine instance. +d. A CD/DVD burner could control the amount of memory used by the + rest of the system to ensure that burning does not fail due to lack + of available memory. +e. There are several other use cases; find one or use the controller just + for fun (to learn and hack on the VM subsystem). + +Current Status: linux-2.6.34-mmotm(development version of 2010/April) + +Features: + + - accounting anonymous pages, file caches, swap caches usage and limiting them. + - pages are linked to per-memcg LRU exclusively, and there is no global LRU. + - optionally, memory+swap usage can be accounted and limited. + - hierarchical accounting + - soft limit + - moving (recharging) account at moving a task is selectable. + - usage threshold notifier + - memory pressure notifier + - oom-killer disable knob and oom-notifier + - Root cgroup has no limit controls. + + Kernel memory support is a work in progress, and the current version provides + basically functionality. (See Section 2.7) + +Brief summary of control files. 
+ +==================================== ========================================== + tasks attach a task(thread) and show list of + threads + cgroup.procs show list of processes + cgroup.event_control an interface for event_fd() + memory.usage_in_bytes show current usage for memory + (See 5.5 for details) + memory.memsw.usage_in_bytes show current usage for memory+Swap + (See 5.5 for details) + memory.limit_in_bytes set/show limit of memory usage + memory.memsw.limit_in_bytes set/show limit of memory+Swap usage + memory.failcnt show the number of memory usage hits limits + memory.memsw.failcnt show the number of memory+Swap hits limits + memory.max_usage_in_bytes show max memory usage recorded + memory.memsw.max_usage_in_bytes show max memory+Swap usage recorded + memory.soft_limit_in_bytes set/show soft limit of memory usage + memory.stat show various statistics + memory.use_hierarchy set/show hierarchical account enabled + memory.force_empty trigger forced page reclaim + memory.pressure_level set memory pressure notifications + memory.swappiness set/show swappiness parameter of vmscan + (See sysctl's vm.swappiness) + memory.move_charge_at_immigrate set/show controls of moving charges + memory.oom_control set/show oom controls. + memory.numa_stat show the number of memory usage per numa + node + + memory.kmem.limit_in_bytes set/show hard limit for kernel memory + memory.kmem.usage_in_bytes show current kernel memory allocation + memory.kmem.failcnt show the number of kernel memory usage + hits limits + memory.kmem.max_usage_in_bytes show max kernel memory usage recorded + + memory.kmem.tcp.limit_in_bytes set/show hard limit for tcp buf memory + memory.kmem.tcp.usage_in_bytes show current tcp buf memory allocation + memory.kmem.tcp.failcnt show the number of tcp buf memory usage + hits limits + memory.kmem.tcp.max_usage_in_bytes show max tcp buf memory usage recorded +==================================== ========================================== + +1. 
History +========== + +The memory controller has a long history. A request for comments for the memory +controller was posted by Balbir Singh [1]. At the time the RFC was posted +there were several implementations for memory control. The goal of the +RFC was to build consensus and agreement for the minimal features required +for memory control. The first RSS controller was posted by Balbir Singh[2] +in Feb 2007. Pavel Emelianov [3][4][5] has since posted three versions of the +RSS controller. At OLS, at the resource management BoF, everyone suggested +that we handle both page cache and RSS together. Another request was raised +to allow user space handling of OOM. The current memory controller is +at version 6; it combines both mapped (RSS) and unmapped Page +Cache Control [11]. + +2. Memory Control +================= + +Memory is a unique resource in the sense that it is present in a limited +amount. If a task requires a lot of CPU processing, the task can spread +its processing over a period of hours, days, months or years, but with +memory, the same physical memory needs to be reused to accomplish the task. + +The memory controller implementation has been divided into phases. These +are: + +1. Memory controller +2. mlock(2) controller +3. Kernel user memory accounting and slab control +4. user mappings length controller + +The memory controller is the first controller developed. + +2.1. Design +----------- + +The core of the design is a counter called the page_counter. The +page_counter tracks the current memory usage and limit of the group of +processes associated with the controller. Each cgroup has a memory controller +specific data structure (mem_cgroup) associated with it. + +2.2. Accounting +--------------- + +:: + + +--------------------+ + | mem_cgroup | + | (page_counter) | + +--------------------+ + / ^ \ + / | \ + +---------------+ | +---------------+ + | mm_struct | |.... 
| mm_struct | + | | | | | + +---------------+ | +---------------+ + | + + --------------+ + | + +---------------+ +------+--------+ + | page +----------> page_cgroup| + | | | | + +---------------+ +---------------+ + + (Figure 1: Hierarchy of Accounting) + + +Figure 1 shows the important aspects of the controller + +1. Accounting happens per cgroup +2. Each mm_struct knows about which cgroup it belongs to +3. Each page has a pointer to the page_cgroup, which in turn knows the + cgroup it belongs to + +The accounting is done as follows: mem_cgroup_charge_common() is invoked to +set up the necessary data structures and check if the cgroup that is being +charged is over its limit. If it is, then reclaim is invoked on the cgroup. +More details can be found in the reclaim section of this document. +If everything goes well, a page meta-data-structure called page_cgroup is +updated. page_cgroup has its own LRU on cgroup. +(*) page_cgroup structure is allocated at boot/memory-hotplug time. + +2.2.1 Accounting details +------------------------ + +All mapped anon pages (RSS) and cache pages (Page Cache) are accounted. +Some pages which are never reclaimable and will not be on the LRU +are not accounted. We just account pages under usual VM management. + +RSS pages are accounted at page_fault unless they've already been accounted +for earlier. A file page will be accounted for as Page Cache when it's +inserted into inode (radix-tree). While it's mapped into the page tables of +processes, duplicate accounting is carefully avoided. + +An RSS page is unaccounted when it's fully unmapped. A PageCache page is +unaccounted when it's removed from radix-tree. Even if RSS pages are fully +unmapped (by kswapd), they may exist as SwapCache in the system until they +are really freed. Such SwapCaches are also accounted. +A swapped-in page is not accounted until it's mapped. + +Note: The kernel does swapin-readahead and reads multiple swaps at once. 
+This means swapped-in pages may contain pages for other tasks than a task +causing page fault. So, we avoid accounting at swap-in I/O. + +At page migration, accounting information is kept. + +Note: we just account pages-on-LRU because our purpose is to control amount +of used pages; not-on-LRU pages tend to be out-of-control from VM view. + +2.3 Shared Page Accounting +-------------------------- + +Shared pages are accounted on the basis of the first touch approach. The +cgroup that first touches a page is accounted for the page. The principle +behind this approach is that a cgroup that aggressively uses a shared +page will eventually get charged for it (once it is uncharged from +the cgroup that brought it in -- this will happen on memory pressure). + +But see section 8.2: when moving a task to another cgroup, its pages may +be recharged to the new cgroup, if move_charge_at_immigrate has been chosen. + +Exception: If CONFIG_MEMCG_SWAP is not used. +When you do swapoff and make swapped-out pages of shmem(tmpfs) to +be backed into memory in force, charges for pages are accounted against the +caller of swapoff rather than the users of shmem. + +2.4 Swap Extension (CONFIG_MEMCG_SWAP) +-------------------------------------- + +Swap Extension allows you to record charge for swap. A swapped-in page is +charged back to original page allocator if possible. + +When swap is accounted, following files are added. + + - memory.memsw.usage_in_bytes. + - memory.memsw.limit_in_bytes. + +memsw means memory+swap. Usage of memory+swap is limited by +memsw.limit_in_bytes. + +Example: Assume a system with 4G of swap. A task which allocates 6G of memory +(by mistake) under 2G memory limitation will use all swap. +In this case, setting memsw.limit_in_bytes=3G will prevent bad use of swap. +By using the memsw limit, you can avoid system OOM which can be caused by swap +shortage. + +**why 'memory+swap' rather than swap** + +The global LRU(kswapd) can swap out arbitrary pages. 
Swap-out means +to move account from memory to swap...there is no change in usage of +memory+swap. In other words, when we want to limit the usage of swap without +affecting global LRU, memory+swap limit is better than just limiting swap from +an OS point of view. + +**What happens when a cgroup hits memory.memsw.limit_in_bytes** + +When a cgroup hits memory.memsw.limit_in_bytes, it's useless to do swap-out +in this cgroup. Then, swap-out will not be done by cgroup routine and file +caches are dropped. But as mentioned above, global LRU can do swapout memory +from it for sanity of the system's memory management state. You can't forbid +it by cgroup. + +2.5 Reclaim +----------- + +Each cgroup maintains a per cgroup LRU which has the same structure as +global VM. When a cgroup goes over its limit, we first try +to reclaim memory from the cgroup so as to make space for the new +pages that the cgroup has touched. If the reclaim is unsuccessful, +an OOM routine is invoked to select and kill the bulkiest task in the +cgroup. (See 10. OOM Control below.) + +The reclaim algorithm has not been modified for cgroups, except that +pages that are selected for reclaiming come from the per-cgroup LRU +list. + +NOTE: + Reclaim does not work for the root cgroup, since we cannot set any + limits on the root cgroup. + +Note2: + When panic_on_oom is set to "2", the whole system will panic. + +When oom event notifier is registered, event will be delivered. +(See oom_control section) + +2.6 Locking +----------- + + lock_page_cgroup()/unlock_page_cgroup() should not be called under + the i_pages lock. + + Other lock order is following: + + PG_locked. + mm->page_table_lock + pgdat->lru_lock + lock_page_cgroup. + + In many cases, just lock_page_cgroup() is called. + + per-zone-per-cgroup LRU (cgroup's private LRU) is just guarded by + pgdat->lru_lock, it has no lock of its own. 
+ +2.7 Kernel Memory Extension (CONFIG_MEMCG_KMEM) +----------------------------------------------- + +With the Kernel memory extension, the Memory Controller is able to limit +the amount of kernel memory used by the system. Kernel memory is fundamentally +different than user memory, since it can't be swapped out, which makes it +possible to DoS the system by consuming too much of this precious resource. + +Kernel memory accounting is enabled for all memory cgroups by default. But +it can be disabled system-wide by passing cgroup.memory=nokmem to the kernel +at boot time. In this case, kernel memory will not be accounted at all. + +Kernel memory limits are not imposed for the root cgroup. Usage for the root +cgroup may or may not be accounted. The memory used is accumulated into +memory.kmem.usage_in_bytes, or in a separate counter when it makes sense. +(currently only for tcp). + +The main "kmem" counter is fed into the main counter, so kmem charges will +also be visible from the user counter. + +Currently no soft limit is implemented for kernel memory. It is future work +to trigger slab reclaim when those limits are reached. + +2.7.1 Current Kernel Memory resources accounted +----------------------------------------------- + +stack pages: + every process consumes some stack pages. By accounting into + kernel memory, we prevent new processes from being created when the kernel + memory usage is too high. + +slab pages: + pages allocated by the SLAB or SLUB allocator are tracked. A copy + of each kmem_cache is created every time the cache is touched by the first time + from inside the memcg. The creation is done lazily, so some objects can still be + skipped while the cache is being created. All objects in a slab page should + belong to the same memcg. This only fails to hold when a task is migrated to a + different memcg during the page allocation by the cache. + +sockets memory pressure: + some sockets protocols have memory pressure + thresholds. 
The Memory Controller allows them to be controlled individually + per cgroup, instead of globally. + +tcp memory pressure: + sockets memory pressure for the tcp protocol. + +2.7.2 Common use cases +---------------------- + +Because the "kmem" counter is fed to the main user counter, kernel memory can +never be limited completely independently of user memory. Say "U" is the user +limit, and "K" the kernel limit. There are three possible ways limits can be +set: + +U != 0, K = unlimited: + This is the standard memcg limitation mechanism already present before kmem + accounting. Kernel memory is completely ignored. + +U != 0, K < U: + Kernel memory is a subset of the user memory. This setup is useful in + deployments where the total amount of memory per-cgroup is overcommited. + Overcommiting kernel memory limits is definitely not recommended, since the + box can still run out of non-reclaimable memory. + In this case, the admin could set up K so that the sum of all groups is + never greater than the total memory, and freely set U at the cost of his + QoS. + +WARNING: + In the current implementation, memory reclaim will NOT be + triggered for a cgroup when it hits K while staying below U, which makes + this setup impractical. + +U != 0, K >= U: + Since kmem charges will also be fed to the user counter and reclaim will be + triggered for the cgroup for both kinds of memory. This setup gives the + admin a unified view of memory, and it is also useful for people who just + want to track kernel memory usage. + +3. User Interface +================= + +3.0. Configuration +------------------ + +a. Enable CONFIG_CGROUPS +b. Enable CONFIG_MEMCG +c. Enable CONFIG_MEMCG_SWAP (to use swap extension) +d. Enable CONFIG_MEMCG_KMEM (to use kmem extension) + +3.1. Prepare the cgroups (see cgroups.txt, Why are cgroups needed?) 
+------------------------------------------------------------------- + +:: + + # mount -t tmpfs none /sys/fs/cgroup + # mkdir /sys/fs/cgroup/memory + # mount -t cgroup none /sys/fs/cgroup/memory -o memory + +3.2. Make the new group and move bash into it:: + + # mkdir /sys/fs/cgroup/memory/0 + # echo $$ > /sys/fs/cgroup/memory/0/tasks + +Since now we're in the 0 cgroup, we can alter the memory limit:: + + # echo 4M > /sys/fs/cgroup/memory/0/memory.limit_in_bytes + +NOTE: + We can use a suffix (k, K, m, M, g or G) to indicate values in kilo, + mega or gigabytes. (Here, Kilo, Mega, Giga are Kibibytes, Mebibytes, + Gibibytes.) + +NOTE: + We can write "-1" to reset the ``*.limit_in_bytes(unlimited)``. + +NOTE: + We cannot set limits on the root cgroup any more. + +:: + + # cat /sys/fs/cgroup/memory/0/memory.limit_in_bytes + 4194304 + +We can check the usage:: + + # cat /sys/fs/cgroup/memory/0/memory.usage_in_bytes + 1216512 + +A successful write to this file does not guarantee a successful setting of +this limit to the value written into the file. This can be due to a +number of factors, such as rounding up to page boundaries or the total +availability of memory on the system. The user is required to re-read +this file after a write to guarantee the value committed by the kernel:: + + # echo 1 > memory.limit_in_bytes + # cat memory.limit_in_bytes + 4096 + +The memory.failcnt field gives the number of times that the cgroup limit was +exceeded. + +The memory.stat file gives accounting information. Now, the number of +caches, RSS and Active pages/Inactive pages are shown. + +4. Testing +========== + +For testing features and implementation, see memcg_test.txt. + +Performance test is also important. To see pure memory controller's overhead, +testing on tmpfs will give you good numbers of small overheads. +Example: do kernel make on tmpfs. + +Page-fault scalability is also important. 
At measuring parallel +page fault test, multi-process test may be better than multi-thread +test because it has noise of shared objects/status. + +But the above two are testing extreme situations. +Trying usual test under memory controller is always helpful. + +4.1 Troubleshooting +------------------- + +Sometimes a user might find that the application under a cgroup is +terminated by the OOM killer. There are several causes for this: + +1. The cgroup limit is too low (just too low to do anything useful) +2. The user is using anonymous memory and swap is turned off or too low + +A sync followed by echo 1 > /proc/sys/vm/drop_caches will help get rid of +some of the pages cached in the cgroup (page cache pages). + +To know what happens, disabling OOM_Kill as per "10. OOM Control" (below) and +seeing what happens will be helpful. + +4.2 Task migration +------------------ + +When a task migrates from one cgroup to another, its charge is not +carried forward by default. The pages allocated from the original cgroup still +remain charged to it, the charge is dropped when the page is freed or +reclaimed. + +You can move charges of a task along with task migration. +See 8. "Move charges at task migration" + +4.3 Removing a cgroup +--------------------- + +A cgroup can be removed by rmdir, but as discussed in sections 4.1 and 4.2, a +cgroup might have some charge associated with it, even though all +tasks have migrated away from it. (because we charge against pages, not +against tasks.) + +We move the stats to root (if use_hierarchy==0) or parent (if +use_hierarchy==1), and no change on the charge except uncharging +from the child. + +Charges recorded in swap information is not updated at removal of cgroup. +Recorded information is discarded and a cgroup which uses swap (swapcache) +will be charged as a new owner of it. + +About use_hierarchy, see Section 6. + +5. Misc. 
interfaces +=================== + +5.1 force_empty +--------------- + memory.force_empty interface is provided to make cgroup's memory usage empty. + When writing anything to this:: + + # echo 0 > memory.force_empty + + the cgroup will be reclaimed and as many pages reclaimed as possible. + + The typical use case for this interface is before calling rmdir(). + Though rmdir() offlines memcg, but the memcg may still stay there due to + charged file caches. Some out-of-use page caches may keep charged until + memory pressure happens. If you want to avoid that, force_empty will be useful. + + Also, note that when memory.kmem.limit_in_bytes is set the charges due to + kernel pages will still be seen. This is not considered a failure and the + write will still return success. In this case, it is expected that + memory.kmem.usage_in_bytes == memory.usage_in_bytes. + + About use_hierarchy, see Section 6. + +5.2 stat file +------------- + +memory.stat file includes following statistics + +per-memory cgroup local status +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +=============== =============================================================== +cache # of bytes of page cache memory. +rss # of bytes of anonymous and swap cache memory (includes + transparent hugepages). +rss_huge # of bytes of anonymous transparent hugepages. +mapped_file # of bytes of mapped file (includes tmpfs/shmem) +pgpgin # of charging events to the memory cgroup. The charging + event happens each time a page is accounted as either mapped + anon page(RSS) or cache page(Page Cache) to the cgroup. +pgpgout # of uncharging events to the memory cgroup. The uncharging + event happens each time a page is unaccounted from the cgroup. +swap # of bytes of swap usage +dirty # of bytes that are waiting to get written back to the disk. +writeback # of bytes of file/anon cache that are queued for syncing to + disk. +inactive_anon # of bytes of anonymous and swap cache memory on inactive + LRU list. 
+active_anon # of bytes of anonymous and swap cache memory on active + LRU list. +inactive_file # of bytes of file-backed memory on inactive LRU list. +active_file # of bytes of file-backed memory on active LRU list. +unevictable # of bytes of memory that cannot be reclaimed (mlocked etc). +=============== =============================================================== + +status considering hierarchy (see memory.use_hierarchy settings) +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +========================= =================================================== +hierarchical_memory_limit # of bytes of memory limit with regard to hierarchy + under which the memory cgroup is +hierarchical_memsw_limit # of bytes of memory+swap limit with regard to + hierarchy under which memory cgroup is. + +total_ # hierarchical version of , which in + addition to the cgroup's own value includes the + sum of all hierarchical children's values of + , i.e. total_cache +========================= =================================================== + +The following additional stats are dependent on CONFIG_DEBUG_VM +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +========================= ======================================== +recent_rotated_anon VM internal parameter. (see mm/vmscan.c) +recent_rotated_file VM internal parameter. (see mm/vmscan.c) +recent_scanned_anon VM internal parameter. (see mm/vmscan.c) +recent_scanned_file VM internal parameter. (see mm/vmscan.c) +========================= ======================================== + +Memo: + recent_rotated means recent frequency of LRU rotation. + recent_scanned means recent # of scans to LRU. + showing for better debug please see the code for meanings. + +Note: + Only anonymous and swap cache memory is listed as part of 'rss' stat. + This should not be confused with the true 'resident set size' or the + amount of physical memory used by the cgroup. 
+ + 'rss + mapped_file' will give you resident set size of cgroup. + + (Note: file and shmem may be shared among other cgroups. In that case, + mapped_file is accounted only when the memory cgroup is owner of page + cache.) + +5.3 swappiness +-------------- + +Overrides /proc/sys/vm/swappiness for the particular group. The tunable +in the root cgroup corresponds to the global swappiness setting. + +Please note that unlike during the global reclaim, limit reclaim +enforces that 0 swappiness really prevents any swapping even if +there is a swap storage available. This might lead to memcg OOM killer +if there are no file pages to reclaim. + +5.4 failcnt +----------- + +A memory cgroup provides memory.failcnt and memory.memsw.failcnt files. +This failcnt(== failure count) shows the number of times that a usage counter +hit its limit. When a memory cgroup hits a limit, failcnt increases and +memory under it will be reclaimed. + +You can reset failcnt by writing 0 to failcnt file:: + + # echo 0 > .../memory.failcnt + +5.5 usage_in_bytes +------------------ + +For efficiency, as other kernel components, memory cgroup uses some optimization +to avoid unnecessary cacheline false sharing. usage_in_bytes is affected by the +method and doesn't show 'exact' value of memory (and swap) usage, it's a fuzz +value for efficient access. (Of course, when necessary, it's synchronized.) +If you want to know more exact memory usage, you should use RSS+CACHE(+SWAP) +value in memory.stat(see 5.2). + +5.6 numa_stat +------------- + +This is similar to numa_maps but operates on a per-memcg basis. This is +useful for providing visibility into the numa locality information within +a memcg since the pages are allowed to be allocated from any physical +node. One of the use cases is evaluating application performance by +combining this information with the application's CPU allocation.
+ +Each memcg's numa_stat file includes "total", "file", "anon" and "unevictable" +per-node page counts including "hierarchical_" which sums up all +hierarchical children's values in addition to the memcg's own value. + +The output format of memory.numa_stat is:: + + total= N0= N1= ... + file= N0= N1= ... + anon= N0= N1= ... + unevictable= N0= N1= ... + hierarchical_= N0= N1= ... + +The "total" count is sum of file + anon + unevictable. + +6. Hierarchy support +==================== + +The memory controller supports a deep hierarchy and hierarchical accounting. +The hierarchy is created by creating the appropriate cgroups in the +cgroup filesystem. Consider for example, the following cgroup filesystem +hierarchy:: + + root + / | \ + / | \ + a b c + | \ + | \ + d e + +In the diagram above, with hierarchical accounting enabled, all memory +usage of e, is accounted to its ancestors up until the root (i.e, c and root), +that has memory.use_hierarchy enabled. If one of the ancestors goes over its +limit, the reclaim algorithm reclaims from the tasks in the ancestor and the +children of the ancestor. + +6.1 Enabling hierarchical accounting and reclaim +------------------------------------------------ + +A memory cgroup by default disables the hierarchy feature. Support +can be enabled by writing 1 to memory.use_hierarchy file of the root cgroup:: + + # echo 1 > memory.use_hierarchy + +The feature can be disabled by:: + + # echo 0 > memory.use_hierarchy + +NOTE1: + Enabling/disabling will fail if either the cgroup already has other + cgroups created below it, or if the parent cgroup has use_hierarchy + enabled. + +NOTE2: + When panic_on_oom is set to "2", the whole system will panic in + case of an OOM event in any cgroup. + +7. Soft limits +============== + +Soft limits allow for greater sharing of memory. The idea behind soft limits +is to allow control groups to use as much of the memory as needed, provided + +a. There is no memory contention +b. 
They do not exceed their hard limit + +When the system detects memory contention or low memory, control groups +are pushed back to their soft limits. If the soft limit of each control +group is very high, they are pushed back as much as possible to make +sure that one control group does not starve the others of memory. + +Please note that soft limits is a best-effort feature; it comes with +no guarantees, but it does its best to make sure that when memory is +heavily contended for, memory is allocated based on the soft limit +hints/setup. Currently soft limit based reclaim is set up such that +it gets invoked from balance_pgdat (kswapd). + +7.1 Interface +------------- + +Soft limits can be setup by using the following commands (in this example we +assume a soft limit of 256 MiB):: + + # echo 256M > memory.soft_limit_in_bytes + +If we want to change this to 1G, we can at any time use:: + + # echo 1G > memory.soft_limit_in_bytes + +NOTE1: + Soft limits take effect over a long period of time, since they involve + reclaiming memory for balancing between memory cgroups +NOTE2: + It is recommended to set the soft limit always below the hard limit, + otherwise the hard limit will take precedence. + +8. Move charges at task migration +================================= + +Users can move charges associated with a task along with task migration, that +is, uncharge task's pages from the old cgroup and charge them to the new cgroup. +This feature is not supported in !CONFIG_MMU environments because of lack of +page tables. + +8.1 Interface +------------- + +This feature is disabled by default. It can be enabled (and disabled again) by +writing to memory.move_charge_at_immigrate of the destination cgroup. + +If you want to enable it:: + + # echo (some positive value) > memory.move_charge_at_immigrate + +Note: + Each bits of move_charge_at_immigrate has its own meaning about what type + of charges should be moved. See 8.2 for details. 
+Note: + Charges are moved only when you move mm->owner, in other words, + a leader of a thread group. +Note: + If we cannot find enough space for the task in the destination cgroup, we + try to make space by reclaiming memory. Task migration may fail if we + cannot make enough space. +Note: + It can take several seconds if you move many charges. + +And if you want to disable it again:: + + # echo 0 > memory.move_charge_at_immigrate + +8.2 Type of charges which can be moved +-------------------------------------- + +Each bit in move_charge_at_immigrate has its own meaning about what type of +charges should be moved. But in any case, it must be noted that an account of +a page or a swap can be moved only when it is charged to the task's current +(old) memory cgroup. + ++---+--------------------------------------------------------------------------+ +|bit| what type of charges would be moved ? | ++===+==========================================================================+ +| 0 | A charge of an anonymous page (or swap of it) used by the target task. | +| | You must enable Swap Extension (see 2.4) to enable move of swap charges. | ++---+--------------------------------------------------------------------------+ +| 1 | A charge of file pages (normal file, tmpfs file (e.g. ipc shared memory) | +| | and swaps of tmpfs file) mmapped by the target task. Unlike the case of | +| | anonymous pages, file pages (and swaps) in the range mmapped by the task | +| | will be moved even if the task hasn't done page fault, i.e. they might | +| | not be the task's "RSS", but other task's "RSS" that maps the same file. | +| | And mapcount of the page is ignored (the page can be moved even if | +| | page_mapcount(page) > 1). You must enable Swap Extension (see 2.4) to | +| | enable move of swap charges. | ++---+--------------------------------------------------------------------------+ + +8.3 TODO +-------- + +- All of moving charge operations are done under cgroup_mutex.
It's not good + behavior to hold the mutex too long, so we may need some trick. + +9. Memory thresholds +==================== + +Memory cgroup implements memory thresholds using the cgroups notification +API (see cgroups.txt). It allows to register multiple memory and memsw +thresholds and gets notifications when it crosses. + +To register a threshold, an application must: + +- create an eventfd using eventfd(2); +- open memory.usage_in_bytes or memory.memsw.usage_in_bytes; +- write string like " " to + cgroup.event_control. + +Application will be notified through eventfd when memory usage crosses +threshold in any direction. + +It's applicable for root and non-root cgroup. + +10. OOM Control +=============== + +memory.oom_control file is for OOM notification and other controls. + +Memory cgroup implements OOM notifier using the cgroup notification +API (See cgroups.txt). It allows to register multiple OOM notification +delivery and gets notification when OOM happens. + +To register a notifier, an application must: + + - create an eventfd using eventfd(2) + - open memory.oom_control file + - write string like " " to + cgroup.event_control + +The application will be notified through eventfd when OOM happens. +OOM notification doesn't work for the root cgroup. + +You can disable the OOM-killer by writing "1" to memory.oom_control file, as: + + #echo 1 > memory.oom_control + +If OOM-killer is disabled, tasks under cgroup will hang/sleep +in memory cgroup's OOM-waitqueue when they request accountable memory. + +For running them, you have to relax the memory cgroup's OOM status by + + * enlarge limit or reduce usage. + +To reduce usage, + + * kill some tasks. + * move some tasks to other group with account migration. + * remove some files (on tmpfs?) + +Then, stopped tasks will work again. + +At reading, current status of OOM is shown. 
+ + - oom_kill_disable 0 or 1 + (if 1, oom-killer is disabled) + - under_oom 0 or 1 + (if 1, the memory cgroup is under OOM, tasks may be stopped.) + +11. Memory Pressure +=================== + +The pressure level notifications can be used to monitor the memory +allocation cost; based on the pressure, applications can implement +different strategies of managing their memory resources. The pressure +levels are defined as following: + +The "low" level means that the system is reclaiming memory for new +allocations. Monitoring this reclaiming activity might be useful for +maintaining cache level. Upon notification, the program (typically +"Activity Manager") might analyze vmstat and act in advance (i.e. +prematurely shutdown unimportant services). + +The "medium" level means that the system is experiencing medium memory +pressure, the system might be making swap, paging out active file caches, +etc. Upon this event applications may decide to further analyze +vmstat/zoneinfo/memcg or internal memory usage statistics and free any +resources that can be easily reconstructed or re-read from a disk. + +The "critical" level means that the system is actively thrashing, it is +about to out of memory (OOM) or even the in-kernel OOM killer is on its +way to trigger. Applications should do whatever they can to help the +system. It might be too late to consult with vmstat or any other +statistics, so it's advisable to take an immediate action. + +By default, events are propagated upward until the event is handled, i.e. the +events are not pass-through. For example, you have three cgroups: A->B->C. Now +you set up an event listener on cgroups A, B and C, and suppose group C +experiences some pressure. In this situation, only group C will receive the +notification, i.e. groups A and B will not receive it. This is done to avoid +excessive "broadcasting" of messages, which disturbs the system and which is +especially bad if we are low on memory or thrashing. 
Group B will receive +notification only if there are no event listeners for group C. + +There are three optional modes that specify different propagation behavior: + + - "default": this is the default behavior specified above. This mode is the + same as omitting the optional mode parameter, preserved for backwards + compatibility. + + - "hierarchy": events always propagate up to the root, similar to the default + behavior, except that propagation continues regardless of whether there are + event listeners at each level, with the "hierarchy" mode. In the above + example, groups A, B, and C will receive notification of memory pressure. + + - "local": events are pass-through, i.e. they only receive notifications when + memory pressure is experienced in the memcg for which the notification is + registered. In the above example, group C will receive notification if + registered for "local" notification and the group experiences memory + pressure. However, group B will never receive notification, regardless if + there is an event listener for group C or not, if group B is registered for + local notification. + +The level and event notification mode ("hierarchy" or "local", if necessary) are +specified by a comma-delimited string, i.e. "low,hierarchy" specifies +hierarchical, pass-through, notification for all ancestor memcgs. Notification +that is the default, non pass-through behavior, does not specify a mode. +"medium,local" specifies pass-through notification for the medium level. + +The file memory.pressure_level is only used to setup an eventfd. To +register a notification, an application must: + +- create an eventfd using eventfd(2); +- open memory.pressure_level; +- write string as "<event_fd> <fd of memory.pressure_level> <level[,mode]>" + to cgroup.event_control. + +Application will be notified through eventfd when memory pressure is at +the specific level (or higher). Read/write operations to +memory.pressure_level are not implemented.
+ +Test: + + Here is a small script example that makes a new cgroup, sets up a + memory limit, sets up a notification in the cgroup and then makes child + cgroup experience a critical pressure:: + + # cd /sys/fs/cgroup/memory/ + # mkdir foo + # cd foo + # cgroup_event_listener memory.pressure_level low,hierarchy & + # echo 8000000 > memory.limit_in_bytes + # echo 8000000 > memory.memsw.limit_in_bytes + # echo $$ > tasks + # dd if=/dev/zero | read x + + (Expect a bunch of notifications, and eventually, the oom-killer will + trigger.) + +12. TODO +======== + +1. Make per-cgroup scanner reclaim not-shared pages first +2. Teach controller to account for shared-pages +3. Start reclamation in the background when the limit is + not yet hit but the usage is getting closer + +Summary +======= + +Overall, the memory controller has been a stable controller and has been +commented and discussed quite extensively in the community. + +References +========== + +1. Singh, Balbir. RFC: Memory Controller, http://lwn.net/Articles/206697/ +2. Singh, Balbir. Memory Controller (RSS Control), + http://lwn.net/Articles/222762/ +3. Emelianov, Pavel. Resource controllers based on process cgroups + http://lkml.org/lkml/2007/3/6/198 +4. Emelianov, Pavel. RSS controller based on process cgroups (v2) + http://lkml.org/lkml/2007/4/9/78 +5. Emelianov, Pavel. RSS controller based on process cgroups (v3) + http://lkml.org/lkml/2007/5/30/244 +6. Menage, Paul. Control Groups v10, http://lwn.net/Articles/236032/ +7. Vaidyanathan, Srinivasan, Control Groups: Pagecache accounting and control + subsystem (v3), http://lwn.net/Articles/235534/ +8. Singh, Balbir. RSS controller v2 test results (lmbench), + http://lkml.org/lkml/2007/5/17/232 +9. Singh, Balbir. RSS controller v2 AIM9 results + http://lkml.org/lkml/2007/5/18/1 +10. Singh, Balbir. Memory controller v6 test results, + http://lkml.org/lkml/2007/8/19/36 +11. Singh, Balbir. 
Memory controller introduction (v6), + http://lkml.org/lkml/2007/8/17/69 +12. Corbet, Jonathan, Controlling memory use in cgroups, + http://lwn.net/Articles/243795/ diff --git a/Documentation/admin-guide/cgroup-v1/net_cls.rst b/Documentation/admin-guide/cgroup-v1/net_cls.rst new file mode 100644 index 000000000000..a2cf272af7a0 --- /dev/null +++ b/Documentation/admin-guide/cgroup-v1/net_cls.rst @@ -0,0 +1,44 @@ +========================= +Network classifier cgroup +========================= + +The Network classifier cgroup provides an interface to +tag network packets with a class identifier (classid). + +The Traffic Controller (tc) can be used to assign +different priorities to packets from different cgroups. +Also, Netfilter (iptables) can use this tag to perform +actions on such packets. + +Creating a net_cls cgroups instance creates a net_cls.classid file. +This net_cls.classid value is initialized to 0. + +You can write hexadecimal values to net_cls.classid; the format for these +values is 0xAAAABBBB; AAAA is the major handle number and BBBB +is the minor handle number. +Reading net_cls.classid yields a decimal result. + +Example:: + + mkdir /sys/fs/cgroup/net_cls + mount -t cgroup -onet_cls net_cls /sys/fs/cgroup/net_cls + mkdir /sys/fs/cgroup/net_cls/0 + echo 0x100001 > /sys/fs/cgroup/net_cls/0/net_cls.classid + +- setting a 10:1 handle:: + + cat /sys/fs/cgroup/net_cls/0/net_cls.classid + 1048577 + +- configuring tc:: + + tc qdisc add dev eth0 root handle 10: htb + tc class add dev eth0 parent 10: classid 10:1 htb rate 40mbit + +- creating traffic class 10:1:: + + tc filter add dev eth0 parent 10: protocol ip prio 10 handle 1: cgroup + +configuring iptables, basic example:: + + iptables -A OUTPUT -m cgroup ! 
--cgroup 0x100001 -j DROP diff --git a/Documentation/admin-guide/cgroup-v1/net_prio.rst b/Documentation/admin-guide/cgroup-v1/net_prio.rst new file mode 100644 index 000000000000..b40905871c64 --- /dev/null +++ b/Documentation/admin-guide/cgroup-v1/net_prio.rst @@ -0,0 +1,57 @@ +======================= +Network priority cgroup +======================= + +The Network priority cgroup provides an interface to allow an administrator to +dynamically set the priority of network traffic generated by various +applications + +Nominally, an application would set the priority of its traffic via the +SO_PRIORITY socket option. This however, is not always possible because: + +1) The application may not have been coded to set this value +2) The priority of application traffic is often a site-specific administrative + decision rather than an application defined one. + +This cgroup allows an administrator to assign a process to a group which defines +the priority of egress traffic on a given interface. Network priority groups can +be created by first mounting the cgroup filesystem:: + + # mount -t cgroup -onet_prio none /sys/fs/cgroup/net_prio + +With the above step, the initial group acting as the parent accounting group +becomes visible at '/sys/fs/cgroup/net_prio'. This group includes all tasks in +the system. '/sys/fs/cgroup/net_prio/tasks' lists the tasks in this cgroup. + +Each net_prio cgroup contains two files that are subsystem specific + +net_prio.prioidx + This file is read-only, and is simply informative. It contains a unique + integer value that the kernel uses as an internal representation of this + cgroup. + +net_prio.ifpriomap + This file contains a map of the priorities assigned to traffic originating + from processes in this group and egressing the system on various interfaces. + It contains a list of tuples in the form . Contents of this + file can be modified by echoing a string into the file using the same tuple + format. 
For example:: + + echo "eth0 5" > /sys/fs/cgroups/net_prio/iscsi/net_prio.ifpriomap + +This command would force any traffic originating from processes belonging to the +iscsi net_prio cgroup and egressing on interface eth0 to have the priority of +said traffic set to the value 5. The parent accounting group also has a +writeable 'net_prio.ifpriomap' file that can be used to set a system default +priority. + +Priorities are set immediately prior to queueing a frame to the device +queueing discipline (qdisc) so priorities will be assigned prior to the hardware +queue selection being made. + +One usage for the net_prio cgroup is with mqprio qdisc allowing application +traffic to be steered to hardware/driver based traffic classes. These mappings +can then be managed by administrators or other networking protocols such as +DCBX. + +A new net_prio cgroup inherits the parent's configuration. diff --git a/Documentation/admin-guide/cgroup-v1/pids.rst b/Documentation/admin-guide/cgroup-v1/pids.rst new file mode 100644 index 000000000000..6acebd9e72c8 --- /dev/null +++ b/Documentation/admin-guide/cgroup-v1/pids.rst @@ -0,0 +1,92 @@ +========================= +Process Number Controller +========================= + +Abstract +-------- + +The process number controller is used to allow a cgroup hierarchy to stop any +new tasks from being fork()'d or clone()'d after a certain limit is reached. + +Since it is trivial to hit the task limit without hitting any kmemcg limits in +place, PIDs are a fundamental resource. As such, PID exhaustion must be +preventable in the scope of a cgroup hierarchy by allowing resource limiting of +the number of tasks in a cgroup. + +Usage +----- + +In order to use the `pids` controller, set the maximum number of tasks in +pids.max (this is not available in the root cgroup for obvious reasons). The +number of processes currently in the cgroup is given by pids.current. 
+ +Organisational operations are not blocked by cgroup policies, so it is possible +to have pids.current > pids.max. This can be done by either setting the limit to +be smaller than pids.current, or attaching enough processes to the cgroup such +that pids.current > pids.max. However, it is not possible to violate a cgroup +policy through fork() or clone(). fork() and clone() will return -EAGAIN if the +creation of a new process would cause a cgroup policy to be violated. + +To set a cgroup to have no limit, set pids.max to "max". This is the default for +all new cgroups (N.B. that PID limits are hierarchical, so the most stringent +limit in the hierarchy is followed). + +pids.current tracks all child cgroup hierarchies, so parent/pids.current is a +superset of parent/child/pids.current. + +The pids.events file contains event counters: + + - max: Number of times fork failed because limit was hit. + +Example +------- + +First, we mount the pids controller:: + + # mkdir -p /sys/fs/cgroup/pids + # mount -t cgroup -o pids none /sys/fs/cgroup/pids + +Then we create a hierarchy, set limits and attach processes to it:: + + # mkdir -p /sys/fs/cgroup/pids/parent/child + # echo 2 > /sys/fs/cgroup/pids/parent/pids.max + # echo $$ > /sys/fs/cgroup/pids/parent/cgroup.procs + # cat /sys/fs/cgroup/pids/parent/pids.current + 2 + # + +It should be noted that attempts to overcome the set limit (2 in this case) will +fail:: + + # cat /sys/fs/cgroup/pids/parent/pids.current + 2 + # ( /bin/echo "Here's some processes for you." 
| cat ) + sh: fork: Resource temporary unavailable + # + +Even if we migrate to a child cgroup (which doesn't have a set limit), we will +not be able to overcome the most stringent limit in the hierarchy (in this case, +parent's):: + + # echo $$ > /sys/fs/cgroup/pids/parent/child/cgroup.procs + # cat /sys/fs/cgroup/pids/parent/pids.current + 2 + # cat /sys/fs/cgroup/pids/parent/child/pids.current + 2 + # cat /sys/fs/cgroup/pids/parent/child/pids.max + max + # ( /bin/echo "Here's some processes for you." | cat ) + sh: fork: Resource temporary unavailable + # + +We can set a limit that is smaller than pids.current, which will stop any new +processes from being forked at all (note that the shell itself counts towards +pids.current):: + + # echo 1 > /sys/fs/cgroup/pids/parent/pids.max + # /bin/echo "We can't even spawn a single process now." + sh: fork: Resource temporary unavailable + # echo 0 > /sys/fs/cgroup/pids/parent/pids.max + # /bin/echo "We can't even spawn a single process now." + sh: fork: Resource temporary unavailable + # diff --git a/Documentation/admin-guide/cgroup-v1/rdma.rst b/Documentation/admin-guide/cgroup-v1/rdma.rst new file mode 100644 index 000000000000..2fcb0a9bf790 --- /dev/null +++ b/Documentation/admin-guide/cgroup-v1/rdma.rst @@ -0,0 +1,117 @@ +=============== +RDMA Controller +=============== + +.. Contents + + 1. Overview + 1-1. What is RDMA controller? + 1-2. Why RDMA controller needed? + 1-3. How is RDMA controller implemented? + 2. Usage Examples + +1. Overview +=========== + +1-1. What is RDMA controller? +----------------------------- + +RDMA controller allows user to limit RDMA/IB specific resources that a given +set of processes can use. These processes are grouped using RDMA controller. + +RDMA controller defines two resources which can be limited for processes of a +cgroup. + +1-2. Why RDMA controller needed? 
+-------------------------------- + +Currently user space applications can easily take away all the rdma verb +specific resources such as AH, CQ, QP, MR etc. As a result, other applications +in other cgroups or kernel space ULPs may not even get a chance to allocate any +rdma resources. This can lead to service unavailability. + +Therefore RDMA controller is needed through which resource consumption +of processes can be limited. Through this controller different rdma +resources can be accounted. + +1-3. How is RDMA controller implemented? +---------------------------------------- + +RDMA cgroup allows limit configuration of resources. Rdma cgroup maintains +resource accounting per cgroup, per device using resource pool structure. +Each such resource pool is limited up to 64 resources in given resource pool +by rdma cgroup, which can be extended later if required. + +This resource pool object is linked to the cgroup css. Typically there +are 0 to 4 resource pool instances per cgroup, per device in most use cases. +But nothing limits to have it more. At present hundreds of RDMA devices per +single cgroup may not be handled optimally, however there is no +known use case or requirement for such configuration either. + +Since RDMA resources can be allocated from any process and can be freed by any +of the child processes which share the address space, rdma resources are +always owned by the creator cgroup css. This allows process migration from one +to other cgroup without major complexity of transferring resource ownership; +because such ownership is not really present due to shared nature of +rdma resources. Linking resources around css also ensures that cgroups can be +deleted after processes migrated. This allows process migration as well with +active resources, even though that is not a primary use case. + +Whenever RDMA resource charging occurs, owner rdma cgroup is returned to +the caller. Same rdma cgroup should be passed while uncharging the resource.
+This also allows process migrated with active RDMA resource to charge +to new owner cgroup for new resource. It also allows to uncharge resource of +a process from previously charged cgroup which is migrated to new cgroup, +even though that is not a primary use case. + +Resource pool object is created in following situations. +(a) User sets the limit and no previous resource pool exists for the device +of interest for the cgroup. +(b) No resource limits were configured, but IB/RDMA stack tries to +charge the resource. So that it correctly uncharges them when applications are +running without limits and later on when limits are enforced during uncharging, +otherwise usage count will drop to negative. + +Resource pool is destroyed if all the resource limits are set to max and +it is the last resource getting deallocated. + +User should set all the limits to max value if it intends to remove/unconfigure +the resource pool for a particular device. + +IB stack honors limits enforced by the rdma controller. When an application +queries about maximum resource limits of IB device, it returns minimum of +what is configured by user for a given cgroup and what is supported by +IB device. + +Following resources can be accounted by rdma controller. + + ========== ============================= + hca_handle Maximum number of HCA Handles + hca_object Maximum number of HCA Objects + ========== ============================= + +2.
Usage Examples +================= + +(a) Configure resource limit:: + + echo mlx4_0 hca_handle=2 hca_object=2000 > /sys/fs/cgroup/rdma/1/rdma.max + echo ocrdma1 hca_handle=3 > /sys/fs/cgroup/rdma/2/rdma.max + +(b) Query resource limit:: + + cat /sys/fs/cgroup/rdma/2/rdma.max + #Output: + mlx4_0 hca_handle=2 hca_object=2000 + ocrdma1 hca_handle=3 hca_object=max + +(c) Query current usage:: + + cat /sys/fs/cgroup/rdma/2/rdma.current + #Output: + mlx4_0 hca_handle=1 hca_object=20 + ocrdma1 hca_handle=1 hca_object=23 + +(d) Delete resource limit:: + + echo mlx4_0 hca_handle=max hca_object=max > /sys/fs/cgroup/rdma/1/rdma.max diff --git a/Documentation/admin-guide/cgroup-v2.rst b/Documentation/admin-guide/cgroup-v2.rst index 080b18ce2a5d..ed4c5977d6e1 100644 --- a/Documentation/admin-guide/cgroup-v2.rst +++ b/Documentation/admin-guide/cgroup-v2.rst @@ -9,7 +9,7 @@ This is the authoritative documentation on the design, interface and conventions of cgroup v2. It describes all userland-visible aspects of cgroup including core and specific controller behaviors. All future changes must be reflected in this document. Documentation for -v1 is available under Documentation/cgroup-v1/. +v1 is available under Documentation/admin-guide/cgroup-v1/. .. CONTENTS diff --git a/Documentation/admin-guide/index.rst b/Documentation/admin-guide/index.rst index 1f0d9b939311..a5fdb1a846ce 100644 --- a/Documentation/admin-guide/index.rst +++ b/Documentation/admin-guide/index.rst @@ -59,6 +59,7 @@ configure specific aspects of kernel behavior to your liking. initrd cgroup-v2 + cgroup-v1/index serial-console braille-console parport diff --git a/Documentation/admin-guide/kernel-parameters.txt b/Documentation/admin-guide/kernel-parameters.txt index 78576aa45cce..a571a67e0c85 100644 --- a/Documentation/admin-guide/kernel-parameters.txt +++ b/Documentation/admin-guide/kernel-parameters.txt @@ -4089,7 +4089,7 @@ relax_domain_level= [KNL, SMP] Set scheduler's default relax_domain_level.
- See Documentation/cgroup-v1/cpusets.rst. + See Documentation/admin-guide/cgroup-v1/cpusets.rst. reserve= [KNL,BUGS] Force kernel to ignore I/O ports or memory Format: <base1>,<size1>[,<base2>,<size2>,...] @@ -4599,7 +4599,7 @@ swapaccount=[0|1] [KNL] Enable accounting of swap in memory resource controller if no parameter or 1 is given or disable - it if 0 is given (See Documentation/cgroup-v1/memory.rst) + it if 0 is given (See Documentation/admin-guide/cgroup-v1/memory.rst) swiotlb= [ARM,IA-64,PPC,MIPS,X86] Format: { <int> | force | noforce } diff --git a/Documentation/admin-guide/mm/numa_memory_policy.rst b/Documentation/admin-guide/mm/numa_memory_policy.rst index 546f174e5d6a..8463f5538fda 100644 --- a/Documentation/admin-guide/mm/numa_memory_policy.rst +++ b/Documentation/admin-guide/mm/numa_memory_policy.rst @@ -15,7 +15,7 @@ document attempts to describe the concepts and APIs of the 2.6 memory policy support. Memory policies should not be confused with cpusets -(``Documentation/cgroup-v1/cpusets.rst``) +(``Documentation/admin-guide/cgroup-v1/cpusets.rst``) which is an administrative mechanism for restricting the nodes from which memory may be allocated by a set of processes. Memory policies are a programming interface that a NUMA-aware application can take advantage of. When diff --git a/Documentation/block/bfq-iosched.rst b/Documentation/block/bfq-iosched.rst index 2c13b2fc1888..0d237d402860 100644 --- a/Documentation/block/bfq-iosched.rst +++ b/Documentation/block/bfq-iosched.rst @@ -547,7 +547,7 @@ As for cgroups-v1 (blkio controller), the exact set of stat files created, and kept up-to-date by bfq, depends on whether CONFIG_BFQ_CGROUP_DEBUG is set. If it is set, then bfq creates all the stat files documented in -Documentation/cgroup-v1/blkio-controller.rst. If, instead, +Documentation/admin-guide/cgroup-v1/blkio-controller.rst.
If, instead, CONFIG_BFQ_CGROUP_DEBUG is not set, then bfq creates only the files:: blkio.bfq.io_service_bytes diff --git a/Documentation/cgroup-v1/blkio-controller.rst b/Documentation/cgroup-v1/blkio-controller.rst deleted file mode 100644 index 1d7d962933be..000000000000 --- a/Documentation/cgroup-v1/blkio-controller.rst +++ /dev/null @@ -1,302 +0,0 @@ -=================== -Block IO Controller -=================== - -Overview -======== -cgroup subsys "blkio" implements the block io controller. There seems to be -a need of various kinds of IO control policies (like proportional BW, max BW) -both at leaf nodes as well as at intermediate nodes in a storage hierarchy. -Plan is to use the same cgroup based management interface for blkio controller -and based on user options switch IO policies in the background. - -One IO control policy is throttling policy which can be used to -specify upper IO rate limits on devices. This policy is implemented in -generic block layer and can be used on leaf nodes as well as higher -level logical devices like device mapper. - -HOWTO -===== -Throttling/Upper Limit policy ------------------------------ -- Enable Block IO controller:: - - CONFIG_BLK_CGROUP=y - -- Enable throttling in block layer:: - - CONFIG_BLK_DEV_THROTTLING=y - -- Mount blkio controller (see cgroups.txt, Why are cgroups needed?):: - - mount -t cgroup -o blkio none /sys/fs/cgroup/blkio - -- Specify a bandwidth rate on particular device for root group. The format - for policy is "<major>:<minor> <bytes_per_second>":: - - echo "8:16 1048576" > /sys/fs/cgroup/blkio/blkio.throttle.read_bps_device - - Above will put a limit of 1MB/second on reads happening for root group - on device having major/minor number 8:16.
- -- Run dd to read a file and see if rate is throttled to 1MB/s or not:: - - # dd iflag=direct if=/mnt/common/zerofile of=/dev/null bs=4K count=1024 - 1024+0 records in - 1024+0 records out - 4194304 bytes (4.2 MB) copied, 4.0001 s, 1.0 MB/s - - Limits for writes can be put using blkio.throttle.write_bps_device file. - -Hierarchical Cgroups -==================== - -Throttling implements hierarchy support; however, -throttling's hierarchy support is enabled iff "sane_behavior" is -enabled from cgroup side, which currently is a development option and -not publicly available. - -If somebody created a hierarchy like as follows:: - - root - / \ - test1 test2 - | - test3 - -Throttling with "sane_behavior" will handle the -hierarchy correctly. For throttling, all limits apply -to the whole subtree while all statistics are local to the IOs -directly generated by tasks in that cgroup. - -Throttling without "sane_behavior" enabled from cgroup side will -practically treat all groups at same level as if it looks like the -following:: - - pivot - / / \ \ - root test1 test2 test3 - -Various user visible config options -=================================== -CONFIG_BLK_CGROUP - - Block IO controller. - -CONFIG_BFQ_CGROUP_DEBUG - - Debug help. Right now some additional stats file show up in cgroup - if this option is enabled. - -CONFIG_BLK_DEV_THROTTLING - - Enable block device throttling support in block layer. - -Details of cgroup files -======================= -Proportional weight policy files --------------------------------- -- blkio.weight - - Specifies per cgroup weight. This is default weight of the group - on all the devices until and unless overridden by per device rule. - (See blkio.weight_device). - Currently allowed range of weights is from 10 to 1000. - -- blkio.weight_device - - One can specify per cgroup per device rules using this interface. - These rules override the default value of group weight as specified - by blkio.weight. 
- - Following is the format:: - - # echo dev_maj:dev_minor weight > blkio.weight_device - - Configure weight=300 on /dev/sdb (8:16) in this cgroup:: - - # echo 8:16 300 > blkio.weight_device - # cat blkio.weight_device - dev weight - 8:16 300 - - Configure weight=500 on /dev/sda (8:0) in this cgroup:: - - # echo 8:0 500 > blkio.weight_device - # cat blkio.weight_device - dev weight - 8:0 500 - 8:16 300 - - Remove specific weight for /dev/sda in this cgroup:: - - # echo 8:0 0 > blkio.weight_device - # cat blkio.weight_device - dev weight - 8:16 300 - -- blkio.leaf_weight[_device] - - Equivalents of blkio.weight[_device] for the purpose of - deciding how much weight tasks in the given cgroup has while - competing with the cgroup's child cgroups. For details, - please refer to Documentation/block/cfq-iosched.txt. - -- blkio.time - - disk time allocated to cgroup per device in milliseconds. First - two fields specify the major and minor number of the device and - third field specifies the disk time allocated to group in - milliseconds. - -- blkio.sectors - - number of sectors transferred to/from disk by the group. First - two fields specify the major and minor number of the device and - third field specifies the number of sectors transferred by the - group to/from the device. - -- blkio.io_service_bytes - - Number of bytes transferred to/from the disk by the group. These - are further divided by the type of operation - read or write, sync - or async. First two fields specify the major and minor number of the - device, third field specifies the operation type and the fourth field - specifies the number of bytes. - -- blkio.io_serviced - - Number of IOs (bio) issued to the disk by the group. These - are further divided by the type of operation - read or write, sync - or async. First two fields specify the major and minor number of the - device, third field specifies the operation type and the fourth field - specifies the number of IOs. 
- -- blkio.io_service_time - - Total amount of time between request dispatch and request completion - for the IOs done by this cgroup. This is in nanoseconds to make it - meaningful for flash devices too. For devices with queue depth of 1, - this time represents the actual service time. When queue_depth > 1, - that is no longer true as requests may be served out of order. This - may cause the service time for a given IO to include the service time - of multiple IOs when served out of order which may result in total - io_service_time > actual time elapsed. This time is further divided by - the type of operation - read or write, sync or async. First two fields - specify the major and minor number of the device, third field - specifies the operation type and the fourth field specifies the - io_service_time in ns. - -- blkio.io_wait_time - - Total amount of time the IOs for this cgroup spent waiting in the - scheduler queues for service. This can be greater than the total time - elapsed since it is cumulative io_wait_time for all IOs. It is not a - measure of total time the cgroup spent waiting but rather a measure of - the wait_time for its individual IOs. For devices with queue_depth > 1 - this metric does not include the time spent waiting for service once - the IO is dispatched to the device but till it actually gets serviced - (there might be a time lag here due to re-ordering of requests by the - device). This is in nanoseconds to make it meaningful for flash - devices too. This time is further divided by the type of operation - - read or write, sync or async. First two fields specify the major and - minor number of the device, third field specifies the operation type - and the fourth field specifies the io_wait_time in ns. - -- blkio.io_merged - - Total number of bios/requests merged into requests belonging to this - cgroup. This is further divided by the type of operation - read or - write, sync or async. 
- -- blkio.io_queued - - Total number of requests queued up at any given instant for this - cgroup. This is further divided by the type of operation - read or - write, sync or async. - -- blkio.avg_queue_size - - Debugging aid only enabled if CONFIG_BFQ_CGROUP_DEBUG=y. - The average queue size for this cgroup over the entire time of this - cgroup's existence. Queue size samples are taken each time one of the - queues of this cgroup gets a timeslice. - -- blkio.group_wait_time - - Debugging aid only enabled if CONFIG_BFQ_CGROUP_DEBUG=y. - This is the amount of time the cgroup had to wait since it became busy - (i.e., went from 0 to 1 request queued) to get a timeslice for one of - its queues. This is different from the io_wait_time which is the - cumulative total of the amount of time spent by each IO in that cgroup - waiting in the scheduler queue. This is in nanoseconds. If this is - read when the cgroup is in a waiting (for timeslice) state, the stat - will only report the group_wait_time accumulated till the last time it - got a timeslice and will not include the current delta. - -- blkio.empty_time - - Debugging aid only enabled if CONFIG_BFQ_CGROUP_DEBUG=y. - This is the amount of time a cgroup spends without any pending - requests when not being served, i.e., it does not include any time - spent idling for one of the queues of the cgroup. This is in - nanoseconds. If this is read when the cgroup is in an empty state, - the stat will only report the empty_time accumulated till the last - time it had a pending request and will not include the current delta. - -- blkio.idle_time - - Debugging aid only enabled if CONFIG_BFQ_CGROUP_DEBUG=y. - This is the amount of time spent by the IO scheduler idling for a - given cgroup in anticipation of a better request than the existing ones - from other queues/cgroups. This is in nanoseconds. 
If this is read - when the cgroup is in an idling state, the stat will only report the - idle_time accumulated till the last idle period and will not include - the current delta. - -- blkio.dequeue - - Debugging aid only enabled if CONFIG_BFQ_CGROUP_DEBUG=y. This - gives the statistics about how many a times a group was dequeued - from service tree of the device. First two fields specify the major - and minor number of the device and third field specifies the number - of times a group was dequeued from a particular device. - -- blkio.*_recursive - - Recursive version of various stats. These files show the - same information as their non-recursive counterparts but - include stats from all the descendant cgroups. - -Throttling/Upper limit policy files ------------------------------------ -- blkio.throttle.read_bps_device - - Specifies upper limit on READ rate from the device. IO rate is - specified in bytes per second. Rules are per device. Following is - the format:: - - echo "<major>:<minor> <rate_bytes_per_second>" > /cgrp/blkio.throttle.read_bps_device - -- blkio.throttle.write_bps_device - - Specifies upper limit on WRITE rate to the device. IO rate is - specified in bytes per second. Rules are per device. Following is - the format:: - - echo "<major>:<minor> <rate_bytes_per_second>" > /cgrp/blkio.throttle.write_bps_device - -- blkio.throttle.read_iops_device - - Specifies upper limit on READ rate from the device. IO rate is - specified in IO per second. Rules are per device. Following is - the format:: - - echo "<major>:<minor> <rate_io_per_second>" > /cgrp/blkio.throttle.read_iops_device - -- blkio.throttle.write_iops_device - - Specifies upper limit on WRITE rate to the device. IO rate is - specified in io per second. Rules are per device. Following is - the format:: - - echo "<major>:<minor> <rate_io_per_second>" > /cgrp/blkio.throttle.write_iops_device - -Note: If both BW and IOPS rules are specified for a device, then IO is - subjected to both the constraints. - -- blkio.throttle.io_serviced - - Number of IOs (bio) issued to the disk by the group.
These - are further divided by the type of operation - read or write, sync - or async. First two fields specify the major and minor number of the - device, third field specifies the operation type and the fourth field - specifies the number of IOs. - -- blkio.throttle.io_service_bytes - - Number of bytes transferred to/from the disk by the group. These - are further divided by the type of operation - read or write, sync - or async. First two fields specify the major and minor number of the - device, third field specifies the operation type and the fourth field - specifies the number of bytes. - -Common files among various policies ------------------------------------ -- blkio.reset_stats - - Writing an int to this file will result in resetting all the stats - for that cgroup. diff --git a/Documentation/cgroup-v1/cgroups.rst b/Documentation/cgroup-v1/cgroups.rst deleted file mode 100644 index 46bbe7e022d4..000000000000 --- a/Documentation/cgroup-v1/cgroups.rst +++ /dev/null @@ -1,695 +0,0 @@ -============== -Control Groups -============== - -Written by Paul Menage based on -Documentation/cgroup-v1/cpusets.rst - -Original copyright statements from cpusets.txt: - -Portions Copyright (C) 2004 BULL SA. - -Portions Copyright (c) 2004-2006 Silicon Graphics, Inc. - -Modified by Paul Jackson - -Modified by Christoph Lameter - -.. CONTENTS: - - 1. Control Groups - 1.1 What are cgroups ? - 1.2 Why are cgroups needed ? - 1.3 How are cgroups implemented ? - 1.4 What does notify_on_release do ? - 1.5 What does clone_children do ? - 1.6 How do I use cgroups ? - 2. Usage Examples and Syntax - 2.1 Basic Usage - 2.2 Attaching processes - 2.3 Mounting hierarchies by name - 3. Kernel API - 3.1 Overview - 3.2 Synchronization - 3.3 Subsystem API - 4. Extended attributes usage - 5. Questions - -1. Control Groups -================= - -1.1 What are cgroups ? 
----------------------- - -Control Groups provide a mechanism for aggregating/partitioning sets of -tasks, and all their future children, into hierarchical groups with -specialized behaviour. - -Definitions: - -A *cgroup* associates a set of tasks with a set of parameters for one -or more subsystems. - -A *subsystem* is a module that makes use of the task grouping -facilities provided by cgroups to treat groups of tasks in -particular ways. A subsystem is typically a "resource controller" that -schedules a resource or applies per-cgroup limits, but it may be -anything that wants to act on a group of processes, e.g. a -virtualization subsystem. - -A *hierarchy* is a set of cgroups arranged in a tree, such that -every task in the system is in exactly one of the cgroups in the -hierarchy, and a set of subsystems; each subsystem has system-specific -state attached to each cgroup in the hierarchy. Each hierarchy has -an instance of the cgroup virtual filesystem associated with it. - -At any one time there may be multiple active hierarchies of task -cgroups. Each hierarchy is a partition of all tasks in the system. - -User-level code may create and destroy cgroups by name in an -instance of the cgroup virtual file system, specify and query to -which cgroup a task is assigned, and list the task PIDs assigned to -a cgroup. Those creations and assignments only affect the hierarchy -associated with that instance of the cgroup file system. - -On their own, the only use for cgroups is for simple job -tracking. The intention is that other subsystems hook into the generic -cgroup support to provide new attributes for cgroups, such as -accounting/limiting the resources which processes in a cgroup can -access. For example, cpusets (see Documentation/cgroup-v1/cpusets.rst) allow -you to associate a set of CPUs and a set of memory nodes with the -tasks in each cgroup. - -1.2 Why are cgroups needed ? 
----------------------------- - -There are multiple efforts to provide process aggregations in the -Linux kernel, mainly for resource-tracking purposes. Such efforts -include cpusets, CKRM/ResGroups, UserBeanCounters, and virtual server -namespaces. These all require the basic notion of a -grouping/partitioning of processes, with newly forked processes ending -up in the same group (cgroup) as their parent process. - -The kernel cgroup patch provides the minimum essential kernel -mechanisms required to efficiently implement such groups. It has -minimal impact on the system fast paths, and provides hooks for -specific subsystems such as cpusets to provide additional behaviour as -desired. - -Multiple hierarchy support is provided to allow for situations where -the division of tasks into cgroups is distinctly different for -different subsystems - having parallel hierarchies allows each -hierarchy to be a natural division of tasks, without having to handle -complex combinations of tasks that would be present if several -unrelated subsystems needed to be forced into the same tree of -cgroups. - -At one extreme, each resource controller or subsystem could be in a -separate hierarchy; at the other extreme, all subsystems -would be attached to the same hierarchy. - -As an example of a scenario (originally proposed by vatsa@in.ibm.com) -that can benefit from multiple hierarchies, consider a large -university server with various users - students, professors, system -tasks etc. 
The resource planning for this server could be along the -following lines:: - - CPU : "Top cpuset" - / \ - CPUSet1 CPUSet2 - | | - (Professors) (Students) - - In addition (system tasks) are attached to topcpuset (so - that they can run anywhere) with a limit of 20% - - Memory : Professors (50%), Students (30%), system (20%) - - Disk : Professors (50%), Students (30%), system (20%) - - Network : WWW browsing (20%), Network File System (60%), others (20%) - / \ - Professors (15%) students (5%) - -Browsers like Firefox/Lynx go into the WWW network class, while (k)nfsd goes -into the NFS network class. - -At the same time Firefox/Lynx will share an appropriate CPU/Memory class -depending on who launched it (prof/student). - -With the ability to classify tasks differently for different resources -(by putting those resource subsystems in different hierarchies), -the admin can easily set up a script which receives exec notifications -and depending on who is launching the browser he can:: - - # echo browser_pid > /sys/fs/cgroup/<restype>/<userclass>/tasks - -With only a single hierarchy, he now would potentially have to create -a separate cgroup for every browser launched and associate it with -appropriate network and other resource class. This may lead to -proliferation of such cgroups. - -Also let's say that the administrator would like to give enhanced network -access temporarily to a student's browser (since it is night and the user -wants to do online gaming :)) OR give one of the student's simulation -apps enhanced CPU power. - -With ability to write PIDs directly to resource classes, it's just a -matter of:: - - # echo pid > /sys/fs/cgroup/network/<new_class>/tasks - (after some time) - # echo pid > /sys/fs/cgroup/network/<orig_class>/tasks - -Without this ability, the administrator would have to split the cgroup into -multiple separate ones and then associate the new cgroups with the -new resource classes. - - - -1.3 How are cgroups implemented ?
---------------------------------- - -Control Groups extends the kernel as follows: - - - Each task in the system has a reference-counted pointer to a - css_set. - - - A css_set contains a set of reference-counted pointers to - cgroup_subsys_state objects, one for each cgroup subsystem - registered in the system. There is no direct link from a task to - the cgroup of which it's a member in each hierarchy, but this - can be determined by following pointers through the - cgroup_subsys_state objects. This is because accessing the - subsystem state is something that's expected to happen frequently - and in performance-critical code, whereas operations that require a - task's actual cgroup assignments (in particular, moving between - cgroups) are less common. A linked list runs through the cg_list - field of each task_struct using the css_set, anchored at - css_set->tasks. - - - A cgroup hierarchy filesystem can be mounted for browsing and - manipulation from user space. - - - You can list all the tasks (by PID) attached to any cgroup. - -The implementation of cgroups requires a few, simple hooks -into the rest of the kernel, none in performance-critical paths: - - - in init/main.c, to initialize the root cgroups and initial - css_set at system boot. - - - in fork and exit, to attach and detach a task from its css_set. - -In addition, a new file system of type "cgroup" may be mounted, to -enable browsing and modifying the cgroups presently known to the -kernel. When mounting a cgroup hierarchy, you may specify a -comma-separated list of subsystems to mount as the filesystem mount -options. By default, mounting the cgroup filesystem attempts to -mount a hierarchy containing all registered subsystems. - -If an active hierarchy with exactly the same set of subsystems already -exists, it will be reused for the new mount. If no existing hierarchy -matches, and any of the requested subsystems are in use in an existing -hierarchy, the mount will fail with -EBUSY. 
Otherwise, a new hierarchy -is activated, associated with the requested subsystems. - -It's not currently possible to bind a new subsystem to an active -cgroup hierarchy, or to unbind a subsystem from an active cgroup -hierarchy. This may be possible in future, but is fraught with nasty -error-recovery issues. - -When a cgroup filesystem is unmounted, if there are any -child cgroups created below the top-level cgroup, that hierarchy -will remain active even though unmounted; if there are no -child cgroups then the hierarchy will be deactivated. - -No new system calls are added for cgroups - all support for -querying and modifying cgroups is via this cgroup file system. - -Each task under /proc has an added file named 'cgroup' displaying, -for each active hierarchy, the subsystem names and the cgroup name -as the path relative to the root of the cgroup file system. - -Each cgroup is represented by a directory in the cgroup file system -containing the following files describing that cgroup: - - - tasks: list of tasks (by PID) attached to that cgroup. This list - is not guaranteed to be sorted. Writing a thread ID into this file - moves the thread into this cgroup. - - cgroup.procs: list of thread group IDs in the cgroup. This list is - not guaranteed to be sorted or free of duplicate TGIDs, and userspace - should sort/uniquify the list if this property is required. - Writing a thread group ID into this file moves all threads in that - group into this cgroup. - - notify_on_release flag: run the release agent on exit? - - release_agent: the path to use for release notifications (this file - exists in the top cgroup only) - -Other subsystems such as cpusets may add additional files in each -cgroup dir. - -New cgroups are created using the mkdir system call or shell -command. The properties of a cgroup, such as its flags, are -modified by writing to the appropriate file in that cgroups -directory, as listed above. 
- -The named hierarchical structure of nested cgroups allows partitioning -a large system into nested, dynamically changeable, "soft-partitions". - -The attachment of each task, automatically inherited at fork by any -children of that task, to a cgroup allows organizing the work load -on a system into related sets of tasks. A task may be re-attached to -any other cgroup, if allowed by the permissions on the necessary -cgroup file system directories. - -When a task is moved from one cgroup to another, it gets a new -css_set pointer - if there's an already existing css_set with the -desired collection of cgroups then that group is reused, otherwise a new -css_set is allocated. The appropriate existing css_set is located by -looking into a hash table. - -To allow access from a cgroup to the css_sets (and hence tasks) -that comprise it, a set of cg_cgroup_link objects form a lattice; -each cg_cgroup_link is linked into a list of cg_cgroup_links for -a single cgroup on its cgrp_link_list field, and a list of -cg_cgroup_links for a single css_set on its cg_link_list. - -Thus the set of tasks in a cgroup can be listed by iterating over -each css_set that references the cgroup, and sub-iterating over -each css_set's task set. - -The use of a Linux virtual file system (vfs) to represent the -cgroup hierarchy provides for a familiar permission and name space -for cgroups, with a minimum of additional kernel code. - -1.4 What does notify_on_release do ? ------------------------------------- - -If the notify_on_release flag is enabled (1) in a cgroup, then -whenever the last task in the cgroup leaves (exits or attaches to -some other cgroup) and the last child cgroup of that cgroup -is removed, then the kernel runs the command specified by the contents -of the "release_agent" file in that hierarchy's root directory, -supplying the pathname (relative to the mount point of the cgroup -file system) of the abandoned cgroup. This enables automatic -removal of abandoned cgroups. 
The default value of -notify_on_release in the root cgroup at system boot is disabled -(0). The default value of other cgroups at creation is the current -value of their parents' notify_on_release settings. The default value of -a cgroup hierarchy's release_agent path is empty. - -1.5 What does clone_children do ? ---------------------------------- - -This flag only affects the cpuset controller. If the clone_children -flag is enabled (1) in a cgroup, a new cpuset cgroup will copy its -configuration from the parent during initialization. - -1.6 How do I use cgroups ? --------------------------- - -To start a new job that is to be contained within a cgroup, using -the "cpuset" cgroup subsystem, the steps are something like:: - - 1) mount -t tmpfs cgroup_root /sys/fs/cgroup - 2) mkdir /sys/fs/cgroup/cpuset - 3) mount -t cgroup -ocpuset cpuset /sys/fs/cgroup/cpuset - 4) Create the new cgroup by doing mkdir's and write's (or echo's) in - the /sys/fs/cgroup/cpuset virtual file system. - 5) Start a task that will be the "founding father" of the new job. - 6) Attach that task to the new cgroup by writing its PID to the - /sys/fs/cgroup/cpuset tasks file for that cgroup. - 7) fork, exec or clone the job tasks from this founding father task. - -For example, the following sequence of commands will setup a cgroup -named "Charlie", containing just CPUs 2 and 3, and Memory Node 1, -and then start a subshell 'sh' in that cgroup:: - - mount -t tmpfs cgroup_root /sys/fs/cgroup - mkdir /sys/fs/cgroup/cpuset - mount -t cgroup cpuset -ocpuset /sys/fs/cgroup/cpuset - cd /sys/fs/cgroup/cpuset - mkdir Charlie - cd Charlie - /bin/echo 2-3 > cpuset.cpus - /bin/echo 1 > cpuset.mems - /bin/echo $$ > tasks - sh - # The subshell 'sh' is now running in cgroup Charlie - # The next line should display '/Charlie' - cat /proc/self/cgroup - -2. 
Usage Examples and Syntax -============================ - -2.1 Basic Usage ---------------- - -Creating, modifying, using cgroups can be done through the cgroup -virtual filesystem. - -To mount a cgroup hierarchy with all available subsystems, type:: - - # mount -t cgroup xxx /sys/fs/cgroup - -The "xxx" is not interpreted by the cgroup code, but will appear in -/proc/mounts so may be any useful identifying string that you like. - -Note: Some subsystems do not work without some user input first. For instance, -if cpusets are enabled the user will have to populate the cpus and mems files -for each new cgroup created before that group can be used. - -As explained in section `1.2 Why are cgroups needed?` you should create -different hierarchies of cgroups for each single resource or group of -resources you want to control. Therefore, you should mount a tmpfs on -/sys/fs/cgroup and create directories for each cgroup resource or resource -group:: - - # mount -t tmpfs cgroup_root /sys/fs/cgroup - # mkdir /sys/fs/cgroup/rg1 - -To mount a cgroup hierarchy with just the cpuset and memory -subsystems, type:: - - # mount -t cgroup -o cpuset,memory hier1 /sys/fs/cgroup/rg1 - -While remounting cgroups is currently supported, it is not recommend -to use it. Remounting allows changing bound subsystems and -release_agent. Rebinding is hardly useful as it only works when the -hierarchy is empty and release_agent itself should be replaced with -conventional fsnotify. The support for remounting will be removed in -the future. - -To Specify a hierarchy's release_agent:: - - # mount -t cgroup -o cpuset,release_agent="/sbin/cpuset_release_agent" \ - xxx /sys/fs/cgroup/rg1 - -Note that specifying 'release_agent' more than once will return failure. - -Note that changing the set of subsystems is currently only supported -when the hierarchy consists of a single (root) cgroup. 
Supporting -the ability to arbitrarily bind/unbind subsystems from an existing -cgroup hierarchy is intended to be implemented in the future. - -Then under /sys/fs/cgroup/rg1 you can find a tree that corresponds to the -tree of the cgroups in the system. For instance, /sys/fs/cgroup/rg1 -is the cgroup that holds the whole system. - -If you want to change the value of release_agent:: - - # echo "/sbin/new_release_agent" > /sys/fs/cgroup/rg1/release_agent - -It can also be changed via remount. - -If you want to create a new cgroup under /sys/fs/cgroup/rg1:: - - # cd /sys/fs/cgroup/rg1 - # mkdir my_cgroup - -Now you want to do something with this cgroup: - - # cd my_cgroup - -In this directory you can find several files:: - - # ls - cgroup.procs notify_on_release tasks - (plus whatever files added by the attached subsystems) - -Now attach your shell to this cgroup:: - - # /bin/echo $$ > tasks - -You can also create cgroups inside your cgroup by using mkdir in this -directory:: - - # mkdir my_sub_cs - -To remove a cgroup, just use rmdir:: - - # rmdir my_sub_cs - -This will fail if the cgroup is in use (has cgroups inside, or -has processes attached, or is held alive by other subsystem-specific -reference). - -2.2 Attaching processes ------------------------ - -:: - - # /bin/echo PID > tasks - -Note that it is PID, not PIDs. You can only attach ONE task at a time. -If you have several tasks to attach, you have to do it one after another:: - - # /bin/echo PID1 > tasks - # /bin/echo PID2 > tasks - ... - # /bin/echo PIDn > tasks - -You can attach the current shell task by echoing 0:: - - # echo 0 > tasks - -You can use the cgroup.procs file instead of the tasks file to move all -threads in a threadgroup at once. Echoing the PID of any task in a -threadgroup to cgroup.procs causes all tasks in that threadgroup to be -attached to the cgroup. Writing 0 to cgroup.procs moves all tasks -in the writing task's threadgroup. 
- -Note: Since every task is always a member of exactly one cgroup in each -mounted hierarchy, to remove a task from its current cgroup you must -move it into a new cgroup (possibly the root cgroup) by writing to the -new cgroup's tasks file. - -Note: Due to some restrictions enforced by some cgroup subsystems, moving -a process to another cgroup can fail. - -2.3 Mounting hierarchies by name --------------------------------- - -Passing the name=<x> option when mounting a cgroups hierarchy -associates the given name with the hierarchy. This can be used when -mounting a pre-existing hierarchy, in order to refer to it by name -rather than by its set of active subsystems. Each hierarchy is either -nameless, or has a unique name. - -The name should match [\w.-]+ - -When passing a name=<x> option for a new hierarchy, you need to -specify subsystems manually; the legacy behaviour of mounting all -subsystems when none are explicitly specified is not supported when -you give a subsystem a name. - -The name of the subsystem appears as part of the hierarchy description -in /proc/mounts and /proc/<pid>/cgroups. - - -3. Kernel API -============= - -3.1 Overview ------------- - -Each kernel subsystem that wants to hook into the generic cgroup -system needs to create a cgroup_subsys object. This contains -various methods, which are callbacks from the cgroup system, along -with a subsystem ID which will be assigned by the cgroup system. - -Other fields in the cgroup_subsys object include: - -- subsys_id: a unique array index for the subsystem, indicating which - entry in cgroup->subsys[] this subsystem should be managing. - -- name: should be initialized to a unique subsystem name. Should be - no longer than MAX_CGROUP_TYPE_NAMELEN. - -- early_init: indicate if the subsystem needs early initialization - at system boot.
- -Each cgroup object created by the system has an array of pointers, -indexed by subsystem ID; this pointer is entirely managed by the -subsystem; the generic cgroup code will never touch this pointer. - -3.2 Synchronization -------------------- - -There is a global mutex, cgroup_mutex, used by the cgroup -system. This should be taken by anything that wants to modify a -cgroup. It may also be taken to prevent cgroups from being -modified, but more specific locks may be more appropriate in that -situation. - -See kernel/cgroup.c for more details. - -Subsystems can take/release the cgroup_mutex via the functions -cgroup_lock()/cgroup_unlock(). - -Accessing a task's cgroup pointer may be done in the following ways: -- while holding cgroup_mutex -- while holding the task's alloc_lock (via task_lock()) -- inside an rcu_read_lock() section via rcu_dereference() - -3.3 Subsystem API ------------------ - -Each subsystem should: - -- add an entry in linux/cgroup_subsys.h -- define a cgroup_subsys object called <name>_cgrp_subsys - -Each subsystem may export the following methods. The only mandatory -methods are css_alloc/free. Any others that are null are presumed to -be successful no-ops. - -``struct cgroup_subsys_state *css_alloc(struct cgroup *cgrp)`` -(cgroup_mutex held by caller) - -Called to allocate a subsystem state object for a cgroup. The -subsystem should allocate its subsystem state object for the passed -cgroup, returning a pointer to the new object on success or an -ERR_PTR() value. On success, the subsystem pointer should point to -a structure of type cgroup_subsys_state (typically embedded in a -larger subsystem-specific object), which will be initialized by the -cgroup system. Note that this will be called at initialization to -create the root subsystem state for this subsystem; this case can be -identified by the passed cgroup object having a NULL parent (since -it's the root of the hierarchy) and may be an appropriate place for -initialization code.
- -``int css_online(struct cgroup *cgrp)`` -(cgroup_mutex held by caller) - -Called after @cgrp successfully completed all allocations and made -visible to cgroup_for_each_child/descendant_*() iterators. The -subsystem may choose to fail creation by returning -errno. This -callback can be used to implement reliable state sharing and -propagation along the hierarchy. See the comment on -cgroup_for_each_descendant_pre() for details. - -``void css_offline(struct cgroup *cgrp);`` -(cgroup_mutex held by caller) - -This is the counterpart of css_online() and called iff css_online() -has succeeded on @cgrp. This signifies the beginning of the end of -@cgrp. @cgrp is being removed and the subsystem should start dropping -all references it's holding on @cgrp. When all references are dropped, -cgroup removal will proceed to the next step - css_free(). After this -callback, @cgrp should be considered dead to the subsystem. - -``void css_free(struct cgroup *cgrp)`` -(cgroup_mutex held by caller) - -The cgroup system is about to free @cgrp; the subsystem should free -its subsystem state object. By the time this method is called, @cgrp -is completely unused; @cgrp->parent is still valid. (Note - can also -be called for a newly-created cgroup if an error occurs after this -subsystem's create() method has been called for the new cgroup). - -``int can_attach(struct cgroup *cgrp, struct cgroup_taskset *tset)`` -(cgroup_mutex held by caller) - -Called prior to moving one or more tasks into a cgroup; if the -subsystem returns an error, this will abort the attach operation. -@tset contains the tasks to be attached and is guaranteed to have at -least one task in it. 
- -If there are multiple tasks in the taskset, then: - - it's guaranteed that all are from the same thread group - - @tset contains all tasks from the thread group whether or not - they're switching cgroups - - the first task is the leader - -Each @tset entry also contains the task's old cgroup and tasks which -aren't switching cgroup can be skipped easily using the -cgroup_taskset_for_each() iterator. Note that this isn't called on a -fork. If this method returns 0 (success) then this should remain valid -while the caller holds cgroup_mutex and it is ensured that either -attach() or cancel_attach() will be called in future. - -``void css_reset(struct cgroup_subsys_state *css)`` -(cgroup_mutex held by caller) - -An optional operation which should restore @css's configuration to the -initial state. This is currently only used on the unified hierarchy -when a subsystem is disabled on a cgroup through -"cgroup.subtree_control" but should remain enabled because other -subsystems depend on it. cgroup core makes such a css invisible by -removing the associated interface files and invokes this callback so -that the hidden subsystem can return to the initial neutral state. -This prevents unexpected resource control from a hidden css and -ensures that the configuration is in the initial state when it is made -visible again later. - -``void cancel_attach(struct cgroup *cgrp, struct cgroup_taskset *tset)`` -(cgroup_mutex held by caller) - -Called when a task attach operation has failed after can_attach() has succeeded. -A subsystem whose can_attach() has some side-effects should provide this -function, so that the subsystem can implement a rollback. If not, not necessary. -This will be called only about subsystems whose can_attach() operation have -succeeded. The parameters are identical to can_attach(). 
- -``void attach(struct cgroup *cgrp, struct cgroup_taskset *tset)`` -(cgroup_mutex held by caller) - -Called after the task has been attached to the cgroup, to allow any -post-attachment activity that requires memory allocations or blocking. -The parameters are identical to can_attach(). - -``void fork(struct task_struct *task)`` - -Called when a task is forked into a cgroup. - -``void exit(struct task_struct *task)`` - -Called during task exit. - -``void free(struct task_struct *task)`` - -Called when the task_struct is freed. - -``void bind(struct cgroup *root)`` -(cgroup_mutex held by caller) - -Called when a cgroup subsystem is rebound to a different hierarchy -and root cgroup. Currently this will only involve movement between -the default hierarchy (which never has sub-cgroups) and a hierarchy -that is being created/destroyed (and hence has no sub-cgroups). - -4. Extended attribute usage -=========================== - -cgroup filesystem supports certain types of extended attributes in its -directories and files. The current supported types are: - - - Trusted (XATTR_TRUSTED) - - Security (XATTR_SECURITY) - -Both require CAP_SYS_ADMIN capability to set. - -Like in tmpfs, the extended attributes in cgroup filesystem are stored -using kernel memory and it's advised to keep the usage at minimum. This -is the reason why user defined extended attributes are not supported, since -any user can do it and there's no limit in the value size. - -The current known users for this feature are SELinux to limit cgroup usage -in containers and systemd for assorted meta data like main PID in a cgroup -(systemd creates a cgroup per service). - -5. Questions -============ - -:: - - Q: what's up with this '/bin/echo' ? - A: bash's builtin 'echo' command does not check calls to write() against - errors. If you use it in the cgroup file system, you won't be - able to tell whether a command succeeded or failed. 
- - Q: When I attach processes, only the first of the line gets really attached ! - A: We can only return one error code per call to write(). So you should also - put only ONE PID. diff --git a/Documentation/cgroup-v1/cpuacct.rst b/Documentation/cgroup-v1/cpuacct.rst deleted file mode 100644 index d30ed81d2ad7..000000000000 --- a/Documentation/cgroup-v1/cpuacct.rst +++ /dev/null @@ -1,50 +0,0 @@ -========================= -CPU Accounting Controller -========================= - -The CPU accounting controller is used to group tasks using cgroups and -account the CPU usage of these groups of tasks. - -The CPU accounting controller supports multi-hierarchy groups. An accounting -group accumulates the CPU usage of all of its child groups and the tasks -directly present in its group. - -Accounting groups can be created by first mounting the cgroup filesystem:: - - # mount -t cgroup -ocpuacct none /sys/fs/cgroup - -With the above step, the initial or the parent accounting group becomes -visible at /sys/fs/cgroup. At bootup, this group includes all the tasks in -the system. /sys/fs/cgroup/tasks lists the tasks in this cgroup. -/sys/fs/cgroup/cpuacct.usage gives the CPU time (in nanoseconds) obtained -by this group which is essentially the CPU time obtained by all the tasks -in the system. - -New accounting groups can be created under the parent group /sys/fs/cgroup:: - - # cd /sys/fs/cgroup - # mkdir g1 - # echo $$ > g1/tasks - -The above steps create a new group g1 and move the current shell -process (bash) into it. CPU time consumed by this bash and its children -can be obtained from g1/cpuacct.usage and the same is accumulated in -/sys/fs/cgroup/cpuacct.usage also. - -cpuacct.stat file lists a few statistics which further divide the -CPU time obtained by the cgroup into user and system times. Currently -the following statistics are supported: - -user: Time spent by tasks of the cgroup in user mode. -system: Time spent by tasks of the cgroup in kernel mode. 
- -user and system are in USER_HZ unit. - -cpuacct controller uses percpu_counter interface to collect user and -system times. This has two side effects: - -- It is theoretically possible to see wrong values for user and system times. - This is because percpu_counter_read() on 32bit systems isn't safe - against concurrent writes. -- It is possible to see slightly outdated values for user and system times - due to the batch processing nature of percpu_counter. diff --git a/Documentation/cgroup-v1/cpusets.rst b/Documentation/cgroup-v1/cpusets.rst deleted file mode 100644 index b6a42cdea72b..000000000000 --- a/Documentation/cgroup-v1/cpusets.rst +++ /dev/null @@ -1,866 +0,0 @@ -======= -CPUSETS -======= - -Copyright (C) 2004 BULL SA. - -Written by Simon.Derr@bull.net - -- Portions Copyright (c) 2004-2006 Silicon Graphics, Inc. -- Modified by Paul Jackson -- Modified by Christoph Lameter -- Modified by Paul Menage -- Modified by Hidetoshi Seto - -.. CONTENTS: - - 1. Cpusets - 1.1 What are cpusets ? - 1.2 Why are cpusets needed ? - 1.3 How are cpusets implemented ? - 1.4 What are exclusive cpusets ? - 1.5 What is memory_pressure ? - 1.6 What is memory spread ? - 1.7 What is sched_load_balance ? - 1.8 What is sched_relax_domain_level ? - 1.9 How do I use cpusets ? - 2. Usage Examples and Syntax - 2.1 Basic Usage - 2.2 Adding/removing cpus - 2.3 Setting flags - 2.4 Attaching processes - 3. Questions - 4. Contact - -1. Cpusets -========== - -1.1 What are cpusets ? ----------------------- - -Cpusets provide a mechanism for assigning a set of CPUs and Memory -Nodes to a set of tasks. In this document "Memory Node" refers to -an on-line node that contains memory. - -Cpusets constrain the CPU and Memory placement of tasks to only -the resources within a task's current cpuset. They form a nested -hierarchy visible in a virtual file system. These are the essential -hooks, beyond what is already present, required to manage dynamic -job placement on large systems. 
- -Cpusets use the generic cgroup subsystem described in -Documentation/cgroup-v1/cgroups.rst. - -Requests by a task, using the sched_setaffinity(2) system call to -include CPUs in its CPU affinity mask, and using the mbind(2) and -set_mempolicy(2) system calls to include Memory Nodes in its memory -policy, are both filtered through that task's cpuset, filtering out any -CPUs or Memory Nodes not in that cpuset. The scheduler will not -schedule a task on a CPU that is not allowed in its cpus_allowed -vector, and the kernel page allocator will not allocate a page on a -node that is not allowed in the requesting task's mems_allowed vector. - -User level code may create and destroy cpusets by name in the cgroup -virtual file system, manage the attributes and permissions of these -cpusets and which CPUs and Memory Nodes are assigned to each cpuset, -specify and query to which cpuset a task is assigned, and list the -task pids assigned to a cpuset. - - -1.2 Why are cpusets needed ? ----------------------------- - -The management of large computer systems, with many processors (CPUs), -complex memory cache hierarchies and multiple Memory Nodes having -non-uniform access times (NUMA) presents additional challenges for -the efficient scheduling and memory placement of processes. - -Frequently more modest sized systems can be operated with adequate -efficiency just by letting the operating system automatically share -the available CPU and Memory resources amongst the requesting tasks. - -But larger systems, which benefit more from careful processor and -memory placement to reduce memory access times and contention, -and which typically represent a larger investment for the customer, -can benefit from explicitly placing jobs on properly sized subsets of -the system. 
- -This can be especially valuable on: - - * Web Servers running multiple instances of the same web application, - * Servers running different applications (for instance, a web server - and a database), or - * NUMA systems running large HPC applications with demanding - performance characteristics. - -These subsets, or "soft partitions" must be able to be dynamically -adjusted, as the job mix changes, without impacting other concurrently -executing jobs. The location of the running jobs pages may also be moved -when the memory locations are changed. - -The kernel cpuset patch provides the minimum essential kernel -mechanisms required to efficiently implement such subsets. It -leverages existing CPU and Memory Placement facilities in the Linux -kernel to avoid any additional impact on the critical scheduler or -memory allocator code. - - -1.3 How are cpusets implemented ? ---------------------------------- - -Cpusets provide a Linux kernel mechanism to constrain which CPUs and -Memory Nodes are used by a process or set of processes. - -The Linux kernel already has a pair of mechanisms to specify on which -CPUs a task may be scheduled (sched_setaffinity) and on which Memory -Nodes it may obtain memory (mbind, set_mempolicy). - -Cpusets extends these two mechanisms as follows: - - - Cpusets are sets of allowed CPUs and Memory Nodes, known to the - kernel. - - Each task in the system is attached to a cpuset, via a pointer - in the task structure to a reference counted cgroup structure. - - Calls to sched_setaffinity are filtered to just those CPUs - allowed in that task's cpuset. - - Calls to mbind and set_mempolicy are filtered to just - those Memory Nodes allowed in that task's cpuset. - - The root cpuset contains all the systems CPUs and Memory - Nodes. - - For any cpuset, one can define child cpusets containing a subset - of the parents CPU and Memory Node resources. 
- - The hierarchy of cpusets can be mounted at /dev/cpuset, for - browsing and manipulation from user space. - - A cpuset may be marked exclusive, which ensures that no other - cpuset (except direct ancestors and descendants) may contain - any overlapping CPUs or Memory Nodes. - - You can list all the tasks (by pid) attached to any cpuset. - -The implementation of cpusets requires a few, simple hooks -into the rest of the kernel, none in performance critical paths: - - - in init/main.c, to initialize the root cpuset at system boot. - - in fork and exit, to attach and detach a task from its cpuset. - - in sched_setaffinity, to mask the requested CPUs by what's - allowed in that task's cpuset. - - in sched.c migrate_live_tasks(), to keep migrating tasks within - the CPUs allowed by their cpuset, if possible. - - in the mbind and set_mempolicy system calls, to mask the requested - Memory Nodes by what's allowed in that task's cpuset. - - in page_alloc.c, to restrict memory to allowed nodes. - - in vmscan.c, to restrict page recovery to the current cpuset. - -You should mount the "cgroup" filesystem type in order to enable -browsing and modifying the cpusets presently known to the kernel. No -new system calls are added for cpusets - all support for querying and -modifying cpusets is via this cpuset file system. 
- -The /proc/<pid>/status file for each task has four added lines, -displaying the task's cpus_allowed (on which CPUs it may be scheduled) -and mems_allowed (on which Memory Nodes it may obtain memory), -in the two formats seen in the following example:: - - Cpus_allowed: ffffffff,ffffffff,ffffffff,ffffffff - Cpus_allowed_list: 0-127 - Mems_allowed: ffffffff,ffffffff - Mems_allowed_list: 0-63 - -Each cpuset is represented by a directory in the cgroup file system -containing (on top of the standard cgroup files) the following -files describing that cpuset: - - - cpuset.cpus: list of CPUs in that cpuset - - cpuset.mems: list of Memory Nodes in that cpuset - - cpuset.memory_migrate flag: if set, move pages to cpusets nodes - - cpuset.cpu_exclusive flag: is cpu placement exclusive? - - cpuset.mem_exclusive flag: is memory placement exclusive? - - cpuset.mem_hardwall flag: is memory allocation hardwalled - - cpuset.memory_pressure: measure of how much paging pressure in cpuset - - cpuset.memory_spread_page flag: if set, spread page cache evenly on allowed nodes - - cpuset.memory_spread_slab flag: if set, spread slab cache evenly on allowed nodes - - cpuset.sched_load_balance flag: if set, load balance within CPUs on that cpuset - - cpuset.sched_relax_domain_level: the searching range when migrating tasks - -In addition, only the root cpuset has the following file: - - - cpuset.memory_pressure_enabled flag: compute memory_pressure? - -New cpusets are created using the mkdir system call or shell -command. The properties of a cpuset, such as its flags, allowed -CPUs and Memory Nodes, and attached tasks, are modified by writing -to the appropriate file in that cpuset's directory, as listed above. - -The named hierarchical structure of nested cpusets allows partitioning -a large system into nested, dynamically changeable, "soft-partitions".
- -The attachment of each task, automatically inherited at fork by any -children of that task, to a cpuset allows organizing the work load -on a system into related sets of tasks such that each set is constrained -to using the CPUs and Memory Nodes of a particular cpuset. A task -may be re-attached to any other cpuset, if allowed by the permissions -on the necessary cpuset file system directories. - -Such management of a system "in the large" integrates smoothly with -the detailed placement done on individual tasks and memory regions -using the sched_setaffinity, mbind and set_mempolicy system calls. - -The following rules apply to each cpuset: - - - Its CPUs and Memory Nodes must be a subset of its parent's. - - It can't be marked exclusive unless its parent is. - - If its cpu or memory is exclusive, they may not overlap any sibling. - -These rules, and the natural hierarchy of cpusets, enable efficient -enforcement of the exclusive guarantee, without having to scan all -cpusets every time any of them change to ensure nothing overlaps an -exclusive cpuset. Also, the use of a Linux virtual file system (vfs) -to represent the cpuset hierarchy provides for a familiar permission -and name space for cpusets, with a minimum of additional kernel code. - -The cpus and mems files in the root (top_cpuset) cpuset are -read-only. The cpus file automatically tracks the value of -cpu_online_mask using a CPU hotplug notifier, and the mems file -automatically tracks the value of node_states[N_MEMORY]--i.e., -nodes with memory--using the cpuset_track_online_nodes() hook. - - -1.4 What are exclusive cpusets ? --------------------------------- - -If a cpuset is cpu or mem exclusive, no other cpuset, other than -a direct ancestor or descendant, may share any of the same CPUs or -Memory Nodes. - -A cpuset that is cpuset.mem_exclusive *or* cpuset.mem_hardwall is "hardwalled", -i.e.
it restricts kernel allocations for page, buffer and other data -commonly shared by the kernel across multiple users. All cpusets, -whether hardwalled or not, restrict allocations of memory for user -space. This enables configuring a system so that several independent -jobs can share common kernel data, such as file system pages, while -isolating each job's user allocation in its own cpuset. To do this, -construct a large mem_exclusive cpuset to hold all the jobs, and -construct child, non-mem_exclusive cpusets for each individual job. -Only a small amount of typical kernel memory, such as requests from -interrupt handlers, is allowed to be taken outside even a -mem_exclusive cpuset. - - -1.5 What is memory_pressure ? ------------------------------ -The memory_pressure of a cpuset provides a simple per-cpuset metric -of the rate that the tasks in a cpuset are attempting to free up in -use memory on the nodes of the cpuset to satisfy additional memory -requests. - -This enables batch managers monitoring jobs running in dedicated -cpusets to efficiently detect what level of memory pressure that job -is causing. - -This is useful both on tightly managed systems running a wide mix of -submitted jobs, which may choose to terminate or re-prioritize jobs that -are trying to use more memory than allowed on the nodes assigned to them, -and with tightly coupled, long running, massively parallel scientific -computing jobs that will dramatically fail to meet required performance -goals if they start to use more memory than allowed to them. - -This mechanism provides a very economical way for the batch manager -to monitor a cpuset for signs of memory pressure. It's up to the -batch manager or other user code to decide what to do about it and -take action. 
- -==> - Unless this feature is enabled by writing "1" to the special file - /dev/cpuset/memory_pressure_enabled, the hook in the rebalance - code of __alloc_pages() for this metric reduces to simply noticing - that the cpuset_memory_pressure_enabled flag is zero. So only - systems that enable this feature will compute the metric. - -Why a per-cpuset, running average: - - Because this meter is per-cpuset, rather than per-task or mm, - the system load imposed by a batch scheduler monitoring this - metric is sharply reduced on large systems, because a scan of - the tasklist can be avoided on each set of queries. - - Because this meter is a running average, instead of an accumulating - counter, a batch scheduler can detect memory pressure with a - single read, instead of having to read and accumulate results - for a period of time. - - Because this meter is per-cpuset rather than per-task or mm, - the batch scheduler can obtain the key information, memory - pressure in a cpuset, with a single read, rather than having to - query and accumulate results over all the (dynamically changing) - set of tasks in the cpuset. - -A per-cpuset simple digital filter (requires a spinlock and 3 words -of data per-cpuset) is kept, and updated by any task attached to that -cpuset, if it enters the synchronous (direct) page reclaim code. - -A per-cpuset file provides an integer number representing the recent -(half-life of 10 seconds) rate of direct page reclaims caused by -the tasks in the cpuset, in units of reclaims attempted per second, -times 1000. - - -1.6 What is memory spread ? ---------------------------- -There are two boolean flag files per cpuset that control where the -kernel allocates pages for the file system buffers and related in -kernel data structures. They are called 'cpuset.memory_spread_page' and -'cpuset.memory_spread_slab'. 
- -If the per-cpuset boolean flag file 'cpuset.memory_spread_page' is set, then -the kernel will spread the file system buffers (page cache) evenly -over all the nodes that the faulting task is allowed to use, instead -of preferring to put those pages on the node where the task is running. - -If the per-cpuset boolean flag file 'cpuset.memory_spread_slab' is set, -then the kernel will spread some file system related slab caches, -such as for inodes and dentries evenly over all the nodes that the -faulting task is allowed to use, instead of preferring to put those -pages on the node where the task is running. - -The setting of these flags does not affect anonymous data segment or -stack segment pages of a task. - -By default, both kinds of memory spreading are off, and memory -pages are allocated on the node local to where the task is running, -except perhaps as modified by the task's NUMA mempolicy or cpuset -configuration, so long as sufficient free memory pages are available. - -When new cpusets are created, they inherit the memory spread settings -of their parent. - -Setting memory spreading causes allocations for the affected page -or slab caches to ignore the task's NUMA mempolicy and be spread -instead. Tasks using mbind() or set_mempolicy() calls to set NUMA -mempolicies will not notice any change in these calls as a result of -their containing task's memory spread settings. If memory spreading -is turned off, then the currently specified NUMA mempolicy once again -applies to memory page allocations. - -Both 'cpuset.memory_spread_page' and 'cpuset.memory_spread_slab' are boolean flag -files. By default they contain "0", meaning that the feature is off -for that cpuset. If a "1" is written to that file, then that turns -the named feature on. - -The implementation is simple. - -Setting the flag 'cpuset.memory_spread_page' turns on a per-process flag -PFA_SPREAD_PAGE for each task that is in that cpuset or subsequently -joins that cpuset. 
The page allocation calls for the page cache -are modified to perform an inline check for this PFA_SPREAD_PAGE task -flag, and if set, a call to a new routine cpuset_mem_spread_node() -returns the node to prefer for the allocation. - -Similarly, setting 'cpuset.memory_spread_slab' turns on the flag -PFA_SPREAD_SLAB, and appropriately marked slab caches will allocate -pages from the node returned by cpuset_mem_spread_node(). - -The cpuset_mem_spread_node() routine is also simple. It uses the -value of a per-task rotor cpuset_mem_spread_rotor to select the next -node in the current task's mems_allowed to prefer for the allocation. - -This memory placement policy is also known (in other contexts) as -round-robin or interleave. - -This policy can provide substantial improvements for jobs that need -to place thread local data on the corresponding node, but that need -to access large file system data sets that need to be spread across -the several nodes in the job's cpuset in order to fit. Without this -policy, especially for jobs that might have one thread reading in the -data set, the memory allocation across the nodes in the job's cpuset -can become very uneven. - -1.7 What is sched_load_balance ? --------------------------------- - -The kernel scheduler (kernel/sched/core.c) automatically load balances -tasks. If one CPU is underutilized, kernel code running on that -CPU will look for tasks on other more overloaded CPUs and move those -tasks to itself, within the constraints of such placement mechanisms -as cpusets and sched_setaffinity. - -The algorithmic cost of load balancing and its impact on key shared -kernel data structures such as the task list increases more than -linearly with the number of CPUs being balanced. So the scheduler -has support to partition the system's CPUs into a number of sched -domains such that it only load balances within each sched domain.
-Each sched domain covers some subset of the CPUs in the system; -no two sched domains overlap; some CPUs might not be in any sched -domain and hence won't be load balanced. - -Put simply, it costs less to balance between two smaller sched domains -than one big one, but doing so means that overloads in one of the -two domains won't be load balanced to the other one. - -By default, there is one sched domain covering all CPUs, including those -marked isolated using the kernel boot time "isolcpus=" argument. However, -the isolated CPUs will not participate in load balancing, and will not -have tasks running on them unless explicitly assigned. - -This default load balancing across all CPUs is not well suited for -the following two situations: - - 1) On large systems, load balancing across many CPUs is expensive. - If the system is managed using cpusets to place independent jobs - on separate sets of CPUs, full load balancing is unnecessary. - 2) Systems supporting realtime on some CPUs need to minimize - system overhead on those CPUs, including avoiding task load - balancing if that is not needed. - -When the per-cpuset flag "cpuset.sched_load_balance" is enabled (the default -setting), it requests that all the CPUs in that cpuset's allowed 'cpuset.cpus' -be contained in a single sched domain, ensuring that load balancing -can move a task (not otherwise pinned, as by sched_setaffinity) -from any CPU in that cpuset to any other. - -When the per-cpuset flag "cpuset.sched_load_balance" is disabled, then the -scheduler will avoid load balancing across the CPUs in that cpuset, ---except-- in so far as is necessary because some overlapping cpuset -has "sched_load_balance" enabled. - -So, for example, if the top cpuset has the flag "cpuset.sched_load_balance" -enabled, then the scheduler will have one sched domain covering all -CPUs, and the setting of the "cpuset.sched_load_balance" flag in any other -cpusets won't matter, as we're already fully load balancing.
- -Therefore in the above two situations, the top cpuset flag -"cpuset.sched_load_balance" should be disabled, and only some of the smaller, -child cpusets have this flag enabled. - -When doing this, you don't usually want to leave any unpinned tasks in -the top cpuset that might use non-trivial amounts of CPU, as such tasks -may be artificially constrained to some subset of CPUs, depending on -the particulars of this flag setting in descendant cpusets. Even if -such a task could use spare CPU cycles in some other CPUs, the kernel -scheduler might not consider the possibility of load balancing that -task to that underused CPU. - -Of course, tasks pinned to a particular CPU can be left in a cpuset -that disables "cpuset.sched_load_balance" as those tasks aren't going anywhere -else anyway. - -There is an impedance mismatch here, between cpusets and sched domains. -Cpusets are hierarchical and nest. Sched domains are flat; they don't -overlap and each CPU is in at most one sched domain. - -It is necessary for sched domains to be flat because load balancing -across partially overlapping sets of CPUs would risk unstable dynamics -that would be beyond our understanding. So if each of two partially -overlapping cpusets enables the flag 'cpuset.sched_load_balance', then we -form a single sched domain that is a superset of both. We won't move -a task to a CPU outside its cpuset, but the scheduler load balancing -code might waste some compute cycles considering that possibility. - -This mismatch is why there is not a simple one-to-one relation -between which cpusets have the flag "cpuset.sched_load_balance" enabled, -and the sched domain configuration. If a cpuset enables the flag, it -will get balancing across all its CPUs, but if it disables the flag, -it will only be assured of no load balancing if no other overlapping -cpuset enables the flag. 
- -If two cpusets have partially overlapping 'cpuset.cpus' allowed, and only -one of them has this flag enabled, then the other may find its -tasks only partially load balanced, just on the overlapping CPUs. -This is just the general case of the top_cpuset example given a few -paragraphs above. In the general case, as in the top cpuset case, -don't leave tasks that might use non-trivial amounts of CPU in -such partially load balanced cpusets, as they may be artificially -constrained to some subset of the CPUs allowed to them, for lack of -load balancing to the other CPUs. - -CPUs in "cpuset.isolcpus" were excluded from load balancing by the -isolcpus= kernel boot option, and will never be load balanced regardless -of the value of "cpuset.sched_load_balance" in any cpuset. - -1.7.1 sched_load_balance implementation details. ------------------------------------------------- - -The per-cpuset flag 'cpuset.sched_load_balance' defaults to enabled (contrary -to most cpuset flags.) When enabled for a cpuset, the kernel will -ensure that it can load balance across all the CPUs in that cpuset -(makes sure that all the CPUs in the cpus_allowed of that cpuset are -in the same sched domain.) - -If two overlapping cpusets both have 'cpuset.sched_load_balance' enabled, -then they will be (must be) both in the same sched domain. - -If, as is the default, the top cpuset has 'cpuset.sched_load_balance' enabled, -then by the above that means there is a single sched domain covering -the whole system, regardless of any other cpuset settings. - -The kernel commits to user space that it will avoid load balancing -where it can. It will pick as fine a granularity partition of sched -domains as it can while still providing load balancing for any set -of CPUs allowed to a cpuset having 'cpuset.sched_load_balance' enabled. - -The internal kernel cpuset to scheduler interface passes from the -cpuset code to the scheduler code a partition of the load balanced -CPUs in the system. 
This partition is a set of subsets (represented -as an array of struct cpumask) of CPUs, pairwise disjoint, that cover -all the CPUs that must be load balanced. - -The cpuset code builds a new such partition and passes it to the -scheduler sched domain setup code, to have the sched domains rebuilt -as necessary, whenever: - - - the 'cpuset.sched_load_balance' flag of a cpuset with non-empty CPUs changes, - - or CPUs come or go from a cpuset with this flag enabled, - - or 'cpuset.sched_relax_domain_level' value of a cpuset with non-empty CPUs - and with this flag enabled changes, - - or a cpuset with non-empty CPUs and with this flag enabled is removed, - - or a cpu is offlined/onlined. - -This partition exactly defines what sched domains the scheduler should -setup - one sched domain for each element (struct cpumask) in the -partition. - -The scheduler remembers the currently active sched domain partitions. -When the scheduler routine partition_sched_domains() is invoked from -the cpuset code to update these sched domains, it compares the new -partition requested with the current, and updates its sched domains, -removing the old and adding the new, for each change. - - -1.8 What is sched_relax_domain_level ? --------------------------------------- - -In sched domain, the scheduler migrates tasks in 2 ways; periodic load -balance on tick, and at time of some schedule events. - -When a task is woken up, scheduler try to move the task on idle CPU. -For example, if a task A running on CPU X activates another task B -on the same CPU X, and if CPU Y is X's sibling and performing idle, -then scheduler migrate task B to CPU Y so that task B can start on -CPU Y without waiting task A on CPU X. - -And if a CPU run out of tasks in its runqueue, the CPU try to pull -extra tasks from other busy CPUs to help them before it is going to -be idle. 
- -Of course it takes some searching cost to find movable tasks and/or -idle CPUs, the scheduler might not search all CPUs in the domain -every time. In fact, in some architectures, the searching ranges on -events are limited in the same socket or node where the CPU locates, -while the load balance on tick searches all. - -For example, assume CPU Z is relatively far from CPU X. Even if CPU Z -is idle while CPU X and the siblings are busy, scheduler can't migrate -woken task B from X to Z since it is out of its searching range. -As the result, task B on CPU X need to wait task A or wait load balance -on the next tick. For some applications in special situation, waiting -1 tick may be too long. - -The 'cpuset.sched_relax_domain_level' file allows you to request changing -this searching range as you like. This file takes int value which -indicates size of searching range in levels ideally as follows, -otherwise initial value -1 that indicates the cpuset has no request. - -====== =========================================================== - -1 no request. use system default or follow request of others. - 0 no search. - 1 search siblings (hyperthreads in a core). - 2 search cores in a package. - 3 search cpus in a node [= system wide on non-NUMA system] - 4 search nodes in a chunk of node [on NUMA system] - 5 search system wide [on NUMA system] -====== =========================================================== - -The system default is architecture dependent. The system default -can be changed using the relax_domain_level= boot parameter. - -This file is per-cpuset and affect the sched domain where the cpuset -belongs to. Therefore if the flag 'cpuset.sched_load_balance' of a cpuset -is disabled, then 'cpuset.sched_relax_domain_level' have no effect since -there is no sched domain belonging the cpuset. - -If multiple cpusets are overlapping and hence they form a single sched -domain, the largest value among those is used. 
Be careful, if one -requests 0 and others are -1 then 0 is used. - -Note that modifying this file will have both good and bad effects, -and whether it is acceptable or not depends on your situation. -Don't modify this file if you are not sure. - -If your situation is: - - - The migration costs between each cpu can be assumed considerably - small(for you) due to your special application's behavior or - special hardware support for CPU cache etc. - - The searching cost doesn't have impact(for you) or you can make - the searching cost enough small by managing cpuset to compact etc. - - The latency is required even it sacrifices cache hit rate etc. - then increasing 'sched_relax_domain_level' would benefit you. - - -1.9 How do I use cpusets ? --------------------------- - -In order to minimize the impact of cpusets on critical kernel -code, such as the scheduler, and due to the fact that the kernel -does not support one task updating the memory placement of another -task directly, the impact on a task of changing its cpuset CPU -or Memory Node placement, or of changing to which cpuset a task -is attached, is subtle. - -If a cpuset has its Memory Nodes modified, then for each task attached -to that cpuset, the next time that the kernel attempts to allocate -a page of memory for that task, the kernel will notice the change -in the task's cpuset, and update its per-task memory placement to -remain within the new cpusets memory placement. If the task was using -mempolicy MPOL_BIND, and the nodes to which it was bound overlap with -its new cpuset, then the task will continue to use whatever subset -of MPOL_BIND nodes are still allowed in the new cpuset. If the task -was using MPOL_BIND and now none of its MPOL_BIND nodes are allowed -in the new cpuset, then the task will be essentially treated as if it -was MPOL_BIND bound to the new cpuset (even though its NUMA placement, -as queried by get_mempolicy(), doesn't change). 
If a task is moved -from one cpuset to another, then the kernel will adjust the task's -memory placement, as above, the next time that the kernel attempts -to allocate a page of memory for that task. - -If a cpuset has its 'cpuset.cpus' modified, then each task in that cpuset -will have its allowed CPU placement changed immediately. Similarly, -if a task's pid is written to another cpuset's 'tasks' file, then its -allowed CPU placement is changed immediately. If such a task had been -bound to some subset of its cpuset using the sched_setaffinity() call, -the task will be allowed to run on any CPU allowed in its new cpuset, -negating the effect of the prior sched_setaffinity() call. - -In summary, the memory placement of a task whose cpuset is changed is -updated by the kernel, on the next allocation of a page for that task, -and the processor placement is updated immediately. - -Normally, once a page is allocated (given a physical page -of main memory) then that page stays on whatever node it -was allocated, so long as it remains allocated, even if the -cpusets memory placement policy 'cpuset.mems' subsequently changes. -If the cpuset flag file 'cpuset.memory_migrate' is set true, then when -tasks are attached to that cpuset, any pages that task had -allocated to it on nodes in its previous cpuset are migrated -to the task's new cpuset. The relative placement of the page within -the cpuset is preserved during these migration operations if possible. -For example if the page was on the second valid node of the prior cpuset -then the page will be placed on the second valid node of the new cpuset. - -Also if 'cpuset.memory_migrate' is set true, then if that cpuset's -'cpuset.mems' file is modified, pages allocated to tasks in that -cpuset, that were on nodes in the previous setting of 'cpuset.mems', -will be moved to nodes in the new setting of 'mems.' -Pages that were not in the task's prior cpuset, or in the cpuset's -prior 'cpuset.mems' setting, will not be moved. 
- -There is an exception to the above. If hotplug functionality is used -to remove all the CPUs that are currently assigned to a cpuset, -then all the tasks in that cpuset will be moved to the nearest ancestor -with non-empty cpus. But the moving of some (or all) tasks might fail if -cpuset is bound with another cgroup subsystem which has some restrictions -on task attaching. In this failing case, those tasks will stay -in the original cpuset, and the kernel will automatically update -their cpus_allowed to allow all online CPUs. When memory hotplug -functionality for removing Memory Nodes is available, a similar exception -is expected to apply there as well. In general, the kernel prefers to -violate cpuset placement, over starving a task that has had all -its allowed CPUs or Memory Nodes taken offline. - -There is a second exception to the above. GFP_ATOMIC requests are -kernel internal allocations that must be satisfied, immediately. -The kernel may drop some request, in rare cases even panic, if a -GFP_ATOMIC alloc fails. If the request cannot be satisfied within -the current task's cpuset, then we relax the cpuset, and look for -memory anywhere we can find it. It's better to violate the cpuset -than stress the kernel. - -To start a new job that is to be contained within a cpuset, the steps are: - - 1) mkdir /sys/fs/cgroup/cpuset - 2) mount -t cgroup -ocpuset cpuset /sys/fs/cgroup/cpuset - 3) Create the new cpuset by doing mkdir's and write's (or echo's) in - the /sys/fs/cgroup/cpuset virtual file system. - 4) Start a task that will be the "founding father" of the new job. - 5) Attach that task to the new cpuset by writing its pid to the - /sys/fs/cgroup/cpuset tasks file for that cpuset. - 6) fork, exec or clone the job tasks from this founding father task. 
- -For example, the following sequence of commands will setup a cpuset -named "Charlie", containing just CPUs 2 and 3, and Memory Node 1, -and then start a subshell 'sh' in that cpuset:: - - mount -t cgroup -ocpuset cpuset /sys/fs/cgroup/cpuset - cd /sys/fs/cgroup/cpuset - mkdir Charlie - cd Charlie - /bin/echo 2-3 > cpuset.cpus - /bin/echo 1 > cpuset.mems - /bin/echo $$ > tasks - sh - # The subshell 'sh' is now running in cpuset Charlie - # The next line should display '/Charlie' - cat /proc/self/cpuset - -There are ways to query or modify cpusets: - - - via the cpuset file system directly, using the various cd, mkdir, echo, - cat, rmdir commands from the shell, or their equivalent from C. - - via the C library libcpuset. - - via the C library libcgroup. - (http://sourceforge.net/projects/libcg/) - - via the python application cset. - (http://code.google.com/p/cpuset/) - -The sched_setaffinity calls can also be done at the shell prompt using -SGI's runon or Robert Love's taskset. The mbind and set_mempolicy -calls can be done at the shell prompt using the numactl command -(part of Andi Kleen's numa package). - -2. Usage Examples and Syntax -============================ - -2.1 Basic Usage ---------------- - -Creating, modifying, using the cpusets can be done through the cpuset -virtual filesystem. - -To mount it, type: -# mount -t cgroup -o cpuset cpuset /sys/fs/cgroup/cpuset - -Then under /sys/fs/cgroup/cpuset you can find a tree that corresponds to the -tree of the cpusets in the system. For instance, /sys/fs/cgroup/cpuset -is the cpuset that holds the whole system. 
- -If you want to create a new cpuset under /sys/fs/cgroup/cpuset:: - - # cd /sys/fs/cgroup/cpuset - # mkdir my_cpuset - -Now you want to do something with this cpuset:: - - # cd my_cpuset - -In this directory you can find several files:: - - # ls - cgroup.clone_children cpuset.memory_pressure - cgroup.event_control cpuset.memory_spread_page - cgroup.procs cpuset.memory_spread_slab - cpuset.cpu_exclusive cpuset.mems - cpuset.cpus cpuset.sched_load_balance - cpuset.mem_exclusive cpuset.sched_relax_domain_level - cpuset.mem_hardwall notify_on_release - cpuset.memory_migrate tasks - -Reading them will give you information about the state of this cpuset: -the CPUs and Memory Nodes it can use, the processes that are using -it, its properties. By writing to these files you can manipulate -the cpuset. - -Set some flags:: - - # /bin/echo 1 > cpuset.cpu_exclusive - -Add some cpus:: - - # /bin/echo 0-7 > cpuset.cpus - -Add some mems:: - - # /bin/echo 0-7 > cpuset.mems - -Now attach your shell to this cpuset:: - - # /bin/echo $$ > tasks - -You can also create cpusets inside your cpuset by using mkdir in this -directory:: - - # mkdir my_sub_cs - -To remove a cpuset, just use rmdir:: - - # rmdir my_sub_cs - -This will fail if the cpuset is in use (has cpusets inside, or has -processes attached). - -Note that for legacy reasons, the "cpuset" filesystem exists as a -wrapper around the cgroup filesystem. 
- -The command:: - - mount -t cpuset X /sys/fs/cgroup/cpuset - -is equivalent to:: - - mount -t cgroup -ocpuset,noprefix X /sys/fs/cgroup/cpuset - echo "/sbin/cpuset_release_agent" > /sys/fs/cgroup/cpuset/release_agent - -2.2 Adding/removing cpus ------------------------- - -This is the syntax to use when writing in the cpus or mems files -in cpuset directories:: - - # /bin/echo 1-4 > cpuset.cpus -> set cpus list to cpus 1,2,3,4 - # /bin/echo 1,2,3,4 > cpuset.cpus -> set cpus list to cpus 1,2,3,4 - -To add a CPU to a cpuset, write the new list of CPUs including the -CPU to be added. To add 6 to the above cpuset:: - - # /bin/echo 1-4,6 > cpuset.cpus -> set cpus list to cpus 1,2,3,4,6 - -Similarly to remove a CPU from a cpuset, write the new list of CPUs -without the CPU to be removed. - -To remove all the CPUs:: - - # /bin/echo "" > cpuset.cpus -> clear cpus list - -2.3 Setting flags ------------------ - -The syntax is very simple:: - - # /bin/echo 1 > cpuset.cpu_exclusive -> set flag 'cpuset.cpu_exclusive' - # /bin/echo 0 > cpuset.cpu_exclusive -> unset flag 'cpuset.cpu_exclusive' - -2.4 Attaching processes ------------------------ - -:: - - # /bin/echo PID > tasks - -Note that it is PID, not PIDs. You can only attach ONE task at a time. -If you have several tasks to attach, you have to do it one after another:: - - # /bin/echo PID1 > tasks - # /bin/echo PID2 > tasks - ... - # /bin/echo PIDn > tasks - - -3. Questions -============ - -Q: - what's up with this '/bin/echo' ? - -A: - bash's builtin 'echo' command does not check calls to write() against - errors. If you use it in the cpuset file system, you won't be - able to tell whether a command succeeded or failed. - -Q: - When I attach processes, only the first of the line gets really attached ! - -A: - We can only return one error code per call to write(). So you should also - put only ONE pid. - -4. 
Contact -========== - -Web: http://www.bullopensource.org/cpuset diff --git a/Documentation/cgroup-v1/devices.rst b/Documentation/cgroup-v1/devices.rst deleted file mode 100644 index e1886783961e..000000000000 --- a/Documentation/cgroup-v1/devices.rst +++ /dev/null @@ -1,132 +0,0 @@ -=========================== -Device Whitelist Controller -=========================== - -1. Description -============== - -Implement a cgroup to track and enforce open and mknod restrictions -on device files. A device cgroup associates a device access -whitelist with each cgroup. A whitelist entry has 4 fields. -'type' is a (all), c (char), or b (block). 'all' means it applies -to all types and all major and minor numbers. Major and minor are -either an integer or * for all. Access is a composition of r -(read), w (write), and m (mknod). - -The root device cgroup starts with rwm to 'all'. A child device -cgroup gets a copy of the parent. Administrators can then remove -devices from the whitelist or add new entries. A child cgroup can -never receive a device access which is denied by its parent. - -2. User Interface -================= - -An entry is added using devices.allow, and removed using -devices.deny. For instance:: - - echo 'c 1:3 mr' > /sys/fs/cgroup/1/devices.allow - -allows cgroup 1 to read and mknod the device usually known as -/dev/null. Doing:: - - echo a > /sys/fs/cgroup/1/devices.deny - -will remove the default 'a *:* rwm' entry. Doing:: - - echo a > /sys/fs/cgroup/1/devices.allow - -will add the 'a *:* rwm' entry to the whitelist. - -3. Security -=========== - -Any task can move itself between cgroups. This clearly won't -suffice, but we can decide the best way to adequately restrict -movement as people get some experience with this. We may just want -to require CAP_SYS_ADMIN, which at least is a separate bit from -CAP_MKNOD. We may want to just refuse moving to a cgroup which -isn't a descendant of the current one. 
Or we may want to use -CAP_MAC_ADMIN, since we really are trying to lock down root. - -CAP_SYS_ADMIN is needed to modify the whitelist or move another -task to a new cgroup. (Again we'll probably want to change that). - -A cgroup may not be granted more permissions than the cgroup's -parent has. - -4. Hierarchy -============ - -device cgroups maintain hierarchy by making sure a cgroup never has more -access permissions than its parent. Every time an entry is written to -a cgroup's devices.deny file, all its children will have that entry removed -from their whitelist and all the locally set whitelist entries will be -re-evaluated. In case one of the locally set whitelist entries would provide -more access than the cgroup's parent, it'll be removed from the whitelist. - -Example:: - - A - / \ - B - - group behavior exceptions - A allow "b 8:* rwm", "c 116:1 rw" - B deny "c 1:3 rwm", "c 116:2 rwm", "b 3:* rwm" - -If a device is denied in group A:: - - # echo "c 116:* r" > A/devices.deny - -it'll propagate down and after revalidating B's entries, the whitelist entry -"c 116:2 rwm" will be removed:: - - group whitelist entries denied devices - A all "b 8:* rwm", "c 116:* rw" - B "c 1:3 rwm", "b 3:* rwm" all the rest - -In case parent's exceptions change and local exceptions are not allowed -anymore, they'll be deleted. 
- -Notice that new whitelist entries will not be propagated:: - - A - / \ - B - - group whitelist entries denied devices - A "c 1:3 rwm", "c 1:5 r" all the rest - B "c 1:3 rwm", "c 1:5 r" all the rest - -when adding ``c *:3 rwm``:: - - # echo "c *:3 rwm" >A/devices.allow - -the result:: - - group whitelist entries denied devices - A "c *:3 rwm", "c 1:5 r" all the rest - B "c 1:3 rwm", "c 1:5 r" all the rest - -but now it'll be possible to add new entries to B:: - - # echo "c 2:3 rwm" >B/devices.allow - # echo "c 50:3 r" >B/devices.allow - -or even:: - - # echo "c *:3 rwm" >B/devices.allow - -Allowing or denying all by writing 'a' to devices.allow or devices.deny will -not be possible once the device cgroups has children. - -4.1 Hierarchy (internal implementation) ---------------------------------------- - -device cgroups is implemented internally using a behavior (ALLOW, DENY) and a -list of exceptions. The internal state is controlled using the same user -interface to preserve compatibility with the previous whitelist-only -implementation. Removal or addition of exceptions that will reduce the access -to devices will be propagated down the hierarchy. -For every propagated exception, the effective rules will be re-evaluated based -on current parent's access rules. diff --git a/Documentation/cgroup-v1/freezer-subsystem.rst b/Documentation/cgroup-v1/freezer-subsystem.rst deleted file mode 100644 index 582d3427de3f..000000000000 --- a/Documentation/cgroup-v1/freezer-subsystem.rst +++ /dev/null @@ -1,127 +0,0 @@ -============== -Cgroup Freezer -============== - -The cgroup freezer is useful to batch job management system which start -and stop sets of tasks in order to schedule the resources of a machine -according to the desires of a system administrator. This sort of program -is often used on HPC clusters to schedule access to the cluster as a -whole. 
The cgroup freezer uses cgroups to describe the set of tasks to -be started/stopped by the batch job management system. It also provides -a means to start and stop the tasks composing the job. - -The cgroup freezer will also be useful for checkpointing running groups -of tasks. The freezer allows the checkpoint code to obtain a consistent -image of the tasks by attempting to force the tasks in a cgroup into a -quiescent state. Once the tasks are quiescent another task can -walk /proc or invoke a kernel interface to gather information about the -quiesced tasks. Checkpointed tasks can be restarted later should a -recoverable error occur. This also allows the checkpointed tasks to be -migrated between nodes in a cluster by copying the gathered information -to another node and restarting the tasks there. - -Sequences of SIGSTOP and SIGCONT are not always sufficient for stopping -and resuming tasks in userspace. Both of these signals are observable -from within the tasks we wish to freeze. While SIGSTOP cannot be caught, -blocked, or ignored it can be seen by waiting or ptracing parent tasks. -SIGCONT is especially unsuitable since it can be caught by the task. Any -programs designed to watch for SIGSTOP and SIGCONT could be broken by -attempting to use SIGSTOP and SIGCONT to stop and resume tasks. We can -demonstrate this problem using nested bash shells:: - - $ echo $$ - 16644 - $ bash - $ echo $$ - 16690 - - From a second, unrelated bash shell: - $ kill -SIGSTOP 16690 - $ kill -SIGCONT 16690 - - <at this point 16690 exits and causes 16644 to exit too> - -This happens because bash can observe both signals and choose how it -responds to them. - -Another example of a program which catches and responds to these -signals is gdb. In fact any program designed to use ptrace is likely to -have a problem with this method of stopping and resuming tasks. - -In contrast, the cgroup freezer uses the kernel freezer code to -prevent the freeze/unfreeze cycle from becoming visible to the tasks -being frozen.
This allows the bash example above and gdb to run as -expected. - -The cgroup freezer is hierarchical. Freezing a cgroup freezes all -tasks belonging to the cgroup and all its descendant cgroups. Each -cgroup has its own state (self-state) and the state inherited from the -parent (parent-state). Iff both states are THAWED, the cgroup is -THAWED. - -The following cgroupfs files are created by cgroup freezer. - -* freezer.state: Read-write. - - When read, returns the effective state of the cgroup - "THAWED", - "FREEZING" or "FROZEN". This is the combined self and parent-states. - If any is freezing, the cgroup is freezing (FREEZING or FROZEN). - - FREEZING cgroup transitions into FROZEN state when all tasks - belonging to the cgroup and its descendants become frozen. Note that - a cgroup reverts to FREEZING from FROZEN after a new task is added - to the cgroup or one of its descendant cgroups until the new task is - frozen. - - When written, sets the self-state of the cgroup. Two values are - allowed - "FROZEN" and "THAWED". If FROZEN is written, the cgroup, - if not already freezing, enters FREEZING state along with all its - descendant cgroups. - - If THAWED is written, the self-state of the cgroup is changed to - THAWED. Note that the effective state may not change to THAWED if - the parent-state is still freezing. If a cgroup's effective state - becomes THAWED, all its descendants which are freezing because of - the cgroup also leave the freezing state. - -* freezer.self_freezing: Read only. - - Shows the self-state. 0 if the self-state is THAWED; otherwise, 1. - This value is 1 iff the last write to freezer.state was "FROZEN". - -* freezer.parent_freezing: Read only. - - Shows the parent-state. 0 if none of the cgroup's ancestors is - frozen; otherwise, 1. - -The root cgroup is non-freezable and the above interface files don't -exist. 
- -* Examples of usage:: - - # mkdir /sys/fs/cgroup/freezer - # mount -t cgroup -ofreezer freezer /sys/fs/cgroup/freezer - # mkdir /sys/fs/cgroup/freezer/0 - # echo $some_pid > /sys/fs/cgroup/freezer/0/tasks - -to get status of the freezer subsystem:: - - # cat /sys/fs/cgroup/freezer/0/freezer.state - THAWED - -to freeze all tasks in the container:: - - # echo FROZEN > /sys/fs/cgroup/freezer/0/freezer.state - # cat /sys/fs/cgroup/freezer/0/freezer.state - FREEZING - # cat /sys/fs/cgroup/freezer/0/freezer.state - FROZEN - -to unfreeze all tasks in the container:: - - # echo THAWED > /sys/fs/cgroup/freezer/0/freezer.state - # cat /sys/fs/cgroup/freezer/0/freezer.state - THAWED - -This is the basic mechanism which should do the right thing for user space task -in a simple scenario. diff --git a/Documentation/cgroup-v1/hugetlb.rst b/Documentation/cgroup-v1/hugetlb.rst deleted file mode 100644 index a3902aa253a9..000000000000 --- a/Documentation/cgroup-v1/hugetlb.rst +++ /dev/null @@ -1,50 +0,0 @@ -================== -HugeTLB Controller -================== - -The HugeTLB controller allows to limit the HugeTLB usage per control group and -enforces the controller limit during page fault. Since HugeTLB doesn't -support page reclaim, enforcing the limit at page fault time implies that, -the application will get SIGBUS signal if it tries to access HugeTLB pages -beyond its limit. This requires the application to know beforehand how much -HugeTLB pages it would require for its use. - -HugeTLB controller can be created by first mounting the cgroup filesystem. - -# mount -t cgroup -o hugetlb none /sys/fs/cgroup - -With the above step, the initial or the parent HugeTLB group becomes -visible at /sys/fs/cgroup. At bootup, this group includes all the tasks in -the system. /sys/fs/cgroup/tasks lists the tasks in this cgroup. 
- -New groups can be created under the parent group /sys/fs/cgroup:: - - # cd /sys/fs/cgroup - # mkdir g1 - # echo $$ > g1/tasks - -The above steps create a new group g1 and move the current shell -process (bash) into it. - -Brief summary of control files:: - - hugetlb.<hugepagesize>.limit_in_bytes # set/show limit of "hugepagesize" hugetlb usage - hugetlb.<hugepagesize>.max_usage_in_bytes # show max "hugepagesize" hugetlb usage recorded - hugetlb.<hugepagesize>.usage_in_bytes # show current usage for "hugepagesize" hugetlb - hugetlb.<hugepagesize>.failcnt # show the number of allocation failure due to HugeTLB limit - -For a system supporting three hugepage sizes (64k, 32M and 1G), the control -files include:: - - hugetlb.1GB.limit_in_bytes - hugetlb.1GB.max_usage_in_bytes - hugetlb.1GB.usage_in_bytes - hugetlb.1GB.failcnt - hugetlb.64KB.limit_in_bytes - hugetlb.64KB.max_usage_in_bytes - hugetlb.64KB.usage_in_bytes - hugetlb.64KB.failcnt - hugetlb.32MB.limit_in_bytes - hugetlb.32MB.max_usage_in_bytes - hugetlb.32MB.usage_in_bytes - hugetlb.32MB.failcnt diff --git a/Documentation/cgroup-v1/index.rst b/Documentation/cgroup-v1/index.rst deleted file mode 100644 index fe76d42edc11..000000000000 --- a/Documentation/cgroup-v1/index.rst +++ /dev/null @@ -1,30 +0,0 @@ -:orphan: - -======================== -Control Groups version 1 -======================== - -.. toctree:: - :maxdepth: 1 - - cgroups - - blkio-controller - cpuacct - cpusets - devices - freezer-subsystem - hugetlb - memcg_test - memory - net_cls - net_prio - pids - rdma - -..
only:: subproject and html - - Indices - ======= - - * :ref:`genindex` diff --git a/Documentation/cgroup-v1/memcg_test.rst b/Documentation/cgroup-v1/memcg_test.rst deleted file mode 100644 index 91bd18c6a514..000000000000 --- a/Documentation/cgroup-v1/memcg_test.rst +++ /dev/null @@ -1,355 +0,0 @@ -===================================================== -Memory Resource Controller(Memcg) Implementation Memo -===================================================== - -Last Updated: 2010/2 - -Base Kernel Version: based on 2.6.33-rc7-mm(candidate for 34). - -Because VM is getting complex (one of reasons is memcg...), memcg's behavior -is complex. This is a document for memcg's internal behavior. -Please note that implementation details can be changed. - -(*) Topics on API should be in Documentation/cgroup-v1/memory.rst) - -0. How to record usage ? -======================== - - 2 objects are used. - - page_cgroup ....an object per page. - - Allocated at boot or memory hotplug. Freed at memory hot removal. - - swap_cgroup ... an entry per swp_entry. - - Allocated at swapon(). Freed at swapoff(). - - The page_cgroup has USED bit and double count against a page_cgroup never - occurs. swap_cgroup is used only when a charged page is swapped-out. - -1. Charge -========= - - a page/swp_entry may be charged (usage += PAGE_SIZE) at - - mem_cgroup_try_charge() - -2. Uncharge -=========== - - a page/swp_entry may be uncharged (usage -= PAGE_SIZE) by - - mem_cgroup_uncharge() - Called when a page's refcount goes down to 0. - - mem_cgroup_uncharge_swap() - Called when swp_entry's refcnt goes down to 0. A charge against swap - disappears. - -3. charge-commit-cancel -======================= - - Memcg pages are charged in two steps: - - - mem_cgroup_try_charge() - - mem_cgroup_commit_charge() or mem_cgroup_cancel_charge() - - At try_charge(), there are no flags to say "this page is charged". - at this point, usage += PAGE_SIZE. - - At commit(), the page is associated with the memcg. 
- - At cancel(), simply usage -= PAGE_SIZE. - -Under below explanation, we assume CONFIG_MEM_RES_CTRL_SWAP=y. - -4. Anonymous -============ - - Anonymous page is newly allocated at - - page fault into MAP_ANONYMOUS mapping. - - Copy-On-Write. - - 4.1 Swap-in. - At swap-in, the page is taken from swap-cache. There are 2 cases. - - (a) If the SwapCache is newly allocated and read, it has no charges. - (b) If the SwapCache has been mapped by processes, it has been - charged already. - - 4.2 Swap-out. - At swap-out, typical state transition is below. - - (a) add to swap cache. (marked as SwapCache) - swp_entry's refcnt += 1. - (b) fully unmapped. - swp_entry's refcnt += # of ptes. - (c) write back to swap. - (d) delete from swap cache. (remove from SwapCache) - swp_entry's refcnt -= 1. - - - Finally, at task exit, - (e) zap_pte() is called and swp_entry's refcnt -=1 -> 0. - -5. Page Cache -============= - - Page Cache is charged at - - add_to_page_cache_locked(). - - The logic is very clear. (About migration, see below) - - Note: - __remove_from_page_cache() is called by remove_from_page_cache() - and __remove_mapping(). - -6. Shmem(tmpfs) Page Cache -=========================== - - The best way to understand shmem's page state transition is to read - mm/shmem.c. - - But brief explanation of the behavior of memcg around shmem will be - helpful to understand the logic. - - Shmem's page (just leaf page, not direct/indirect block) can be on - - - radix-tree of shmem's inode. - - SwapCache. - - Both on radix-tree and SwapCache. This happens at swap-in - and swap-out, - - It's charged when... - - - A new page is added to shmem's radix-tree. - - A swp page is read. (move a charge from swap_cgroup to page_cgroup) - -7. Page Migration -================= - - mem_cgroup_migrate() - -8. LRU -====== - Each memcg has its own private LRU. Now, its handling is under global - VM's control (means that it's handled under global pgdat->lru_lock). 
- Almost all routines around memcg's LRU is called by global LRU's - list management functions under pgdat->lru_lock. - - A special function is mem_cgroup_isolate_pages(). This scans - memcg's private LRU and call __isolate_lru_page() to extract a page - from LRU. - - (By __isolate_lru_page(), the page is removed from both of global and - private LRU.) - - -9. Typical Tests. -================= - - Tests for racy cases. - -9.1 Small limit to memcg. -------------------------- - - When you do test to do racy case, it's good test to set memcg's limit - to be very small rather than GB. Many races found in the test under - xKB or xxMB limits. - - (Memory behavior under GB and Memory behavior under MB shows very - different situation.) - -9.2 Shmem ---------- - - Historically, memcg's shmem handling was poor and we saw some amount - of troubles here. This is because shmem is page-cache but can be - SwapCache. Test with shmem/tmpfs is always good test. - -9.3 Migration -------------- - - For NUMA, migration is an another special case. To do easy test, cpuset - is useful. Following is a sample script to do migration:: - - mount -t cgroup -o cpuset none /opt/cpuset - - mkdir /opt/cpuset/01 - echo 1 > /opt/cpuset/01/cpuset.cpus - echo 0 > /opt/cpuset/01/cpuset.mems - echo 1 > /opt/cpuset/01/cpuset.memory_migrate - mkdir /opt/cpuset/02 - echo 1 > /opt/cpuset/02/cpuset.cpus - echo 1 > /opt/cpuset/02/cpuset.mems - echo 1 > /opt/cpuset/02/cpuset.memory_migrate - - In above set, when you moves a task from 01 to 02, page migration to - node 0 to node 1 will occur. Following is a script to migrate all - under cpuset.:: - - -- - move_task() - { - for pid in $1 - do - /bin/echo $pid >$2/tasks 2>/dev/null - echo -n $pid - echo -n " " - done - echo END - } - - G1_TASK=`cat ${G1}/tasks` - G2_TASK=`cat ${G2}/tasks` - move_task "${G1_TASK}" ${G2} & - -- - -9.4 Memory hotplug ------------------- - - memory hotplug test is one of good test. 
- - to offline memory, do following:: - - # echo offline > /sys/devices/system/memory/memoryXXX/state - - (XXX is the place of memory) - - This is an easy way to test page migration, too. - -9.5 mkdir/rmdir ---------------- - - When using hierarchy, mkdir/rmdir test should be done. - Use tests like the following:: - - echo 1 >/opt/cgroup/01/memory/use_hierarchy - mkdir /opt/cgroup/01/child_a - mkdir /opt/cgroup/01/child_b - - set limit to 01. - add limit to 01/child_b - run jobs under child_a and child_b - - create/delete following groups at random while jobs are running:: - - /opt/cgroup/01/child_a/child_aa - /opt/cgroup/01/child_b/child_bb - /opt/cgroup/01/child_c - - running new jobs in new group is also good. - -9.6 Mount with other subsystems -------------------------------- - - Mounting with other subsystems is a good test because there is a - race and lock dependency with other cgroup subsystems. - - example:: - - # mount -t cgroup none /cgroup -o cpuset,memory,cpu,devices - - and do task move, mkdir, rmdir etc...under this. - -9.7 swapoff ------------ - - Besides management of swap is one of complicated parts of memcg, - call path of swap-in at swapoff is not same as usual swap-in path.. - It's worth to be tested explicitly. - - For example, test like following is good: - - (Shell-A):: - - # mount -t cgroup none /cgroup -o memory - # mkdir /cgroup/test - # echo 40M > /cgroup/test/memory.limit_in_bytes - # echo 0 > /cgroup/test/tasks - - Run malloc(100M) program under this. You'll see 60M of swaps. - - (Shell-B):: - - # move all tasks in /cgroup/test to /cgroup - # /sbin/swapoff -a - # rmdir /cgroup/test - # kill malloc task. - - Of course, tmpfs v.s. swapoff test should be tested, too. - -9.8 OOM-Killer --------------- - - Out-of-memory caused by memcg's limit will kill tasks under - the memcg. When hierarchy is used, a task under hierarchy - will be killed by the kernel. 
- - In this case, panic_on_oom shouldn't be invoked and tasks - in other groups shouldn't be killed. - - It's not difficult to cause OOM under memcg as following. - - Case A) when you can swapoff:: - - #swapoff -a - #echo 50M > /memory.limit_in_bytes - - run 51M of malloc - - Case B) when you use mem+swap limitation:: - - #echo 50M > memory.limit_in_bytes - #echo 50M > memory.memsw.limit_in_bytes - - run 51M of malloc - -9.9 Move charges at task migration ----------------------------------- - - Charges associated with a task can be moved along with task migration. - - (Shell-A):: - - #mkdir /cgroup/A - #echo $$ >/cgroup/A/tasks - - run some programs which uses some amount of memory in /cgroup/A. - - (Shell-B):: - - #mkdir /cgroup/B - #echo 1 >/cgroup/B/memory.move_charge_at_immigrate - #echo "pid of the program running in group A" >/cgroup/B/tasks - - You can see charges have been moved by reading ``*.usage_in_bytes`` or - memory.stat of both A and B. - - See 8.2 of Documentation/cgroup-v1/memory.rst to see what value should - be written to move_charge_at_immigrate. - -9.10 Memory thresholds ----------------------- - - Memory controller implements memory thresholds using cgroups notification - API. You can use tools/cgroup/cgroup_event_listener.c to test it. - - (Shell-A) Create cgroup and run event listener:: - - # mkdir /cgroup/A - # ./cgroup_event_listener /cgroup/A/memory.usage_in_bytes 5M - - (Shell-B) Add task to cgroup and try to allocate and free memory:: - - # echo $$ >/cgroup/A/tasks - # a="$(dd if=/dev/zero bs=1M count=10)" - # a= - - You will see message from cgroup_event_listener every time you cross - the thresholds. - - Use /cgroup/A/memory.memsw.usage_in_bytes to test memsw thresholds. - - It's good idea to test root cgroup as well. 
diff --git a/Documentation/cgroup-v1/memory.rst b/Documentation/cgroup-v1/memory.rst deleted file mode 100644 index 41bdc038dad9..000000000000 --- a/Documentation/cgroup-v1/memory.rst +++ /dev/null @@ -1,1003 +0,0 @@ -========================== -Memory Resource Controller -========================== - -NOTE: - This document is hopelessly outdated and it asks for a complete - rewrite. It still contains a useful information so we are keeping it - here but make sure to check the current code if you need a deeper - understanding. - -NOTE: - The Memory Resource Controller has generically been referred to as the - memory controller in this document. Do not confuse memory controller - used here with the memory controller that is used in hardware. - -(For editors) In this document: - When we mention a cgroup (cgroupfs's directory) with memory controller, - we call it "memory cgroup". When you see git-log and source code, you'll - see patch's title and function names tend to use "memcg". - In this document, we avoid using it. - -Benefits and Purpose of the memory controller -============================================= - -The memory controller isolates the memory behaviour of a group of tasks -from the rest of the system. The article on LWN [12] mentions some probable -uses of the memory controller. The memory controller can be used to - -a. Isolate an application or a group of applications - Memory-hungry applications can be isolated and limited to a smaller - amount of memory. -b. Create a cgroup with a limited amount of memory; this can be used - as a good alternative to booting with mem=XXXX. -c. Virtualization solutions can control the amount of memory they want - to assign to a virtual machine instance. -d. A CD/DVD burner could control the amount of memory used by the - rest of the system to ensure that burning does not fail due to lack - of available memory. -e. 
There are several other use cases; find one or use the controller just - for fun (to learn and hack on the VM subsystem). - -Current Status: linux-2.6.34-mmotm(development version of 2010/April) - -Features: - - - accounting anonymous pages, file caches, swap caches usage and limiting them. - - pages are linked to per-memcg LRU exclusively, and there is no global LRU. - - optionally, memory+swap usage can be accounted and limited. - - hierarchical accounting - - soft limit - - moving (recharging) account at moving a task is selectable. - - usage threshold notifier - - memory pressure notifier - - oom-killer disable knob and oom-notifier - - Root cgroup has no limit controls. - - Kernel memory support is a work in progress, and the current version provides - basically functionality. (See Section 2.7) - -Brief summary of control files. - -==================================== ========================================== - tasks attach a task(thread) and show list of - threads - cgroup.procs show list of processes - cgroup.event_control an interface for event_fd() - memory.usage_in_bytes show current usage for memory - (See 5.5 for details) - memory.memsw.usage_in_bytes show current usage for memory+Swap - (See 5.5 for details) - memory.limit_in_bytes set/show limit of memory usage - memory.memsw.limit_in_bytes set/show limit of memory+Swap usage - memory.failcnt show the number of memory usage hits limits - memory.memsw.failcnt show the number of memory+Swap hits limits - memory.max_usage_in_bytes show max memory usage recorded - memory.memsw.max_usage_in_bytes show max memory+Swap usage recorded - memory.soft_limit_in_bytes set/show soft limit of memory usage - memory.stat show various statistics - memory.use_hierarchy set/show hierarchical account enabled - memory.force_empty trigger forced page reclaim - memory.pressure_level set memory pressure notifications - memory.swappiness set/show swappiness parameter of vmscan - (See sysctl's vm.swappiness) - 
memory.move_charge_at_immigrate set/show controls of moving charges - memory.oom_control set/show oom controls. - memory.numa_stat show the number of memory usage per numa - node - - memory.kmem.limit_in_bytes set/show hard limit for kernel memory - memory.kmem.usage_in_bytes show current kernel memory allocation - memory.kmem.failcnt show the number of kernel memory usage - hits limits - memory.kmem.max_usage_in_bytes show max kernel memory usage recorded - - memory.kmem.tcp.limit_in_bytes set/show hard limit for tcp buf memory - memory.kmem.tcp.usage_in_bytes show current tcp buf memory allocation - memory.kmem.tcp.failcnt show the number of tcp buf memory usage - hits limits - memory.kmem.tcp.max_usage_in_bytes show max tcp buf memory usage recorded -==================================== ========================================== - -1. History -========== - -The memory controller has a long history. A request for comments for the memory -controller was posted by Balbir Singh [1]. At the time the RFC was posted -there were several implementations for memory control. The goal of the -RFC was to build consensus and agreement for the minimal features required -for memory control. The first RSS controller was posted by Balbir Singh[2] -in Feb 2007. Pavel Emelianov [3][4][5] has since posted three versions of the -RSS controller. At OLS, at the resource management BoF, everyone suggested -that we handle both page cache and RSS together. Another request was raised -to allow user space handling of OOM. The current memory controller is -at version 6; it combines both mapped (RSS) and unmapped Page -Cache Control [11]. - -2. Memory Control -================= - -Memory is a unique resource in the sense that it is present in a limited -amount. If a task requires a lot of CPU processing, the task can spread -its processing over a period of hours, days, months or years, but with -memory, the same physical memory needs to be reused to accomplish the task. 
- -The memory controller implementation has been divided into phases. These -are: - -1. Memory controller -2. mlock(2) controller -3. Kernel user memory accounting and slab control -4. user mappings length controller - -The memory controller is the first controller developed. - -2.1. Design ------------ - -The core of the design is a counter called the page_counter. The -page_counter tracks the current memory usage and limit of the group of -processes associated with the controller. Each cgroup has a memory controller -specific data structure (mem_cgroup) associated with it. - -2.2. Accounting ---------------- - -:: - - +--------------------+ - | mem_cgroup | - | (page_counter) | - +--------------------+ - / ^ \ - / | \ - +---------------+ | +---------------+ - | mm_struct | |.... | mm_struct | - | | | | | - +---------------+ | +---------------+ - | - + --------------+ - | - +---------------+ +------+--------+ - | page +----------> page_cgroup| - | | | | - +---------------+ +---------------+ - - (Figure 1: Hierarchy of Accounting) - - -Figure 1 shows the important aspects of the controller - -1. Accounting happens per cgroup -2. Each mm_struct knows about which cgroup it belongs to -3. Each page has a pointer to the page_cgroup, which in turn knows the - cgroup it belongs to - -The accounting is done as follows: mem_cgroup_charge_common() is invoked to -set up the necessary data structures and check if the cgroup that is being -charged is over its limit. If it is, then reclaim is invoked on the cgroup. -More details can be found in the reclaim section of this document. -If everything goes well, a page meta-data-structure called page_cgroup is -updated. page_cgroup has its own LRU on cgroup. -(*) page_cgroup structure is allocated at boot/memory-hotplug time. - -2.2.1 Accounting details ------------------------- - -All mapped anon pages (RSS) and cache pages (Page Cache) are accounted. 
-Some pages which are never reclaimable and will not be on the LRU -are not accounted. We just account pages under usual VM management. - -RSS pages are accounted at page_fault unless they've already been accounted -for earlier. A file page will be accounted for as Page Cache when it's -inserted into inode (radix-tree). While it's mapped into the page tables of -processes, duplicate accounting is carefully avoided. - -An RSS page is unaccounted when it's fully unmapped. A PageCache page is -unaccounted when it's removed from radix-tree. Even if RSS pages are fully -unmapped (by kswapd), they may exist as SwapCache in the system until they -are really freed. Such SwapCaches are also accounted. -A swapped-in page is not accounted until it's mapped. - -Note: The kernel does swapin-readahead and reads multiple swaps at once. -This means swapped-in pages may contain pages for other tasks than a task -causing page fault. So, we avoid accounting at swap-in I/O. - -At page migration, accounting information is kept. - -Note: we just account pages-on-LRU because our purpose is to control amount -of used pages; not-on-LRU pages tend to be out-of-control from VM view. - -2.3 Shared Page Accounting --------------------------- - -Shared pages are accounted on the basis of the first touch approach. The -cgroup that first touches a page is accounted for the page. The principle -behind this approach is that a cgroup that aggressively uses a shared -page will eventually get charged for it (once it is uncharged from -the cgroup that brought it in -- this will happen on memory pressure). - -But see section 8.2: when moving a task to another cgroup, its pages may -be recharged to the new cgroup, if move_charge_at_immigrate has been chosen. - -Exception: If CONFIG_MEMCG_SWAP is not used. -When you do swapoff and make swapped-out pages of shmem(tmpfs) to -be backed into memory in force, charges for pages are accounted against the -caller of swapoff rather than the users of shmem. 
- -2.4 Swap Extension (CONFIG_MEMCG_SWAP) --------------------------------------- - -Swap Extension allows you to record charge for swap. A swapped-in page is -charged back to original page allocator if possible. - -When swap is accounted, following files are added. - - - memory.memsw.usage_in_bytes. - - memory.memsw.limit_in_bytes. - -memsw means memory+swap. Usage of memory+swap is limited by -memsw.limit_in_bytes. - -Example: Assume a system with 4G of swap. A task which allocates 6G of memory -(by mistake) under 2G memory limitation will use all swap. -In this case, setting memsw.limit_in_bytes=3G will prevent bad use of swap. -By using the memsw limit, you can avoid system OOM which can be caused by swap -shortage. - -**why 'memory+swap' rather than swap** - -The global LRU(kswapd) can swap out arbitrary pages. Swap-out means -to move account from memory to swap...there is no change in usage of -memory+swap. In other words, when we want to limit the usage of swap without -affecting global LRU, memory+swap limit is better than just limiting swap from -an OS point of view. - -**What happens when a cgroup hits memory.memsw.limit_in_bytes** - -When a cgroup hits memory.memsw.limit_in_bytes, it's useless to do swap-out -in this cgroup. Then, swap-out will not be done by cgroup routine and file -caches are dropped. But as mentioned above, global LRU can do swapout memory -from it for sanity of the system's memory management state. You can't forbid -it by cgroup. - -2.5 Reclaim ------------ - -Each cgroup maintains a per cgroup LRU which has the same structure as -global VM. When a cgroup goes over its limit, we first try -to reclaim memory from the cgroup so as to make space for the new -pages that the cgroup has touched. If the reclaim is unsuccessful, -an OOM routine is invoked to select and kill the bulkiest task in the -cgroup. (See 10. OOM Control below.) 
- -The reclaim algorithm has not been modified for cgroups, except that -pages that are selected for reclaiming come from the per-cgroup LRU -list. - -NOTE: - Reclaim does not work for the root cgroup, since we cannot set any - limits on the root cgroup. - -Note2: - When panic_on_oom is set to "2", the whole system will panic. - -When oom event notifier is registered, event will be delivered. -(See oom_control section) - -2.6 Locking ------------ - - lock_page_cgroup()/unlock_page_cgroup() should not be called under - the i_pages lock. - - Other lock order is following: - - PG_locked. - mm->page_table_lock - pgdat->lru_lock - lock_page_cgroup. - - In many cases, just lock_page_cgroup() is called. - - per-zone-per-cgroup LRU (cgroup's private LRU) is just guarded by - pgdat->lru_lock, it has no lock of its own. - -2.7 Kernel Memory Extension (CONFIG_MEMCG_KMEM) ------------------------------------------------ - -With the Kernel memory extension, the Memory Controller is able to limit -the amount of kernel memory used by the system. Kernel memory is fundamentally -different than user memory, since it can't be swapped out, which makes it -possible to DoS the system by consuming too much of this precious resource. - -Kernel memory accounting is enabled for all memory cgroups by default. But -it can be disabled system-wide by passing cgroup.memory=nokmem to the kernel -at boot time. In this case, kernel memory will not be accounted at all. - -Kernel memory limits are not imposed for the root cgroup. Usage for the root -cgroup may or may not be accounted. The memory used is accumulated into -memory.kmem.usage_in_bytes, or in a separate counter when it makes sense. -(currently only for tcp). - -The main "kmem" counter is fed into the main counter, so kmem charges will -also be visible from the user counter. - -Currently no soft limit is implemented for kernel memory. It is future work -to trigger slab reclaim when those limits are reached. 
- -2.7.1 Current Kernel Memory resources accounted ------------------------------------------------ - -stack pages: - every process consumes some stack pages. By accounting into - kernel memory, we prevent new processes from being created when the kernel - memory usage is too high. - -slab pages: - pages allocated by the SLAB or SLUB allocator are tracked. A copy - of each kmem_cache is created every time the cache is touched by the first time - from inside the memcg. The creation is done lazily, so some objects can still be - skipped while the cache is being created. All objects in a slab page should - belong to the same memcg. This only fails to hold when a task is migrated to a - different memcg during the page allocation by the cache. - -sockets memory pressure: - some sockets protocols have memory pressure - thresholds. The Memory Controller allows them to be controlled individually - per cgroup, instead of globally. - -tcp memory pressure: - sockets memory pressure for the tcp protocol. - -2.7.2 Common use cases ----------------------- - -Because the "kmem" counter is fed to the main user counter, kernel memory can -never be limited completely independently of user memory. Say "U" is the user -limit, and "K" the kernel limit. There are three possible ways limits can be -set: - -U != 0, K = unlimited: - This is the standard memcg limitation mechanism already present before kmem - accounting. Kernel memory is completely ignored. - -U != 0, K < U: - Kernel memory is a subset of the user memory. This setup is useful in - deployments where the total amount of memory per-cgroup is overcommited. - Overcommiting kernel memory limits is definitely not recommended, since the - box can still run out of non-reclaimable memory. - In this case, the admin could set up K so that the sum of all groups is - never greater than the total memory, and freely set U at the cost of his - QoS. 
- -WARNING: - In the current implementation, memory reclaim will NOT be - triggered for a cgroup when it hits K while staying below U, which makes - this setup impractical. - -U != 0, K >= U: - Since kmem charges will also be fed to the user counter and reclaim will be - triggered for the cgroup for both kinds of memory. This setup gives the - admin a unified view of memory, and it is also useful for people who just - want to track kernel memory usage. - -3. User Interface -================= - -3.0. Configuration ------------------- - -a. Enable CONFIG_CGROUPS -b. Enable CONFIG_MEMCG -c. Enable CONFIG_MEMCG_SWAP (to use swap extension) -d. Enable CONFIG_MEMCG_KMEM (to use kmem extension) - -3.1. Prepare the cgroups (see cgroups.txt, Why are cgroups needed?) -------------------------------------------------------------------- - -:: - - # mount -t tmpfs none /sys/fs/cgroup - # mkdir /sys/fs/cgroup/memory - # mount -t cgroup none /sys/fs/cgroup/memory -o memory - -3.2. Make the new group and move bash into it:: - - # mkdir /sys/fs/cgroup/memory/0 - # echo $$ > /sys/fs/cgroup/memory/0/tasks - -Since now we're in the 0 cgroup, we can alter the memory limit:: - - # echo 4M > /sys/fs/cgroup/memory/0/memory.limit_in_bytes - -NOTE: - We can use a suffix (k, K, m, M, g or G) to indicate values in kilo, - mega or gigabytes. (Here, Kilo, Mega, Giga are Kibibytes, Mebibytes, - Gibibytes.) - -NOTE: - We can write "-1" to reset the ``*.limit_in_bytes(unlimited)``. - -NOTE: - We cannot set limits on the root cgroup any more. - -:: - - # cat /sys/fs/cgroup/memory/0/memory.limit_in_bytes - 4194304 - -We can check the usage:: - - # cat /sys/fs/cgroup/memory/0/memory.usage_in_bytes - 1216512 - -A successful write to this file does not guarantee a successful setting of -this limit to the value written into the file. This can be due to a -number of factors, such as rounding up to page boundaries or the total -availability of memory on the system. 
The user is required to re-read -this file after a write to guarantee the value committed by the kernel:: - - # echo 1 > memory.limit_in_bytes - # cat memory.limit_in_bytes - 4096 - -The memory.failcnt field gives the number of times that the cgroup limit was -exceeded. - -The memory.stat file gives accounting information. Now, the number of -caches, RSS and Active pages/Inactive pages are shown. - -4. Testing -========== - -For testing features and implementation, see memcg_test.txt. - -Performance test is also important. To see pure memory controller's overhead, -testing on tmpfs will give you good numbers of small overheads. -Example: do kernel make on tmpfs. - -Page-fault scalability is also important. At measuring parallel -page fault test, multi-process test may be better than multi-thread -test because it has noise of shared objects/status. - -But the above two are testing extreme situations. -Trying usual test under memory controller is always helpful. - -4.1 Troubleshooting -------------------- - -Sometimes a user might find that the application under a cgroup is -terminated by the OOM killer. There are several causes for this: - -1. The cgroup limit is too low (just too low to do anything useful) -2. The user is using anonymous memory and swap is turned off or too low - -A sync followed by echo 1 > /proc/sys/vm/drop_caches will help get rid of -some of the pages cached in the cgroup (page cache pages). - -To know what happens, disabling OOM_Kill as per "10. OOM Control" (below) and -seeing what happens will be helpful. - -4.2 Task migration ------------------- - -When a task migrates from one cgroup to another, its charge is not -carried forward by default. The pages allocated from the original cgroup still -remain charged to it, the charge is dropped when the page is freed or -reclaimed. - -You can move charges of a task along with task migration. -See 8. 
"Move charges at task migration" - -4.3 Removing a cgroup ---------------------- - -A cgroup can be removed by rmdir, but as discussed in sections 4.1 and 4.2, a -cgroup might have some charge associated with it, even though all -tasks have migrated away from it. (because we charge against pages, not -against tasks.) - -We move the stats to root (if use_hierarchy==0) or parent (if -use_hierarchy==1), and no change on the charge except uncharging -from the child. - -Charges recorded in swap information is not updated at removal of cgroup. -Recorded information is discarded and a cgroup which uses swap (swapcache) -will be charged as a new owner of it. - -About use_hierarchy, see Section 6. - -5. Misc. interfaces -=================== - -5.1 force_empty ---------------- - memory.force_empty interface is provided to make cgroup's memory usage empty. - When writing anything to this:: - - # echo 0 > memory.force_empty - - the cgroup will be reclaimed and as many pages reclaimed as possible. - - The typical use case for this interface is before calling rmdir(). - Though rmdir() offlines memcg, but the memcg may still stay there due to - charged file caches. Some out-of-use page caches may keep charged until - memory pressure happens. If you want to avoid that, force_empty will be useful. - - Also, note that when memory.kmem.limit_in_bytes is set the charges due to - kernel pages will still be seen. This is not considered a failure and the - write will still return success. In this case, it is expected that - memory.kmem.usage_in_bytes == memory.usage_in_bytes. - - About use_hierarchy, see Section 6. - -5.2 stat file -------------- - -memory.stat file includes following statistics - -per-memory cgroup local status -^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ - -=============== =============================================================== -cache # of bytes of page cache memory. -rss # of bytes of anonymous and swap cache memory (includes - transparent hugepages). 
-rss_huge # of bytes of anonymous transparent hugepages. -mapped_file # of bytes of mapped file (includes tmpfs/shmem) -pgpgin # of charging events to the memory cgroup. The charging - event happens each time a page is accounted as either mapped - anon page(RSS) or cache page(Page Cache) to the cgroup. -pgpgout # of uncharging events to the memory cgroup. The uncharging - event happens each time a page is unaccounted from the cgroup. -swap # of bytes of swap usage -dirty # of bytes that are waiting to get written back to the disk. -writeback # of bytes of file/anon cache that are queued for syncing to - disk. -inactive_anon # of bytes of anonymous and swap cache memory on inactive - LRU list. -active_anon # of bytes of anonymous and swap cache memory on active - LRU list. -inactive_file # of bytes of file-backed memory on inactive LRU list. -active_file # of bytes of file-backed memory on active LRU list. -unevictable # of bytes of memory that cannot be reclaimed (mlocked etc). -=============== =============================================================== - -status considering hierarchy (see memory.use_hierarchy settings) -^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ - -========================= =================================================== -hierarchical_memory_limit # of bytes of memory limit with regard to hierarchy - under which the memory cgroup is -hierarchical_memsw_limit # of bytes of memory+swap limit with regard to - hierarchy under which memory cgroup is. - -total_<counter> # hierarchical version of <counter>, which in - addition to the cgroup's own value includes the - sum of all hierarchical children's values of - <counter>, i.e. 
total_cache -========================= =================================================== - -The following additional stats are dependent on CONFIG_DEBUG_VM -^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ - -========================= ======================================== -recent_rotated_anon VM internal parameter. (see mm/vmscan.c) -recent_rotated_file VM internal parameter. (see mm/vmscan.c) -recent_scanned_anon VM internal parameter. (see mm/vmscan.c) -recent_scanned_file VM internal parameter. (see mm/vmscan.c) -========================= ======================================== - -Memo: - recent_rotated means recent frequency of LRU rotation. - recent_scanned means recent # of scans to LRU. - showing for better debug please see the code for meanings. - -Note: - Only anonymous and swap cache memory is listed as part of 'rss' stat. - This should not be confused with the true 'resident set size' or the - amount of physical memory used by the cgroup. - - 'rss + mapped_file" will give you resident set size of cgroup. - - (Note: file and shmem may be shared among other cgroups. In that case, - mapped_file is accounted only when the memory cgroup is owner of page - cache.) - -5.3 swappiness --------------- - -Overrides /proc/sys/vm/swappiness for the particular group. The tunable -in the root cgroup corresponds to the global swappiness setting. - -Please note that unlike during the global reclaim, limit reclaim -enforces that 0 swappiness really prevents from any swapping even if -there is a swap storage available. This might lead to memcg OOM killer -if there are no file pages to reclaim. - -5.4 failcnt ------------ - -A memory cgroup provides memory.failcnt and memory.memsw.failcnt files. -This failcnt(== failure count) shows the number of times that a usage counter -hit its limit. When a memory cgroup hits a limit, failcnt increases and -memory under it will be reclaimed. 
- -You can reset failcnt by writing 0 to failcnt file:: - - # echo 0 > .../memory.failcnt - -5.5 usage_in_bytes ------------------- - -For efficiency, as other kernel components, memory cgroup uses some optimization -to avoid unnecessary cacheline false sharing. usage_in_bytes is affected by the -method and doesn't show 'exact' value of memory (and swap) usage, it's a fuzz -value for efficient access. (Of course, when necessary, it's synchronized.) -If you want to know more exact memory usage, you should use RSS+CACHE(+SWAP) -value in memory.stat(see 5.2). - -5.6 numa_stat -------------- - -This is similar to numa_maps but operates on a per-memcg basis. This is -useful for providing visibility into the numa locality information within -an memcg since the pages are allowed to be allocated from any physical -node. One of the use cases is evaluating application performance by -combining this information with the application's CPU allocation. - -Each memcg's numa_stat file includes "total", "file", "anon" and "unevictable" -per-node page counts including "hierarchical_<counter>" which sums up all -hierarchical children's values in addition to the memcg's own value. - -The output format of memory.numa_stat is:: - - total=<total pages> N0=<node 0 pages> N1=<node 1 pages> ... - file=<total file pages> N0=<node 0 pages> N1=<node 1 pages> ... - anon=<total anon pages> N0=<node 0 pages> N1=<node 1 pages> ... - unevictable=<total unevictable pages> N0=<node 0 pages> N1=<node 1 pages> ... - hierarchical_<counter>=<counter pages> N0=<node 0 pages> N1=<node 1 pages> ... - -The "total" count is sum of file + anon + unevictable. - -6. Hierarchy support -==================== - -The memory controller supports a deep hierarchy and hierarchical accounting. -The hierarchy is created by creating the appropriate cgroups in the -cgroup filesystem. Consider for example, the following cgroup filesystem -hierarchy:: - - root - / | \ - / | \ - a b c - | \ - | \ - d e - -In the diagram above, with hierarchical accounting enabled, all memory -usage of e, is accounted to its ancestors up until the root (i.e, c and root), -that has memory.use_hierarchy enabled.
If one of the ancestors goes over its -limit, the reclaim algorithm reclaims from the tasks in the ancestor and the -children of the ancestor. - -6.1 Enabling hierarchical accounting and reclaim ------------------------------------------------- - -A memory cgroup by default disables the hierarchy feature. Support -can be enabled by writing 1 to memory.use_hierarchy file of the root cgroup:: - - # echo 1 > memory.use_hierarchy - -The feature can be disabled by:: - - # echo 0 > memory.use_hierarchy - -NOTE1: - Enabling/disabling will fail if either the cgroup already has other - cgroups created below it, or if the parent cgroup has use_hierarchy - enabled. - -NOTE2: - When panic_on_oom is set to "2", the whole system will panic in - case of an OOM event in any cgroup. - -7. Soft limits -============== - -Soft limits allow for greater sharing of memory. The idea behind soft limits -is to allow control groups to use as much of the memory as needed, provided - -a. There is no memory contention -b. They do not exceed their hard limit - -When the system detects memory contention or low memory, control groups -are pushed back to their soft limits. If the soft limit of each control -group is very high, they are pushed back as much as possible to make -sure that one control group does not starve the others of memory. - -Please note that soft limits is a best-effort feature; it comes with -no guarantees, but it does its best to make sure that when memory is -heavily contended for, memory is allocated based on the soft limit -hints/setup. Currently soft limit based reclaim is set up such that -it gets invoked from balance_pgdat (kswapd). 
- -7.1 Interface -------------- - -Soft limits can be setup by using the following commands (in this example we -assume a soft limit of 256 MiB):: - - # echo 256M > memory.soft_limit_in_bytes - -If we want to change this to 1G, we can at any time use:: - - # echo 1G > memory.soft_limit_in_bytes - -NOTE1: - Soft limits take effect over a long period of time, since they involve - reclaiming memory for balancing between memory cgroups -NOTE2: - It is recommended to set the soft limit always below the hard limit, - otherwise the hard limit will take precedence. - -8. Move charges at task migration -================================= - -Users can move charges associated with a task along with task migration, that -is, uncharge task's pages from the old cgroup and charge them to the new cgroup. -This feature is not supported in !CONFIG_MMU environments because of lack of -page tables. - -8.1 Interface -------------- - -This feature is disabled by default. It can be enabled (and disabled again) by -writing to memory.move_charge_at_immigrate of the destination cgroup. - -If you want to enable it:: - - # echo (some positive value) > memory.move_charge_at_immigrate - -Note: - Each bit of move_charge_at_immigrate has its own meaning about what type - of charges should be moved. See 8.2 for details. -Note: - Charges are moved only when you move mm->owner, in other words, - a leader of a thread group. -Note: - If we cannot find enough space for the task in the destination cgroup, we - try to make space by reclaiming memory. Task migration may fail if we - cannot make enough space. -Note: - It can take several seconds if you move many charges. - -And if you want to disable it again:: - - # echo 0 > memory.move_charge_at_immigrate - -8.2 Type of charges which can be moved --------------------------------------- - -Each bit in move_charge_at_immigrate has its own meaning about what type of -charges should be moved.
But in any case, it must be noted that an account of -a page or a swap can be moved only when it is charged to the task's current -(old) memory cgroup. - -+---+--------------------------------------------------------------------------+ -|bit| what type of charges would be moved ? | -+===+==========================================================================+ -| 0 | A charge of an anonymous page (or swap of it) used by the target task. | -| | You must enable Swap Extension (see 2.4) to enable move of swap charges. | -+---+--------------------------------------------------------------------------+ -| 1 | A charge of file pages (normal file, tmpfs file (e.g. ipc shared memory) | -| | and swaps of tmpfs file) mmapped by the target task. Unlike the case of | -| | anonymous pages, file pages (and swaps) in the range mmapped by the task | -| | will be moved even if the task hasn't done page fault, i.e. they might | -| | not be the task's "RSS", but other task's "RSS" that maps the same file. | -| | And mapcount of the page is ignored (the page can be moved even if | -| | page_mapcount(page) > 1). You must enable Swap Extension (see 2.4) to | -| | enable move of swap charges. | -+---+--------------------------------------------------------------------------+ - -8.3 TODO --------- - -- All of moving charge operations are done under cgroup_mutex. It's not good - behavior to hold the mutex too long, so we may need some trick. - -9. Memory thresholds -==================== - -Memory cgroup implements memory thresholds using the cgroups notification -API (see cgroups.txt). It allows to register multiple memory and memsw -thresholds and gets notifications when it crosses. - -To register a threshold, an application must: - -- create an eventfd using eventfd(2); -- open memory.usage_in_bytes or memory.memsw.usage_in_bytes; -- write string like " " to - cgroup.event_control. - -Application will be notified through eventfd when memory usage crosses -threshold in any direction. 
- -It's applicable for root and non-root cgroup. - -10. OOM Control -=============== - -memory.oom_control file is for OOM notification and other controls. - -Memory cgroup implements OOM notifier using the cgroup notification -API (See cgroups.txt). It allows to register multiple OOM notification -delivery and gets notification when OOM happens. - -To register a notifier, an application must: - - - create an eventfd using eventfd(2) - - open memory.oom_control file - - write string like "<event_fd> <fd of memory.oom_control>" to - cgroup.event_control - -The application will be notified through eventfd when OOM happens. -OOM notification doesn't work for the root cgroup. - -You can disable the OOM-killer by writing "1" to memory.oom_control file, as: - - #echo 1 > memory.oom_control - -If OOM-killer is disabled, tasks under cgroup will hang/sleep -in memory cgroup's OOM-waitqueue when they request accountable memory. - -For running them, you have to relax the memory cgroup's OOM status by - - * enlarge limit or reduce usage. - -To reduce usage, - - * kill some tasks. - * move some tasks to other group with account migration. - * remove some files (on tmpfs?) - -Then, stopped tasks will work again. - -At reading, current status of OOM is shown. - - - oom_kill_disable 0 or 1 - (if 1, oom-killer is disabled) - - under_oom 0 or 1 - (if 1, the memory cgroup is under OOM, tasks may be stopped.) - -11. Memory Pressure -=================== - -The pressure level notifications can be used to monitor the memory -allocation cost; based on the pressure, applications can implement -different strategies of managing their memory resources. The pressure -levels are defined as follows: - -The "low" level means that the system is reclaiming memory for new -allocations. Monitoring this reclaiming activity might be useful for -maintaining cache level. Upon notification, the program (typically -"Activity Manager") might analyze vmstat and act in advance (i.e. -prematurely shutdown unimportant services).
- -The "medium" level means that the system is experiencing medium memory -pressure, the system might be making swap, paging out active file caches, -etc. Upon this event applications may decide to further analyze -vmstat/zoneinfo/memcg or internal memory usage statistics and free any -resources that can be easily reconstructed or re-read from a disk. - -The "critical" level means that the system is actively thrashing, it is -about to run out of memory (OOM) or even the in-kernel OOM killer is on its -way to trigger. Applications should do whatever they can to help the -system. It might be too late to consult with vmstat or any other -statistics, so it's advisable to take an immediate action. - -By default, events are propagated upward until the event is handled, i.e. the -events are not pass-through. For example, you have three cgroups: A->B->C. Now -you set up an event listener on cgroups A, B and C, and suppose group C -experiences some pressure. In this situation, only group C will receive the -notification, i.e. groups A and B will not receive it. This is done to avoid -excessive "broadcasting" of messages, which disturbs the system and which is -especially bad if we are low on memory or thrashing. Group B will receive -notification only if there are no event listeners for group C. - -There are three optional modes that specify different propagation behavior: - - - "default": this is the default behavior specified above. This mode is the - same as omitting the optional mode parameter, preserved for backwards - compatibility. - - - "hierarchy": events always propagate up to the root, similar to the default - behavior, except that propagation continues regardless of whether there are - event listeners at each level, with the "hierarchy" mode. In the above - example, groups A, B, and C will receive notification of memory pressure. - - - "local": events are pass-through, i.e.
they only receive notifications when - memory pressure is experienced in the memcg for which the notification is - registered. In the above example, group C will receive notification if - registered for "local" notification and the group experiences memory - pressure. However, group B will never receive notification, regardless if - there is an event listener for group C or not, if group B is registered for - local notification. - -The level and event notification mode ("hierarchy" or "local", if necessary) are -specified by a comma-delimited string, i.e. "low,hierarchy" specifies -hierarchical, pass-through, notification for all ancestor memcgs. Notification -that is the default, non pass-through behavior, does not specify a mode. -"medium,local" specifies pass-through notification for the medium level. - -The file memory.pressure_level is only used to setup an eventfd. To -register a notification, an application must: - -- create an eventfd using eventfd(2); -- open memory.pressure_level; -- write string as "<event_fd> <fd of memory.pressure_level> <level[,mode]>" - to cgroup.event_control. - -Application will be notified through eventfd when memory pressure is at -the specific level (or higher). Read/write operations to -memory.pressure_level are not implemented. - -Test: - - Here is a small script example that makes a new cgroup, sets up a - memory limit, sets up a notification in the cgroup and then makes child - cgroup experience a critical pressure:: - - # cd /sys/fs/cgroup/memory/ - # mkdir foo - # cd foo - # cgroup_event_listener memory.pressure_level low,hierarchy & - # echo 8000000 > memory.limit_in_bytes - # echo 8000000 > memory.memsw.limit_in_bytes - # echo $$ > tasks - # dd if=/dev/zero | read x - - (Expect a bunch of notifications, and eventually, the oom-killer will - trigger.) - -12. TODO -======== - -1. Make per-cgroup scanner reclaim not-shared pages first -2. Teach controller to account for shared-pages -3.
Start reclamation in the background when the limit is - not yet hit but the usage is getting closer - -Summary -======= - -Overall, the memory controller has been a stable controller and has been -commented and discussed quite extensively in the community. - -References -========== - -1. Singh, Balbir. RFC: Memory Controller, http://lwn.net/Articles/206697/ -2. Singh, Balbir. Memory Controller (RSS Control), - http://lwn.net/Articles/222762/ -3. Emelianov, Pavel. Resource controllers based on process cgroups - http://lkml.org/lkml/2007/3/6/198 -4. Emelianov, Pavel. RSS controller based on process cgroups (v2) - http://lkml.org/lkml/2007/4/9/78 -5. Emelianov, Pavel. RSS controller based on process cgroups (v3) - http://lkml.org/lkml/2007/5/30/244 -6. Menage, Paul. Control Groups v10, http://lwn.net/Articles/236032/ -7. Vaidyanathan, Srinivasan, Control Groups: Pagecache accounting and control - subsystem (v3), http://lwn.net/Articles/235534/ -8. Singh, Balbir. RSS controller v2 test results (lmbench), - http://lkml.org/lkml/2007/5/17/232 -9. Singh, Balbir. RSS controller v2 AIM9 results - http://lkml.org/lkml/2007/5/18/1 -10. Singh, Balbir. Memory controller v6 test results, - http://lkml.org/lkml/2007/8/19/36 -11. Singh, Balbir. Memory controller introduction (v6), - http://lkml.org/lkml/2007/8/17/69 -12. Corbet, Jonathan, Controlling memory use in cgroups, - http://lwn.net/Articles/243795/ diff --git a/Documentation/cgroup-v1/net_cls.rst b/Documentation/cgroup-v1/net_cls.rst deleted file mode 100644 index a2cf272af7a0..000000000000 --- a/Documentation/cgroup-v1/net_cls.rst +++ /dev/null @@ -1,44 +0,0 @@ -========================= -Network classifier cgroup -========================= - -The Network classifier cgroup provides an interface to -tag network packets with a class identifier (classid). - -The Traffic Controller (tc) can be used to assign -different priorities to packets from different cgroups. 
-Also, Netfilter (iptables) can use this tag to perform -actions on such packets. - -Creating a net_cls cgroups instance creates a net_cls.classid file. -This net_cls.classid value is initialized to 0. - -You can write hexadecimal values to net_cls.classid; the format for these -values is 0xAAAABBBB; AAAA is the major handle number and BBBB -is the minor handle number. -Reading net_cls.classid yields a decimal result. - -Example:: - - mkdir /sys/fs/cgroup/net_cls - mount -t cgroup -onet_cls net_cls /sys/fs/cgroup/net_cls - mkdir /sys/fs/cgroup/net_cls/0 - echo 0x100001 > /sys/fs/cgroup/net_cls/0/net_cls.classid - -- setting a 10:1 handle:: - - cat /sys/fs/cgroup/net_cls/0/net_cls.classid - 1048577 - -- configuring tc:: - - tc qdisc add dev eth0 root handle 10: htb - tc class add dev eth0 parent 10: classid 10:1 htb rate 40mbit - -- creating traffic class 10:1:: - - tc filter add dev eth0 parent 10: protocol ip prio 10 handle 1: cgroup - -configuring iptables, basic example:: - - iptables -A OUTPUT -m cgroup ! --cgroup 0x100001 -j DROP diff --git a/Documentation/cgroup-v1/net_prio.rst b/Documentation/cgroup-v1/net_prio.rst deleted file mode 100644 index b40905871c64..000000000000 --- a/Documentation/cgroup-v1/net_prio.rst +++ /dev/null @@ -1,57 +0,0 @@ -======================= -Network priority cgroup -======================= - -The Network priority cgroup provides an interface to allow an administrator to -dynamically set the priority of network traffic generated by various -applications - -Nominally, an application would set the priority of its traffic via the -SO_PRIORITY socket option. This however, is not always possible because: - -1) The application may not have been coded to set this value -2) The priority of application traffic is often a site-specific administrative - decision rather than an application defined one. - -This cgroup allows an administrator to assign a process to a group which defines -the priority of egress traffic on a given interface. 
Network priority groups can -be created by first mounting the cgroup filesystem:: - - # mount -t cgroup -onet_prio none /sys/fs/cgroup/net_prio - -With the above step, the initial group acting as the parent accounting group -becomes visible at '/sys/fs/cgroup/net_prio'. This group includes all tasks in -the system. '/sys/fs/cgroup/net_prio/tasks' lists the tasks in this cgroup. - -Each net_prio cgroup contains two files that are subsystem specific - -net_prio.prioidx - This file is read-only, and is simply informative. It contains a unique - integer value that the kernel uses as an internal representation of this - cgroup. - -net_prio.ifpriomap - This file contains a map of the priorities assigned to traffic originating - from processes in this group and egressing the system on various interfaces. - It contains a list of tuples in the form <ifname priority>. Contents of this - file can be modified by echoing a string into the file using the same tuple - format. For example:: - - echo "eth0 5" > /sys/fs/cgroup/net_prio/iscsi/net_prio.ifpriomap - -This command would force any traffic originating from processes belonging to the -iscsi net_prio cgroup and egressing on interface eth0 to have the priority of -said traffic set to the value 5. The parent accounting group also has a -writeable 'net_prio.ifpriomap' file that can be used to set a system default -priority. - -Priorities are set immediately prior to queueing a frame to the device -queueing discipline (qdisc) so priorities will be assigned prior to the hardware -queue selection being made. - -One usage for the net_prio cgroup is with mqprio qdisc allowing application -traffic to be steered to hardware/driver based traffic classes. These mappings -can then be managed by administrators or other networking protocols such as -DCBX. - -A new net_prio cgroup inherits the parent's configuration.
diff --git a/Documentation/cgroup-v1/pids.rst b/Documentation/cgroup-v1/pids.rst deleted file mode 100644 index 6acebd9e72c8..000000000000 --- a/Documentation/cgroup-v1/pids.rst +++ /dev/null @@ -1,92 +0,0 @@ -========================= -Process Number Controller -========================= - -Abstract --------- - -The process number controller is used to allow a cgroup hierarchy to stop any -new tasks from being fork()'d or clone()'d after a certain limit is reached. - -Since it is trivial to hit the task limit without hitting any kmemcg limits in -place, PIDs are a fundamental resource. As such, PID exhaustion must be -preventable in the scope of a cgroup hierarchy by allowing resource limiting of -the number of tasks in a cgroup. - -Usage ------ - -In order to use the `pids` controller, set the maximum number of tasks in -pids.max (this is not available in the root cgroup for obvious reasons). The -number of processes currently in the cgroup is given by pids.current. - -Organisational operations are not blocked by cgroup policies, so it is possible -to have pids.current > pids.max. This can be done by either setting the limit to -be smaller than pids.current, or attaching enough processes to the cgroup such -that pids.current > pids.max. However, it is not possible to violate a cgroup -policy through fork() or clone(). fork() and clone() will return -EAGAIN if the -creation of a new process would cause a cgroup policy to be violated. - -To set a cgroup to have no limit, set pids.max to "max". This is the default for -all new cgroups (N.B. that PID limits are hierarchical, so the most stringent -limit in the hierarchy is followed). - -pids.current tracks all child cgroup hierarchies, so parent/pids.current is a -superset of parent/child/pids.current. - -The pids.events file contains event counters: - - - max: Number of times fork failed because limit was hit. 
- -Example -------- - -First, we mount the pids controller:: - - # mkdir -p /sys/fs/cgroup/pids - # mount -t cgroup -o pids none /sys/fs/cgroup/pids - -Then we create a hierarchy, set limits and attach processes to it:: - - # mkdir -p /sys/fs/cgroup/pids/parent/child - # echo 2 > /sys/fs/cgroup/pids/parent/pids.max - # echo $$ > /sys/fs/cgroup/pids/parent/cgroup.procs - # cat /sys/fs/cgroup/pids/parent/pids.current - 2 - # - -It should be noted that attempts to overcome the set limit (2 in this case) will -fail:: - - # cat /sys/fs/cgroup/pids/parent/pids.current - 2 - # ( /bin/echo "Here's some processes for you." | cat ) - sh: fork: Resource temporary unavailable - # - -Even if we migrate to a child cgroup (which doesn't have a set limit), we will -not be able to overcome the most stringent limit in the hierarchy (in this case, -parent's):: - - # echo $$ > /sys/fs/cgroup/pids/parent/child/cgroup.procs - # cat /sys/fs/cgroup/pids/parent/pids.current - 2 - # cat /sys/fs/cgroup/pids/parent/child/pids.current - 2 - # cat /sys/fs/cgroup/pids/parent/child/pids.max - max - # ( /bin/echo "Here's some processes for you." | cat ) - sh: fork: Resource temporary unavailable - # - -We can set a limit that is smaller than pids.current, which will stop any new -processes from being forked at all (note that the shell itself counts towards -pids.current):: - - # echo 1 > /sys/fs/cgroup/pids/parent/pids.max - # /bin/echo "We can't even spawn a single process now." - sh: fork: Resource temporary unavailable - # echo 0 > /sys/fs/cgroup/pids/parent/pids.max - # /bin/echo "We can't even spawn a single process now." - sh: fork: Resource temporary unavailable - # diff --git a/Documentation/cgroup-v1/rdma.rst b/Documentation/cgroup-v1/rdma.rst deleted file mode 100644 index 2fcb0a9bf790..000000000000 --- a/Documentation/cgroup-v1/rdma.rst +++ /dev/null @@ -1,117 +0,0 @@ -=============== -RDMA Controller -=============== - -.. Contents - - 1. Overview - 1-1. What is RDMA controller? - 1-2. 
Why RDMA controller needed? - 1-3. How is RDMA controller implemented? - 2. Usage Examples - -1. Overview -=========== - -1-1. What is RDMA controller? ------------------------------ - -RDMA controller allows user to limit RDMA/IB specific resources that a given -set of processes can use. These processes are grouped using RDMA controller. - -RDMA controller defines two resources which can be limited for processes of a -cgroup. - -1-2. Why RDMA controller needed? --------------------------------- - -Currently user space applications can easily take away all the rdma verb -specific resources such as AH, CQ, QP, MR etc. Due to which other applications -in other cgroup or kernel space ULPs may not even get chance to allocate any -rdma resources. This can lead to service unavailability. - -Therefore RDMA controller is needed through which resource consumption -of processes can be limited. Through this controller different rdma -resources can be accounted. - -1-3. How is RDMA controller implemented? ----------------------------------------- - -RDMA cgroup allows limit configuration of resources. Rdma cgroup maintains -resource accounting per cgroup, per device using resource pool structure. -Each such resource pool is limited up to 64 resources in given resource pool -by rdma cgroup, which can be extended later if required. - -This resource pool object is linked to the cgroup css. Typically there -are 0 to 4 resource pool instances per cgroup, per device in most use cases. -But nothing limits to have it more. At present hundreds of RDMA devices per -single cgroup may not be handled optimally, however there is no -known use case or requirement for such configuration either. - -Since RDMA resources can be allocated from any process and can be freed by any -of the child processes which shares the address space, rdma resources are -always owned by the creator cgroup css. 
This allows process migration from one -to other cgroup without major complexity of transferring resource ownership; -because such ownership is not really present due to shared nature of -rdma resources. Linking resources around css also ensures that cgroups can be -deleted after processes migrated. This allow progress migration as well with -active resources, even though that is not a primary use case. - -Whenever RDMA resource charging occurs, owner rdma cgroup is returned to -the caller. Same rdma cgroup should be passed while uncharging the resource. -This also allows process migrated with active RDMA resource to charge -to new owner cgroup for new resource. It also allows to uncharge resource of -a process from previously charged cgroup which is migrated to new cgroup, -even though that is not a primary use case. - -Resource pool object is created in following situations. -(a) User sets the limit and no previous resource pool exist for the device -of interest for the cgroup. -(b) No resource limits were configured, but IB/RDMA stack tries to -charge the resource. So that it correctly uncharge them when applications are -running without limits and later on when limits are enforced during uncharging, -otherwise usage count will drop to negative. - -Resource pool is destroyed if all the resource limits are set to max and -it is the last resource getting deallocated. - -User should set all the limit to max value if it intents to remove/unconfigure -the resource pool for a particular device. - -IB stack honors limits enforced by the rdma controller. When application -query about maximum resource limits of IB device, it returns minimum of -what is configured by user for a given cgroup and what is supported by -IB device. - -Following resources can be accounted by rdma controller. - - ========== ============================= - hca_handle Maximum number of HCA Handles - hca_object Maximum number of HCA Objects - ========== ============================= - -2. 
Usage Examples -================= - -(a) Configure resource limit:: - - echo mlx4_0 hca_handle=2 hca_object=2000 > /sys/fs/cgroup/rdma/1/rdma.max - echo ocrdma1 hca_handle=3 > /sys/fs/cgroup/rdma/2/rdma.max - -(b) Query resource limit:: - - cat /sys/fs/cgroup/rdma/2/rdma.max - #Output: - mlx4_0 hca_handle=2 hca_object=2000 - ocrdma1 hca_handle=3 hca_object=max - -(c) Query current usage:: - - cat /sys/fs/cgroup/rdma/2/rdma.current - #Output: - mlx4_0 hca_handle=1 hca_object=20 - ocrdma1 hca_handle=1 hca_object=23 - -(d) Delete resource limit:: - - echo mlx4_0 hca_handle=max hca_object=max > /sys/fs/cgroup/rdma/1/rdma.max diff --git a/Documentation/filesystems/tmpfs.txt b/Documentation/filesystems/tmpfs.txt index cad797a8a39e..5ecbc03e6b2f 100644 --- a/Documentation/filesystems/tmpfs.txt +++ b/Documentation/filesystems/tmpfs.txt @@ -98,7 +98,7 @@ A memory policy with a valid NodeList will be saved, as specified, for use at file creation time. When a task allocates a file in the file system, the mount option memory policy will be applied with a NodeList, if any, modified by the calling task's cpuset constraints -[See Documentation/cgroup-v1/cpusets.rst] and any optional flags, listed +[See Documentation/admin-guide/cgroup-v1/cpusets.rst] and any optional flags, listed below. If the resulting NodeLists is the empty set, the effective memory policy for the file will revert to "default" policy. diff --git a/Documentation/kernel-per-CPU-kthreads.txt b/Documentation/kernel-per-CPU-kthreads.txt index 5623b9916411..4f18456dd3b1 100644 --- a/Documentation/kernel-per-CPU-kthreads.txt +++ b/Documentation/kernel-per-CPU-kthreads.txt @@ -12,7 +12,7 @@ References - Documentation/IRQ-affinity.txt: Binding interrupts to sets of CPUs. -- Documentation/cgroup-v1: Using cgroups to bind tasks to sets of CPUs. +- Documentation/admin-guide/cgroup-v1: Using cgroups to bind tasks to sets of CPUs. - man taskset: Using the taskset command to bind tasks to sets of CPUs.
diff --git a/Documentation/scheduler/sched-deadline.rst b/Documentation/scheduler/sched-deadline.rst index 3391e86d810c..14a2f7bf63fe 100644 --- a/Documentation/scheduler/sched-deadline.rst +++ b/Documentation/scheduler/sched-deadline.rst @@ -669,7 +669,7 @@ Deadline Task Scheduling -deadline tasks cannot have an affinity mask smaller that the entire root_domain they are created on. However, affinities can be specified - through the cpuset facility (Documentation/cgroup-v1/cpusets.rst). + through the cpuset facility (Documentation/admin-guide/cgroup-v1/cpusets.rst). 5.1 SCHED_DEADLINE and cpusets HOWTO ------------------------------------ diff --git a/Documentation/scheduler/sched-design-CFS.rst b/Documentation/scheduler/sched-design-CFS.rst index 53b30d1967cf..a96c72651877 100644 --- a/Documentation/scheduler/sched-design-CFS.rst +++ b/Documentation/scheduler/sched-design-CFS.rst @@ -222,7 +222,7 @@ SCHED_BATCH) tasks. These options need CONFIG_CGROUPS to be defined, and let the administrator create arbitrary groups of tasks, using the "cgroup" pseudo filesystem. See - Documentation/cgroup-v1/cgroups.rst for more information about this filesystem. + Documentation/admin-guide/cgroup-v1/cgroups.rst for more information about this filesystem. When CONFIG_FAIR_GROUP_SCHED is defined, a "cpu.shares" file is created for each group created using the pseudo filesystem. See example steps below to create diff --git a/Documentation/scheduler/sched-rt-group.rst b/Documentation/scheduler/sched-rt-group.rst index d27d3f3712fd..655a096ec8fb 100644 --- a/Documentation/scheduler/sched-rt-group.rst +++ b/Documentation/scheduler/sched-rt-group.rst @@ -133,7 +133,7 @@ This uses the cgroup virtual file system and "/cpu.rt_runtime_us" to control the CPU time reserved for each control group. For more information on working with control groups, you should read -Documentation/cgroup-v1/cgroups.rst as well. +Documentation/admin-guide/cgroup-v1/cgroups.rst as well. 
Group settings are checked against the following limits in order to keep the configuration schedulable: diff --git a/Documentation/vm/numa.rst b/Documentation/vm/numa.rst index 130f3cfa1c19..99fdeca917ca 100644 --- a/Documentation/vm/numa.rst +++ b/Documentation/vm/numa.rst @@ -67,7 +67,7 @@ nodes. Each emulated node will manage a fraction of the underlying cells' physical memory. NUMA emluation is useful for testing NUMA kernel and application features on non-NUMA platforms, and as a sort of memory resource management mechanism when used together with cpusets. -[see Documentation/cgroup-v1/cpusets.rst] +[see Documentation/admin-guide/cgroup-v1/cpusets.rst] For each node with memory, Linux constructs an independent memory management subsystem, complete with its own free page lists, in-use page lists, usage @@ -114,7 +114,7 @@ allocation behavior using Linux NUMA memory policy. [see System administrators can restrict the CPUs and nodes' memories that a non- privileged user can specify in the scheduling or NUMA commands and functions -using control groups and CPUsets. [see Documentation/cgroup-v1/cpusets.rst] +using control groups and CPUsets. [see Documentation/admin-guide/cgroup-v1/cpusets.rst] On architectures that do not hide memoryless nodes, Linux will include only zones [nodes] with memory in the zonelists. This means that for a memoryless diff --git a/Documentation/vm/page_migration.rst b/Documentation/vm/page_migration.rst index 35bba27d5fff..1d6cd7db4e43 100644 --- a/Documentation/vm/page_migration.rst +++ b/Documentation/vm/page_migration.rst @@ -41,7 +41,7 @@ locations. Larger installations usually partition the system using cpusets into sections of nodes. Paul Jackson has equipped cpusets with the ability to move pages when a task is moved to another cpuset (See -Documentation/cgroup-v1/cpusets.rst). +Documentation/admin-guide/cgroup-v1/cpusets.rst). Cpusets allows the automation of process locality. 
If a task is moved to a new cpuset then also all its pages are moved with it so that the performance of the process does not sink dramatically. Also the pages diff --git a/Documentation/vm/unevictable-lru.rst b/Documentation/vm/unevictable-lru.rst index 109052215bce..17d0861b0f1d 100644 --- a/Documentation/vm/unevictable-lru.rst +++ b/Documentation/vm/unevictable-lru.rst @@ -98,7 +98,7 @@ Memory Control Group Interaction -------------------------------- The unevictable LRU facility interacts with the memory control group [aka -memory controller; see Documentation/cgroup-v1/memory.rst] by extending the +memory controller; see Documentation/admin-guide/cgroup-v1/memory.rst] by extending the lru_list enum. The memory controller data structure automatically gets a per-zone unevictable diff --git a/Documentation/x86/x86_64/fake-numa-for-cpusets.rst b/Documentation/x86/x86_64/fake-numa-for-cpusets.rst index 30108684ae87..ff9bcfd2cc14 100644 --- a/Documentation/x86/x86_64/fake-numa-for-cpusets.rst +++ b/Documentation/x86/x86_64/fake-numa-for-cpusets.rst @@ -15,7 +15,7 @@ assign them to cpusets and their attached tasks. This is a way of limiting the amount of system memory that are available to a certain class of tasks. For more information on the features of cpusets, see -Documentation/cgroup-v1/cpusets.rst. +Documentation/admin-guide/cgroup-v1/cpusets.rst. There are a number of different configurations you can use for your needs. For more information on the numa=fake command line option and its various ways of configuring fake nodes, see Documentation/x86/x86_64/boot-options.rst. @@ -40,7 +40,7 @@ A machine may be split as follows with "numa=fake=4*512," as reported by dmesg:: On node 3 totalpages: 131072 Now following the instructions for mounting the cpusets filesystem from -Documentation/cgroup-v1/cpusets.rst, you can assign fake nodes (i.e. contiguous memory +Documentation/admin-guide/cgroup-v1/cpusets.rst, you can assign fake nodes (i.e. 
contiguous memory address spaces) to individual cpusets:: [root@xroads /]# mkdir exampleset diff --git a/MAINTAINERS b/MAINTAINERS index 0c603ea73034..c1593a668f80 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -4158,7 +4158,7 @@ L: cgroups@vger.kernel.org T: git git://git.kernel.org/pub/scm/linux/kernel/git/tj/cgroup.git S: Maintained F: Documentation/admin-guide/cgroup-v2.rst -F: Documentation/cgroup-v1/ +F: Documentation/admin-guide/cgroup-v1/ F: include/linux/cgroup* F: kernel/cgroup/ @@ -4169,7 +4169,7 @@ W: http://www.bullopensource.org/cpuset/ W: http://oss.sgi.com/projects/cpusets/ T: git git://git.kernel.org/pub/scm/linux/kernel/git/tj/cgroup.git S: Maintained -F: Documentation/cgroup-v1/cpusets.rst +F: Documentation/admin-guide/cgroup-v1/cpusets.rst F: include/linux/cpuset.h F: kernel/cgroup/cpuset.c diff --git a/block/Kconfig b/block/Kconfig index b16b3e075d31..8b5f8e560eb4 100644 --- a/block/Kconfig +++ b/block/Kconfig @@ -89,7 +89,7 @@ config BLK_DEV_THROTTLING one needs to mount and use blkio cgroup controller for creating cgroups and specifying per device IO rate policies. - See Documentation/cgroup-v1/blkio-controller.rst for more information. + See Documentation/admin-guide/cgroup-v1/blkio-controller.rst for more information. config BLK_DEV_THROTTLING_LOW bool "Block throttling .low limit interface support (EXPERIMENTAL)" diff --git a/include/linux/cgroup-defs.h b/include/linux/cgroup-defs.h index c5311935239d..430e219e3aba 100644 --- a/include/linux/cgroup-defs.h +++ b/include/linux/cgroup-defs.h @@ -624,7 +624,7 @@ struct cftype { /* * Control Group subsystem type. 
- * See Documentation/cgroup-v1/cgroups.rst for details + * See Documentation/admin-guide/cgroup-v1/cgroups.rst for details */ struct cgroup_subsys { struct cgroup_subsys_state *(*css_alloc)(struct cgroup_subsys_state *parent_css); diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index 6f68438aa4ed..82699845ef79 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -806,7 +806,7 @@ union bpf_attr { * based on a user-provided identifier for all traffic coming from * the tasks belonging to the related cgroup. See also the related * kernel documentation, available from the Linux sources in file - * *Documentation/cgroup-v1/net_cls.rst*. + * *Documentation/admin-guide/cgroup-v1/net_cls.rst*. * * The Linux kernel has two versions for cgroups: there are * cgroups v1 and cgroups v2. Both are available to users, who can diff --git a/init/Kconfig b/init/Kconfig index 9eb92ee52d40..381cdfee6e0e 100644 --- a/init/Kconfig +++ b/init/Kconfig @@ -821,7 +821,7 @@ menuconfig CGROUPS controls or device isolation. See - Documentation/scheduler/sched-design-CFS.rst (CFS) - - Documentation/cgroup-v1/ (features for grouping, isolation + - Documentation/admin-guide/cgroup-v1/ (features for grouping, isolation and resource control) Say N if unsure. @@ -883,7 +883,7 @@ config BLK_CGROUP CONFIG_CFQ_GROUP_IOSCHED=y; for enabling throttling policy, set CONFIG_BLK_DEV_THROTTLING=y. - See Documentation/cgroup-v1/blkio-controller.rst for more information. + See Documentation/admin-guide/cgroup-v1/blkio-controller.rst for more information. config CGROUP_WRITEBACK bool diff --git a/kernel/cgroup/cpuset.c b/kernel/cgroup/cpuset.c index b3b02b9c4405..863e434a6020 100644 --- a/kernel/cgroup/cpuset.c +++ b/kernel/cgroup/cpuset.c @@ -729,7 +729,7 @@ static inline int nr_cpusets(void) * load balancing domains (sched domains) as specified by that partial * partition. 
* - * See "What is sched_load_balance" in Documentation/cgroup-v1/cpusets.rst + * See "What is sched_load_balance" in Documentation/admin-guide/cgroup-v1/cpusets.rst * for a background explanation of this. * * Does not return errors, on the theory that the callers of this diff --git a/security/device_cgroup.c b/security/device_cgroup.c index c07196502577..725674f3276d 100644 --- a/security/device_cgroup.c +++ b/security/device_cgroup.c @@ -509,7 +509,7 @@ static inline int may_allow_all(struct dev_cgroup *parent) * This is one of the three key functions for hierarchy implementation. * This function is responsible for re-evaluating all the cgroup's active * exceptions due to a parent's exception change. - * Refer to Documentation/cgroup-v1/devices.rst for more details. + * Refer to Documentation/admin-guide/cgroup-v1/devices.rst for more details. */ static void revalidate_active_exceptions(struct dev_cgroup *devcg) { diff --git a/tools/include/uapi/linux/bpf.h b/tools/include/uapi/linux/bpf.h index f506c68b2612..17e2b1713702 100644 --- a/tools/include/uapi/linux/bpf.h +++ b/tools/include/uapi/linux/bpf.h @@ -806,7 +806,7 @@ union bpf_attr { * based on a user-provided identifier for all traffic coming from * the tasks belonging to the related cgroup. See also the related * kernel documentation, available from the Linux sources in file - * *Documentation/cgroup-v1/net_cls.rst*. + * *Documentation/admin-guide/cgroup-v1/net_cls.rst*. * * The Linux kernel has two versions for cgroups: there are * cgroups v1 and cgroups v2. Both are available to users, who can -- cgit v1.2.3-71-gd317 From d4ecfeb15494ec261fef2d25d96eecba66f0b182 Mon Sep 17 00:00:00 2001 From: Stanislav Fomichev Date: Mon, 15 Jul 2019 09:39:53 -0700 Subject: bpf: allow wide aligned loads for bpf_sock_addr user_ip6 and msg_src_ip6 Add explicit check for u64 loads of user_ip6 and msg_src_ip6 and update the comment. 
Cc: Yonghong Song Signed-off-by: Stanislav Fomichev Signed-off-by: Daniel Borkmann --- include/uapi/linux/bpf.h | 4 ++-- net/core/filter.c | 12 +++++++++++- 2 files changed, 13 insertions(+), 3 deletions(-) (limited to 'include/uapi/linux') diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index 6f68438aa4ed..81be929b89fc 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -3248,7 +3248,7 @@ struct bpf_sock_addr { __u32 user_ip4; /* Allows 1,2,4-byte read and 4-byte write. * Stored in network byte order. */ - __u32 user_ip6[4]; /* Allows 1,2,4-byte read and 4,8-byte write. + __u32 user_ip6[4]; /* Allows 1,2,4,8-byte read and 4,8-byte write. * Stored in network byte order. */ __u32 user_port; /* Allows 4-byte read and write. @@ -3260,7 +3260,7 @@ struct bpf_sock_addr { __u32 msg_src_ip4; /* Allows 1,2,4-byte read and 4-byte write. * Stored in network byte order. */ - __u32 msg_src_ip6[4]; /* Allows 1,2,4-byte read and 4,8-byte write. + __u32 msg_src_ip6[4]; /* Allows 1,2,4,8-byte read and 4,8-byte write. * Stored in network byte order. */ __bpf_md_ptr(struct bpf_sock *, sk); diff --git a/net/core/filter.c b/net/core/filter.c index c5983ddb1a9f..0f6854ccf894 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -6884,9 +6884,19 @@ static bool sock_addr_is_valid_access(int off, int size, case bpf_ctx_range(struct bpf_sock_addr, msg_src_ip4): case bpf_ctx_range_till(struct bpf_sock_addr, msg_src_ip6[0], msg_src_ip6[3]): - /* Only narrow read access allowed for now. 
*/ if (type == BPF_READ) { bpf_ctx_record_field_size(info, size_default); + + if (bpf_ctx_wide_access_ok(off, size, + struct bpf_sock_addr, + user_ip6)) + return true; + + if (bpf_ctx_wide_access_ok(off, size, + struct bpf_sock_addr, + msg_src_ip6)) + return true; + if (!bpf_ctx_narrow_access_ok(off, size, size_default)) return false; } else { -- cgit v1.2.3-71-gd317 From a5b647007e9d794956dbed9339a3354a9fc4d5c3 Mon Sep 17 00:00:00 2001 From: Vedang Patel Date: Tue, 16 Jul 2019 12:52:18 -0700 Subject: fix: taprio: Change type of txtime-delay parameter to u32 During the review of the iproute2 patches for txtime-assist mode, it was pointed out that it does not make sense for the txtime-delay parameter to be negative. So, change the type of the parameter from s32 to u32. Fixes: 4cfd5779bd6e ("taprio: Add support for txtime-assist mode") Reported-by: Stephen Hemminger Signed-off-by: Vedang Patel Signed-off-by: David S. Miller --- include/uapi/linux/pkt_sched.h | 2 +- net/sched/sch_taprio.c | 6 +++--- 2 files changed, 4 insertions(+), 4 deletions(-) (limited to 'include/uapi/linux') diff --git a/include/uapi/linux/pkt_sched.h b/include/uapi/linux/pkt_sched.h index 1f623252abe8..18f185299f47 100644 --- a/include/uapi/linux/pkt_sched.h +++ b/include/uapi/linux/pkt_sched.h @@ -1174,7 +1174,7 @@ enum { TCA_TAPRIO_ATTR_SCHED_CYCLE_TIME, /* s64 */ TCA_TAPRIO_ATTR_SCHED_CYCLE_TIME_EXTENSION, /* s64 */ TCA_TAPRIO_ATTR_FLAGS, /* u32 */ - TCA_TAPRIO_ATTR_TXTIME_DELAY, /* s32 */ + TCA_TAPRIO_ATTR_TXTIME_DELAY, /* u32 */ __TCA_TAPRIO_ATTR_MAX, }; diff --git a/net/sched/sch_taprio.c b/net/sched/sch_taprio.c index 388750ddc57a..c39db507ba3f 100644 --- a/net/sched/sch_taprio.c +++ b/net/sched/sch_taprio.c @@ -75,7 +75,7 @@ struct taprio_sched { struct sched_gate_list __rcu *admin_sched; struct hrtimer advance_timer; struct list_head taprio_list; - int txtime_delay; + u32 txtime_delay; }; static ktime_t sched_base_time(const struct sched_gate_list *sched) @@ -1113,7 +1113,7 @@ static 
int taprio_change(struct Qdisc *sch, struct nlattr *opt, goto unlock; } - q->txtime_delay = nla_get_s32(tb[TCA_TAPRIO_ATTR_TXTIME_DELAY]); + q->txtime_delay = nla_get_u32(tb[TCA_TAPRIO_ATTR_TXTIME_DELAY]); } if (!TXTIME_ASSIST_IS_ENABLED(taprio_flags) && @@ -1430,7 +1430,7 @@ static int taprio_dump(struct Qdisc *sch, struct sk_buff *skb) goto options_error; if (q->txtime_delay && - nla_put_s32(skb, TCA_TAPRIO_ATTR_TXTIME_DELAY, q->txtime_delay)) + nla_put_u32(skb, TCA_TAPRIO_ATTR_TXTIME_DELAY, q->txtime_delay)) goto options_error; if (oper && dump_schedule(skb, oper)) -- cgit v1.2.3-71-gd317 From 694a58e29ef27c4c26f103a9decfd053f94dd34c Mon Sep 17 00:00:00 2001 From: Mikko Rapeli Date: Tue, 16 Jul 2019 16:28:07 -0700 Subject: uapi linux/coda.h: use __kernel_pid_t for userspace Part of a patch by Mikko Rapeli, as Arnd Bergmann commented on the original patch. pid_t might differ between libc and the kernel, so the kernel interface has to use types that the kernel defines. Link: http://lkml.kernel.org/r/f374a71f4d351bc8c8b3ac18ad7765c88d806d10.1558117389.git.jaharkes@cs.cmu.edu Signed-off-by: Mikko Rapeli Signed-off-by: Jan Harkes Cc: Arnd Bergmann Cc: Colin Ian King Cc: Dan Carpenter Cc: David Howells Cc: Fabian Frederick Cc: Sam Protsenko Cc: Yann Droneaud Cc: Zhouyang Jia Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/uapi/linux/coda.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'include/uapi/linux') diff --git a/include/uapi/linux/coda.h b/include/uapi/linux/coda.h index 695fade33c64..ed8cb263e482 100644 --- a/include/uapi/linux/coda.h +++ b/include/uapi/linux/coda.h @@ -295,8 +295,8 @@ struct coda_statfs { struct coda_in_hdr { u_int32_t opcode; u_int32_t unique; /* Keep multiple outstanding msgs distinct */ - pid_t pid; - pid_t pgid; + __kernel_pid_t pid; + __kernel_pid_t pgid; vuid_t uid; }; -- cgit v1.2.3-71-gd317 From f90fb3c7e2c13ae829db2274b88b845a75038b8a Mon Sep 17 00:00:00 2001 From: Mikko Rapeli Date: Tue,
16 Jul 2019 16:28:10 -0700 Subject: uapi linux/coda_psdev.h: move upc_req definition from uapi to kernel side headers The only users of upc_req, the kernel-side fs/coda/psdev.c and fs/coda/upcall.c, already include linux/coda_psdev.h. Suggested by Jan Harkes in https://lore.kernel.org/lkml/20150531111913.GA23377@cs.cmu.edu/ Fixes these include/uapi/linux/coda_psdev.h compilation errors in userspace: linux/coda_psdev.h:12:19: error: field `uc_chain' has incomplete type struct list_head uc_chain; ^ linux/coda_psdev.h:13:2: error: unknown type name `caddr_t' caddr_t uc_data; ^ linux/coda_psdev.h:14:2: error: unknown type name `u_short' u_short uc_flags; ^ linux/coda_psdev.h:15:2: error: unknown type name `u_short' u_short uc_inSize; /* Size is at most 5000 bytes */ ^ linux/coda_psdev.h:16:2: error: unknown type name `u_short' u_short uc_outSize; ^ linux/coda_psdev.h:17:2: error: unknown type name `u_short' u_short uc_opcode; /* copied from data to save lookup */ ^ linux/coda_psdev.h:19:2: error: unknown type name `wait_queue_head_t' wait_queue_head_t uc_sleep; /* process' wait queue */ ^ Link: http://lkml.kernel.org/r/9f99f5ce6a0563d5266e6cf7aa9585aac2cae971.1558117389.git.jaharkes@cs.cmu.edu Signed-off-by: Mikko Rapeli Signed-off-by: Jan Harkes Cc: Arnd Bergmann Cc: Colin Ian King Cc: Dan Carpenter Cc: David Howells Cc: Fabian Frederick Cc: Sam Protsenko Cc: Yann Droneaud Cc: Zhouyang Jia Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/coda_psdev.h | 11 +++++++++++ include/uapi/linux/coda_psdev.h | 13 ------------- 2 files changed, 11 insertions(+), 13 deletions(-) (limited to 'include/uapi/linux') diff --git a/include/linux/coda_psdev.h b/include/linux/coda_psdev.h index 15170954aa2b..57d2b2faf6a3 100644 --- a/include/linux/coda_psdev.h +++ b/include/linux/coda_psdev.h @@ -19,6 +19,17 @@ struct venus_comm { struct mutex vc_mutex; }; +/* messages between coda filesystem in kernel and Venus */ +struct upc_req { + struct list_head uc_chain; +
caddr_t uc_data; + u_short uc_flags; + u_short uc_inSize; /* Size is at most 5000 bytes */ + u_short uc_outSize; + u_short uc_opcode; /* copied from data to save lookup */ + int uc_unique; + wait_queue_head_t uc_sleep; /* process' wait queue */ +}; static inline struct venus_comm *coda_vcp(struct super_block *sb) { diff --git a/include/uapi/linux/coda_psdev.h b/include/uapi/linux/coda_psdev.h index aa6623efd2dd..d50d51a57fe4 100644 --- a/include/uapi/linux/coda_psdev.h +++ b/include/uapi/linux/coda_psdev.h @@ -7,19 +7,6 @@ #define CODA_PSDEV_MAJOR 67 #define MAX_CODADEVS 5 /* how many do we allow */ - -/* messages between coda filesystem in kernel and Venus */ -struct upc_req { - struct list_head uc_chain; - caddr_t uc_data; - u_short uc_flags; - u_short uc_inSize; /* Size is at most 5000 bytes */ - u_short uc_outSize; - u_short uc_opcode; /* copied from data to save lookup */ - int uc_unique; - wait_queue_head_t uc_sleep; /* process' wait queue */ -}; - #define CODA_REQ_ASYNC 0x1 #define CODA_REQ_READ 0x2 #define CODA_REQ_WRITE 0x4 -- cgit v1.2.3-71-gd317 From 2fe7491d219428a32f09948e88bfaf8e71b9a66b Mon Sep 17 00:00:00 2001 From: Jan Harkes Date: Tue, 16 Jul 2019 16:28:26 -0700 Subject: uapi linux/coda_psdev.h: move CODA_REQ_ from uapi to kernel side headers These constants are only used internally and are not exposed to userspace.
Link: http://lkml.kernel.org/r/baeafc30dad70d8b422ee679420099c2d8aa7da0.1558117389.git.jaharkes@cs.cmu.edu Signed-off-by: Jan Harkes Cc: Arnd Bergmann Cc: Colin Ian King Cc: Dan Carpenter Cc: David Howells Cc: Fabian Frederick Cc: Mikko Rapeli Cc: Sam Protsenko Cc: Yann Droneaud Cc: Zhouyang Jia Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/coda_psdev.h | 5 +++++ include/uapi/linux/coda_psdev.h | 5 ----- 2 files changed, 5 insertions(+), 5 deletions(-) (limited to 'include/uapi/linux') diff --git a/include/linux/coda_psdev.h b/include/linux/coda_psdev.h index d1672fd5e638..9487f792770c 100644 --- a/include/linux/coda_psdev.h +++ b/include/linux/coda_psdev.h @@ -31,6 +31,11 @@ struct upc_req { wait_queue_head_t uc_sleep; /* process' wait queue */ }; +#define CODA_REQ_ASYNC 0x1 +#define CODA_REQ_READ 0x2 +#define CODA_REQ_WRITE 0x4 +#define CODA_REQ_ABORT 0x8 + static inline struct venus_comm *coda_vcp(struct super_block *sb) { return (struct venus_comm *)((sb)->s_fs_info); diff --git a/include/uapi/linux/coda_psdev.h b/include/uapi/linux/coda_psdev.h index d50d51a57fe4..3dacb7fad66a 100644 --- a/include/uapi/linux/coda_psdev.h +++ b/include/uapi/linux/coda_psdev.h @@ -7,9 +7,4 @@ #define CODA_PSDEV_MAJOR 67 #define MAX_CODADEVS 5 /* how many do we allow */ -#define CODA_REQ_ASYNC 0x1 -#define CODA_REQ_READ 0x2 -#define CODA_REQ_WRITE 0x4 -#define CODA_REQ_ABORT 0x8 - #endif /* _UAPI__CODA_PSDEV_H */ -- cgit v1.2.3-71-gd317 From 6ced9aa7b56baeb241a715df4539e60d5e3118e2 Mon Sep 17 00:00:00 2001 From: Arnd Bergmann Date: Tue, 16 Jul 2019 16:28:32 -0700 Subject: coda: stop using 'struct timespec' in user API We exchange file timestamps with user space using psdev device read/write operations with a fixed but architecture specific binary layout. On 32-bit systems, this uses a 'timespec' structure that is defined by the C library to contain two 32-bit values for seconds and nanoseconds. 
As we get ready for the year 2038 overflow of the 32-bit signed seconds, the kernel now uses 64-bit timestamps internally, and user space will do the same change by changing the 'timespec' definition in the future. Unfortunately, this breaks the layout of the coda_vattr structure, so we need to redefine that in terms of something that does not change. I'm introducing a new 'struct vtimespec' structure here that keeps the existing layout, and the same change has to be done in the coda user space copy of linux/coda.h before anyone can use that on a 32-bit architecture with 64-bit time_t. An open question is what should happen to actual times past y2038, as they are now truncated to the last valid date when sent to user space, and interpreted as pre-1970 times when a timestamp with the MSB set is read back into the kernel. Alternatively, we could change the new timespec64_to_coda()/coda_to_timespec64() functions to use a different interpretation and extend the available range further to the future by disallowing past timestamps. This would require more changes in the user space side though. 
Link: http://lkml.kernel.org/r/562b7324149461743e4fbe2fedbf7c242f7e274a.1558117389.git.jaharkes@cs.cmu.edu Link: https://patchwork.kernel.org/patch/10474735/ Signed-off-by: Arnd Bergmann Signed-off-by: Jan Harkes Acked-by: Jan Harkes Cc: Colin Ian King Cc: Dan Carpenter Cc: David Howells Cc: Fabian Frederick Cc: Mikko Rapeli Cc: Sam Protsenko Cc: Yann Droneaud Cc: Zhouyang Jia Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- Documentation/filesystems/coda.txt | 11 ++++++--- fs/coda/coda_linux.c | 50 +++++++++++++++++++++++++++++--------- include/uapi/linux/coda.h | 20 ++++++++++++--- 3 files changed, 62 insertions(+), 19 deletions(-) (limited to 'include/uapi/linux') diff --git a/Documentation/filesystems/coda.txt b/Documentation/filesystems/coda.txt index 61311356025d..ea5969068895 100644 --- a/Documentation/filesystems/coda.txt +++ b/Documentation/filesystems/coda.txt @@ -481,7 +481,10 @@ kernel support. - + struct vtimespec { + long tv_sec; /* seconds */ + long tv_nsec; /* nanoseconds */ + }; struct coda_vattr { enum coda_vtype va_type; /* vnode type (for create) */ @@ -493,9 +496,9 @@ kernel support. 
long va_fileid; /* file id */ u_quad_t va_size; /* file size in bytes */ long va_blocksize; /* blocksize preferred for i/o */ - struct timespec va_atime; /* time of last access */ - struct timespec va_mtime; /* time of last modification */ - struct timespec va_ctime; /* time file changed */ + struct vtimespec va_atime; /* time of last access */ + struct vtimespec va_mtime; /* time of last modification */ + struct vtimespec va_ctime; /* time file changed */ u_long va_gen; /* generation number of file */ u_long va_flags; /* flags defined for file */ dev_t va_rdev; /* device special file represents */ diff --git a/fs/coda/coda_linux.c b/fs/coda/coda_linux.c index f3d543dd9a98..8addcd166908 100644 --- a/fs/coda/coda_linux.c +++ b/fs/coda/coda_linux.c @@ -66,6 +66,32 @@ unsigned short coda_flags_to_cflags(unsigned short flags) return coda_flags; } +static struct timespec64 coda_to_timespec64(struct vtimespec ts) +{ + /* + * We interpret incoming timestamps as 'signed' to match traditional + * usage and support pre-1970 timestamps, but this breaks in y2038 + * on 32-bit machines. 
+ */ + struct timespec64 ts64 = { + .tv_sec = ts.tv_sec, + .tv_nsec = ts.tv_nsec, + }; + + return ts64; +} + +static struct vtimespec timespec64_to_coda(struct timespec64 ts64) +{ + /* clamp the timestamps to the maximum range rather than wrapping */ + struct vtimespec ts = { + .tv_sec = lower_32_bits(clamp_t(time64_t, ts64.tv_sec, + LONG_MIN, LONG_MAX)), + .tv_nsec = ts64.tv_nsec, + }; + + return ts; +} /* utility functions below */ void coda_vattr_to_iattr(struct inode *inode, struct coda_vattr *attr) @@ -105,11 +131,11 @@ void coda_vattr_to_iattr(struct inode *inode, struct coda_vattr *attr) if (attr->va_size != -1) inode->i_blocks = (attr->va_size + 511) >> 9; if (attr->va_atime.tv_sec != -1) - inode->i_atime = timespec_to_timespec64(attr->va_atime); + inode->i_atime = coda_to_timespec64(attr->va_atime); if (attr->va_mtime.tv_sec != -1) - inode->i_mtime = timespec_to_timespec64(attr->va_mtime); + inode->i_mtime = coda_to_timespec64(attr->va_mtime); if (attr->va_ctime.tv_sec != -1) - inode->i_ctime = timespec_to_timespec64(attr->va_ctime); + inode->i_ctime = coda_to_timespec64(attr->va_ctime); } @@ -130,12 +156,12 @@ void coda_iattr_to_vattr(struct iattr *iattr, struct coda_vattr *vattr) vattr->va_uid = (vuid_t) -1; vattr->va_gid = (vgid_t) -1; vattr->va_size = (off_t) -1; - vattr->va_atime.tv_sec = (time_t) -1; - vattr->va_atime.tv_nsec = (time_t) -1; - vattr->va_mtime.tv_sec = (time_t) -1; - vattr->va_mtime.tv_nsec = (time_t) -1; - vattr->va_ctime.tv_sec = (time_t) -1; - vattr->va_ctime.tv_nsec = (time_t) -1; + vattr->va_atime.tv_sec = (long) -1; + vattr->va_atime.tv_nsec = (long) -1; + vattr->va_mtime.tv_sec = (long) -1; + vattr->va_mtime.tv_nsec = (long) -1; + vattr->va_ctime.tv_sec = (long) -1; + vattr->va_ctime.tv_nsec = (long) -1; vattr->va_type = C_VNON; vattr->va_fileid = -1; vattr->va_gen = -1; @@ -175,13 +201,13 @@ void coda_iattr_to_vattr(struct iattr *iattr, struct coda_vattr *vattr) vattr->va_size = iattr->ia_size; } if ( valid & ATTR_ATIME ) { - 
vattr->va_atime = timespec64_to_timespec(iattr->ia_atime); + vattr->va_atime = timespec64_to_coda(iattr->ia_atime); } if ( valid & ATTR_MTIME ) { - vattr->va_mtime = timespec64_to_timespec(iattr->ia_mtime); + vattr->va_mtime = timespec64_to_coda(iattr->ia_mtime); } if ( valid & ATTR_CTIME ) { - vattr->va_ctime = timespec64_to_timespec(iattr->ia_ctime); + vattr->va_ctime = timespec64_to_coda(iattr->ia_ctime); } } diff --git a/include/uapi/linux/coda.h b/include/uapi/linux/coda.h index ed8cb263e482..fc5f7874208a 100644 --- a/include/uapi/linux/coda.h +++ b/include/uapi/linux/coda.h @@ -211,6 +211,20 @@ struct CodaFid { */ enum coda_vtype { C_VNON, C_VREG, C_VDIR, C_VBLK, C_VCHR, C_VLNK, C_VSOCK, C_VFIFO, C_VBAD }; +#ifdef __linux__ +/* + * This matches the traditional Linux 'timespec' structure binary layout, + * before using 64-bit time_t everywhere. Overflows in y2038 on 32-bit + * architectures. + */ +struct vtimespec { + long tv_sec; /* seconds */ + long tv_nsec; /* nanoseconds */ +}; +#else +#define vtimespec timespec +#endif + struct coda_vattr { long va_type; /* vnode type (for create) */ u_short va_mode; /* files access mode and type */ @@ -220,9 +234,9 @@ struct coda_vattr { long va_fileid; /* file id */ u_quad_t va_size; /* file size in bytes */ long va_blocksize; /* blocksize preferred for i/o */ - struct timespec va_atime; /* time of last access */ - struct timespec va_mtime; /* time of last modification */ - struct timespec va_ctime; /* time file changed */ + struct vtimespec va_atime; /* time of last access */ + struct vtimespec va_mtime; /* time of last modification */ + struct vtimespec va_ctime; /* time file changed */ u_long va_gen; /* generation number of file */ u_long va_flags; /* flags defined for file */ cdev_t va_rdev; /* device special file represents */ -- cgit v1.2.3-71-gd317 From 5e7c31dfe74703f428220384b2863525957cc160 Mon Sep 17 00:00:00 2001 From: Jan Harkes Date: Tue, 16 Jul 2019 16:28:35 -0700 Subject: coda: change Coda's user api to 
use 64-bit time_t in timespec Move the 32-bit time_t problems to userspace. Link: http://lkml.kernel.org/r/8d089068823bfb292a4020f773922fbd82ffad39.1558117389.git.jaharkes@cs.cmu.edu Signed-off-by: Jan Harkes Cc: Arnd Bergmann Cc: Colin Ian King Cc: Dan Carpenter Cc: David Howells Cc: Fabian Frederick Cc: Mikko Rapeli Cc: Sam Protsenko Cc: Yann Droneaud Cc: Zhouyang Jia Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- Documentation/filesystems/coda.txt | 10 +++++----- fs/coda/coda_linux.c | 21 +++++++-------------- include/uapi/linux/coda.h | 33 +++++++-------------------------- 3 files changed, 19 insertions(+), 45 deletions(-) (limited to 'include/uapi/linux') diff --git a/Documentation/filesystems/coda.txt b/Documentation/filesystems/coda.txt index ea5969068895..545262c167c3 100644 --- a/Documentation/filesystems/coda.txt +++ b/Documentation/filesystems/coda.txt @@ -481,8 +481,8 @@ kernel support. - struct vtimespec { - long tv_sec; /* seconds */ + struct coda_timespec { + int64_t tv_sec; /* seconds */ long tv_nsec; /* nanoseconds */ }; @@ -496,9 +496,9 @@ kernel support. 
long va_fileid; /* file id */ u_quad_t va_size; /* file size in bytes */ long va_blocksize; /* blocksize preferred for i/o */ - struct vtimespec va_atime; /* time of last access */ - struct vtimespec va_mtime; /* time of last modification */ - struct vtimespec va_ctime; /* time file changed */ + struct coda_timespec va_atime; /* time of last access */ + struct coda_timespec va_mtime; /* time of last modification */ + struct coda_timespec va_ctime; /* time file changed */ u_long va_gen; /* generation number of file */ u_long va_flags; /* flags defined for file */ dev_t va_rdev; /* device special file represents */ diff --git a/fs/coda/coda_linux.c b/fs/coda/coda_linux.c index 8addcd166908..e4b5f02f0dd4 100644 --- a/fs/coda/coda_linux.c +++ b/fs/coda/coda_linux.c @@ -66,13 +66,8 @@ unsigned short coda_flags_to_cflags(unsigned short flags) return coda_flags; } -static struct timespec64 coda_to_timespec64(struct vtimespec ts) +static struct timespec64 coda_to_timespec64(struct coda_timespec ts) { - /* - * We interpret incoming timestamps as 'signed' to match traditional - * usage and support pre-1970 timestamps, but this breaks in y2038 - * on 32-bit machines. 
- */ struct timespec64 ts64 = { .tv_sec = ts.tv_sec, .tv_nsec = ts.tv_nsec, @@ -81,12 +76,10 @@ static struct timespec64 coda_to_timespec64(struct vtimespec ts) return ts64; } -static struct vtimespec timespec64_to_coda(struct timespec64 ts64) +static struct coda_timespec timespec64_to_coda(struct timespec64 ts64) { - /* clamp the timestamps to the maximum range rather than wrapping */ - struct vtimespec ts = { - .tv_sec = lower_32_bits(clamp_t(time64_t, ts64.tv_sec, - LONG_MIN, LONG_MAX)), + struct coda_timespec ts = { + .tv_sec = ts64.tv_sec, .tv_nsec = ts64.tv_nsec, }; @@ -156,11 +149,11 @@ void coda_iattr_to_vattr(struct iattr *iattr, struct coda_vattr *vattr) vattr->va_uid = (vuid_t) -1; vattr->va_gid = (vgid_t) -1; vattr->va_size = (off_t) -1; - vattr->va_atime.tv_sec = (long) -1; + vattr->va_atime.tv_sec = (int64_t) -1; vattr->va_atime.tv_nsec = (long) -1; - vattr->va_mtime.tv_sec = (long) -1; + vattr->va_mtime.tv_sec = (int64_t) -1; vattr->va_mtime.tv_nsec = (long) -1; - vattr->va_ctime.tv_sec = (long) -1; + vattr->va_ctime.tv_sec = (int64_t) -1; vattr->va_ctime.tv_nsec = (long) -1; vattr->va_type = C_VNON; vattr->va_fileid = -1; diff --git a/include/uapi/linux/coda.h b/include/uapi/linux/coda.h index fc5f7874208a..5dba636b6e11 100644 --- a/include/uapi/linux/coda.h +++ b/include/uapi/linux/coda.h @@ -86,10 +86,6 @@ typedef unsigned long long u_quad_t; #define inline -struct timespec { - long ts_sec; - long ts_nsec; -}; #else /* DJGPP but not KERNEL */ #include typedef unsigned long long u_quad_t; @@ -110,13 +106,6 @@ typedef unsigned long long u_quad_t; #define cdev_t dev_t #endif -#ifdef __CYGWIN32__ -struct timespec { - time_t tv_sec; /* seconds */ - long tv_nsec; /* nanoseconds */ -}; -#endif - #ifndef __BIT_TYPES_DEFINED__ #define __BIT_TYPES_DEFINED__ typedef signed char int8_t; @@ -211,19 +200,10 @@ struct CodaFid { */ enum coda_vtype { C_VNON, C_VREG, C_VDIR, C_VBLK, C_VCHR, C_VLNK, C_VSOCK, C_VFIFO, C_VBAD }; -#ifdef __linux__ -/* - * This matches 
the traditional Linux 'timespec' structure binary layout, - * before using 64-bit time_t everywhere. Overflows in y2038 on 32-bit - * architectures. - */ -struct vtimespec { - long tv_sec; /* seconds */ +struct coda_timespec { + int64_t tv_sec; /* seconds */ long tv_nsec; /* nanoseconds */ }; -#else -#define vtimespec timespec -#endif struct coda_vattr { long va_type; /* vnode type (for create) */ @@ -234,9 +214,9 @@ struct coda_vattr { long va_fileid; /* file id */ u_quad_t va_size; /* file size in bytes */ long va_blocksize; /* blocksize preferred for i/o */ - struct vtimespec va_atime; /* time of last access */ - struct vtimespec va_mtime; /* time of last modification */ - struct vtimespec va_ctime; /* time file changed */ + struct coda_timespec va_atime; /* time of last access */ + struct coda_timespec va_mtime; /* time of last modification */ + struct coda_timespec va_ctime; /* time file changed */ u_long va_gen; /* generation number of file */ u_long va_flags; /* flags defined for file */ cdev_t va_rdev; /* device special file represents */ @@ -301,7 +281,8 @@ struct coda_statfs { #define CIOC_KERNEL_VERSION _IOWR('c', 10, size_t) -#define CODA_KERNEL_VERSION 3 /* 128-bit file identifiers */ +// CODA_KERNEL_VERSION 3 /* 128-bit file identifiers */ +#define CODA_KERNEL_VERSION 4 /* 64-bit timespec */ /* * Venus <-> Coda RPC arguments -- cgit v1.2.3-71-gd317 From 6dc280ebeed2c96a2fb933103dafe655a922b9c1 Mon Sep 17 00:00:00 2001 From: Jan Harkes Date: Tue, 16 Jul 2019 16:28:51 -0700 Subject: coda: remove uapi/linux/coda_psdev.h Nothing is left in this header that is used by userspace. 
Link: http://lkml.kernel.org/r/bb11378cef94739f2cf89425dd6d302a52c64480.1558117389.git.jaharkes@cs.cmu.edu Signed-off-by: Jan Harkes Cc: Arnd Bergmann Cc: Colin Ian King Cc: Dan Carpenter Cc: David Howells Cc: Fabian Frederick Cc: Mikko Rapeli Cc: Sam Protsenko Cc: Yann Droneaud Cc: Zhouyang Jia Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/coda/coda_psdev.h | 5 ++++- include/uapi/linux/coda_psdev.h | 10 ---------- 2 files changed, 4 insertions(+), 11 deletions(-) delete mode 100644 include/uapi/linux/coda_psdev.h (limited to 'include/uapi/linux') diff --git a/fs/coda/coda_psdev.h b/fs/coda/coda_psdev.h index 012e16f741a6..801423cbbdfc 100644 --- a/fs/coda/coda_psdev.h +++ b/fs/coda/coda_psdev.h @@ -3,8 +3,11 @@ #define __CODA_PSDEV_H #include +#include #include -#include + +#define CODA_PSDEV_MAJOR 67 +#define MAX_CODADEVS 5 /* how many do we allow */ struct kstatfs; diff --git a/include/uapi/linux/coda_psdev.h b/include/uapi/linux/coda_psdev.h deleted file mode 100644 index 3dacb7fad66a..000000000000 --- a/include/uapi/linux/coda_psdev.h +++ /dev/null @@ -1,10 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note */ -#ifndef _UAPI__CODA_PSDEV_H -#define _UAPI__CODA_PSDEV_H - -#include - -#define CODA_PSDEV_MAJOR 67 -#define MAX_CODADEVS 5 /* how many do we allow */ - -#endif /* _UAPI__CODA_PSDEV_H */ -- cgit v1.2.3-71-gd317 From a9fba24c6ac9b66c09dfc2a0e845ecace187e89c Mon Sep 17 00:00:00 2001 From: Pedro Cuadra Date: Tue, 16 Jul 2019 16:29:13 -0700 Subject: coda: add hinting support for partial file caching This adds support for partial file caching in Coda. Every read, write and mmap informs the userspace cache manager about what part of a file is about to be accessed so that the cache manager can ensure the relevant parts are available before the operation is allowed to proceed. When a read or write operation completes, this is also reported to allow the cache manager to track when partially cached content can be released. 
If the cache manager does not support partial file caching, or when the entire file has been fetched into the local cache, the cache manager may return an EOPNOTSUPP error to indicate that intent upcalls are no longer necessary until the file is closed. [akpm@linux-foundation.org: little whitespace fixup] Link: http://lkml.kernel.org/r/20190618181301.6960-1-jaharkes@cs.cmu.edu Signed-off-by: Pedro Cuadra Signed-off-by: Jan Harkes Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/coda/coda_fs_i.h | 1 + fs/coda/coda_psdev.h | 3 ++ fs/coda/file.c | 61 +++++++++++++++++++++++++++++++++-------- fs/coda/psdev.c | 2 +- fs/coda/upcall.c | 70 +++++++++++++++++++++++++++++++++++++++-------- include/uapi/linux/coda.h | 29 ++++++++++++++++++-- 6 files changed, 139 insertions(+), 27 deletions(-) (limited to 'include/uapi/linux') diff --git a/fs/coda/coda_fs_i.h b/fs/coda/coda_fs_i.h index c99d574d1c43..1763ff95d865 100644 --- a/fs/coda/coda_fs_i.h +++ b/fs/coda/coda_fs_i.h @@ -40,6 +40,7 @@ struct coda_file_info { int cfi_magic; /* magic number */ struct file *cfi_container; /* container file for this cnode */ unsigned int cfi_mapcount; /* nr of times this file is mapped */ + bool cfi_access_intent; /* is access intent supported */ }; /* flags */ diff --git a/fs/coda/coda_psdev.h b/fs/coda/coda_psdev.h index 801423cbbdfc..52da08c770b0 100644 --- a/fs/coda/coda_psdev.h +++ b/fs/coda/coda_psdev.h @@ -83,6 +83,9 @@ int coda_downcall(struct venus_comm *vcp, int opcode, union outputArgs *out, size_t nbytes); int venus_fsync(struct super_block *sb, struct CodaFid *fid); int venus_statfs(struct dentry *dentry, struct kstatfs *sfs); +int venus_access_intent(struct super_block *sb, struct CodaFid *fid, + bool *access_intent_supported, + size_t count, loff_t ppos, int type); /* * Statistics diff --git a/fs/coda/file.c b/fs/coda/file.c index 0dbd13ab72e3..128d63df5bfb 100644 --- a/fs/coda/file.c +++ b/fs/coda/file.c @@ -20,6 +20,7 @@ #include #include #include +#include 
#include #include "coda_psdev.h" @@ -37,9 +38,25 @@ static ssize_t coda_file_read_iter(struct kiocb *iocb, struct iov_iter *to) { struct file *coda_file = iocb->ki_filp; + struct inode *coda_inode = file_inode(coda_file); struct coda_file_info *cfi = coda_ftoc(coda_file); + loff_t ki_pos = iocb->ki_pos; + size_t count = iov_iter_count(to); + ssize_t ret; + + ret = venus_access_intent(coda_inode->i_sb, coda_i2f(coda_inode), + &cfi->cfi_access_intent, + count, ki_pos, CODA_ACCESS_TYPE_READ); + if (ret) + goto finish_read; - return vfs_iter_read(cfi->cfi_container, to, &iocb->ki_pos, 0); + ret = vfs_iter_read(cfi->cfi_container, to, &iocb->ki_pos, 0); + +finish_read: + venus_access_intent(coda_inode->i_sb, coda_i2f(coda_inode), + &cfi->cfi_access_intent, + count, ki_pos, CODA_ACCESS_TYPE_READ_FINISH); + return ret; } static ssize_t @@ -48,10 +65,17 @@ coda_file_write_iter(struct kiocb *iocb, struct iov_iter *to) struct file *coda_file = iocb->ki_filp; struct inode *coda_inode = file_inode(coda_file); struct coda_file_info *cfi = coda_ftoc(coda_file); - struct file *host_file; + struct file *host_file = cfi->cfi_container; + loff_t ki_pos = iocb->ki_pos; + size_t count = iov_iter_count(to); ssize_t ret; - host_file = cfi->cfi_container; + ret = venus_access_intent(coda_inode->i_sb, coda_i2f(coda_inode), + &cfi->cfi_access_intent, + count, ki_pos, CODA_ACCESS_TYPE_WRITE); + if (ret) + goto finish_write; + file_start_write(host_file); inode_lock(coda_inode); ret = vfs_iter_write(cfi->cfi_container, to, &iocb->ki_pos, 0); @@ -60,6 +84,11 @@ coda_file_write_iter(struct kiocb *iocb, struct iov_iter *to) coda_inode->i_mtime = coda_inode->i_ctime = current_time(coda_inode); inode_unlock(coda_inode); file_end_write(host_file); + +finish_write: + venus_access_intent(coda_inode->i_sb, coda_i2f(coda_inode), + &cfi->cfi_access_intent, + count, ki_pos, CODA_ACCESS_TYPE_WRITE_FINISH); return ret; } @@ -94,29 +123,35 @@ coda_vm_close(struct vm_area_struct *vma) static int 
coda_file_mmap(struct file *coda_file, struct vm_area_struct *vma) { - struct coda_file_info *cfi; + struct inode *coda_inode = file_inode(coda_file); + struct coda_file_info *cfi = coda_ftoc(coda_file); + struct file *host_file = cfi->cfi_container; + struct inode *host_inode = file_inode(host_file); struct coda_inode_info *cii; - struct file *host_file; - struct inode *coda_inode, *host_inode; struct coda_vm_ops *cvm_ops; + loff_t ppos; + size_t count; int ret; - cfi = coda_ftoc(coda_file); - host_file = cfi->cfi_container; - if (!host_file->f_op->mmap) return -ENODEV; if (WARN_ON(coda_file != vma->vm_file)) return -EIO; + count = vma->vm_end - vma->vm_start; + ppos = vma->vm_pgoff * PAGE_SIZE; + + ret = venus_access_intent(coda_inode->i_sb, coda_i2f(coda_inode), + &cfi->cfi_access_intent, + count, ppos, CODA_ACCESS_TYPE_MMAP); + if (ret) + return ret; + cvm_ops = kmalloc(sizeof(struct coda_vm_ops), GFP_KERNEL); if (!cvm_ops) return -ENOMEM; - coda_inode = file_inode(coda_file); - host_inode = file_inode(host_file); - cii = ITOC(coda_inode); spin_lock(&cii->c_lock); coda_file->f_mapping = host_file->f_mapping; @@ -188,6 +223,8 @@ int coda_open(struct inode *coda_inode, struct file *coda_file) cfi->cfi_magic = CODA_MAGIC; cfi->cfi_mapcount = 0; cfi->cfi_container = host_file; + /* assume access intents are supported unless we hear otherwise */ + cfi->cfi_access_intent = true; BUG_ON(coda_file->private_data != NULL); coda_file->private_data = cfi; diff --git a/fs/coda/psdev.c b/fs/coda/psdev.c index ebfbbea9fa48..240669f51eac 100644 --- a/fs/coda/psdev.c +++ b/fs/coda/psdev.c @@ -388,7 +388,7 @@ MODULE_AUTHOR("Jan Harkes, Peter J. 
Braam"); MODULE_DESCRIPTION("Coda Distributed File System VFS interface"); MODULE_ALIAS_CHARDEV_MAJOR(CODA_PSDEV_MAJOR); MODULE_LICENSE("GPL"); -MODULE_VERSION("6.11"); +MODULE_VERSION("7.0"); static int __init init_coda(void) { diff --git a/fs/coda/upcall.c b/fs/coda/upcall.c index 15c0e4fdb0e3..eb3b1898da46 100644 --- a/fs/coda/upcall.c +++ b/fs/coda/upcall.c @@ -569,6 +569,47 @@ int venus_statfs(struct dentry *dentry, struct kstatfs *sfs) return error; } +int venus_access_intent(struct super_block *sb, struct CodaFid *fid, + bool *access_intent_supported, + size_t count, loff_t ppos, int type) +{ + union inputArgs *inp; + union outputArgs *outp; + int insize, outsize, error; + bool finalizer = + type == CODA_ACCESS_TYPE_READ_FINISH || + type == CODA_ACCESS_TYPE_WRITE_FINISH; + + if (!*access_intent_supported && !finalizer) + return 0; + + insize = SIZE(access_intent); + UPARG(CODA_ACCESS_INTENT); + + inp->coda_access_intent.VFid = *fid; + inp->coda_access_intent.count = count; + inp->coda_access_intent.pos = ppos; + inp->coda_access_intent.type = type; + + error = coda_upcall(coda_vcp(sb), insize, + finalizer ? NULL : &outsize, inp); + + /* + * we have to free the request buffer for synchronous upcalls + * or when asynchronous upcalls fail, but not when asynchronous + * upcalls succeed + */ + if (!finalizer || error) + kvfree(inp); + + /* Chunked access is not supported or an old Coda client */ + if (error == -EOPNOTSUPP) { + *access_intent_supported = false; + error = 0; + } + return error; +} + /* * coda_upcall and coda_downcall routines. 
*/ @@ -598,10 +639,12 @@ static void coda_unblock_signals(sigset_t *old) * has seen them, * - CODA_CLOSE or CODA_RELEASE upcall (to avoid reference count problems) * - CODA_STORE (to avoid data loss) + * - CODA_ACCESS_INTENT (to avoid reference count problems) */ #define CODA_INTERRUPTIBLE(r) (!coda_hard && \ (((r)->uc_opcode != CODA_CLOSE && \ (r)->uc_opcode != CODA_STORE && \ + (r)->uc_opcode != CODA_ACCESS_INTENT && \ (r)->uc_opcode != CODA_RELEASE) || \ (r)->uc_flags & CODA_REQ_READ)) @@ -687,21 +730,25 @@ static int coda_upcall(struct venus_comm *vcp, goto exit; } + buffer->ih.unique = ++vcp->vc_seq; + req->uc_data = (void *)buffer; - req->uc_flags = 0; + req->uc_flags = outSize ? 0 : CODA_REQ_ASYNC; req->uc_inSize = inSize; - req->uc_outSize = *outSize ? *outSize : inSize; - req->uc_opcode = ((union inputArgs *)buffer)->ih.opcode; - req->uc_unique = ++vcp->vc_seq; + req->uc_outSize = (outSize && *outSize) ? *outSize : inSize; + req->uc_opcode = buffer->ih.opcode; + req->uc_unique = buffer->ih.unique; init_waitqueue_head(&req->uc_sleep); - /* Fill in the common input args. */ - ((union inputArgs *)buffer)->ih.unique = req->uc_unique; - /* Append msg to pending queue and poke Venus. */ list_add_tail(&req->uc_chain, &vcp->vc_pending); - wake_up_interruptible(&vcp->vc_waitq); + + if (req->uc_flags & CODA_REQ_ASYNC) { + mutex_unlock(&vcp->vc_mutex); + return 0; + } + /* We can be interrupted while we wait for Venus to process * our request. If the interrupt occurs before Venus has read * the request, we dequeue and return. 
If it occurs after the @@ -743,20 +790,20 @@ static int coda_upcall(struct venus_comm *vcp, sig_req = kmalloc(sizeof(struct upc_req), GFP_KERNEL); if (!sig_req) goto exit; - sig_req->uc_data = kvzalloc(sizeof(struct coda_in_hdr), GFP_KERNEL); - if (!sig_req->uc_data) { + sig_inputArgs = kvzalloc(sizeof(struct coda_in_hdr), GFP_KERNEL); + if (!sig_inputArgs) { kfree(sig_req); goto exit; } error = -EINTR; - sig_inputArgs = (union inputArgs *)sig_req->uc_data; sig_inputArgs->ih.opcode = CODA_SIGNAL; sig_inputArgs->ih.unique = req->uc_unique; sig_req->uc_flags = CODA_REQ_ASYNC; sig_req->uc_opcode = sig_inputArgs->ih.opcode; sig_req->uc_unique = sig_inputArgs->ih.unique; + sig_req->uc_data = (void *)sig_inputArgs; sig_req->uc_inSize = sizeof(struct coda_in_hdr); sig_req->uc_outSize = sizeof(struct coda_in_hdr); @@ -911,4 +958,3 @@ unlock_out: iput(inode); return 0; } - diff --git a/include/uapi/linux/coda.h b/include/uapi/linux/coda.h index 5dba636b6e11..aa34c2dcae8d 100644 --- a/include/uapi/linux/coda.h +++ b/include/uapi/linux/coda.h @@ -271,7 +271,8 @@ struct coda_statfs { #define CODA_STATFS 34 #define CODA_STORE 35 #define CODA_RELEASE 36 -#define CODA_NCALLS 37 +#define CODA_ACCESS_INTENT 37 +#define CODA_NCALLS 38 #define DOWNCALL(opcode) (opcode >= CODA_REPLACE && opcode <= CODA_PURGEFID) @@ -281,8 +282,12 @@ struct coda_statfs { #define CIOC_KERNEL_VERSION _IOWR('c', 10, size_t) +// CODA_KERNEL_VERSION 0 /* don't care about kernel version number */ +// CODA_KERNEL_VERSION 1 /* The old venus 4.6 compatible interface */ +// CODA_KERNEL_VERSION 2 /* venus_lookup gets an extra parameter */ // CODA_KERNEL_VERSION 3 /* 128-bit file identifiers */ -#define CODA_KERNEL_VERSION 4 /* 64-bit timespec */ +// CODA_KERNEL_VERSION 4 /* 64-bit timespec */ +#define CODA_KERNEL_VERSION 5 /* access intent support */ /* * Venus <-> Coda RPC arguments @@ -637,6 +642,25 @@ struct coda_statfs_out { struct coda_statfs stat; }; +#define CODA_ACCESS_TYPE_READ 1 +#define 
CODA_ACCESS_TYPE_WRITE 2 +#define CODA_ACCESS_TYPE_MMAP 3 +#define CODA_ACCESS_TYPE_READ_FINISH 4 +#define CODA_ACCESS_TYPE_WRITE_FINISH 5 + +/* coda_access_intent: NO_OUT */ +struct coda_access_intent_in { + struct coda_in_hdr ih; + struct CodaFid VFid; + int count; + int pos; + int type; +}; + +struct coda_access_intent_out { + struct coda_out_hdr out; +}; + /* * Occasionally, we don't cache the fid returned by CODA_LOOKUP. * For instance, if the fid is inconsistent. @@ -668,6 +692,7 @@ union inputArgs { struct coda_open_by_fd_in coda_open_by_fd; struct coda_open_by_path_in coda_open_by_path; struct coda_statfs_in coda_statfs; + struct coda_access_intent_in coda_access_intent; }; union outputArgs { -- cgit v1.2.3-71-gd317 From 201766a20e30f982ccfe36bebfad9602c3ff574a Mon Sep 17 00:00:00 2001 From: Elvira Khabirova Date: Tue, 16 Jul 2019 16:29:42 -0700 Subject: ptrace: add PTRACE_GET_SYSCALL_INFO request PTRACE_GET_SYSCALL_INFO is a generic ptrace API that lets ptracer obtain details of the syscall the tracee is blocked in. There are two reasons for a special syscall-related ptrace request. Firstly, with the current ptrace API there are cases when ptracer cannot retrieve necessary information about syscalls. Some examples include: * The notorious int-0x80-from-64-bit-task issue. See [1] for details. In short, if a 64-bit task performs a syscall through int 0x80, its tracer has no reliable means to find out that the syscall was, in fact, a compat syscall, and misidentifies it. * Syscall-enter-stop and syscall-exit-stop look the same for the tracer. Common practice is to keep track of the sequence of ptrace-stops in order not to mix the two syscall-stops up. But it is not as simple as it looks; for example, strace had a (just recently fixed) long-standing bug where attaching strace to a tracee that is performing the execve system call led to the tracer identifying the following syscall-exit-stop as syscall-enter-stop, which messed up all the state tracking. 
* Since the introduction of commit 84d77d3f06e7 ("ptrace: Don't allow accessing an undumpable mm"), both PTRACE_PEEKDATA and process_vm_readv become unavailable when the process dumpable flag is cleared. On such architectures as ia64 this results in all syscall arguments being unavailable for the tracer. Secondly, ptracers also have to support a lot of arch-specific code for obtaining information about the tracee. For some architectures, this requires a ptrace(PTRACE_PEEKUSER, ...) invocation for every syscall argument and return value. ptrace(2) man page: long ptrace(enum __ptrace_request request, pid_t pid, void *addr, void *data); ... PTRACE_GET_SYSCALL_INFO Retrieve information about the syscall that caused the stop. The information is placed into the buffer pointed to by the "data" argument, which should be a pointer to a buffer of type "struct ptrace_syscall_info". The "addr" argument contains the size of the buffer pointed to by the "data" argument (i.e., sizeof(struct ptrace_syscall_info)). The return value contains the number of bytes available to be written by the kernel. If the size of data to be written by the kernel exceeds the size specified by the "addr" argument, the output is truncated. [ldv@altlinux.org: selftests/seccomp/seccomp_bpf: update for PTRACE_GET_SYSCALL_INFO] Link: http://lkml.kernel.org/r/20190708182904.GA12332@altlinux.org Link: http://lkml.kernel.org/r/20190510152842.GF28558@altlinux.org Signed-off-by: Elvira Khabirova Co-developed-by: Dmitry V. Levin Signed-off-by: Dmitry V. Levin Reviewed-by: Oleg Nesterov Reviewed-by: Kees Cook Reviewed-by: Andy Lutomirski Cc: Eugene Syromyatnikov Cc: Benjamin Herrenschmidt Cc: Greentime Hu Cc: Helge Deller [parisc] Cc: James E.J.
Bottomley Cc: James Hogan Cc: kbuild test robot Cc: Michael Ellerman Cc: Paul Burton Cc: Paul Mackerras Cc: Ralf Baechle Cc: Richard Kuo Cc: Shuah Khan Cc: Vincent Chen Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/tracehook.h | 9 ++- include/uapi/linux/ptrace.h | 35 +++++++++ kernel/ptrace.c | 101 +++++++++++++++++++++++++- tools/testing/selftests/seccomp/seccomp_bpf.c | 13 +++- 4 files changed, 150 insertions(+), 8 deletions(-) (limited to 'include/uapi/linux') diff --git a/include/linux/tracehook.h b/include/linux/tracehook.h index 8446573cc682..36fb3bbed6b2 100644 --- a/include/linux/tracehook.h +++ b/include/linux/tracehook.h @@ -54,13 +54,15 @@ struct linux_binprm; /* * ptrace report for syscall entry and exit looks identical. */ -static inline int ptrace_report_syscall(struct pt_regs *regs) +static inline int ptrace_report_syscall(struct pt_regs *regs, + unsigned long message) { int ptrace = current->ptrace; if (!(ptrace & PT_PTRACED)) return 0; + current->ptrace_message = message; ptrace_notify(SIGTRAP | ((ptrace & PT_TRACESYSGOOD) ? 
0x80 : 0)); /* @@ -73,6 +75,7 @@ static inline int ptrace_report_syscall(struct pt_regs *regs) current->exit_code = 0; } + current->ptrace_message = 0; return fatal_signal_pending(current); } @@ -98,7 +101,7 @@ static inline int ptrace_report_syscall(struct pt_regs *regs) static inline __must_check int tracehook_report_syscall_entry( struct pt_regs *regs) { - return ptrace_report_syscall(regs); + return ptrace_report_syscall(regs, PTRACE_EVENTMSG_SYSCALL_ENTRY); } /** @@ -123,7 +126,7 @@ static inline void tracehook_report_syscall_exit(struct pt_regs *regs, int step) if (step) user_single_step_report(regs); else - ptrace_report_syscall(regs); + ptrace_report_syscall(regs, PTRACE_EVENTMSG_SYSCALL_EXIT); } /** diff --git a/include/uapi/linux/ptrace.h b/include/uapi/linux/ptrace.h index d5a1b8a492b9..a71b6e3b03eb 100644 --- a/include/uapi/linux/ptrace.h +++ b/include/uapi/linux/ptrace.h @@ -73,6 +73,41 @@ struct seccomp_metadata { __u64 flags; /* Output: filter's flags */ }; +#define PTRACE_GET_SYSCALL_INFO 0x420e +#define PTRACE_SYSCALL_INFO_NONE 0 +#define PTRACE_SYSCALL_INFO_ENTRY 1 +#define PTRACE_SYSCALL_INFO_EXIT 2 +#define PTRACE_SYSCALL_INFO_SECCOMP 3 + +struct ptrace_syscall_info { + __u8 op; /* PTRACE_SYSCALL_INFO_* */ + __u32 arch __attribute__((__aligned__(sizeof(__u32)))); + __u64 instruction_pointer; + __u64 stack_pointer; + union { + struct { + __u64 nr; + __u64 args[6]; + } entry; + struct { + __s64 rval; + __u8 is_error; + } exit; + struct { + __u64 nr; + __u64 args[6]; + __u32 ret_data; + } seccomp; + }; +}; + +/* + * These values are stored in task->ptrace_message + * by tracehook_report_syscall_* to describe the current syscall-stop. 
+ */ +#define PTRACE_EVENTMSG_SYSCALL_ENTRY 1 +#define PTRACE_EVENTMSG_SYSCALL_EXIT 2 + /* Read signals from a shared (process wide) queue */ #define PTRACE_PEEKSIGINFO_SHARED (1 << 0) diff --git a/kernel/ptrace.c b/kernel/ptrace.c index 83a531cea2f3..cb9ddcc08119 100644 --- a/kernel/ptrace.c +++ b/kernel/ptrace.c @@ -32,6 +32,8 @@ #include #include +#include /* for syscall_get_* */ + /* * Access another process' address space via ptrace. * Source/target buffer must be kernel space, @@ -897,7 +899,100 @@ static int ptrace_regset(struct task_struct *task, int req, unsigned int type, * to ensure no machine forgets it. */ EXPORT_SYMBOL_GPL(task_user_regset_view); -#endif + +static unsigned long +ptrace_get_syscall_info_entry(struct task_struct *child, struct pt_regs *regs, + struct ptrace_syscall_info *info) +{ + unsigned long args[ARRAY_SIZE(info->entry.args)]; + int i; + + info->op = PTRACE_SYSCALL_INFO_ENTRY; + info->entry.nr = syscall_get_nr(child, regs); + syscall_get_arguments(child, regs, args); + for (i = 0; i < ARRAY_SIZE(args); i++) + info->entry.args[i] = args[i]; + + /* args is the last field in struct ptrace_syscall_info.entry */ + return offsetofend(struct ptrace_syscall_info, entry.args); +} + +static unsigned long +ptrace_get_syscall_info_seccomp(struct task_struct *child, struct pt_regs *regs, + struct ptrace_syscall_info *info) +{ + /* + * As struct ptrace_syscall_info.entry is currently a subset + * of struct ptrace_syscall_info.seccomp, it makes sense to + * initialize that subset using ptrace_get_syscall_info_entry(). + * This can be reconsidered in the future if these structures + * diverge significantly enough. 
+ */ + ptrace_get_syscall_info_entry(child, regs, info); + info->op = PTRACE_SYSCALL_INFO_SECCOMP; + info->seccomp.ret_data = child->ptrace_message; + + /* ret_data is the last field in struct ptrace_syscall_info.seccomp */ + return offsetofend(struct ptrace_syscall_info, seccomp.ret_data); +} + +static unsigned long +ptrace_get_syscall_info_exit(struct task_struct *child, struct pt_regs *regs, + struct ptrace_syscall_info *info) +{ + info->op = PTRACE_SYSCALL_INFO_EXIT; + info->exit.rval = syscall_get_error(child, regs); + info->exit.is_error = !!info->exit.rval; + if (!info->exit.is_error) + info->exit.rval = syscall_get_return_value(child, regs); + + /* is_error is the last field in struct ptrace_syscall_info.exit */ + return offsetofend(struct ptrace_syscall_info, exit.is_error); +} + +static int +ptrace_get_syscall_info(struct task_struct *child, unsigned long user_size, + void __user *datavp) +{ + struct pt_regs *regs = task_pt_regs(child); + struct ptrace_syscall_info info = { + .op = PTRACE_SYSCALL_INFO_NONE, + .arch = syscall_get_arch(child), + .instruction_pointer = instruction_pointer(regs), + .stack_pointer = user_stack_pointer(regs), + }; + unsigned long actual_size = offsetof(struct ptrace_syscall_info, entry); + unsigned long write_size; + + /* + * This does not need lock_task_sighand() to access + * child->last_siginfo because ptrace_freeze_traced() + * called earlier by ptrace_check_attach() ensures that + * the tracee cannot go away and clear its last_siginfo. + */ + switch (child->last_siginfo ? 
child->last_siginfo->si_code : 0) { + case SIGTRAP | 0x80: + switch (child->ptrace_message) { + case PTRACE_EVENTMSG_SYSCALL_ENTRY: + actual_size = ptrace_get_syscall_info_entry(child, regs, + &info); + break; + case PTRACE_EVENTMSG_SYSCALL_EXIT: + actual_size = ptrace_get_syscall_info_exit(child, regs, + &info); + break; + } + break; + case SIGTRAP | (PTRACE_EVENT_SECCOMP << 8): + actual_size = ptrace_get_syscall_info_seccomp(child, regs, + &info); + break; + } + + write_size = min(actual_size, user_size); + return copy_to_user(datavp, &info, write_size) ? -EFAULT : actual_size; +} +#endif /* CONFIG_HAVE_ARCH_TRACEHOOK */ int ptrace_request(struct task_struct *child, long request, unsigned long addr, unsigned long data) @@ -1114,6 +1209,10 @@ int ptrace_request(struct task_struct *child, long request, ret = __put_user(kiov.iov_len, &uiov->iov_len); break; } + + case PTRACE_GET_SYSCALL_INFO: + ret = ptrace_get_syscall_info(child, addr, datavp); + break; #endif case PTRACE_SECCOMP_GET_FILTER: diff --git a/tools/testing/selftests/seccomp/seccomp_bpf.c b/tools/testing/selftests/seccomp/seccomp_bpf.c index dc66fe852768..6ef7f16c4cf5 100644 --- a/tools/testing/selftests/seccomp/seccomp_bpf.c +++ b/tools/testing/selftests/seccomp/seccomp_bpf.c @@ -1775,13 +1775,18 @@ void tracer_ptrace(struct __test_metadata *_metadata, pid_t tracee, unsigned long msg; static bool entry; - /* Make sure we got an empty message. */ + /* + * The traditional way to tell PTRACE_SYSCALL entry/exit + * is by counting. + */ + entry = !entry; + + /* Make sure we got an appropriate message. */ ret = ptrace(PTRACE_GETEVENTMSG, tracee, NULL, &msg); EXPECT_EQ(0, ret); - EXPECT_EQ(0, msg); + EXPECT_EQ(entry ? PTRACE_EVENTMSG_SYSCALL_ENTRY + : PTRACE_EVENTMSG_SYSCALL_EXIT, msg); - /* The only way to tell PTRACE_SYSCALL entry/exit is by counting. 
*/ - entry = !entry; if (!entry) return; -- cgit v1.2.3-71-gd317 From 8c2e408e73f735d2e6e8b43f9b038c9abb082939 Mon Sep 17 00:00:00 2001 From: Pankaj Gupta Date: Fri, 12 Jul 2019 10:46:10 +0530 Subject: virtio_pmem: fix sparse warning This patch fixes below sparse warning related to __virtio type in virtio pmem driver. This is reported by Intel test bot on linux-next tree. nd_virtio.c:56:28: warning: incorrect type in assignment (different base types) nd_virtio.c:56:28: expected unsigned int [unsigned] [usertype] type nd_virtio.c:56:28: got restricted __virtio32 nd_virtio.c:93:59: warning: incorrect type in argument 2 (different base types) nd_virtio.c:93:59: expected restricted __virtio32 [usertype] val nd_virtio.c:93:59: got unsigned int [unsigned] [usertype] ret Reported-by: kbuild test robot Signed-off-by: Pankaj Gupta Acked-by: Michael S. Tsirkin Signed-off-by: Dan Williams --- drivers/nvdimm/nd_virtio.c | 4 ++-- include/uapi/linux/virtio_pmem.h | 4 ++-- 2 files changed, 4 insertions(+), 4 deletions(-) (limited to 'include/uapi/linux') diff --git a/drivers/nvdimm/nd_virtio.c b/drivers/nvdimm/nd_virtio.c index 8645275c08c2..10351d5b49fa 100644 --- a/drivers/nvdimm/nd_virtio.c +++ b/drivers/nvdimm/nd_virtio.c @@ -53,7 +53,7 @@ static int virtio_pmem_flush(struct nd_region *nd_region) init_waitqueue_head(&req_data->host_acked); init_waitqueue_head(&req_data->wq_buf); INIT_LIST_HEAD(&req_data->list); - req_data->req.type = cpu_to_virtio32(vdev, VIRTIO_PMEM_REQ_TYPE_FLUSH); + req_data->req.type = cpu_to_le32(VIRTIO_PMEM_REQ_TYPE_FLUSH); sg_init_one(&sg, &req_data->req, sizeof(req_data->req)); sgs[0] = &sg; sg_init_one(&ret, &req_data->resp.ret, sizeof(req_data->resp)); @@ -90,7 +90,7 @@ static int virtio_pmem_flush(struct nd_region *nd_region) } else { /* A host repsonse results in "host_ack" getting called */ wait_event(req_data->host_acked, req_data->done); - err = virtio32_to_cpu(vdev, req_data->resp.ret); + err = le32_to_cpu(req_data->resp.ret); } 
kfree(req_data); diff --git a/include/uapi/linux/virtio_pmem.h b/include/uapi/linux/virtio_pmem.h index efcd72f2d20d..9a63ed6d062f 100644 --- a/include/uapi/linux/virtio_pmem.h +++ b/include/uapi/linux/virtio_pmem.h @@ -23,12 +23,12 @@ struct virtio_pmem_config { struct virtio_pmem_resp { /* Host return status corresponding to flush request */ - __u32 ret; + __le32 ret; }; struct virtio_pmem_req { /* command type */ - __u32 type; + __le32 type; }; #endif -- cgit v1.2.3-71-gd317 From 5edaac063bbf1267260ad2a5b9bb803399343e58 Mon Sep 17 00:00:00 2001 From: John Crispin Date: Thu, 27 Jun 2019 11:58:32 +0200 Subject: nl80211: fix NL80211_HE_MAX_CAPABILITY_LEN NL80211_HE_MAX_CAPABILITY_LEN has changed between D2.0 and D4.0. It is now MAC (6) + PHY (11) + MCS (12) + PPE (25) = 54. Signed-off-by: John Crispin Link: https://lore.kernel.org/r/20190627095832.19445-1-john@phrozen.org Signed-off-by: Johannes Berg --- include/uapi/linux/nl80211.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/uapi/linux') diff --git a/include/uapi/linux/nl80211.h b/include/uapi/linux/nl80211.h index 75758ec26c8b..beb9a9d0c00a 100644 --- a/include/uapi/linux/nl80211.h +++ b/include/uapi/linux/nl80211.h @@ -2863,7 +2863,7 @@ enum nl80211_attrs { #define NL80211_HT_CAPABILITY_LEN 26 #define NL80211_VHT_CAPABILITY_LEN 12 #define NL80211_HE_MIN_CAPABILITY_LEN 16 -#define NL80211_HE_MAX_CAPABILITY_LEN 51 +#define NL80211_HE_MAX_CAPABILITY_LEN 54 #define NL80211_MAX_NR_CIPHER_SUITES 5 #define NL80211_MAX_NR_AKM_SUITES 2 -- cgit v1.2.3-71-gd317 From 5d4b45a1dd7b00feab57624035dcdbc1bab2e0f8 Mon Sep 17 00:00:00 2001 From: Markus Koch Date: Sun, 21 Jul 2019 20:20:28 +0300 Subject: Input: add support for the FlySky FS-iA6B RC receiver This patch adds support for the FlySky FS-iA6B RC receiver (serial IBUS). It allows the usage of the FlySky FS-i6 and other AFHDS compliant remote controls as a joystick input device. 
To use it, a patch to inputattach which adds the FS-iA6B as a 115200 baud serial device is required. I will upstream it after this patch is merged. More information about the hardware can be found here: https://notsyncing.net/?p=blog&b=2018.linux-fsia6b Signed-off-by: Markus Koch Signed-off-by: Dmitry Torokhov --- MAINTAINERS | 6 ++ drivers/input/joystick/Kconfig | 10 ++ drivers/input/joystick/Makefile | 5 +- drivers/input/joystick/fsia6b.c | 231 ++++++++++++++++++++++++++++++++++++++++ include/uapi/linux/serio.h | 1 + 5 files changed, 251 insertions(+), 2 deletions(-) create mode 100644 drivers/input/joystick/fsia6b.c (limited to 'include/uapi/linux') diff --git a/MAINTAINERS b/MAINTAINERS index 677ef41cb012..cdd25b0a1218 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -12394,6 +12394,12 @@ S: Maintained F: Documentation/input/devices/pxrc.rst F: drivers/input/joystick/pxrc.c +FLYSKY FSIA6B RC RECEIVER +M: Markus Koch +L: linux-input@vger.kernel.org +S: Maintained +F: drivers/input/joystick/fsia6b.c + PHONET PROTOCOL M: Remi Denis-Courmont S: Supported diff --git a/drivers/input/joystick/Kconfig b/drivers/input/joystick/Kconfig index 72b932901d00..312b854b5506 100644 --- a/drivers/input/joystick/Kconfig +++ b/drivers/input/joystick/Kconfig @@ -362,4 +362,14 @@ config JOYSTICK_PXRC To compile this driver as a module, choose M here: the module will be called pxrc. +config JOYSTICK_FSIA6B + tristate "FlySky FS-iA6B RC Receiver" + select SERIO + help + Say Y here if you use a FlySky FS-i6 RC remote control along with the + FS-iA6B RC receiver as a joystick input device. + + To compile this driver as a module, choose M here: the + module will be called fsia6b. 
+ endif diff --git a/drivers/input/joystick/Makefile b/drivers/input/joystick/Makefile index dd0492ebbed7..8656023f6ef5 100644 --- a/drivers/input/joystick/Makefile +++ b/drivers/input/joystick/Makefile @@ -12,6 +12,7 @@ obj-$(CONFIG_JOYSTICK_AS5011) += as5011.o obj-$(CONFIG_JOYSTICK_ANALOG) += analog.o obj-$(CONFIG_JOYSTICK_COBRA) += cobra.o obj-$(CONFIG_JOYSTICK_DB9) += db9.o +obj-$(CONFIG_JOYSTICK_FSIA6B) += fsia6b.o obj-$(CONFIG_JOYSTICK_GAMECON) += gamecon.o obj-$(CONFIG_JOYSTICK_GF2K) += gf2k.o obj-$(CONFIG_JOYSTICK_GRIP) += grip.o @@ -23,7 +24,7 @@ obj-$(CONFIG_JOYSTICK_JOYDUMP) += joydump.o obj-$(CONFIG_JOYSTICK_MAGELLAN) += magellan.o obj-$(CONFIG_JOYSTICK_MAPLE) += maplecontrol.o obj-$(CONFIG_JOYSTICK_PSXPAD_SPI) += psxpad-spi.o -obj-$(CONFIG_JOYSTICK_PXRC) += pxrc.o +obj-$(CONFIG_JOYSTICK_PXRC) += pxrc.o obj-$(CONFIG_JOYSTICK_SIDEWINDER) += sidewinder.o obj-$(CONFIG_JOYSTICK_SPACEBALL) += spaceball.o obj-$(CONFIG_JOYSTICK_SPACEORB) += spaceorb.o @@ -32,7 +33,7 @@ obj-$(CONFIG_JOYSTICK_TMDC) += tmdc.o obj-$(CONFIG_JOYSTICK_TURBOGRAFX) += turbografx.o obj-$(CONFIG_JOYSTICK_TWIDJOY) += twidjoy.o obj-$(CONFIG_JOYSTICK_WARRIOR) += warrior.o +obj-$(CONFIG_JOYSTICK_WALKERA0701) += walkera0701.o obj-$(CONFIG_JOYSTICK_XPAD) += xpad.o obj-$(CONFIG_JOYSTICK_ZHENHUA) += zhenhua.o -obj-$(CONFIG_JOYSTICK_WALKERA0701) += walkera0701.o diff --git a/drivers/input/joystick/fsia6b.c b/drivers/input/joystick/fsia6b.c new file mode 100644 index 000000000000..e78c4c768990 --- /dev/null +++ b/drivers/input/joystick/fsia6b.c @@ -0,0 +1,231 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * FS-iA6B iBus RC receiver driver + * + * This driver provides all 14 channels of the FlySky FS-iA6B RC receiver + * as analog values. + * + * Additionally, the channels can be converted to discrete switch values. + * By default, it is configured for the official FS-i6 remote control. + * If you use a different hardware configuration, you can configure it + * using the `switch_config` parameter.
+ */ + +#include +#include +#include +#include +#include +#include +#include + +#define DRIVER_DESC "FS-iA6B iBus RC receiver" + +MODULE_AUTHOR("Markus Koch "); +MODULE_DESCRIPTION(DRIVER_DESC); +MODULE_LICENSE("GPL"); + +#define IBUS_SERVO_COUNT 14 + +static char *switch_config = "00000022320000"; +module_param(switch_config, charp, 0444); +MODULE_PARM_DESC(switch_config, + "Amount of switch positions per channel (14 characters, 0-3)"); + +static int fsia6b_axes[IBUS_SERVO_COUNT] = { + ABS_X, ABS_Y, + ABS_Z, ABS_RX, + ABS_RY, ABS_RZ, + ABS_HAT0X, ABS_HAT0Y, + ABS_HAT1X, ABS_HAT1Y, + ABS_HAT2X, ABS_HAT2Y, + ABS_HAT3X, ABS_HAT3Y +}; + +enum ibus_state { SYNC, COLLECT, PROCESS }; + +struct ibus_packet { + enum ibus_state state; + + int offset; + u16 ibuf; + u16 channel[IBUS_SERVO_COUNT]; +}; + +struct fsia6b { + struct input_dev *dev; + struct ibus_packet packet; + + char phys[32]; +}; + +static irqreturn_t fsia6b_serio_irq(struct serio *serio, + unsigned char data, unsigned int flags) +{ + struct fsia6b *fsia6b = serio_get_drvdata(serio); + int i; + int sw_state; + int sw_id = BTN_0; + + fsia6b->packet.ibuf = (data << 8) | ((fsia6b->packet.ibuf >> 8) & 0xFF); + + switch (fsia6b->packet.state) { + case SYNC: + if (fsia6b->packet.ibuf == 0x4020) + fsia6b->packet.state = COLLECT; + break; + + case COLLECT: + fsia6b->packet.state = PROCESS; + break; + + case PROCESS: + fsia6b->packet.channel[fsia6b->packet.offset] = + fsia6b->packet.ibuf; + fsia6b->packet.offset++; + + if (fsia6b->packet.offset == IBUS_SERVO_COUNT) { + fsia6b->packet.offset = 0; + fsia6b->packet.state = SYNC; + for (i = 0; i < IBUS_SERVO_COUNT; ++i) { + input_report_abs(fsia6b->dev, fsia6b_axes[i], + fsia6b->packet.channel[i]); + + sw_state = 0; + if (fsia6b->packet.channel[i] > 1900) + sw_state = 1; + else if (fsia6b->packet.channel[i] < 1100) + sw_state = 2; + + switch (switch_config[i]) { + case '3': + input_report_key(fsia6b->dev, + sw_id++, + sw_state == 0); + /* fall-through */ + case '2': + 
input_report_key(fsia6b->dev, + sw_id++, + sw_state == 1); + /* fall-through */ + case '1': + input_report_key(fsia6b->dev, + sw_id++, + sw_state == 2); + } + } + input_sync(fsia6b->dev); + } else { + fsia6b->packet.state = COLLECT; + } + break; + } + + return IRQ_HANDLED; +} + +static int fsia6b_serio_connect(struct serio *serio, struct serio_driver *drv) +{ + struct fsia6b *fsia6b; + struct input_dev *input_dev; + int err; + int i, j; + int sw_id = 0; + + fsia6b = kzalloc(sizeof(*fsia6b), GFP_KERNEL); + if (!fsia6b) + return -ENOMEM; + + fsia6b->packet.ibuf = 0; + fsia6b->packet.offset = 0; + fsia6b->packet.state = SYNC; + + serio_set_drvdata(serio, fsia6b); + + input_dev = input_allocate_device(); + if (!input_dev) { + err = -ENOMEM; + goto fail1; + } + fsia6b->dev = input_dev; + + snprintf(fsia6b->phys, sizeof(fsia6b->phys), "%s/input0", serio->phys); + + input_dev->name = DRIVER_DESC; + input_dev->phys = fsia6b->phys; + input_dev->id.bustype = BUS_RS232; + input_dev->id.vendor = SERIO_FSIA6B; + input_dev->id.product = serio->id.id; + input_dev->id.version = 0x0100; + input_dev->dev.parent = &serio->dev; + + for (i = 0; i < IBUS_SERVO_COUNT; i++) + input_set_abs_params(input_dev, fsia6b_axes[i], + 1000, 2000, 2, 2); + + /* Register switch configuration */ + for (i = 0; i < IBUS_SERVO_COUNT; i++) { + if (switch_config[i] < '0' || switch_config[i] > '3') { + dev_err(&fsia6b->dev->dev, + "Invalid switch configuration supplied for fsia6b.\n"); + err = -EINVAL; + goto fail2; + } + + for (j = '1'; j <= switch_config[i]; j++) { + input_set_capability(input_dev, EV_KEY, BTN_0 + sw_id); + sw_id++; + } + } + + err = serio_open(serio, drv); + if (err) + goto fail2; + + err = input_register_device(fsia6b->dev); + if (err) + goto fail3; + + return 0; + +fail3: serio_close(serio); +fail2: input_free_device(input_dev); +fail1: serio_set_drvdata(serio, NULL); + kfree(fsia6b); + return err; +} + +static void fsia6b_serio_disconnect(struct serio *serio) +{ + struct fsia6b 
*fsia6b = serio_get_drvdata(serio); + + serio_close(serio); + serio_set_drvdata(serio, NULL); + input_unregister_device(fsia6b->dev); + kfree(fsia6b); +} + +static const struct serio_device_id fsia6b_serio_ids[] = { + { + .type = SERIO_RS232, + .proto = SERIO_FSIA6B, + .id = SERIO_ANY, + .extra = SERIO_ANY, + }, + { 0 } +}; + +MODULE_DEVICE_TABLE(serio, fsia6b_serio_ids); + +static struct serio_driver fsia6b_serio_drv = { + .driver = { + .name = "fsia6b" + }, + .description = DRIVER_DESC, + .id_table = fsia6b_serio_ids, + .interrupt = fsia6b_serio_irq, + .connect = fsia6b_serio_connect, + .disconnect = fsia6b_serio_disconnect +}; + +module_serio_driver(fsia6b_serio_drv) diff --git a/include/uapi/linux/serio.h b/include/uapi/linux/serio.h index a0cac1d8670d..50e991952c97 100644 --- a/include/uapi/linux/serio.h +++ b/include/uapi/linux/serio.h @@ -82,5 +82,6 @@ #define SERIO_EGALAX 0x3f #define SERIO_PULSE8_CEC 0x40 #define SERIO_RAINSHADOW_CEC 0x41 +#define SERIO_FSIA6B 0x42 #endif /* _UAPI_SERIO_H */ -- cgit v1.2.3-71-gd317 From ae24fb49d01103c80d6ff3b78714259c1c62c958 Mon Sep 17 00:00:00 2001 From: Jean-Philippe Brucker Date: Mon, 22 Jul 2019 15:40:07 +0100 Subject: iommu/virtio: Update to most recent specification Following specification review a few things were changed in v8 of the virtio-iommu series [1], but have been omitted when merging the base driver. Add them now: * Remove the EXEC flag. * Add feature bit for the MMIO flag. * Change domain_bits to domain_range. * Add NOMEM status flag. [1] https://lore.kernel.org/linux-iommu/20190530170929.19366-1-jean-philippe.brucker@arm.com/ Fixes: edcd69ab9a32 ("iommu: Add virtio-iommu driver") Reported-by: Eric Auger Signed-off-by: Jean-Philippe Brucker Signed-off-by: Michael S. 
Tsirkin Reviewed-by: Eric Auger Tested-by: Eric Auger Acked-by: Joerg Roedel --- drivers/iommu/virtio-iommu.c | 40 ++++++++++++++++++++++++++++----------- include/uapi/linux/virtio_iommu.h | 32 +++++++++++++++++-------------- 2 files changed, 47 insertions(+), 25 deletions(-) (limited to 'include/uapi/linux') diff --git a/drivers/iommu/virtio-iommu.c b/drivers/iommu/virtio-iommu.c index 433f4d2ee956..80a740df0737 100644 --- a/drivers/iommu/virtio-iommu.c +++ b/drivers/iommu/virtio-iommu.c @@ -2,7 +2,7 @@ /* * Virtio driver for the paravirtualized IOMMU * - * Copyright (C) 2018 Arm Limited + * Copyright (C) 2019 Arm Limited */ #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt @@ -47,7 +47,10 @@ struct viommu_dev { /* Device configuration */ struct iommu_domain_geometry geometry; u64 pgsize_bitmap; - u8 domain_bits; + u32 first_domain; + u32 last_domain; + /* Supported MAP flags */ + u32 map_flags; u32 probe_size; }; @@ -62,6 +65,7 @@ struct viommu_domain { struct viommu_dev *viommu; struct mutex mutex; /* protects viommu pointer */ unsigned int id; + u32 map_flags; spinlock_t mappings_lock; struct rb_root_cached mappings; @@ -113,6 +117,8 @@ static int viommu_get_req_errno(void *buf, size_t len) return -ENOENT; case VIRTIO_IOMMU_S_FAULT: return -EFAULT; + case VIRTIO_IOMMU_S_NOMEM: + return -ENOMEM; case VIRTIO_IOMMU_S_IOERR: case VIRTIO_IOMMU_S_DEVERR: default: @@ -607,15 +613,15 @@ static int viommu_domain_finalise(struct viommu_dev *viommu, { int ret; struct viommu_domain *vdomain = to_viommu_domain(domain); - unsigned int max_domain = viommu->domain_bits > 31 ? 
~0 : - (1U << viommu->domain_bits) - 1; vdomain->viommu = viommu; + vdomain->map_flags = viommu->map_flags; domain->pgsize_bitmap = viommu->pgsize_bitmap; domain->geometry = viommu->geometry; - ret = ida_alloc_max(&viommu->domain_ids, max_domain, GFP_KERNEL); + ret = ida_alloc_range(&viommu->domain_ids, viommu->first_domain, + viommu->last_domain, GFP_KERNEL); if (ret >= 0) vdomain->id = (unsigned int)ret; @@ -710,7 +716,7 @@ static int viommu_map(struct iommu_domain *domain, unsigned long iova, phys_addr_t paddr, size_t size, int prot) { int ret; - int flags; + u32 flags; struct virtio_iommu_req_map map; struct viommu_domain *vdomain = to_viommu_domain(domain); @@ -718,6 +724,9 @@ static int viommu_map(struct iommu_domain *domain, unsigned long iova, (prot & IOMMU_WRITE ? VIRTIO_IOMMU_MAP_F_WRITE : 0) | (prot & IOMMU_MMIO ? VIRTIO_IOMMU_MAP_F_MMIO : 0); + if (flags & ~vdomain->map_flags) + return -EINVAL; + ret = viommu_add_mapping(vdomain, iova, paddr, size, flags); if (ret) return ret; @@ -1027,7 +1036,8 @@ static int viommu_probe(struct virtio_device *vdev) goto err_free_vqs; } - viommu->domain_bits = 32; + viommu->map_flags = VIRTIO_IOMMU_MAP_F_READ | VIRTIO_IOMMU_MAP_F_WRITE; + viommu->last_domain = ~0U; /* Optional features */ virtio_cread_feature(vdev, VIRTIO_IOMMU_F_INPUT_RANGE, @@ -1038,9 +1048,13 @@ static int viommu_probe(struct virtio_device *vdev) struct virtio_iommu_config, input_range.end, &input_end); - virtio_cread_feature(vdev, VIRTIO_IOMMU_F_DOMAIN_BITS, - struct virtio_iommu_config, domain_bits, - &viommu->domain_bits); + virtio_cread_feature(vdev, VIRTIO_IOMMU_F_DOMAIN_RANGE, + struct virtio_iommu_config, domain_range.start, + &viommu->first_domain); + + virtio_cread_feature(vdev, VIRTIO_IOMMU_F_DOMAIN_RANGE, + struct virtio_iommu_config, domain_range.end, + &viommu->last_domain); virtio_cread_feature(vdev, VIRTIO_IOMMU_F_PROBE, struct virtio_iommu_config, probe_size, @@ -1052,6 +1066,9 @@ static int viommu_probe(struct virtio_device *vdev) 
.force_aperture = true, }; + if (virtio_has_feature(vdev, VIRTIO_IOMMU_F_MMIO)) + viommu->map_flags |= VIRTIO_IOMMU_MAP_F_MMIO; + viommu_ops.pgsize_bitmap = viommu->pgsize_bitmap; virtio_device_ready(vdev); @@ -1130,9 +1147,10 @@ static void viommu_config_changed(struct virtio_device *vdev) static unsigned int features[] = { VIRTIO_IOMMU_F_MAP_UNMAP, - VIRTIO_IOMMU_F_DOMAIN_BITS, VIRTIO_IOMMU_F_INPUT_RANGE, + VIRTIO_IOMMU_F_DOMAIN_RANGE, VIRTIO_IOMMU_F_PROBE, + VIRTIO_IOMMU_F_MMIO, }; static struct virtio_device_id id_table[] = { diff --git a/include/uapi/linux/virtio_iommu.h b/include/uapi/linux/virtio_iommu.h index ba1b460c9944..237e36a280cb 100644 --- a/include/uapi/linux/virtio_iommu.h +++ b/include/uapi/linux/virtio_iommu.h @@ -1,8 +1,8 @@ /* SPDX-License-Identifier: BSD-3-Clause */ /* - * Virtio-iommu definition v0.9 + * Virtio-iommu definition v0.12 * - * Copyright (C) 2018 Arm Ltd. + * Copyright (C) 2019 Arm Ltd. */ #ifndef _UAPI_LINUX_VIRTIO_IOMMU_H #define _UAPI_LINUX_VIRTIO_IOMMU_H @@ -11,26 +11,31 @@ /* Feature bits */ #define VIRTIO_IOMMU_F_INPUT_RANGE 0 -#define VIRTIO_IOMMU_F_DOMAIN_BITS 1 +#define VIRTIO_IOMMU_F_DOMAIN_RANGE 1 #define VIRTIO_IOMMU_F_MAP_UNMAP 2 #define VIRTIO_IOMMU_F_BYPASS 3 #define VIRTIO_IOMMU_F_PROBE 4 +#define VIRTIO_IOMMU_F_MMIO 5 -struct virtio_iommu_range { - __u64 start; - __u64 end; +struct virtio_iommu_range_64 { + __le64 start; + __le64 end; +}; + +struct virtio_iommu_range_32 { + __le32 start; + __le32 end; }; struct virtio_iommu_config { /* Supported page sizes */ - __u64 page_size_mask; + __le64 page_size_mask; /* Supported IOVA range */ - struct virtio_iommu_range input_range; + struct virtio_iommu_range_64 input_range; /* Max domain ID size */ - __u8 domain_bits; - __u8 padding[3]; + struct virtio_iommu_range_32 domain_range; /* Probe buffer size */ - __u32 probe_size; + __le32 probe_size; }; /* Request types */ @@ -49,6 +54,7 @@ struct virtio_iommu_config { #define VIRTIO_IOMMU_S_RANGE 0x05 #define 
VIRTIO_IOMMU_S_NOENT 0x06 #define VIRTIO_IOMMU_S_FAULT 0x07 +#define VIRTIO_IOMMU_S_NOMEM 0x08 struct virtio_iommu_req_head { __u8 type; @@ -78,12 +84,10 @@ struct virtio_iommu_req_detach { #define VIRTIO_IOMMU_MAP_F_READ (1 << 0) #define VIRTIO_IOMMU_MAP_F_WRITE (1 << 1) -#define VIRTIO_IOMMU_MAP_F_EXEC (1 << 2) -#define VIRTIO_IOMMU_MAP_F_MMIO (1 << 3) +#define VIRTIO_IOMMU_MAP_F_MMIO (1 << 2) #define VIRTIO_IOMMU_MAP_F_MASK (VIRTIO_IOMMU_MAP_F_READ | \ VIRTIO_IOMMU_MAP_F_WRITE | \ - VIRTIO_IOMMU_MAP_F_EXEC | \ VIRTIO_IOMMU_MAP_F_MMIO) struct virtio_iommu_req_map { -- cgit v1.2.3-71-gd317 From 2f5947dfcaecb99f2dd559156eecbeb7b95e4c02 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Wed, 24 Jul 2019 09:24:49 +0200 Subject: Documentation: move Documentation/virtual to Documentation/virt Renaming docs seems to be en vogue at the moment, so fix on of the grossly misnamed directories. We usually never use "virtual" as a shortcut for virtualization in the kernel, but always virt, as seen in the virt/ top-level directory. Fix up the documentation to match that. 
Fixes: ed16648eb5b8 ("Move kvm, uml, and lguest subdirectories under a common "virtual" directory, I.E:") Signed-off-by: Christoph Hellwig Signed-off-by: Paolo Bonzini --- Documentation/admin-guide/kernel-parameters.txt | 2 +- Documentation/virt/index.rst | 18 + Documentation/virt/kvm/amd-memory-encryption.rst | 250 + Documentation/virt/kvm/api.txt | 5296 ++++++++++++++++++++ Documentation/virt/kvm/arm/hyp-abi.txt | 53 + Documentation/virt/kvm/arm/psci.txt | 61 + Documentation/virt/kvm/cpuid.rst | 107 + Documentation/virt/kvm/devices/README | 1 + Documentation/virt/kvm/devices/arm-vgic-its.txt | 181 + Documentation/virt/kvm/devices/arm-vgic-v3.txt | 251 + Documentation/virt/kvm/devices/arm-vgic.txt | 127 + Documentation/virt/kvm/devices/mpic.txt | 53 + Documentation/virt/kvm/devices/s390_flic.txt | 163 + Documentation/virt/kvm/devices/vcpu.txt | 62 + Documentation/virt/kvm/devices/vfio.txt | 36 + Documentation/virt/kvm/devices/vm.txt | 270 + Documentation/virt/kvm/devices/xics.txt | 66 + Documentation/virt/kvm/devices/xive.txt | 197 + Documentation/virt/kvm/halt-polling.txt | 136 + Documentation/virt/kvm/hypercalls.txt | 154 + Documentation/virt/kvm/index.rst | 11 + Documentation/virt/kvm/locking.txt | 215 + Documentation/virt/kvm/mmu.txt | 449 ++ Documentation/virt/kvm/msr.txt | 284 ++ Documentation/virt/kvm/nested-vmx.txt | 240 + Documentation/virt/kvm/ppc-pv.txt | 212 + Documentation/virt/kvm/review-checklist.txt | 38 + Documentation/virt/kvm/s390-diag.txt | 83 + Documentation/virt/kvm/timekeeping.txt | 612 +++ Documentation/virt/kvm/vcpu-requests.rst | 307 ++ Documentation/virt/paravirt_ops.rst | 35 + Documentation/virt/uml/UserModeLinux-HOWTO.txt | 4589 +++++++++++++++++ Documentation/virtual/index.rst | 18 - .../virtual/kvm/amd-memory-encryption.rst | 250 - Documentation/virtual/kvm/api.txt | 5296 -------------------- Documentation/virtual/kvm/arm/hyp-abi.txt | 53 - Documentation/virtual/kvm/arm/psci.txt | 61 - Documentation/virtual/kvm/cpuid.rst | 107 - 
Documentation/virtual/kvm/devices/README | 1 - Documentation/virtual/kvm/devices/arm-vgic-its.txt | 181 - Documentation/virtual/kvm/devices/arm-vgic-v3.txt | 251 - Documentation/virtual/kvm/devices/arm-vgic.txt | 127 - Documentation/virtual/kvm/devices/mpic.txt | 53 - Documentation/virtual/kvm/devices/s390_flic.txt | 163 - Documentation/virtual/kvm/devices/vcpu.txt | 62 - Documentation/virtual/kvm/devices/vfio.txt | 36 - Documentation/virtual/kvm/devices/vm.txt | 270 - Documentation/virtual/kvm/devices/xics.txt | 66 - Documentation/virtual/kvm/devices/xive.txt | 197 - Documentation/virtual/kvm/halt-polling.txt | 136 - Documentation/virtual/kvm/hypercalls.txt | 154 - Documentation/virtual/kvm/index.rst | 11 - Documentation/virtual/kvm/locking.txt | 215 - Documentation/virtual/kvm/mmu.txt | 449 -- Documentation/virtual/kvm/msr.txt | 284 -- Documentation/virtual/kvm/nested-vmx.txt | 240 - Documentation/virtual/kvm/ppc-pv.txt | 212 - Documentation/virtual/kvm/review-checklist.txt | 38 - Documentation/virtual/kvm/s390-diag.txt | 83 - Documentation/virtual/kvm/timekeeping.txt | 612 --- Documentation/virtual/kvm/vcpu-requests.rst | 307 -- Documentation/virtual/paravirt_ops.rst | 35 - Documentation/virtual/uml/UserModeLinux-HOWTO.txt | 4589 ----------------- MAINTAINERS | 6 +- arch/powerpc/include/uapi/asm/kvm_para.h | 2 +- arch/x86/kvm/mmu.c | 2 +- include/uapi/linux/kvm.h | 4 +- tools/include/uapi/linux/kvm.h | 4 +- virt/kvm/arm/arm.c | 2 +- virt/kvm/arm/vgic/vgic-mmio-v3.c | 2 +- virt/kvm/arm/vgic/vgic.h | 4 +- 71 files changed, 14571 insertions(+), 14571 deletions(-) create mode 100644 Documentation/virt/index.rst create mode 100644 Documentation/virt/kvm/amd-memory-encryption.rst create mode 100644 Documentation/virt/kvm/api.txt create mode 100644 Documentation/virt/kvm/arm/hyp-abi.txt create mode 100644 Documentation/virt/kvm/arm/psci.txt create mode 100644 Documentation/virt/kvm/cpuid.rst create mode 100644 Documentation/virt/kvm/devices/README create mode 100644 
Documentation/virt/kvm/devices/arm-vgic-its.txt create mode 100644 Documentation/virt/kvm/devices/arm-vgic-v3.txt create mode 100644 Documentation/virt/kvm/devices/arm-vgic.txt create mode 100644 Documentation/virt/kvm/devices/mpic.txt create mode 100644 Documentation/virt/kvm/devices/s390_flic.txt create mode 100644 Documentation/virt/kvm/devices/vcpu.txt create mode 100644 Documentation/virt/kvm/devices/vfio.txt create mode 100644 Documentation/virt/kvm/devices/vm.txt create mode 100644 Documentation/virt/kvm/devices/xics.txt create mode 100644 Documentation/virt/kvm/devices/xive.txt create mode 100644 Documentation/virt/kvm/halt-polling.txt create mode 100644 Documentation/virt/kvm/hypercalls.txt create mode 100644 Documentation/virt/kvm/index.rst create mode 100644 Documentation/virt/kvm/locking.txt create mode 100644 Documentation/virt/kvm/mmu.txt create mode 100644 Documentation/virt/kvm/msr.txt create mode 100644 Documentation/virt/kvm/nested-vmx.txt create mode 100644 Documentation/virt/kvm/ppc-pv.txt create mode 100644 Documentation/virt/kvm/review-checklist.txt create mode 100644 Documentation/virt/kvm/s390-diag.txt create mode 100644 Documentation/virt/kvm/timekeeping.txt create mode 100644 Documentation/virt/kvm/vcpu-requests.rst create mode 100644 Documentation/virt/paravirt_ops.rst create mode 100644 Documentation/virt/uml/UserModeLinux-HOWTO.txt delete mode 100644 Documentation/virtual/index.rst delete mode 100644 Documentation/virtual/kvm/amd-memory-encryption.rst delete mode 100644 Documentation/virtual/kvm/api.txt delete mode 100644 Documentation/virtual/kvm/arm/hyp-abi.txt delete mode 100644 Documentation/virtual/kvm/arm/psci.txt delete mode 100644 Documentation/virtual/kvm/cpuid.rst delete mode 100644 Documentation/virtual/kvm/devices/README delete mode 100644 Documentation/virtual/kvm/devices/arm-vgic-its.txt delete mode 100644 Documentation/virtual/kvm/devices/arm-vgic-v3.txt delete mode 100644 Documentation/virtual/kvm/devices/arm-vgic.txt 
delete mode 100644 Documentation/virtual/kvm/devices/mpic.txt delete mode 100644 Documentation/virtual/kvm/devices/s390_flic.txt delete mode 100644 Documentation/virtual/kvm/devices/vcpu.txt delete mode 100644 Documentation/virtual/kvm/devices/vfio.txt delete mode 100644 Documentation/virtual/kvm/devices/vm.txt delete mode 100644 Documentation/virtual/kvm/devices/xics.txt delete mode 100644 Documentation/virtual/kvm/devices/xive.txt delete mode 100644 Documentation/virtual/kvm/halt-polling.txt delete mode 100644 Documentation/virtual/kvm/hypercalls.txt delete mode 100644 Documentation/virtual/kvm/index.rst delete mode 100644 Documentation/virtual/kvm/locking.txt delete mode 100644 Documentation/virtual/kvm/mmu.txt delete mode 100644 Documentation/virtual/kvm/msr.txt delete mode 100644 Documentation/virtual/kvm/nested-vmx.txt delete mode 100644 Documentation/virtual/kvm/ppc-pv.txt delete mode 100644 Documentation/virtual/kvm/review-checklist.txt delete mode 100644 Documentation/virtual/kvm/s390-diag.txt delete mode 100644 Documentation/virtual/kvm/timekeeping.txt delete mode 100644 Documentation/virtual/kvm/vcpu-requests.rst delete mode 100644 Documentation/virtual/paravirt_ops.rst delete mode 100644 Documentation/virtual/uml/UserModeLinux-HOWTO.txt (limited to 'include/uapi/linux') diff --git a/Documentation/admin-guide/kernel-parameters.txt b/Documentation/admin-guide/kernel-parameters.txt index 099c5a4be95b..8a8880cec34b 100644 --- a/Documentation/admin-guide/kernel-parameters.txt +++ b/Documentation/admin-guide/kernel-parameters.txt @@ -2532,7 +2532,7 @@ mem_encrypt=on: Activate SME mem_encrypt=off: Do not activate SME - Refer to Documentation/virtual/kvm/amd-memory-encryption.rst + Refer to Documentation/virt/kvm/amd-memory-encryption.rst for details on when memory encryption can be activated. 
mem_sleep_default= [SUSPEND] Default system suspend mode: diff --git a/Documentation/virt/index.rst b/Documentation/virt/index.rst new file mode 100644 index 000000000000..062ffb527043 --- /dev/null +++ b/Documentation/virt/index.rst @@ -0,0 +1,18 @@ +.. SPDX-License-Identifier: GPL-2.0 + +============================ +Linux Virtualization Support +============================ + +.. toctree:: + :maxdepth: 2 + + kvm/index + paravirt_ops + +.. only:: html and subproject + + Indices + ======= + + * :ref:`genindex` diff --git a/Documentation/virt/kvm/amd-memory-encryption.rst b/Documentation/virt/kvm/amd-memory-encryption.rst new file mode 100644 index 000000000000..d18c97b4e140 --- /dev/null +++ b/Documentation/virt/kvm/amd-memory-encryption.rst @@ -0,0 +1,250 @@ +====================================== +Secure Encrypted Virtualization (SEV) +====================================== + +Overview +======== + +Secure Encrypted Virtualization (SEV) is a feature found on AMD processors. + +SEV is an extension to the AMD-V architecture which supports running +virtual machines (VMs) under the control of a hypervisor. When enabled, +the memory contents of a VM will be transparently encrypted with a key +unique to that VM. + +The hypervisor can determine the SEV support through the CPUID +instruction. The CPUID function 0x8000001f reports information related +to SEV:: + + 0x8000001f[eax]: + Bit[1] indicates support for SEV + ... 
+ [ecx]: + Bits[31:0] Number of encrypted guests supported simultaneously + +If support for SEV is present, MSR 0xc001_0010 (MSR_K8_SYSCFG) and MSR 0xc001_0015 +(MSR_K7_HWCR) can be used to determine if it can be enabled:: + + 0xc001_0010: + Bit[23] 1 = memory encryption can be enabled + 0 = memory encryption can not be enabled + + 0xc001_0015: + Bit[0] 1 = memory encryption can be enabled + 0 = memory encryption can not be enabled + +When SEV support is available, it can be enabled in a specific VM by +setting the SEV bit before executing VMRUN.:: + + VMCB[0x90]: + Bit[1] 1 = SEV is enabled + 0 = SEV is disabled + +SEV hardware uses ASIDs to associate a memory encryption key with a VM. +Hence, the ASID for the SEV-enabled guests must be from 1 to a maximum value +defined in the CPUID 0x8000001f[ecx] field. + +SEV Key Management +================== + +The SEV guest key management is handled by a separate processor called the AMD +Secure Processor (AMD-SP). Firmware running inside the AMD-SP provides a secure +key management interface to perform common hypervisor activities such as +encrypting bootstrap code, snapshot, migrating and debugging the guest. For more +information, see the SEV Key Management spec [api-spec]_ + +KVM implements the following commands to support common lifecycle events of SEV +guests, such as launching, running, snapshotting, migrating and decommissioning. + +1. KVM_SEV_INIT +--------------- + +The KVM_SEV_INIT command is used by the hypervisor to initialize the SEV platform +context. In a typical workflow, this command should be the first command issued. + +Returns: 0 on success, -negative on error + +2. KVM_SEV_LAUNCH_START +----------------------- + +The KVM_SEV_LAUNCH_START command is used for creating the memory encryption +context. To create the encryption context, user must provide a guest policy, +the owner's public Diffie-Hellman (PDH) key and session information. 
+ +Parameters: struct kvm_sev_launch_start (in/out) + +Returns: 0 on success, -negative on error + +:: + + struct kvm_sev_launch_start { + __u32 handle; /* if zero then firmware creates a new handle */ + __u32 policy; /* guest's policy */ + + __u64 dh_uaddr; /* userspace address pointing to the guest owner's PDH key */ + __u32 dh_len; + + __u64 session_addr; /* userspace address which points to the guest session information */ + __u32 session_len; + }; + +On success, the 'handle' field contains a new handle and on error, a negative value. + +For more details, see SEV spec Section 6.2. + +3. KVM_SEV_LAUNCH_UPDATE_DATA +----------------------------- + +The KVM_SEV_LAUNCH_UPDATE_DATA is used for encrypting a memory region. It also +calculates a measurement of the memory contents. The measurement is a signature +of the memory contents that can be sent to the guest owner as an attestation +that the memory was encrypted correctly by the firmware. + +Parameters (in): struct kvm_sev_launch_update_data + +Returns: 0 on success, -negative on error + +:: + + struct kvm_sev_launch_update { + __u64 uaddr; /* userspace address to be encrypted (must be 16-byte aligned) */ + __u32 len; /* length of the data to be encrypted (must be 16-byte aligned) */ + }; + +For more details, see SEV spec Section 6.3. + +4. KVM_SEV_LAUNCH_MEASURE +------------------------- + +The KVM_SEV_LAUNCH_MEASURE command is used to retrieve the measurement of the +data encrypted by the KVM_SEV_LAUNCH_UPDATE_DATA command. The guest owner may +wait to provide the guest with confidential information until it can verify the +measurement. Since the guest owner knows the initial contents of the guest at +boot, the measurement can be verified by comparing it to what the guest owner +expects. 
+ +Parameters (in): struct kvm_sev_launch_measure + +Returns: 0 on success, -negative on error + +:: + + struct kvm_sev_launch_measure { + __u64 uaddr; /* where to copy the measurement */ + __u32 len; /* length of measurement blob */ + }; + +For more details on the measurement verification flow, see SEV spec Section 6.4. + +5. KVM_SEV_LAUNCH_FINISH +------------------------ + +After completion of the launch flow, the KVM_SEV_LAUNCH_FINISH command can be +issued to make the guest ready for the execution. + +Returns: 0 on success, -negative on error + +6. KVM_SEV_GUEST_STATUS +----------------------- + +The KVM_SEV_GUEST_STATUS command is used to retrieve status information about a +SEV-enabled guest. + +Parameters (out): struct kvm_sev_guest_status + +Returns: 0 on success, -negative on error + +:: + + struct kvm_sev_guest_status { + __u32 handle; /* guest handle */ + __u32 policy; /* guest policy */ + __u8 state; /* guest state (see enum below) */ + }; + +SEV guest state: + +:: + + enum { + SEV_STATE_INVALID = 0, + SEV_STATE_LAUNCHING, /* guest is currently being launched */ + SEV_STATE_SECRET, /* guest is being launched and ready to accept the ciphertext data */ + SEV_STATE_RUNNING, /* guest is fully launched and running */ + SEV_STATE_RECEIVING, /* guest is being migrated in from another SEV machine */ + SEV_STATE_SENDING /* guest is getting migrated out to another SEV machine */ + }; + +7. KVM_SEV_DBG_DECRYPT +---------------------- + +The KVM_SEV_DBG_DECRYPT command can be used by the hypervisor to request the +firmware to decrypt the data at the given memory region. + +Parameters (in): struct kvm_sev_dbg + +Returns: 0 on success, -negative on error + +:: + + struct kvm_sev_dbg { + __u64 src_uaddr; /* userspace address of data to decrypt */ + __u64 dst_uaddr; /* userspace address of destination */ + __u32 len; /* length of memory region to decrypt */ + }; + +The command returns an error if the guest policy does not allow debugging. + +8.
KVM_SEV_DBG_ENCRYPT +---------------------- + +The KVM_SEV_DBG_ENCRYPT command can be used by the hypervisor to request the +firmware to encrypt the data at the given memory region. + +Parameters (in): struct kvm_sev_dbg + +Returns: 0 on success, -negative on error + +:: + + struct kvm_sev_dbg { + __u64 src_uaddr; /* userspace address of data to encrypt */ + __u64 dst_uaddr; /* userspace address of destination */ + __u32 len; /* length of memory region to encrypt */ + }; + +The command returns an error if the guest policy does not allow debugging. + +9. KVM_SEV_LAUNCH_SECRET +------------------------ + +The KVM_SEV_LAUNCH_SECRET command can be used by the hypervisor to inject secret +data after the measurement has been validated by the guest owner. + +Parameters (in): struct kvm_sev_launch_secret + +Returns: 0 on success, -negative on error + +:: + + struct kvm_sev_launch_secret { + __u64 hdr_uaddr; /* userspace address containing the packet header */ + __u32 hdr_len; + + __u64 guest_uaddr; /* the guest memory region where the secret should be injected */ + __u32 guest_len; + + __u64 trans_uaddr; /* the hypervisor memory region which contains the secret */ + __u32 trans_len; + }; + +References +========== + + +See [white-paper]_, [api-spec]_, [amd-apm]_ and [kvm-forum]_ for more info. + +.. [white-paper] http://amd-dev.wpengine.netdna-cdn.com/wordpress/media/2013/12/AMD_Memory_Encryption_Whitepaper_v7-Public.pdf +.. [api-spec] http://support.amd.com/TechDocs/55766_SEV-KM_API_Specification.pdf +.. [amd-apm] http://support.amd.com/TechDocs/24593.pdf (section 15.34) +..
[kvm-forum] http://www.linux-kvm.org/images/7/74/02x08A-Thomas_Lendacky-AMDs_Virtualizatoin_Memory_Encryption_Technology.pdf diff --git a/Documentation/virt/kvm/api.txt b/Documentation/virt/kvm/api.txt new file mode 100644 index 000000000000..2d067767b617 --- /dev/null +++ b/Documentation/virt/kvm/api.txt @@ -0,0 +1,5296 @@ +The Definitive KVM (Kernel-based Virtual Machine) API Documentation +=================================================================== + +1. General description +---------------------- + +The kvm API is a set of ioctls that are issued to control various aspects +of a virtual machine. The ioctls belong to three classes: + + - System ioctls: These query and set global attributes which affect the + whole kvm subsystem. In addition a system ioctl is used to create + virtual machines. + + - VM ioctls: These query and set attributes that affect an entire virtual + machine, for example memory layout. In addition a VM ioctl is used to + create virtual cpus (vcpus) and devices. + + VM ioctls must be issued from the same process (address space) that was + used to create the VM. + + - vcpu ioctls: These query and set attributes that control the operation + of a single virtual cpu. + + vcpu ioctls should be issued from the same thread that was used to create + the vcpu, except for asynchronous vcpu ioctl that are marked as such in + the documentation. Otherwise, the first ioctl after switching threads + could see a performance impact. + + - device ioctls: These query and set attributes that control the operation + of a single device. + + device ioctls must be issued from the same process (address space) that + was used to create the VM. + +2. File descriptors +------------------- + +The kvm API is centered around file descriptors. An initial +open("/dev/kvm") obtains a handle to the kvm subsystem; this handle +can be used to issue system ioctls. A KVM_CREATE_VM ioctl on this +handle will create a VM file descriptor which can be used to issue VM +ioctls. 
A KVM_CREATE_VCPU or KVM_CREATE_DEVICE ioctl on a VM fd will +create a virtual cpu or device and return a file descriptor pointing to +the new resource. Finally, ioctls on a vcpu or device fd can be used +to control the vcpu or device. For vcpus, this includes the important +task of actually running guest code. + +In general file descriptors can be migrated among processes by means +of fork() and the SCM_RIGHTS facility of unix domain socket. These +kinds of tricks are explicitly not supported by kvm. While they will +not cause harm to the host, their actual behavior is not guaranteed by +the API. See "General description" for details on the ioctl usage +model that is supported by KVM. + +It is important to note that although VM ioctls may only be issued from +the process that created the VM, a VM's lifecycle is associated with its +file descriptor, not its creator (process). In other words, the VM and +its resources, *including the associated address space*, are not freed +until the last reference to the VM's file descriptor has been released. +For example, if fork() is issued after ioctl(KVM_CREATE_VM), the VM will +not be freed until both the parent (original) process and its child have +put their references to the VM's file descriptor. + +Because a VM's resources are not freed until the last reference to its +file descriptor is released, creating additional references to a VM +via fork(), dup(), etc... without careful consideration is strongly +discouraged and may have unwanted side effects, e.g. memory allocated +by and on behalf of the VM's process may not be freed/unaccounted when +the VM is shut down. + + +3. Extensions +------------- + +As of Linux 2.6.22, the KVM ABI has been stabilized: no backward +incompatible changes are allowed. However, there is an extension +facility that allows backward-compatible extensions to the API to be +queried and used. + +The extension mechanism is not based on the Linux version number.
+Instead, kvm defines extension identifiers and a facility to query +whether a particular extension identifier is available. If it is, a +set of ioctls is available for application use. + + +4. API description +------------------ + +This section describes ioctls that can be used to control kvm guests. +For each ioctl, the following information is provided along with a +description: + + Capability: which KVM extension provides this ioctl. Can be 'basic', + which means that it will be provided by any kernel that supports + API version 12 (see section 4.1), a KVM_CAP_xyz constant, which + means availability needs to be checked with KVM_CHECK_EXTENSION + (see section 4.4), or 'none' which means that while not all kernels + support this ioctl, there's no capability bit to check its + availability: for kernels that don't support the ioctl, + the ioctl returns -ENOTTY. + + Architectures: which instruction set architectures provide this ioctl. + x86 includes both i386 and x86_64. + + Type: system, vm, or vcpu. + + Parameters: what parameters are accepted by the ioctl. + + Returns: the return value. General error numbers (EBADF, ENOMEM, EINVAL) + are not detailed, but errors with specific meanings are. + + +4.1 KVM_GET_API_VERSION + +Capability: basic +Architectures: all +Type: system ioctl +Parameters: none +Returns: the constant KVM_API_VERSION (=12) + +This identifies the API version as the stable kvm API. It is not +expected that this number will change. However, Linux 2.6.20 and +2.6.21 report earlier versions; these are not documented and not +supported. Applications should refuse to run if KVM_GET_API_VERSION +returns a value other than 12. If this check passes, all ioctls +described as 'basic' will be available. + + +4.2 KVM_CREATE_VM + +Capability: basic +Architectures: all +Type: system ioctl +Parameters: machine type identifier (KVM_VM_*) +Returns: a VM fd that can be used to control the new virtual machine. + +The new VM has no virtual cpus and no memory.
+You probably want to use 0 as machine type. + +In order to create user controlled virtual machines on S390, check +KVM_CAP_S390_UCONTROL and use the flag KVM_VM_S390_UCONTROL as +privileged user (CAP_SYS_ADMIN). + +To use hardware assisted virtualization on MIPS (VZ ASE) rather than +the default trap & emulate implementation (which changes the virtual +memory layout to fit in user mode), check KVM_CAP_MIPS_VZ and use the +flag KVM_VM_MIPS_VZ. + + +On arm64, the physical address size for a VM (IPA Size limit) is limited +to 40bits by default. The limit can be configured if the host supports the +extension KVM_CAP_ARM_VM_IPA_SIZE. When supported, use +KVM_VM_TYPE_ARM_IPA_SIZE(IPA_Bits) to set the size in the machine type +identifier, where IPA_Bits is the maximum width of any physical +address used by the VM. The IPA_Bits is encoded in bits[7-0] of the +machine type identifier. + +e.g, to configure a guest to use 48bit physical address size : + + vm_fd = ioctl(dev_fd, KVM_CREATE_VM, KVM_VM_TYPE_ARM_IPA_SIZE(48)); + +The requested size (IPA_Bits) must be : + 0 - Implies default size, 40bits (for backward compatibility) + + or + + N - Implies N bits, where N is a positive integer such that, + 32 <= N <= Host_IPA_Limit + +Host_IPA_Limit is the maximum possible value for IPA_Bits on the host and +is dependent on the CPU capability and the kernel configuration. The limit can +be retrieved using KVM_CAP_ARM_VM_IPA_SIZE of the KVM_CHECK_EXTENSION +ioctl() at run-time. + +Please note that configuring the IPA size does not affect the capability +exposed by the guest CPUs in ID_AA64MMFR0_EL1[PARange]. It only affects +size of the address translated by the stage2 level (guest physical to +host physical address translations). 
+ + +4.3 KVM_GET_MSR_INDEX_LIST, KVM_GET_MSR_FEATURE_INDEX_LIST + +Capability: basic, KVM_CAP_GET_MSR_FEATURES for KVM_GET_MSR_FEATURE_INDEX_LIST +Architectures: x86 +Type: system ioctl +Parameters: struct kvm_msr_list (in/out) +Returns: 0 on success; -1 on error +Errors: + EFAULT: the msr index list cannot be read from or written to + E2BIG: the msr index list is too big to fit in the array specified by + the user. + +struct kvm_msr_list { + __u32 nmsrs; /* number of msrs in entries */ + __u32 indices[0]; +}; + +The user fills in the size of the indices array in nmsrs, and in return +kvm adjusts nmsrs to reflect the actual number of msrs and fills in the +indices array with their numbers. + +KVM_GET_MSR_INDEX_LIST returns the guest msrs that are supported. The list +varies by kvm version and host processor, but does not change otherwise. + +Note: if kvm indicates support for MCE (KVM_CAP_MCE), then the MCE bank MSRs are +not returned in the MSR list, as different vcpus can have a different number +of banks, as set via the KVM_X86_SETUP_MCE ioctl. + +KVM_GET_MSR_FEATURE_INDEX_LIST returns the list of MSRs that can be passed +to the KVM_GET_MSRS system ioctl. This lets userspace probe host capabilities +and processor features that are exposed via MSRs (e.g., VMX capabilities). +This list also varies by kvm version and host processor, but does not change +otherwise. + + +4.4 KVM_CHECK_EXTENSION + +Capability: basic, KVM_CAP_CHECK_EXTENSION_VM for vm ioctl +Architectures: all +Type: system ioctl, vm ioctl +Parameters: extension identifier (KVM_CAP_*) +Returns: 0 if unsupported; 1 (or some other positive integer) if supported + +The API allows the application to query about extensions to the core +kvm API. Userspace passes an extension identifier (an integer) and +receives an integer that describes the extension availability. +Generally 0 means no and 1 means yes, but some extensions may report +additional information in the integer return value.
+ +Based on their initialization different VMs may have different capabilities. +It is thus encouraged to use the vm ioctl to query for capabilities (available +with KVM_CAP_CHECK_EXTENSION_VM on the vm fd) + +4.5 KVM_GET_VCPU_MMAP_SIZE + +Capability: basic +Architectures: all +Type: system ioctl +Parameters: none +Returns: size of vcpu mmap area, in bytes + +The KVM_RUN ioctl (cf.) communicates with userspace via a shared +memory region. This ioctl returns the size of that region. See the +KVM_RUN documentation for details. + + +4.6 KVM_SET_MEMORY_REGION + +Capability: basic +Architectures: all +Type: vm ioctl +Parameters: struct kvm_memory_region (in) +Returns: 0 on success, -1 on error + +This ioctl is obsolete and has been removed. + + +4.7 KVM_CREATE_VCPU + +Capability: basic +Architectures: all +Type: vm ioctl +Parameters: vcpu id (apic id on x86) +Returns: vcpu fd on success, -1 on error + +This API adds a vcpu to a virtual machine. No more than max_vcpus may be added. +The vcpu id is an integer in the range [0, max_vcpu_id). + +The recommended max_vcpus value can be retrieved using the KVM_CAP_NR_VCPUS of +the KVM_CHECK_EXTENSION ioctl() at run-time. +The maximum possible value for max_vcpus can be retrieved using the +KVM_CAP_MAX_VCPUS of the KVM_CHECK_EXTENSION ioctl() at run-time. + +If the KVM_CAP_NR_VCPUS does not exist, you should assume that max_vcpus is 4 +cpus max. +If the KVM_CAP_MAX_VCPUS does not exist, you should assume that max_vcpus is +same as the value returned from KVM_CAP_NR_VCPUS. + +The maximum possible value for max_vcpu_id can be retrieved using the +KVM_CAP_MAX_VCPU_ID of the KVM_CHECK_EXTENSION ioctl() at run-time. + +If the KVM_CAP_MAX_VCPU_ID does not exist, you should assume that max_vcpu_id +is the same as the value returned from KVM_CAP_MAX_VCPUS. + +On powerpc using book3s_hv mode, the vcpus are mapped onto virtual +threads in one or more virtual CPU cores. 
(This is because the +hardware requires all the hardware threads in a CPU core to be in the +same partition.) The KVM_CAP_PPC_SMT capability indicates the number +of vcpus per virtual core (vcore). The vcore id is obtained by +dividing the vcpu id by the number of vcpus per vcore. The vcpus in a +given vcore will always be in the same physical core as each other +(though that might be a different physical core from time to time). +Userspace can control the threading (SMT) mode of the guest by its +allocation of vcpu ids. For example, if userspace wants +single-threaded guest vcpus, it should make all vcpu ids be a multiple +of the number of vcpus per vcore. + +For virtual cpus that have been created with S390 user controlled virtual +machines, the resulting vcpu fd can be memory mapped at page offset +KVM_S390_SIE_PAGE_OFFSET in order to obtain a memory map of the virtual +cpu's hardware control block. + + +4.8 KVM_GET_DIRTY_LOG (vm ioctl) + +Capability: basic +Architectures: all +Type: vm ioctl +Parameters: struct kvm_dirty_log (in/out) +Returns: 0 on success, -1 on error + +/* for KVM_GET_DIRTY_LOG */ +struct kvm_dirty_log { + __u32 slot; + __u32 padding; + union { + void __user *dirty_bitmap; /* one bit per page */ + __u64 padding; + }; +}; + +Given a memory slot, return a bitmap containing any pages dirtied +since the last call to this ioctl. Bit 0 is the first page in the +memory slot. Ensure the entire structure is cleared to avoid padding +issues. + +If KVM_CAP_MULTI_ADDRESS_SPACE is available, bits 16-31 of the slot field specify +the address space for which you want to return the dirty bitmap. +They must be less than the value that KVM_CHECK_EXTENSION returns for +the KVM_CAP_MULTI_ADDRESS_SPACE capability. + +The bits in the dirty bitmap are cleared before the ioctl returns, unless +KVM_CAP_MANUAL_DIRTY_LOG_PROTECT2 is enabled. For more information, +see the description of the capability.
+ +4.9 KVM_SET_MEMORY_ALIAS + +Capability: basic +Architectures: x86 +Type: vm ioctl +Parameters: struct kvm_memory_alias (in) +Returns: 0 (success), -1 (error) + +This ioctl is obsolete and has been removed. + + +4.10 KVM_RUN + +Capability: basic +Architectures: all +Type: vcpu ioctl +Parameters: none +Returns: 0 on success, -1 on error +Errors: + EINTR: an unmasked signal is pending + +This ioctl is used to run a guest virtual cpu. While there are no +explicit parameters, there is an implicit parameter block that can be +obtained by mmap()ing the vcpu fd at offset 0, with the size given by +KVM_GET_VCPU_MMAP_SIZE. The parameter block is formatted as a 'struct +kvm_run' (see below). + + +4.11 KVM_GET_REGS + +Capability: basic +Architectures: all except ARM, arm64 +Type: vcpu ioctl +Parameters: struct kvm_regs (out) +Returns: 0 on success, -1 on error + +Reads the general purpose registers from the vcpu. + +/* x86 */ +struct kvm_regs { + /* out (KVM_GET_REGS) / in (KVM_SET_REGS) */ + __u64 rax, rbx, rcx, rdx; + __u64 rsi, rdi, rsp, rbp; + __u64 r8, r9, r10, r11; + __u64 r12, r13, r14, r15; + __u64 rip, rflags; +}; + +/* mips */ +struct kvm_regs { + /* out (KVM_GET_REGS) / in (KVM_SET_REGS) */ + __u64 gpr[32]; + __u64 hi; + __u64 lo; + __u64 pc; +}; + + +4.12 KVM_SET_REGS + +Capability: basic +Architectures: all except ARM, arm64 +Type: vcpu ioctl +Parameters: struct kvm_regs (in) +Returns: 0 on success, -1 on error + +Writes the general purpose registers into the vcpu. + +See KVM_GET_REGS for the data structure. + + +4.13 KVM_GET_SREGS + +Capability: basic +Architectures: x86, ppc +Type: vcpu ioctl +Parameters: struct kvm_sregs (out) +Returns: 0 on success, -1 on error + +Reads special registers from the vcpu. 
+ +/* x86 */ +struct kvm_sregs { + struct kvm_segment cs, ds, es, fs, gs, ss; + struct kvm_segment tr, ldt; + struct kvm_dtable gdt, idt; + __u64 cr0, cr2, cr3, cr4, cr8; + __u64 efer; + __u64 apic_base; + __u64 interrupt_bitmap[(KVM_NR_INTERRUPTS + 63) / 64]; +}; + +/* ppc -- see arch/powerpc/include/uapi/asm/kvm.h */ + +interrupt_bitmap is a bitmap of pending external interrupts. At most +one bit may be set. This interrupt has been acknowledged by the APIC +but not yet injected into the cpu core. + + +4.14 KVM_SET_SREGS + +Capability: basic +Architectures: x86, ppc +Type: vcpu ioctl +Parameters: struct kvm_sregs (in) +Returns: 0 on success, -1 on error + +Writes special registers into the vcpu. See KVM_GET_SREGS for the +data structures. + + +4.15 KVM_TRANSLATE + +Capability: basic +Architectures: x86 +Type: vcpu ioctl +Parameters: struct kvm_translation (in/out) +Returns: 0 on success, -1 on error + +Translates a virtual address according to the vcpu's current address +translation mode. + +struct kvm_translation { + /* in */ + __u64 linear_address; + + /* out */ + __u64 physical_address; + __u8 valid; + __u8 writeable; + __u8 usermode; + __u8 pad[5]; +}; + + +4.16 KVM_INTERRUPT + +Capability: basic +Architectures: x86, ppc, mips +Type: vcpu ioctl +Parameters: struct kvm_interrupt (in) +Returns: 0 on success, negative on failure. + +Queues a hardware interrupt vector to be injected. + +/* for KVM_INTERRUPT */ +struct kvm_interrupt { + /* in */ + __u32 irq; +}; + +X86: + +Returns: 0 on success, + -EEXIST if an interrupt is already enqueued + -EINVAL if the irq number is invalid + -ENXIO if the PIC is in the kernel + -EFAULT if the pointer is invalid + +Note 'irq' is an interrupt vector, not an interrupt pin or line. This +ioctl is useful if the in-kernel PIC is not used. + +PPC: + +Queues an external interrupt to be injected.
This ioctl is overloaded +with 3 different irq values: + +a) KVM_INTERRUPT_SET + + This injects an edge type external interrupt into the guest once it's ready + to receive interrupts. When injected, the interrupt is done. + +b) KVM_INTERRUPT_UNSET + + This unsets any pending interrupt. + + Only available with KVM_CAP_PPC_UNSET_IRQ. + +c) KVM_INTERRUPT_SET_LEVEL + + This injects a level type external interrupt into the guest context. The + interrupt stays pending until a specific ioctl with KVM_INTERRUPT_UNSET + is triggered. + + Only available with KVM_CAP_PPC_IRQ_LEVEL. + +Note that any value for 'irq' other than the ones stated above is invalid +and incurs unexpected behavior. + +This is an asynchronous vcpu ioctl and can be invoked from any thread. + +MIPS: + +Queues an external interrupt to be injected into the virtual CPU. A negative +interrupt number dequeues the interrupt. + +This is an asynchronous vcpu ioctl and can be invoked from any thread. + + +4.17 KVM_DEBUG_GUEST + +Capability: basic +Architectures: none +Type: vcpu ioctl +Parameters: none +Returns: -1 on error + +Support for this has been removed. Use KVM_SET_GUEST_DEBUG instead. + + +4.18 KVM_GET_MSRS + +Capability: basic (vcpu), KVM_CAP_GET_MSR_FEATURES (system) +Architectures: x86 +Type: system ioctl, vcpu ioctl +Parameters: struct kvm_msrs (in/out) +Returns: number of msrs successfully returned; + -1 on error + +When used as a system ioctl: +Reads the values of MSR-based features that are available for the VM. This +is similar to KVM_GET_SUPPORTED_CPUID, but it returns MSR indices and values. +The list of msr-based features can be obtained using KVM_GET_MSR_FEATURE_INDEX_LIST +in a system ioctl. + +When used as a vcpu ioctl: +Reads model-specific registers from the vcpu. Supported msr indices can +be obtained using KVM_GET_MSR_INDEX_LIST in a system ioctl.
+ +struct kvm_msrs { + __u32 nmsrs; /* number of msrs in entries */ + __u32 pad; + + struct kvm_msr_entry entries[0]; +}; + +struct kvm_msr_entry { + __u32 index; + __u32 reserved; + __u64 data; +}; + +Application code should set the 'nmsrs' member (which indicates the +size of the entries array) and the 'index' member of each array entry. +kvm will fill in the 'data' member. + + +4.19 KVM_SET_MSRS + +Capability: basic +Architectures: x86 +Type: vcpu ioctl +Parameters: struct kvm_msrs (in) +Returns: 0 on success, -1 on error + +Writes model-specific registers to the vcpu. See KVM_GET_MSRS for the +data structures. + +Application code should set the 'nmsrs' member (which indicates the +size of the entries array), and the 'index' and 'data' members of each +array entry. + + +4.20 KVM_SET_CPUID + +Capability: basic +Architectures: x86 +Type: vcpu ioctl +Parameters: struct kvm_cpuid (in) +Returns: 0 on success, -1 on error + +Defines the vcpu responses to the cpuid instruction. Applications +should use the KVM_SET_CPUID2 ioctl if available. + + +struct kvm_cpuid_entry { + __u32 function; + __u32 eax; + __u32 ebx; + __u32 ecx; + __u32 edx; + __u32 padding; +}; + +/* for KVM_SET_CPUID */ +struct kvm_cpuid { + __u32 nent; + __u32 padding; + struct kvm_cpuid_entry entries[0]; +}; + + +4.21 KVM_SET_SIGNAL_MASK + +Capability: basic +Architectures: all +Type: vcpu ioctl +Parameters: struct kvm_signal_mask (in) +Returns: 0 on success, -1 on error + +Defines which signals are blocked during execution of KVM_RUN. This +signal mask temporarily overrides the thread's signal mask. Any +unblocked signal received (except SIGKILL and SIGSTOP, which retain +their traditional behaviour) will cause KVM_RUN to return with -EINTR. + +Note the signal will only be delivered if not blocked by the original +signal mask.
+ +/* for KVM_SET_SIGNAL_MASK */ +struct kvm_signal_mask { + __u32 len; + __u8 sigset[0]; +}; + + +4.22 KVM_GET_FPU + +Capability: basic +Architectures: x86 +Type: vcpu ioctl +Parameters: struct kvm_fpu (out) +Returns: 0 on success, -1 on error + +Reads the floating point state from the vcpu. + +/* for KVM_GET_FPU and KVM_SET_FPU */ +struct kvm_fpu { + __u8 fpr[8][16]; + __u16 fcw; + __u16 fsw; + __u8 ftwx; /* in fxsave format */ + __u8 pad1; + __u16 last_opcode; + __u64 last_ip; + __u64 last_dp; + __u8 xmm[16][16]; + __u32 mxcsr; + __u32 pad2; +}; + + +4.23 KVM_SET_FPU + +Capability: basic +Architectures: x86 +Type: vcpu ioctl +Parameters: struct kvm_fpu (in) +Returns: 0 on success, -1 on error + +Writes the floating point state to the vcpu. + +/* for KVM_GET_FPU and KVM_SET_FPU */ +struct kvm_fpu { + __u8 fpr[8][16]; + __u16 fcw; + __u16 fsw; + __u8 ftwx; /* in fxsave format */ + __u8 pad1; + __u16 last_opcode; + __u64 last_ip; + __u64 last_dp; + __u8 xmm[16][16]; + __u32 mxcsr; + __u32 pad2; +}; + + +4.24 KVM_CREATE_IRQCHIP + +Capability: KVM_CAP_IRQCHIP, KVM_CAP_S390_IRQCHIP (s390) +Architectures: x86, ARM, arm64, s390 +Type: vm ioctl +Parameters: none +Returns: 0 on success, -1 on error + +Creates an interrupt controller model in the kernel. +On x86, creates a virtual ioapic, a virtual PIC (two PICs, nested), and sets up +future vcpus to have a local APIC. IRQ routing for GSIs 0-15 is set to both +PIC and IOAPIC; GSI 16-23 only go to the IOAPIC. +On ARM/arm64, a GICv2 is created. Any other GIC versions require the usage of +KVM_CREATE_DEVICE, which also supports creating a GICv2. Using +KVM_CREATE_DEVICE is preferred over KVM_CREATE_IRQCHIP for GICv2. +On s390, a dummy irq routing table is created. + +Note that on s390 the KVM_CAP_S390_IRQCHIP vm capability needs to be enabled +before KVM_CREATE_IRQCHIP can be used. 
+ + +4.25 KVM_IRQ_LINE + +Capability: KVM_CAP_IRQCHIP +Architectures: x86, arm, arm64 +Type: vm ioctl +Parameters: struct kvm_irq_level +Returns: 0 on success, -1 on error + +Sets the level of a GSI input to the interrupt controller model in the kernel. +On some architectures it is required that an interrupt controller model has +been previously created with KVM_CREATE_IRQCHIP. Note that edge-triggered +interrupts require the level to be set to 1 and then back to 0. + +On real hardware, interrupt pins can be active-low or active-high. This +does not matter for the level field of struct kvm_irq_level: 1 always +means active (asserted), 0 means inactive (deasserted). + +x86 allows the operating system to program the interrupt polarity +(active-low/active-high) for level-triggered interrupts, and KVM used +to consider the polarity. However, due to bitrot in the handling of +active-low interrupts, the above convention is now valid on x86 too. +This is signaled by KVM_CAP_X86_IOAPIC_POLARITY_IGNORED. Userspace +should not present interrupts to the guest as active-low unless this +capability is present (or unless it is not using the in-kernel irqchip, +of course). + + +ARM/arm64 can signal an interrupt either at the CPU level, or at the +in-kernel irqchip (GIC), and for in-kernel irqchip can tell the GIC to +use PPIs designated for specific cpus. The irq field is interpreted +like this: + +  bits: | 31 ... 24 | 23 ... 16 | 15 ... 0 | + field: | irq_type | vcpu_index | irq_id | + +The irq_type field has the following values: +- irq_type[0]: out-of-kernel GIC: irq_id 0 is IRQ, irq_id 1 is FIQ +- irq_type[1]: in-kernel GIC: SPI, irq_id between 32 and 1019 (incl.) + (the vcpu_index field is ignored) +- irq_type[2]: in-kernel GIC: PPI, irq_id between 16 and 31 (incl.) + +(The irq_id field thus corresponds nicely to the IRQ ID in the ARM GIC specs) + +In both cases, level is used to assert/deassert the line. 
+ +struct kvm_irq_level { + union { + __u32 irq; /* GSI */ + __s32 status; /* not used for KVM_IRQ_LEVEL */ + }; + __u32 level; /* 0 or 1 */ +}; + + +4.26 KVM_GET_IRQCHIP + +Capability: KVM_CAP_IRQCHIP +Architectures: x86 +Type: vm ioctl +Parameters: struct kvm_irqchip (in/out) +Returns: 0 on success, -1 on error + +Reads the state of a kernel interrupt controller created with +KVM_CREATE_IRQCHIP into a buffer provided by the caller. + +struct kvm_irqchip { + __u32 chip_id; /* 0 = PIC1, 1 = PIC2, 2 = IOAPIC */ + __u32 pad; + union { + char dummy[512]; /* reserving space */ + struct kvm_pic_state pic; + struct kvm_ioapic_state ioapic; + } chip; +}; + + +4.27 KVM_SET_IRQCHIP + +Capability: KVM_CAP_IRQCHIP +Architectures: x86 +Type: vm ioctl +Parameters: struct kvm_irqchip (in) +Returns: 0 on success, -1 on error + +Sets the state of a kernel interrupt controller created with +KVM_CREATE_IRQCHIP from a buffer provided by the caller. + +struct kvm_irqchip { + __u32 chip_id; /* 0 = PIC1, 1 = PIC2, 2 = IOAPIC */ + __u32 pad; + union { + char dummy[512]; /* reserving space */ + struct kvm_pic_state pic; + struct kvm_ioapic_state ioapic; + } chip; +}; + + +4.28 KVM_XEN_HVM_CONFIG + +Capability: KVM_CAP_XEN_HVM +Architectures: x86 +Type: vm ioctl +Parameters: struct kvm_xen_hvm_config (in) +Returns: 0 on success, -1 on error + +Sets the MSR that the Xen HVM guest uses to initialize its hypercall +page, and provides the starting address and size of the hypercall +blobs in userspace. When the guest writes the MSR, kvm copies one +page of a blob (32- or 64-bit, depending on the vcpu mode) to guest +memory. 
+ +struct kvm_xen_hvm_config { + __u32 flags; + __u32 msr; + __u64 blob_addr_32; + __u64 blob_addr_64; + __u8 blob_size_32; + __u8 blob_size_64; + __u8 pad2[30]; +}; + + +4.29 KVM_GET_CLOCK + +Capability: KVM_CAP_ADJUST_CLOCK +Architectures: x86 +Type: vm ioctl +Parameters: struct kvm_clock_data (out) +Returns: 0 on success, -1 on error + +Gets the current timestamp of kvmclock as seen by the current guest. In +conjunction with KVM_SET_CLOCK, it is used to ensure monotonicity in scenarios +such as migration. + +When KVM_CAP_ADJUST_CLOCK is passed to KVM_CHECK_EXTENSION, it returns the +set of bits that KVM can return in struct kvm_clock_data's flags member. + +The only flag defined now is KVM_CLOCK_TSC_STABLE. If set, the returned +value is the exact kvmclock value seen by all VCPUs at the instant +when KVM_GET_CLOCK was called. If clear, the returned value is simply +CLOCK_MONOTONIC plus a constant offset; the offset can be modified +with KVM_SET_CLOCK. KVM will try to make all VCPUs follow this clock, +but the exact value read by each VCPU could differ, because the host +TSC is not stable. + +struct kvm_clock_data { + __u64 clock; /* kvmclock current value */ + __u32 flags; + __u32 pad[9]; +}; + + +4.30 KVM_SET_CLOCK + +Capability: KVM_CAP_ADJUST_CLOCK +Architectures: x86 +Type: vm ioctl +Parameters: struct kvm_clock_data (in) +Returns: 0 on success, -1 on error + +Sets the current timestamp of kvmclock to the value specified in its parameter. +In conjunction with KVM_GET_CLOCK, it is used to ensure monotonicity in scenarios +such as migration.
+ +struct kvm_clock_data { + __u64 clock; /* kvmclock current value */ + __u32 flags; + __u32 pad[9]; +}; + + +4.31 KVM_GET_VCPU_EVENTS + +Capability: KVM_CAP_VCPU_EVENTS +Extended by: KVM_CAP_INTR_SHADOW +Architectures: x86, arm, arm64 +Type: vcpu ioctl +Parameters: struct kvm_vcpu_events (out) +Returns: 0 on success, -1 on error + +X86: + +Gets currently pending exceptions, interrupts, and NMIs as well as related +states of the vcpu. + +struct kvm_vcpu_events { + struct { + __u8 injected; + __u8 nr; + __u8 has_error_code; + __u8 pending; + __u32 error_code; + } exception; + struct { + __u8 injected; + __u8 nr; + __u8 soft; + __u8 shadow; + } interrupt; + struct { + __u8 injected; + __u8 pending; + __u8 masked; + __u8 pad; + } nmi; + __u32 sipi_vector; + __u32 flags; + struct { + __u8 smm; + __u8 pending; + __u8 smm_inside_nmi; + __u8 latched_init; + } smi; + __u8 reserved[27]; + __u8 exception_has_payload; + __u64 exception_payload; +}; + +The following bits are defined in the flags field: + +- KVM_VCPUEVENT_VALID_SHADOW may be set to signal that + interrupt.shadow contains a valid state. + +- KVM_VCPUEVENT_VALID_SMM may be set to signal that smi contains a + valid state. + +- KVM_VCPUEVENT_VALID_PAYLOAD may be set to signal that the + exception_has_payload, exception_payload, and exception.pending + fields contain a valid state. This bit will be set whenever + KVM_CAP_EXCEPTION_PAYLOAD is enabled. + +ARM/ARM64: + +If the guest accesses a device that is being emulated by the host kernel in +such a way that a real device would generate a physical SError, KVM may make +a virtual SError pending for that VCPU. This system error interrupt remains +pending until the guest takes the exception by unmasking PSTATE.A. + +Running the VCPU may cause it to take a pending SError, or make an access that +causes an SError to become pending. The event's description is only valid while +the VCPU is not running.
+ +This API provides a way to read and write the pending 'event' state that is not +visible to the guest. To save, restore or migrate a VCPU the struct representing +the state can be read then written using this GET/SET API, along with the other +guest-visible registers. It is not possible to 'cancel' an SError that has been +made pending. + +A device being emulated in user-space may also wish to generate an SError. To do +this the events structure can be populated by user-space. The current state +should be read first, to ensure no existing SError is pending. If an existing +SError is pending, the architecture's 'Multiple SError interrupts' rules should +be followed. (2.5.3 of DDI0587.a "ARM Reliability, Availability, and +Serviceability (RAS) Specification"). + +SError exceptions always have an ESR value. Some CPUs have the ability to +specify what the virtual SError's ESR value should be. These systems will +advertise KVM_CAP_ARM_INJECT_SERROR_ESR. In this case exception.serror_has_esr will +always have a non-zero value when read, and the agent making an SError pending +should specify the ISS field in the lower 24 bits of exception.serror_esr. If +the system supports KVM_CAP_ARM_INJECT_SERROR_ESR, but user-space sets the events +with exception.serror_has_esr as zero, KVM will choose an ESR. + +Specifying exception.serror_has_esr on a system that does not support it will return +-EINVAL. Setting anything other than the lower 24 bits of exception.serror_esr +will return -EINVAL. + +struct kvm_vcpu_events { + struct { + __u8 serror_pending; + __u8 serror_has_esr; + /* Align it to 8 bytes */ + __u8 pad[6]; + __u64 serror_esr; + } exception; + __u32 reserved[12]; +}; + +4.32 KVM_SET_VCPU_EVENTS + +Capability: KVM_CAP_VCPU_EVENTS +Extended by: KVM_CAP_INTR_SHADOW +Architectures: x86, arm, arm64 +Type: vcpu ioctl +Parameters: struct kvm_vcpu_events (in) +Returns: 0 on success, -1 on error + +X86: + +Set pending exceptions, interrupts, and NMIs as well as related states of the +vcpu.
+ +See KVM_GET_VCPU_EVENTS for the data structure. + +Fields that may be modified asynchronously by running VCPUs can be excluded +from the update. These fields are nmi.pending, sipi_vector, smi.smm, +smi.pending. Keep the corresponding bits in the flags field cleared to +suppress overwriting the current in-kernel state. The bits are: + +KVM_VCPUEVENT_VALID_NMI_PENDING - transfer nmi.pending to the kernel +KVM_VCPUEVENT_VALID_SIPI_VECTOR - transfer sipi_vector +KVM_VCPUEVENT_VALID_SMM - transfer the smi sub-struct. + +If KVM_CAP_INTR_SHADOW is available, KVM_VCPUEVENT_VALID_SHADOW can be set in +the flags field to signal that interrupt.shadow contains a valid state and +shall be written into the VCPU. + +KVM_VCPUEVENT_VALID_SMM can only be set if KVM_CAP_X86_SMM is available. + +If KVM_CAP_EXCEPTION_PAYLOAD is enabled, KVM_VCPUEVENT_VALID_PAYLOAD +can be set in the flags field to signal that the +exception_has_payload, exception_payload, and exception.pending fields +contain a valid state and shall be written into the VCPU. + +ARM/ARM64: + +Set the pending SError exception state for this VCPU. It is not possible to +'cancel' an SError that has been made pending. + +See KVM_GET_VCPU_EVENTS for the data structure. + + +4.33 KVM_GET_DEBUGREGS + +Capability: KVM_CAP_DEBUGREGS +Architectures: x86 +Type: vm ioctl +Parameters: struct kvm_debugregs (out) +Returns: 0 on success, -1 on error + +Reads debug registers from the vcpu. + +struct kvm_debugregs { + __u64 db[4]; + __u64 dr6; + __u64 dr7; + __u64 flags; + __u64 reserved[9]; +}; + + +4.34 KVM_SET_DEBUGREGS + +Capability: KVM_CAP_DEBUGREGS +Architectures: x86 +Type: vm ioctl +Parameters: struct kvm_debugregs (in) +Returns: 0 on success, -1 on error + +Writes debug registers into the vcpu. + +See KVM_GET_DEBUGREGS for the data structure. The flags field is not +yet used and must be cleared on entry.
+ + +4.35 KVM_SET_USER_MEMORY_REGION + +Capability: KVM_CAP_USER_MEMORY +Architectures: all +Type: vm ioctl +Parameters: struct kvm_userspace_memory_region (in) +Returns: 0 on success, -1 on error + +struct kvm_userspace_memory_region { + __u32 slot; + __u32 flags; + __u64 guest_phys_addr; + __u64 memory_size; /* bytes */ + __u64 userspace_addr; /* start of the userspace allocated memory */ +}; + +/* for kvm_memory_region::flags */ +#define KVM_MEM_LOG_DIRTY_PAGES (1UL << 0) +#define KVM_MEM_READONLY (1UL << 1) + +This ioctl allows the user to create, modify or delete a guest physical +memory slot. Bits 0-15 of "slot" specify the slot id and this value +should be less than the maximum number of user memory slots supported per +VM. The maximum allowed slots can be queried using KVM_CAP_NR_MEMSLOTS. +Slots may not overlap in guest physical address space. + +If KVM_CAP_MULTI_ADDRESS_SPACE is available, bits 16-31 of "slot" +specifies the address space which is being modified. They must be +less than the value that KVM_CHECK_EXTENSION returns for the +KVM_CAP_MULTI_ADDRESS_SPACE capability. Slots in separate address spaces +are unrelated; the restriction on overlapping slots only applies within +each address space. + +Deleting a slot is done by passing zero for memory_size. When changing +an existing slot, it may be moved in the guest physical memory space, +or its flags may be modified, but it may not be resized. + +Memory for the region is taken starting at the address denoted by the +field userspace_addr, which must point at user addressable memory for +the entire memory slot size. Any object may back this memory, including +anonymous memory, ordinary files, and hugetlbfs. + +It is recommended that the lower 21 bits of guest_phys_addr and userspace_addr +be identical. This allows large pages in the guest to be backed by large +pages in the host. + +The flags field supports two flags: KVM_MEM_LOG_DIRTY_PAGES and +KVM_MEM_READONLY. 
The former can be set to instruct KVM to keep track of +writes to memory within the slot. See KVM_GET_DIRTY_LOG ioctl to know how to +use it. The latter can be set, if KVM_CAP_READONLY_MEM capability allows it, +to make a new slot read-only. In this case, writes to this memory will be +posted to userspace as KVM_EXIT_MMIO exits. + +When the KVM_CAP_SYNC_MMU capability is available, changes in the backing of +the memory region are automatically reflected into the guest. For example, an +mmap() that affects the region will be made visible immediately. Another +example is madvise(MADV_DROP). + +It is recommended to use this API instead of the KVM_SET_MEMORY_REGION ioctl. +The KVM_SET_MEMORY_REGION does not allow fine grained control over memory +allocation and is deprecated. + + +4.36 KVM_SET_TSS_ADDR + +Capability: KVM_CAP_SET_TSS_ADDR +Architectures: x86 +Type: vm ioctl +Parameters: unsigned long tss_address (in) +Returns: 0 on success, -1 on error + +This ioctl defines the physical address of a three-page region in the guest +physical address space. The region must be within the first 4GB of the +guest physical address space and must not conflict with any memory slot +or any mmio address. The guest may malfunction if it accesses this memory +region. + +This ioctl is required on Intel-based hosts. This is needed on Intel hardware +because of a quirk in the virtualization implementation (see the internals +documentation when it pops into existence). + + +4.37 KVM_ENABLE_CAP + +Capability: KVM_CAP_ENABLE_CAP +Architectures: mips, ppc, s390 +Type: vcpu ioctl +Parameters: struct kvm_enable_cap (in) +Returns: 0 on success; -1 on error + +Capability: KVM_CAP_ENABLE_CAP_VM +Architectures: all +Type: vm ioctl +Parameters: struct kvm_enable_cap (in) +Returns: 0 on success; -1 on error + +Not all extensions are enabled by default. Using this ioctl the application +can enable an extension, making it available to the guest.
+ +On systems that do not support this ioctl, it always fails. On systems that +do support it, it only works for extensions that are supported for enablement. + +To check if a capability can be enabled, the KVM_CHECK_EXTENSION ioctl should +be used. + +struct kvm_enable_cap { + /* in */ + __u32 cap; + +The capability that is supposed to get enabled. + + __u32 flags; + +A bitfield indicating future enhancements. Has to be 0 for now. + + __u64 args[4]; + +Arguments for enabling a feature. If a feature needs initial values to +function properly, this is the place to put them. + + __u8 pad[64]; +}; + +The vcpu ioctl should be used for vcpu-specific capabilities, the vm ioctl +for vm-wide capabilities. + +4.38 KVM_GET_MP_STATE + +Capability: KVM_CAP_MP_STATE +Architectures: x86, s390, arm, arm64 +Type: vcpu ioctl +Parameters: struct kvm_mp_state (out) +Returns: 0 on success; -1 on error + +struct kvm_mp_state { + __u32 mp_state; +}; + +Returns the vcpu's current "multiprocessing state" (though also valid on +uniprocessor guests). + +Possible values are: + + - KVM_MP_STATE_RUNNABLE: the vcpu is currently running [x86,arm/arm64] + - KVM_MP_STATE_UNINITIALIZED: the vcpu is an application processor (AP) + which has not yet received an INIT signal [x86] + - KVM_MP_STATE_INIT_RECEIVED: the vcpu has received an INIT signal, and is + now ready for a SIPI [x86] + - KVM_MP_STATE_HALTED: the vcpu has executed a HLT instruction and + is waiting for an interrupt [x86] + - KVM_MP_STATE_SIPI_RECEIVED: the vcpu has just received a SIPI (vector + accessible via KVM_GET_VCPU_EVENTS) [x86] + - KVM_MP_STATE_STOPPED: the vcpu is stopped [s390,arm/arm64] + - KVM_MP_STATE_CHECK_STOP: the vcpu is in a special error state [s390] + - KVM_MP_STATE_OPERATING: the vcpu is operating (running or halted) + [s390] + - KVM_MP_STATE_LOAD: the vcpu is in a special load/startup state + [s390] + +On x86, this ioctl is only useful after KVM_CREATE_IRQCHIP. 
Without an +in-kernel irqchip, the multiprocessing state must be maintained by userspace on +these architectures. + +For arm/arm64: + +The only states that are valid are KVM_MP_STATE_STOPPED and +KVM_MP_STATE_RUNNABLE which reflect if the vcpu is paused or not. + +4.39 KVM_SET_MP_STATE + +Capability: KVM_CAP_MP_STATE +Architectures: x86, s390, arm, arm64 +Type: vcpu ioctl +Parameters: struct kvm_mp_state (in) +Returns: 0 on success; -1 on error + +Sets the vcpu's current "multiprocessing state"; see KVM_GET_MP_STATE for +arguments. + +On x86, this ioctl is only useful after KVM_CREATE_IRQCHIP. Without an +in-kernel irqchip, the multiprocessing state must be maintained by userspace on +these architectures. + +For arm/arm64: + +The only states that are valid are KVM_MP_STATE_STOPPED and +KVM_MP_STATE_RUNNABLE which reflect if the vcpu should be paused or not. + +4.40 KVM_SET_IDENTITY_MAP_ADDR + +Capability: KVM_CAP_SET_IDENTITY_MAP_ADDR +Architectures: x86 +Type: vm ioctl +Parameters: unsigned long identity (in) +Returns: 0 on success, -1 on error + +This ioctl defines the physical address of a one-page region in the guest +physical address space. The region must be within the first 4GB of the +guest physical address space and must not conflict with any memory slot +or any mmio address. The guest may malfunction if it accesses this memory +region. + +Setting the address to 0 will result in resetting the address to its default +(0xfffbc000). + +This ioctl is required on Intel-based hosts. This is needed on Intel hardware +because of a quirk in the virtualization implementation (see the internals +documentation when it pops into existence). + +Fails if any VCPU has already been created. + +4.41 KVM_SET_BOOT_CPU_ID + +Capability: KVM_CAP_SET_BOOT_CPU_ID +Architectures: x86 +Type: vm ioctl +Parameters: unsigned long vcpu_id +Returns: 0 on success, -1 on error + +Define which vcpu is the Bootstrap Processor (BSP). Values are the same +as the vcpu id in KVM_CREATE_VCPU. 
If this ioctl is not called, the default +is vcpu 0. + + +4.42 KVM_GET_XSAVE + +Capability: KVM_CAP_XSAVE +Architectures: x86 +Type: vcpu ioctl +Parameters: struct kvm_xsave (out) +Returns: 0 on success, -1 on error + +struct kvm_xsave { + __u32 region[1024]; +}; + +This ioctl would copy current vcpu's xsave struct to the userspace. + + +4.43 KVM_SET_XSAVE + +Capability: KVM_CAP_XSAVE +Architectures: x86 +Type: vcpu ioctl +Parameters: struct kvm_xsave (in) +Returns: 0 on success, -1 on error + +struct kvm_xsave { + __u32 region[1024]; +}; + +This ioctl would copy userspace's xsave struct to the kernel. + + +4.44 KVM_GET_XCRS + +Capability: KVM_CAP_XCRS +Architectures: x86 +Type: vcpu ioctl +Parameters: struct kvm_xcrs (out) +Returns: 0 on success, -1 on error + +struct kvm_xcr { + __u32 xcr; + __u32 reserved; + __u64 value; +}; + +struct kvm_xcrs { + __u32 nr_xcrs; + __u32 flags; + struct kvm_xcr xcrs[KVM_MAX_XCRS]; + __u64 padding[16]; +}; + +This ioctl would copy current vcpu's xcrs to the userspace. + + +4.45 KVM_SET_XCRS + +Capability: KVM_CAP_XCRS +Architectures: x86 +Type: vcpu ioctl +Parameters: struct kvm_xcrs (in) +Returns: 0 on success, -1 on error + +struct kvm_xcr { + __u32 xcr; + __u32 reserved; + __u64 value; +}; + +struct kvm_xcrs { + __u32 nr_xcrs; + __u32 flags; + struct kvm_xcr xcrs[KVM_MAX_XCRS]; + __u64 padding[16]; +}; + +This ioctl would set vcpu's xcr to the value userspace specified. 
+ + +4.46 KVM_GET_SUPPORTED_CPUID + +Capability: KVM_CAP_EXT_CPUID +Architectures: x86 +Type: system ioctl +Parameters: struct kvm_cpuid2 (in/out) +Returns: 0 on success, -1 on error + +struct kvm_cpuid2 { + __u32 nent; + __u32 padding; + struct kvm_cpuid_entry2 entries[0]; +}; + +#define KVM_CPUID_FLAG_SIGNIFCANT_INDEX BIT(0) +#define KVM_CPUID_FLAG_STATEFUL_FUNC BIT(1) +#define KVM_CPUID_FLAG_STATE_READ_NEXT BIT(2) + +struct kvm_cpuid_entry2 { + __u32 function; + __u32 index; + __u32 flags; + __u32 eax; + __u32 ebx; + __u32 ecx; + __u32 edx; + __u32 padding[3]; +}; + +This ioctl returns x86 cpuid features which are supported by both the +hardware and kvm in its default configuration. Userspace can use the +information returned by this ioctl to construct cpuid information (for +KVM_SET_CPUID2) that is consistent with hardware, kernel, and +userspace capabilities, and with user requirements (for example, the +user may wish to constrain cpuid to emulate older hardware, or for +feature consistency across a cluster). + +Note that certain capabilities, such as KVM_CAP_X86_DISABLE_EXITS, may +expose cpuid features (e.g. MONITOR) which are not supported by kvm in +its default configuration. If userspace enables such capabilities, it +is responsible for modifying the results of this ioctl appropriately. + +Userspace invokes KVM_GET_SUPPORTED_CPUID by passing a kvm_cpuid2 structure +with the 'nent' field indicating the number of entries in the variable-size +array 'entries'. If the number of entries is too low to describe the cpu +capabilities, an error (E2BIG) is returned. If the number is too high, +the 'nent' field is adjusted and an error (ENOMEM) is returned. If the +number is just right, the 'nent' field is adjusted to the number of valid +entries in the 'entries' array, which is then filled. + +The entries returned are the host cpuid as returned by the cpuid instruction, +with unknown or unsupported features masked out. 
Some features (for example, +x2apic), may not be present in the host cpu, but are exposed by kvm if it can +emulate them efficiently. The fields in each entry are defined as follows: + + function: the eax value used to obtain the entry + index: the ecx value used to obtain the entry (for entries that are + affected by ecx) + flags: an OR of zero or more of the following: + KVM_CPUID_FLAG_SIGNIFCANT_INDEX: + if the index field is valid + KVM_CPUID_FLAG_STATEFUL_FUNC: + if cpuid for this function returns different values for successive + invocations; there will be several entries with the same function, + all with this flag set + KVM_CPUID_FLAG_STATE_READ_NEXT: + for KVM_CPUID_FLAG_STATEFUL_FUNC entries, set if this entry is + the first entry to be read by a cpu + eax, ebx, ecx, edx: the values returned by the cpuid instruction for + this function/index combination + +The TSC deadline timer feature (CPUID leaf 1, ecx[24]) is always returned +as false, since the feature depends on KVM_CREATE_IRQCHIP for local APIC +support. Instead it is reported via + + ioctl(KVM_CHECK_EXTENSION, KVM_CAP_TSC_DEADLINE_TIMER) + +if that returns true and you use KVM_CREATE_IRQCHIP, or if you emulate the +feature in userspace, then you can enable the feature for KVM_SET_CPUID2. + + +4.47 KVM_PPC_GET_PVINFO + +Capability: KVM_CAP_PPC_GET_PVINFO +Architectures: ppc +Type: vm ioctl +Parameters: struct kvm_ppc_pvinfo (out) +Returns: 0 on success, !0 on error + +struct kvm_ppc_pvinfo { + __u32 flags; + __u32 hcall[4]; + __u8 pad[108]; +}; + +This ioctl fetches PV specific information that needs to be passed to the guest +using the device tree or other means from vm context. + +The hcall array defines 4 instructions that make up a hypercall. + +If any additional field gets added to this structure later on, a bit for that +additional piece of information will be set in the flags bitmap.
+ +The flags bitmap is defined as: + + /* the host supports the ePAPR idle hcall */ + #define KVM_PPC_PVINFO_FLAGS_EV_IDLE (1<<0) + +4.52 KVM_SET_GSI_ROUTING + +Capability: KVM_CAP_IRQ_ROUTING +Architectures: x86 s390 arm arm64 +Type: vm ioctl +Parameters: struct kvm_irq_routing (in) +Returns: 0 on success, -1 on error + +Sets the GSI routing table entries, overwriting any previously set entries. + +On arm/arm64, GSI routing has the following limitation: +- GSI routing does not apply to KVM_IRQ_LINE but only to KVM_IRQFD. + +struct kvm_irq_routing { + __u32 nr; + __u32 flags; + struct kvm_irq_routing_entry entries[0]; +}; + +No flags are specified so far, the corresponding field must be set to zero. + +struct kvm_irq_routing_entry { + __u32 gsi; + __u32 type; + __u32 flags; + __u32 pad; + union { + struct kvm_irq_routing_irqchip irqchip; + struct kvm_irq_routing_msi msi; + struct kvm_irq_routing_s390_adapter adapter; + struct kvm_irq_routing_hv_sint hv_sint; + __u32 pad[8]; + } u; +}; + +/* gsi routing entry types */ +#define KVM_IRQ_ROUTING_IRQCHIP 1 +#define KVM_IRQ_ROUTING_MSI 2 +#define KVM_IRQ_ROUTING_S390_ADAPTER 3 +#define KVM_IRQ_ROUTING_HV_SINT 4 + +flags: +- KVM_MSI_VALID_DEVID: used along with KVM_IRQ_ROUTING_MSI routing entry + type, specifies that the devid field contains a valid value. The per-VM + KVM_CAP_MSI_DEVID capability advertises the requirement to provide + the device ID. If this capability is not available, userspace should + never set the KVM_MSI_VALID_DEVID flag as the ioctl might fail. +- zero otherwise + +struct kvm_irq_routing_irqchip { + __u32 irqchip; + __u32 pin; +}; + +struct kvm_irq_routing_msi { + __u32 address_lo; + __u32 address_hi; + __u32 data; + union { + __u32 pad; + __u32 devid; + }; +}; + +If KVM_MSI_VALID_DEVID is set, devid contains a unique device identifier +for the device that wrote the MSI message. For PCI, this is usually a +BDF identifier in the lower 16 bits.
+ +On x86, address_hi is ignored unless the KVM_X2APIC_API_USE_32BIT_IDS +feature of KVM_CAP_X2APIC_API capability is enabled. If it is enabled, +address_hi bits 31-8 provide bits 31-8 of the destination id. Bits 7-0 of +address_hi must be zero. + +struct kvm_irq_routing_s390_adapter { + __u64 ind_addr; + __u64 summary_addr; + __u64 ind_offset; + __u32 summary_offset; + __u32 adapter_id; +}; + +struct kvm_irq_routing_hv_sint { + __u32 vcpu; + __u32 sint; +}; + + +4.55 KVM_SET_TSC_KHZ + +Capability: KVM_CAP_TSC_CONTROL +Architectures: x86 +Type: vcpu ioctl +Parameters: virtual tsc_khz +Returns: 0 on success, -1 on error + +Specifies the tsc frequency for the virtual machine. The unit of the +frequency is KHz. + + +4.56 KVM_GET_TSC_KHZ + +Capability: KVM_CAP_GET_TSC_KHZ +Architectures: x86 +Type: vcpu ioctl +Parameters: none +Returns: virtual tsc-khz on success, negative value on error + +Returns the tsc frequency of the guest. The unit of the return value is +KHz. If the host has unstable tsc this ioctl returns -EIO instead as an +error. + + +4.57 KVM_GET_LAPIC + +Capability: KVM_CAP_IRQCHIP +Architectures: x86 +Type: vcpu ioctl +Parameters: struct kvm_lapic_state (out) +Returns: 0 on success, -1 on error + +#define KVM_APIC_REG_SIZE 0x400 +struct kvm_lapic_state { + char regs[KVM_APIC_REG_SIZE]; +}; + +Reads the Local APIC registers and copies them into the input argument. The +data format and layout are the same as documented in the architecture manual. + +If KVM_X2APIC_API_USE_32BIT_IDS feature of KVM_CAP_X2APIC_API is +enabled, then the format of APIC_ID register depends on the APIC mode +(reported by MSR_IA32_APICBASE) of its VCPU. x2APIC stores APIC ID in +the APIC_ID register (bytes 32-35). xAPIC only allows an 8-bit APIC ID +which is stored in bits 31-24 of the APIC register, or equivalently in +byte 35 of struct kvm_lapic_state's regs field. KVM_GET_LAPIC must then +be called after MSR_IA32_APICBASE has been set with KVM_SET_MSR. 
+ +If KVM_X2APIC_API_USE_32BIT_IDS feature is disabled, struct kvm_lapic_state +always uses xAPIC format. + + +4.58 KVM_SET_LAPIC + +Capability: KVM_CAP_IRQCHIP +Architectures: x86 +Type: vcpu ioctl +Parameters: struct kvm_lapic_state (in) +Returns: 0 on success, -1 on error + +#define KVM_APIC_REG_SIZE 0x400 +struct kvm_lapic_state { + char regs[KVM_APIC_REG_SIZE]; +}; + +Copies the input argument into the Local APIC registers. The data format +and layout are the same as documented in the architecture manual. + +The format of the APIC ID register (bytes 32-35 of struct kvm_lapic_state's +regs field) depends on the state of the KVM_CAP_X2APIC_API capability. +See the note in KVM_GET_LAPIC. + + +4.59 KVM_IOEVENTFD + +Capability: KVM_CAP_IOEVENTFD +Architectures: all +Type: vm ioctl +Parameters: struct kvm_ioeventfd (in) +Returns: 0 on success, !0 on error + +This ioctl attaches or detaches an ioeventfd to a legal pio/mmio address +within the guest. A guest write in the registered address will signal the +provided event instead of triggering an exit. + +struct kvm_ioeventfd { + __u64 datamatch; + __u64 addr; /* legal pio/mmio address */ + __u32 len; /* 0, 1, 2, 4, or 8 bytes */ + __s32 fd; + __u32 flags; + __u8 pad[36]; +}; + +For the special case of virtio-ccw devices on s390, the ioevent is matched +to a subchannel/virtqueue tuple instead. + +The following flags are defined: + +#define KVM_IOEVENTFD_FLAG_DATAMATCH (1 << kvm_ioeventfd_flag_nr_datamatch) +#define KVM_IOEVENTFD_FLAG_PIO (1 << kvm_ioeventfd_flag_nr_pio) +#define KVM_IOEVENTFD_FLAG_DEASSIGN (1 << kvm_ioeventfd_flag_nr_deassign) +#define KVM_IOEVENTFD_FLAG_VIRTIO_CCW_NOTIFY \ + (1 << kvm_ioeventfd_flag_nr_virtio_ccw_notify) + +If datamatch flag is set, the event will be signaled only if the written value +to the registered address is equal to datamatch in struct kvm_ioeventfd. + +For virtio-ccw devices, addr contains the subchannel id and datamatch the +virtqueue index. 
+ +With KVM_CAP_IOEVENTFD_ANY_LENGTH, a zero length ioeventfd is allowed, and +the kernel will ignore the length of guest write and may get a faster vmexit. +The speedup may only apply to specific architectures, but the ioeventfd will +work anyway. + +4.60 KVM_DIRTY_TLB + +Capability: KVM_CAP_SW_TLB +Architectures: ppc +Type: vcpu ioctl +Parameters: struct kvm_dirty_tlb (in) +Returns: 0 on success, -1 on error + +struct kvm_dirty_tlb { + __u64 bitmap; + __u32 num_dirty; +}; + +This must be called whenever userspace has changed an entry in the shared +TLB, prior to calling KVM_RUN on the associated vcpu. + +The "bitmap" field is the userspace address of an array. This array +consists of a number of bits, equal to the total number of TLB entries as +determined by the last successful call to KVM_CONFIG_TLB, rounded up to the +nearest multiple of 64. + +Each bit corresponds to one TLB entry, ordered the same as in the shared TLB +array. + +The array is little-endian: the bit 0 is the least significant bit of the +first byte, bit 8 is the least significant bit of the second byte, etc. +This avoids any complications with differing word sizes. + +The "num_dirty" field is a performance hint for KVM to determine whether it +should skip processing the bitmap and just invalidate everything. It must +be set to the number of set bits in the bitmap. + + +4.62 KVM_CREATE_SPAPR_TCE + +Capability: KVM_CAP_SPAPR_TCE +Architectures: powerpc +Type: vm ioctl +Parameters: struct kvm_create_spapr_tce (in) +Returns: file descriptor for manipulating the created TCE table + +This creates a virtual TCE (translation control entry) table, which +is an IOMMU for PAPR-style virtual I/O. It is used to translate +logical addresses used in virtual I/O into guest physical addresses, +and provides a scatter/gather capability for PAPR virtual I/O. 
+ +/* for KVM_CAP_SPAPR_TCE */ +struct kvm_create_spapr_tce { + __u64 liobn; + __u32 window_size; +}; + +The liobn field gives the logical IO bus number for which to create a +TCE table. The window_size field specifies the size of the DMA window +which this TCE table will translate - the table will contain one 64 +bit TCE entry for every 4kiB of the DMA window. + +When the guest issues an H_PUT_TCE hcall on a liobn for which a TCE +table has been created using this ioctl(), the kernel will handle it +in real mode, updating the TCE table. H_PUT_TCE calls for other +liobns will cause a vm exit and must be handled by userspace. + +The return value is a file descriptor which can be passed to mmap(2) +to map the created TCE table into userspace. This lets userspace read +the entries written by kernel-handled H_PUT_TCE calls, and also lets +userspace update the TCE table directly which is useful in some +circumstances. + + +4.63 KVM_ALLOCATE_RMA + +Capability: KVM_CAP_PPC_RMA +Architectures: powerpc +Type: vm ioctl +Parameters: struct kvm_allocate_rma (out) +Returns: file descriptor for mapping the allocated RMA + +This allocates a Real Mode Area (RMA) from the pool allocated at boot +time by the kernel. An RMA is a physically-contiguous, aligned region +of memory used on older POWER processors to provide the memory which +will be accessed by real-mode (MMU off) accesses in a KVM guest. +POWER processors support a set of sizes for the RMA that usually +includes 64MB, 128MB, 256MB and some larger powers of two. + +/* for KVM_ALLOCATE_RMA */ +struct kvm_allocate_rma { + __u64 rma_size; +}; + +The return value is a file descriptor which can be passed to mmap(2) +to map the allocated RMA into userspace. The mapped area can then be +passed to the KVM_SET_USER_MEMORY_REGION ioctl to establish it as the +RMA for a virtual machine. The size of the RMA in bytes (which is +fixed at host kernel boot time) is returned in the rma_size field of +the argument structure. 
+ +The KVM_CAP_PPC_RMA capability is 1 or 2 if the KVM_ALLOCATE_RMA ioctl +is supported; 2 if the processor requires all virtual machines to have +an RMA, or 1 if the processor can use an RMA but doesn't require it, +because it supports the Virtual RMA (VRMA) facility. + + +4.64 KVM_NMI + +Capability: KVM_CAP_USER_NMI +Architectures: x86 +Type: vcpu ioctl +Parameters: none +Returns: 0 on success, -1 on error + +Queues an NMI on the thread's vcpu. Note this is well defined only +when KVM_CREATE_IRQCHIP has not been called, since this is an interface +between the virtual cpu core and virtual local APIC. After KVM_CREATE_IRQCHIP +has been called, this interface is completely emulated within the kernel. + +To use this to emulate the LINT1 input with KVM_CREATE_IRQCHIP, use the +following algorithm: + + - pause the vcpu + - read the local APIC's state (KVM_GET_LAPIC) + - check whether changing LINT1 will queue an NMI (see the LVT entry for LINT1) + - if so, issue KVM_NMI + - resume the vcpu + +Some guests configure the LINT1 NMI input to cause a panic, aiding in +debugging. + + +4.65 KVM_S390_UCAS_MAP + +Capability: KVM_CAP_S390_UCONTROL +Architectures: s390 +Type: vcpu ioctl +Parameters: struct kvm_s390_ucas_mapping (in) +Returns: 0 in case of success + +The parameter is defined like this: + struct kvm_s390_ucas_mapping { + __u64 user_addr; + __u64 vcpu_addr; + __u64 length; + }; + +This ioctl maps the memory at "user_addr" with the length "length" to +the vcpu's address space starting at "vcpu_addr". All parameters need to +be aligned by 1 megabyte. + + +4.66 KVM_S390_UCAS_UNMAP + +Capability: KVM_CAP_S390_UCONTROL +Architectures: s390 +Type: vcpu ioctl +Parameters: struct kvm_s390_ucas_mapping (in) +Returns: 0 in case of success + +The parameter is defined like this: + struct kvm_s390_ucas_mapping { + __u64 user_addr; + __u64 vcpu_addr; + __u64 length; + }; + +This ioctl unmaps the memory in the vcpu's address space starting at +"vcpu_addr" with the length "length". 
The field "user_addr" is ignored. +All parameters need to be aligned by 1 megabyte. + + +4.67 KVM_S390_VCPU_FAULT + +Capability: KVM_CAP_S390_UCONTROL +Architectures: s390 +Type: vcpu ioctl +Parameters: vcpu absolute address (in) +Returns: 0 in case of success + +This call creates a page table entry on the virtual cpu's address space +(for user controlled virtual machines) or the virtual machine's address +space (for regular virtual machines). This only works for minor faults, +thus it's recommended to access subject memory page via the user page +table upfront. This is useful to handle validity intercepts for user +controlled virtual machines to fault in the virtual cpu's lowcore pages +prior to calling the KVM_RUN ioctl. + + +4.68 KVM_SET_ONE_REG + +Capability: KVM_CAP_ONE_REG +Architectures: all +Type: vcpu ioctl +Parameters: struct kvm_one_reg (in) +Returns: 0 on success, negative value on failure +Errors: +  ENOENT:   no such register +  EINVAL:   invalid register ID, or no such register +  EPERM:    (arm64) register access not allowed before vcpu finalization +(These error codes are indicative only: do not rely on a specific error +code being returned in a specific situation.) + +struct kvm_one_reg { + __u64 id; + __u64 addr; +}; + +Using this ioctl, a single vcpu register can be set to a specific value +defined by user space with the passed in struct kvm_one_reg, where id +refers to the register identifier as described below and addr is a pointer +to a variable with the respective size. There can be architecture agnostic +and architecture specific registers. Each have their own range of operation +and their own constants and width. 
To keep track of the implemented +registers, find a list below: + + Arch | Register | Width (bits) + | | + PPC | KVM_REG_PPC_HIOR | 64 + PPC | KVM_REG_PPC_IAC1 | 64 + PPC | KVM_REG_PPC_IAC2 | 64 + PPC | KVM_REG_PPC_IAC3 | 64 + PPC | KVM_REG_PPC_IAC4 | 64 + PPC | KVM_REG_PPC_DAC1 | 64 + PPC | KVM_REG_PPC_DAC2 | 64 + PPC | KVM_REG_PPC_DABR | 64 + PPC | KVM_REG_PPC_DSCR | 64 + PPC | KVM_REG_PPC_PURR | 64 + PPC | KVM_REG_PPC_SPURR | 64 + PPC | KVM_REG_PPC_DAR | 64 + PPC | KVM_REG_PPC_DSISR | 32 + PPC | KVM_REG_PPC_AMR | 64 + PPC | KVM_REG_PPC_UAMOR | 64 + PPC | KVM_REG_PPC_MMCR0 | 64 + PPC | KVM_REG_PPC_MMCR1 | 64 + PPC | KVM_REG_PPC_MMCRA | 64 + PPC | KVM_REG_PPC_MMCR2 | 64 + PPC | KVM_REG_PPC_MMCRS | 64 + PPC | KVM_REG_PPC_SIAR | 64 + PPC | KVM_REG_PPC_SDAR | 64 + PPC | KVM_REG_PPC_SIER | 64 + PPC | KVM_REG_PPC_PMC1 | 32 + PPC | KVM_REG_PPC_PMC2 | 32 + PPC | KVM_REG_PPC_PMC3 | 32 + PPC | KVM_REG_PPC_PMC4 | 32 + PPC | KVM_REG_PPC_PMC5 | 32 + PPC | KVM_REG_PPC_PMC6 | 32 + PPC | KVM_REG_PPC_PMC7 | 32 + PPC | KVM_REG_PPC_PMC8 | 32 + PPC | KVM_REG_PPC_FPR0 | 64 + ... + PPC | KVM_REG_PPC_FPR31 | 64 + PPC | KVM_REG_PPC_VR0 | 128 + ... + PPC | KVM_REG_PPC_VR31 | 128 + PPC | KVM_REG_PPC_VSR0 | 128 + ... 
+ PPC | KVM_REG_PPC_VSR31 | 128 + PPC | KVM_REG_PPC_FPSCR | 64 + PPC | KVM_REG_PPC_VSCR | 32 + PPC | KVM_REG_PPC_VPA_ADDR | 64 + PPC | KVM_REG_PPC_VPA_SLB | 128 + PPC | KVM_REG_PPC_VPA_DTL | 128 + PPC | KVM_REG_PPC_EPCR | 32 + PPC | KVM_REG_PPC_EPR | 32 + PPC | KVM_REG_PPC_TCR | 32 + PPC | KVM_REG_PPC_TSR | 32 + PPC | KVM_REG_PPC_OR_TSR | 32 + PPC | KVM_REG_PPC_CLEAR_TSR | 32 + PPC | KVM_REG_PPC_MAS0 | 32 + PPC | KVM_REG_PPC_MAS1 | 32 + PPC | KVM_REG_PPC_MAS2 | 64 + PPC | KVM_REG_PPC_MAS7_3 | 64 + PPC | KVM_REG_PPC_MAS4 | 32 + PPC | KVM_REG_PPC_MAS6 | 32 + PPC | KVM_REG_PPC_MMUCFG | 32 + PPC | KVM_REG_PPC_TLB0CFG | 32 + PPC | KVM_REG_PPC_TLB1CFG | 32 + PPC | KVM_REG_PPC_TLB2CFG | 32 + PPC | KVM_REG_PPC_TLB3CFG | 32 + PPC | KVM_REG_PPC_TLB0PS | 32 + PPC | KVM_REG_PPC_TLB1PS | 32 + PPC | KVM_REG_PPC_TLB2PS | 32 + PPC | KVM_REG_PPC_TLB3PS | 32 + PPC | KVM_REG_PPC_EPTCFG | 32 + PPC | KVM_REG_PPC_ICP_STATE | 64 + PPC | KVM_REG_PPC_VP_STATE | 128 + PPC | KVM_REG_PPC_TB_OFFSET | 64 + PPC | KVM_REG_PPC_SPMC1 | 32 + PPC | KVM_REG_PPC_SPMC2 | 32 + PPC | KVM_REG_PPC_IAMR | 64 + PPC | KVM_REG_PPC_TFHAR | 64 + PPC | KVM_REG_PPC_TFIAR | 64 + PPC | KVM_REG_PPC_TEXASR | 64 + PPC | KVM_REG_PPC_FSCR | 64 + PPC | KVM_REG_PPC_PSPB | 32 + PPC | KVM_REG_PPC_EBBHR | 64 + PPC | KVM_REG_PPC_EBBRR | 64 + PPC | KVM_REG_PPC_BESCR | 64 + PPC | KVM_REG_PPC_TAR | 64 + PPC | KVM_REG_PPC_DPDES | 64 + PPC | KVM_REG_PPC_DAWR | 64 + PPC | KVM_REG_PPC_DAWRX | 64 + PPC | KVM_REG_PPC_CIABR | 64 + PPC | KVM_REG_PPC_IC | 64 + PPC | KVM_REG_PPC_VTB | 64 + PPC | KVM_REG_PPC_CSIGR | 64 + PPC | KVM_REG_PPC_TACR | 64 + PPC | KVM_REG_PPC_TCSCR | 64 + PPC | KVM_REG_PPC_PID | 64 + PPC | KVM_REG_PPC_ACOP | 64 + PPC | KVM_REG_PPC_VRSAVE | 32 + PPC | KVM_REG_PPC_LPCR | 32 + PPC | KVM_REG_PPC_LPCR_64 | 64 + PPC | KVM_REG_PPC_PPR | 64 + PPC | KVM_REG_PPC_ARCH_COMPAT | 32 + PPC | KVM_REG_PPC_DABRX | 32 + PPC | KVM_REG_PPC_WORT | 64 + PPC | KVM_REG_PPC_SPRG9 | 64 + PPC | KVM_REG_PPC_DBSR | 32 + PPC | KVM_REG_PPC_TIDR | 
64 + PPC | KVM_REG_PPC_PSSCR | 64 + PPC | KVM_REG_PPC_DEC_EXPIRY | 64 + PPC | KVM_REG_PPC_PTCR | 64 + PPC | KVM_REG_PPC_TM_GPR0 | 64 + ... + PPC | KVM_REG_PPC_TM_GPR31 | 64 + PPC | KVM_REG_PPC_TM_VSR0 | 128 + ... + PPC | KVM_REG_PPC_TM_VSR63 | 128 + PPC | KVM_REG_PPC_TM_CR | 64 + PPC | KVM_REG_PPC_TM_LR | 64 + PPC | KVM_REG_PPC_TM_CTR | 64 + PPC | KVM_REG_PPC_TM_FPSCR | 64 + PPC | KVM_REG_PPC_TM_AMR | 64 + PPC | KVM_REG_PPC_TM_PPR | 64 + PPC | KVM_REG_PPC_TM_VRSAVE | 64 + PPC | KVM_REG_PPC_TM_VSCR | 32 + PPC | KVM_REG_PPC_TM_DSCR | 64 + PPC | KVM_REG_PPC_TM_TAR | 64 + PPC | KVM_REG_PPC_TM_XER | 64 + | | + MIPS | KVM_REG_MIPS_R0 | 64 + ... + MIPS | KVM_REG_MIPS_R31 | 64 + MIPS | KVM_REG_MIPS_HI | 64 + MIPS | KVM_REG_MIPS_LO | 64 + MIPS | KVM_REG_MIPS_PC | 64 + MIPS | KVM_REG_MIPS_CP0_INDEX | 32 + MIPS | KVM_REG_MIPS_CP0_ENTRYLO0 | 64 + MIPS | KVM_REG_MIPS_CP0_ENTRYLO1 | 64 + MIPS | KVM_REG_MIPS_CP0_CONTEXT | 64 + MIPS | KVM_REG_MIPS_CP0_CONTEXTCONFIG| 32 + MIPS | KVM_REG_MIPS_CP0_USERLOCAL | 64 + MIPS | KVM_REG_MIPS_CP0_XCONTEXTCONFIG| 64 + MIPS | KVM_REG_MIPS_CP0_PAGEMASK | 32 + MIPS | KVM_REG_MIPS_CP0_PAGEGRAIN | 32 + MIPS | KVM_REG_MIPS_CP0_SEGCTL0 | 64 + MIPS | KVM_REG_MIPS_CP0_SEGCTL1 | 64 + MIPS | KVM_REG_MIPS_CP0_SEGCTL2 | 64 + MIPS | KVM_REG_MIPS_CP0_PWBASE | 64 + MIPS | KVM_REG_MIPS_CP0_PWFIELD | 64 + MIPS | KVM_REG_MIPS_CP0_PWSIZE | 64 + MIPS | KVM_REG_MIPS_CP0_WIRED | 32 + MIPS | KVM_REG_MIPS_CP0_PWCTL | 32 + MIPS | KVM_REG_MIPS_CP0_HWRENA | 32 + MIPS | KVM_REG_MIPS_CP0_BADVADDR | 64 + MIPS | KVM_REG_MIPS_CP0_BADINSTR | 32 + MIPS | KVM_REG_MIPS_CP0_BADINSTRP | 32 + MIPS | KVM_REG_MIPS_CP0_COUNT | 32 + MIPS | KVM_REG_MIPS_CP0_ENTRYHI | 64 + MIPS | KVM_REG_MIPS_CP0_COMPARE | 32 + MIPS | KVM_REG_MIPS_CP0_STATUS | 32 + MIPS | KVM_REG_MIPS_CP0_INTCTL | 32 + MIPS | KVM_REG_MIPS_CP0_CAUSE | 32 + MIPS | KVM_REG_MIPS_CP0_EPC | 64 + MIPS | KVM_REG_MIPS_CP0_PRID | 32 + MIPS | KVM_REG_MIPS_CP0_EBASE | 64 + MIPS | KVM_REG_MIPS_CP0_CONFIG | 32 + MIPS | 
KVM_REG_MIPS_CP0_CONFIG1 | 32 + MIPS | KVM_REG_MIPS_CP0_CONFIG2 | 32 + MIPS | KVM_REG_MIPS_CP0_CONFIG3 | 32 + MIPS | KVM_REG_MIPS_CP0_CONFIG4 | 32 + MIPS | KVM_REG_MIPS_CP0_CONFIG5 | 32 + MIPS | KVM_REG_MIPS_CP0_CONFIG7 | 32 + MIPS | KVM_REG_MIPS_CP0_XCONTEXT | 64 + MIPS | KVM_REG_MIPS_CP0_ERROREPC | 64 + MIPS | KVM_REG_MIPS_CP0_KSCRATCH1 | 64 + MIPS | KVM_REG_MIPS_CP0_KSCRATCH2 | 64 + MIPS | KVM_REG_MIPS_CP0_KSCRATCH3 | 64 + MIPS | KVM_REG_MIPS_CP0_KSCRATCH4 | 64 + MIPS | KVM_REG_MIPS_CP0_KSCRATCH5 | 64 + MIPS | KVM_REG_MIPS_CP0_KSCRATCH6 | 64 + MIPS | KVM_REG_MIPS_CP0_MAAR(0..63) | 64 + MIPS | KVM_REG_MIPS_COUNT_CTL | 64 + MIPS | KVM_REG_MIPS_COUNT_RESUME | 64 + MIPS | KVM_REG_MIPS_COUNT_HZ | 64 + MIPS | KVM_REG_MIPS_FPR_32(0..31) | 32 + MIPS | KVM_REG_MIPS_FPR_64(0..31) | 64 + MIPS | KVM_REG_MIPS_VEC_128(0..31) | 128 + MIPS | KVM_REG_MIPS_FCR_IR | 32 + MIPS | KVM_REG_MIPS_FCR_CSR | 32 + MIPS | KVM_REG_MIPS_MSA_IR | 32 + MIPS | KVM_REG_MIPS_MSA_CSR | 32 + +ARM registers are mapped using the lower 32 bits. The upper 16 of that +is the register group type, or coprocessor number: + +ARM core registers have the following id bit patterns: + 0x4020 0000 0010 + +ARM 32-bit CP15 registers have the following id bit patterns: + 0x4020 0000 000F + +ARM 64-bit CP15 registers have the following id bit patterns: + 0x4030 0000 000F + +ARM CCSIDR registers are demultiplexed by CSSELR value: + 0x4020 0000 0011 00 + +ARM 32-bit VFP control registers have the following id bit patterns: + 0x4020 0000 0012 1 + +ARM 64-bit FP registers have the following id bit patterns: + 0x4030 0000 0012 0 + +ARM firmware pseudo-registers have the following bit pattern: + 0x4030 0000 0014 + + +arm64 registers are mapped using the lower 32 bits. The upper 16 of +that is the register group type, or coprocessor number: + +arm64 core/FP-SIMD registers have the following id bit patterns. 
Note +that the size of the access is variable, as the kvm_regs structure +contains elements ranging from 32 to 128 bits. The index is a 32bit +value in the kvm_regs structure seen as a 32bit array. + 0x60x0 0000 0010 + +Specifically: + Encoding Register Bits kvm_regs member +---------------------------------------------------------------- + 0x6030 0000 0010 0000 X0 64 regs.regs[0] + 0x6030 0000 0010 0002 X1 64 regs.regs[1] + ... + 0x6030 0000 0010 003c X30 64 regs.regs[30] + 0x6030 0000 0010 003e SP 64 regs.sp + 0x6030 0000 0010 0040 PC 64 regs.pc + 0x6030 0000 0010 0042 PSTATE 64 regs.pstate + 0x6030 0000 0010 0044 SP_EL1 64 sp_el1 + 0x6030 0000 0010 0046 ELR_EL1 64 elr_el1 + 0x6030 0000 0010 0048 SPSR_EL1 64 spsr[KVM_SPSR_EL1] (alias SPSR_SVC) + 0x6030 0000 0010 004a SPSR_ABT 64 spsr[KVM_SPSR_ABT] + 0x6030 0000 0010 004c SPSR_UND 64 spsr[KVM_SPSR_UND] + 0x6030 0000 0010 004e SPSR_IRQ 64 spsr[KVM_SPSR_IRQ] + 0x6030 0000 0010 0050 SPSR_FIQ 64 spsr[KVM_SPSR_FIQ] + 0x6040 0000 0010 0054 V0 128 fp_regs.vregs[0] (*) + 0x6040 0000 0010 0058 V1 128 fp_regs.vregs[1] (*) + ... + 0x6040 0000 0010 00d0 V31 128 fp_regs.vregs[31] (*) + 0x6020 0000 0010 00d4 FPSR 32 fp_regs.fpsr + 0x6020 0000 0010 00d5 FPCR 32 fp_regs.fpcr + +(*) These encodings are not accepted for SVE-enabled vcpus. See + KVM_ARM_VCPU_INIT. + + The equivalent register content can be accessed via bits [127:0] of + the corresponding SVE Zn registers instead for vcpus that have SVE + enabled (see below).
+ +arm64 CCSIDR registers are demultiplexed by CSSELR value: + 0x6020 0000 0011 00 + +arm64 system registers have the following id bit patterns: + 0x6030 0000 0013 + +arm64 firmware pseudo-registers have the following bit pattern: + 0x6030 0000 0014 + +arm64 SVE registers have the following bit patterns: + 0x6080 0000 0015 00 Zn bits[2048*slice + 2047 : 2048*slice] + 0x6050 0000 0015 04 Pn bits[256*slice + 255 : 256*slice] + 0x6050 0000 0015 060 FFR bits[256*slice + 255 : 256*slice] + 0x6060 0000 0015 ffff KVM_REG_ARM64_SVE_VLS pseudo-register + +Access to register IDs where 2048 * slice >= 128 * max_vq will fail with +ENOENT. max_vq is the vcpu's maximum supported vector length in 128-bit +quadwords: see (**) below. + +These registers are only accessible on vcpus for which SVE is enabled. +See KVM_ARM_VCPU_INIT for details. + +In addition, except for KVM_REG_ARM64_SVE_VLS, these registers are not +accessible until the vcpu's SVE configuration has been finalized +using KVM_ARM_VCPU_FINALIZE(KVM_ARM_VCPU_SVE). See KVM_ARM_VCPU_INIT +and KVM_ARM_VCPU_FINALIZE for more information about this procedure. + +KVM_REG_ARM64_SVE_VLS is a pseudo-register that allows the set of vector +lengths supported by the vcpu to be discovered and configured by +userspace. When transferred to or from user memory via KVM_GET_ONE_REG +or KVM_SET_ONE_REG, the value of this register is of type +__u64[KVM_ARM64_SVE_VLS_WORDS], and encodes the set of vector lengths as +follows: + +__u64 vector_lengths[KVM_ARM64_SVE_VLS_WORDS]; + +if (vq >= SVE_VQ_MIN && vq <= SVE_VQ_MAX && + ((vector_lengths[(vq - KVM_ARM64_SVE_VQ_MIN) / 64] >> + ((vq - KVM_ARM64_SVE_VQ_MIN) % 64)) & 1)) + /* Vector length vq * 16 bytes supported */ +else + /* Vector length vq * 16 bytes not supported */ + +(**) The maximum value vq for which the above condition is true is +max_vq. 
This is the maximum vector length available to the guest on +this vcpu, and determines which register slices are visible through +this ioctl interface. + +(See Documentation/arm64/sve.rst for an explanation of the "vq" +nomenclature.) + +KVM_REG_ARM64_SVE_VLS is only accessible after KVM_ARM_VCPU_INIT. +KVM_ARM_VCPU_INIT initialises it to the best set of vector lengths that +the host supports. + +Userspace may subsequently modify it if desired until the vcpu's SVE +configuration is finalized using KVM_ARM_VCPU_FINALIZE(KVM_ARM_VCPU_SVE). + +Apart from simply removing all vector lengths from the host set that +exceed some value, support for arbitrarily chosen sets of vector lengths +is hardware-dependent and may not be available. Attempting to configure +an invalid set of vector lengths via KVM_SET_ONE_REG will fail with +EINVAL. + +After the vcpu's SVE configuration is finalized, further attempts to +write this register will fail with EPERM. + + +MIPS registers are mapped using the lower 32 bits. The upper 16 of that is +the register group type: + +MIPS core registers (see above) have the following id bit patterns: + 0x7030 0000 0000 + +MIPS CP0 registers (see KVM_REG_MIPS_CP0_* above) have the following id bit +patterns depending on whether they're 32-bit or 64-bit registers: + 0x7020 0000 0001 00 (32-bit) + 0x7030 0000 0001 00 (64-bit) + +Note: KVM_REG_MIPS_CP0_ENTRYLO0 and KVM_REG_MIPS_CP0_ENTRYLO1 are the MIPS64 +versions of the EntryLo registers regardless of the word size of the host +hardware, host kernel, guest, and whether XPA is present in the guest, i.e. +with the RI and XI bits (if they exist) in bits 63 and 62 respectively, and +the PFNX field starting at bit 30. 
+ +MIPS MAARs (see KVM_REG_MIPS_CP0_MAAR(*) above) have the following id bit +patterns: + 0x7030 0000 0001 01 + +MIPS KVM control registers (see above) have the following id bit patterns: + 0x7030 0000 0002 + +MIPS FPU registers (see KVM_REG_MIPS_FPR_{32,64}() above) have the following +id bit patterns depending on the size of the register being accessed. They are +always accessed according to the current guest FPU mode (Status.FR and +Config5.FRE), i.e. as the guest would see them, and they become unpredictable +if the guest FPU mode is changed. MIPS SIMD Architecture (MSA) vector +registers (see KVM_REG_MIPS_VEC_128() above) have similar patterns as they +overlap the FPU registers: + 0x7020 0000 0003 00 <0:3> (32-bit FPU registers) + 0x7030 0000 0003 00 <0:3> (64-bit FPU registers) + 0x7040 0000 0003 00 <0:3> (128-bit MSA vector registers) + +MIPS FPU control registers (see KVM_REG_MIPS_FCR_{IR,CSR} above) have the +following id bit patterns: + 0x7020 0000 0003 01 <0:3> + +MIPS MSA control registers (see KVM_REG_MIPS_MSA_{IR,CSR} above) have the +following id bit patterns: + 0x7020 0000 0003 02 <0:3> + + +4.69 KVM_GET_ONE_REG + +Capability: KVM_CAP_ONE_REG +Architectures: all +Type: vcpu ioctl +Parameters: struct kvm_one_reg (in and out) +Returns: 0 on success, negative value on failure +Errors include: +  ENOENT:   no such register +  EINVAL:   invalid register ID, or no such register +  EPERM:    (arm64) register access not allowed before vcpu finalization +(These error codes are indicative only: do not rely on a specific error +code being returned in a specific situation.) + +This ioctl allows to receive the value of a single register implemented +in a vcpu. The register to read is indicated by the "id" field of the +kvm_one_reg struct passed in. On success, the register value can be found +at the memory location pointed to by "addr". + +The list of registers accessible using this interface is identical to the +list in 4.68. 
+ + +4.70 KVM_KVMCLOCK_CTRL + +Capability: KVM_CAP_KVMCLOCK_CTRL +Architectures: Any that implement pvclocks (currently x86 only) +Type: vcpu ioctl +Parameters: None +Returns: 0 on success, -1 on error + +This signals to the host kernel that the specified guest is being paused by +userspace. The host will set a flag in the pvclock structure that is checked +from the soft lockup watchdog. The flag is part of the pvclock structure that +is shared between guest and host, specifically the second bit of the flags +field of the pvclock_vcpu_time_info structure. It will be set exclusively by +the host and read/cleared exclusively by the guest. The guest operation of +checking and clearing the flag must be an atomic operation so +load-link/store-conditional, or equivalent must be used. There are two cases +where the guest will clear the flag: when the soft lockup watchdog timer resets +itself or when a soft lockup is detected. This ioctl can be called any time +after pausing the vcpu, but before it is resumed. + + +4.71 KVM_SIGNAL_MSI + +Capability: KVM_CAP_SIGNAL_MSI +Architectures: x86 arm arm64 +Type: vm ioctl +Parameters: struct kvm_msi (in) +Returns: >0 on delivery, 0 if guest blocked the MSI, and -1 on error + +Directly inject a MSI message. Only valid with in-kernel irqchip that handles +MSI messages. + +struct kvm_msi { + __u32 address_lo; + __u32 address_hi; + __u32 data; + __u32 flags; + __u32 devid; + __u8 pad[12]; +}; + +flags: KVM_MSI_VALID_DEVID: devid contains a valid value. The per-VM + KVM_CAP_MSI_DEVID capability advertises the requirement to provide + the device ID. If this capability is not available, userspace + should never set the KVM_MSI_VALID_DEVID flag as the ioctl might fail. + +If KVM_MSI_VALID_DEVID is set, devid contains a unique device identifier +for the device that wrote the MSI message. For PCI, this is usually a +BDF identifier in the lower 16 bits.
+ +On x86, address_hi is ignored unless the KVM_X2APIC_API_USE_32BIT_IDS +feature of KVM_CAP_X2APIC_API capability is enabled. If it is enabled, +address_hi bits 31-8 provide bits 31-8 of the destination id. Bits 7-0 of +address_hi must be zero. + + +4.71 KVM_CREATE_PIT2 + +Capability: KVM_CAP_PIT2 +Architectures: x86 +Type: vm ioctl +Parameters: struct kvm_pit_config (in) +Returns: 0 on success, -1 on error + +Creates an in-kernel device model for the i8254 PIT. This call is only valid +after enabling in-kernel irqchip support via KVM_CREATE_IRQCHIP. The following +parameters have to be passed: + +struct kvm_pit_config { + __u32 flags; + __u32 pad[15]; +}; + +Valid flags are: + +#define KVM_PIT_SPEAKER_DUMMY 1 /* emulate speaker port stub */ + +PIT timer interrupts may use a per-VM kernel thread for injection. If it +exists, this thread will have a name of the following pattern: + +kvm-pit/<owner-process-pid> + +When running a guest with elevated priorities, the scheduling parameters of +this thread may have to be adjusted accordingly. + +This IOCTL replaces the obsolete KVM_CREATE_PIT. + + +4.72 KVM_GET_PIT2 + +Capability: KVM_CAP_PIT_STATE2 +Architectures: x86 +Type: vm ioctl +Parameters: struct kvm_pit_state2 (out) +Returns: 0 on success, -1 on error + +Retrieves the state of the in-kernel PIT model. Only valid after +KVM_CREATE_PIT2. The state is returned in the following structure: + +struct kvm_pit_state2 { + struct kvm_pit_channel_state channels[3]; + __u32 flags; + __u32 reserved[9]; +}; + +Valid flags are: + +/* disable PIT in HPET legacy mode */ +#define KVM_PIT_FLAGS_HPET_LEGACY 0x00000001 + +This IOCTL replaces the obsolete KVM_GET_PIT. + + +4.73 KVM_SET_PIT2 + +Capability: KVM_CAP_PIT_STATE2 +Architectures: x86 +Type: vm ioctl +Parameters: struct kvm_pit_state2 (in) +Returns: 0 on success, -1 on error + +Sets the state of the in-kernel PIT model. Only valid after KVM_CREATE_PIT2. +See KVM_GET_PIT2 for details on struct kvm_pit_state2. 
+ +This IOCTL replaces the obsolete KVM_SET_PIT. + + +4.74 KVM_PPC_GET_SMMU_INFO + +Capability: KVM_CAP_PPC_GET_SMMU_INFO +Architectures: powerpc +Type: vm ioctl +Parameters: None +Returns: 0 on success, -1 on error + +This populates and returns a structure describing the features of +the "Server" class MMU emulation supported by KVM. +This can in turn be used by userspace to generate the appropriate +device-tree properties for the guest operating system. + +The structure contains some global information, followed by an +array of supported segment page sizes: + + struct kvm_ppc_smmu_info { + __u64 flags; + __u32 slb_size; + __u32 pad; + struct kvm_ppc_one_seg_page_size sps[KVM_PPC_PAGE_SIZES_MAX_SZ]; + }; + +The supported flags are: + + - KVM_PPC_PAGE_SIZES_REAL: + When that flag is set, guest page sizes must "fit" the backing + store page sizes. When not set, any page size in the list can + be used regardless of how they are backed by userspace. + + - KVM_PPC_1T_SEGMENTS + The emulated MMU supports 1T segments in addition to the + standard 256M ones. + + - KVM_PPC_NO_HASH + This flag indicates that HPT guests are not supported by KVM, + thus all guests must use radix MMU mode. + +The "slb_size" field indicates how many SLB entries are supported. + +The "sps" array contains 8 entries indicating the supported base +page sizes for a segment in increasing order. Each entry is defined +as follows: + + struct kvm_ppc_one_seg_page_size { + __u32 page_shift; /* Base page shift of segment (or 0) */ + __u32 slb_enc; /* SLB encoding for BookS */ + struct kvm_ppc_one_page_size enc[KVM_PPC_PAGE_SIZES_MAX_SZ]; + }; + +An entry with a "page_shift" of 0 is unused. Because the array is +organized in increasing order, a lookup can stop when encountering +such an entry. + +The "slb_enc" field provides the encoding to use in the SLB for the +page size. The bits are in positions such that the value can directly +be OR'ed into the "vsid" argument of the slbmte instruction. 
+ +The "enc" array is a list which for each of those segment base page +size provides the list of supported actual page sizes (which can be +only larger or equal to the base page size), along with the +corresponding encoding in the hash PTE. Similarly, the array is +8 entries sorted by increasing sizes and an entry with a "0" shift +is an empty entry and a terminator: + + struct kvm_ppc_one_page_size { + __u32 page_shift; /* Page shift (or 0) */ + __u32 pte_enc; /* Encoding in the HPTE (>>12) */ + }; + +The "pte_enc" field provides a value that can be OR'ed into the hash +PTE's RPN field (ie, it needs to be shifted left by 12 to OR it +into the hash PTE second double word). + +4.75 KVM_IRQFD + +Capability: KVM_CAP_IRQFD +Architectures: x86 s390 arm arm64 +Type: vm ioctl +Parameters: struct kvm_irqfd (in) +Returns: 0 on success, -1 on error + +Allows setting an eventfd to directly trigger a guest interrupt. +kvm_irqfd.fd specifies the file descriptor to use as the eventfd and +kvm_irqfd.gsi specifies the irqchip pin toggled by this event. When +an event is triggered on the eventfd, an interrupt is injected into +the guest using the specified gsi pin. The irqfd is removed using +the KVM_IRQFD_FLAG_DEASSIGN flag, specifying both kvm_irqfd.fd +and kvm_irqfd.gsi. + +With KVM_CAP_IRQFD_RESAMPLE, KVM_IRQFD supports a de-assert and notify +mechanism allowing emulation of level-triggered, irqfd-based +interrupts. When KVM_IRQFD_FLAG_RESAMPLE is set the user must pass an +additional eventfd in the kvm_irqfd.resamplefd field. When operating +in resample mode, posting of an interrupt through kvm_irqfd.fd asserts +the specified gsi in the irqchip. When the irqchip is resampled, such +as from an EOI, the gsi is de-asserted and the user is notified via +kvm_irqfd.resamplefd. It is the user's responsibility to re-queue +the interrupt if the device making use of it still requires service. +Note that closing the resamplefd is not sufficient to disable the +irqfd. 
The KVM_IRQFD_FLAG_RESAMPLE is only necessary on assignment +and need not be specified with KVM_IRQFD_FLAG_DEASSIGN. + +On arm/arm64, gsi routing being supported, the following can happen: +- in case no routing entry is associated to this gsi, injection fails +- in case the gsi is associated to an irqchip routing entry, + irqchip.pin + 32 corresponds to the injected SPI ID. +- in case the gsi is associated to an MSI routing entry, the MSI + message and device ID are translated into an LPI (support restricted + to GICv3 ITS in-kernel emulation). + +4.76 KVM_PPC_ALLOCATE_HTAB + +Capability: KVM_CAP_PPC_ALLOC_HTAB +Architectures: powerpc +Type: vm ioctl +Parameters: Pointer to u32 containing hash table order (in/out) +Returns: 0 on success, -1 on error + +This requests the host kernel to allocate an MMU hash table for a +guest using the PAPR paravirtualization interface. This only does +anything if the kernel is configured to use the Book 3S HV style of +virtualization. Otherwise the capability doesn't exist and the ioctl +returns an ENOTTY error. The rest of this description assumes Book 3S +HV. + +There must be no vcpus running when this ioctl is called; if there +are, it will do nothing and return an EBUSY error. + +The parameter is a pointer to a 32-bit unsigned integer variable +containing the order (log base 2) of the desired size of the hash +table, which must be between 18 and 46. On successful return from the +ioctl, the value will not be changed by the kernel. + +If no hash table has been allocated when any vcpu is asked to run +(with the KVM_RUN ioctl), the host kernel will allocate a +default-sized hash table (16 MB). + +If this ioctl is called when a hash table has already been allocated, +with a different order from the existing hash table, the existing hash +table will be freed and a new one allocated. 
If this ioctl is +called when a hash table has already been allocated of the same order +as specified, the kernel will clear out the existing hash table (zero +all HPTEs). In either case, if the guest is using the virtualized +real-mode area (VRMA) facility, the kernel will re-create the VRMA +HPTEs on the next KVM_RUN of any vcpu. + +4.77 KVM_S390_INTERRUPT + +Capability: basic +Architectures: s390 +Type: vm ioctl, vcpu ioctl +Parameters: struct kvm_s390_interrupt (in) +Returns: 0 on success, -1 on error + +Allows to inject an interrupt to the guest. Interrupts can be floating +(vm ioctl) or per cpu (vcpu ioctl), depending on the interrupt type. + +Interrupt parameters are passed via kvm_s390_interrupt: + +struct kvm_s390_interrupt { + __u32 type; + __u32 parm; + __u64 parm64; +}; + +type can be one of the following: + +KVM_S390_SIGP_STOP (vcpu) - sigp stop; optional flags in parm +KVM_S390_PROGRAM_INT (vcpu) - program check; code in parm +KVM_S390_SIGP_SET_PREFIX (vcpu) - sigp set prefix; prefix address in parm +KVM_S390_RESTART (vcpu) - restart +KVM_S390_INT_CLOCK_COMP (vcpu) - clock comparator interrupt +KVM_S390_INT_CPU_TIMER (vcpu) - CPU timer interrupt +KVM_S390_INT_VIRTIO (vm) - virtio external interrupt; external interrupt + parameters in parm and parm64 +KVM_S390_INT_SERVICE (vm) - sclp external interrupt; sclp parameter in parm +KVM_S390_INT_EMERGENCY (vcpu) - sigp emergency; source cpu in parm +KVM_S390_INT_EXTERNAL_CALL (vcpu) - sigp external call; source cpu in parm +KVM_S390_INT_IO(ai,cssid,ssid,schid) (vm) - compound value to indicate an + I/O interrupt (ai - adapter interrupt; cssid,ssid,schid - subchannel); + I/O interruption parameters in parm (subchannel) and parm64 (intparm, + interruption subclass) +KVM_S390_MCHK (vm, vcpu) - machine check interrupt; cr 14 bits in parm, + machine check interrupt code in parm64 (note that + machine checks needing further payload are not + supported by this ioctl) + +This is an asynchronous vcpu ioctl and can 
be invoked from any thread. + +4.78 KVM_PPC_GET_HTAB_FD + +Capability: KVM_CAP_PPC_HTAB_FD +Architectures: powerpc +Type: vm ioctl +Parameters: Pointer to struct kvm_get_htab_fd (in) +Returns: file descriptor number (>= 0) on success, -1 on error + +This returns a file descriptor that can be used either to read out the +entries in the guest's hashed page table (HPT), or to write entries to +initialize the HPT. The returned fd can only be written to if the +KVM_GET_HTAB_WRITE bit is set in the flags field of the argument, and +can only be read if that bit is clear. The argument struct looks like +this: + +/* For KVM_PPC_GET_HTAB_FD */ +struct kvm_get_htab_fd { + __u64 flags; + __u64 start_index; + __u64 reserved[2]; +}; + +/* Values for kvm_get_htab_fd.flags */ +#define KVM_GET_HTAB_BOLTED_ONLY ((__u64)0x1) +#define KVM_GET_HTAB_WRITE ((__u64)0x2) + +The `start_index' field gives the index in the HPT of the entry at +which to start reading. It is ignored when writing. + +Reads on the fd will initially supply information about all +"interesting" HPT entries. Interesting entries are those with the +bolted bit set, if the KVM_GET_HTAB_BOLTED_ONLY bit is set, otherwise +all entries. When the end of the HPT is reached, the read() will +return. If read() is called again on the fd, it will start again from +the beginning of the HPT, but will only return HPT entries that have +changed since they were last read. + +Data read or written is structured as a header (8 bytes) followed by a +series of valid HPT entries (16 bytes) each. The header indicates how +many valid HPT entries there are and how many invalid entries follow +the valid entries. The invalid entries are not represented explicitly +in the stream. 
The header format is: + +struct kvm_get_htab_header { + __u32 index; + __u16 n_valid; + __u16 n_invalid; +}; + +Writes to the fd create HPT entries starting at the index given in the +header; first `n_valid' valid entries with contents from the data +written, then `n_invalid' invalid entries, invalidating any previously +valid entries found. + +4.79 KVM_CREATE_DEVICE + +Capability: KVM_CAP_DEVICE_CTRL +Type: vm ioctl +Parameters: struct kvm_create_device (in/out) +Returns: 0 on success, -1 on error +Errors: + ENODEV: The device type is unknown or unsupported + EEXIST: Device already created, and this type of device may not + be instantiated multiple times + + Other error conditions may be defined by individual device types or + have their standard meanings. + +Creates an emulated device in the kernel. The file descriptor returned +in fd can be used with KVM_SET/GET/HAS_DEVICE_ATTR. + +If the KVM_CREATE_DEVICE_TEST flag is set, only test whether the +device type is supported (not necessarily whether it can be created +in the current vm). + +Individual devices should not define flags. Attributes should be used +for specifying any behavior that is not implied by the device type +number. + +struct kvm_create_device { + __u32 type; /* in: KVM_DEV_TYPE_xxx */ + __u32 fd; /* out: device handle */ + __u32 flags; /* in: KVM_CREATE_DEVICE_xxx */ +}; + +4.80 KVM_SET_DEVICE_ATTR/KVM_GET_DEVICE_ATTR + +Capability: KVM_CAP_DEVICE_CTRL, KVM_CAP_VM_ATTRIBUTES for vm device, + KVM_CAP_VCPU_ATTRIBUTES for vcpu device +Type: device ioctl, vm ioctl, vcpu ioctl +Parameters: struct kvm_device_attr +Returns: 0 on success, -1 on error +Errors: + ENXIO: The group or attribute is unknown/unsupported for this device + or hardware support is missing. + EPERM: The attribute cannot (currently) be accessed this way + (e.g. read-only attribute, or attribute that only makes + sense when the device is in a different state) + + Other error conditions may be defined by individual device types. 
+ +Gets/sets a specified piece of device configuration and/or state. The +semantics are device-specific. See individual device documentation in +the "devices" directory. As with ONE_REG, the size of the data +transferred is defined by the particular attribute. + +struct kvm_device_attr { + __u32 flags; /* no flags currently defined */ + __u32 group; /* device-defined */ + __u64 attr; /* group-defined */ + __u64 addr; /* userspace address of attr data */ +}; + +4.81 KVM_HAS_DEVICE_ATTR + +Capability: KVM_CAP_DEVICE_CTRL, KVM_CAP_VM_ATTRIBUTES for vm device, + KVM_CAP_VCPU_ATTRIBUTES for vcpu device +Type: device ioctl, vm ioctl, vcpu ioctl +Parameters: struct kvm_device_attr +Returns: 0 on success, -1 on error +Errors: + ENXIO: The group or attribute is unknown/unsupported for this device + or hardware support is missing. + +Tests whether a device supports a particular attribute. A successful +return indicates the attribute is implemented. It does not necessarily +indicate that the attribute can be read or written in the device's +current state. "addr" is ignored. + +4.82 KVM_ARM_VCPU_INIT + +Capability: basic +Architectures: arm, arm64 +Type: vcpu ioctl +Parameters: struct kvm_vcpu_init (in) +Returns: 0 on success; -1 on error +Errors: +  EINVAL:    the target is unknown, or the combination of features is invalid. +  ENOENT:    a features bit specified is unknown. + +This tells KVM what type of CPU to present to the guest, and what +optional features it should have.  This will cause a reset of the cpu +registers to their initial values.  If this is not called, KVM_RUN will +return ENOEXEC for that vcpu. + +Note that because some registers reflect machine topology, all vcpus +should be created before this ioctl is invoked. + +Userspace can call this function multiple times for a given vcpu, including +after the vcpu has been run. This will reset the vcpu to its initial +state. 
All calls to this function after the initial call must use the same +target and same set of feature flags, otherwise EINVAL will be returned. + +Possible features: + - KVM_ARM_VCPU_POWER_OFF: Starts the CPU in a power-off state. + Depends on KVM_CAP_ARM_PSCI. If not set, the CPU will be powered on + and execute guest code when KVM_RUN is called. + - KVM_ARM_VCPU_EL1_32BIT: Starts the CPU in a 32bit mode. + Depends on KVM_CAP_ARM_EL1_32BIT (arm64 only). + - KVM_ARM_VCPU_PSCI_0_2: Emulate PSCI v0.2 (or a future revision + backward compatible with v0.2) for the CPU. + Depends on KVM_CAP_ARM_PSCI_0_2. + - KVM_ARM_VCPU_PMU_V3: Emulate PMUv3 for the CPU. + Depends on KVM_CAP_ARM_PMU_V3. + + - KVM_ARM_VCPU_PTRAUTH_ADDRESS: Enables Address Pointer authentication + for arm64 only. + Depends on KVM_CAP_ARM_PTRAUTH_ADDRESS. + If KVM_CAP_ARM_PTRAUTH_ADDRESS and KVM_CAP_ARM_PTRAUTH_GENERIC are + both present, then both KVM_ARM_VCPU_PTRAUTH_ADDRESS and + KVM_ARM_VCPU_PTRAUTH_GENERIC must be requested or neither must be + requested. + + - KVM_ARM_VCPU_PTRAUTH_GENERIC: Enables Generic Pointer authentication + for arm64 only. + Depends on KVM_CAP_ARM_PTRAUTH_GENERIC. + If KVM_CAP_ARM_PTRAUTH_ADDRESS and KVM_CAP_ARM_PTRAUTH_GENERIC are + both present, then both KVM_ARM_VCPU_PTRAUTH_ADDRESS and + KVM_ARM_VCPU_PTRAUTH_GENERIC must be requested or neither must be + requested. + + - KVM_ARM_VCPU_SVE: Enables SVE for the CPU (arm64 only). + Depends on KVM_CAP_ARM_SVE. + Requires KVM_ARM_VCPU_FINALIZE(KVM_ARM_VCPU_SVE): + + * After KVM_ARM_VCPU_INIT: + + - KVM_REG_ARM64_SVE_VLS may be read using KVM_GET_ONE_REG: the + initial value of this pseudo-register indicates the best set of + vector lengths possible for a vcpu on this host. 
+ + * Before KVM_ARM_VCPU_FINALIZE(KVM_ARM_VCPU_SVE): + + - KVM_RUN and KVM_GET_REG_LIST are not available; + + - KVM_GET_ONE_REG and KVM_SET_ONE_REG cannot be used to access + the scalable architectural SVE registers + KVM_REG_ARM64_SVE_ZREG(), KVM_REG_ARM64_SVE_PREG() or + KVM_REG_ARM64_SVE_FFR; + + - KVM_REG_ARM64_SVE_VLS may optionally be written using + KVM_SET_ONE_REG, to modify the set of vector lengths available + for the vcpu. + + * After KVM_ARM_VCPU_FINALIZE(KVM_ARM_VCPU_SVE): + + - the KVM_REG_ARM64_SVE_VLS pseudo-register is immutable, and can + no longer be written using KVM_SET_ONE_REG. + +4.83 KVM_ARM_PREFERRED_TARGET + +Capability: basic +Architectures: arm, arm64 +Type: vm ioctl +Parameters: struct kvm_vcpu_init (out) +Returns: 0 on success; -1 on error +Errors: + ENODEV: no preferred target available for the host + +This queries KVM for preferred CPU target type which can be emulated +by KVM on underlying host. + +The ioctl returns struct kvm_vcpu_init instance containing information +about preferred CPU target type and recommended features for it. The +kvm_vcpu_init->features bitmap returned will have feature bits set if +the preferred target recommends setting these features, but this is +not mandatory. + +The information returned by this ioctl can be used to prepare an instance +of struct kvm_vcpu_init for KVM_ARM_VCPU_INIT ioctl which will result in +a VCPU matching underlying host. + + +4.84 KVM_GET_REG_LIST + +Capability: basic +Architectures: arm, arm64, mips +Type: vcpu ioctl +Parameters: struct kvm_reg_list (in/out) +Returns: 0 on success; -1 on error +Errors: +  E2BIG:     the reg index list is too big to fit in the array specified by +             the user (the number required will be written into n). + +struct kvm_reg_list { + __u64 n; /* number of registers in reg[] */ + __u64 reg[0]; +}; + +This ioctl returns the guest registers that are supported for the +KVM_GET_ONE_REG/KVM_SET_ONE_REG calls. 
+ + +4.85 KVM_ARM_SET_DEVICE_ADDR (deprecated) + +Capability: KVM_CAP_ARM_SET_DEVICE_ADDR +Architectures: arm, arm64 +Type: vm ioctl +Parameters: struct kvm_arm_device_addr (in) +Returns: 0 on success, -1 on error +Errors: + ENODEV: The device id is unknown + ENXIO: Device not supported on current system + EEXIST: Address already set + E2BIG: Address outside guest physical address space + EBUSY: Address overlaps with other device range + +struct kvm_arm_device_addr { + __u64 id; + __u64 addr; +}; + +Specify a device address in the guest's physical address space where guests +can access emulated or directly exposed devices, which the host kernel needs +to know about. The id field is an architecture specific identifier for a +specific device. + +ARM/arm64 divides the id field into two parts, a device id and an +address type id specific to the individual device. + +  bits: | 63 ... 32 | 31 ... 16 | 15 ... 0 | + field: | 0x00000000 | device id | addr type id | + +ARM/arm64 currently only require this when using the in-kernel GIC +support for the hardware VGIC features, using KVM_ARM_DEVICE_VGIC_V2 +as the device id. When setting the base address for the guest's +mapping of the VGIC virtual CPU and distributor interface, the ioctl +must be called after calling KVM_CREATE_IRQCHIP, but before calling +KVM_RUN on any of the VCPUs. Calling this ioctl twice for any of the +base addresses will return -EEXIST. + +Note, this IOCTL is deprecated and the more flexible SET/GET_DEVICE_ATTR API +should be used instead. + + +4.86 KVM_PPC_RTAS_DEFINE_TOKEN + +Capability: KVM_CAP_PPC_RTAS +Architectures: ppc +Type: vm ioctl +Parameters: struct kvm_rtas_token_args +Returns: 0 on success, -1 on error + +Defines a token value for a RTAS (Run Time Abstraction Services) +service in order to allow it to be handled in the kernel. The +argument struct gives the name of the service, which must be the name +of a service that has a kernel-side implementation. 
If the token +value is non-zero, it will be associated with that service, and +subsequent RTAS calls by the guest specifying that token will be +handled by the kernel. If the token value is 0, then any token +associated with the service will be forgotten, and subsequent RTAS +calls by the guest for that service will be passed to userspace to be +handled. + +4.87 KVM_SET_GUEST_DEBUG + +Capability: KVM_CAP_SET_GUEST_DEBUG +Architectures: x86, s390, ppc, arm64 +Type: vcpu ioctl +Parameters: struct kvm_guest_debug (in) +Returns: 0 on success; -1 on error + +struct kvm_guest_debug { + __u32 control; + __u32 pad; + struct kvm_guest_debug_arch arch; +}; + +Set up the processor specific debug registers and configure vcpu for +handling guest debug events. There are two parts to the structure, the +first a control bitfield indicates the type of debug events to handle +when running. Common control bits are: + + - KVM_GUESTDBG_ENABLE: guest debugging is enabled + - KVM_GUESTDBG_SINGLESTEP: the next run should single-step + +The top 16 bits of the control field are architecture specific control +flags which can include the following: + + - KVM_GUESTDBG_USE_SW_BP: using software breakpoints [x86, arm64] + - KVM_GUESTDBG_USE_HW_BP: using hardware breakpoints [x86, s390, arm64] + - KVM_GUESTDBG_INJECT_DB: inject DB type exception [x86] + - KVM_GUESTDBG_INJECT_BP: inject BP type exception [x86] + - KVM_GUESTDBG_EXIT_PENDING: trigger an immediate guest exit [s390] + +For example KVM_GUESTDBG_USE_SW_BP indicates that software breakpoints +are enabled in memory so we need to ensure breakpoint exceptions are +correctly trapped and the KVM run loop exits at the breakpoint and not +running off into the normal guest vector. For KVM_GUESTDBG_USE_HW_BP +we need to ensure the guest vCPUs architecture specific registers are +updated to the correct (supplied) values. + +The second part of the structure is architecture specific and +typically contains a set of debug registers. 
+ +For arm64 the number of debug registers is implementation defined and +can be determined by querying the KVM_CAP_GUEST_DEBUG_HW_BPS and +KVM_CAP_GUEST_DEBUG_HW_WPS capabilities which return a positive number +indicating the number of supported registers. + +Debug events exit the main run loop with the reason +KVM_EXIT_DEBUG, with the kvm_debug_exit_arch part of the kvm_run +structure containing architecture specific debug information. + +4.88 KVM_GET_EMULATED_CPUID + +Capability: KVM_CAP_EXT_EMUL_CPUID +Architectures: x86 +Type: system ioctl +Parameters: struct kvm_cpuid2 (in/out) +Returns: 0 on success, -1 on error + +struct kvm_cpuid2 { + __u32 nent; + __u32 flags; + struct kvm_cpuid_entry2 entries[0]; +}; + +The member 'flags' is used for passing flags from userspace. + +#define KVM_CPUID_FLAG_SIGNIFCANT_INDEX BIT(0) +#define KVM_CPUID_FLAG_STATEFUL_FUNC BIT(1) +#define KVM_CPUID_FLAG_STATE_READ_NEXT BIT(2) + +struct kvm_cpuid_entry2 { + __u32 function; + __u32 index; + __u32 flags; + __u32 eax; + __u32 ebx; + __u32 ecx; + __u32 edx; + __u32 padding[3]; +}; + +This ioctl returns x86 cpuid features which are emulated by +kvm. Userspace can use the information returned by this ioctl to query +which features are emulated by kvm instead of being present natively. + +Userspace invokes KVM_GET_EMULATED_CPUID by passing a kvm_cpuid2 +structure with the 'nent' field indicating the number of entries in +the variable-size array 'entries'. If the number of entries is too low +to describe the cpu capabilities, an error (E2BIG) is returned. If the +number is too high, the 'nent' field is adjusted and an error (ENOMEM) +is returned. If the number is just right, the 'nent' field is adjusted +to the number of valid entries in the 'entries' array, which is then +filled. + +The entries returned are the set CPUID bits of the respective features +which kvm emulates, as returned by the CPUID instruction, with unknown +or unsupported feature bits cleared. 
+ +Features like x2apic, for example, may not be present in the host cpu +but are exposed by kvm in KVM_GET_SUPPORTED_CPUID because they can be +emulated efficiently and thus not included here. + +The fields in each entry are defined as follows: + + function: the eax value used to obtain the entry + index: the ecx value used to obtain the entry (for entries that are + affected by ecx) + flags: an OR of zero or more of the following: + KVM_CPUID_FLAG_SIGNIFCANT_INDEX: + if the index field is valid + KVM_CPUID_FLAG_STATEFUL_FUNC: + if cpuid for this function returns different values for successive + invocations; there will be several entries with the same function, + all with this flag set + KVM_CPUID_FLAG_STATE_READ_NEXT: + for KVM_CPUID_FLAG_STATEFUL_FUNC entries, set if this entry is + the first entry to be read by a cpu + eax, ebx, ecx, edx: the values returned by the cpuid instruction for + this function/index combination + +4.89 KVM_S390_MEM_OP + +Capability: KVM_CAP_S390_MEM_OP +Architectures: s390 +Type: vcpu ioctl +Parameters: struct kvm_s390_mem_op (in) +Returns: = 0 on success, + < 0 on generic error (e.g. -EFAULT or -ENOMEM), + > 0 if an exception occurred while walking the page tables + +Read or write data from/to the logical (virtual) memory of a VCPU. + +Parameters are specified via the following structure: + +struct kvm_s390_mem_op { + __u64 gaddr; /* the guest address */ + __u64 flags; /* flags */ + __u32 size; /* amount of bytes */ + __u32 op; /* type of operation */ + __u64 buf; /* buffer in userspace */ + __u8 ar; /* the access register number */ + __u8 reserved[31]; /* should be set to 0 */ +}; + +The type of operation is specified in the "op" field. It is either +KVM_S390_MEMOP_LOGICAL_READ for reading from logical memory space or +KVM_S390_MEMOP_LOGICAL_WRITE for writing to logical memory space. 
The +KVM_S390_MEMOP_F_CHECK_ONLY flag can be set in the "flags" field to check +whether the corresponding memory access would create an access exception +(without touching the data in the memory at the destination). In case an +access exception occurred while walking the MMU tables of the guest, the +ioctl returns a positive error number to indicate the type of exception. +This exception is also raised directly at the corresponding VCPU if the +flag KVM_S390_MEMOP_F_INJECT_EXCEPTION is set in the "flags" field. + +The start address of the memory region has to be specified in the "gaddr" +field, and the length of the region in the "size" field. "buf" is the buffer +supplied by the userspace application where the read data should be written +to for KVM_S390_MEMOP_LOGICAL_READ, or where the data that should be written +is stored for a KVM_S390_MEMOP_LOGICAL_WRITE. "buf" is unused and can be NULL +when KVM_S390_MEMOP_F_CHECK_ONLY is specified. "ar" designates the access +register number to be used. + +The "reserved" field is meant for future extensions. It is not used by +KVM with the currently defined set of flags. + +4.90 KVM_S390_GET_SKEYS + +Capability: KVM_CAP_S390_SKEYS +Architectures: s390 +Type: vm ioctl +Parameters: struct kvm_s390_skeys +Returns: 0 on success, KVM_S390_GET_SKEYS_NONE if guest is not using storage + keys, negative value on error + +This ioctl is used to get guest storage key values on the s390 +architecture. The ioctl takes parameters via the kvm_s390_skeys struct. + +struct kvm_s390_skeys { + __u64 start_gfn; + __u64 count; + __u64 skeydata_addr; + __u32 flags; + __u32 reserved[9]; +}; + +The start_gfn field is the number of the first guest frame whose storage keys +you want to get. + +The count field is the number of consecutive frames (starting from start_gfn) +whose storage keys to get. The count field must be at least 1 and the maximum +allowed value is defined as KVM_S390_SKEYS_ALLOC_MAX. 
Values outside this range +will cause the ioctl to return -EINVAL. + +The skeydata_addr field is the address to a buffer large enough to hold count +bytes. This buffer will be filled with storage key data by the ioctl. + +4.91 KVM_S390_SET_SKEYS + +Capability: KVM_CAP_S390_SKEYS +Architectures: s390 +Type: vm ioctl +Parameters: struct kvm_s390_skeys +Returns: 0 on success, negative value on error + +This ioctl is used to set guest storage key values on the s390 +architecture. The ioctl takes parameters via the kvm_s390_skeys struct. +See section on KVM_S390_GET_SKEYS for struct definition. + +The start_gfn field is the number of the first guest frame whose storage keys +you want to set. + +The count field is the number of consecutive frames (starting from start_gfn) +whose storage keys to set. The count field must be at least 1 and the maximum +allowed value is defined as KVM_S390_SKEYS_ALLOC_MAX. Values outside this range +will cause the ioctl to return -EINVAL. + +The skeydata_addr field is the address to a buffer containing count bytes of +storage keys. Each byte in the buffer will be set as the storage key for a +single frame starting at start_gfn for count frames. + +Note: If any architecturally invalid key value is found in the given data then +the ioctl will return -EINVAL. + +4.92 KVM_S390_IRQ + +Capability: KVM_CAP_S390_INJECT_IRQ +Architectures: s390 +Type: vcpu ioctl +Parameters: struct kvm_s390_irq (in) +Returns: 0 on success, -1 on error +Errors: + EINVAL: interrupt type is invalid + type is KVM_S390_SIGP_STOP and flag parameter is invalid value + type is KVM_S390_INT_EXTERNAL_CALL and code is bigger + than the maximum of VCPUs + EBUSY: type is KVM_S390_SIGP_SET_PREFIX and vcpu is not stopped + type is KVM_S390_SIGP_STOP and a stop irq is already pending + type is KVM_S390_INT_EXTERNAL_CALL and an external call interrupt + is already pending + +Allows to inject an interrupt to the guest. 
+ +Using struct kvm_s390_irq as a parameter allows +to inject additional payload which is not +possible via KVM_S390_INTERRUPT. + +Interrupt parameters are passed via kvm_s390_irq: + +struct kvm_s390_irq { + __u64 type; + union { + struct kvm_s390_io_info io; + struct kvm_s390_ext_info ext; + struct kvm_s390_pgm_info pgm; + struct kvm_s390_emerg_info emerg; + struct kvm_s390_extcall_info extcall; + struct kvm_s390_prefix_info prefix; + struct kvm_s390_stop_info stop; + struct kvm_s390_mchk_info mchk; + char reserved[64]; + } u; +}; + +type can be one of the following: + +KVM_S390_SIGP_STOP - sigp stop; parameter in .stop +KVM_S390_PROGRAM_INT - program check; parameters in .pgm +KVM_S390_SIGP_SET_PREFIX - sigp set prefix; parameters in .prefix +KVM_S390_RESTART - restart; no parameters +KVM_S390_INT_CLOCK_COMP - clock comparator interrupt; no parameters +KVM_S390_INT_CPU_TIMER - CPU timer interrupt; no parameters +KVM_S390_INT_EMERGENCY - sigp emergency; parameters in .emerg +KVM_S390_INT_EXTERNAL_CALL - sigp external call; parameters in .extcall +KVM_S390_MCHK - machine check interrupt; parameters in .mchk + +This is an asynchronous vcpu ioctl and can be invoked from any thread. + +4.94 KVM_S390_GET_IRQ_STATE + +Capability: KVM_CAP_S390_IRQ_STATE +Architectures: s390 +Type: vcpu ioctl +Parameters: struct kvm_s390_irq_state (out) +Returns: >= number of bytes copied into buffer, + -EINVAL if buffer size is 0, + -ENOBUFS if buffer size is too small to fit all pending interrupts, + -EFAULT if the buffer address was invalid + +This ioctl allows userspace to retrieve the complete state of all currently +pending interrupts in a single buffer. Use cases include migration +and introspection. 
The parameter structure contains the address of a +userspace buffer and its length: + +struct kvm_s390_irq_state { + __u64 buf; + __u32 flags; /* will stay unused for compatibility reasons */ + __u32 len; + __u32 reserved[4]; /* will stay unused for compatibility reasons */ +}; + +Userspace passes in the above struct and for each pending interrupt a +struct kvm_s390_irq is copied to the provided buffer. + +The structure contains a flags and a reserved field for future extensions. As +the kernel never checked for flags == 0 and QEMU never pre-zeroed flags and +reserved, these fields can not be used in the future without breaking +compatibility. + +If -ENOBUFS is returned the buffer provided was too small and userspace +may retry with a bigger buffer. + +4.95 KVM_S390_SET_IRQ_STATE + +Capability: KVM_CAP_S390_IRQ_STATE +Architectures: s390 +Type: vcpu ioctl +Parameters: struct kvm_s390_irq_state (in) +Returns: 0 on success, + -EFAULT if the buffer address was invalid, + -EINVAL for an invalid buffer length (see below), + -EBUSY if there were already interrupts pending, + errors occurring when actually injecting the + interrupt. See KVM_S390_IRQ. + +This ioctl allows userspace to set the complete state of all cpu-local +interrupts currently pending for the vcpu. It is intended for restoring +interrupt state after a migration. The input parameter is a userspace buffer +containing a struct kvm_s390_irq_state: + +struct kvm_s390_irq_state { + __u64 buf; + __u32 flags; /* will stay unused for compatibility reasons */ + __u32 len; + __u32 reserved[4]; /* will stay unused for compatibility reasons */ +}; + +The restrictions for flags and reserved apply as well. +(see KVM_S390_GET_IRQ_STATE) + +The userspace memory referenced by buf contains a struct kvm_s390_irq +for each interrupt to be injected into the guest. +If one of the interrupts could not be injected for some reason the +ioctl aborts. + +len must be a multiple of sizeof(struct kvm_s390_irq). 
It must be > 0 +and it must not exceed (max_vcpus + 32) * sizeof(struct kvm_s390_irq), +which is the maximum number of possibly pending cpu-local interrupts. + +4.96 KVM_SMI + +Capability: KVM_CAP_X86_SMM +Architectures: x86 +Type: vcpu ioctl +Parameters: none +Returns: 0 on success, -1 on error + +Queues an SMI on the thread's vcpu. + +4.97 KVM_CAP_PPC_MULTITCE + +Capability: KVM_CAP_PPC_MULTITCE +Architectures: ppc +Type: vm + +This capability means the kernel is capable of handling hypercalls +H_PUT_TCE_INDIRECT and H_STUFF_TCE without passing those into the user +space. This significantly accelerates DMA operations for PPC KVM guests. +User space should expect that its handlers for these hypercalls +are not going to be called if user space previously registered LIOBN +in KVM (via KVM_CREATE_SPAPR_TCE or similar calls). + +In order to enable H_PUT_TCE_INDIRECT and H_STUFF_TCE use in the guest, +user space might have to advertise it for the guest. For example, +IBM pSeries (sPAPR) guest starts using them if "hcall-multi-tce" is +present in the "ibm,hypertas-functions" device-tree property. + +The hypercalls mentioned above may or may not be processed successfully +in the kernel based fast path. If they can not be handled by the kernel, +they will get passed on to user space. So user space still has to have +an implementation for these despite the in kernel acceleration. + +This capability is always enabled. 
+ +4.98 KVM_CREATE_SPAPR_TCE_64 + +Capability: KVM_CAP_SPAPR_TCE_64 +Architectures: powerpc +Type: vm ioctl +Parameters: struct kvm_create_spapr_tce_64 (in) +Returns: file descriptor for manipulating the created TCE table + +This is an extension for KVM_CAP_SPAPR_TCE which only supports 32bit +windows, described in 4.62 KVM_CREATE_SPAPR_TCE + +This capability uses extended struct in ioctl interface: + +/* for KVM_CAP_SPAPR_TCE_64 */ +struct kvm_create_spapr_tce_64 { + __u64 liobn; + __u32 page_shift; + __u32 flags; + __u64 offset; /* in pages */ + __u64 size; /* in pages */ +}; + +The aim of extension is to support an additional bigger DMA window with +a variable page size. +KVM_CREATE_SPAPR_TCE_64 receives a 64bit window size, an IOMMU page shift and +a bus offset of the corresponding DMA window, @size and @offset are numbers +of IOMMU pages. + +@flags are not used at the moment. + +The rest of functionality is identical to KVM_CREATE_SPAPR_TCE. + +4.99 KVM_REINJECT_CONTROL + +Capability: KVM_CAP_REINJECT_CONTROL +Architectures: x86 +Type: vm ioctl +Parameters: struct kvm_reinject_control (in) +Returns: 0 on success, + -EFAULT if struct kvm_reinject_control cannot be read, + -ENXIO if KVM_CREATE_PIT or KVM_CREATE_PIT2 didn't succeed earlier. + +i8254 (PIT) has two modes, reinject and !reinject. The default is reinject, +where KVM queues elapsed i8254 ticks and monitors completion of interrupt from +vector(s) that i8254 injects. Reinject mode dequeues a tick and injects its +interrupt whenever there isn't a pending interrupt from i8254. +!reinject mode injects an interrupt as soon as a tick arrives. + +struct kvm_reinject_control { + __u8 pit_reinject; + __u8 reserved[31]; +}; + +pit_reinject = 0 (!reinject mode) is recommended, unless running an old +operating system that uses the PIT for timing (e.g. Linux 2.4.x). 
+ +4.100 KVM_PPC_CONFIGURE_V3_MMU + +Capability: KVM_CAP_PPC_RADIX_MMU or KVM_CAP_PPC_HASH_MMU_V3 +Architectures: ppc +Type: vm ioctl +Parameters: struct kvm_ppc_mmuv3_cfg (in) +Returns: 0 on success, + -EFAULT if struct kvm_ppc_mmuv3_cfg cannot be read, + -EINVAL if the configuration is invalid + +This ioctl controls whether the guest will use radix or HPT (hashed +page table) translation, and sets the pointer to the process table for +the guest. + +struct kvm_ppc_mmuv3_cfg { + __u64 flags; + __u64 process_table; +}; + +There are two bits that can be set in flags; KVM_PPC_MMUV3_RADIX and +KVM_PPC_MMUV3_GTSE. KVM_PPC_MMUV3_RADIX, if set, configures the guest +to use radix tree translation, and if clear, to use HPT translation. +KVM_PPC_MMUV3_GTSE, if set and if KVM permits it, configures the guest +to be able to use the global TLB and SLB invalidation instructions; +if clear, the guest may not use these instructions. + +The process_table field specifies the address and size of the guest +process table, which is in the guest's space. This field is formatted +as the second doubleword of the partition table entry, as defined in +the Power ISA V3.00, Book III section 5.7.6.1. + +4.101 KVM_PPC_GET_RMMU_INFO + +Capability: KVM_CAP_PPC_RADIX_MMU +Architectures: ppc +Type: vm ioctl +Parameters: struct kvm_ppc_rmmu_info (out) +Returns: 0 on success, + -EFAULT if struct kvm_ppc_rmmu_info cannot be written, + -EINVAL if no useful information can be returned + +This ioctl returns a structure containing two things: (a) a list +containing supported radix tree geometries, and (b) a list that maps +page sizes to put in the "AP" (actual page size) field for the tlbie +(TLB invalidate entry) instruction. 
+ +struct kvm_ppc_rmmu_info { + struct kvm_ppc_radix_geom { + __u8 page_shift; + __u8 level_bits[4]; + __u8 pad[3]; + } geometries[8]; + __u32 ap_encodings[8]; +}; + +The geometries[] field gives up to 8 supported geometries for the +radix page table, in terms of the log base 2 of the smallest page +size, and the number of bits indexed at each level of the tree, from +the PTE level up to the PGD level in that order. Any unused entries +will have 0 in the page_shift field. + +The ap_encodings gives the supported page sizes and their AP field +encodings, encoded with the AP value in the top 3 bits and the log +base 2 of the page size in the bottom 6 bits. + +4.102 KVM_PPC_RESIZE_HPT_PREPARE + +Capability: KVM_CAP_SPAPR_RESIZE_HPT +Architectures: powerpc +Type: vm ioctl +Parameters: struct kvm_ppc_resize_hpt (in) +Returns: 0 on successful completion, + >0 if a new HPT is being prepared, the value is an estimated + number of milliseconds until preparation is complete + -EFAULT if struct kvm_ppc_resize_hpt cannot be read, + -EINVAL if the supplied shift or flags are invalid + -ENOMEM if unable to allocate the new HPT + -ENOSPC if there was a hash collision when moving existing + HPT entries to the new HPT + -EIO on other error conditions + +Used to implement the PAPR extension for runtime resizing of a guest's +Hashed Page Table (HPT). Specifically this starts, stops or monitors +the preparation of a new potential HPT for the guest, essentially +implementing the H_RESIZE_HPT_PREPARE hypercall. + +If called with shift > 0 when there is no pending HPT for the guest, +this begins preparation of a new pending HPT of size 2^(shift) bytes. +It then returns a positive integer with the estimated number of +milliseconds until preparation is complete. + +If called when there is a pending HPT whose size does not match that +requested in the parameters, discards the existing pending HPT and +creates a new one as above.
+ +If called when there is a pending HPT of the size requested, will: + * If preparation of the pending HPT is already complete, return 0 + * If preparation of the pending HPT has failed, return an error + code, then discard the pending HPT. + * If preparation of the pending HPT is still in progress, return an + estimated number of milliseconds until preparation is complete. + +If called with shift == 0, discards any currently pending HPT and +returns 0 (i.e. cancels any in-progress preparation). + +flags is reserved for future expansion, currently setting any bits in +flags will result in an -EINVAL. + +Normally this will be called repeatedly with the same parameters until +it returns <= 0. The first call will initiate preparation, subsequent +ones will monitor preparation until it completes or fails. + +struct kvm_ppc_resize_hpt { + __u64 flags; + __u32 shift; + __u32 pad; +}; + +4.103 KVM_PPC_RESIZE_HPT_COMMIT + +Capability: KVM_CAP_SPAPR_RESIZE_HPT +Architectures: powerpc +Type: vm ioctl +Parameters: struct kvm_ppc_resize_hpt (in) +Returns: 0 on successful completion, + -EFAULT if struct kvm_ppc_resize_hpt cannot be read, + -EINVAL if the supplied shift or flags are invalid + -ENXIO if there is no pending HPT, or the pending HPT doesn't + have the requested size + -EBUSY if the pending HPT is not fully prepared + -ENOSPC if there was a hash collision when moving existing + HPT entries to the new HPT + -EIO on other error conditions + +Used to implement the PAPR extension for runtime resizing of a guest's +Hashed Page Table (HPT). Specifically this requests that the guest be +transferred to working with the new HPT, essentially implementing the +H_RESIZE_HPT_COMMIT hypercall. + +This should only be called after KVM_PPC_RESIZE_HPT_PREPARE has +returned 0 with the same parameters. In other cases +KVM_PPC_RESIZE_HPT_COMMIT will return an error (usually -ENXIO or +-EBUSY, though others may be possible if the preparation was started, +but failed).
+ +This will have undefined effects on the guest if it has not already +placed itself in a quiescent state where no vcpu will make MMU enabled +memory accesses. + +On successful completion, the pending HPT will become the guest's active +HPT and the previous HPT will be discarded. + +On failure, the guest will still be operating on its previous HPT. + +struct kvm_ppc_resize_hpt { + __u64 flags; + __u32 shift; + __u32 pad; +}; + +4.104 KVM_X86_GET_MCE_CAP_SUPPORTED + +Capability: KVM_CAP_MCE +Architectures: x86 +Type: system ioctl +Parameters: u64 mce_cap (out) +Returns: 0 on success, -1 on error + +Returns supported MCE capabilities. The u64 mce_cap parameter +has the same format as the MSR_IA32_MCG_CAP register. Supported +capabilities will have the corresponding bits set. + +4.105 KVM_X86_SETUP_MCE + +Capability: KVM_CAP_MCE +Architectures: x86 +Type: vcpu ioctl +Parameters: u64 mcg_cap (in) +Returns: 0 on success, + -EFAULT if u64 mcg_cap cannot be read, + -EINVAL if the requested number of banks is invalid, + -EINVAL if requested MCE capability is not supported. + +Initializes MCE support for use. The u64 mcg_cap parameter +has the same format as the MSR_IA32_MCG_CAP register and +specifies which capabilities should be enabled. The maximum +supported number of error-reporting banks can be retrieved when +checking for KVM_CAP_MCE. The supported capabilities can be +retrieved with KVM_X86_GET_MCE_CAP_SUPPORTED. + +4.106 KVM_X86_SET_MCE + +Capability: KVM_CAP_MCE +Architectures: x86 +Type: vcpu ioctl +Parameters: struct kvm_x86_mce (in) +Returns: 0 on success, + -EFAULT if struct kvm_x86_mce cannot be read, + -EINVAL if the bank number is invalid, + -EINVAL if VAL bit is not set in status field. + +Inject a machine check error (MCE) into the guest.
The input +parameter is: + +struct kvm_x86_mce { + __u64 status; + __u64 addr; + __u64 misc; + __u64 mcg_status; + __u8 bank; + __u8 pad1[7]; + __u64 pad2[3]; +}; + +If the MCE being reported is an uncorrected error, KVM will +inject it as an MCE exception into the guest. If the guest +MCG_STATUS register reports that an MCE is in progress, KVM +causes an KVM_EXIT_SHUTDOWN vmexit. + +Otherwise, if the MCE is a corrected error, KVM will just +store it in the corresponding bank (provided this bank is +not holding a previously reported uncorrected error). + +4.107 KVM_S390_GET_CMMA_BITS + +Capability: KVM_CAP_S390_CMMA_MIGRATION +Architectures: s390 +Type: vm ioctl +Parameters: struct kvm_s390_cmma_log (in, out) +Returns: 0 on success, a negative value on error + +This ioctl is used to get the values of the CMMA bits on the s390 +architecture. It is meant to be used in two scenarios: +- During live migration to save the CMMA values. Live migration needs + to be enabled via the KVM_REQ_START_MIGRATION VM property. +- To non-destructively peek at the CMMA values, with the flag + KVM_S390_CMMA_PEEK set. + +The ioctl takes parameters via the kvm_s390_cmma_log struct. The desired +values are written to a buffer whose location is indicated via the "values" +member in the kvm_s390_cmma_log struct. The values in the input struct are +also updated as needed. +Each CMMA value takes up one byte. + +struct kvm_s390_cmma_log { + __u64 start_gfn; + __u32 count; + __u32 flags; + union { + __u64 remaining; + __u64 mask; + }; + __u64 values; +}; + +start_gfn is the number of the first guest frame whose CMMA values are +to be retrieved, + +count is the length of the buffer in bytes, + +values points to the buffer where the result will be written to. + +If count is greater than KVM_S390_SKEYS_MAX, then it is considered to be +KVM_S390_SKEYS_MAX. KVM_S390_SKEYS_MAX is re-used for consistency with +other ioctls. 
+ +The result is written in the buffer pointed to by the field values, and +the values of the input parameter are updated as follows. + +Depending on the flags, different actions are performed. The only +supported flag so far is KVM_S390_CMMA_PEEK. + +The default behaviour if KVM_S390_CMMA_PEEK is not set is: +start_gfn will indicate the first page frame whose CMMA bits were dirty. +It is not necessarily the same as the one passed as input, as clean pages +are skipped. + +count will indicate the number of bytes actually written in the buffer. +It can (and very often will) be smaller than the input value, since the +buffer is only filled until 16 bytes of clean values are found (which +are then not copied in the buffer). Since a CMMA migration block needs +the base address and the length, for a total of 16 bytes, we will send +back some clean data if there is some dirty data afterwards, as long as +the size of the clean data does not exceed the size of the header. This +allows to minimize the amount of data to be saved or transferred over +the network at the expense of more roundtrips to userspace. The next +invocation of the ioctl will skip over all the clean values, saving +potentially more than just the 16 bytes we found. + +If KVM_S390_CMMA_PEEK is set: +the existing storage attributes are read even when not in migration +mode, and no other action is performed; + +the output start_gfn will be equal to the input start_gfn, + +the output count will be equal to the input count, except if the end of +memory has been reached. + +In both cases: +the field "remaining" will indicate the total number of dirty CMMA values +still remaining, or 0 if KVM_S390_CMMA_PEEK is set and migration mode is +not enabled. + +mask is unused. + +values points to the userspace buffer where the result will be stored. 
+ +This ioctl can fail with -ENOMEM if not enough memory can be allocated to +complete the task, with -ENXIO if CMMA is not enabled, with -EINVAL if +KVM_S390_CMMA_PEEK is not set but migration mode was not enabled, with +-EFAULT if the userspace address is invalid or if no page table is +present for the addresses (e.g. when using hugepages). + +4.108 KVM_S390_SET_CMMA_BITS + +Capability: KVM_CAP_S390_CMMA_MIGRATION +Architectures: s390 +Type: vm ioctl +Parameters: struct kvm_s390_cmma_log (in) +Returns: 0 on success, a negative value on error + +This ioctl is used to set the values of the CMMA bits on the s390 +architecture. It is meant to be used during live migration to restore +the CMMA values, but there are no restrictions on its use. +The ioctl takes parameters via the kvm_s390_cmma_log struct. +Each CMMA value takes up one byte. + +struct kvm_s390_cmma_log { + __u64 start_gfn; + __u32 count; + __u32 flags; + union { + __u64 remaining; + __u64 mask; + }; + __u64 values; +}; + +start_gfn indicates the starting guest frame number, + +count indicates how many values are to be considered in the buffer, + +flags is not used and must be 0. + +mask indicates which PGSTE bits are to be considered. + +remaining is not used. + +values points to the userspace buffer containing the values to be set. + +This ioctl can fail with -ENOMEM if not enough memory can be allocated to +complete the task, with -ENXIO if CMMA is not enabled, with -EINVAL if +the count field is too large (e.g. more than KVM_S390_CMMA_SIZE_MAX) or +if the flags field was not 0, with -EFAULT if the userspace address is +invalid, if invalid pages are written to (e.g. after the end of memory) +or if no page table is present for the addresses (e.g. when using +hugepages).
+ +4.109 KVM_PPC_GET_CPU_CHAR + +Capability: KVM_CAP_PPC_GET_CPU_CHAR +Architectures: powerpc +Type: vm ioctl +Parameters: struct kvm_ppc_cpu_char (out) +Returns: 0 on successful completion + -EFAULT if struct kvm_ppc_cpu_char cannot be written + +This ioctl gives userspace information about certain characteristics +of the CPU relating to speculative execution of instructions and +possible information leakage resulting from speculative execution (see +CVE-2017-5715, CVE-2017-5753 and CVE-2017-5754). The information is +returned in struct kvm_ppc_cpu_char, which looks like this: + +struct kvm_ppc_cpu_char { + __u64 character; /* characteristics of the CPU */ + __u64 behaviour; /* recommended software behaviour */ + __u64 character_mask; /* valid bits in character */ + __u64 behaviour_mask; /* valid bits in behaviour */ +}; + +For extensibility, the character_mask and behaviour_mask fields +indicate which bits of character and behaviour have been filled in by +the kernel. If the set of defined bits is extended in future then +userspace will be able to tell whether it is running on a kernel that +knows about the new bits. + +The character field describes attributes of the CPU which can help +with preventing inadvertent information disclosure - specifically, +whether there is an instruction to flash-invalidate the L1 data cache +(ori 30,30,0 or mtspr SPRN_TRIG2,rN), whether the L1 data cache is set +to a mode where entries can only be used by the thread that created +them, whether the bcctr[l] instruction prevents speculation, and +whether a speculation barrier instruction (ori 31,31,0) is provided. 
+ +The behaviour field describes actions that software should take to +prevent inadvertent information disclosure, and thus describes which +vulnerabilities the hardware is subject to; specifically whether the +L1 data cache should be flushed when returning to user mode from the +kernel, and whether a speculation barrier should be placed between an +array bounds check and the array access. + +These fields use the same bit definitions as the new +H_GET_CPU_CHARACTERISTICS hypercall. + +4.110 KVM_MEMORY_ENCRYPT_OP + +Capability: basic +Architectures: x86 +Type: system +Parameters: an opaque platform specific structure (in/out) +Returns: 0 on success; -1 on error + +If the platform supports creating encrypted VMs then this ioctl can be used +for issuing platform-specific memory encryption commands to manage those +encrypted VMs. + +Currently, this ioctl is used for issuing Secure Encrypted Virtualization +(SEV) commands on AMD Processors. The SEV commands are defined in +Documentation/virt/kvm/amd-memory-encryption.rst. + +4.111 KVM_MEMORY_ENCRYPT_REG_REGION + +Capability: basic +Architectures: x86 +Type: system +Parameters: struct kvm_enc_region (in) +Returns: 0 on success; -1 on error + +This ioctl can be used to register a guest memory region which may +contain encrypted data (e.g. guest RAM, SMRAM etc). + +It is used in the SEV-enabled guest. When encryption is enabled, a guest +memory region may contain encrypted data. The SEV memory encryption +engine uses a tweak such that two identical plaintext pages, each at +different locations will have differing ciphertexts. So swapping or +moving ciphertext of those pages will not result in plaintext being +swapped. So relocating (or migrating) physical backing pages for the SEV +guest will require some additional steps. + +Note: The current SEV key management spec does not provide commands to +swap or migrate (move) ciphertext pages. Hence, for now we pin the guest +memory region registered with the ioctl. 
+ +4.112 KVM_MEMORY_ENCRYPT_UNREG_REGION + +Capability: basic +Architectures: x86 +Type: system +Parameters: struct kvm_enc_region (in) +Returns: 0 on success; -1 on error + +This ioctl can be used to unregister the guest memory region registered +with KVM_MEMORY_ENCRYPT_REG_REGION ioctl above. + +4.113 KVM_HYPERV_EVENTFD + +Capability: KVM_CAP_HYPERV_EVENTFD +Architectures: x86 +Type: vm ioctl +Parameters: struct kvm_hyperv_eventfd (in) + +This ioctl (un)registers an eventfd to receive notifications from the guest on +the specified Hyper-V connection id through the SIGNAL_EVENT hypercall, without +causing a user exit. SIGNAL_EVENT hypercall with non-zero event flag number +(bits 24-31) still triggers a KVM_EXIT_HYPERV_HCALL user exit. + +struct kvm_hyperv_eventfd { + __u32 conn_id; + __s32 fd; + __u32 flags; + __u32 padding[3]; +}; + +The conn_id field should fit within 24 bits: + +#define KVM_HYPERV_CONN_ID_MASK 0x00ffffff + +The acceptable values for the flags field are: + +#define KVM_HYPERV_EVENTFD_DEASSIGN (1 << 0) + +Returns: 0 on success, + -EINVAL if conn_id or flags is outside the allowed range + -ENOENT on deassign if the conn_id isn't registered + -EEXIST on assign if the conn_id is already registered + +4.114 KVM_GET_NESTED_STATE + +Capability: KVM_CAP_NESTED_STATE +Architectures: x86 +Type: vcpu ioctl +Parameters: struct kvm_nested_state (in/out) +Returns: 0 on success, -1 on error +Errors: + E2BIG: the total state size exceeds the value of 'size' specified by + the user; the size required will be written into size. + +struct kvm_nested_state { + __u16 flags; + __u16 format; + __u32 size; + + union { + struct kvm_vmx_nested_state_hdr vmx; + struct kvm_svm_nested_state_hdr svm; + + /* Pad the header to 128 bytes. 
*/ + __u8 pad[120]; + } hdr; + + union { + struct kvm_vmx_nested_state_data vmx[0]; + struct kvm_svm_nested_state_data svm[0]; + } data; +}; + +#define KVM_STATE_NESTED_GUEST_MODE 0x00000001 +#define KVM_STATE_NESTED_RUN_PENDING 0x00000002 +#define KVM_STATE_NESTED_EVMCS 0x00000004 + +#define KVM_STATE_NESTED_FORMAT_VMX 0 +#define KVM_STATE_NESTED_FORMAT_SVM 1 + +#define KVM_STATE_NESTED_VMX_VMCS_SIZE 0x1000 + +#define KVM_STATE_NESTED_VMX_SMM_GUEST_MODE 0x00000001 +#define KVM_STATE_NESTED_VMX_SMM_VMXON 0x00000002 + +struct kvm_vmx_nested_state_hdr { + __u64 vmxon_pa; + __u64 vmcs12_pa; + + struct { + __u16 flags; + } smm; +}; + +struct kvm_vmx_nested_state_data { + __u8 vmcs12[KVM_STATE_NESTED_VMX_VMCS_SIZE]; + __u8 shadow_vmcs12[KVM_STATE_NESTED_VMX_VMCS_SIZE]; +}; + +This ioctl copies the vcpu's nested virtualization state from the kernel to +userspace. + +The maximum size of the state can be retrieved by passing KVM_CAP_NESTED_STATE +to the KVM_CHECK_EXTENSION ioctl(). + +4.115 KVM_SET_NESTED_STATE + +Capability: KVM_CAP_NESTED_STATE +Architectures: x86 +Type: vcpu ioctl +Parameters: struct kvm_nested_state (in) +Returns: 0 on success, -1 on error + +This copies the vcpu's kvm_nested_state struct from userspace to the kernel. +For the definition of struct kvm_nested_state, see KVM_GET_NESTED_STATE. + +4.116 KVM_(UN)REGISTER_COALESCED_MMIO + +Capability: KVM_CAP_COALESCED_MMIO (for coalesced mmio) + KVM_CAP_COALESCED_PIO (for coalesced pio) +Architectures: all +Type: vm ioctl +Parameters: struct kvm_coalesced_mmio_zone +Returns: 0 on success, < 0 on error + +Coalesced I/O is a performance optimization that defers hardware +register write emulation so that userspace exits are avoided. It is +typically used to reduce the overhead of emulating frequently accessed +hardware registers. 
+ +When a hardware register is configured for coalesced I/O, write accesses +do not exit to userspace and their value is recorded in a ring buffer +that is shared between kernel and userspace. + +Coalesced I/O is used if one or more write accesses to a hardware +register can be deferred until a read or a write to another hardware +register on the same device. This last access will cause a vmexit and +userspace will process accesses from the ring buffer before emulating +it. That will avoid exiting to userspace on repeated writes. + +Coalesced pio is based on coalesced mmio. There is little difference +between coalesced mmio and pio except that coalesced pio records accesses +to I/O ports. + +4.117 KVM_CLEAR_DIRTY_LOG (vm ioctl) + +Capability: KVM_CAP_MANUAL_DIRTY_LOG_PROTECT2 +Architectures: x86, arm, arm64, mips +Type: vm ioctl +Parameters: struct kvm_clear_dirty_log (in) +Returns: 0 on success, -1 on error + +/* for KVM_CLEAR_DIRTY_LOG */ +struct kvm_clear_dirty_log { + __u32 slot; + __u32 num_pages; + __u64 first_page; + union { + void __user *dirty_bitmap; /* one bit per page */ + __u64 padding; + }; +}; + +The ioctl clears the dirty status of pages in a memory slot, according to +the bitmap that is passed in struct kvm_clear_dirty_log's dirty_bitmap +field. Bit 0 of the bitmap corresponds to page "first_page" in the +memory slot, and num_pages is the size in bits of the input bitmap. +first_page must be a multiple of 64; num_pages must also be a multiple of +64 unless first_page + num_pages is the size of the memory slot. For each +bit that is set in the input bitmap, the corresponding page is marked "clean" +in KVM's dirty bitmap, and dirty tracking is re-enabled for that page +(for example via write-protection, or by clearing the dirty bit in +a page table entry). + +If KVM_CAP_MULTI_ADDRESS_SPACE is available, bits 16-31 of the slot field specify +the address space for which you want to clear the dirty status.
+They must be less than the value that KVM_CHECK_EXTENSION returns for +the KVM_CAP_MULTI_ADDRESS_SPACE capability. + +This ioctl is mostly useful when KVM_CAP_MANUAL_DIRTY_LOG_PROTECT2 +is enabled; for more information, see the description of the capability. +However, it can always be used as long as KVM_CHECK_EXTENSION confirms +that KVM_CAP_MANUAL_DIRTY_LOG_PROTECT2 is present. + +4.118 KVM_GET_SUPPORTED_HV_CPUID + +Capability: KVM_CAP_HYPERV_CPUID +Architectures: x86 +Type: vcpu ioctl +Parameters: struct kvm_cpuid2 (in/out) +Returns: 0 on success, -1 on error + +struct kvm_cpuid2 { + __u32 nent; + __u32 padding; + struct kvm_cpuid_entry2 entries[0]; +}; + +struct kvm_cpuid_entry2 { + __u32 function; + __u32 index; + __u32 flags; + __u32 eax; + __u32 ebx; + __u32 ecx; + __u32 edx; + __u32 padding[3]; +}; + +This ioctl returns x86 cpuid feature leaves related to Hyper-V emulation in +KVM. Userspace can use the information returned by this ioctl to construct +cpuid information presented to guests consuming Hyper-V enlightenments (e.g. +Windows or Hyper-V guests). + +CPUID feature leaves returned by this ioctl are defined by Hyper-V Top Level +Functional Specification (TLFS). These leaves can't be obtained with +KVM_GET_SUPPORTED_CPUID ioctl because some of them intersect with KVM feature +leaves (0x40000000, 0x40000001). + +Currently, the following list of CPUID leaves are returned: + HYPERV_CPUID_VENDOR_AND_MAX_FUNCTIONS + HYPERV_CPUID_INTERFACE + HYPERV_CPUID_VERSION + HYPERV_CPUID_FEATURES + HYPERV_CPUID_ENLIGHTMENT_INFO + HYPERV_CPUID_IMPLEMENT_LIMITS + HYPERV_CPUID_NESTED_FEATURES + +HYPERV_CPUID_NESTED_FEATURES leaf is only exposed when Enlightened VMCS was +enabled on the corresponding vCPU (KVM_CAP_HYPERV_ENLIGHTENED_VMCS). + +Userspace invokes KVM_GET_SUPPORTED_HV_CPUID by passing a kvm_cpuid2 structure +with the 'nent' field indicating the number of entries in the variable-size +array 'entries'.
If the number of entries is too low to describe all Hyper-V +feature leaves, an error (E2BIG) is returned. If the number is greater than or equal +to the number of Hyper-V feature leaves, the 'nent' field is adjusted to the +number of valid entries in the 'entries' array, which is then filled. + +'index' and 'flags' fields in 'struct kvm_cpuid_entry2' are currently reserved, +userspace should not expect to get any particular value there. + +4.119 KVM_ARM_VCPU_FINALIZE + +Architectures: arm, arm64 +Type: vcpu ioctl +Parameters: int feature (in) +Returns: 0 on success, -1 on error +Errors: + EPERM: feature not enabled, needs configuration, or already finalized + EINVAL: feature unknown or not present + +Recognised values for feature: + arm64 KVM_ARM_VCPU_SVE (requires KVM_CAP_ARM_SVE) + +Finalizes the configuration of the specified vcpu feature. + +The vcpu must already have been initialised, enabling the affected feature, by +means of a successful KVM_ARM_VCPU_INIT call with the appropriate flag set in +features[]. + +For affected vcpu features, this is a mandatory step that must be performed +before the vcpu is fully usable. + +Between KVM_ARM_VCPU_INIT and KVM_ARM_VCPU_FINALIZE, the feature may be +configured by use of ioctls such as KVM_SET_ONE_REG. The exact configuration +that should be performed and how to do it are feature-dependent. + +Other calls that depend on a particular feature being finalized, such as +KVM_RUN, KVM_GET_REG_LIST, KVM_GET_ONE_REG and KVM_SET_ONE_REG, will fail with +-EPERM unless the feature has already been finalized by means of a +KVM_ARM_VCPU_FINALIZE call. + +See KVM_ARM_VCPU_INIT for details of vcpu features that require finalization +using this ioctl.
+ +4.120 KVM_SET_PMU_EVENT_FILTER + +Capability: KVM_CAP_PMU_EVENT_FILTER +Architectures: x86 +Type: vm ioctl +Parameters: struct kvm_pmu_event_filter (in) +Returns: 0 on success, -1 on error + +struct kvm_pmu_event_filter { + __u32 action; + __u32 nevents; + __u32 fixed_counter_bitmap; + __u32 flags; + __u32 pad[4]; + __u64 events[0]; +}; + +This ioctl restricts the set of PMU events that the guest can program. +The argument holds a list of events which will be allowed or denied. +The eventsel+umask of each event the guest attempts to program is compared +against the events field to determine whether the guest should have access. +The events field only controls general purpose counters; fixed purpose +counters are controlled by the fixed_counter_bitmap. + +No flags are defined yet, the field must be zero. + +Valid values for 'action': +#define KVM_PMU_EVENT_ALLOW 0 +#define KVM_PMU_EVENT_DENY 1 + + +5. The kvm_run structure +------------------------ + +Application code obtains a pointer to the kvm_run structure by +mmap()ing a vcpu fd. From that point, application code can control +execution by changing fields in kvm_run prior to calling the KVM_RUN +ioctl, and obtain information about the reason KVM_RUN returned by +looking up structure members. + +struct kvm_run { + /* in */ + __u8 request_interrupt_window; + +Request that KVM_RUN return when it becomes possible to inject external +interrupts into the guest. Useful in conjunction with KVM_INTERRUPT. + + __u8 immediate_exit; + +This field is polled once when KVM_RUN starts; if non-zero, KVM_RUN +exits immediately, returning -EINTR. In the common scenario where a +signal is used to "kick" a VCPU out of KVM_RUN, this field can be used +to avoid usage of KVM_SET_SIGNAL_MASK, which has worse scalability. +Rather than blocking the signal outside KVM_RUN, userspace can set up +a signal handler that sets run->immediate_exit to a non-zero value. + +This field is ignored if KVM_CAP_IMMEDIATE_EXIT is not available. 
+ + __u8 padding1[6]; + + /* out */ + __u32 exit_reason; + +When KVM_RUN has returned successfully (return value 0), this informs +application code why KVM_RUN has returned. Allowable values for this +field are detailed below. + + __u8 ready_for_interrupt_injection; + +If request_interrupt_window has been specified, this field indicates +an interrupt can be injected now with KVM_INTERRUPT. + + __u8 if_flag; + +The value of the current interrupt flag. Only valid if in-kernel +local APIC is not used. + + __u16 flags; + +More architecture-specific flags detailing state of the VCPU that may +affect the device's behavior. The only currently defined flag is +KVM_RUN_X86_SMM, which is valid on x86 machines and is set if the +VCPU is in system management mode. + + /* in (pre_kvm_run), out (post_kvm_run) */ + __u64 cr8; + +The value of the cr8 register. Only valid if in-kernel local APIC is +not used. Both input and output. + + __u64 apic_base; + +The value of the APIC BASE msr. Only valid if in-kernel local +APIC is not used. Both input and output. + + union { + /* KVM_EXIT_UNKNOWN */ + struct { + __u64 hardware_exit_reason; + } hw; + +If exit_reason is KVM_EXIT_UNKNOWN, the vcpu has exited due to unknown +reasons. Further architecture-specific information is available in +hardware_exit_reason. + + /* KVM_EXIT_FAIL_ENTRY */ + struct { + __u64 hardware_entry_failure_reason; + } fail_entry; + +If exit_reason is KVM_EXIT_FAIL_ENTRY, the vcpu could not be run due +to unknown reasons. Further architecture-specific information is +available in hardware_entry_failure_reason. + + /* KVM_EXIT_EXCEPTION */ + struct { + __u32 exception; + __u32 error_code; + } ex; + +Unused. 
+ + /* KVM_EXIT_IO */ + struct { +#define KVM_EXIT_IO_IN 0 +#define KVM_EXIT_IO_OUT 1 + __u8 direction; + __u8 size; /* bytes */ + __u16 port; + __u32 count; + __u64 data_offset; /* relative to kvm_run start */ + } io; + +If exit_reason is KVM_EXIT_IO, then the vcpu has +executed a port I/O instruction which could not be satisfied by kvm. +data_offset describes where the data is located (KVM_EXIT_IO_OUT) or +where kvm expects application code to place the data for the next +KVM_RUN invocation (KVM_EXIT_IO_IN). Data format is a packed array. + + /* KVM_EXIT_DEBUG */ + struct { + struct kvm_debug_exit_arch arch; + } debug; + +If the exit_reason is KVM_EXIT_DEBUG, then a vcpu is processing a debug event +for which architecture specific information is returned. + + /* KVM_EXIT_MMIO */ + struct { + __u64 phys_addr; + __u8 data[8]; + __u32 len; + __u8 is_write; + } mmio; + +If exit_reason is KVM_EXIT_MMIO, then the vcpu has +executed a memory-mapped I/O instruction which could not be satisfied +by kvm. The 'data' member contains the written data if 'is_write' is +true, and should be filled by application code otherwise. + +The 'data' member contains, in its first 'len' bytes, the value as it would +appear if the VCPU performed a load or store of the appropriate width directly +to the byte array. + +NOTE: For KVM_EXIT_IO, KVM_EXIT_MMIO, KVM_EXIT_OSI, KVM_EXIT_PAPR and + KVM_EXIT_EPR the corresponding +operations are complete (and guest state is consistent) only after userspace +has re-entered the kernel with KVM_RUN. The kernel side will first finish +incomplete operations and then check for pending signals. Userspace +can re-enter the guest with an unmasked signal pending to complete +pending operations. + + /* KVM_EXIT_HYPERCALL */ + struct { + __u64 nr; + __u64 args[6]; + __u64 ret; + __u32 longmode; + __u32 pad; + } hypercall; + +Unused. This was once used for 'hypercall to userspace'. 
To implement +such functionality, use KVM_EXIT_IO (x86) or KVM_EXIT_MMIO (all except s390). +Note KVM_EXIT_IO is significantly faster than KVM_EXIT_MMIO. + + /* KVM_EXIT_TPR_ACCESS */ + struct { + __u64 rip; + __u32 is_write; + __u32 pad; + } tpr_access; + +To be documented (KVM_TPR_ACCESS_REPORTING). + + /* KVM_EXIT_S390_SIEIC */ + struct { + __u8 icptcode; + __u64 mask; /* psw upper half */ + __u64 addr; /* psw lower half */ + __u16 ipa; + __u32 ipb; + } s390_sieic; + +s390 specific. + + /* KVM_EXIT_S390_RESET */ +#define KVM_S390_RESET_POR 1 +#define KVM_S390_RESET_CLEAR 2 +#define KVM_S390_RESET_SUBSYSTEM 4 +#define KVM_S390_RESET_CPU_INIT 8 +#define KVM_S390_RESET_IPL 16 + __u64 s390_reset_flags; + +s390 specific. + + /* KVM_EXIT_S390_UCONTROL */ + struct { + __u64 trans_exc_code; + __u32 pgm_code; + } s390_ucontrol; + +s390 specific. A page fault has occurred for a user controlled virtual +machine (KVM_VM_S390_UCONTROL) on its host page table that cannot be +resolved by the kernel. +The program code and the translation exception code that were placed +in the cpu's lowcore are presented here as defined by the z Architecture +Principles of Operation Book in the Chapter for Dynamic Address Translation +(DAT) + + /* KVM_EXIT_DCR */ + struct { + __u32 dcrn; + __u32 data; + __u8 is_write; + } dcr; + +Deprecated - was used for 440 KVM. + + /* KVM_EXIT_OSI */ + struct { + __u64 gprs[32]; + } osi; + +MOL uses a special hypercall interface it calls 'OSI'. To enable it, we catch +hypercalls and exit with this exit struct that contains all the guest gprs. + +If exit_reason is KVM_EXIT_OSI, then the vcpu has triggered such a hypercall. +Userspace can now handle the hypercall and when it's done modify the gprs as +necessary. Upon guest entry all guest GPRs will then be replaced by the values +in this struct.
+ + /* KVM_EXIT_PAPR_HCALL */ + struct { + __u64 nr; + __u64 ret; + __u64 args[9]; + } papr_hcall; + +This is used on 64-bit PowerPC when emulating a pSeries partition, +e.g. with the 'pseries' machine type in qemu. It occurs when the +guest does a hypercall using the 'sc 1' instruction. The 'nr' field +contains the hypercall number (from the guest R3), and 'args' contains +the arguments (from the guest R4 - R12). Userspace should put the +return code in 'ret' and any extra returned values in args[]. +The possible hypercalls are defined in the Power Architecture Platform +Requirements (PAPR) document available from www.power.org (free +developer registration required to access it). + + /* KVM_EXIT_S390_TSCH */ + struct { + __u16 subchannel_id; + __u16 subchannel_nr; + __u32 io_int_parm; + __u32 io_int_word; + __u32 ipb; + __u8 dequeued; + } s390_tsch; + +s390 specific. This exit occurs when KVM_CAP_S390_CSS_SUPPORT has been enabled +and TEST SUBCHANNEL was intercepted. If dequeued is set, a pending I/O +interrupt for the target subchannel has been dequeued and subchannel_id, +subchannel_nr, io_int_parm and io_int_word contain the parameters for that +interrupt. ipb is needed for instruction parameter decoding. + + /* KVM_EXIT_EPR */ + struct { + __u32 epr; + } epr; + +On FSL BookE PowerPC chips, the interrupt controller has a fast patch +interrupt acknowledge path to the core. When the core successfully +delivers an interrupt, it automatically populates the EPR register with +the interrupt vector number and acknowledges the interrupt inside +the interrupt controller. + +In case the interrupt controller lives in user space, we need to do +the interrupt acknowledge cycle through it to fetch the next to be +delivered interrupt vector using this exit. + +It gets triggered whenever KVM_CAP_PPC_EPR is enabled and an +external interrupt has just been delivered into the guest. User space +should put the acknowledged interrupt vector into the 'epr' field.
+ + /* KVM_EXIT_SYSTEM_EVENT */ + struct { +#define KVM_SYSTEM_EVENT_SHUTDOWN 1 +#define KVM_SYSTEM_EVENT_RESET 2 +#define KVM_SYSTEM_EVENT_CRASH 3 + __u32 type; + __u64 flags; + } system_event; + +If exit_reason is KVM_EXIT_SYSTEM_EVENT then the vcpu has triggered +a system-level event using some architecture specific mechanism (hypercall +or some special instruction). In case of ARM/ARM64, this is triggered using +HVC instruction based PSCI call from the vcpu. The 'type' field describes +the system-level event type. The 'flags' field describes architecture +specific flags for the system-level event. + +Valid values for 'type' are: + KVM_SYSTEM_EVENT_SHUTDOWN -- the guest has requested a shutdown of the + VM. Userspace is not obliged to honour this, and if it does honour + this does not need to destroy the VM synchronously (ie it may call + KVM_RUN again before shutdown finally occurs). + KVM_SYSTEM_EVENT_RESET -- the guest has requested a reset of the VM. + As with SHUTDOWN, userspace can choose to ignore the request, or + to schedule the reset to occur in the future and may call KVM_RUN again. + KVM_SYSTEM_EVENT_CRASH -- the guest crash occurred and the guest + has requested a crash condition maintenance. Userspace can choose + to ignore the request, or to gather VM memory core dump and/or + reset/shutdown of the VM. + + /* KVM_EXIT_IOAPIC_EOI */ + struct { + __u8 vector; + } eoi; + +Indicates that the VCPU's in-kernel local APIC received an EOI for a +level-triggered IOAPIC interrupt. This exit only triggers when the +IOAPIC is implemented in userspace (i.e. KVM_CAP_SPLIT_IRQCHIP is enabled); +the userspace IOAPIC should process the EOI and retrigger the interrupt if +it is still asserted. Vector is the LAPIC interrupt vector for which the +EOI was received. 
+ + struct kvm_hyperv_exit { +#define KVM_EXIT_HYPERV_SYNIC 1 +#define KVM_EXIT_HYPERV_HCALL 2 + __u32 type; + union { + struct { + __u32 msr; + __u64 control; + __u64 evt_page; + __u64 msg_page; + } synic; + struct { + __u64 input; + __u64 result; + __u64 params[2]; + } hcall; + } u; + }; + /* KVM_EXIT_HYPERV */ + struct kvm_hyperv_exit hyperv; +Indicates that the VCPU exits into userspace to process some tasks +related to Hyper-V emulation. +Valid values for 'type' are: + KVM_EXIT_HYPERV_SYNIC -- synchronously notify user-space about +Hyper-V SynIC state change. Notification is used to remap SynIC +event/message pages and to enable/disable SynIC messages/events processing +in userspace. + + /* Fix the size of the union. */ + char padding[256]; + }; + + /* + * shared registers between kvm and userspace. + * kvm_valid_regs specifies the register classes set by the host + * kvm_dirty_regs specifies the register classes dirtied by userspace + * struct kvm_sync_regs is architecture specific, as well as the + * bits for kvm_valid_regs and kvm_dirty_regs + */ + __u64 kvm_valid_regs; + __u64 kvm_dirty_regs; + union { + struct kvm_sync_regs regs; + char padding[SYNC_REGS_SIZE_BYTES]; + } s; + +If KVM_CAP_SYNC_REGS is defined, these fields allow userspace to access +certain guest registers without having to call SET/GET_*REGS. Thus we can +avoid some system call overhead if userspace has to handle the exit. +Userspace can query the validity of the structure by checking +kvm_valid_regs for specific bits. These bits are architecture specific +and usually define the validity of a group of registers. (e.g. one bit + for general purpose registers) + +Please note that the kernel is allowed to use the kvm_run structure as the +primary storage for certain register types. Therefore, the kernel may use the +values in kvm_run even if the corresponding bit in kvm_dirty_regs is not set. + +}; + + + +6.
Capabilities that can be enabled on vCPUs +-------------------------------------------- + +There are certain capabilities that change the behavior of the virtual CPU or +the virtual machine when enabled. To enable them, please see section 4.37. +Below you can find a list of capabilities and what their effect on the vCPU or +the virtual machine is when enabling them. + +The following information is provided along with the description: + + Architectures: which instruction set architectures provide this ioctl. + x86 includes both i386 and x86_64. + + Target: whether this is a per-vcpu or per-vm capability. + + Parameters: what parameters are accepted by the capability. + + Returns: the return value. General error numbers (EBADF, ENOMEM, EINVAL) + are not detailed, but errors with specific meanings are. + + +6.1 KVM_CAP_PPC_OSI + +Architectures: ppc +Target: vcpu +Parameters: none +Returns: 0 on success; -1 on error + +This capability enables interception of OSI hypercalls that otherwise would +be treated as normal system calls to be injected into the guest. OSI hypercalls +were invented by Mac-on-Linux to have a standardized communication mechanism +between the guest and the host. + +When this capability is enabled, KVM_EXIT_OSI can occur. + + +6.2 KVM_CAP_PPC_PAPR + +Architectures: ppc +Target: vcpu +Parameters: none +Returns: 0 on success; -1 on error + +This capability enables interception of PAPR hypercalls. PAPR hypercalls are +done using the hypercall instruction "sc 1". + +It also sets the guest privilege level to "supervisor" mode. Usually the guest +runs in "hypervisor" privilege mode with a few missing features. + +In addition to the above, it changes the semantics of SDR1. In this mode, the +HTAB address part of SDR1 contains an HVA instead of a GPA, as PAPR keeps the +HTAB invisible to the guest. + +When this capability is enabled, KVM_EXIT_PAPR_HCALL can occur. 
+ + +6.3 KVM_CAP_SW_TLB + +Architectures: ppc +Target: vcpu +Parameters: args[0] is the address of a struct kvm_config_tlb +Returns: 0 on success; -1 on error + +struct kvm_config_tlb { + __u64 params; + __u64 array; + __u32 mmu_type; + __u32 array_len; +}; + +Configures the virtual CPU's TLB array, establishing a shared memory area +between userspace and KVM. The "params" and "array" fields are userspace +addresses of mmu-type-specific data structures. The "array_len" field is a +safety mechanism, and should be set to the size in bytes of the memory that +userspace has reserved for the array. It must be at least the size dictated +by "mmu_type" and "params". + +While KVM_RUN is active, the shared region is under control of KVM. Its +contents are undefined, and any modification by userspace results in +boundedly undefined behavior. + +On return from KVM_RUN, the shared region will reflect the current state of +the guest's TLB. If userspace makes any changes, it must call KVM_DIRTY_TLB +to tell KVM which entries have been changed, prior to calling KVM_RUN again +on this vcpu. + +For mmu types KVM_MMU_FSL_BOOKE_NOHV and KVM_MMU_FSL_BOOKE_HV: + - The "params" field is of type "struct kvm_book3e_206_tlb_params". + - The "array" field points to an array of type "struct + kvm_book3e_206_tlb_entry". + - The array consists of all entries in the first TLB, followed by all + entries in the second TLB. + - Within a TLB, entries are ordered first by increasing set number. Within a + set, entries are ordered by way (increasing ESEL). + - The hash for determining set number in TLB0 is: (MAS2 >> 12) & (num_sets - 1) + where "num_sets" is the tlb_sizes[] value divided by the tlb_ways[] value. + - The tsize field of mas1 shall be set to 4K on TLB0, even though the + hardware ignores this value for TLB0.
+ +6.4 KVM_CAP_S390_CSS_SUPPORT + +Architectures: s390 +Target: vcpu +Parameters: none +Returns: 0 on success; -1 on error + +This capability enables support for handling of channel I/O instructions. + +TEST PENDING INTERRUPTION and the interrupt portion of TEST SUBCHANNEL are +handled in-kernel, while the other I/O instructions are passed to userspace. + +When this capability is enabled, KVM_EXIT_S390_TSCH will occur on TEST +SUBCHANNEL intercepts. + +Note that even though this capability is enabled per-vcpu, the complete +virtual machine is affected. + +6.5 KVM_CAP_PPC_EPR + +Architectures: ppc +Target: vcpu +Parameters: args[0] defines whether the proxy facility is active +Returns: 0 on success; -1 on error + +This capability enables or disables the delivery of interrupts through the +external proxy facility. + +When enabled (args[0] != 0), every time the guest gets an external interrupt +delivered, it automatically exits into user space with a KVM_EXIT_EPR exit +to receive the topmost interrupt vector. + +When disabled (args[0] == 0), behavior is as if this facility is unsupported. + +When this capability is enabled, KVM_EXIT_EPR can occur. + +6.6 KVM_CAP_IRQ_MPIC + +Architectures: ppc +Parameters: args[0] is the MPIC device fd + args[1] is the MPIC CPU number for this vcpu + +This capability connects the vcpu to an in-kernel MPIC device. + +6.7 KVM_CAP_IRQ_XICS + +Architectures: ppc +Target: vcpu +Parameters: args[0] is the XICS device fd + args[1] is the XICS CPU number (server ID) for this vcpu + +This capability connects the vcpu to an in-kernel XICS device. + +6.8 KVM_CAP_S390_IRQCHIP + +Architectures: s390 +Target: vm +Parameters: none + +This capability enables the in-kernel irqchip for s390. Please refer to +"4.24 KVM_CREATE_IRQCHIP" for details. + +6.9 KVM_CAP_MIPS_FPU + +Architectures: mips +Target: vcpu +Parameters: args[0] is reserved for future use (should be 0). + +This capability allows the use of the host Floating Point Unit by the guest. 
It +allows the Config1.FP bit to be set to enable the FPU in the guest. Once this is +done the KVM_REG_MIPS_FPR_* and KVM_REG_MIPS_FCR_* registers can be accessed +(depending on the current guest FPU register mode), and the Status.FR, +Config5.FRE bits are accessible via the KVM API and also from the guest, +depending on them being supported by the FPU. + +6.10 KVM_CAP_MIPS_MSA + +Architectures: mips +Target: vcpu +Parameters: args[0] is reserved for future use (should be 0). + +This capability allows the use of the MIPS SIMD Architecture (MSA) by the guest. +It allows the Config3.MSAP bit to be set to enable the use of MSA by the guest. +Once this is done the KVM_REG_MIPS_VEC_* and KVM_REG_MIPS_MSA_* registers can be +accessed, and the Config5.MSAEn bit is accessible via the KVM API and also from +the guest. + +6.74 KVM_CAP_SYNC_REGS +Architectures: s390, x86 +Target: s390: always enabled, x86: vcpu +Parameters: none +Returns: x86: KVM_CHECK_EXTENSION returns a bit-array indicating which register +sets are supported (bitfields defined in arch/x86/include/uapi/asm/kvm.h). + +As described above in the kvm_sync_regs struct info in section 5 (kvm_run): +KVM_CAP_SYNC_REGS "allow[s] userspace to access certain guest registers +without having to call SET/GET_*REGS". This reduces overhead by eliminating +repeated ioctl calls for setting and/or getting register values. This is +particularly important when userspace is making synchronous guest state +modifications, e.g. when emulating and/or intercepting instructions in +userspace. + +For s390 specifics, please refer to the source code. + +For x86: +- the register sets to be copied out to kvm_run are selectable + by userspace (rather than all sets being copied out for every exit). +- vcpu_events are available in addition to regs and sregs.
+ +For x86, the 'kvm_valid_regs' field of struct kvm_run is overloaded to +function as an input bit-array field set by userspace to indicate the +specific register sets to be copied out on the next exit. + +To indicate when userspace has modified values that should be copied into +the vCPU, the all architecture bitarray field, 'kvm_dirty_regs' must be set. +This is done using the same bitflags as for the 'kvm_valid_regs' field. +If the dirty bit is not set, then the register set values will not be copied +into the vCPU even if they've been modified. + +Unused bitfields in the bitarrays must be set to zero. + +struct kvm_sync_regs { + struct kvm_regs regs; + struct kvm_sregs sregs; + struct kvm_vcpu_events events; +}; + +6.75 KVM_CAP_PPC_IRQ_XIVE + +Architectures: ppc +Target: vcpu +Parameters: args[0] is the XIVE device fd + args[1] is the XIVE CPU number (server ID) for this vcpu + +This capability connects the vcpu to an in-kernel XIVE device. + +7. Capabilities that can be enabled on VMs +------------------------------------------ + +There are certain capabilities that change the behavior of the virtual +machine when enabled. To enable them, please see section 4.37. Below +you can find a list of capabilities and what their effect on the VM +is when enabling them. + +The following information is provided along with the description: + + Architectures: which instruction set architectures provide this ioctl. + x86 includes both i386 and x86_64. + + Parameters: what parameters are accepted by the capability. + + Returns: the return value. General error numbers (EBADF, ENOMEM, EINVAL) + are not detailed, but errors with specific meanings are. + + +7.1 KVM_CAP_PPC_ENABLE_HCALL + +Architectures: ppc +Parameters: args[0] is the sPAPR hcall number + args[1] is 0 to disable, 1 to enable in-kernel handling + +This capability controls whether individual sPAPR hypercalls (hcalls) +get handled by the kernel or not. 
Enabling or disabling in-kernel +handling of an hcall is effective across the VM. On creation, an +initial set of hcalls are enabled for in-kernel handling, which +consists of those hcalls for which in-kernel handlers were implemented +before this capability was implemented. If disabled, the kernel will +not attempt to handle the hcall, but will always exit to userspace +to handle it. Note that it may not make sense to enable some and +disable others of a group of related hcalls, but KVM does not prevent +userspace from doing that. + +If the hcall number specified is not one that has an in-kernel +implementation, the KVM_ENABLE_CAP ioctl will fail with an EINVAL +error. + +7.2 KVM_CAP_S390_USER_SIGP + +Architectures: s390 +Parameters: none + +This capability controls which SIGP orders will be handled completely in user +space. With this capability enabled, all fast orders will be handled completely +in the kernel: +- SENSE +- SENSE RUNNING +- EXTERNAL CALL +- EMERGENCY SIGNAL +- CONDITIONAL EMERGENCY SIGNAL + +All other orders will be handled completely in user space. + +Only privileged operation exceptions will be checked for in the kernel (or even +in the hardware prior to interception). If this capability is not enabled, the +old way of handling SIGP orders is used (partially in kernel and user space). + +7.3 KVM_CAP_S390_VECTOR_REGISTERS + +Architectures: s390 +Parameters: none +Returns: 0 on success, negative value on error + +Allows use of the vector registers introduced with z13 processor, and +provides for the synchronization between host and user space. Will +return -EINVAL if the machine does not support vectors. + +7.4 KVM_CAP_S390_USER_STSI + +Architectures: s390 +Parameters: none + +This capability allows post-handlers for the STSI instruction. After +initial handling in the kernel, KVM exits to user space with +KVM_EXIT_S390_STSI to allow user space to insert further data.
+ +Before exiting to userspace, kvm handlers should fill in s390_stsi field of +vcpu->run: +struct { + __u64 addr; + __u8 ar; + __u8 reserved; + __u8 fc; + __u8 sel1; + __u16 sel2; +} s390_stsi; + +@addr - guest address of STSI SYSIB +@fc - function code +@sel1 - selector 1 +@sel2 - selector 2 +@ar - access register number + +KVM handlers should exit to userspace with rc = -EREMOTE. + +7.5 KVM_CAP_SPLIT_IRQCHIP + +Architectures: x86 +Parameters: args[0] - number of routes reserved for userspace IOAPICs +Returns: 0 on success, -1 on error + +Create a local apic for each processor in the kernel. This can be used +instead of KVM_CREATE_IRQCHIP if the userspace VMM wishes to emulate the +IOAPIC and PIC (and also the PIT, even though this has to be enabled +separately). + +This capability also enables in kernel routing of interrupt requests; +when KVM_CAP_SPLIT_IRQCHIP only routes of KVM_IRQ_ROUTING_MSI type are +used in the IRQ routing table. The first args[0] MSI routes are reserved +for the IOAPIC pins. Whenever the LAPIC receives an EOI for these routes, +a KVM_EXIT_IOAPIC_EOI vmexit will be reported to userspace. + +Fails if VCPU has already been created, or if the irqchip is already in the +kernel (i.e. KVM_CREATE_IRQCHIP has already been called). + +7.6 KVM_CAP_S390_RI + +Architectures: s390 +Parameters: none + +Allows use of runtime-instrumentation introduced with zEC12 processor. +Will return -EINVAL if the machine does not support runtime-instrumentation. +Will return -EBUSY if a VCPU has already been created. 
+ +7.7 KVM_CAP_X2APIC_API + +Architectures: x86 +Parameters: args[0] - features that should be enabled +Returns: 0 on success, -EINVAL when args[0] contains invalid features + +Valid feature flags in args[0] are + +#define KVM_X2APIC_API_USE_32BIT_IDS (1ULL << 0) +#define KVM_X2APIC_API_DISABLE_BROADCAST_QUIRK (1ULL << 1) + +Enabling KVM_X2APIC_API_USE_32BIT_IDS changes the behavior of +KVM_SET_GSI_ROUTING, KVM_SIGNAL_MSI, KVM_SET_LAPIC, and KVM_GET_LAPIC, +allowing the use of 32-bit APIC IDs. See KVM_CAP_X2APIC_API in their +respective sections. + +KVM_X2APIC_API_DISABLE_BROADCAST_QUIRK must be enabled for x2APIC to work +in logical mode or with more than 255 VCPUs. Otherwise, KVM treats 0xff +as a broadcast even in x2APIC mode in order to support physical x2APIC +without interrupt remapping. This is undesirable in logical mode, +where 0xff represents CPUs 0-7 in cluster 0. + +7.8 KVM_CAP_S390_USER_INSTR0 + +Architectures: s390 +Parameters: none + +With this capability enabled, all illegal instructions 0x0000 (2 bytes) will +be intercepted and forwarded to user space. User space can use this +mechanism e.g. to realize 2-byte software breakpoints. The kernel will +not inject an operating exception for these instructions, user space has +to take care of that. + +This capability can be enabled dynamically even if VCPUs were already +created and are running. + +7.9 KVM_CAP_S390_GS + +Architectures: s390 +Parameters: none +Returns: 0 on success; -EINVAL if the machine does not support + guarded storage; -EBUSY if a VCPU has already been created. + +Allows use of guarded storage for the KVM guest. + +7.10 KVM_CAP_S390_AIS + +Architectures: s390 +Parameters: none + +Allow use of adapter-interruption suppression. +Returns: 0 on success; -EBUSY if a VCPU has already been created. + +7.11 KVM_CAP_PPC_SMT + +Architectures: ppc +Parameters: vsmt_mode, flags + +Enabling this capability on a VM provides userspace with a way to set +the desired virtual SMT mode (i.e. 
the number of virtual CPUs per +virtual core). The virtual SMT mode, vsmt_mode, must be a power of 2 +between 1 and 8. On POWER8, vsmt_mode must also be no greater than +the number of threads per subcore for the host. Currently flags must +be 0. A successful call to enable this capability will result in +vsmt_mode being returned when the KVM_CAP_PPC_SMT capability is +subsequently queried for the VM. This capability is only supported by +HV KVM, and can only be set before any VCPUs have been created. +The KVM_CAP_PPC_SMT_POSSIBLE capability indicates which virtual SMT +modes are available. + +7.12 KVM_CAP_PPC_FWNMI + +Architectures: ppc +Parameters: none + +With this capability a machine check exception in the guest address +space will cause KVM to exit the guest with NMI exit reason. This +enables QEMU to build error log and branch to guest kernel registered +machine check handling routine. Without this capability KVM will +branch to guests' 0x200 interrupt vector. + +7.13 KVM_CAP_X86_DISABLE_EXITS + +Architectures: x86 +Parameters: args[0] defines which exits are disabled +Returns: 0 on success, -EINVAL when args[0] contains invalid exits + +Valid bits in args[0] are + +#define KVM_X86_DISABLE_EXITS_MWAIT (1 << 0) +#define KVM_X86_DISABLE_EXITS_HLT (1 << 1) +#define KVM_X86_DISABLE_EXITS_PAUSE (1 << 2) +#define KVM_X86_DISABLE_EXITS_CSTATE (1 << 3) + +Enabling this capability on a VM provides userspace with a way to no +longer intercept some instructions for improved latency in some +workloads, and is suggested when vCPUs are associated to dedicated +physical CPUs. More bits can be added in the future; userspace can +just pass the KVM_CHECK_EXTENSION result to KVM_ENABLE_CAP to disable +all such vmexits. + +Do not enable KVM_FEATURE_PV_UNHALT if you disable HLT exits. 
+ +7.14 KVM_CAP_S390_HPAGE_1M + +Architectures: s390 +Parameters: none +Returns: 0 on success, -EINVAL if hpage module parameter was not set + or cmma is enabled, or the VM has the KVM_VM_S390_UCONTROL + flag set + +With this capability the KVM support for memory backing with 1m pages +through hugetlbfs can be enabled for a VM. After the capability is +enabled, cmma can't be enabled anymore and pfmfi and the storage key +interpretation are disabled. If cmma has already been enabled or the +hpage module parameter is not set to 1, -EINVAL is returned. + +While it is generally possible to create a huge page backed VM without +this capability, the VM will not be able to run. + +7.15 KVM_CAP_MSR_PLATFORM_INFO + +Architectures: x86 +Parameters: args[0] whether feature should be enabled or not + +With this capability, a guest may read the MSR_PLATFORM_INFO MSR. Otherwise, +a #GP would be raised when the guest tries to access. Currently, this +capability does not enable write permissions of this MSR for the guest. + +7.16 KVM_CAP_PPC_NESTED_HV + +Architectures: ppc +Parameters: none +Returns: 0 on success, -EINVAL when the implementation doesn't support + nested-HV virtualization. + +HV-KVM on POWER9 and later systems allows for "nested-HV" +virtualization, which provides a way for a guest VM to run guests that +can run using the CPU's supervisor mode (privileged non-hypervisor +state). Enabling this capability on a VM depends on the CPU having +the necessary functionality and on the facility being enabled with a +kvm-hv module parameter. + +7.17 KVM_CAP_EXCEPTION_PAYLOAD + +Architectures: x86 +Parameters: args[0] whether feature should be enabled or not + +With this capability enabled, CR2 will not be modified prior to the +emulated VM-exit when L1 intercepts a #PF exception that occurs in +L2. Similarly, for kvm-intel only, DR6 will not be modified prior to +the emulated VM-exit when L1 intercepts a #DB exception that occurs in +L2. 
As a result, when KVM_GET_VCPU_EVENTS reports a pending #PF (or +#DB) exception for L2, exception.has_payload will be set and the +faulting address (or the new DR6 bits*) will be reported in the +exception_payload field. Similarly, when userspace injects a #PF (or +#DB) into L2 using KVM_SET_VCPU_EVENTS, it is expected to set +exception.has_payload and to put the faulting address (or the new DR6 +bits*) in the exception_payload field. + +This capability also enables exception.pending in struct +kvm_vcpu_events, which allows userspace to distinguish between pending +and injected exceptions. + + +* For the new DR6 bits, note that bit 16 is set iff the #DB exception + will clear DR6.RTM. + +7.18 KVM_CAP_MANUAL_DIRTY_LOG_PROTECT2 + +Architectures: x86, arm, arm64, mips +Parameters: args[0] whether feature should be enabled or not + +With this capability enabled, KVM_GET_DIRTY_LOG will not automatically +clear and write-protect all pages that are returned as dirty. +Rather, userspace will have to do this operation separately using +KVM_CLEAR_DIRTY_LOG. + +At the cost of a slightly more complicated operation, this provides better +scalability and responsiveness for two reasons. First, +KVM_CLEAR_DIRTY_LOG ioctl can operate on a 64-page granularity rather +than requiring to sync a full memslot; this ensures that KVM does not +take spinlocks for an extended period of time. Second, in some cases a +large amount of time can pass between a call to KVM_GET_DIRTY_LOG and +userspace actually using the data in the page. Pages can be modified +during this time, which is inefficient for both the guest and userspace: +the guest will incur a higher penalty due to write protection faults, +while userspace can see false reports of dirty pages. Manual reprotection +helps reduce this time, improving guest performance and reducing the +number of dirty log false positives.
+ +KVM_CAP_MANUAL_DIRTY_LOG_PROTECT2 was previously available under the name +KVM_CAP_MANUAL_DIRTY_LOG_PROTECT, but the implementation had bugs that make +it hard or impossible to use it correctly. The availability of +KVM_CAP_MANUAL_DIRTY_LOG_PROTECT2 signals that those bugs are fixed. +Userspace should not try to use KVM_CAP_MANUAL_DIRTY_LOG_PROTECT. + +8. Other capabilities. +---------------------- + +This section lists capabilities that give information about other +features of the KVM implementation. + +8.1 KVM_CAP_PPC_HWRNG + +Architectures: ppc + +This capability, if KVM_CHECK_EXTENSION indicates that it is +available, means that the kernel has an implementation of the +H_RANDOM hypercall backed by a hardware random-number generator. +If present, the kernel H_RANDOM handler can be enabled for guest use +with the KVM_CAP_PPC_ENABLE_HCALL capability. + +8.2 KVM_CAP_HYPERV_SYNIC + +Architectures: x86 +This capability, if KVM_CHECK_EXTENSION indicates that it is +available, means that the kernel has an implementation of the +Hyper-V Synthetic interrupt controller (SynIC). Hyper-V SynIC is +used to support Windows Hyper-V based guest paravirt drivers (VMBus). + +In order to use SynIC, it has to be activated by setting this +capability via KVM_ENABLE_CAP ioctl on the vcpu fd. Note that this +will disable the use of APIC hardware virtualization even if supported +by the CPU, as it's incompatible with SynIC auto-EOI behavior. + +8.3 KVM_CAP_PPC_RADIX_MMU + +Architectures: ppc + +This capability, if KVM_CHECK_EXTENSION indicates that it is +available, means that the kernel can support guests using the +radix MMU defined in Power ISA V3.00 (as implemented in the POWER9 +processor).
+ +8.4 KVM_CAP_PPC_HASH_MMU_V3 + +Architectures: ppc + +This capability, if KVM_CHECK_EXTENSION indicates that it is +available, means that that the kernel can support guests using the +hashed page table MMU defined in Power ISA V3.00 (as implemented in +the POWER9 processor), including in-memory segment tables. + +8.5 KVM_CAP_MIPS_VZ + +Architectures: mips + +This capability, if KVM_CHECK_EXTENSION on the main kvm handle indicates that +it is available, means that full hardware assisted virtualization capabilities +of the hardware are available for use through KVM. An appropriate +KVM_VM_MIPS_* type must be passed to KVM_CREATE_VM to create a VM which +utilises it. + +If KVM_CHECK_EXTENSION on a kvm VM handle indicates that this capability is +available, it means that the VM is using full hardware assisted virtualization +capabilities of the hardware. This is useful to check after creating a VM with +KVM_VM_MIPS_DEFAULT. + +The value returned by KVM_CHECK_EXTENSION should be compared against known +values (see below). All other values are reserved. This is to allow for the +possibility of other hardware assisted virtualization implementations which +may be incompatible with the MIPS VZ ASE. + + 0: The trap & emulate implementation is in use to run guest code in user + mode. Guest virtual memory segments are rearranged to fit the guest in the + user mode address space. + + 1: The MIPS VZ ASE is in use, providing full hardware assisted + virtualization, including standard guest virtual memory segments. + +8.6 KVM_CAP_MIPS_TE + +Architectures: mips + +This capability, if KVM_CHECK_EXTENSION on the main kvm handle indicates that +it is available, means that the trap & emulate implementation is available to +run guest code in user mode, even if KVM_CAP_MIPS_VZ indicates that hardware +assisted virtualisation is also available. KVM_VM_MIPS_TE (0) must be passed +to KVM_CREATE_VM to create a VM which utilises it. 
+ +If KVM_CHECK_EXTENSION on a kvm VM handle indicates that this capability is +available, it means that the VM is using trap & emulate. + +8.7 KVM_CAP_MIPS_64BIT + +Architectures: mips + +This capability indicates the supported architecture type of the guest, i.e. the +supported register and address width. + +The values returned when this capability is checked by KVM_CHECK_EXTENSION on a +kvm VM handle correspond roughly to the CP0_Config.AT register field, and should +be checked specifically against known values (see below). All other values are +reserved. + + 0: MIPS32 or microMIPS32. + Both registers and addresses are 32-bits wide. + It will only be possible to run 32-bit guest code. + + 1: MIPS64 or microMIPS64 with access only to 32-bit compatibility segments. + Registers are 64-bits wide, but addresses are 32-bits wide. + 64-bit guest code may run but cannot access MIPS64 memory segments. + It will also be possible to run 32-bit guest code. + + 2: MIPS64 or microMIPS64 with access to all address segments. + Both registers and addresses are 64-bits wide. + It will be possible to run 64-bit or 32-bit guest code. + +8.9 KVM_CAP_ARM_USER_IRQ + +Architectures: arm, arm64 +This capability, if KVM_CHECK_EXTENSION indicates that it is available, means +that if userspace creates a VM without an in-kernel interrupt controller, it +will be notified of changes to the output level of in-kernel emulated devices, +which can generate virtual interrupts, presented to the VM. +For such VMs, on every return to userspace, the kernel +updates the vcpu's run->s.regs.device_irq_level field to represent the actual +output level of the device. + +Whenever kvm detects a change in the device output level, kvm guarantees at +least one return to userspace before running the VM. This exit could either +be a KVM_EXIT_INTR or any other exit event, like KVM_EXIT_MMIO. 
This way, +userspace can always sample the device output level and re-compute the state of +the userspace interrupt controller. Userspace should always check the state +of run->s.regs.device_irq_level on every kvm exit. +The value in run->s.regs.device_irq_level can represent both level and edge +triggered interrupt signals, depending on the device. Edge triggered interrupt +signals will exit to userspace with the bit in run->s.regs.device_irq_level +set exactly once per edge signal. + +The field run->s.regs.device_irq_level is available independent of +run->kvm_valid_regs or run->kvm_dirty_regs bits. + +If KVM_CAP_ARM_USER_IRQ is supported, the KVM_CHECK_EXTENSION ioctl returns a +number larger than 0 indicating the version of this capability is implemented +and thereby which bits in in run->s.regs.device_irq_level can signal values. + +Currently the following bits are defined for the device_irq_level bitmap: + + KVM_CAP_ARM_USER_IRQ >= 1: + + KVM_ARM_DEV_EL1_VTIMER - EL1 virtual timer + KVM_ARM_DEV_EL1_PTIMER - EL1 physical timer + KVM_ARM_DEV_PMU - ARM PMU overflow interrupt signal + +Future versions of kvm may implement additional events. These will get +indicated by returning a higher number from KVM_CHECK_EXTENSION and will be +listed above. + +8.10 KVM_CAP_PPC_SMT_POSSIBLE + +Architectures: ppc + +Querying this capability returns a bitmap indicating the possible +virtual SMT modes that can be set using KVM_CAP_PPC_SMT. If bit N +(counting from the right) is set, then a virtual SMT mode of 2^N is +available. + +8.11 KVM_CAP_HYPERV_SYNIC2 + +Architectures: x86 + +This capability enables a newer version of Hyper-V Synthetic interrupt +controller (SynIC). The only difference with KVM_CAP_HYPERV_SYNIC is that KVM +doesn't clear SynIC message and event flags pages when they are enabled by +writing to the respective MSRs. + +8.12 KVM_CAP_HYPERV_VP_INDEX + +Architectures: x86 + +This capability indicates that userspace can load HV_X64_MSR_VP_INDEX msr. 
Its +value is used to denote the target vcpu for a SynIC interrupt. For +compatibilty, KVM initializes this msr to KVM's internal vcpu index. When this +capability is absent, userspace can still query this msr's value. + +8.13 KVM_CAP_S390_AIS_MIGRATION + +Architectures: s390 +Parameters: none + +This capability indicates if the flic device will be able to get/set the +AIS states for migration via the KVM_DEV_FLIC_AISM_ALL attribute and allows +to discover this without having to create a flic device. + +8.14 KVM_CAP_S390_PSW + +Architectures: s390 + +This capability indicates that the PSW is exposed via the kvm_run structure. + +8.15 KVM_CAP_S390_GMAP + +Architectures: s390 + +This capability indicates that the user space memory used as guest mapping can +be anywhere in the user memory address space, as long as the memory slots are +aligned and sized to a segment (1MB) boundary. + +8.16 KVM_CAP_S390_COW + +Architectures: s390 + +This capability indicates that the user space memory used as guest mapping can +use copy-on-write semantics as well as dirty pages tracking via read-only page +tables. + +8.17 KVM_CAP_S390_BPB + +Architectures: s390 + +This capability indicates that kvm will implement the interfaces to handle +reset, migration and nested KVM for branch prediction blocking. The stfle +facility 82 should not be provided to the guest without this capability. + +8.18 KVM_CAP_HYPERV_TLBFLUSH + +Architectures: x86 + +This capability indicates that KVM supports paravirtualized Hyper-V TLB Flush +hypercalls: +HvFlushVirtualAddressSpace, HvFlushVirtualAddressSpaceEx, +HvFlushVirtualAddressList, HvFlushVirtualAddressListEx. + +8.19 KVM_CAP_ARM_INJECT_SERROR_ESR + +Architectures: arm, arm64 + +This capability indicates that userspace can specify (via the +KVM_SET_VCPU_EVENTS ioctl) the syndrome value reported to the guest when it +takes a virtual SError interrupt exception. 
+If KVM advertises this capability, userspace can only specify the ISS field for +the ESR syndrome. Other parts of the ESR, such as the EC are generated by the +CPU when the exception is taken. If this virtual SError is taken to EL1 using +AArch64, this value will be reported in the ISS field of ESR_ELx. + +See KVM_CAP_VCPU_EVENTS for more details. +8.20 KVM_CAP_HYPERV_SEND_IPI + +Architectures: x86 + +This capability indicates that KVM supports paravirtualized Hyper-V IPI send +hypercalls: +HvCallSendSyntheticClusterIpi, HvCallSendSyntheticClusterIpiEx. diff --git a/Documentation/virt/kvm/arm/hyp-abi.txt b/Documentation/virt/kvm/arm/hyp-abi.txt new file mode 100644 index 000000000000..a20a0bee268d --- /dev/null +++ b/Documentation/virt/kvm/arm/hyp-abi.txt @@ -0,0 +1,53 @@ +* Internal ABI between the kernel and HYP + +This file documents the interaction between the Linux kernel and the +hypervisor layer when running Linux as a hypervisor (for example +KVM). It doesn't cover the interaction of the kernel with the +hypervisor when running as a guest (under Xen, KVM or any other +hypervisor), or any hypervisor-specific interaction when the kernel is +used as a host. + +On arm and arm64 (without VHE), the kernel doesn't run in hypervisor +mode, but still needs to interact with it, allowing a built-in +hypervisor to be either installed or torn down. + +In order to achieve this, the kernel must be booted at HYP (arm) or +EL2 (arm64), allowing it to install a set of stubs before dropping to +SVC/EL1. These stubs are accessible by using a 'hvc #0' instruction, +and only act on individual CPUs. + +Unless specified otherwise, any built-in hypervisor must implement +these functions (see arch/arm{,64}/include/asm/virt.h): + +* r0/x0 = HVC_SET_VECTORS + r1/x1 = vectors + + Set HVBAR/VBAR_EL2 to 'vectors' to enable a hypervisor. 'vectors' + must be a physical address, and respect the alignment requirements + of the architecture. 
Only implemented by the initial stubs, not by + Linux hypervisors. + +* r0/x0 = HVC_RESET_VECTORS + + Turn HYP/EL2 MMU off, and reset HVBAR/VBAR_EL2 to the initials + stubs' exception vector value. This effectively disables an existing + hypervisor. + +* r0/x0 = HVC_SOFT_RESTART + r1/x1 = restart address + x2 = x0's value when entering the next payload (arm64) + x3 = x1's value when entering the next payload (arm64) + x4 = x2's value when entering the next payload (arm64) + + Mask all exceptions, disable the MMU, move the arguments into place + (arm64 only), and jump to the restart address while at HYP/EL2. This + hypercall is not expected to return to its caller. + +Any other value of r0/x0 triggers a hypervisor-specific handling, +which is not documented here. + +The return value of a stub hypercall is held by r0/x0, and is 0 on +success, and HVC_STUB_ERR on error. A stub hypercall is allowed to +clobber any of the caller-saved registers (x0-x18 on arm64, r0-r3 and +ip on arm). It is thus recommended to use a function call to perform +the hypercall. diff --git a/Documentation/virt/kvm/arm/psci.txt b/Documentation/virt/kvm/arm/psci.txt new file mode 100644 index 000000000000..559586fc9d37 --- /dev/null +++ b/Documentation/virt/kvm/arm/psci.txt @@ -0,0 +1,61 @@ +KVM implements the PSCI (Power State Coordination Interface) +specification in order to provide services such as CPU on/off, reset +and power-off to the guest. + +The PSCI specification is regularly updated to provide new features, +and KVM implements these updates if they make sense from a virtualization +point of view. + +This means that a guest booted on two different versions of KVM can +observe two different "firmware" revisions. This could cause issues if +a given guest is tied to a particular PSCI revision (unlikely), or if +a migration causes a different PSCI version to be exposed out of the +blue to an unsuspecting guest. 
+ +In order to remedy this situation, KVM exposes a set of "firmware +pseudo-registers" that can be manipulated using the GET/SET_ONE_REG +interface. These registers can be saved/restored by userspace, and set +to a convenient value if required. + +The following register is defined: + +* KVM_REG_ARM_PSCI_VERSION: + + - Only valid if the vcpu has the KVM_ARM_VCPU_PSCI_0_2 feature set + (and thus has already been initialized) + - Returns the current PSCI version on GET_ONE_REG (defaulting to the + highest PSCI version implemented by KVM and compatible with v0.2) + - Allows any PSCI version implemented by KVM and compatible with + v0.2 to be set with SET_ONE_REG + - Affects the whole VM (even if the register view is per-vcpu) + +* KVM_REG_ARM_SMCCC_ARCH_WORKAROUND_1: + Holds the state of the firmware support to mitigate CVE-2017-5715, as + offered by KVM to the guest via a HVC call. The workaround is described + under SMCCC_ARCH_WORKAROUND_1 in [1]. + Accepted values are: + KVM_REG_ARM_SMCCC_ARCH_WORKAROUND_1_NOT_AVAIL: KVM does not offer + firmware support for the workaround. The mitigation status for the + guest is unknown. + KVM_REG_ARM_SMCCC_ARCH_WORKAROUND_1_AVAIL: The workaround HVC call is + available to the guest and required for the mitigation. + KVM_REG_ARM_SMCCC_ARCH_WORKAROUND_1_NOT_REQUIRED: The workaround HVC call + is available to the guest, but it is not needed on this VCPU. + +* KVM_REG_ARM_SMCCC_ARCH_WORKAROUND_2: + Holds the state of the firmware support to mitigate CVE-2018-3639, as + offered by KVM to the guest via a HVC call. The workaround is described + under SMCCC_ARCH_WORKAROUND_2 in [1]. + Accepted values are: + KVM_REG_ARM_SMCCC_ARCH_WORKAROUND_2_NOT_AVAIL: A workaround is not + available. KVM does not offer firmware support for the workaround. + KVM_REG_ARM_SMCCC_ARCH_WORKAROUND_2_UNKNOWN: The workaround state is + unknown. KVM does not offer firmware support for the workaround. 
+ KVM_REG_ARM_SMCCC_ARCH_WORKAROUND_2_AVAIL: The workaround is available, + and can be disabled by a vCPU. If + KVM_REG_ARM_SMCCC_ARCH_WORKAROUND_2_ENABLED is set, it is active for + this vCPU. + KVM_REG_ARM_SMCCC_ARCH_WORKAROUND_2_NOT_REQUIRED: The workaround is + always active on this vCPU or it is not needed. + +[1] https://developer.arm.com/-/media/developer/pdf/ARM_DEN_0070A_Firmware_interfaces_for_mitigating_CVE-2017-5715.pdf diff --git a/Documentation/virt/kvm/cpuid.rst b/Documentation/virt/kvm/cpuid.rst new file mode 100644 index 000000000000..01b081f6e7ea --- /dev/null +++ b/Documentation/virt/kvm/cpuid.rst @@ -0,0 +1,107 @@ +.. SPDX-License-Identifier: GPL-2.0 + +============== +KVM CPUID bits +============== + +:Author: Glauber Costa + +A guest running on a kvm host, can check some of its features using +cpuid. This is not always guaranteed to work, since userspace can +mask-out some, or even all KVM-related cpuid features before launching +a guest. + +KVM cpuid functions are: + +function: KVM_CPUID_SIGNATURE (0x40000000) + +returns:: + + eax = 0x40000001 + ebx = 0x4b4d564b + ecx = 0x564b4d56 + edx = 0x4d + +Note that this value in ebx, ecx and edx corresponds to the string "KVMKVMKVM". +The value in eax corresponds to the maximum cpuid function present in this leaf, +and will be updated if more functions are added in the future. +Note also that old hosts set eax value to 0x0. This should +be interpreted as if the value was 0x40000001. +This function queries the presence of KVM cpuid leafs. 
+ +function: define KVM_CPUID_FEATURES (0x40000001) + +returns:: + + ebx, ecx + eax = an OR'ed group of (1 << flag) + +where ``flag`` is defined as below: + +================================= =========== ================================ +flag value meaning +================================= =========== ================================ +KVM_FEATURE_CLOCKSOURCE 0 kvmclock available at msrs + 0x11 and 0x12 + +KVM_FEATURE_NOP_IO_DELAY 1 not necessary to perform delays + on PIO operations + +KVM_FEATURE_MMU_OP 2 deprecated + +KVM_FEATURE_CLOCKSOURCE2 3 kvmclock available at msrs + + 0x4b564d00 and 0x4b564d01 +KVM_FEATURE_ASYNC_PF 4 async pf can be enabled by + writing to msr 0x4b564d02 + +KVM_FEATURE_STEAL_TIME 5 steal time can be enabled by + writing to msr 0x4b564d03 + +KVM_FEATURE_PV_EOI 6 paravirtualized end of interrupt + handler can be enabled by + writing to msr 0x4b564d04 + +KVM_FEATURE_PV_UNHAULT 7 guest checks this feature bit + before enabling paravirtualized + spinlock support + +KVM_FEATURE_PV_TLB_FLUSH 9 guest checks this feature bit + before enabling paravirtualized + tlb flush + +KVM_FEATURE_ASYNC_PF_VMEXIT 10 paravirtualized async PF VM EXIT + can be enabled by setting bit 2 + when writing to msr 0x4b564d02 + +KVM_FEATURE_PV_SEND_IPI 11 guest checks this feature bit + before enabling paravirtualized + sebd IPIs + +KVM_FEATURE_PV_POLL_CONTROL 12 host-side polling on HLT can + be disabled by writing + to msr 0x4b564d05. + +KVM_FEATURE_PV_SCHED_YIELD 13 guest checks this feature bit + before using paravirtualized + sched yield. 
+ +KVM_FEATURE_CLOCSOURCE_STABLE_BIT 24 host will warn if no guest-side + per-cpu warps are expeced in + kvmclock +================================= =========== ================================ + +:: + + edx = an OR'ed group of (1 << flag) + +Where ``flag`` here is defined as below: + +================== ============ ================================= +flag value meaning +================== ============ ================================= +KVM_HINTS_REALTIME 0 guest checks this feature bit to + determine that vCPUs are never + preempted for an unlimited time + allowing optimizations +================== ============ ================================= diff --git a/Documentation/virt/kvm/devices/README b/Documentation/virt/kvm/devices/README new file mode 100644 index 000000000000..34a69834124a --- /dev/null +++ b/Documentation/virt/kvm/devices/README @@ -0,0 +1 @@ +This directory contains specific device bindings for KVM_CAP_DEVICE_CTRL. diff --git a/Documentation/virt/kvm/devices/arm-vgic-its.txt b/Documentation/virt/kvm/devices/arm-vgic-its.txt new file mode 100644 index 000000000000..eeaa95b893a8 --- /dev/null +++ b/Documentation/virt/kvm/devices/arm-vgic-its.txt @@ -0,0 +1,181 @@ +ARM Virtual Interrupt Translation Service (ITS) +=============================================== + +Device types supported: + KVM_DEV_TYPE_ARM_VGIC_ITS ARM Interrupt Translation Service Controller + +The ITS allows MSI(-X) interrupts to be injected into guests. This extension is +optional. Creating a virtual ITS controller also requires a host GICv3 (see +arm-vgic-v3.txt), but does not depend on having physical ITS controllers. + +There can be multiple ITS controllers per guest, each of them has to have +a separate, non-overlapping MMIO region. + + +Groups: + KVM_DEV_ARM_VGIC_GRP_ADDR + Attributes: + KVM_VGIC_ITS_ADDR_TYPE (rw, 64-bit) + Base address in the guest physical address space of the GICv3 ITS + control register frame. 
+ This address needs to be 64K aligned and the region covers 128K. + Errors: + -E2BIG: Address outside of addressable IPA range + -EINVAL: Incorrectly aligned address + -EEXIST: Address already configured + -EFAULT: Invalid user pointer for attr->addr. + -ENODEV: Incorrect attribute or the ITS is not supported. + + + KVM_DEV_ARM_VGIC_GRP_CTRL + Attributes: + KVM_DEV_ARM_VGIC_CTRL_INIT + request the initialization of the ITS, no additional parameter in + kvm_device_attr.addr. + + KVM_DEV_ARM_ITS_CTRL_RESET + reset the ITS, no additional parameter in kvm_device_attr.addr. + See "ITS Reset State" section. + + KVM_DEV_ARM_ITS_SAVE_TABLES + save the ITS table data into guest RAM, at the location provisioned + by the guest in corresponding registers/table entries. + + The layout of the tables in guest memory defines an ABI. The entries + are laid out in little endian format as described in the last paragraph. + + KVM_DEV_ARM_ITS_RESTORE_TABLES + restore the ITS tables from guest RAM to ITS internal structures. + + The GICV3 must be restored before the ITS and all ITS registers but + the GITS_CTLR must be restored before restoring the ITS tables. + + The GITS_IIDR read-only register must also be restored before + calling KVM_DEV_ARM_ITS_RESTORE_TABLES as the IIDR revision field + encodes the ABI revision. + + The expected ordering when restoring the GICv3/ITS is described in section + "ITS Restore Sequence". + + Errors: + -ENXIO: ITS not properly configured as required prior to setting + this attribute + -ENOMEM: Memory shortage when allocating ITS internal data + -EINVAL: Inconsistent restored data + -EFAULT: Invalid guest ram access + -EBUSY: One or more VCPUS are running + -EACCES: The virtual ITS is backed by a physical GICv4 ITS, and the + state is not available + + KVM_DEV_ARM_VGIC_GRP_ITS_REGS + Attributes: + The attr field of kvm_device_attr encodes the offset of the + ITS register, relative to the ITS control frame base address + (ITS_base). 
+ + kvm_device_attr.addr points to a __u64 value whatever the width + of the addressed register (32/64 bits). 64 bit registers can only + be accessed with full length. + + Writes to read-only registers are ignored by the kernel except for: + - GITS_CREADR. It must be restored otherwise commands in the queue + will be re-executed after restoring CWRITER. GITS_CREADR must be + restored before restoring the GITS_CTLR which is likely to enable the + ITS. Also it must be restored after GITS_CBASER since a write to + GITS_CBASER resets GITS_CREADR. + - GITS_IIDR. The Revision field encodes the table layout ABI revision. + In the future we might implement direct injection of virtual LPIs. + This will require an upgrade of the table layout and an evolution of + the ABI. GITS_IIDR must be restored before calling + KVM_DEV_ARM_ITS_RESTORE_TABLES. + + For other registers, getting or setting a register has the same + effect as reading/writing the register on real hardware. + Errors: + -ENXIO: Offset does not correspond to any supported register + -EFAULT: Invalid user pointer for attr->addr + -EINVAL: Offset is not 64-bit aligned + -EBUSY: one or more VCPUS are running + + ITS Restore Sequence: + ------------------------- + +The following ordering must be followed when restoring the GIC and the ITS: +a) restore all guest memory and create vcpus +b) restore all redistributors +c) provide the ITS base address + (KVM_DEV_ARM_VGIC_GRP_ADDR) +d) restore the ITS in the following order: + 1. Restore GITS_CBASER + 2. Restore all other GITS_ registers, except GITS_CTLR! + 3. Load the ITS table data (KVM_DEV_ARM_ITS_RESTORE_TABLES) + 4. Restore GITS_CTLR + +Then vcpus can be started. + + ITS Table ABI REV0: + ------------------- + + Revision 0 of the ABI only supports the features of a virtual GICv3, and does + not support a virtual GICv4 with support for direct injection of virtual + interrupts for nested hypervisors. 
+ + The device table and ITT are indexed by the DeviceID and EventID, + respectively. The collection table is not indexed by CollectionID, and the + entries in the collection are listed in no particular order. + All entries are 8 bytes. + + Device Table Entry (DTE): + + bits: | 63| 62 ... 49 | 48 ... 5 | 4 ... 0 | + values: | V | next | ITT_addr | Size | + + where; + - V indicates whether the entry is valid. If not, other fields + are not meaningful. + - next: equals to 0 if this entry is the last one; otherwise it + corresponds to the DeviceID offset to the next DTE, capped by + 2^14 -1. + - ITT_addr matches bits [51:8] of the ITT address (256 Byte aligned). + - Size specifies the supported number of bits for the EventID, + minus one + + Collection Table Entry (CTE): + + bits: | 63| 62 .. 52 | 51 ... 16 | 15 ... 0 | + values: | V | RES0 | RDBase | ICID | + + where: + - V indicates whether the entry is valid. If not, other fields are + not meaningful. + - RES0: reserved field with Should-Be-Zero-or-Preserved behavior. + - RDBase is the PE number (GICR_TYPER.Processor_Number semantic), + - ICID is the collection ID + + Interrupt Translation Entry (ITE): + + bits: | 63 ... 48 | 47 ... 16 | 15 ... 0 | + values: | next | pINTID | ICID | + + where: + - next: equals to 0 if this entry is the last one; otherwise it corresponds + to the EventID offset to the next ITE capped by 2^16 -1. + - pINTID is the physical LPI ID; if zero, it means the entry is not valid + and other fields are not meaningful. + - ICID is the collection ID + + ITS Reset State: + ---------------- + +RESET returns the ITS to the same state that it was when first created and +initialized. 
When the RESET command returns, the following things are +guaranteed: + +- The ITS is not enabled and quiescent + GITS_CTLR.Enabled = 0 .Quiescent=1 +- There is no internally cached state +- No collection or device table are used + GITS_BASER.Valid = 0 +- GITS_CBASER = 0, GITS_CREADR = 0, GITS_CWRITER = 0 +- The ABI version is unchanged and remains the one set when the ITS + device was first created. diff --git a/Documentation/virt/kvm/devices/arm-vgic-v3.txt b/Documentation/virt/kvm/devices/arm-vgic-v3.txt new file mode 100644 index 000000000000..ff290b43c8e5 --- /dev/null +++ b/Documentation/virt/kvm/devices/arm-vgic-v3.txt @@ -0,0 +1,251 @@ +ARM Virtual Generic Interrupt Controller v3 and later (VGICv3) +============================================================== + + +Device types supported: + KVM_DEV_TYPE_ARM_VGIC_V3 ARM Generic Interrupt Controller v3.0 + +Only one VGIC instance may be instantiated through this API. The created VGIC +will act as the VM interrupt controller, requiring emulated user-space devices +to inject interrupts to the VGIC instead of directly to CPUs. It is not +possible to create both a GICv3 and GICv2 on the same VM. + +Creating a guest GICv3 device requires a host GICv3 as well. + + +Groups: + KVM_DEV_ARM_VGIC_GRP_ADDR + Attributes: + KVM_VGIC_V3_ADDR_TYPE_DIST (rw, 64-bit) + Base address in the guest physical address space of the GICv3 distributor + register mappings. Only valid for KVM_DEV_TYPE_ARM_VGIC_V3. + This address needs to be 64K aligned and the region covers 64 KByte. + + KVM_VGIC_V3_ADDR_TYPE_REDIST (rw, 64-bit) + Base address in the guest physical address space of the GICv3 + redistributor register mappings. There are two 64K pages for each + VCPU and all of the redistributor pages are contiguous. + Only valid for KVM_DEV_TYPE_ARM_VGIC_V3. + This address needs to be 64K aligned. 
+ + KVM_VGIC_V3_ADDR_TYPE_REDIST_REGION (rw, 64-bit) + The attribute data pointed to by kvm_device_attr.addr is a __u64 value: + bits: | 63 .... 52 | 51 .... 16 | 15 - 12 |11 - 0 + values: | count | base | flags | index + - index encodes the unique redistributor region index + - flags: reserved for future use, currently 0 + - base field encodes bits [51:16] of the guest physical base address + of the first redistributor in the region. + - count encodes the number of redistributors in the region. Must be + greater than 0. + There are two 64K pages for each redistributor in the region and + redistributors are laid out contiguously within the region. Regions + are filled with redistributors in the index order. The sum of all + region count fields must be greater than or equal to the number of + VCPUs. Redistributor regions must be registered in the incremental + index order, starting from index 0. + The characteristics of a specific redistributor region can be read + by presetting the index field in the attr data. + Only valid for KVM_DEV_TYPE_ARM_VGIC_V3. + + It is invalid to mix calls with KVM_VGIC_V3_ADDR_TYPE_REDIST and + KVM_VGIC_V3_ADDR_TYPE_REDIST_REGION attributes. + + Errors: + -E2BIG: Address outside of addressable IPA range + -EINVAL: Incorrectly aligned address, bad redistributor region + count/index, mixed redistributor region attribute usage + -EEXIST: Address already configured + -ENOENT: Attempt to read the characteristics of a non existing + redistributor region + -ENXIO: The group or attribute is unknown/unsupported for this device + or hardware support is missing. + -EFAULT: Invalid user pointer for attr->addr. + + + KVM_DEV_ARM_VGIC_GRP_DIST_REGS + KVM_DEV_ARM_VGIC_GRP_REDIST_REGS + Attributes: + The attr field of kvm_device_attr encodes two values: + bits: | 63 .... 32 | 31 .... 0 | + values: | mpidr | offset | + + All distributor regs are (rw, 32-bit) and kvm_device_attr.addr points to a + __u32 value. 
64-bit registers must be accessed by separately accessing the + lower and higher word. + + Writes to read-only registers are ignored by the kernel. + + KVM_DEV_ARM_VGIC_GRP_DIST_REGS accesses the main distributor registers. + KVM_DEV_ARM_VGIC_GRP_REDIST_REGS accesses the redistributor of the CPU + specified by the mpidr. + + The offset is relative to the "[Re]Distributor base address" as defined + in the GICv3/4 specs. Getting or setting such a register has the same + effect as reading or writing the register on real hardware, except for the + following registers: GICD_STATUSR, GICR_STATUSR, GICD_ISPENDR, + GICR_ISPENDR0, GICD_ICPENDR, and GICR_ICPENDR0. These registers behave + differently when accessed via this interface compared to their + architecturally defined behavior to allow software a full view of the + VGIC's internal state. + + The mpidr field is used to specify which + redistributor is accessed. The mpidr is ignored for the distributor. + + The mpidr encoding is based on the affinity information in the + architecture defined MPIDR, and the field is encoded as follows: + | 63 .... 56 | 55 .... 48 | 47 .... 40 | 39 .... 32 | + | Aff3 | Aff2 | Aff1 | Aff0 | + + Note that distributor fields are not banked, but return the same value + regardless of the mpidr used to access the register. + + GICD_IIDR.Revision is updated when the KVM implementation is changed in a + way directly observable by the guest or userspace. Userspace should read + GICD_IIDR from KVM and write back the read value to confirm its expected + behavior is aligned with the KVM implementation. Userspace should set + GICD_IIDR before setting any other registers to ensure the expected + behavior. + + + The GICD_STATUSR and GICR_STATUSR registers are architecturally defined such + that a write of a clear bit has no effect, whereas a write with a set bit + clears that value. 
To allow userspace to freely set the values of these two + registers, setting the attributes with the register offsets for these two + registers simply sets the non-reserved bits to the value written. + + + Accesses (reads and writes) to the GICD_ISPENDR register region and + GICR_ISPENDR0 registers get/set the value of the latched pending state for + the interrupts. + + This is identical to the value returned by a guest read from ISPENDR for an + edge triggered interrupt, but may differ for level triggered interrupts. + For edge triggered interrupts, once an interrupt becomes pending (whether + because of an edge detected on the input line or because of a guest write + to ISPENDR) this state is "latched", and only cleared when either the + interrupt is activated or when the guest writes to ICPENDR. A level + triggered interrupt may be pending either because the level input is held + high by a device, or because of a guest write to the ISPENDR register. Only + ISPENDR writes are latched; if the device lowers the line level then the + interrupt is no longer pending unless the guest also wrote to ISPENDR, and + conversely writes to ICPENDR or activations of the interrupt do not clear + the pending status if the line level is still being held high. (These + rules are documented in the GICv3 specification descriptions of the ICPENDR + and ISPENDR registers.) For a level triggered interrupt the value accessed + here is that of the latch which is set by ISPENDR and cleared by ICPENDR or + interrupt activation, whereas the value returned by a guest read from + ISPENDR is the logical OR of the latch value and the input line level. + + Raw access to the latch state is provided to userspace so that it can save + and restore the entire GIC internal state (which is defined by the + combination of the current input line level and the latch state, and cannot + be deduced from purely the line level and the value of the ISPENDR + registers). 
+ + Accesses to GICD_ICPENDR register region and GICR_ICPENDR0 registers have + RAZ/WI semantics, meaning that reads always return 0 and writes are always + ignored. + + Errors: + -ENXIO: Getting or setting this register is not yet supported + -EBUSY: One or more VCPUs are running + + + KVM_DEV_ARM_VGIC_GRP_CPU_SYSREGS + Attributes: + The attr field of kvm_device_attr encodes two values: + bits: | 63 .... 32 | 31 .... 16 | 15 .... 0 | + values: | mpidr | RES | instr | + + The mpidr field encodes the CPU ID based on the affinity information in the + architecture defined MPIDR, and the field is encoded as follows: + | 63 .... 56 | 55 .... 48 | 47 .... 40 | 39 .... 32 | + | Aff3 | Aff2 | Aff1 | Aff0 | + + The instr field encodes the system register to access based on the fields + defined in the A64 instruction set encoding for system register access + (RES means the bits are reserved for future use and should be zero): + + | 15 ... 14 | 13 ... 11 | 10 ... 7 | 6 ... 3 | 2 ... 0 | + | Op 0 | Op1 | CRn | CRm | Op2 | + + All system regs accessed through this API are (rw, 64-bit) and + kvm_device_attr.addr points to a __u64 value. + + KVM_DEV_ARM_VGIC_GRP_CPU_SYSREGS accesses the CPU interface registers for the + CPU specified by the mpidr field. + + CPU interface registers access is not implemented for AArch32 mode. + Error -ENXIO is returned when accessed in AArch32 mode. + Errors: + -ENXIO: Getting or setting this register is not yet supported + -EBUSY: VCPU is running + -EINVAL: Invalid mpidr or register value supplied + + + KVM_DEV_ARM_VGIC_GRP_NR_IRQS + Attributes: + A value describing the number of interrupts (SGI, PPI and SPI) for + this GIC instance, ranging from 64 to 1024, in increments of 32. + + kvm_device_attr.addr points to a __u32 value. + + Errors: + -EINVAL: Value set is out of the expected range + -EBUSY: Value has already be set. 
+ + + KVM_DEV_ARM_VGIC_GRP_CTRL + Attributes: + KVM_DEV_ARM_VGIC_CTRL_INIT + request the initialization of the VGIC, no additional parameter in + kvm_device_attr.addr. + KVM_DEV_ARM_VGIC_SAVE_PENDING_TABLES + save all LPI pending bits into guest RAM pending tables. + + The first kB of the pending table is not altered by this operation. + Errors: + -ENXIO: VGIC not properly configured as required prior to calling + this attribute + -ENODEV: no online VCPU + -ENOMEM: memory shortage when allocating vgic internal data + -EFAULT: Invalid guest ram access + -EBUSY: One or more VCPUS are running + + + KVM_DEV_ARM_VGIC_GRP_LEVEL_INFO + Attributes: + The attr field of kvm_device_attr encodes the following values: + bits: | 63 .... 32 | 31 .... 10 | 9 .... 0 | + values: | mpidr | info | vINTID | + + The vINTID specifies which set of IRQs is reported on. + + The info field specifies which information userspace wants to get or set + using this interface. Currently we support the following info values: + + VGIC_LEVEL_INFO_LINE_LEVEL: + Get/Set the input level of the IRQ line for a set of 32 contiguously + numbered interrupts. + vINTID must be a multiple of 32. + + kvm_device_attr.addr points to a __u32 value which will contain a + bitmap where a set bit means the interrupt level is asserted. + + Bit[n] indicates the status for interrupt vINTID + n. + + SGIs and any interrupt with a higher ID than the number of interrupts + supported, will be RAZ/WI. LPIs are always edge-triggered and are + therefore not supported by this interface. + + PPIs are reported per VCPU as specified in the mpidr field, and SPIs are + reported with the same value regardless of the mpidr specified. + + The mpidr field encodes the CPU ID based on the affinity information in the + architecture defined MPIDR, and the field is encoded as follows: + | 63 .... 56 | 55 .... 48 | 47 .... 40 | 39 .... 
32 | + | Aff3 | Aff2 | Aff1 | Aff0 | + Errors: + -EINVAL: vINTID is not multiple of 32 or + info field is not VGIC_LEVEL_INFO_LINE_LEVEL diff --git a/Documentation/virt/kvm/devices/arm-vgic.txt b/Documentation/virt/kvm/devices/arm-vgic.txt new file mode 100644 index 000000000000..97b6518148f8 --- /dev/null +++ b/Documentation/virt/kvm/devices/arm-vgic.txt @@ -0,0 +1,127 @@ +ARM Virtual Generic Interrupt Controller v2 (VGIC) +================================================== + +Device types supported: + KVM_DEV_TYPE_ARM_VGIC_V2 ARM Generic Interrupt Controller v2.0 + +Only one VGIC instance may be instantiated through either this API or the +legacy KVM_CREATE_IRQCHIP API. The created VGIC will act as the VM interrupt +controller, requiring emulated user-space devices to inject interrupts to the +VGIC instead of directly to CPUs. + +GICv3 implementations with hardware compatibility support allow creating a +guest GICv2 through this interface. For information on creating a guest GICv3 +device and guest ITS devices, see arm-vgic-v3.txt. It is not possible to +create both a GICv3 and GICv2 device on the same VM. + + +Groups: + KVM_DEV_ARM_VGIC_GRP_ADDR + Attributes: + KVM_VGIC_V2_ADDR_TYPE_DIST (rw, 64-bit) + Base address in the guest physical address space of the GIC distributor + register mappings. Only valid for KVM_DEV_TYPE_ARM_VGIC_V2. + This address needs to be 4K aligned and the region covers 4 KByte. + + KVM_VGIC_V2_ADDR_TYPE_CPU (rw, 64-bit) + Base address in the guest physical address space of the GIC virtual cpu + interface register mappings. Only valid for KVM_DEV_TYPE_ARM_VGIC_V2. + This address needs to be 4K aligned and the region covers 4 KByte. + Errors: + -E2BIG: Address outside of addressable IPA range + -EINVAL: Incorrectly aligned address + -EEXIST: Address already configured + -ENXIO: The group or attribute is unknown/unsupported for this device + or hardware support is missing. + -EFAULT: Invalid user pointer for attr->addr. 
+ + KVM_DEV_ARM_VGIC_GRP_DIST_REGS + Attributes: + The attr field of kvm_device_attr encodes two values: + bits: | 63 .... 40 | 39 .. 32 | 31 .... 0 | + values: | reserved | vcpu_index | offset | + + All distributor regs are (rw, 32-bit) + + The offset is relative to the "Distributor base address" as defined in the + GICv2 specs. Getting or setting such a register has the same effect as + reading or writing the register on the actual hardware from the cpu whose + index is specified with the vcpu_index field. Note that most distributor + fields are not banked, but return the same value regardless of the + vcpu_index used to access the register. + + GICD_IIDR.Revision is updated when the KVM implementation of an emulated + GICv2 is changed in a way directly observable by the guest or userspace. + Userspace should read GICD_IIDR from KVM and write back the read value to + confirm its expected behavior is aligned with the KVM implementation. + Userspace should set GICD_IIDR before setting any other registers (both + KVM_DEV_ARM_VGIC_GRP_DIST_REGS and KVM_DEV_ARM_VGIC_GRP_CPU_REGS) to ensure + the expected behavior. Unless GICD_IIDR has been set from userspace, writes + to the interrupt group registers (GICD_IGROUPR) are ignored. + Errors: + -ENXIO: Getting or setting this register is not yet supported + -EBUSY: One or more VCPUs are running + -EINVAL: Invalid vcpu_index supplied + + KVM_DEV_ARM_VGIC_GRP_CPU_REGS + Attributes: + The attr field of kvm_device_attr encodes two values: + bits: | 63 .... 40 | 39 .. 32 | 31 .... 0 | + values: | reserved | vcpu_index | offset | + + All CPU interface regs are (rw, 32-bit) + + The offset specifies the offset from the "CPU interface base address" as + defined in the GICv2 specs. Getting or setting such a register has the + same effect as reading or writing the register on the actual hardware. 
+ + The Active Priorities Registers APRn are implementation defined, so we set a + fixed format for our implementation that fits with the model of a "GICv2 + implementation without the security extensions" which we present to the + guest. This interface always exposes four registers APR[0-3] describing the + maximum possible 128 preemption levels. The semantics of the register + indicate if any interrupts in a given preemption level are in the active + state by setting the corresponding bit. + + Thus, preemption level X has one or more active interrupts if and only if: + + APRn[X mod 32] == 0b1, where n = X / 32 + + Bits for undefined preemption levels are RAZ/WI. + + Note that this differs from a CPU's view of the APRs on hardware in which + a GIC without the security extensions exposes group 0 and group 1 active + priorities in separate register groups, whereas we show a combined view + similar to GICv2's GICH_APR. + + For historical reasons and to provide ABI compatibility with userspace we + export the GICC_PMR register in the format of the GICH_VMCR.VMPriMask + field in the lower 5 bits of a word, meaning that userspace must always + use the lower 5 bits to communicate with the KVM device and must shift the + value left by 3 places to obtain the actual priority mask level. + + Errors: + -ENXIO: Getting or setting this register is not yet supported + -EBUSY: One or more VCPUs are running + -EINVAL: Invalid vcpu_index supplied + + KVM_DEV_ARM_VGIC_GRP_NR_IRQS + Attributes: + A value describing the number of interrupts (SGI, PPI and SPI) for + this GIC instance, ranging from 64 to 1024, in increments of 32. + + Errors: + -EINVAL: Value set is out of the expected range + -EBUSY: Value has already been set, or GIC has already been initialized + with default values. + + KVM_DEV_ARM_VGIC_GRP_CTRL + Attributes: + KVM_DEV_ARM_VGIC_CTRL_INIT + request the initialization of the VGIC or ITS, no additional parameter + in kvm_device_attr.addr.
+ Errors: + -ENXIO: VGIC not properly configured as required prior to calling + this attribute + -ENODEV: no online VCPU + -ENOMEM: memory shortage when allocating vgic internal data diff --git a/Documentation/virt/kvm/devices/mpic.txt b/Documentation/virt/kvm/devices/mpic.txt new file mode 100644 index 000000000000..8257397adc3c --- /dev/null +++ b/Documentation/virt/kvm/devices/mpic.txt @@ -0,0 +1,53 @@ +MPIC interrupt controller +========================= + +Device types supported: + KVM_DEV_TYPE_FSL_MPIC_20 Freescale MPIC v2.0 + KVM_DEV_TYPE_FSL_MPIC_42 Freescale MPIC v4.2 + +Only one MPIC instance, of any type, may be instantiated. The created +MPIC will act as the system interrupt controller, connecting to each +vcpu's interrupt inputs. + +Groups: + KVM_DEV_MPIC_GRP_MISC + Attributes: + KVM_DEV_MPIC_BASE_ADDR (rw, 64-bit) + Base address of the 256 KiB MPIC register space. Must be + naturally aligned. A value of zero disables the mapping. + Reset value is zero. + + KVM_DEV_MPIC_GRP_REGISTER (rw, 32-bit) + Access an MPIC register, as if the access were made from the guest. + "attr" is the byte offset into the MPIC register space. Accesses + must be 4-byte aligned. + + MSIs may be signaled by using this attribute group to write + to the relevant MSIIR. + + KVM_DEV_MPIC_GRP_IRQ_ACTIVE (rw, 32-bit) + IRQ input line for each standard openpic source. 0 is inactive and 1 + is active, regardless of interrupt sense. + + For edge-triggered interrupts: Writing 1 is considered an activating + edge, and writing 0 is ignored. Reading returns 1 if a previously + signaled edge has not been acknowledged, and 0 otherwise. + + "attr" is the IRQ number. IRQ numbers for standard sources are the + byte offset of the relevant IVPR from EIVPR0, divided by 32. + +IRQ Routing: + + The MPIC emulation supports IRQ routing. Only a single MPIC device can + be instantiated. Once that device has been created, it's available as + irqchip id 0. 
+ + This irqchip 0 has 256 interrupt pins, which expose the interrupts in + the main array of interrupt sources (a.k.a. "SRC" interrupts). + + The numbering is the same as the MPIC device tree binding -- based on + the register offset from the beginning of the sources array, without + regard to any subdivisions in chip documentation such as "internal" + or "external" interrupts. + + Access to non-SRC interrupts is not implemented through IRQ routing mechanisms. diff --git a/Documentation/virt/kvm/devices/s390_flic.txt b/Documentation/virt/kvm/devices/s390_flic.txt new file mode 100644 index 000000000000..a4e20a090174 --- /dev/null +++ b/Documentation/virt/kvm/devices/s390_flic.txt @@ -0,0 +1,163 @@ +FLIC (floating interrupt controller) +==================================== + +FLIC handles floating (non per-cpu) interrupts, i.e. I/O, service and some +machine check interruptions. All interrupts are stored in a per-vm list of +pending interrupts. FLIC performs operations on this list. + +Only one FLIC instance may be instantiated. + +FLIC provides support to +- add interrupts (KVM_DEV_FLIC_ENQUEUE) +- inspect currently pending interrupts (KVM_DEV_FLIC_GET_ALL_IRQS) +- purge all pending floating interrupts (KVM_DEV_FLIC_CLEAR_IRQS) +- purge one pending floating I/O interrupt (KVM_DEV_FLIC_CLEAR_IO_IRQ) +- enable/disable transparent async page faults for the guest +- register and modify adapter interrupt sources (KVM_DEV_FLIC_ADAPTER_*) +- modify AIS (adapter-interruption-suppression) mode state (KVM_DEV_FLIC_AISM) +- inject adapter interrupts on a specified adapter (KVM_DEV_FLIC_AIRQ_INJECT) +- get/set all AIS mode states (KVM_DEV_FLIC_AISM_ALL) + +Groups: + KVM_DEV_FLIC_ENQUEUE + Passes a buffer and length into the kernel which are then injected into + the list of pending interrupts. + attr->addr contains the pointer to the buffer and attr->attr contains + the length of the buffer.
+ The format of the data structure kvm_s390_irq as it is copied from userspace + is defined in usr/include/linux/kvm.h. + + KVM_DEV_FLIC_GET_ALL_IRQS + Copies all floating interrupts into a buffer provided by userspace. + When the buffer is too small it returns -ENOMEM, which is the indication + for userspace to try again with a bigger buffer. + -ENOBUFS is returned when the allocation of a kernelspace buffer has + failed. + -EFAULT is returned when copying data to userspace failed. + All interrupts remain pending, i.e. are not deleted from the list of + currently pending interrupts. + attr->addr contains the userspace address of the buffer into which all + interrupt data will be copied. + attr->attr contains the size of the buffer in bytes. + + KVM_DEV_FLIC_CLEAR_IRQS + Simply deletes all elements from the list of currently pending floating + interrupts. No interrupts are injected into the guest. + + KVM_DEV_FLIC_CLEAR_IO_IRQ + Deletes one (if any) I/O interrupt for a subchannel identified by the + subsystem identification word passed via the buffer specified by + attr->addr (address) and attr->attr (length). + + KVM_DEV_FLIC_APF_ENABLE + Enables async page faults for the guest. So in case of a major page fault + the host is allowed to handle this async and continues the guest. + + KVM_DEV_FLIC_APF_DISABLE_WAIT + Disables async page faults for the guest and waits until already pending + async page faults are done. This is necessary to trigger a completion interrupt + for every init interrupt before migrating the interrupt list. + + KVM_DEV_FLIC_ADAPTER_REGISTER + Register an I/O adapter interrupt source. 
Takes a kvm_s390_io_adapter + describing the adapter to register: + +struct kvm_s390_io_adapter { + __u32 id; + __u8 isc; + __u8 maskable; + __u8 swap; + __u8 flags; +}; + + id contains the unique id for the adapter, isc the I/O interruption subclass + to use, maskable whether this adapter may be masked (interrupts turned off), + swap whether the indicators need to be byte swapped, and flags contains + further characteristics of the adapter. + Currently defined values for 'flags' are: + - KVM_S390_ADAPTER_SUPPRESSIBLE: adapter is subject to AIS + (adapter-interrupt-suppression) facility. This flag only has an effect if + the AIS capability is enabled. + Unknown flag values are ignored. + + + KVM_DEV_FLIC_ADAPTER_MODIFY + Modifies attributes of an existing I/O adapter interrupt source. Takes + a kvm_s390_io_adapter_req specifying the adapter and the operation: + +struct kvm_s390_io_adapter_req { + __u32 id; + __u8 type; + __u8 mask; + __u16 pad0; + __u64 addr; +}; + + id specifies the adapter and type the operation. The supported operations + are: + + KVM_S390_IO_ADAPTER_MASK + mask or unmask the adapter, as specified in mask + + KVM_S390_IO_ADAPTER_MAP + perform a gmap translation for the guest address provided in addr, + pin a userspace page for the translated address and add it to the + list of mappings + Note: A new mapping will be created unconditionally; therefore, + the calling code should avoid making duplicate mappings. + + KVM_S390_IO_ADAPTER_UNMAP + release a userspace page for the translated address specified in addr + from the list of mappings + + KVM_DEV_FLIC_AISM + modify the adapter-interruption-suppression mode for a given isc if the + AIS capability is enabled. Takes a kvm_s390_ais_req describing: + +struct kvm_s390_ais_req { + __u8 isc; + __u16 mode; +}; + + isc contains the target I/O interruption subclass, mode the target + adapter-interruption-suppression mode. 
The following modes are + currently supported: + - KVM_S390_AIS_MODE_ALL: ALL-Interruptions Mode, i.e. airq injection + is always allowed; + - KVM_S390_AIS_MODE_SINGLE: SINGLE-Interruption Mode, i.e. airq + injection is only allowed once and the following adapter interrupts + will be suppressed until the mode is set again to ALL-Interruptions + or SINGLE-Interruption mode. + + KVM_DEV_FLIC_AIRQ_INJECT + Inject adapter interrupts on a specified adapter. + attr->attr contains the unique id for the adapter, which allows for + adapter-specific checks and actions. + For adapters subject to AIS, handle the airq injection suppression for + an isc according to the adapter-interruption-suppression mode on condition + that the AIS capability is enabled. + + KVM_DEV_FLIC_AISM_ALL + Gets or sets the adapter-interruption-suppression mode for all ISCs. Takes + a kvm_s390_ais_all describing: + +struct kvm_s390_ais_all { + __u8 simm; /* Single-Interruption-Mode mask */ + __u8 nimm; /* No-Interruption-Mode mask */ +}; + + simm contains Single-Interruption-Mode mask for all ISCs, nimm contains + No-Interruption-Mode mask for all ISCs. Each bit in simm and nimm corresponds + to an ISC (MSB0 bit 0 to ISC 0 and so on). The combination of simm bit and + nimm bit presents AIS mode for an ISC. + + KVM_DEV_FLIC_AISM_ALL is indicated by KVM_CAP_S390_AIS_MIGRATION. + +Note: The KVM_SET_DEVICE_ATTR/KVM_GET_DEVICE_ATTR device ioctls executed on +FLIC with an unknown group or attribute give the error code EINVAL (instead of +ENXIO, as specified in the API documentation). It is not possible to conclude +that a FLIC operation is unavailable based on the error code resulting from a +usage attempt. + +Note: The KVM_DEV_FLIC_CLEAR_IO_IRQ ioctl will return EINVAL in case a zero +schid is specified.
diff --git a/Documentation/virt/kvm/devices/vcpu.txt b/Documentation/virt/kvm/devices/vcpu.txt new file mode 100644 index 000000000000..2b5dab16c4f2 --- /dev/null +++ b/Documentation/virt/kvm/devices/vcpu.txt @@ -0,0 +1,62 @@ +Generic vcpu interface +==================================== + +The virtual cpu "device" also accepts the ioctls KVM_SET_DEVICE_ATTR, +KVM_GET_DEVICE_ATTR, and KVM_HAS_DEVICE_ATTR. The interface uses the same struct +kvm_device_attr as other devices, but targets VCPU-wide settings and controls. + +The groups and attributes per virtual cpu, if any, are architecture specific. + +1. GROUP: KVM_ARM_VCPU_PMU_V3_CTRL +Architectures: ARM64 + +1.1. ATTRIBUTE: KVM_ARM_VCPU_PMU_V3_IRQ +Parameters: in kvm_device_attr.addr the address for PMU overflow interrupt is a + pointer to an int +Returns: -EBUSY: The PMU overflow interrupt is already set + -ENXIO: The overflow interrupt not set when attempting to get it + -ENODEV: PMUv3 not supported + -EINVAL: Invalid PMU overflow interrupt number supplied or + trying to set the IRQ number without using an in-kernel + irqchip. + +A value describing the PMUv3 (Performance Monitor Unit v3) overflow interrupt +number for this vcpu. This interrupt could be a PPI or SPI, but the interrupt +type must be same for each vcpu. As a PPI, the interrupt number is the same for +all vcpus, while as an SPI it must be a separate number per vcpu. + +1.2 ATTRIBUTE: KVM_ARM_VCPU_PMU_V3_INIT +Parameters: no additional parameter in kvm_device_attr.addr +Returns: -ENODEV: PMUv3 not supported or GIC not initialized + -ENXIO: PMUv3 not properly configured or in-kernel irqchip not + configured as required prior to calling this attribute + -EBUSY: PMUv3 already initialized + +Request the initialization of the PMUv3. If using the PMUv3 with an in-kernel +virtual GIC implementation, this must be done after initializing the in-kernel +irqchip. + + +2. GROUP: KVM_ARM_VCPU_TIMER_CTRL +Architectures: ARM,ARM64 + +2.1. 
ATTRIBUTE: KVM_ARM_VCPU_TIMER_IRQ_VTIMER +2.2. ATTRIBUTE: KVM_ARM_VCPU_TIMER_IRQ_PTIMER +Parameters: in kvm_device_attr.addr the address for the timer interrupt is a + pointer to an int +Returns: -EINVAL: Invalid timer interrupt number + -EBUSY: One or more VCPUs has already run + +A value describing the architected timer interrupt number when connected to an +in-kernel virtual GIC. These must be a PPI (16 <= intid < 32). Setting the +attribute overrides the default values (see below). + +KVM_ARM_VCPU_TIMER_IRQ_VTIMER: The EL1 virtual timer intid (default: 27) +KVM_ARM_VCPU_TIMER_IRQ_PTIMER: The EL1 physical timer intid (default: 30) + +Setting the same PPI for different timers will prevent the VCPUs from running. +Setting the interrupt number on a VCPU configures all VCPUs created at that +time to use the number provided for a given timer, overwriting any previously +configured values on other VCPUs. Userspace should configure the interrupt +numbers on at least one VCPU after creating all VCPUs and before running any +VCPUs. diff --git a/Documentation/virt/kvm/devices/vfio.txt b/Documentation/virt/kvm/devices/vfio.txt new file mode 100644 index 000000000000..528c77c8022c --- /dev/null +++ b/Documentation/virt/kvm/devices/vfio.txt @@ -0,0 +1,36 @@ +VFIO virtual device +=================== + +Device types supported: + KVM_DEV_TYPE_VFIO + +Only one VFIO instance may be created per VM. The created device +tracks VFIO groups in use by the VM and features of those groups +important to the correctness and acceleration of the VM. As groups +are enabled and disabled for use by the VM, KVM should be updated +about their presence. When registered with KVM, a reference to the +VFIO-group is held by KVM. + +Groups: + KVM_DEV_VFIO_GROUP + +KVM_DEV_VFIO_GROUP attributes: + KVM_DEV_VFIO_GROUP_ADD: Add a VFIO group to VFIO-KVM device tracking + kvm_device_attr.addr points to an int32_t file descriptor + for the VFIO group. 
+ KVM_DEV_VFIO_GROUP_DEL: Remove a VFIO group from VFIO-KVM device tracking + kvm_device_attr.addr points to an int32_t file descriptor + for the VFIO group. + KVM_DEV_VFIO_GROUP_SET_SPAPR_TCE: attaches a guest visible TCE table + allocated by sPAPR KVM. + kvm_device_attr.addr points to a struct: + + struct kvm_vfio_spapr_tce { + __s32 groupfd; + __s32 tablefd; + }; + + where + @groupfd is a file descriptor for a VFIO group; + @tablefd is a file descriptor for a TCE table allocated via + KVM_CREATE_SPAPR_TCE. diff --git a/Documentation/virt/kvm/devices/vm.txt b/Documentation/virt/kvm/devices/vm.txt new file mode 100644 index 000000000000..4ffb82b02468 --- /dev/null +++ b/Documentation/virt/kvm/devices/vm.txt @@ -0,0 +1,270 @@ +Generic vm interface +==================================== + +The virtual machine "device" also accepts the ioctls KVM_SET_DEVICE_ATTR, +KVM_GET_DEVICE_ATTR, and KVM_HAS_DEVICE_ATTR. The interface uses the same +struct kvm_device_attr as other devices, but targets VM-wide settings +and controls. + +The groups and attributes per virtual machine, if any, are architecture +specific. + +1. GROUP: KVM_S390_VM_MEM_CTRL +Architectures: s390 + +1.1. ATTRIBUTE: KVM_S390_VM_MEM_ENABLE_CMMA +Parameters: none +Returns: -EBUSY if a vcpu is already defined, otherwise 0 + +Enables Collaborative Memory Management Assist (CMMA) for the virtual machine. + +1.2. ATTRIBUTE: KVM_S390_VM_MEM_CLR_CMMA +Parameters: none +Returns: -EINVAL if CMMA was not enabled + 0 otherwise + +Clear the CMMA status for all guest pages, so any pages the guest marked +as unused are again used and may not be reclaimed by the host. + +1.3.
ATTRIBUTE KVM_S390_VM_MEM_LIMIT_SIZE +Parameters: in attr->addr the address for the new limit of guest memory +Returns: -EFAULT if the given address is not accessible + -EINVAL if the virtual machine is of type UCONTROL + -E2BIG if the given guest memory is too big for that machine + -EBUSY if a vcpu is already defined + -ENOMEM if not enough memory is available for a new shadow guest mapping + 0 otherwise + +Allows userspace to query the actual limit and set a new limit for +the maximum guest memory size. The limit will be rounded up to +2048 MB, 4096 GB, 8192 TB respectively, as this limit is governed by +the number of page table levels. In the case that there is no limit we will set +the limit to KVM_S390_NO_MEM_LIMIT (U64_MAX). + +2. GROUP: KVM_S390_VM_CPU_MODEL +Architectures: s390 + +2.1. ATTRIBUTE: KVM_S390_VM_CPU_MACHINE (r/o) + +Allows user space to retrieve machine and kvm specific cpu related information: + +struct kvm_s390_vm_cpu_machine { + __u64 cpuid; # CPUID of host + __u32 ibc; # IBC level range offered by host + __u8 pad[4]; + __u64 fac_mask[256]; # set of cpu facilities enabled by KVM + __u64 fac_list[256]; # set of cpu facilities offered by host +} + +Parameters: address of buffer to store the machine related cpu data + of type struct kvm_s390_vm_cpu_machine* +Returns: -EFAULT if the given address is not accessible from kernel space + -ENOMEM if not enough memory is available to process the ioctl + 0 in case of success + +2.2. ATTRIBUTE: KVM_S390_VM_CPU_PROCESSOR (r/w) + +Allows user space to retrieve or request to change cpu related information for a vcpu: + +struct kvm_s390_vm_cpu_processor { + __u64 cpuid; # CPUID currently (to be) used by this vcpu + __u16 ibc; # IBC level currently (to be) used by this vcpu + __u8 pad[6]; + __u64 fac_list[256]; # set of cpu facilities currently (to be) used + # by this vcpu +} + +KVM does not enforce or limit the cpu model data in any form.
Take the information +retrieved by means of KVM_S390_VM_CPU_MACHINE as a hint for reasonable configuration +setups. Instruction interceptions triggered by additionally set facility bits that +are not handled by KVM need to be implemented in the VM driver code. + +Parameters: address of buffer to store/set the processor related cpu + data of type struct kvm_s390_vm_cpu_processor*. +Returns: -EBUSY in case 1 or more vcpus are already activated (only in write case) + -EFAULT if the given address is not accessible from kernel space + -ENOMEM if not enough memory is available to process the ioctl + 0 in case of success + +2.3. ATTRIBUTE: KVM_S390_VM_CPU_MACHINE_FEAT (r/o) + +Allows user space to retrieve available cpu features. A feature is available if +provided by the hardware and supported by kvm. In theory, cpu features could +even be completely emulated by kvm. + +struct kvm_s390_vm_cpu_feat { + __u64 feat[16]; # Bitmap (1 = feature available), MSB 0 bit numbering +}; + +Parameters: address of a buffer to load the feature list from. +Returns: -EFAULT if the given address is not accessible from kernel space. + 0 in case of success. + +2.4. ATTRIBUTE: KVM_S390_VM_CPU_PROCESSOR_FEAT (r/w) + +Allows user space to retrieve or change enabled cpu features for all VCPUs of a +VM. Features that are not available cannot be enabled. + +See 2.3. for a description of the parameter struct. + +Parameters: address of a buffer to store/load the feature list from. +Returns: -EFAULT if the given address is not accessible from kernel space. + -EINVAL if a cpu feature that is not available is to be enabled. + -EBUSY if at least one VCPU has already been defined. + 0 in case of success. + +2.5. ATTRIBUTE: KVM_S390_VM_CPU_MACHINE_SUBFUNC (r/o) + +Allows user space to retrieve available cpu subfunctions without any filtering +done by a set IBC. These subfunctions are indicated to the guest VCPU via +query or "test bit" subfunctions and used e.g. by cpacf functions, plo and ptff.
+ +A subfunction block is only valid if KVM_S390_VM_CPU_MACHINE contains the +STFL(E) bit introducing the affected instruction. If the affected instruction +indicates subfunctions via a "query subfunction", the response block is +contained in the returned struct. If the affected instruction +indicates subfunctions via a "test bit" mechanism, the subfunction codes are +contained in the returned struct in MSB 0 bit numbering. + +struct kvm_s390_vm_cpu_subfunc { + u8 plo[32]; # always valid (ESA/390 feature) + u8 ptff[16]; # valid with TOD-clock steering + u8 kmac[16]; # valid with Message-Security-Assist + u8 kmc[16]; # valid with Message-Security-Assist + u8 km[16]; # valid with Message-Security-Assist + u8 kimd[16]; # valid with Message-Security-Assist + u8 klmd[16]; # valid with Message-Security-Assist + u8 pckmo[16]; # valid with Message-Security-Assist-Extension 3 + u8 kmctr[16]; # valid with Message-Security-Assist-Extension 4 + u8 kmf[16]; # valid with Message-Security-Assist-Extension 4 + u8 kmo[16]; # valid with Message-Security-Assist-Extension 4 + u8 pcc[16]; # valid with Message-Security-Assist-Extension 4 + u8 ppno[16]; # valid with Message-Security-Assist-Extension 5 + u8 kma[16]; # valid with Message-Security-Assist-Extension 8 + u8 kdsa[16]; # valid with Message-Security-Assist-Extension 9 + u8 reserved[1792]; # reserved for future instructions +}; + +Parameters: address of a buffer to load the subfunction blocks from. +Returns: -EFAULT if the given address is not accessible from kernel space. + 0 in case of success. + +2.6. ATTRIBUTE: KVM_S390_VM_CPU_PROCESSOR_SUBFUNC (r/w) + +Allows user space to retrieve or change cpu subfunctions to be indicated for +all VCPUs of a VM. This attribute will only be available if kernel and +hardware support are in place. + +The kernel uses the configured subfunction blocks for indication to +the guest. 
A subfunction block will only be used if the associated STFL(E) bit +has not been disabled by user space (so the instruction to be queried is +actually available for the guest). + +As long as no data has been written, a read will fail. The IBC will be used +to determine available subfunctions in this case, this will guarantee backward +compatibility. + +See 2.5. for a description of the parameter struct. + +Parameters: address of a buffer to store/load the subfunction blocks from. +Returns: -EFAULT if the given address is not accessible from kernel space. + -EINVAL when reading, if there was no write yet. + -EBUSY if at least one VCPU has already been defined. + 0 in case of success. + +3. GROUP: KVM_S390_VM_TOD +Architectures: s390 + +3.1. ATTRIBUTE: KVM_S390_VM_TOD_HIGH + +Allows user space to set/get the TOD clock extension (u8) (superseded by +KVM_S390_VM_TOD_EXT). + +Parameters: address of a buffer in user space to store the data (u8) to +Returns: -EFAULT if the given address is not accessible from kernel space + -EINVAL if setting the TOD clock extension to != 0 is not supported + +3.2. ATTRIBUTE: KVM_S390_VM_TOD_LOW + +Allows user space to set/get bits 0-63 of the TOD clock register as defined in +the POP (u64). + +Parameters: address of a buffer in user space to store the data (u64) to +Returns: -EFAULT if the given address is not accessible from kernel space + +3.3. ATTRIBUTE: KVM_S390_VM_TOD_EXT +Allows user space to set/get bits 0-63 of the TOD clock register as defined in +the POP (u64). If the guest CPU model supports the TOD clock extension (u8), it +also allows user space to get/set it. If the guest CPU model does not support +it, it is stored as 0 and not allowed to be set to a value != 0. + +Parameters: address of a buffer in user space to store the data + (kvm_s390_vm_tod_clock) to +Returns: -EFAULT if the given address is not accessible from kernel space + -EINVAL if setting the TOD clock extension to != 0 is not supported + +4. 
GROUP: KVM_S390_VM_CRYPTO +Architectures: s390 + +4.1. ATTRIBUTE: KVM_S390_VM_CRYPTO_ENABLE_AES_KW (w/o) + +Allows user space to enable aes key wrapping, including generating a new +wrapping key. + +Parameters: none +Returns: 0 + +4.2. ATTRIBUTE: KVM_S390_VM_CRYPTO_ENABLE_DEA_KW (w/o) + +Allows user space to enable dea key wrapping, including generating a new +wrapping key. + +Parameters: none +Returns: 0 + +4.3. ATTRIBUTE: KVM_S390_VM_CRYPTO_DISABLE_AES_KW (w/o) + +Allows user space to disable aes key wrapping, clearing the wrapping key. + +Parameters: none +Returns: 0 + +4.4. ATTRIBUTE: KVM_S390_VM_CRYPTO_DISABLE_DEA_KW (w/o) + +Allows user space to disable dea key wrapping, clearing the wrapping key. + +Parameters: none +Returns: 0 + +5. GROUP: KVM_S390_VM_MIGRATION +Architectures: s390 + +5.1. ATTRIBUTE: KVM_S390_VM_MIGRATION_STOP (w/o) + +Allows userspace to stop migration mode, needed for PGSTE migration. +Setting this attribute when migration mode is not active will have no +effects. + +Parameters: none +Returns: 0 + +5.2. ATTRIBUTE: KVM_S390_VM_MIGRATION_START (w/o) + +Allows userspace to start migration mode, needed for PGSTE migration. +Setting this attribute when migration mode is already active will have +no effects. + +Parameters: none +Returns: -ENOMEM if there is not enough free memory to start migration mode + -EINVAL if the state of the VM is invalid (e.g. no memory defined) + 0 in case of success. + +5.3. ATTRIBUTE: KVM_S390_VM_MIGRATION_STATUS (r/o) + +Allows userspace to query the status of migration mode. + +Parameters: address of a buffer in user space to store the data (u64) to; + the data itself is either 0 if migration mode is disabled or 1 + if it is enabled +Returns: -EFAULT if the given address is not accessible from kernel space + 0 in case of success. 
diff --git a/Documentation/virt/kvm/devices/xics.txt b/Documentation/virt/kvm/devices/xics.txt new file mode 100644 index 000000000000..42864935ac5d --- /dev/null +++ b/Documentation/virt/kvm/devices/xics.txt @@ -0,0 +1,66 @@ +XICS interrupt controller + +Device type supported: KVM_DEV_TYPE_XICS + +Groups: + KVM_DEV_XICS_SOURCES + Attributes: One per interrupt source, indexed by the source number. + +This device emulates the XICS (eXternal Interrupt Controller +Specification) defined in PAPR. The XICS has a set of interrupt +sources, each identified by a 20-bit source number, and a set of +Interrupt Control Presentation (ICP) entities, also called "servers", +each associated with a virtual CPU. + +The ICP entities are created by enabling the KVM_CAP_IRQ_ARCH +capability for each vcpu, specifying KVM_CAP_IRQ_XICS in args[0] and +the interrupt server number (i.e. the vcpu number from the XICS's +point of view) in args[1] of the kvm_enable_cap struct. Each ICP has +64 bits of state which can be read and written using the +KVM_GET_ONE_REG and KVM_SET_ONE_REG ioctls on the vcpu. The 64 bit +state word has the following bitfields, starting at the +least-significant end of the word: + +* Unused, 16 bits + +* Pending interrupt priority, 8 bits + Zero is the highest priority, 255 means no interrupt is pending. + +* Pending IPI (inter-processor interrupt) priority, 8 bits + Zero is the highest priority, 255 means no IPI is pending. + +* Pending interrupt source number, 24 bits + Zero means no interrupt pending, 2 means an IPI is pending + +* Current processor priority, 8 bits + Zero is the highest priority, meaning no interrupts can be + delivered, and 255 is the lowest priority. + +Each source has 64 bits of state that can be read and written using +the KVM_GET_DEVICE_ATTR and KVM_SET_DEVICE_ATTR ioctls, specifying the +KVM_DEV_XICS_SOURCES attribute group, with the attribute number being +the interrupt source number. 
The 64 bit state word has the following +bitfields, starting from the least-significant end of the word: + +* Destination (server number), 32 bits + This specifies where the interrupt should be sent, and is the + interrupt server number specified for the destination vcpu. + +* Priority, 8 bits + This is the priority specified for this interrupt source, where 0 is + the highest priority and 255 is the lowest. An interrupt with a + priority of 255 will never be delivered. + +* Level sensitive flag, 1 bit + This bit is 1 for a level-sensitive interrupt source, or 0 for + edge-sensitive (or MSI). + +* Masked flag, 1 bit + This bit is set to 1 if the interrupt is masked (cannot be delivered + regardless of its priority), for example by the ibm,int-off RTAS + call, or 0 if it is not masked. + +* Pending flag, 1 bit + This bit is 1 if the source has a pending interrupt, otherwise 0. + +Only one XICS instance may be created per VM. diff --git a/Documentation/virt/kvm/devices/xive.txt b/Documentation/virt/kvm/devices/xive.txt new file mode 100644 index 000000000000..9a24a4525253 --- /dev/null +++ b/Documentation/virt/kvm/devices/xive.txt @@ -0,0 +1,197 @@ +POWER9 eXternal Interrupt Virtualization Engine (XIVE Gen1) +========================================================== + +Device types supported: + KVM_DEV_TYPE_XIVE POWER9 XIVE Interrupt Controller generation 1 + +This device acts as a VM interrupt controller. It provides the KVM +interface to configure the interrupt sources of a VM in the underlying +POWER9 XIVE interrupt controller. + +Only one XIVE instance may be instantiated. A guest XIVE device +requires a POWER9 host and the guest OS should have support for the +XIVE native exploitation interrupt mode. If not, it should run using +the legacy interrupt mode, referred to as XICS (POWER7/8). + +* Device Mappings + + The KVM device exposes different MMIO ranges of the XIVE HW which + are required for interrupt management.
These are exposed to the + guest in VMAs populated with a custom VM fault handler. + + 1. Thread Interrupt Management Area (TIMA) + + Each thread has an associated Thread Interrupt Management context + composed of a set of registers. These registers let the thread + handle priority management and interrupt acknowledgment. The most + important are : + + - Interrupt Pending Buffer (IPB) + - Current Processor Priority (CPPR) + - Notification Source Register (NSR) + + They are exposed to software in four different pages each proposing + a view with a different privilege. The first page is for the + physical thread context and the second for the hypervisor. Only the + third (operating system) and the fourth (user level) are exposed to the + guest. + + 2. Event State Buffer (ESB) + + Each source is associated with an Event State Buffer (ESB) with + either a pair of even/odd pair of pages which provides commands to + manage the source: to trigger, to EOI, to turn off the source for + instance. + + 3. Device pass-through + + When a device is passed-through into the guest, the source + interrupts are from a different HW controller (PHB4) and the ESB + pages exposed to the guest should accommodate this change. + + The passthru_irq helpers, kvmppc_xive_set_mapped() and + kvmppc_xive_clr_mapped() are called when the device HW irqs are + mapped into or unmapped from the guest IRQ number space. The KVM + device extends these helpers to clear the ESB pages of the guest IRQ + number being mapped and then lets the VM fault handler repopulate. + The handler will insert the ESB page corresponding to the HW + interrupt of the device being passed-through or the initial IPI ESB + page if the device has been removed. + + The ESB remapping is fully transparent to the guest and the OS + device driver. All handling is done within VFIO and the above + helpers in KVM-PPC. + +* Groups: + + 1.
KVM_DEV_XIVE_GRP_CTRL + Provides global controls on the device + Attributes: + 1.1 KVM_DEV_XIVE_RESET (write only) + Resets the interrupt controller configuration for sources and event + queues. To be used by kexec and kdump. + Errors: none + + 1.2 KVM_DEV_XIVE_EQ_SYNC (write only) + Sync all the sources and queues and mark the EQ pages dirty. This + to make sure that a consistent memory state is captured when + migrating the VM. + Errors: none + + 2. KVM_DEV_XIVE_GRP_SOURCE (write only) + Initializes a new source in the XIVE device and mask it. + Attributes: + Interrupt source number (64-bit) + The kvm_device_attr.addr points to a __u64 value: + bits: | 63 .... 2 | 1 | 0 + values: | unused | level | type + - type: 0:MSI 1:LSI + - level: assertion level in case of an LSI. + Errors: + -E2BIG: Interrupt source number is out of range + -ENOMEM: Could not create a new source block + -EFAULT: Invalid user pointer for attr->addr. + -ENXIO: Could not allocate underlying HW interrupt + + 3. KVM_DEV_XIVE_GRP_SOURCE_CONFIG (write only) + Configures source targeting + Attributes: + Interrupt source number (64-bit) + The kvm_device_attr.addr points to a __u64 value: + bits: | 63 .... 33 | 32 | 31 .. 3 | 2 .. 0 + values: | eisn | mask | server | priority + - priority: 0-7 interrupt priority level + - server: CPU number chosen to handle the interrupt + - mask: mask flag (unused) + - eisn: Effective Interrupt Source Number + Errors: + -ENOENT: Unknown source number + -EINVAL: Not initialized source number + -EINVAL: Invalid priority + -EINVAL: Invalid CPU number. + -EFAULT: Invalid user pointer for attr->addr. + -ENXIO: CPU event queues not configured or configuration of the + underlying HW interrupt failed + -EBUSY: No CPU available to serve interrupt + + 4. KVM_DEV_XIVE_GRP_EQ_CONFIG (read-write) + Configures an event queue of a CPU + Attributes: + EQ descriptor identifier (64-bit) + The EQ descriptor identifier is a tuple (server, priority) : + bits: | 63 .... 32 | 31 .. 
3 | 2 .. 0 + values: | unused | server | priority + The kvm_device_attr.addr points to : + struct kvm_ppc_xive_eq { + __u32 flags; + __u32 qshift; + __u64 qaddr; + __u32 qtoggle; + __u32 qindex; + __u8 pad[40]; + }; + - flags: queue flags + KVM_XIVE_EQ_ALWAYS_NOTIFY (required) + forces notification without using the coalescing mechanism + provided by the XIVE END ESBs. + - qshift: queue size (power of 2) + - qaddr: real address of queue + - qtoggle: current queue toggle bit + - qindex: current queue index + - pad: reserved for future use + Errors: + -ENOENT: Invalid CPU number + -EINVAL: Invalid priority + -EINVAL: Invalid flags + -EINVAL: Invalid queue size + -EINVAL: Invalid queue address + -EFAULT: Invalid user pointer for attr->addr. + -EIO: Configuration of the underlying HW failed + + 5. KVM_DEV_XIVE_GRP_SOURCE_SYNC (write only) + Synchronize the source to flush event notifications + Attributes: + Interrupt source number (64-bit) + Errors: + -ENOENT: Unknown source number + -EINVAL: Not initialized source number + +* VCPU state + + The XIVE IC maintains VP interrupt state in an internal structure + called the NVT. When a VP is not dispatched on a HW processor + thread, this structure can be updated by HW if the VP is the target + of an event notification. + + It is important for migration to capture the cached IPB from the NVT + as it synthesizes the priorities of the pending interrupts. We + capture a bit more to report debug information. + + KVM_REG_PPC_VP_STATE (2 * 64bits) + bits: | 63 .... 32 | 31 .... 0 | + values: | TIMA word0 | TIMA word1 | + bits: | 127 .......... 64 | + values: | unused | + +* Migration: + + Saving the state of a VM using the XIVE native exploitation mode + should follow a specific sequence. When the VM is stopped : + + 1. Mask all sources (PQ=01) to stop the flow of events. + + 2. Sync the XIVE device with the KVM control KVM_DEV_XIVE_EQ_SYNC to + flush any in-flight event notification and to stabilize the EQs. 
At + this stage, the EQ pages are marked dirty to make sure they are + transferred in the migration sequence. + + 3. Capture the state of the source targeting, the EQs configuration + and the state of thread interrupt context registers. + + Restore is similar : + + 1. Restore the EQ configuration, as targeting depends on it. + 2. Restore targeting + 3. Restore the thread interrupt contexts + 4. Restore the source states + 5. Let the vCPU run diff --git a/Documentation/virt/kvm/halt-polling.txt b/Documentation/virt/kvm/halt-polling.txt new file mode 100644 index 000000000000..4f791b128dd2 --- /dev/null +++ b/Documentation/virt/kvm/halt-polling.txt @@ -0,0 +1,136 @@ +The KVM halt polling system +=========================== + +The KVM halt polling system provides a feature within KVM whereby the latency +of a guest can, under some circumstances, be reduced by polling in the host +for some time period after the guest has elected to no longer run by ceding. +That is, when a guest vcpu has ceded, or in the case of powerpc when all of the +vcpus of a single vcore have ceded, the host kernel polls for wakeup conditions +before giving up the cpu to the scheduler in order to let something else run. + +Polling provides a latency advantage in cases where the guest can be run again +very quickly by at least saving us a trip through the scheduler, normally on +the order of a few micro-seconds, although performance benefits are workload +dependent. In the event that no wakeup source arrives during the polling +interval or some other task on the runqueue is runnable the scheduler is +invoked. Thus halt polling is especially useful on workloads with very short +wakeup periods where the time spent halt polling is minimised and the time +savings of not invoking the scheduler are distinguishable.
+ +The generic halt polling code is implemented in: + + virt/kvm/kvm_main.c: kvm_vcpu_block() + +The powerpc kvm-hv specific case is implemented in: + + arch/powerpc/kvm/book3s_hv.c: kvmppc_vcore_blocked() + +Halt Polling Interval +===================== + +The maximum time for which to poll before invoking the scheduler, referred to +as the halt polling interval, is increased and decreased based on the perceived +effectiveness of the polling in an attempt to limit pointless polling. +This value is stored in either the vcpu struct: + + kvm_vcpu->halt_poll_ns + +or in the case of powerpc kvm-hv, in the vcore struct: + + kvmppc_vcore->halt_poll_ns + +Thus this is a per vcpu (or vcore) value. + +During polling if a wakeup source is received within the halt polling interval, +the interval is left unchanged. In the event that a wakeup source isn't +received during the polling interval (and thus schedule is invoked) there are +two options, either the polling interval and total block time[0] were less than +the global max polling interval (see module params below), or the total block +time was greater than the global max polling interval. + +In the event that both the polling interval and total block time were less than +the global max polling interval then the polling interval can be increased in +the hope that next time during the longer polling interval the wake up source +will be received while the host is polling and the latency benefits will be +received. The polling interval is grown in the function grow_halt_poll_ns() and +is multiplied by the module parameters halt_poll_ns_grow and +halt_poll_ns_grow_start. + +In the event that the total block time was greater than the global max polling +interval then the host will never poll for long enough (limited by the global +max) to wakeup during the polling interval so it may as well be shrunk in order +to avoid pointless polling. 
The polling interval is shrunk in the function +shrink_halt_poll_ns() and is divided by the module parameter +halt_poll_ns_shrink, or set to 0 iff halt_poll_ns_shrink == 0. + +It is worth noting that this adjustment process attempts to home in on some +steady state polling interval but will only really do a good job for wakeups +which come at an approximately constant rate, otherwise there will be constant +adjustment of the polling interval. + +[0] total block time: the time between when the halt polling function is + invoked and a wakeup source received (irrespective of + whether the scheduler is invoked within that function). + +Module Parameters +================= + +The kvm module has 4 tuneable module parameters to adjust the global max +polling interval as well as the rate at which the polling interval is grown and +shrunk. These variables are defined in include/linux/kvm_host.h and as module +parameters in virt/kvm/kvm_main.c, or arch/powerpc/kvm/book3s_hv.c in the +powerpc kvm-hv case. + +Module Parameter | Description | Default Value +-------------------------------------------------------------------------------- +halt_poll_ns | The global max polling | KVM_HALT_POLL_NS_DEFAULT + | interval which defines | + | the ceiling value of the | + | polling interval for | (per arch value) + | each vcpu. | +-------------------------------------------------------------------------------- +halt_poll_ns_grow | The value by which the | 2 + | halt polling interval is | + | multiplied in the | + | grow_halt_poll_ns() | + | function. | +-------------------------------------------------------------------------------- +halt_poll_ns_grow_start | The initial value to grow | 10000 + | to from zero in the | + | grow_halt_poll_ns() | + | function. | +-------------------------------------------------------------------------------- +halt_poll_ns_shrink | The value by which the | 0 + | halt polling interval is | + | divided in the | + | shrink_halt_poll_ns() | + | function.
| +-------------------------------------------------------------------------------- + +These module parameters can be set from the sysfs files in: + + /sys/module/kvm/parameters/ + +Note that these module parameters are system wide values and are not able to + be tuned on a per vm basis. + +Further Notes +============= + +- Care should be taken when setting the halt_poll_ns module parameter as a +large value has the potential to drive the cpu usage to 100% on a machine which +would be almost entirely idle otherwise. This is because even if a guest has +wakeups during which very little work is done and which are quite far apart, if +the period is shorter than the global max polling interval (halt_poll_ns) then +the host will always poll for the entire block time and thus cpu utilisation +will go to 100%. + +- Halt polling essentially presents a trade off between power usage and latency +and the module parameters should be used to tune the affinity for this. Idle +cpu time is essentially converted to host kernel time with the aim of decreasing +latency when entering the guest. + +- Halt polling will only be conducted by the host when no other tasks are +runnable on that cpu, otherwise the polling will cease immediately and +schedule will be invoked to allow that other task to run. Thus this doesn't +allow a guest to mount a denial of service attack on the cpu. diff --git a/Documentation/virt/kvm/hypercalls.txt b/Documentation/virt/kvm/hypercalls.txt new file mode 100644 index 000000000000..5f6d291bd004 --- /dev/null +++ b/Documentation/virt/kvm/hypercalls.txt @@ -0,0 +1,154 @@ +Linux KVM Hypercall: +=================== +X86: + KVM Hypercalls have a three-byte sequence of either the vmcall or the vmmcall + instruction. The hypervisor can replace it with instructions that are + guaranteed to be supported. + + Up to four arguments may be passed in rbx, rcx, rdx, and rsi respectively. + The hypercall number should be placed in rax and the return value will be + placed in rax.
No other registers will be clobbered unless explicitly stated + by the particular hypercall. + +S390: + R2-R7 are used for parameters 1-6. In addition, R1 is used for hypercall + number. The return value is written to R2. + + S390 uses diagnose instruction as hypercall (0x500) along with hypercall + number in R1. + + For further information on the S390 diagnose call as supported by KVM, + refer to Documentation/virt/kvm/s390-diag.txt. + + PowerPC: + It uses R3-R10 and hypercall number in R11. R4-R11 are used as output registers. + Return value is placed in R3. + + KVM hypercalls uses 4 byte opcode, that are patched with 'hypercall-instructions' + property inside the device tree's /hypervisor node. + For more information refer to Documentation/virt/kvm/ppc-pv.txt + +MIPS: + KVM hypercalls use the HYPCALL instruction with code 0 and the hypercall + number in $2 (v0). Up to four arguments may be placed in $4-$7 (a0-a3) and + the return value is placed in $2 (v0). + +KVM Hypercalls Documentation +=========================== +The template for each hypercall is: +1. Hypercall name. +2. Architecture(s) +3. Status (deprecated, obsolete, active) +4. Purpose + +1. KVM_HC_VAPIC_POLL_IRQ +------------------------ +Architecture: x86 +Status: active +Purpose: Trigger guest exit so that the host can check for pending +interrupts on reentry. + +2. KVM_HC_MMU_OP +------------------------ +Architecture: x86 +Status: deprecated. +Purpose: Support MMU operations such as writing to PTE, +flushing TLB, release PT. + +3. KVM_HC_FEATURES +------------------------ +Architecture: PPC +Status: active +Purpose: Expose hypercall availability to the guest. On x86 platforms, cpuid +used to enumerate which hypercalls are available. On PPC, either device tree +based lookup ( which is also what EPAPR dictates) OR KVM specific enumeration +mechanism (which is this hypercall) can be used. + +4. 
KVM_HC_PPC_MAP_MAGIC_PAGE +------------------------ +Architecture: PPC +Status: active +Purpose: To enable communication between the hypervisor and guest there is a +shared page that contains parts of supervisor visible register state. +The guest can map this shared page to access its supervisor register through +memory using this hypercall. + +5. KVM_HC_KICK_CPU +------------------------ +Architecture: x86 +Status: active +Purpose: Hypercall used to wakeup a vcpu from HLT state +Usage example : A vcpu of a paravirtualized guest that is busywaiting in guest +kernel mode for an event to occur (ex: a spinlock to become available) can +execute HLT instruction once it has busy-waited for more than a threshold +time-interval. Execution of HLT instruction would cause the hypervisor to put +the vcpu to sleep until occurrence of an appropriate event. Another vcpu of the +same guest can wakeup the sleeping vcpu by issuing KVM_HC_KICK_CPU hypercall, +specifying APIC ID (a1) of the vcpu to be woken up. An additional argument (a0) +is reserved in the hypercall for future use. + + +6. KVM_HC_CLOCK_PAIRING +------------------------ +Architecture: x86 +Status: active +Purpose: Hypercall used to synchronize host and guest clocks. +Usage: + +a0: guest physical address where host copies +"struct kvm_clock_pairing" structure. + +a1: clock_type, ATM only KVM_CLOCK_PAIRING_WALLCLOCK (0) +is supported (corresponding to the host's CLOCK_REALTIME clock). + + struct kvm_clock_pairing { + __s64 sec; + __s64 nsec; + __u64 tsc; + __u32 flags; + __u32 pad[9]; + }; + + Where: + * sec: seconds from clock_type clock. + * nsec: nanoseconds from clock_type clock. + * tsc: guest TSC value used to calculate sec/nsec pair + * flags: flags, unused (0) at the moment. + +The hypercall lets a guest compute a precise timestamp across +host and guest. The guest can use the returned TSC value to +compute the CLOCK_REALTIME for its clock, at the same instant.
+ +Returns KVM_EOPNOTSUPP if the host does not use TSC clocksource, +or if clock type is different than KVM_CLOCK_PAIRING_WALLCLOCK. + +7. KVM_HC_SEND_IPI +------------------------ +Architecture: x86 +Status: active +Purpose: Send IPIs to multiple vCPUs. + +a0: lower part of the bitmap of destination APIC IDs +a1: higher part of the bitmap of destination APIC IDs +a2: the lowest APIC ID in bitmap +a3: APIC ICR + +The hypercall lets a guest send multicast IPIs, with at most +128 destinations per hypercall in 64-bit mode and 64 vCPUs per +hypercall in 32-bit mode. The destinations are represented by a +bitmap contained in the first two arguments (a0 and a1). Bit 0 of +a0 corresponds to the APIC ID in the third argument (a2), bit 1 +corresponds to the APIC ID a2+1, and so on. + +Returns the number of CPUs to which the IPIs were delivered successfully. + +8. KVM_HC_SCHED_YIELD +------------------------ +Architecture: x86 +Status: active +Purpose: Hypercall used to yield if the IPI target vCPU is preempted + +a0: destination APIC ID + +Usage example: When sending a call-function IPI-many to vCPUs, yield if +any of the IPI target vCPUs was preempted. diff --git a/Documentation/virt/kvm/index.rst b/Documentation/virt/kvm/index.rst new file mode 100644 index 000000000000..0b206a06f5be --- /dev/null +++ b/Documentation/virt/kvm/index.rst @@ -0,0 +1,11 @@ +.. SPDX-License-Identifier: GPL-2.0 + +=== +KVM +=== + +.. toctree:: + :maxdepth: 2 + + amd-memory-encryption + cpuid diff --git a/Documentation/virt/kvm/locking.txt b/Documentation/virt/kvm/locking.txt new file mode 100644 index 000000000000..635cd6eaf714 --- /dev/null +++ b/Documentation/virt/kvm/locking.txt @@ -0,0 +1,215 @@ +KVM Lock Overview +================= + +1.
Acquisition Orders +--------------------- + +The acquisition orders for mutexes are as follows: + +- kvm->lock is taken outside vcpu->mutex + +- kvm->lock is taken outside kvm->slots_lock and kvm->irq_lock + +- kvm->slots_lock is taken outside kvm->irq_lock, though acquiring + them together is quite rare. + +On x86, vcpu->mutex is taken outside kvm->arch.hyperv.hv_lock. + +Everything else is a leaf: no other lock is taken inside the critical +sections. + +2: Exception +------------ + +Fast page fault: + +Fast page fault is the fast path which fixes the guest page fault out of +the mmu-lock on x86. Currently, the page fault can be fast in one of the +following two cases: + +1. Access Tracking: The SPTE is not present, but it is marked for access +tracking i.e. the SPTE_SPECIAL_MASK is set. That means we need to +restore the saved R/X bits. This is described in more detail later below. + +2. Write-Protection: The SPTE is present and the fault is +caused by write-protect. That means we just need to change the W bit of the +spte. + +What we use to avoid all the race is the SPTE_HOST_WRITEABLE bit and +SPTE_MMU_WRITEABLE bit on the spte: +- SPTE_HOST_WRITEABLE means the gfn is writable on host. +- SPTE_MMU_WRITEABLE means the gfn is writable on mmu. The bit is set when + the gfn is writable on guest mmu and it is not write-protected by shadow + page write-protection. + +On fast page fault path, we will use cmpxchg to atomically set the spte W +bit if spte.SPTE_HOST_WRITEABLE = 1 and spte.SPTE_WRITE_PROTECT = 1, or +restore the saved R/X bits if VMX_EPT_TRACK_ACCESS mask is set, or both. This +is safe because whenever changing these bits can be detected by cmpxchg. + +But we need carefully check these cases: +1): The mapping from gfn to pfn +The mapping from gfn to pfn may be changed since we can only ensure the pfn +is not changed during cmpxchg. 
This is an ABA problem, for example, below case +will happen: + +At the beginning: +gpte = gfn1 +gfn1 is mapped to pfn1 on host +spte is the shadow page table entry corresponding with gpte and +spte = pfn1 + + VCPU 0 VCPU 1 +on fast page fault path: + + old_spte = *spte; + pfn1 is swapped out: + spte = 0; + + pfn1 is re-alloced for gfn2. + + gpte is changed to point to + gfn2 by the guest: + spte = pfn1; + + if (cmpxchg(spte, old_spte, old_spte+W) + mark_page_dirty(vcpu->kvm, gfn1) + OOPS!!! + +We dirty-log for gfn1, that means gfn2 is lost in dirty-bitmap. + +For direct sp, we can easily avoid it since the spte of direct sp is fixed +to gfn. For indirect sp, before we do cmpxchg, we call gfn_to_pfn_atomic() +to pin gfn to pfn, because after gfn_to_pfn_atomic(): +- We have held the refcount of pfn that means the pfn can not be freed and + be reused for another gfn. +- The pfn is writable that means it can not be shared between different gfns + by KSM. + +Then, we can ensure the dirty bitmap is correctly set for a gfn. + +Currently, to simplify things, we disable fast page fault for +indirect shadow page. + +2): Dirty bit tracking +In the original code, the spte can be fast updated (non-atomically) if the +spte is read-only and the Accessed bit has already been set since the +Accessed bit and Dirty bit can not be lost. + +But it is not true after fast page fault since the spte can be marked +writable between reading spte and updating spte. Like below case: + +At the beginning: +spte.W = 0 +spte.Accessed = 1 + + VCPU 0 VCPU 1 +In mmu_spte_clear_track_bits(): + + old_spte = *spte; + + /* 'if' condition is satisfied. */ + if (old_spte.Accessed == 1 && + old_spte.W == 0) + spte = 0ull; + on fast page fault path: + spte.W = 1 + memory write on the spte: + spte.Dirty = 1 + + + else + old_spte = xchg(spte, 0ull) + + + if (old_spte.Accessed == 1) + kvm_set_pfn_accessed(spte.pfn); + if (old_spte.Dirty == 1) + kvm_set_pfn_dirty(spte.pfn); + OOPS!!!
+ +The Dirty bit is lost in this case. + +In order to avoid this kind of issue, we always treat the spte as "volatile" +if it can be updated out of mmu-lock, see spte_has_volatile_bits(), it means, +the spte is always atomically updated in this case. + +3): flush tlbs due to spte updated +If the spte is updated from writable to readonly, we should flush all TLBs, +otherwise rmap_write_protect will find a read-only spte, even though the +writable spte might be cached on a CPU's TLB. + +As mentioned before, the spte can be updated to writable out of mmu-lock on +fast page fault path, in order to easily audit the path, we see if TLBs need +be flushed caused by this reason in mmu_spte_update() since this is a common +function to update spte (present -> present). + +Since the spte is "volatile" if it can be updated out of mmu-lock, we always +atomically update the spte, the race caused by fast page fault can be avoided, +See the comments in spte_has_volatile_bits() and mmu_spte_update(). + +Lockless Access Tracking: + +This is used for Intel CPUs that are using EPT but do not support the EPT A/D +bits. In this case, when the KVM MMU notifier is called to track accesses to a +page (via kvm_mmu_notifier_clear_flush_young), it marks the PTE as not-present +by clearing the RWX bits in the PTE and storing the original R & X bits in +some unused/ignored bits. In addition, the SPTE_SPECIAL_MASK is also set on the +PTE (using the ignored bit 62). When the VM tries to access the page later on, +a fault is generated and the fast page fault mechanism described above is used +to atomically restore the PTE to a Present state. The W bit is not saved when +the PTE is marked for access tracking and during restoration to the Present +state, the W bit is set depending on whether or not it was a write access. If +it wasn't, then the W bit will remain clear until a write access happens, at +which time it will be set using the Dirty tracking mechanism described above. + +3. 
Reference +------------ + +Name: kvm_lock +Type: mutex +Arch: any +Protects: - vm_list + +Name: kvm_count_lock +Type: raw_spinlock_t +Arch: any +Protects: - hardware virtualization enable/disable +Comment: 'raw' because hardware enabling/disabling must be atomic /wrt + migration. + +Name: kvm_arch::tsc_write_lock +Type: raw_spinlock +Arch: x86 +Protects: - kvm_arch::{last_tsc_write,last_tsc_nsec,last_tsc_offset} + - tsc offset in vmcb +Comment: 'raw' because updating the tsc offsets must not be preempted. + +Name: kvm->mmu_lock +Type: spinlock_t +Arch: any +Protects: -shadow page/shadow tlb entry +Comment: it is a spinlock since it is used in mmu notifier. + +Name: kvm->srcu +Type: srcu lock +Arch: any +Protects: - kvm->memslots + - kvm->buses +Comment: The srcu read lock must be held while accessing memslots (e.g. + when using gfn_to_* functions) and while accessing in-kernel + MMIO/PIO address->device structure mapping (kvm->buses). + The srcu index can be stored in kvm_vcpu->srcu_idx per vcpu + if it is needed by multiple functions. + +Name: blocked_vcpu_on_cpu_lock +Type: spinlock_t +Arch: x86 +Protects: blocked_vcpu_on_cpu +Comment: This is a per-CPU lock and it is used for VT-d posted-interrupts. + When VT-d posted-interrupts is supported and the VM has assigned + devices, we put the blocked vCPU on the list blocked_vcpu_on_cpu + protected by blocked_vcpu_on_cpu_lock, when VT-d hardware issues + wakeup notification event since external interrupts from the + assigned devices happens, we will find the vCPU on the list to + wakeup. 
diff --git a/Documentation/virt/kvm/mmu.txt b/Documentation/virt/kvm/mmu.txt new file mode 100644 index 000000000000..1b9880dfba0a --- /dev/null +++ b/Documentation/virt/kvm/mmu.txt @@ -0,0 +1,449 @@ +The x86 kvm shadow mmu +====================== + +The mmu (in arch/x86/kvm, files mmu.[ch] and paging_tmpl.h) is responsible +for presenting a standard x86 mmu to the guest, while translating guest +physical addresses to host physical addresses. + +The mmu code attempts to satisfy the following requirements: + +- correctness: the guest should not be able to determine that it is running + on an emulated mmu except for timing (we attempt to comply + with the specification, not emulate the characteristics of + a particular implementation such as tlb size) +- security: the guest must not be able to touch host memory not assigned + to it +- performance: minimize the performance penalty imposed by the mmu +- scaling: need to scale to large memory and large vcpu guests +- hardware: support the full range of x86 virtualization hardware +- integration: Linux memory management code must be in control of guest memory + so that swapping, page migration, page merging, transparent + hugepages, and similar features work without change +- dirty tracking: report writes to guest memory to enable live migration + and framebuffer-based displays +- footprint: keep the amount of pinned kernel memory low (most memory + should be shrinkable) +- reliability: avoid multipage or GFP_ATOMIC allocations + +Acronyms +======== + +pfn host page frame number +hpa host physical address +hva host virtual address +gfn guest frame number +gpa guest physical address +gva guest virtual address +ngpa nested guest physical address +ngva nested guest virtual address +pte page table entry (used also to refer generically to paging structure + entries) +gpte guest pte (referring to gfns) +spte shadow pte (referring to pfns) +tdp two dimensional paging (vendor neutral term for NPT and EPT) + +Virtual and real 
hardware supported +=================================== + +The mmu supports first-generation mmu hardware, which allows an atomic switch +of the current paging mode and cr3 during guest entry, as well as +two-dimensional paging (AMD's NPT and Intel's EPT). The emulated hardware +it exposes is the traditional 2/3/4 level x86 mmu, with support for global +pages, pae, pse, pse36, cr0.wp, and 1GB pages. The mmu is also +able to expose NPT capable hardware on NPT capable hosts. + +Translation +=========== + +The primary job of the mmu is to program the processor's mmu to translate +addresses for the guest. Different translations are required at different +times: + +- when guest paging is disabled, we translate guest physical addresses to + host physical addresses (gpa->hpa) +- when guest paging is enabled, we translate guest virtual addresses, to + guest physical addresses, to host physical addresses (gva->gpa->hpa) +- when the guest launches a guest of its own, we translate nested guest + virtual addresses, to nested guest physical addresses, to guest physical + addresses, to host physical addresses (ngva->ngpa->gpa->hpa) + +The primary challenge is to encode between 1 and 3 translations into hardware +that supports only 1 (traditional) and 2 (tdp) translations. When the +number of required translations matches the hardware, the mmu operates in +direct mode; otherwise it operates in shadow mode (see below). + +Memory +====== + +Guest memory (gpa) is part of the user address space of the process that is +using kvm. Userspace defines the translation between guest addresses and user +addresses (gpa->hva); note that two gpas may alias to the same hva, but not +vice versa. + +These hvas may be backed using any method available to the host: anonymous +memory, file backed memory, and device memory. Memory might be paged by the +host at any time. + +Events +====== + +The mmu is driven by events, some from the guest, some from the host.
+ +Guest generated events: +- writes to control registers (especially cr3) +- invlpg/invlpga instruction execution +- access to missing or protected translations + +Host generated events: +- changes in the gpa->hpa translation (either through gpa->hva changes or + through hva->hpa changes) +- memory pressure (the shrinker) + +Shadow pages +============ + +The principal data structure is the shadow page, 'struct kvm_mmu_page'. A +shadow page contains 512 sptes, which can be either leaf or nonleaf sptes. A +shadow page may contain a mix of leaf and nonleaf sptes. + +A nonleaf spte allows the hardware mmu to reach the leaf pages and +is not related to a translation directly. It points to other shadow pages. + +A leaf spte corresponds to either one or two translations encoded into +one paging structure entry. These are always the lowest level of the +translation stack, with optional higher level translations left to NPT/EPT. +Leaf ptes point at guest pages. + +The following table shows translations encoded by leaf ptes, with higher-level +translations in parentheses: + + Non-nested guests: + nonpaging: gpa->hpa + paging: gva->gpa->hpa + paging, tdp: (gva->)gpa->hpa + Nested guests: + non-tdp: ngva->gpa->hpa (*) + tdp: (ngva->)ngpa->gpa->hpa + +(*) the guest hypervisor will encode the ngva->gpa translation into its page + tables if npt is not present + +Shadow pages contain the following information: + role.level: + The level in the shadow paging hierarchy that this shadow page belongs to. + 1=4k sptes, 2=2M sptes, 3=1G sptes, etc. + role.direct: + If set, leaf sptes reachable from this page are for a linear range. + Examples include real mode translation, large guest pages backed by small + host pages, and gpa->hpa translations when NPT or EPT is active. 
+ The linear range starts at (gfn << PAGE_SHIFT) and its size is determined + by role.level (2MB for first level, 1GB for second level, 0.5TB for third + level, 256TB for fourth level) + If clear, this page corresponds to a guest page table denoted by the gfn + field. + role.quadrant: + When role.gpte_is_8_bytes=0, the guest uses 32-bit gptes while the host uses 64-bit + sptes. That means a guest page table contains more ptes than the host, + so multiple shadow pages are needed to shadow one guest page. + For first-level shadow pages, role.quadrant can be 0 or 1 and denotes the + first or second 512-gpte block in the guest page table. For second-level + page tables, each 32-bit gpte is converted to two 64-bit sptes + (since each first-level guest page is shadowed by two first-level + shadow pages) so role.quadrant takes values in the range 0..3. Each + quadrant maps 1GB virtual address space. + role.access: + Inherited guest access permissions in the form uwx. Note execute + permission is positive, not negative. + role.invalid: + The page is invalid and should not be used. It is a root page that is + currently pinned (by a cpu hardware register pointing to it); once it is + unpinned it will be destroyed. + role.gpte_is_8_bytes: + Reflects the size of the guest PTE for which the page is valid, i.e. '1' + if 64-bit gptes are in use, '0' if 32-bit gptes are in use. + role.nxe: + Contains the value of efer.nxe for which the page is valid. + role.cr0_wp: + Contains the value of cr0.wp for which the page is valid. + role.smep_andnot_wp: + Contains the value of cr4.smep && !cr0.wp for which the page is valid + (pages for which this is true are different from other pages; see the + treatment of cr0.wp=0 below). + role.smap_andnot_wp: + Contains the value of cr4.smap && !cr0.wp for which the page is valid + (pages for which this is true are different from other pages; see the + treatment of cr0.wp=0 below). 
+ role.ept_sp: + This is a virtual flag to denote a shadowed nested EPT page. ept_sp + is true if "cr0_wp && smap_andnot_wp", an otherwise invalid combination. + role.smm: + Is 1 if the page is valid in system management mode. This field + determines which of the kvm_memslots array was used to build this + shadow page; it is also used to go back from a struct kvm_mmu_page + to a memslot, through the kvm_memslots_for_spte_role macro and + __gfn_to_memslot. + role.ad_disabled: + Is 1 if the MMU instance cannot use A/D bits. EPT did not have A/D + bits before Haswell; shadow EPT page tables also cannot use A/D bits + if the L1 hypervisor does not enable them. + gfn: + Either the guest page table containing the translations shadowed by this + page, or the base page frame for linear translations. See role.direct. + spt: + A pageful of 64-bit sptes containing the translations for this page. + Accessed by both kvm and hardware. + The page pointed to by spt will have its page->private pointing back + at the shadow page structure. + sptes in spt point either at guest pages, or at lower-level shadow pages. + Specifically, if sp1 and sp2 are shadow pages, then sp1->spt[n] may point + at __pa(sp2->spt). sp2 will point back at sp1 through parent_pte. + The spt array forms a DAG structure with the shadow page as a node, and + guest pages as leaves. + gfns: + An array of 512 guest frame numbers, one for each present pte. Used to + perform a reverse map from a pte to a gfn. When role.direct is set, any + element of this array can be calculated from the gfn field when used, in + this case, the array of gfns is not allocated. See role.direct and gfn. + root_count: + A counter keeping track of how many hardware registers (guest cr3 or + pdptrs) are now pointing at the page. While this counter is nonzero, the + page cannot be destroyed. See role.invalid. + parent_ptes: + The reverse mapping for the pte/ptes pointing at this page's spt. 
If + parent_ptes bit 0 is zero, only one spte points at this page and + parent_ptes points at this single spte, otherwise, there exist multiple + sptes pointing at this page and (parent_ptes & ~0x1) points at a data + structure with a list of parent sptes. + unsync: + If true, then the translations in this page may not match the guest's + translation. This is equivalent to the state of the tlb when a pte is + changed but before the tlb entry is flushed. Accordingly, unsync ptes + are synchronized when the guest executes invlpg or flushes its tlb by + other means. Valid for leaf pages. + unsync_children: + How many sptes in the page point at pages that are unsync (or have + unsynchronized children). + unsync_child_bitmap: + A bitmap indicating which sptes in spt point (directly or indirectly) at + pages that may be unsynchronized. Used to quickly locate all unsynchronized + pages reachable from a given page. + clear_spte_count: + Only present on 32-bit hosts, where a 64-bit spte cannot be written + atomically. The reader uses this while running outside of the MMU lock + to detect in-progress updates and retry them until the writer has + finished the write. + write_flooding_count: + A guest may write to a page table many times, causing a lot of + emulations if the page needs to be write-protected (see "Synchronized + and unsynchronized pages" below). Leaf pages can be unsynchronized + so that they do not trigger frequent emulation, but this is not + possible for non-leafs. This field counts the number of emulations + since the last time the page table was actually used; if emulation + is triggered too frequently on this page, KVM will unmap the page + to avoid emulation in the future. + +Reverse map +=========== + +The mmu maintains a reverse mapping whereby all ptes mapping a page can be +reached given its gfn. This is used, for example, when swapping out a page.
+ +Synchronized and unsynchronized pages +===================================== + +The guest uses two events to synchronize its tlb and page tables: tlb flushes +and page invalidations (invlpg). + +A tlb flush means that we need to synchronize all sptes reachable from the +guest's cr3. This is expensive, so we keep all guest page tables write +protected, and synchronize sptes to gptes when a gpte is written. + +A special case is when a guest page table is reachable from the current +guest cr3. In this case, the guest is obliged to issue an invlpg instruction +before using the translation. We take advantage of that by removing write +protection from the guest page, and allowing the guest to modify it freely. +We synchronize modified gptes when the guest invokes invlpg. This reduces +the amount of emulation we have to do when the guest modifies multiple gptes, +or when a guest page is no longer used as a page table and is used for +random guest data. + +As a side effect we have to resynchronize all reachable unsynchronized shadow +pages on a tlb flush. + + +Reaction to events +================== + +- guest page fault (or npt page fault, or ept violation) + +This is the most complicated event. The cause of a page fault can be: + + - a true guest fault (the guest translation won't allow the access) (*) + - access to a missing translation + - access to a protected translation + - when logging dirty pages, memory is write protected + - synchronized shadow pages are write protected (*) + - access to untranslatable memory (mmio) + + (*) not applicable in direct mode + +Handling a page fault is performed as follows: + + - if the RSV bit of the error code is set, the page fault is caused by guest + accessing MMIO and cached MMIO information is available.
+ - walk shadow page table + - check for valid generation number in the spte (see "Fast invalidation of + MMIO sptes" below) + - cache the information to vcpu->arch.mmio_gva, vcpu->arch.access and + vcpu->arch.mmio_gfn, and call the emulator + - If both P bit and R/W bit of error code are set, this could possibly + be handled as a "fast page fault" (fixed without taking the MMU lock). See + the description in Documentation/virt/kvm/locking.txt. + - if needed, walk the guest page tables to determine the guest translation + (gva->gpa or ngpa->gpa) + - if permissions are insufficient, reflect the fault back to the guest + - determine the host page + - if this is an mmio request, there is no host page; cache the info to + vcpu->arch.mmio_gva, vcpu->arch.access and vcpu->arch.mmio_gfn + - walk the shadow page table to find the spte for the translation, + instantiating missing intermediate page tables as necessary + - If this is an mmio request, cache the mmio info to the spte and set some + reserved bit on the spte (see callers of kvm_mmu_set_mmio_spte_mask) + - try to unsynchronize the page + - if successful, we can let the guest continue and modify the gpte + - emulate the instruction + - if failed, unshadow the page and let the guest continue + - update any translations that were modified by the instruction + +invlpg handling: + + - walk the shadow page hierarchy and drop affected translations + - try to reinstantiate the indicated translation in the hope that the + guest will use it in the near future + +Guest control register updates: + +- mov to cr3 + - look up new shadow roots + - synchronize newly reachable shadow pages + +- mov to cr0/cr4/efer + - set up mmu context for new paging mode + - look up new shadow roots + - synchronize newly reachable shadow pages + +Host translation updates: + + - mmu notifier called with updated hva + - look up affected sptes through reverse map + - drop (or update) translations + +Emulating cr0.wp +================ + +If tdp is 
not enabled, the host must keep cr0.wp=1 so page write protection +works for the guest kernel, not just guest userspace. When the guest +cr0.wp=1, this does not present a problem. However when the guest cr0.wp=0, +we cannot map the permissions for gpte.u=1, gpte.w=0 to any spte (the +semantics require allowing any guest kernel access plus user read access). + +We handle this by mapping the permissions to two possible sptes, depending +on fault type: + +- kernel write fault: spte.u=0, spte.w=1 (allows full kernel access, + disallows user access) +- read fault: spte.u=1, spte.w=0 (allows full read access, disallows kernel + write access) + +(user write faults generate a #PF) + +In the first case there are two additional complications: +- if CR4.SMEP is enabled: since we've turned the page into a kernel page, + the kernel may now execute it. We handle this by also setting spte.nx. + If we get a user fetch or read fault, we'll change spte.u=1 and + spte.nx=gpte.nx back. For this to work, KVM forces EFER.NX to 1 when + shadow paging is in use. +- if CR4.SMAP is disabled: since the page has been changed to a kernel + page, it can not be reused when CR4.SMAP is enabled. We set + CR4.SMAP && !CR0.WP into shadow page's role to avoid this case. Note, + here we do not care about the case that CR4.SMAP is enabled since KVM will + directly inject #PF to guest due to failed permission check. + +To prevent an spte that was converted into a kernel page with cr0.wp=0 +from being written by the kernel after cr0.wp has changed to 1, we make +the value of cr0.wp part of the page role. This means that an spte created +with one value of cr0.wp cannot be used when cr0.wp has a different value - +it will simply be missed by the shadow page lookup code. A similar issue +exists when an spte created with cr0.wp=0 and cr4.smep=0 is used after +changing cr4.smep to 1. To avoid this, the value of !cr0.wp && cr4.smep +is also made a part of the page role.
+ +Large pages +=========== + +The mmu supports all combinations of large and small guest and host pages. +Supported page sizes include 4k, 2M, 4M, and 1G. 4M pages are treated as +two separate 2M pages, on both guest and host, since the mmu always uses PAE +paging. + +To instantiate a large spte, four constraints must be satisfied: + +- the spte must point to a large host page +- the guest pte must be a large pte of at least equivalent size (if tdp is + enabled, there is no guest pte and this condition is satisfied) +- if the spte will be writeable, the large page frame may not overlap any + write-protected pages +- the guest page must be wholly contained by a single memory slot + +To check the last two conditions, the mmu maintains a ->disallow_lpage set of +arrays for each memory slot and large page size. Every write protected page +causes its disallow_lpage to be incremented, thus preventing instantiation of +a large spte. The frames at the end of an unaligned memory slot have +artificially inflated ->disallow_lpages so they can never be instantiated. + +Fast invalidation of MMIO sptes +=============================== + +As mentioned in "Reaction to events" above, kvm will cache MMIO +information in leaf sptes. When a new memslot is added or an existing +memslot is changed, this information may become stale and needs to be +invalidated. This also needs to hold the MMU lock while walking all +shadow pages, and is made more scalable with a similar technique. + +MMIO sptes have a few spare bits, which are used to store a +generation number. The global generation number is stored in +kvm_memslots(kvm)->generation, and increased whenever guest memory info +changes. + +When KVM finds an MMIO spte, it checks the generation number of the spte. +If the generation number of the spte does not equal the global generation +number, it will ignore the cached MMIO information and handle the page +fault through the slow path. 
+ +Since only 19 bits are used to store generation-number on mmio spte, all +pages are zapped when there is an overflow. + +Unfortunately, a single memory access might access kvm_memslots(kvm) multiple +times, the last one happening when the generation number is retrieved and +stored into the MMIO spte. Thus, the MMIO spte might be created based on +out-of-date information, but with an up-to-date generation number. + +To avoid this, the generation number is incremented again after synchronize_srcu +returns; thus, bit 63 of kvm_memslots(kvm)->generation is set to 1 only during a +memslot update, while some SRCU readers might be using the old copy. We do not +want to use an MMIO spte created with an odd generation number, and we can do +this without losing a bit in the MMIO spte. The "update in-progress" bit of the +generation is not stored in MMIO spte, and so is implicitly zero when the +generation is extracted out of the spte. If KVM is unlucky and creates an MMIO +spte while an update is in-progress, the next access to the spte will always be +a cache miss. For example, a subsequent access during the update window will +miss due to the in-progress flag diverging, while an access after the update +window closes will have a higher generation number (as compared to the spte). + + +Further reading +=============== + +- NPT presentation from KVM Forum 2008 + http://www.linux-kvm.org/images/c/c8/KvmForum2008%24kdf2008_21.pdf + diff --git a/Documentation/virt/kvm/msr.txt b/Documentation/virt/kvm/msr.txt new file mode 100644 index 000000000000..df1f4338b3ca --- /dev/null +++ b/Documentation/virt/kvm/msr.txt @@ -0,0 +1,284 @@ +KVM-specific MSRs. +Glauber Costa , Red Hat Inc, 2010 +===================================================== + +KVM makes use of some custom MSRs to service some requests. + +Custom MSRs have a range reserved for them, that goes from +0x4b564d00 to 0x4b564dff. There are MSRs outside this area, +but they are deprecated and their use is discouraged.
+ +Custom MSR list +-------- + +The current supported Custom MSR list is: + +MSR_KVM_WALL_CLOCK_NEW: 0x4b564d00 + + data: 4-byte aligned physical address of a memory area which must be + in guest RAM. This memory is expected to hold a copy of the following + structure: + + struct pvclock_wall_clock { + u32 version; + u32 sec; + u32 nsec; + } __attribute__((__packed__)); + + whose data will be filled in by the hypervisor. The hypervisor is only + guaranteed to update this data at the moment of MSR write. + Users that want to reliably query this information more than once have + to write more than once to this MSR. Fields have the following meanings: + + version: guest has to check version before and after grabbing + time information and check that they are both equal and even. + An odd version indicates an in-progress update. + + sec: number of seconds for wallclock at time of boot. + + nsec: number of nanoseconds for wallclock at time of boot. + + In order to get the current wallclock time, the system_time from + MSR_KVM_SYSTEM_TIME_NEW needs to be added. + + Note that although MSRs are per-CPU entities, the effect of this + particular MSR is global. + + Availability of this MSR must be checked via bit 3 in 0x40000001 cpuid + leaf prior to usage. + +MSR_KVM_SYSTEM_TIME_NEW: 0x4b564d01 + + data: 4-byte aligned physical address of a memory area which must be in + guest RAM, plus an enable bit in bit 0. This memory is expected to hold + a copy of the following structure: + + struct pvclock_vcpu_time_info { + u32 version; + u32 pad0; + u64 tsc_timestamp; + u64 system_time; + u32 tsc_to_system_mul; + s8 tsc_shift; + u8 flags; + u8 pad[2]; + } __attribute__((__packed__)); /* 32 bytes */ + + whose data will be filled in by the hypervisor periodically. Only one + write, or registration, is needed for each VCPU. The interval between + updates of this structure is arbitrary and implementation-dependent.
+ The hypervisor may update this structure at any time it sees fit until + anything with bit0 == 0 is written to it. + + Fields have the following meanings: + + version: guest has to check version before and after grabbing + time information and check that they are both equal and even. + An odd version indicates an in-progress update. + + tsc_timestamp: the tsc value at the current VCPU at the time + of the update of this structure. Guests can subtract this value + from current tsc to derive a notion of elapsed time since the + structure update. + + system_time: a host notion of monotonic time, including sleep + time at the time this structure was last updated. Unit is + nanoseconds. + + tsc_to_system_mul: multiplier to be used when converting + tsc-related quantity to nanoseconds + + tsc_shift: shift to be used when converting tsc-related + quantity to nanoseconds. This shift will ensure that + multiplication with tsc_to_system_mul does not overflow. + A positive value denotes a left shift, a negative value + a right shift. + + The conversion from tsc to nanoseconds involves an additional + right shift by 32 bits. With this information, guests can + derive per-CPU time by doing: + + time = (current_tsc - tsc_timestamp) + if (tsc_shift >= 0) + time <<= tsc_shift; + else + time >>= -tsc_shift; + time = (time * tsc_to_system_mul) >> 32 + time = time + system_time + + flags: bits in this field indicate extended capabilities + coordinated between the guest and the hypervisor. Availability + of specific flags has to be checked in 0x40000001 cpuid leaf. 
+ Current flags are: + + flag bit | cpuid bit | meaning + ------------------------------------------------------------- + | | time measures taken across + 0 | 24 | multiple cpus are guaranteed to + | | be monotonic + ------------------------------------------------------------- + | | guest vcpu has been paused by + 1 | N/A | the host + | | See 4.70 in api.txt + ------------------------------------------------------------- + + Availability of this MSR must be checked via bit 3 in 0x40000001 cpuid + leaf prior to usage. + + +MSR_KVM_WALL_CLOCK: 0x11 + + data and functioning: same as MSR_KVM_WALL_CLOCK_NEW. Use that instead. + + This MSR falls outside the reserved KVM range and may be removed in the + future. Its usage is deprecated. + + Availability of this MSR must be checked via bit 0 in 0x40000001 cpuid + leaf prior to usage. + +MSR_KVM_SYSTEM_TIME: 0x12 + + data and functioning: same as MSR_KVM_SYSTEM_TIME_NEW. Use that instead. + + This MSR falls outside the reserved KVM range and may be removed in the + future. Its usage is deprecated. + + Availability of this MSR must be checked via bit 0 in 0x40000001 cpuid + leaf prior to usage. + + The suggested algorithm for detecting kvmclock presence is then: + + if (!kvm_para_available()) /* refer to cpuid.txt */ + return NON_PRESENT; + + flags = cpuid_eax(0x40000001); + if (flags & (1 << 3)) { + msr_kvm_system_time = MSR_KVM_SYSTEM_TIME_NEW; + msr_kvm_wall_clock = MSR_KVM_WALL_CLOCK_NEW; + return PRESENT; + } else if (flags & (1 << 0)) { + msr_kvm_system_time = MSR_KVM_SYSTEM_TIME; + msr_kvm_wall_clock = MSR_KVM_WALL_CLOCK; + return PRESENT; + } else + return NON_PRESENT; + +MSR_KVM_ASYNC_PF_EN: 0x4b564d02 + data: Bits 63-6 hold 64-byte aligned physical address of a + 64 byte memory area which must be in guest RAM and must be + zeroed. Bits 5-3 are reserved and should be zero. Bit 0 is 1 + when asynchronous page faults are enabled on the vcpu, 0 when + disabled.
Bit 1 is 1 if asynchronous page faults can be injected + when vcpu is in cpl == 0. Bit 2 is 1 if asynchronous page faults + are delivered to L1 as #PF vmexits. Bit 2 can be set only if + KVM_FEATURE_ASYNC_PF_VMEXIT is present in CPUID. + + The first 4 bytes of the 64-byte memory location will be written to by + the hypervisor at the time of asynchronous page fault (APF) + injection to indicate type of asynchronous page fault. Value + of 1 means that the page referred to by the page fault is not + present. Value 2 means that the page is now available. Disabling + interrupts inhibits APFs. Guest must not enable interrupts + before the reason is read, or it may be overwritten by another + APF. Since APF uses the same exception vector as regular page + fault guest must reset the reason to 0 before it does + something that can generate normal page fault. If during page + fault APF reason is 0 it means that this is regular page + fault. + + During delivery of type 1 APF cr2 contains a token that will + be used to notify a guest when missing page becomes + available. When page becomes available type 2 APF is sent with + cr2 set to the token associated with the page. There is special + kind of token 0xffffffff which tells vcpu that it should wake + up all processes waiting for APFs and no individual type 2 APFs + will be sent. + + If APF is disabled while there are outstanding APFs, they will + not be delivered. + + Currently type 2 APF will be always delivered on the same vcpu as + type 1 was, but guest should not rely on that. + +MSR_KVM_STEAL_TIME: 0x4b564d03 + + data: 64-byte aligned physical address of a memory area which must be + in guest RAM, plus an enable bit in bit 0. This memory is expected to + hold a copy of the following structure: + + struct kvm_steal_time { + __u64 steal; + __u32 version; + __u32 flags; + __u8 preempted; + __u8 u8_pad[3]; + __u32 pad[11]; + } + + whose data will be filled in by the hypervisor periodically.
Only one + write, or registration, is needed for each VCPU. The interval between + updates of this structure is arbitrary and implementation-dependent. + The hypervisor may update this structure at any time it sees fit until + anything with bit0 == 0 is written to it. Guest is required to make sure + this structure is initialized to zero. + + Fields have the following meanings: + + version: a sequence counter. In other words, guest has to check + this field before and after grabbing time information and make + sure they are both equal and even. An odd version indicates an + in-progress update. + + flags: At this point, always zero. May be used to indicate + changes in this structure in the future. + + steal: the amount of time in which this vCPU did not run, in + nanoseconds. Time during which the vcpu is idle, will not be + reported as steal time. + + preempted: indicates whether the vCPU that owns this struct is running or + not. Non-zero values mean the vCPU has been preempted. Zero + means the vCPU is not preempted. NOTE, it is always zero if + the hypervisor doesn't support this field. + +MSR_KVM_EOI_EN: 0x4b564d04 + data: Bit 0 is 1 when PV end of interrupt is enabled on the vcpu; 0 + when disabled. Bit 1 is reserved and must be zero. When PV end of + interrupt is enabled (bit 0 set), bits 63-2 hold a 4-byte aligned + physical address of a 4 byte memory area which must be in guest RAM and + must be zeroed. + + The first, least significant bit of 4 byte memory location will be + written to by the hypervisor, typically at the time of interrupt + injection. Value of 1 means that guest can skip writing EOI to the apic + (using MSR or MMIO write); instead, it is sufficient to signal + EOI by clearing the bit in guest memory - this location will + later be polled by the hypervisor. + Value of 0 means that the EOI write is required. + + It is always safe for the guest to ignore the optimization and perform + the APIC EOI write anyway.
+ + Hypervisor is guaranteed to only modify this least + significant bit while in the current VCPU context, this means that + guest does not need to use either lock prefix or memory ordering + primitives to synchronise with the hypervisor. + + However, hypervisor can set and clear this memory bit at any time: + therefore to make sure hypervisor does not interrupt the + guest and clear the least significant bit in the memory area + in the window between guest testing it to detect + whether it can skip EOI apic write and between guest + clearing it to signal EOI to the hypervisor, + guest must both read the least significant bit in the memory area and + clear it using a single CPU instruction, such as test and clear, or + compare and exchange. + +MSR_KVM_POLL_CONTROL: 0x4b564d05 + Control host-side polling. + + data: Bit 0 enables (1) or disables (0) host-side HLT polling logic. + + KVM guests can request the host not to poll on HLT, for example if + they are performing polling themselves. + diff --git a/Documentation/virt/kvm/nested-vmx.txt b/Documentation/virt/kvm/nested-vmx.txt new file mode 100644 index 000000000000..97eb1353e962 --- /dev/null +++ b/Documentation/virt/kvm/nested-vmx.txt @@ -0,0 +1,240 @@ +Nested VMX +========== + +Overview +--------- + +On Intel processors, KVM uses Intel's VMX (Virtual-Machine eXtensions) +to easily and efficiently run guest operating systems. Normally, these guests +*cannot* themselves be hypervisors running their own guests, because in VMX, +guests cannot use VMX instructions. + +The "Nested VMX" feature adds this missing capability - of running guest +hypervisors (which use VMX) with their own nested guests. It does so by +allowing a guest to use VMX instructions, and correctly and efficiently +emulating them using the single level of VMX available in the hardware. 
+ +We describe in much greater detail the theory behind the nested VMX feature, +its implementation and its performance characteristics, in the OSDI 2010 paper +"The Turtles Project: Design and Implementation of Nested Virtualization", +available at: + + http://www.usenix.org/events/osdi10/tech/full_papers/Ben-Yehuda.pdf + + +Terminology +----------- + +Single-level virtualization has two levels - the host (KVM) and the guests. +In nested virtualization, we have three levels: The host (KVM), which we call +L0, the guest hypervisor, which we call L1, and its nested guest, which we +call L2. + + +Running nested VMX +------------------ + +The nested VMX feature is disabled by default. It can be enabled by giving +the "nested=1" option to the kvm-intel module. + +No modifications are required to user space (qemu). However, qemu's default +emulated CPU type (qemu64) does not list the "VMX" CPU feature, so it must be +explicitly enabled, by giving qemu one of the following options: + + -cpu host (emulated CPU has all features of the real CPU) + + -cpu qemu64,+vmx (add just the vmx feature to a named CPU type) + + +ABIs +---- + +Nested VMX aims to present a standard and (eventually) fully-functional VMX +implementation for a guest hypervisor to use. As such, the official +specification of the ABI that it provides is Intel's VMX specification, +namely volume 3B of their "Intel 64 and IA-32 Architectures Software +Developer's Manual". Not all of VMX's features are currently fully supported, +but the goal is to eventually support them all, starting with the VMX features +which are used in practice by popular hypervisors (KVM and others). + +As a VMX implementation, nested VMX presents a VMCS structure to L1. +As mandated by the spec, other than the two fields revision_id and abort, +this structure is *opaque* to its user, who is not supposed to know or care +about its internal structure. Rather, the structure is accessed through the +VMREAD and VMWRITE instructions.
+Still, for debugging purposes, KVM developers might be interested to know the +internals of this structure; This is struct vmcs12 from arch/x86/kvm/vmx.c. + +The name "vmcs12" refers to the VMCS that L1 builds for L2. In the code we +also have "vmcs01", the VMCS that L0 built for L1, and "vmcs02" is the VMCS +which L0 builds to actually run L2 - how this is done is explained in the +aforementioned paper. + +For convenience, we repeat the content of struct vmcs12 here. If the internals +of this structure changes, this can break live migration across KVM versions. +VMCS12_REVISION (from vmx.c) should be changed if struct vmcs12 or its inner +struct shadow_vmcs is ever changed. + + typedef u64 natural_width; + struct __packed vmcs12 { + /* According to the Intel spec, a VMCS region must start with + * these two user-visible fields */ + u32 revision_id; + u32 abort; + + u32 launch_state; /* set to 0 by VMCLEAR, to 1 by VMLAUNCH */ + u32 padding[7]; /* room for future expansion */ + + u64 io_bitmap_a; + u64 io_bitmap_b; + u64 msr_bitmap; + u64 vm_exit_msr_store_addr; + u64 vm_exit_msr_load_addr; + u64 vm_entry_msr_load_addr; + u64 tsc_offset; + u64 virtual_apic_page_addr; + u64 apic_access_addr; + u64 ept_pointer; + u64 guest_physical_address; + u64 vmcs_link_pointer; + u64 guest_ia32_debugctl; + u64 guest_ia32_pat; + u64 guest_ia32_efer; + u64 guest_pdptr0; + u64 guest_pdptr1; + u64 guest_pdptr2; + u64 guest_pdptr3; + u64 host_ia32_pat; + u64 host_ia32_efer; + u64 padding64[8]; /* room for future expansion */ + natural_width cr0_guest_host_mask; + natural_width cr4_guest_host_mask; + natural_width cr0_read_shadow; + natural_width cr4_read_shadow; + natural_width cr3_target_value0; + natural_width cr3_target_value1; + natural_width cr3_target_value2; + natural_width cr3_target_value3; + natural_width exit_qualification; + natural_width guest_linear_address; + natural_width guest_cr0; + natural_width guest_cr3; + natural_width guest_cr4; + natural_width guest_es_base; + 
natural_width guest_cs_base; + natural_width guest_ss_base; + natural_width guest_ds_base; + natural_width guest_fs_base; + natural_width guest_gs_base; + natural_width guest_ldtr_base; + natural_width guest_tr_base; + natural_width guest_gdtr_base; + natural_width guest_idtr_base; + natural_width guest_dr7; + natural_width guest_rsp; + natural_width guest_rip; + natural_width guest_rflags; + natural_width guest_pending_dbg_exceptions; + natural_width guest_sysenter_esp; + natural_width guest_sysenter_eip; + natural_width host_cr0; + natural_width host_cr3; + natural_width host_cr4; + natural_width host_fs_base; + natural_width host_gs_base; + natural_width host_tr_base; + natural_width host_gdtr_base; + natural_width host_idtr_base; + natural_width host_ia32_sysenter_esp; + natural_width host_ia32_sysenter_eip; + natural_width host_rsp; + natural_width host_rip; + natural_width paddingl[8]; /* room for future expansion */ + u32 pin_based_vm_exec_control; + u32 cpu_based_vm_exec_control; + u32 exception_bitmap; + u32 page_fault_error_code_mask; + u32 page_fault_error_code_match; + u32 cr3_target_count; + u32 vm_exit_controls; + u32 vm_exit_msr_store_count; + u32 vm_exit_msr_load_count; + u32 vm_entry_controls; + u32 vm_entry_msr_load_count; + u32 vm_entry_intr_info_field; + u32 vm_entry_exception_error_code; + u32 vm_entry_instruction_len; + u32 tpr_threshold; + u32 secondary_vm_exec_control; + u32 vm_instruction_error; + u32 vm_exit_reason; + u32 vm_exit_intr_info; + u32 vm_exit_intr_error_code; + u32 idt_vectoring_info_field; + u32 idt_vectoring_error_code; + u32 vm_exit_instruction_len; + u32 vmx_instruction_info; + u32 guest_es_limit; + u32 guest_cs_limit; + u32 guest_ss_limit; + u32 guest_ds_limit; + u32 guest_fs_limit; + u32 guest_gs_limit; + u32 guest_ldtr_limit; + u32 guest_tr_limit; + u32 guest_gdtr_limit; + u32 guest_idtr_limit; + u32 guest_es_ar_bytes; + u32 guest_cs_ar_bytes; + u32 guest_ss_ar_bytes; + u32 guest_ds_ar_bytes; + u32 guest_fs_ar_bytes; + 
u32 guest_gs_ar_bytes; + u32 guest_ldtr_ar_bytes; + u32 guest_tr_ar_bytes; + u32 guest_interruptibility_info; + u32 guest_activity_state; + u32 guest_sysenter_cs; + u32 host_ia32_sysenter_cs; + u32 padding32[8]; /* room for future expansion */ + u16 virtual_processor_id; + u16 guest_es_selector; + u16 guest_cs_selector; + u16 guest_ss_selector; + u16 guest_ds_selector; + u16 guest_fs_selector; + u16 guest_gs_selector; + u16 guest_ldtr_selector; + u16 guest_tr_selector; + u16 host_es_selector; + u16 host_cs_selector; + u16 host_ss_selector; + u16 host_ds_selector; + u16 host_fs_selector; + u16 host_gs_selector; + u16 host_tr_selector; + }; + + +Authors +------- + +These patches were written by: + Abel Gordon, abelg il.ibm.com + Nadav Har'El, nyh il.ibm.com + Orit Wasserman, oritw il.ibm.com + Ben-Ami Yassor, benami il.ibm.com + Muli Ben-Yehuda, muli il.ibm.com + +With contributions by: + Anthony Liguori, aliguori us.ibm.com + Mike Day, mdday us.ibm.com + Michael Factor, factor il.ibm.com + Zvi Dubitzky, dubi il.ibm.com + +And valuable reviews by: + Avi Kivity, avi redhat.com + Gleb Natapov, gleb redhat.com + Marcelo Tosatti, mtosatti redhat.com + Kevin Tian, kevin.tian intel.com + and others. diff --git a/Documentation/virt/kvm/ppc-pv.txt b/Documentation/virt/kvm/ppc-pv.txt new file mode 100644 index 000000000000..e26115ce4258 --- /dev/null +++ b/Documentation/virt/kvm/ppc-pv.txt @@ -0,0 +1,212 @@ +The PPC KVM paravirtual interface +================================= + +The basic execution principle by which KVM on PowerPC works is to run all kernel +space code in PR=1 which is user space. This way we trap all privileged +instructions and can emulate them accordingly. + +Unfortunately that is also the downfall. There are quite some privileged +instructions that needlessly return us to the hypervisor even though they +could be handled differently. + +This is what the PPC PV interface helps with. 
It takes privileged instructions +and transforms them into unprivileged ones with some help from the hypervisor. +This cuts down virtualization costs by about 50% on some of my benchmarks. + +The code for that interface can be found in arch/powerpc/kernel/kvm* + +Querying for existence +====================== + +To find out if we're running on KVM or not, we leverage the device tree. When +Linux is running on KVM, a node /hypervisor exists. That node contains a +compatible property with the value "linux,kvm". + +Once you determined you're running under a PV capable KVM, you can now use +hypercalls as described below. + +KVM hypercalls +============== + +Inside the device tree's /hypervisor node there's a property called +'hypercall-instructions'. This property contains at most 4 opcodes that make +up the hypercall. To call a hypercall, just call these instructions. + +The parameters are as follows: + + Register IN OUT + + r0 - volatile + r3 1st parameter Return code + r4 2nd parameter 1st output value + r5 3rd parameter 2nd output value + r6 4th parameter 3rd output value + r7 5th parameter 4th output value + r8 6th parameter 5th output value + r9 7th parameter 6th output value + r10 8th parameter 7th output value + r11 hypercall number 8th output value + r12 - volatile + +Hypercall definitions are shared in generic code, so the same hypercall numbers +apply for x86 and powerpc alike with the exception that each KVM hypercall +also needs to be ORed with the KVM vendor code which is (42 << 16). + +Return codes can be as follows: + + Code Meaning + + 0 Success + 12 Hypercall not implemented + <0 Error + +The magic page +============== + +To enable communication between the hypervisor and guest there is a new shared +page that contains parts of supervisor visible register state. The guest can +map this shared page using the KVM hypercall KVM_HC_PPC_MAP_MAGIC_PAGE. + +With this hypercall issued the guest always gets the magic page mapped at the +desired location. 
The first parameter indicates the effective address when the +MMU is enabled. The second parameter indicates the address in real mode, if +applicable to the target. For now, we always map the page to -4096. This way we +can access it using absolute load and store functions. The following +instruction reads the first field of the magic page: + + ld rX, -4096(0) + +The interface is designed to be extensible should there be need later to add +additional registers to the magic page. If you add fields to the magic page, +also define a new hypercall feature to indicate that the host can give you more +registers. Only if the host supports the additional features, make use of them. + +The magic page layout is described by struct kvm_vcpu_arch_shared +in arch/powerpc/include/asm/kvm_para.h. + +Magic page features +=================== + +When mapping the magic page using the KVM hypercall KVM_HC_PPC_MAP_MAGIC_PAGE, +a second return value is passed to the guest. This second return value contains +a bitmap of available features inside the magic page. + +The following enhancements to the magic page are currently available: + + KVM_MAGIC_FEAT_SR Maps SR registers r/w in the magic page + KVM_MAGIC_FEAT_MAS0_TO_SPRG7 Maps MASn, ESR, PIR and high SPRGs + +For enhanced features in the magic page, please check for the existence of the +feature before using them! + +Magic page flags +================ + +In addition to features that indicate whether a host is capable of a particular +feature we also have a channel for a guest to tell the guest whether it's capable +of something. This is what we call "flags". + +Flags are passed to the host in the low 12 bits of the Effective Address. 
+ +The following flags are currently available for a guest to expose: + + MAGIC_PAGE_FLAG_NOT_MAPPED_NX Guest handles NX bits correctly wrt magic page + +MSR bits +======== + +The MSR contains bits that require hypervisor intervention and bits that do +not require direct hypervisor intervention because they only get interpreted +when entering the guest or don't have any impact on the hypervisor's behavior. + +The following bits are safe to be set inside the guest: + + MSR_EE + MSR_RI + +If any other bit changes in the MSR, please still use mtmsr(d). + +Patched instructions +==================== + +The "ld" and "std" instructions are transformed to "lwz" and "stw" instructions +respectively on 32 bit systems with an added offset of 4 to accommodate for big +endianness. + +The following is a list of mapping the Linux kernel performs when running as +guest. Implementing any of those mappings is optional, as the instruction traps +also act on the shared page. So calling privileged instructions still works as +before. + +From To +==== == + +mfmsr rX ld rX, magic_page->msr +mfsprg rX, 0 ld rX, magic_page->sprg0 +mfsprg rX, 1 ld rX, magic_page->sprg1 +mfsprg rX, 2 ld rX, magic_page->sprg2 +mfsprg rX, 3 ld rX, magic_page->sprg3 +mfsrr0 rX ld rX, magic_page->srr0 +mfsrr1 rX ld rX, magic_page->srr1 +mfdar rX ld rX, magic_page->dar +mfdsisr rX lwz rX, magic_page->dsisr + +mtmsr rX std rX, magic_page->msr +mtsprg 0, rX std rX, magic_page->sprg0 +mtsprg 1, rX std rX, magic_page->sprg1 +mtsprg 2, rX std rX, magic_page->sprg2 +mtsprg 3, rX std rX, magic_page->sprg3 +mtsrr0 rX std rX, magic_page->srr0 +mtsrr1 rX std rX, magic_page->srr1 +mtdar rX std rX, magic_page->dar +mtdsisr rX stw rX, magic_page->dsisr + +tlbsync nop + +mtmsrd rX, 0 b +mtmsr rX b + +mtmsrd rX, 1 b + +[Book3S only] +mtsrin rX, rY b + +[BookE only] +wrteei [0|1] b + + +Some instructions require more logic to determine what's going on than a load +or store instruction can deliver. 
To enable patching of those, we keep some +RAM around where we can live translate instructions to. What happens is the +following: + + 1) copy emulation code to memory + 2) patch that code to fit the emulated instruction + 3) patch that code to return to the original pc + 4 + 4) patch the original instruction to branch to the new code + +That way we can inject an arbitrary amount of code as replacement for a single +instruction. This allows us to check for pending interrupts when setting EE=1 +for example. + +Hypercall ABIs in KVM on PowerPC +================================= +1) KVM hypercalls (ePAPR) + +These are ePAPR compliant hypercall implementation (mentioned above). Even +generic hypercalls are implemented here, like the ePAPR idle hcall. These are +available on all targets. + +2) PAPR hypercalls + +PAPR hypercalls are needed to run server PowerPC PAPR guests (-M pseries in QEMU). +These are the same hypercalls that pHyp, the POWER hypervisor implements. Some of +them are handled in the kernel, some are handled in user space. This is only +available on book3s_64. + +3) OSI hypercalls + +Mac-on-Linux is another user of KVM on PowerPC, which has its own hypercall (long +before KVM). This is supported to maintain compatibility. All these hypercalls get +forwarded to user space. This is only useful on book3s_32, but can be used with +book3s_64 as well. diff --git a/Documentation/virt/kvm/review-checklist.txt b/Documentation/virt/kvm/review-checklist.txt new file mode 100644 index 000000000000..499af499e296 --- /dev/null +++ b/Documentation/virt/kvm/review-checklist.txt @@ -0,0 +1,38 @@ +Review checklist for kvm patches +================================ + +1. The patch must follow Documentation/process/coding-style.rst and + Documentation/process/submitting-patches.rst. + +2. Patches should be against kvm.git master branch. + +3. 
If the patch introduces or modifies a new userspace API: + - the API must be documented in Documentation/virt/kvm/api.txt + - the API must be discoverable using KVM_CHECK_EXTENSION + +4. New state must include support for save/restore. + +5. New features must default to off (userspace should explicitly request them). + Performance improvements can and should default to on. + +6. New cpu features should be exposed via KVM_GET_SUPPORTED_CPUID2 + +7. Emulator changes should be accompanied by unit tests for qemu-kvm.git + kvm/test directory. + +8. Changes should be vendor neutral when possible. Changes to common code + are better than duplicating changes to vendor code. + +9. Similarly, prefer changes to arch independent code than to arch dependent + code. + +10. User/kernel interfaces and guest/host interfaces must be 64-bit clean + (all variables and sizes naturally aligned on 64-bit; use specific types + only - u64 rather than ulong). + +11. New guest visible features must either be documented in a hardware manual + or be accompanied by documentation. + +12. Features must be robust against reset and kexec - for example, shared + host/guest memory must be unshared to prevent the host from writing to + guest memory that the guest has not reserved for this purpose. diff --git a/Documentation/virt/kvm/s390-diag.txt b/Documentation/virt/kvm/s390-diag.txt new file mode 100644 index 000000000000..7c52e5f8b210 --- /dev/null +++ b/Documentation/virt/kvm/s390-diag.txt @@ -0,0 +1,83 @@ +The s390 DIAGNOSE call on KVM +============================= + +KVM on s390 supports the DIAGNOSE call for making hypercalls, both for +native hypercalls and for selected hypercalls found on other s390 +hypervisors. + +Note that bits are numbered as by the usual s390 convention (most significant +bit on the left). + + +General remarks +--------------- + +DIAGNOSE calls by the guest cause a mandatory intercept. 
This implies +all supported DIAGNOSE calls need to be handled by either KVM or its +userspace. + +All DIAGNOSE calls supported by KVM use the RS-a format: + +-------------------------------------- +| '83' | R1 | R3 | B2 | D2 | +-------------------------------------- +0 8 12 16 20 31 + +The second-operand address (obtained by the base/displacement calculation) +is not used to address data. Instead, bits 48-63 of this address specify +the function code, and bits 0-47 are ignored. + +The supported DIAGNOSE function codes vary by the userspace used. For +DIAGNOSE function codes not specific to KVM, please refer to the +documentation for the s390 hypervisors defining them. + + +DIAGNOSE function code 'X'500' - KVM virtio functions +----------------------------------------------------- + +If the function code specifies 0x500, various virtio-related functions +are performed. + +General register 1 contains the virtio subfunction code. Supported +virtio subfunctions depend on KVM's userspace. Generally, userspace +provides either s390-virtio (subcodes 0-2) or virtio-ccw (subcode 3). + +Upon completion of the DIAGNOSE instruction, general register 2 contains +the function's return code, which is either a return code or a subcode +specific value. + +Subcode 0 - s390-virtio notification and early console printk + Handled by userspace. + +Subcode 1 - s390-virtio reset + Handled by userspace. + +Subcode 2 - s390-virtio set status + Handled by userspace. + +Subcode 3 - virtio-ccw notification + Handled by either userspace or KVM (ioeventfd case). + + General register 2 contains a subchannel-identification word denoting + the subchannel of the virtio-ccw proxy device to be notified. + + General register 3 contains the number of the virtqueue to be notified. + + General register 4 contains a 64bit identifier for KVM usage (the + kvm_io_bus cookie). If general register 4 does not contain a valid + identifier, it is ignored. 
+ + After completion of the DIAGNOSE call, general register 2 may contain + a 64bit identifier (in the kvm_io_bus cookie case), or a negative + error value, if an internal error occurred. + + See also the virtio standard for a discussion of this hypercall. + + +DIAGNOSE function code 'X'501 - KVM breakpoint +---------------------------------------------- + +If the function code specifies 0x501, breakpoint functions may be performed. +This function code is handled by userspace. + +This diagnose function code has no subfunctions and uses no parameters. diff --git a/Documentation/virt/kvm/timekeeping.txt b/Documentation/virt/kvm/timekeeping.txt new file mode 100644 index 000000000000..76808a17ad84 --- /dev/null +++ b/Documentation/virt/kvm/timekeeping.txt @@ -0,0 +1,612 @@ + + Timekeeping Virtualization for X86-Based Architectures + + Zachary Amsden + Copyright (c) 2010, Red Hat. All rights reserved. + +1) Overview +2) Timing Devices +3) TSC Hardware +4) Virtualization Problems + +========================================================================= + +1) Overview + +One of the most complicated parts of the X86 platform, and specifically, +the virtualization of this platform is the plethora of timing devices available +and the complexity of emulating those devices. In addition, virtualization of +time introduces a new set of challenges because it introduces a multiplexed +division of time beyond the control of the guest CPU. + +First, we will describe the various timekeeping hardware available, then +present some of the problems which arise and solutions available, giving +specific recommendations for certain classes of KVM guests. + +The purpose of this document is to collect data and information relevant to +timekeeping which may be difficult to find elsewhere, specifically, +information relevant to KVM and hardware-based virtualization. 
+ +========================================================================= + +2) Timing Devices + +First we discuss the basic hardware devices available. TSC and the related +KVM clock are special enough to warrant a full exposition and are described in +the following section. + +2.1) i8254 - PIT + +One of the first timer devices available is the programmable interrupt timer, +or PIT. The PIT has a fixed frequency 1.193182 MHz base clock and three +channels which can be programmed to deliver periodic or one-shot interrupts. +These three channels can be configured in different modes and have individual +counters. Channel 1 and 2 were not available for general use in the original +IBM PC, and historically were connected to control RAM refresh and the PC +speaker. Now the PIT is typically integrated as part of an emulated chipset +and a separate physical PIT is not used. + +The PIT uses I/O ports 0x40 - 0x43. Access to the 16-bit counters is done +using single or multiple byte access to the I/O ports. There are 6 modes +available, but not all modes are available to all timers, as only timer 2 +has a connected gate input, required for modes 1 and 5. The gate line is +controlled by port 61h, bit 0, as illustrated in the following diagram. + + -------------- ---------------- +| | | | +| 1.1932 MHz |---------->| CLOCK OUT | ---------> IRQ 0 +| Clock | | | | + -------------- | +->| GATE TIMER 0 | + | ---------------- + | + | ---------------- + | | | + |------>| CLOCK OUT | ---------> 66.3 KHZ DRAM + | | | (aka /dev/null) + | +->| GATE TIMER 1 | + | ---------------- + | + | ---------------- + | | | + |------>| CLOCK OUT | ---------> Port 61h, bit 5 + | | | +Port 61h, bit 0 ---------->| GATE TIMER 2 | \_.---- ____ + ---------------- _| )--|LPF|---Speaker + / *---- \___/ +Port 61h, bit 1 -----------------------------------/ + +The timer modes are now described. + +Mode 0: Single Timeout. 
This is a one-shot software timeout that counts down + when the gate is high (always true for timers 0 and 1). When the count + reaches zero, the output goes high. + +Mode 1: Triggered One-shot. The output is initially set high. When the gate + line is set high, a countdown is initiated (which does not stop if the gate is + lowered), during which the output is set low. When the count reaches zero, + the output goes high. + +Mode 2: Rate Generator. The output is initially set high. When the countdown + reaches 1, the output goes low for one count and then returns high. The value + is reloaded and the countdown automatically resumes. If the gate line goes + low, the count is halted. If the output is low when the gate is lowered, the + output automatically goes high (this only affects timer 2). + +Mode 3: Square Wave. This generates a high / low square wave. The count + determines the length of the pulse, which alternates between high and low + when zero is reached. The count only proceeds when gate is high and is + automatically reloaded on reaching zero. The count is decremented twice at + each clock to generate a full high / low cycle at the full periodic rate. + If the count is even, the clock remains high for N/2 counts and low for N/2 + counts; if the clock is odd, the clock is high for (N+1)/2 counts and low + for (N-1)/2 counts. Only even values are latched by the counter, so odd + values are not observed when reading. This is the intended mode for timer 2, + which generates sine-like tones by low-pass filtering the square wave output. + +Mode 4: Software Strobe. After programming this mode and loading the counter, + the output remains high until the counter reaches zero. Then the output + goes low for 1 clock cycle and returns high. The counter is not reloaded. + Counting only occurs when gate is high. + +Mode 5: Hardware Strobe. After programming and loading the counter, the + output remains high. 
When the gate is raised, a countdown is initiated + (which does not stop if the gate is lowered). When the counter reaches zero, + the output goes low for 1 clock cycle and then returns high. The counter is + not reloaded. + +In addition to normal binary counting, the PIT supports BCD counting. The +command port, 0x43 is used to set the counter and mode for each of the three +timers. + +PIT commands, issued to port 0x43, using the following bit encoding: + +Bit 7-4: Command (See table below) +Bit 3-1: Mode (000 = Mode 0, 101 = Mode 5, 11X = undefined) +Bit 0 : Binary (0) / BCD (1) + +Command table: + +0000 - Latch Timer 0 count for port 0x40 + sample and hold the count to be read in port 0x40; + additional commands ignored until counter is read; + mode bits ignored. + +0001 - Set Timer 0 LSB mode for port 0x40 + set timer to read LSB only and force MSB to zero; + mode bits set timer mode + +0010 - Set Timer 0 MSB mode for port 0x40 + set timer to read MSB only and force LSB to zero; + mode bits set timer mode + +0011 - Set Timer 0 16-bit mode for port 0x40 + set timer to read / write LSB first, then MSB; + mode bits set timer mode + +0100 - Latch Timer 1 count for port 0x41 - as described above +0101 - Set Timer 1 LSB mode for port 0x41 - as described above +0110 - Set Timer 1 MSB mode for port 0x41 - as described above +0111 - Set Timer 1 16-bit mode for port 0x41 - as described above + +1000 - Latch Timer 2 count for port 0x42 - as described above +1001 - Set Timer 2 LSB mode for port 0x42 - as described above +1010 - Set Timer 2 MSB mode for port 0x42 - as described above +1011 - Set Timer 2 16-bit mode for port 0x42 as described above + +1101 - General counter latch + Latch combination of counters into corresponding ports + Bit 3 = Counter 2 + Bit 2 = Counter 1 + Bit 1 = Counter 0 + Bit 0 = Unused + +1110 - Latch timer status + Latch combination of counter mode into corresponding ports + Bit 3 = Counter 2 + Bit 2 = Counter 1 + Bit 1 = Counter 0 + + The output 
of ports 0x40-0x42 following this command will be: + + Bit 7 = Output pin + Bit 6 = Count loaded (0 if timer has expired) + Bit 5-4 = Read / Write mode + 01 = MSB only + 10 = LSB only + 11 = LSB / MSB (16-bit) + Bit 3-1 = Mode + Bit 0 = Binary (0) / BCD mode (1) + +2.2) RTC + +The second device which was available in the original PC was the MC146818 real +time clock. The original device is now obsolete, and usually emulated by the +system chipset, sometimes by an HPET and some frankenstein IRQ routing. + +The RTC is accessed through CMOS variables, which uses an index register to +control which bytes are read. Since there is only one index register, read +of the CMOS and read of the RTC require lock protection (in addition, it is +dangerous to allow userspace utilities such as hwclock to have direct RTC +access, as they could corrupt kernel reads and writes of CMOS memory). + +The RTC generates an interrupt which is usually routed to IRQ 8. The interrupt +can function as a periodic timer, an additional once a day alarm, and can issue +interrupts after an update of the CMOS registers by the MC146818 is complete. +The type of interrupt is signalled in the RTC status registers. + +The RTC will update the current time fields by battery power even while the +system is off. The current time fields should not be read while an update is +in progress, as indicated in the status register. + +The clock uses a 32.768kHz crystal, so bits 6-4 of register A should be +programmed to a 32kHz divider if the RTC is to count seconds. 
+ +This is the RAM map originally used for the RTC/CMOS: + +Location Size Description +------------------------------------------ +00h byte Current second (BCD) +01h byte Seconds alarm (BCD) +02h byte Current minute (BCD) +03h byte Minutes alarm (BCD) +04h byte Current hour (BCD) +05h byte Hours alarm (BCD) +06h byte Current day of week (BCD) +07h byte Current day of month (BCD) +08h byte Current month (BCD) +09h byte Current year (BCD) +0Ah byte Register A + bit 7 = Update in progress + bit 6-4 = Divider for clock + 000 = 4.194 MHz + 001 = 1.049 MHz + 010 = 32 kHz + 10X = test modes + 110 = reset / disable + 111 = reset / disable + bit 3-0 = Rate selection for periodic interrupt + 000 = periodic timer disabled + 001 = 3.90625 uS + 010 = 7.8125 uS + 011 = .122070 mS + 100 = .244141 mS + ... + 1101 = 125 mS + 1110 = 250 mS + 1111 = 500 mS +0Bh byte Register B + bit 7 = Run (0) / Halt (1) + bit 6 = Periodic interrupt enable + bit 5 = Alarm interrupt enable + bit 4 = Update-ended interrupt enable + bit 3 = Square wave interrupt enable + bit 2 = BCD calendar (0) / Binary (1) + bit 1 = 12-hour mode (0) / 24-hour mode (1) + bit 0 = 0 (DST off) / 1 (DST enabled) +OCh byte Register C (read only) + bit 7 = interrupt request flag (IRQF) + bit 6 = periodic interrupt flag (PF) + bit 5 = alarm interrupt flag (AF) + bit 4 = update interrupt flag (UF) + bit 3-0 = reserved +ODh byte Register D (read only) + bit 7 = RTC has power + bit 6-0 = reserved +32h byte Current century BCD (*) + (*) location vendor specific and now determined from ACPI global tables + +2.3) APIC + +On Pentium and later processors, an on-board timer is available to each CPU +as part of the Advanced Programmable Interrupt Controller. The APIC is +accessed through memory-mapped registers and provides interrupt service to each +CPU, used for IPIs and local timer interrupts. 
+ +Although in theory the APIC is a safe and stable source for local interrupts, +in practice, many bugs and glitches have occurred due to the special nature of +the APIC CPU-local memory-mapped hardware. Beware that CPU errata may affect +the use of the APIC and that workarounds may be required. In addition, some of +these workarounds pose unique constraints for virtualization - requiring either +extra overhead incurred from extra reads of memory-mapped I/O or additional +functionality that may be more computationally expensive to implement. + +Since the APIC is documented quite well in the Intel and AMD manuals, we will +avoid repetition of the detail here. It should be pointed out that the APIC +timer is programmed through the LVT (local vector timer) register, is capable +of one-shot or periodic operation, and is based on the bus clock divided down +by the programmable divider register. + +2.4) HPET + +HPET is quite complex, and was originally intended to replace the PIT / RTC +support of the X86 PC. It remains to be seen whether that will be the case, as +the de facto standard of PC hardware is to emulate these older devices. Some +systems designated as legacy free may support only the HPET as a hardware timer +device. + +The HPET spec is rather loose and vague, requiring at least 3 hardware timers, +but allowing implementation freedom to support many more. It also imposes no +fixed rate on the timer frequency, but does impose some extremal values on +frequency, error and slew. + +In general, the HPET is recommended as a high precision (compared to PIT /RTC) +time source which is independent of local variation (as there is only one HPET +in any given system). The HPET is also memory-mapped, and its presence is +indicated through ACPI tables by the BIOS. + +Detailed specification of the HPET is beyond the current scope of this +document, as it is also very well documented elsewhere. 
+ +2.5) Offboard Timers + +Several cards, both proprietary (watchdog boards) and commonplace (e1000) have +timing chips built into the cards which may have registers which are accessible +to kernel or user drivers. To the author's knowledge, using these to generate +a clocksource for a Linux or other kernel has not yet been attempted and is in +general frowned upon as not playing by the agreed rules of the game. Such a +timer device would require additional support to be virtualized properly and is +not considered important at this time as no known operating system does this. + +========================================================================= + +3) TSC Hardware + +The TSC or time stamp counter is relatively simple in theory; it counts +instruction cycles issued by the processor, which can be used as a measure of +time. In practice, due to a number of problems, it is the most complicated +timekeeping device to use. + +The TSC is represented internally as a 64-bit MSR which can be read with the +RDMSR, RDTSC, or RDTSCP (when available) instructions. In the past, hardware +limitations made it possible to write the TSC, but generally on old hardware it +was only possible to write the low 32-bits of the 64-bit counter, and the upper +32-bits of the counter were cleared. Now, however, on Intel processors family +0Fh, for models 3, 4 and 6, and family 06h, models e and f, this restriction +has been lifted and all 64-bits are writable. On AMD systems, the ability to +write the TSC MSR is not an architectural guarantee. + +The TSC is accessible from CPL-0 and conditionally, for CPL > 0 software by +means of the CR4.TSD bit, which when enabled, disables CPL > 0 TSC access. + +Some vendors have implemented an additional instruction, RDTSCP, which returns +atomically not just the TSC, but an indicator which corresponds to the +processor number. 
This can be used to index into an array of TSC variables to +determine offset information in SMP systems where TSCs are not synchronized. +The presence of this instruction must be determined by consulting CPUID feature +bits. + +Both VMX and SVM provide extension fields in the virtualization hardware which +allow the guest visible TSC to be offset by a constant. Newer implementations +promise to allow the TSC to additionally be scaled, but this hardware is not +yet widely available. + +3.1) TSC synchronization + +The TSC is a CPU-local clock in most implementations. This means, on SMP +platforms, the TSCs of different CPUs may start at different times depending +on when the CPUs are powered on. Generally, CPUs on the same die will share +the same clock; however, this is not always the case. + +The BIOS may attempt to resynchronize the TSCs during the poweron process and +the operating system or other system software may attempt to do this as well. +Several hardware limitations make the problem worse - if it is not possible to +write the full 64-bits of the TSC, it may be impossible to match the TSC in +newly arriving CPUs to that of the rest of the system, resulting in +unsynchronized TSCs. Resynchronization may be attempted by BIOS or system software, but in +practice, getting a perfectly synchronized TSC will not be possible unless all +values are read from the same clock, which generally only is possible on single +socket systems or those with special hardware support. + +3.2) TSC and CPU hotplug + +As touched on already, CPUs which arrive later than the boot time of the system +may not have a TSC value that is synchronized with the rest of the system. +Either system software, BIOS, or SMM code may actually try to establish the TSC +to a value matching the rest of the system, but a perfect match is usually not +a guarantee.
This can have the effect of bringing a system from a state where +TSC is synchronized back to a state where TSC synchronization flaws, however +small, may be exposed to the OS and any virtualization environment. + +3.3) TSC and multi-socket / NUMA + +Multi-socket systems, especially large multi-socket systems are likely to have +individual clocksources rather than a single, universally distributed clock. +Since these clocks are driven by different crystals, they will not have +perfectly matched frequency, and temperature and electrical variations will +cause the CPU clocks, and thus the TSCs to drift over time. Depending on the +exact clock and bus design, the drift may or may not be fixed in absolute +error, and may accumulate over time. + +In addition, very large systems may deliberately slew the clocks of individual +cores. This technique, known as spread-spectrum clocking, reduces EMI at the +clock frequency and harmonics of it, which may be required to pass FCC +standards for telecommunications and computer equipment. + +It is recommended not to trust the TSCs to remain synchronized on NUMA or +multiple socket systems for these reasons. + +3.4) TSC and C-states + +C-states, or idling states of the processor, especially C1E and deeper sleep +states may be problematic for TSC as well. The TSC may stop advancing in such +a state, resulting in a TSC which is behind that of other CPUs when execution +is resumed. Such CPUs must be detected and flagged by the operating system +based on CPU and chipset identifications. + +The TSC in such a case may be corrected by catching it up to a known external +clocksource. + +3.5) TSC frequency change / P-states + +To make things slightly more interesting, some CPUs may change frequency. They +may or may not run the TSC at the same rate, and because the frequency change +may be staggered or slewed, at some points in time, the TSC rate may not be +known other than falling within a range of values. 
In this case, the TSC will +not be a stable time source, and must be calibrated against a known, stable, +external clock to be a usable source of time. + +Whether the TSC runs at a constant rate or scales with the P-state is model +dependent and must be determined by inspecting CPUID, chipset or vendor +specific MSR fields. + +In addition, some vendors have known bugs where the P-state is actually +compensated for properly during normal operation, but when the processor is +inactive, the P-state may be raised temporarily to service cache misses from +other processors. In such cases, the TSC on halted CPUs could advance faster +than that of non-halted processors. AMD Turion processors are known to have +this problem. + +3.6) TSC and STPCLK / T-states + +External signals given to the processor may also have the effect of stopping +the TSC. This is typically done for thermal emergency power control to prevent +an overheating condition, and typically, there is no way to detect that this +condition has happened. + +3.7) TSC virtualization - VMX + +VMX provides conditional trapping of RDTSC, RDMSR, WRMSR and RDTSCP +instructions, which is enough for full virtualization of TSC in any manner. In +addition, VMX allows passing through the host TSC plus an additional TSC_OFFSET +field specified in the VMCS. Special instructions must be used to read and +write the VMCS field. + +3.8) TSC virtualization - SVM + +SVM provides conditional trapping of RDTSC, RDMSR, WRMSR and RDTSCP +instructions, which is enough for full virtualization of TSC in any manner. In +addition, SVM allows passing through the host TSC plus an additional offset +field specified in the SVM control block. + +3.9) TSC feature bits in Linux + +In summary, there is no way to guarantee the TSC remains in perfect +synchronization unless it is explicitly guaranteed by the architecture. Even +if so, the TSCs in multi-sockets or NUMA systems may still run independently +despite being locally consistent. 
+ +The following feature bits are used by Linux to signal various TSC attributes, +but they can only be taken to be meaningful for UP or single node systems. + +X86_FEATURE_TSC : The TSC is available in hardware +X86_FEATURE_RDTSCP : The RDTSCP instruction is available +X86_FEATURE_CONSTANT_TSC : The TSC rate is unchanged with P-states +X86_FEATURE_NONSTOP_TSC : The TSC does not stop in C-states +X86_FEATURE_TSC_RELIABLE : TSC sync checks are skipped (VMware) + +4) Virtualization Problems + +Timekeeping is especially problematic for virtualization because a number of +challenges arise. The most obvious problem is that time is now shared between +the host and, potentially, a number of virtual machines. Thus the virtual +operating system does not run with 100% usage of the CPU, despite the fact that +it may very well make that assumption. It may expect it to remain true to very +exacting bounds when interrupt sources are disabled, but in reality only its +virtual interrupt sources are disabled, and the machine may still be preempted +at any time. This causes problems as the passage of real time, the injection +of machine interrupts and the associated clock sources are no longer completely +synchronized with real time. + +This same problem can occur on native hardware to a degree, as SMM mode may +steal cycles from the operating system on X86 systems when SMM mode is used by the +BIOS, but not in such an extreme fashion. However, the fact that SMM mode may +cause similar problems to virtualization makes it a good justification for +solving many of these problems on bare metal. + +4.1) Interrupt clocking + +One of the most immediate problems that occurs with legacy operating systems +is that the system timekeeping routines are often designed to keep track of +time by counting periodic interrupts.
These interrupts may come from the PIT +or the RTC, but the problem is the same: the host virtualization engine may not +be able to deliver the proper number of interrupts per second, and so guest +time may fall behind. This is especially problematic if a high interrupt rate +is selected, such as 1000 HZ, which is unfortunately the default for many Linux +guests. + +There are three approaches to solving this problem; first, it may be possible +to simply ignore it. Guests which have a separate time source for tracking +'wall clock' or 'real time' may not need any adjustment of their interrupts to +maintain proper time. If this is not sufficient, it may be necessary to inject +additional interrupts into the guest in order to increase the effective +interrupt rate. This approach leads to complications in extreme conditions, +where host load or guest lag is too much to compensate for, and thus another +solution to the problem has arisen: the guest may need to become aware of lost +ticks and compensate for them internally. Although promising in theory, the +implementation of this policy in Linux has been extremely error prone, and a +number of buggy variants of lost tick compensation are distributed across +commonly used Linux systems. + +Windows uses periodic RTC clocking as a means of keeping time internally, and +thus requires interrupt slewing to keep proper time. It does use a low enough +rate (ed: is it 18.2 Hz?) however that it has not yet been a problem in +practice. + +4.2) TSC sampling and serialization + +As the highest precision time source available, the cycle counter of the CPU +has aroused much interest from developers. As explained above, this timer has +many problems unique to its nature as a local, potentially unstable and +potentially unsynchronized source. One issue which is not unique to the TSC, +but is highlighted because of its very precise nature is sampling delay. By +definition, the counter, once read, is already old.
However, it is also +possible for the counter to be read ahead of the actual use of the result. +This is a consequence of the superscalar execution of the instruction stream, +which may execute instructions out of order. Such execution is called +non-serialized. Forcing serialized execution is necessary for precise +measurement with the TSC, and requires a serializing instruction, such as CPUID +or an MSR read. + +Since CPUID may actually be virtualized by a trap and emulate mechanism, this +serialization can pose a performance issue for hardware virtualization. An +accurate time stamp counter reading may therefore not always be available, and +it may be necessary for an implementation to guard against "backwards" reads of +the TSC as seen from other CPUs, even in an otherwise perfectly synchronized +system. + +4.3) Timespec aliasing + +Additionally, this lack of serialization from the TSC poses another challenge +when using results of the TSC when measured against another time source. As +the TSC is much higher precision, many possible values of the TSC may be read +while another clock is still expressing the same value. + +That is, you may read (T,T+10) while external clock C maintains the same value. +Due to non-serialized reads, you may actually end up with a range which +fluctuates - from (T-1.. T+10). Thus, any time calculated from a TSC, but +calibrated against an external value may have a range of valid values. +Re-calibrating this computation may actually cause time, as computed after the +calibration, to go backwards, compared with time computed before the +calibration. + +This problem is particularly pronounced with an internal time source in Linux, +the kernel time, which is expressed in the theoretically high resolution +timespec - but which advances in much larger granularity intervals, sometimes +at the rate of jiffies, and possibly in catchup modes, at a much larger step. 
+ +This aliasing requires care in the computation and recalibration of kvmclock +and any other values derived from TSC computation (such as TSC virtualization +itself). + +4.4) Migration + +Migration of a virtual machine raises problems for timekeeping in two ways. +First, the migration itself may take time, during which interrupts cannot be +delivered, and after which, the guest time may need to be caught up. NTP may +be able to help to some degree here, as the clock correction required is +typically small enough to fall in the NTP-correctable window. + +An additional concern is that timers based off the TSC (or HPET, if the raw bus +clock is exposed) may now be running at different rates, requiring compensation +in some way in the hypervisor by virtualizing these timers. In addition, +migrating to a faster machine may preclude the use of a passthrough TSC, as a +faster clock cannot be made visible to a guest without the potential of time +advancing faster than usual. A slower clock is less of a problem, as it can +always be caught up to the original rate. KVM clock avoids these problems by +simply storing multipliers and offsets against the TSC for the guest to convert +back into nanosecond resolution values. + +4.5) Scheduling + +Since scheduling may be based on precise timing and firing of interrupts, the +scheduling algorithms of an operating system may be adversely affected by +virtualization. In theory, the effect is random and should be uniformly +distributed, but in contrived as well as real scenarios (guest device access, +causes of virtualization exits, possible context switch), this may not always +be the case. The effect of this has not been well studied. + +In an attempt to work around this, several implementations have provided a +paravirtualized scheduler clock, which reveals the true amount of CPU time for +which a virtual machine has been running.
+ +4.6) Watchdogs + +Watchdog timers, such as the lock detector in Linux may fire accidentally when +running under hardware virtualization due to timer interrupts being delayed or +misinterpretation of the passage of real time. Usually, these warnings are +spurious and can be ignored, but in some circumstances it may be necessary to +disable such detection. + +4.7) Delays and precision timing + +Precise timing and delays may not be possible in a virtualized system. This +can happen if the system is controlling physical hardware, or issues delays to +compensate for slower I/O to and from devices. The first issue is not solvable +in general for a virtualized system; hardware control software can't be +adequately virtualized without a full real-time operating system, which would +require an RT aware virtualization platform. + +The second issue may cause performance problems, but this is unlikely to be a +significant issue. In many cases these delays may be eliminated through +configuration or paravirtualization. + +4.8) Covert channels and leaks + +In addition to the above problems, time information will inevitably leak to the +guest about the host in anything but a perfect implementation of virtualized +time. This may allow the guest to infer the presence of a hypervisor (as in a +red-pill type detection), and it may allow information to leak between guests +by using CPU utilization itself as a signalling channel. Preventing such +problems would require completely isolated virtual time which may not track +real time any longer. This may be useful in certain security or QA contexts, +but in general isn't recommended for real-world deployment scenarios. 
diff --git a/Documentation/virt/kvm/vcpu-requests.rst b/Documentation/virt/kvm/vcpu-requests.rst new file mode 100644 index 000000000000..5feb3706a7ae --- /dev/null +++ b/Documentation/virt/kvm/vcpu-requests.rst @@ -0,0 +1,307 @@ +================= +KVM VCPU Requests +================= + +Overview +======== + +KVM supports an internal API enabling threads to request a VCPU thread to +perform some activity. For example, a thread may request a VCPU to flush +its TLB with a VCPU request. The API consists of the following functions:: + + /* Check if any requests are pending for VCPU @vcpu. */ + bool kvm_request_pending(struct kvm_vcpu *vcpu); + + /* Check if VCPU @vcpu has request @req pending. */ + bool kvm_test_request(int req, struct kvm_vcpu *vcpu); + + /* Clear request @req for VCPU @vcpu. */ + void kvm_clear_request(int req, struct kvm_vcpu *vcpu); + + /* + * Check if VCPU @vcpu has request @req pending. When the request is + * pending it will be cleared and a memory barrier, which pairs with + * another in kvm_make_request(), will be issued. + */ + bool kvm_check_request(int req, struct kvm_vcpu *vcpu); + + /* + * Make request @req of VCPU @vcpu. Issues a memory barrier, which pairs + * with another in kvm_check_request(), prior to setting the request. + */ + void kvm_make_request(int req, struct kvm_vcpu *vcpu); + + /* Make request @req of all VCPUs of the VM with struct kvm @kvm. */ + bool kvm_make_all_cpus_request(struct kvm *kvm, unsigned int req); + +Typically a requester wants the VCPU to perform the activity as soon +as possible after making the request. This means most requests +(kvm_make_request() calls) are followed by a call to kvm_vcpu_kick(), +and kvm_make_all_cpus_request() has the kicking of all VCPUs built +into it. + +VCPU Kicks +---------- + +The goal of a VCPU kick is to bring a VCPU thread out of guest mode in +order to perform some KVM maintenance. To do so, an IPI is sent, forcing +a guest mode exit. 
However, a VCPU thread may not be in guest mode at the +time of the kick. Therefore, depending on the mode and state of the VCPU +thread, there are two other actions a kick may take. All three actions +are listed below: + +1) Send an IPI. This forces a guest mode exit. +2) Waking a sleeping VCPU. Sleeping VCPUs are VCPU threads outside guest + mode that wait on waitqueues. Waking them removes the threads from + the waitqueues, allowing the threads to run again. This behavior + may be suppressed, see KVM_REQUEST_NO_WAKEUP below. +3) Nothing. When the VCPU is not in guest mode and the VCPU thread is not + sleeping, then there is nothing to do. + +VCPU Mode +--------- + +VCPUs have a mode state, ``vcpu->mode``, that is used to track whether the +guest is running in guest mode or not, as well as some specific +outside guest mode states. The architecture may use ``vcpu->mode`` to +ensure VCPU requests are seen by VCPUs (see "Ensuring Requests Are Seen"), +as well as to avoid sending unnecessary IPIs (see "IPI Reduction"), and +even to ensure IPI acknowledgements are waited upon (see "Waiting for +Acknowledgements"). The following modes are defined: + +OUTSIDE_GUEST_MODE + + The VCPU thread is outside guest mode. + +IN_GUEST_MODE + + The VCPU thread is in guest mode. + +EXITING_GUEST_MODE + + The VCPU thread is transitioning from IN_GUEST_MODE to + OUTSIDE_GUEST_MODE. + +READING_SHADOW_PAGE_TABLES + + The VCPU thread is outside guest mode, but it wants the sender of + certain VCPU requests, namely KVM_REQ_TLB_FLUSH, to wait until the VCPU + thread is done reading the page tables. + +VCPU Request Internals +====================== + +VCPU requests are simply bit indices of the ``vcpu->requests`` bitmap. +This means general bitops, like those documented in [atomic-ops]_ could +also be used, e.g. :: + + clear_bit(KVM_REQ_UNHALT & KVM_REQUEST_MASK, &vcpu->requests); + +However, VCPU request users should refrain from doing so, as it would +break the abstraction. 
The first 8 bits are reserved for architecture +independent requests; all additional bits are available for architecture +dependent requests. + +Architecture Independent Requests +--------------------------------- + +KVM_REQ_TLB_FLUSH + + KVM's common MMU notifier may need to flush all of a guest's TLB + entries, calling kvm_flush_remote_tlbs() to do so. Architectures that + choose to use the common kvm_flush_remote_tlbs() implementation will + need to handle this VCPU request. + +KVM_REQ_MMU_RELOAD + + When shadow page tables are used and memory slots are removed it's + necessary to inform each VCPU to completely refresh the tables. This + request is used for that. + +KVM_REQ_PENDING_TIMER + + This request may be made from a timer handler run on the host on behalf + of a VCPU. It informs the VCPU thread to inject a timer interrupt. + +KVM_REQ_UNHALT + + This request may be made from the KVM common function kvm_vcpu_block(), + which is used to emulate an instruction that causes a CPU to halt until + one of an architecture-specific set of events and/or interrupts is + received (determined by checking kvm_arch_vcpu_runnable()). When that + event or interrupt arrives kvm_vcpu_block() makes the request. This is + in contrast to when kvm_vcpu_block() returns due to any other reason, + such as a pending signal, which does not indicate the VCPU's halt + emulation should stop, and therefore does not make the request. + +KVM_REQUEST_MASK +---------------- + +VCPU requests should be masked by KVM_REQUEST_MASK before using them with +bitops. This is because only the lower 8 bits are used to represent the +request's number. The upper bits are used as flags. Currently only two +flags are defined. + +VCPU Request Flags +------------------ + +KVM_REQUEST_NO_WAKEUP + + This flag is applied to requests that only need immediate attention + from VCPUs running in guest mode. That is, sleeping VCPUs do not need + to be awakened for these requests.
Sleeping VCPUs will handle the + requests when they are awakened later for some other reason. + +KVM_REQUEST_WAIT + + When requests with this flag are made with kvm_make_all_cpus_request(), + then the caller will wait for each VCPU to acknowledge its IPI before + proceeding. This flag only applies to VCPUs that would receive IPIs. + If, for example, the VCPU is sleeping, so no IPI is necessary, then + the requesting thread does not wait. This means that this flag may be + safely combined with KVM_REQUEST_NO_WAKEUP. See "Waiting for + Acknowledgements" for more information about requests with + KVM_REQUEST_WAIT. + +VCPU Requests with Associated State +=================================== + +Requesters that want the receiving VCPU to handle new state need to ensure +the newly written state is observable to the receiving VCPU thread's CPU +by the time it observes the request. This means a write memory barrier +must be inserted after writing the new state and before setting the VCPU +request bit. Additionally, on the receiving VCPU thread's side, a +corresponding read barrier must be inserted after reading the request bit +and before proceeding to read the new state associated with it. See +scenario 3, Message and Flag, of [lwn-mb]_ and the kernel documentation +[memory-barriers]_. + +The pair of functions, kvm_check_request() and kvm_make_request(), provide +the memory barriers, allowing this requirement to be handled internally by +the API. + +Ensuring Requests Are Seen +========================== + +When making requests to VCPUs, we want to avoid the receiving VCPU +executing in guest mode for an arbitrarily long time without handling the +request. We can be sure this won't happen as long as we ensure the VCPU +thread checks kvm_request_pending() before entering guest mode and that a +kick will send an IPI to force an exit from guest mode when necessary.
+Extra care must be taken to cover the period after the VCPU thread's last +kvm_request_pending() check and before it has entered guest mode, as kick +IPIs will only trigger guest mode exits for VCPU threads that are in guest +mode or at least have already disabled interrupts in order to prepare to +enter guest mode. This means that an optimized implementation (see "IPI +Reduction") must be certain when it's safe to not send the IPI. One +solution, which all architectures except s390 apply, is to: + +- set ``vcpu->mode`` to IN_GUEST_MODE between disabling the interrupts and + the last kvm_request_pending() check; +- enable interrupts atomically when entering the guest. + +This solution also requires memory barriers to be placed carefully in both +the requesting thread and the receiving VCPU. With the memory barriers we +can exclude the possibility of a VCPU thread observing +!kvm_request_pending() on its last check and then not receiving an IPI for +the next request made of it, even if the request is made immediately after +the check. This is done by way of the Dekker memory barrier pattern +(scenario 10 of [lwn-mb]_). As the Dekker pattern requires two variables, +this solution pairs ``vcpu->mode`` with ``vcpu->requests``. Substituting +them into the pattern gives:: + + CPU1 CPU2 + ================= ================= + local_irq_disable(); + WRITE_ONCE(vcpu->mode, IN_GUEST_MODE); kvm_make_request(REQ, vcpu); + smp_mb(); smp_mb(); + if (kvm_request_pending(vcpu)) { if (READ_ONCE(vcpu->mode) == + IN_GUEST_MODE) { + ...abort guest entry... ...send IPI... + } } + +As stated above, the IPI is only useful for VCPU threads in guest mode or +that have already disabled interrupts. This is why this specific case of +the Dekker pattern has been extended to disable interrupts before setting +``vcpu->mode`` to IN_GUEST_MODE. 
WRITE_ONCE() and READ_ONCE() are used to +pedantically implement the memory barrier pattern, guaranteeing the +compiler doesn't interfere with ``vcpu->mode``'s carefully planned +accesses. + +IPI Reduction +------------- + +As only one IPI is needed to get a VCPU to check for any/all requests, +then they may be coalesced. This is easily done by having the first IPI +sending kick also change the VCPU mode to something !IN_GUEST_MODE. The +transitional state, EXITING_GUEST_MODE, is used for this purpose. + +Waiting for Acknowledgements +---------------------------- + +Some requests, those with the KVM_REQUEST_WAIT flag set, require IPIs to +be sent, and the acknowledgements to be waited upon, even when the target +VCPU threads are in modes other than IN_GUEST_MODE. For example, one case +is when a target VCPU thread is in READING_SHADOW_PAGE_TABLES mode, which +is set after disabling interrupts. To support these cases, the +KVM_REQUEST_WAIT flag changes the condition for sending an IPI from +checking that the VCPU is IN_GUEST_MODE to checking that it is not +OUTSIDE_GUEST_MODE. + +Request-less VCPU Kicks +----------------------- + +As the determination of whether or not to send an IPI depends on the +two-variable Dekker memory barrier pattern, then it's clear that +request-less VCPU kicks are almost never correct. Without the assurance +that a non-IPI generating kick will still result in an action by the +receiving VCPU, as the final kvm_request_pending() check does for +request-accompanying kicks, then the kick may not do anything useful at +all. If, for instance, a request-less kick was made to a VCPU that was +just about to set its mode to IN_GUEST_MODE, meaning no IPI is sent, then +the VCPU thread may continue its entry without actually having done +whatever it was the kick was meant to initiate. + +One exception is x86's posted interrupt mechanism. 
In this case, however, +even the request-less VCPU kick is coupled with the same +local_irq_disable() + smp_mb() pattern described above; the ON bit +(Outstanding Notification) in the posted interrupt descriptor takes the +role of ``vcpu->requests``. When sending a posted interrupt, PIR.ON is +set before reading ``vcpu->mode``; dually, in the VCPU thread, +vmx_sync_pir_to_irr() reads PIR after setting ``vcpu->mode`` to +IN_GUEST_MODE. + +Additional Considerations +========================= + +Sleeping VCPUs +-------------- + +VCPU threads may need to consider requests before and/or after calling +functions that may put them to sleep, e.g. kvm_vcpu_block(). Whether they +do or not, and, if they do, which requests need consideration, is +architecture dependent. kvm_vcpu_block() calls kvm_arch_vcpu_runnable() +to check if it should awaken. One reason to do so is to provide +architectures a function where requests may be checked if necessary. + +Clearing Requests +----------------- + +Generally it only makes sense for the receiving VCPU thread to clear a +request. However, in some circumstances, such as when the requesting +thread and the receiving VCPU thread are executed serially, such as when +they are the same thread, or when they are using some form of concurrency +control to temporarily execute synchronously, then it's possible to know +that the request may be cleared immediately, rather than waiting for the +receiving VCPU thread to handle the request in VCPU RUN. The only current +examples of this are kvm_vcpu_block() calls made by VCPUs to block +themselves. A possible side-effect of that call is to make the +KVM_REQ_UNHALT request, which may then be cleared immediately when the +VCPU returns from the call. + +References +========== + +.. [atomic-ops] Documentation/core-api/atomic_ops.rst +.. [memory-barriers] Documentation/memory-barriers.txt +.. 
[lwn-mb] https://lwn.net/Articles/573436/ diff --git a/Documentation/virt/paravirt_ops.rst b/Documentation/virt/paravirt_ops.rst new file mode 100644 index 000000000000..6b789d27cead --- /dev/null +++ b/Documentation/virt/paravirt_ops.rst @@ -0,0 +1,35 @@ +.. SPDX-License-Identifier: GPL-2.0 + +============ +Paravirt_ops +============ + +Linux provides support for different hypervisor virtualization technologies. +Historically, different binary kernels were required in order to support +different hypervisors; this restriction was removed with pv_ops. +Linux pv_ops is a virtualization API which enables support for different +hypervisors. It allows each hypervisor to override critical operations and +allows a single kernel binary to run on all supported execution environments +including native machine -- without any hypervisors. + +pv_ops provides a set of function pointers which represent operations +corresponding to low level critical instructions and high level +functionalities in various areas. pv_ops allows for optimizations at run +time by enabling binary patching of the low level critical operations +at boot time. + +pv_ops operations are classified into three categories: + +- simple indirect call + These operations correspond to high level functionality where it is + known that the overhead of indirect call isn't very important. + +- indirect call which allows optimization with binary patch + Usually these operations correspond to low level critical instructions. They + are called frequently and are performance critical. The overhead is + very important. + +- a set of macros for hand written assembly code + Hand written assembly code (.S files) also needs paravirtualization + because it includes sensitive instructions or some of the code paths in + it are very performance critical.
diff --git a/Documentation/virt/uml/UserModeLinux-HOWTO.txt b/Documentation/virt/uml/UserModeLinux-HOWTO.txt new file mode 100644 index 000000000000..87b80f589e1c --- /dev/null +++ b/Documentation/virt/uml/UserModeLinux-HOWTO.txt @@ -0,0 +1,4589 @@ + User Mode Linux HOWTO + User Mode Linux Core Team + Mon Nov 18 14:16:16 EST 2002 + + This document describes the use and abuse of Jeff Dike's User Mode + Linux: a port of the Linux kernel as a normal Intel Linux process. + ______________________________________________________________________ + + Table of Contents + + 1. Introduction + + 1.1 How is User Mode Linux Different? + 1.2 Why Would I Want User Mode Linux? + + 2. Compiling the kernel and modules + + 2.1 Compiling the kernel + 2.2 Compiling and installing kernel modules + 2.3 Compiling and installing uml_utilities + + 3. Running UML and logging in + + 3.1 Running UML + 3.2 Logging in + 3.3 Examples + + 4. UML on 2G/2G hosts + + 4.1 Introduction + 4.2 The problem + 4.3 The solution + + 5. Setting up serial lines and consoles + + 5.1 Specifying the device + 5.2 Specifying the channel + 5.3 Examples + + 6. Setting up the network + + 6.1 General setup + 6.2 Userspace daemons + 6.3 Specifying ethernet addresses + 6.4 UML interface setup + 6.5 Multicast + 6.6 TUN/TAP with the uml_net helper + 6.7 TUN/TAP with a preconfigured tap device + 6.8 Ethertap + 6.9 The switch daemon + 6.10 Slip + 6.11 Slirp + 6.12 pcap + 6.13 Setting up the host yourself + + 7. Sharing Filesystems between Virtual Machines + + 7.1 A warning + 7.2 Using layered block devices + 7.3 Note! + 7.4 Another warning + 7.5 uml_moo : Merging a COW file with its backing file + + 8. Creating filesystems + + 8.1 Create the filesystem file + 8.2 Assign the file to a UML device + 8.3 Creating and mounting the filesystem + + 9. Host file access + + 9.1 Using hostfs + 9.2 hostfs as the root filesystem + 9.3 Building hostfs + + 10. 
The Management Console + 10.1 version + 10.2 halt and reboot + 10.3 config + 10.4 remove + 10.5 sysrq + 10.6 help + 10.7 cad + 10.8 stop + 10.9 go + + 11. Kernel debugging + + 11.1 Starting the kernel under gdb + 11.2 Examining sleeping processes + 11.3 Running ddd on UML + 11.4 Debugging modules + 11.5 Attaching gdb to the kernel + 11.6 Using alternate debuggers + + 12. Kernel debugging examples + + 12.1 The case of the hung fsck + 12.2 Episode 2: The case of the hung fsck + + 13. What to do when UML doesn't work + + 13.1 Strange compilation errors when you build from source + 13.2 (obsolete) + 13.3 A variety of panics and hangs with /tmp on a reiserfs filesystem + 13.4 The compile fails with errors about conflicting types for 'open', 'dup', and 'waitpid' + 13.5 UML doesn't work when /tmp is an NFS filesystem + 13.6 UML hangs on boot when compiled with gprof support + 13.7 syslogd dies with a SIGTERM on startup + 13.8 TUN/TAP networking doesn't work on a 2.4 host + 13.9 You can network to the host but not to other machines on the net + 13.10 I have no root and I want to scream + 13.11 UML build conflict between ptrace.h and ucontext.h + 13.12 The UML BogoMips is exactly half the host's BogoMips + 13.13 When you run UML, it immediately segfaults + 13.14 xterms appear, then immediately disappear + 13.15 Any other panic, hang, or strange behavior + + 14. Diagnosing Problems + + 14.1 Case 1 : Normal kernel panics + 14.2 Case 2 : Tracing thread panics + 14.3 Case 3 : Tracing thread panics caused by other threads + 14.4 Case 4 : Hangs + + 15. Thanks + + 15.1 Code and Documentation + 15.2 Flushing out bugs + 15.3 Buglets and clean-ups + 15.4 Case Studies + 15.5 Other contributions + + + ______________________________________________________________________ + + 1. Introduction + + Welcome to User Mode Linux. It's going to be fun. + + + + 1.1. How is User Mode Linux Different? 
+ + Normally, the Linux Kernel talks straight to your hardware (video + card, keyboard, hard drives, etc), and any programs which run ask the + kernel to operate the hardware, like so: + + + + +-----------+-----------+----+ + | Process 1 | Process 2 | ...| + +-----------+-----------+----+ + | Linux Kernel | + +----------------------------+ + | Hardware | + +----------------------------+ + + + + + The User Mode Linux Kernel is different; instead of talking to the + hardware, it talks to a `real' Linux kernel (called the `host kernel' + from now on), like any other program. Programs can then run inside + User-Mode Linux as if they were running under a normal kernel, like + so: + + + + +----------------+ + | Process 2 | ...| + +-----------+----------------+ + | Process 1 | User-Mode Linux| + +----------------------------+ + | Linux Kernel | + +----------------------------+ + | Hardware | + +----------------------------+ + + + + + + 1.2. Why Would I Want User Mode Linux? + + + 1. If User Mode Linux crashes, your host kernel is still fine. + + 2. You can run a usermode kernel as a non-root user. + + 3. You can debug the User Mode Linux like any normal process. + + 4. You can run gprof (profiling) and gcov (coverage testing). + + 5. You can play with your kernel without breaking things. + + 6. You can use it as a sandbox for testing new apps. + + 7. You can try new development kernels safely. + + 8. You can run different distributions simultaneously. + + 9. It's extremely fun. + + + + + + 2. Compiling the kernel and modules + + + + + 2.1. Compiling the kernel + + + Compiling the user mode kernel is just like compiling any other + kernel. Let's go through the steps, using 2.4.0-prerelease (current + as of this writing) as an example: + + + 1. Download the latest UML patch from + + the download page + . + + + 3. Make a directory and unpack the kernel into it. 
+ + + + host% + mkdir ~/uml + + + + + + + host% + cd ~/uml + + + + + + + host% + tar -xjvf linux-2.4.0-prerelease.tar.bz2 + + + + + + + 4. Apply the patch using + + + + host% + cd ~/uml/linux + + + + host% + bzcat uml-patch-2.4.0-prerelease.bz2 | patch -p1 + + + + + + + 5. Run your favorite config; `make xconfig ARCH=um' is the most + convenient. `make config ARCH=um' and 'make menuconfig ARCH=um' + will work as well. The defaults will give you a useful kernel. If + you want to change something, go ahead, it probably won't hurt + anything. + + + Note: If the host is configured with a 2G/2G address space split + rather than the usual 3G/1G split, then the packaged UML binaries + will not run. They will immediately segfault. See ``UML on 2G/2G + hosts'' for the scoop on running UML on your system. + + + + 6. Finish with `make linux ARCH=um': the result is a file called + `linux' in the top directory of your source tree. + + Make sure that you don't build this kernel in /usr/src/linux. On some + distributions, /usr/include/asm is a link into this pool. The user- + mode build changes the other end of that link, and things that include + asm headers stop compiling. + + The sources are also available from cvs at the project's cvs page, + which has directions on getting the sources. You can also browse the + CVS pool from there. + + If you get the CVS sources, you will have to check them out into an + empty directory. You will then have to copy each file into the + corresponding directory in the appropriate kernel pool. + + If you don't have the latest kernel pool, you can get the + corresponding user-mode sources with + + + host% cvs co -r v_2_3_x linux + + + + + where 'x' is the version in your pool. Note that you will not get the + bug fixes and enhancements that have gone into subsequent releases. + + + 2.2.
Compiling and installing kernel modules + + UML modules are built in the same way as the native kernel (with the + exception of the 'ARCH=um' that you always need for UML): + + + host% make modules ARCH=um + + + + + Any modules that you want to load into this kernel need to be built in + the user-mode pool. Modules from the native kernel won't work. + + You can install them by using ftp or something to copy them into the + virtual machine and dropping them into /lib/modules/`uname -r`. + + You can also get the kernel build process to install them as follows: + + 1. with the kernel not booted, mount the root filesystem in the top + level of the kernel pool: + + + host% mount root_fs mnt -o loop + + + + + + + 2. run + + + host% + make modules_install INSTALL_MOD_PATH=`pwd`/mnt ARCH=um + + + + + + + 3. unmount the filesystem + + + host% umount mnt + + + + + + + 4. boot the kernel on it + + + When the system is booted, you can use insmod as usual to get the + modules into the kernel. A number of things have been loaded into UML + as modules, especially filesystems and network protocols and filters, + so most symbols which need to be exported probably already are. + However, if you do find symbols that need exporting, let us + know, and + they'll be "taken care of". + + + + 2.3. Compiling and installing uml_utilities + + Many features of the UML kernel require a user-space helper program, + so a uml_utilities package is distributed separately from the kernel + patch which provides these helpers. 
Included within this is: + + o port-helper - Used by consoles which connect to xterms or ports + + o tunctl - Configuration tool to create and delete tap devices + + o uml_net - Setuid binary for automatic tap device configuration + + o uml_switch - User-space virtual switch required for daemon + transport + + The uml_utilities tree is compiled with: + + + host# + make && make install + + + + + Note that UML kernel patches may require a specific version of the + uml_utilities distribution. If you don't keep up with the mailing + lists, ensure that you have the latest release of uml_utilities if you + are experiencing problems with your UML kernel, particularly when + dealing with consoles or command-line switches to the helper programs + + + + + + + + + 3. Running UML and logging in + + + + 3.1. Running UML + + It runs on 2.2.15 or later, and all 2.4 kernels. + + + Booting UML is straightforward. Simply run 'linux': it will try to + mount the file `root_fs' in the current directory. You do not need to + run it as root. If your root filesystem is not named `root_fs', then + you need to put a `ubd0=root_fs_whatever' switch on the linux command + line. + + + You will need a filesystem to boot UML from. There are a number + available for download from here . There are also several tools + which can be + used to generate UML-compatible filesystem images from media. + The kernel will boot up and present you with a login prompt. + + + Note: If the host is configured with a 2G/2G address space split + rather than the usual 3G/1G split, then the packaged UML binaries will + not run. They will immediately segfault. See ``UML on 2G/2G hosts'' + for the scoop on running UML on your system. + + + + 3.2. Logging in + + + + The prepackaged filesystems have a root account with password 'root' + and a user account with password 'user'. The login banner will + generally tell you how to log in. So, you log in and you will find + yourself inside a little virtual machine. 
Our filesystems have a + variety of commands and utilities installed (and it is fairly easy to + add more), so you will have a lot of tools with which to poke around + the system. + + There are a couple of other ways to log in: + + o On a virtual console + + + + Each virtual console that is configured (i.e. the device exists in + /dev and /etc/inittab runs a getty on it) will come up in its own + xterm. If you get tired of the xterms, read ``Setting up serial + lines and consoles'' to see how to attach the consoles to + something else, like host ptys. + + + + o Over the serial line + + + In the boot output, find a line that looks like: + + + + serial line 0 assigned pty /dev/ptyp1 + + + + + Attach your favorite terminal program to the corresponding tty. E.g., + for minicom, the command would be + + + host% minicom -o -p /dev/ttyp1 + + + + + + + o Over the net + + + If the network is running, then you can telnet to the virtual + machine and log in to it. See ``Setting up the network'' to learn + about setting up a virtual network. + + When you're done using it, run halt, and the kernel will bring itself + down and the process will exit. + + + 3.3. Examples + + Here are some examples of UML in action: + + o A login session + + o A virtual network + + + + + + + + 4. UML on 2G/2G hosts + + + + + 4.1. Introduction + + + Most Linux machines are configured so that the kernel occupies the + upper 1G (0xc0000000 - 0xffffffff) of the 4G address space and + processes use the lower 3G (0x00000000 - 0xbfffffff). However, some + machines are configured with a 2G/2G split, with the kernel occupying + the upper 2G (0x80000000 - 0xffffffff) and processes using the lower + 2G (0x00000000 - 0x7fffffff). + + + + + 4.2. The problem + + + The prebuilt UML binaries on this site will not run on 2G/2G hosts + because UML occupies the upper .5G of the 3G process address space + (0xa0000000 - 0xbfffffff).
Obviously, on 2G/2G hosts, this is right + in the middle of the kernel address space, so UML won't even load - it + will immediately segfault. + + + + + 4.3. The solution + + + The fix for this is to rebuild UML from source after enabling + CONFIG_HOST_2G_2G (under 'General Setup'). This will cause UML to + load itself in the top .5G of that smaller process address space, + where it will run fine. See ``Compiling the kernel and modules'' if + you need help building UML from source. + + + + + + + + + + + 5. Setting up serial lines and consoles + + + It is possible to attach UML serial lines and consoles to many types + of host I/O channels by specifying them on the command line. + + + You can attach them to host ptys, ttys, file descriptors, and ports. + This allows you to do things like + + o have a UML console appear on an unused host console, + + o hook two virtual machines together by having one attach to a pty + and having the other attach to the corresponding tty + + o make a virtual machine accessible from the net by attaching a + console to a port on the host. + + + The general format of the command line option is device=channel. + + + + 5.1. Specifying the device + + Devices are specified with "con" or "ssl" (console or serial line, + respectively), optionally with a device number if you are talking + about a specific device. + + + Using just "con" or "ssl" describes all of the consoles or serial + lines. If you want to talk about console #3 or serial line #10, they + would be "con3" and "ssl10", respectively. + + + A specific device name will override a less general "con=" or "ssl=". + So, for example, you can assign a pty to each of the serial lines + except for the first two like this: + + + ssl=pty ssl0=tty:/dev/tty0 ssl1=tty:/dev/tty1 + + + + + The specificity of the device name is all that matters; order on the + command line is irrelevant. + + + + 5.2. 
Specifying the channel + + There are a number of different types of channels to attach a UML + device to, each with a different way of specifying exactly what to + attach to. + + o pseudo-terminals - device=pty pts terminals - device=pts + + + This will cause UML to allocate a free host pseudo-terminal for the + device. The terminal that it got will be announced in the boot + log. You access it by attaching a terminal program to the + corresponding tty: + + o screen /dev/pts/n + + o screen /dev/ttyxx + + o minicom -o -p /dev/ttyxx - minicom seems not able to handle pts + devices + + o kermit - start it up, 'open' the device, then 'connect' + + + + + + o terminals - device=tty:tty device file + + + This will make UML attach the device to the specified tty (i.e + + + con1=tty:/dev/tty3 + + + + + will attach UML's console 1 to the host's /dev/tty3). If the tty that + you specify is the slave end of a tty/pty pair, something else must + have already opened the corresponding pty in order for this to work. + + + + + + o xterms - device=xterm + + + UML will run an xterm and the device will be attached to it. + + + + + + o Port - device=port:port number + + + This will attach the UML devices to the specified host port. + Attaching console 1 to the host's port 9000 would be done like + this: + + + con1=port:9000 + + + + + Attaching all the serial lines to that port would be done similarly: + + + ssl=port:9000 + + + + + You access these devices by telnetting to that port. Each active tel- + net session gets a different device. If there are more telnets to a + port than UML devices attached to it, then the extra telnet sessions + will block until an existing telnet detaches, or until another device + becomes active (i.e. by being activated in /etc/inittab). + + This channel has the advantage that you can both attach multiple UML + devices to it and know how to access them without reading the UML boot + log. 
It is also unique in allowing access to a UML from remote + machines without requiring that the UML be networked. This could be + useful in allowing public access to UMLs because they would be + accessible from the net, but wouldn't need any kind of network + filtering or access control because they would have no network access. + + + If you attach the main console to a port, then the UML boot will + appear to hang. In reality, it's waiting for a telnet to connect, at + which point the boot will proceed. + + + + + + o already-existing file descriptors - device=fd:file descriptor + + + If you set up a file descriptor on the UML command line, you can + attach a UML device to it. This is most commonly used to put the + main console back on stdin and stdout after assigning all the other + consoles to something else: + + + con0=fd:0,fd:1 con=pts + + + + + + + + + o Nothing - device=null + + + This allows the device to be opened, in contrast to 'none', but + reads will block, and writes will succeed and the data will be + thrown out. + + + + + + o None - device=none + + + This causes the device to disappear. + + + + You can also specify different input and output channels for a device + by putting a comma between them: + + + ssl3=tty:/dev/tty2,xterm + + + + + will cause serial line 3 to accept input on the host's /dev/tty2 and + display output on an xterm. That's a silly example - the most common + use of this syntax is to reattach the main console to stdin and stdout + as shown above. + + + If you decide to move the main console away from stdin/stdout, the + initial boot output will appear in the terminal that you're running + UML in. However, once the console driver has been officially + initialized, then the boot output will start appearing wherever you + specified that console 0 should be. That device will receive all + subsequent output. + + + + 5.3. Examples + + There are a number of interesting things you can do with this + capability.
+ + + First, this is how you get rid of those bleeding console xterms by + attaching them to host ptys: + + + con=pty con0=fd:0,fd:1 + + + + + This will make a UML console take over an unused host virtual console, + so that when you switch to it, you will see the UML login prompt + rather than the host login prompt: + + + con1=tty:/dev/tty6 + + + + + You can attach two virtual machines together with what amounts to a + serial line as follows: + + Run one UML with a serial line attached to a pty - + + + ssl1=pty + + + + + Look at the boot log to see what pty it got (this example will assume + that it got /dev/ptyp1). + + Boot the other UML with a serial line attached to the corresponding + tty - + + + ssl1=tty:/dev/ttyp1 + + + + + Log in, make sure that it has no getty on that serial line, attach a + terminal program like minicom to it, and you should see the login + prompt of the other virtual machine. + + + 6. Setting up the network + + + + This page describes how to set up the various transports and to + provide a UML instance with network access to the host, other machines + on the local net, and the rest of the net. + + + As of 2.4.5, UML networking has been completely redone to make it much + easier to set up, fix bugs, and add new features. + + + There is a new helper, uml_net, which does the host setup that + requires root privileges. + + + There are currently seven transport types available for a UML virtual + machine to exchange packets with other hosts: + + o ethertap + + o TUN/TAP + + o Multicast + + o a switch daemon + + o slip + + o slirp + + o pcap + + The TUN/TAP, ethertap, slip, and slirp transports allow a UML + instance to exchange packets with the host. They may be directed + to the host or the host may just act as a router to provide access + to other physical or virtual machines. + + + The pcap transport is a synthetic read-only interface, using the + libpcap library to collect packets from interfaces on the host and + filter them.
This is useful for building preconfigured traffic + monitors or sniffers. + + + The daemon and multicast transports provide a completely virtual + network to other virtual machines. This network is completely + disconnected from the physical network unless one of the virtual + machines on it is acting as a gateway. + + + With so many host transports, which one should you use? Here's when + you should use each one: + + o ethertap - if you want access to the host networking and it is + running 2.2 + + o TUN/TAP - if you want access to the host networking and it is + running 2.4. Also, the TUN/TAP transport is able to use a + preconfigured device, allowing it to avoid using the setuid uml_net + helper, which is a security advantage. + + o Multicast - if you want a purely virtual network and you don't want + to set up anything but the UML + + o a switch daemon - if you want a purely virtual network and you + don't mind running the daemon in order to get somewhat better + performance + + o slip - there is no particular reason to run the slip backend unless + ethertap and TUN/TAP are just not available for some reason + + o slirp - if you don't have root access on the host to set up + networking, or if you don't want to allocate an IP to your UML + + o pcap - not much use for actual network connectivity, but great for + monitoring traffic on the host + + Ethertap is available on 2.4 and works fine. TUN/TAP is preferred + to it because it has better performance and ethertap is officially + considered obsolete in 2.4. Also, the root helper only needs to + run occasionally for TUN/TAP, rather than handling every packet, as + it does with ethertap. This is a slight security advantage since + it provides fewer opportunities for a nasty UML user to somehow + exploit the helper's root privileges. + + + 6.1. General setup + + First, you must have the virtual network enabled in your UML. If you are + running a prebuilt kernel from this site, everything is already + enabled.
If you build the kernel yourself, under the "Network device + support" menu, enable "Network device support", and then the three + transports. + + + The next step is to provide a network device to the virtual machine. + This is done by describing it on the kernel command line. + + The general format is + + + ethn=transport,transport args + + + + + For example, a virtual ethernet device may be attached to a host + ethertap device as follows: + + + eth0=ethertap,tap0,fe:fd:0:0:0:1,192.168.0.254 + + + + + This sets up eth0 inside the virtual machine to attach itself to the + host /dev/tap0, assigns it an ethernet address, and assigns the host + tap0 interface an IP address. + + + + Note that the IP address you assign to the host end of the tap device + must be different than the IP you assign to the eth device inside UML. + If you are short on IPs and don't want to consume two per UML, then + you can reuse the host's eth IP address for the host ends of the tap + devices. Internally, the UMLs must still get unique IPs for their eth + devices. You can also give the UMLs non-routable IPs (192.168.x.x or + 10.x.x.x) and have the host masquerade them. This will let outgoing + connections work, but incoming connections won't without more work, + such as port forwarding from the host. + Also note that when you configure the host side of an interface, it is + only acting as a gateway. It will respond to pings sent to it + locally, but it is not useful to do that since it's a host interface. + You are not talking to the UML when you ping that interface and get a + response. + + + You can also add devices to a UML and remove them at runtime. See the + ``The Management Console'' page for details. + + + The sections below describe this in more detail. + + + Once you've decided how you're going to set up the devices, you boot + UML, log in, configure the UML side of the devices, and set up routes + to the outside world.
At that point, you will be able to talk to any + other machines, physical or virtual, on the net. + + + If ifconfig inside UML fails and the network refuses to come up, run + dmesg inside UML; the kernel log will usually tell you what went wrong. + + + + 6.2. Userspace daemons + + You will likely need the setuid helper, or the switch daemon, or both. + They are both installed with the RPM and deb, so if you've installed + either, you can skip the rest of this section. + + + If not, then you need to check them out of CVS, build them, and + install them. The helper is uml_net, in CVS /tools/uml_net, and the + daemon is uml_switch, in CVS /tools/uml_router. They are both built + with a plain 'make'. Both need to be installed in a directory that's + in your path - /usr/bin is recommended. On top of that, uml_net needs + to be setuid root. + + + + 6.3. Specifying ethernet addresses + + Below, you will see that the TUN/TAP, ethertap, and daemon interfaces + allow you to specify hardware addresses for the virtual ethernet + devices. This is generally not necessary. If you don't have a + specific reason to do it, you probably shouldn't. If one is not + specified on the command line, the driver will assign one based on the + device IP address. It will provide the address fe:fd:nn:nn:nn:nn + where nn.nn.nn.nn is the device IP address. This is nearly always + sufficient to guarantee a unique hardware address for the device. A + couple of exceptions are: + + o Another set of virtual ethernet devices are on the same network and + they are assigned hardware addresses using a different scheme which + may conflict with the UML IP address-based scheme + + o You aren't going to use the device for IP networking, so you don't + assign the device an IP address + + If you let the driver provide the hardware address, you should make + sure that the device IP address is known before the interface is + brought up.
So, inside UML, this will guarantee that: + + + + UML# + ifconfig eth0 192.168.0.250 up + + + + + If you decide to assign the hardware address yourself, make sure that + the first byte of the address is even. Addresses with an odd first + byte are broadcast addresses, which you don't want assigned to a + device. + + + + 6.4. UML interface setup + + Once the network devices have been described on the command line, you + should boot UML and log in. + + + The first thing to do is bring the interface up: + + + UML# ifconfig ethn ip-address up + + + + + You should be able to ping the host at this point. + + + To reach the rest of the world, you should set a default route to the + host: + + + UML# route add default gw host ip + + + + + Again, with host ip of 192.168.0.4: + + + UML# route add default gw 192.168.0.4 + + + + + This page used to recommend setting a network route to your local net. + This is wrong, because it will cause UML to try to figure out hardware + addresses of the local machines by arping on the interface to the + host. Since that interface is basically a single strand of ethernet + with two nodes on it (UML and the host) and arp requests don't cross + networks, they will fail to elicit any responses. So, what you want + is for UML to just blindly throw all packets at the host and let it + figure out what to do with them, which is what leaving out the network + route and adding the default route does. + + + Note: If you can't communicate with other hosts on your physical + ethernet, it's probably because of a network route that's + automatically set up. 
If you run 'route -n' and see a route that + looks like this: + + + + + Destination Gateway Genmask Flags Metric Ref Use Iface + 192.168.0.0 0.0.0.0 255.255.255.0 U 0 0 0 eth0 + + + + + with a mask that's not 255.255.255.255, then replace it with a route + to your host: + + + UML# + route del -net 192.168.0.0 dev eth0 netmask 255.255.255.0 + + + + + + + UML# + route add -host 192.168.0.4 dev eth0 + + + + + This, plus the default route to the host, will allow UML to exchange + packets with any machine on your ethernet. + + + + 6.5. Multicast + + The simplest way to set up a virtual network between multiple UMLs is + to use the mcast transport. This was written by Harald Welte and is + present in UML version 2.4.5-5um and later. Your system must have + multicast enabled in the kernel and there must be a multicast-capable + network device on the host. Normally, this is eth0, but if there is + no ethernet card on the host, then you will likely get strange error + messages when you bring the device up inside UML. + + + To use it, run two UMLs with + + + eth0=mcast + + + + + on their command lines. Log in, configure the ethernet device in each + machine with different IP addresses: + + + UML1# ifconfig eth0 192.168.0.254 + + + + + + + UML2# ifconfig eth0 192.168.0.253 + + + + + and they should be able to talk to each other. + + The full set of command line options for this transport are + + + + ethn=mcast,ethernet address,multicast + address,multicast port,ttl + + + + + Harald's original README is here and explains these in detail, as well as + some other issues. + + There is also a related point-to-point only "ucast" transport. + This is useful when your network does not support multicast, and + all network connections are simple point to point links. + + The full set of command line options for this transport are + + + ethn=ucast,ethernet address,remote address,listen port,remote port + + + + + 6.6. 
TUN/TAP with the uml_net helper + + TUN/TAP is the preferred mechanism on 2.4 to exchange packets with the + host. The TUN/TAP backend has been in UML since 2.4.9-3um. + + + The easiest way to get up and running is to let the setuid uml_net + helper do the host setup for you. This involves insmod-ing the tun.o + module if necessary, configuring the device, and setting up IP + forwarding, routing, and proxy arp. If you are new to UML networking, + do this first. If you're concerned about the security implications of + the setuid helper, use it to get up and running, then read the next + section to see how to have UML use a preconfigured tap device, which + avoids the use of uml_net. + + + If you specify an IP address for the host side of the device, the + uml_net helper will do all necessary setup on the host - the only + requirement is that TUN/TAP be available, either built in to the host + kernel or as the tun.o module. + + The format of the command line switch to attach a device to a TUN/TAP + device is + + + ethn=tuntap,,,IP address + + + + + For example, this argument will attach the UML's eth0 to the next + available tap device and assign an ethernet address to it based on its + IP address: + + + eth0=tuntap,,,192.168.0.254 + + + + + + + Note that the IP address that must be used for the eth device inside + UML is fixed by the routing and proxy arp that is set up on the + TUN/TAP device on the host. You can use a different one, but it won't + work because reply packets won't reach the UML. This is a feature. + It prevents a nasty UML user from doing things like setting the UML IP + to the same as the network's nameserver or mail server. + + + There are a couple of potential problems with running the TUN/TAP + transport on a 2.4 host kernel: + + o TUN/TAP seems not to work on 2.4.3 and earlier. Upgrade the host + kernel or use the ethertap transport.
+ + o With an upgraded kernel, TUN/TAP may fail with + + + File descriptor in bad state + + + + + This is due to a header mismatch between the upgraded kernel and the + kernel that was originally installed on the machine. The fix is to + make sure that /usr/src/linux points to the headers for the running + kernel. + + These were pointed out by Tim Robinson in + a post to the uml-user + mailing list. + + + + 6.7. TUN/TAP with a preconfigured tap device + + If you prefer not to have UML use uml_net (which is somewhat + insecure), with UML 2.4.17-11, you can set up a TUN/TAP device + beforehand. The setup needs to be done as root, but once that's done, + there is no need for root assistance. Setting up the device is done + as follows: + + o Create the device with tunctl (available from the UML utilities + tarball) + + + + + host# tunctl -u uid + + + + + where uid is the user id or username that UML will be run as. This + will tell you what device was created. + + o Configure the device IP (change IP addresses and device name to + suit) + + + + + host# ifconfig tap0 192.168.0.254 up + + + + + + o Set up routing and arping if desired - this is my recipe, there are + other ways of doing the same thing + + + host# + bash -c 'echo 1 > /proc/sys/net/ipv4/ip_forward' + + host# + route add -host 192.168.0.253 dev tap0 + + + + + + + host# + bash -c 'echo 1 > /proc/sys/net/ipv4/conf/tap0/proxy_arp' + + + + + + + host# + arp -Ds 192.168.0.253 eth0 pub + + + + + Note that this must be done every time the host boots - this configu- + ration is not stored across host reboots. So, it's probably a good + idea to stick it in an rc file. An even better idea would be a little + utility which reads the information from a config file and sets up + devices at boot time. + + o Rather than using up two IPs and ARPing for one of them, you can + also provide direct access to your LAN by the UML by using a + bridge.
+ + + host# + brctl addbr br0 + + + + + + + host# + ifconfig eth0 0.0.0.0 promisc up + + + + + + + host# + ifconfig tap0 0.0.0.0 promisc up + + + + + + + host# + ifconfig br0 192.168.0.1 netmask 255.255.255.0 up + + + + + + + + host# + brctl stp br0 off + + + + + + + host# + brctl setfd br0 1 + + + + + + + host# + brctl sethello br0 1 + + + + + + + host# + brctl addif br0 eth0 + + + + + + + host# + brctl addif br0 tap0 + + + + + Note that 'br0' should be set up using ifconfig with the existing IP + address of eth0, as eth0 no longer has its own IP. + + o + + + Also, the /dev/net/tun device must be writable by the user running + UML in order for the UML to use the device that's been configured + for it. The simplest thing to do is + + + host# chmod 666 /dev/net/tun + + + + + Making it world-writable looks bad, but it seems not to be + exploitable as a security hole. However, it does allow anyone to cre- + ate useless tap devices (useless because they can't configure them), + which is a DoS attack. A somewhat more secure alternative would be + to create a group containing all the users who have preconfigured tap + devices and chgrp /dev/net/tun to that group with mode 664 or 660. + + + o Once the device is set up, run UML with 'eth0=tuntap,device name' + (e.g. 'eth0=tuntap,tap0') on the command line (or do it with the + mconsole config command). + + o Bring the eth device up in UML and you're in business. + + If you don't want that tap device any more, you can make it non- + persistent with + + + host# tunctl -d tap device + + + + + Finally, tunctl has a -b (for brief mode) switch which causes it to + output only the name of the tap device it created. This makes it + suitable for capture by a script: + + + host# TAP=`tunctl -u 1000 -b` + + + + + + + 6.8. Ethertap + + Ethertap is the general mechanism on 2.2 for userspace processes to + exchange packets with the kernel.
+ + + + To use this transport, you need to describe the virtual network device + on the UML command line. The general format for this is + + + eth<n>=ethertap,<device>,<ethernet address>,<tap IP address> + + + + + So, the previous example + + + eth0=ethertap,tap0,fe:fd:0:0:0:1,192.168.0.254 + + + + + attaches the UML eth0 device to the host /dev/tap0, assigns it the + ethernet address fe:fd:0:0:0:1, and assigns the IP address + 192.168.0.254 to the tap device. + + + + The tap device is mandatory, but the others are optional. If the + ethernet address is omitted, one will be assigned to it. + + + The presence of the tap IP address will cause the helper to run and do + whatever host setup is needed to allow the virtual machine to + communicate with the outside world. If you're not sure you know what + you're doing, this is the way to go. + + + If it is absent, then you must configure the tap device and whatever + arping and routing you will need on the host. However, even in this + case, the uml_net helper still needs to be in your path and it must be + setuid root if you're not running UML as root. This is because the + tap device doesn't support SIGIO, which UML needs in order to use + something as a source of input. So, the helper is used as a + convenient asynchronous IO thread. + + If you're using the uml_net helper, you can ignore the following host + setup - uml_net will do it for you. You just need to make sure you + have ethertap available, either built in to the host kernel or + available as a module. + + + If you want to set things up yourself, you need to make sure that the + appropriate /dev entry exists. If it doesn't, become root and create + it as follows: + + + mknod /dev/tap<minor> c 36 <minor> + 16 + + + + + For example, this is how to create /dev/tap0: + + + mknod /dev/tap0 c 36 0 + 16 + + + + + You also need to make sure that the host kernel has ethertap support. + If ethertap is enabled as a module, you apparently need to insmod + ethertap once for each ethertap device you want to enable.
So, + + + host# + insmod ethertap + + + + + will give you the tap0 interface. To get the tap1 interface, you need + to run + + + host# + insmod ethertap unit=1 -o ethertap1 + + + + + + + + 6.9. The switch daemon + + Note: This is the daemon formerly known as uml_router, but which was + renamed so the network weenies of the world would stop growling at me. + + + The switch daemon, uml_switch, provides a mechanism for creating a + totally virtual network. By default, it provides no connection to the + host network (but see -tap, below). + + + The first thing you need to do is run the daemon. Running it with no + arguments will make it listen on a default pair of unix domain + sockets. + + + If you want it to listen on a different pair of sockets, use + + + -unix control socket data socket + + + + + + If you want it to act as a hub rather than a switch, use + + + -hub + + + + + + If you want the switch to be connected to host networking (allowing + the umls to get access to the outside world through the host), use + + + -tap tap0 + + + + + + Note that the tap device must be preconfigured (see "TUN/TAP with a + preconfigured tap device", above). If you're using a different tap + device than tap0, specify that instead of tap0. + + + uml_switch can be backgrounded as follows + + + host% + uml_switch [ options ] < /dev/null > /dev/null + + + + + The reason it doesn't background by default is that it listens to + stdin for EOF. When it sees that, it exits. + + + The general format of the kernel command line switch is + + + + ethn=daemon,ethernet address,socket + type,control socket,data socket + + + + + You can leave off everything except the 'daemon'. You only need to + specify the ethernet address if the one that will be assigned to it + isn't acceptable for some reason. The rest of the arguments describe + how to communicate with the daemon. You should only specify them if + you told the daemon to use different sockets than the default. 
So, if + you ran the daemon with no arguments, running the UML on the same + machine with + eth0=daemon + + + + + will cause the eth0 driver to attach itself to the daemon correctly. + + + + 6.10. Slip + + Slip is another, less general, mechanism for a process to communicate + with the host networking. In contrast to the ethertap interface, + which exchanges ethernet frames with the host and can be used to + transport any higher-level protocol, it can only be used to transport + IP. + + + The general format of the command line switch is + + + + ethn=slip,slip IP + + + + + The slip IP argument is the IP address that will be assigned to the + host end of the slip device. If it is specified, the helper will run + and will set up the host so that the virtual machine can reach it and + the rest of the network. + + + There are some oddities with this interface that you should be aware + of. You should only specify one slip device on a given virtual + machine, and its name inside UML will be 'umn', not 'eth0' or whatever + you specified on the command line. These problems will be fixed at + some point. + + + + 6.11. Slirp + + slirp uses an external program, usually /usr/bin/slirp, to provide IP + only networking connectivity through the host. This is similar to IP + masquerading with a firewall, although the translation is performed in + user-space, rather than by the kernel. As slirp does not set up any + interfaces on the host, or changes routing, slirp does not require + root access or setuid binaries on the host. + + + The general format of the command line switch for slirp is: + + + + ethn=slirp,ethernet address,slirp path + + + + + The ethernet address is optional, as UML will set up the interface + with an ethernet address based upon the initial IP address of the + interface. The slirp path is generally /usr/bin/slirp, although it + will depend on distribution. 
+ + + The slirp program can have a number of options passed to the command + line and we can't add them to the UML command line, as they will be + parsed incorrectly. Instead, a wrapper shell script can be written or + the options inserted into the ~/.slirprc file. More information on + all of the slirp options can be found in its man pages. + + + The eth0 interface on UML should be set up with the IP 10.0.2.15, + although you can use anything as long as it is not used by a network + you will be connecting to. The default route on UML should be set to + use + + + UML# + route add default dev eth0 + + + + + slirp provides a number of useful IP addresses which can be used by + UML, such as 10.0.2.3 which is an alias for the DNS server specified + in /etc/resolv.conf on the host or the IP given in the 'dns' option + for slirp. + + + Even with a baudrate setting higher than 115200, the slirp connection + is limited to 115200. If you need it to go faster, the slirp binary + needs to be compiled with FULL_BOLT defined in config.h. + + + + 6.12. pcap + + The pcap transport is attached to a UML ethernet device on the command + line or with uml_mconsole with the following syntax: + + + + ethn=pcap,host interface,filter + expression,option1,option2 + + + + + The expression and options are optional. + + + The interface is whatever network device on the host you want to + sniff. The expression is a pcap filter expression, which is also what + tcpdump uses, so if you know how to specify tcpdump filters, you will + use the same expressions here. The options are up to two of + 'promisc', 'nopromisc', 'optimize', and 'nooptimize'. 'promisc' and + 'nopromisc' control whether pcap puts the host interface into + promiscuous mode. 'optimize' and 'nooptimize' control whether the pcap + expression optimizer is used. + + + Example: + + + + eth0=pcap,eth0,tcp + + eth1=pcap,eth0,!tcp + + + + will cause the UML eth0 to emit all tcp packets on the host eth0 and + the UML eth1 to emit all non-tcp packets on the host eth0. + + + + 6.13.
Setting up the host yourself + + If you don't specify an address for the host side of the ethertap or + slip device, UML won't do any setup on the host. So this is what is + needed to get things working (the examples use a host-side IP of + 192.168.0.251 and a UML-side IP of 192.168.0.250 - adjust to suit your + own network): + + o The device needs to be configured with its IP address. Tap devices + are also configured with an mtu of 1484. Slip devices are + configured with a point-to-point address pointing at the UML IP + address. + + + host# ifconfig tap0 arp mtu 1484 192.168.0.251 up + + + + + + + host# + ifconfig sl0 192.168.0.251 pointopoint 192.168.0.250 up + + + + + + o If a tap device is being set up, a route is set to the UML IP. + + + host# route add -host 192.168.0.250 gw 192.168.0.251 + + + + + + o To allow other hosts on your network to see the virtual machine, + proxy arp is set up for it. + + + host# arp -Ds 192.168.0.250 eth0 pub + + + + + + o Finally, the host is set up to route packets. + + + host# echo 1 > /proc/sys/net/ipv4/ip_forward + + + + + + + + + + + 7. Sharing Filesystems between Virtual Machines + + + + + 7.1. A warning + + Don't attempt to share filesystems simply by booting two UMLs from the + same file. That's the same thing as booting two physical machines + from a shared disk. It will result in filesystem corruption. + + + + 7.2. Using layered block devices + + The way to share a filesystem between two virtual machines is to use + the copy-on-write (COW) layering capability of the ubd block driver. + As of 2.4.6-2um, the driver supports layering a read-write private + device over a read-only shared device. A machine's writes are stored + in the private device, while reads come from either device - the + private one if the requested block is valid in it, the shared one if + not.
Using this scheme, the majority of data which is unchanged is + shared between an arbitrary number of virtual machines, each of which + has a much smaller file containing the changes that it has made. With + a large number of UMLs booting from a large root filesystem, this + leads to a huge disk space saving. It will also help performance, + since the host will be able to cache the shared data using a much + smaller amount of memory, so UML disk requests will be served from the + host's memory rather than its disks. + + + + + To add a copy-on-write layer to an existing block device file, simply + add the name of the COW file to the appropriate ubd switch: + + + ubd0=root_fs_cow,root_fs_debian_22 + + + + + where 'root_fs_cow' is the private COW file and 'root_fs_debian_22' is + the existing shared filesystem. The COW file need not exist. If it + doesn't, the driver will create and initialize it. Once the COW file + has been initialized, it can be used on its own on the command line: + + + ubd0=root_fs_cow + + + + + The name of the backing file is stored in the COW file header, so it + would be redundant to continue specifying it on the command line. + + + + 7.3. Note! + + When checking the size of the COW file in order to see the gobs of + space that you're saving, make sure you use 'ls -ls' to see the actual + disk consumption rather than the length of the file. The COW file is + sparse, so the length will be very different from the disk usage. + Here is a 'ls -l' of a COW file and backing file from one boot and + shutdown: + host% ls -l cow.debian debian2.2 + -rw-r--r-- 1 jdike jdike 492504064 Aug 6 21:16 cow.debian + -rwxrw-rw- 1 jdike jdike 537919488 Aug 6 20:42 debian2.2 + + + + + Doesn't look like much saved space, does it? 
Well, here's 'ls -ls': + + + host% ls -ls cow.debian debian2.2 + 880 -rw-r--r-- 1 jdike jdike 492504064 Aug 6 21:16 cow.debian + 525832 -rwxrw-rw- 1 jdike jdike 537919488 Aug 6 20:42 debian2.2 + + + + + Now, you can see that the COW file has less than a meg of disk, rather + than 492 meg. + + + + 7.4. Another warning + + Once a filesystem is being used as a readonly backing file for a COW + file, do not boot directly from it or modify it in any way. Doing so + will invalidate any COW files that are using it. The mtime and size + of the backing file are stored in the COW file header at its creation, + and they must continue to match. If they don't, the driver will + refuse to use the COW file. + + + + + If you attempt to evade this restriction by changing either the + backing file or the COW header by hand, you will get a corrupted + filesystem. + + + + + Among other things, this means that upgrading the distribution in a + backing file and expecting that all of the COW files using it will see + the upgrade will not work. + + + + + 7.5. uml_moo : Merging a COW file with its backing file + + Depending on how you use UML and COW devices, it may be advisable to + merge the changes in the COW file into the backing file every once in + a while. + + + + + The utility that does this is uml_moo. Its usage is + + + host% uml_moo COW file new backing file + + + + + There's no need to specify the backing file since that information is + already in the COW file header. If you're paranoid, boot the new + merged file, and if you're happy with it, move it over the old backing + file. + + + + + uml_moo creates a new backing file by default as a safety measure. It + also has a destructive merge option which will merge the COW file + directly into its current backing file. This is really only usable + when the backing file only has one COW file associated with it. 
If + there are multiple COWs associated with a backing file, a -d merge of + one of them will invalidate all of the others. However, it is + convenient if you're short of disk space, and it should also be + noticeably faster than a non-destructive merge. + + + + + uml_moo is installed with the UML deb and RPM. If you didn't install + UML from one of those packages, you can also get it from the UML + utilities tar file in tools/moo. + + + + + + + + + 8. Creating filesystems + + + You may want to create and mount new UML filesystems, either because + your root filesystem isn't large enough or because you want to use a + filesystem other than ext2. + + + This was written on the occasion of reiserfs being included in the + 2.4.1 kernel pool, and therefore the 2.4.1 UML, so the examples will + talk about reiserfs. This information is generic, and the examples + should be easy to translate to the filesystem of your choice. + + + 8.1. Create the filesystem file + + dd is your friend. All you need to do is tell dd to create an empty + file of the appropriate size. I usually make it sparse to save time + and to avoid allocating disk space until it's actually used. For + example, the following command will create a sparse 100 meg file full + of zeroes. + + + host% + dd if=/dev/zero of=new_filesystem seek=100 count=1 bs=1M + + + + + + + 8.2. Assign the file to a UML device + + Add an argument like the following to the UML command line: + + ubd4=new_filesystem + + + + + making sure that you use an unassigned ubd device number. + + + + 8.3. Creating and mounting the filesystem + + Make sure that the filesystem is available, either by being built into + the kernel, or available as a module, then boot up UML and log in. If + the root filesystem doesn't have the filesystem utilities (mkfs, fsck, + etc), then get them into UML by way of the net or hostfs. 
+ + + Make the new filesystem on the device assigned to the new file: + + + host# mkreiserfs /dev/ubd/4 + + + <----------- MKREISERFSv2 -----------> + + ReiserFS version 3.6.25 + Block size 4096 bytes + Block count 25856 + Used blocks 8212 + Journal - 8192 blocks (18-8209), journal header is in block 8210 + Bitmaps: 17 + Root block 8211 + Hash function "r5" + ATTENTION: ALL DATA WILL BE LOST ON '/dev/ubd/4'! (y/n)y + journal size 8192 (from 18) + Initializing journal - 0%....20%....40%....60%....80%....100% + Syncing..done. + + + + + Now, mount it: + + + UML# + mount /dev/ubd/4 /mnt + + + + + and you're in business. + + + + + + + + + + 9. Host file access + + + If you want to access files on the host machine from inside UML, you + can treat it as a separate machine and either nfs mount directories + from the host or copy files into the virtual machine with scp or rcp. + However, since UML is running on the host, it can access those + files just like any other process and make them available inside the + virtual machine without needing to use the network. + + + This is now possible with the hostfs virtual filesystem. With it, you + can mount a host directory into the UML filesystem and access the + files contained in it just as you would on the host. + + + 9.1. Using hostfs + + To begin with, make sure that hostfs is available inside the virtual + machine with + + + UML# cat /proc/filesystems + + + + . hostfs should be listed. If it's not, either rebuild the kernel + with hostfs configured into it or make sure that hostfs is built as a + module and available inside the virtual machine, and insmod it. + + + Now all you need to do is run mount: + + + UML# mount none /mnt/host -t hostfs + + + + + will mount the host's / on the virtual machine's /mnt/host. 
+ + + If you don't want to mount the host root directory, then you can + specify a subdirectory to mount with the -o switch to mount: + + + UML# mount none /mnt/home -t hostfs -o /home + + + + + will mount the host's /home on the virtual machine's /mnt/home. + + + + 9.2. hostfs as the root filesystem + + It's possible to boot from a directory hierarchy on the host using + hostfs rather than using the standard filesystem in a file. + + To start, you need that hierarchy. The easiest way is to loop mount + an existing root_fs file: + + + host# mount root_fs uml_root_dir -o loop + + + + + You need to change the filesystem type of / in etc/fstab to be + 'hostfs', so that line looks like this: + + /dev/ubd/0 / hostfs defaults 1 1 + + + + + Then you need to chown to yourself all the files in that directory + that are owned by root. This worked for me: + + + host# find . -uid 0 -exec chown jdike {} \; + + + + + Next, make sure that your UML kernel has hostfs compiled in, not as a + module. Then run UML with the boot device pointing at that directory: + + + ubd0=/path/to/uml/root/directory + + + + + UML should then boot as it does normally. + + + 9.3. Building hostfs + + If you need to build hostfs because it's not in your kernel, you have + two choices: + + + + o Compiling hostfs into the kernel: + + + Reconfigure the kernel and set the 'Host filesystem' option under + + + o Compiling hostfs as a module: + + + Reconfigure the kernel and set the 'Host filesystem' option under + be in arch/um/fs/hostfs/hostfs.o. Install that in + /lib/modules/`uname -r`/fs in the virtual machine, boot it up, and + + + UML# insmod hostfs + + + + + + + + + + + + + 10. The Management Console + + + + The UML management console is a low-level interface to the kernel, + somewhat like the i386 SysRq interface. Since there is a full-blown + operating system under UML, there is much greater flexibility possible + than with the SysRq mechanism.
+ + + There are a number of things you can do with the mconsole interface: + + o get the kernel version + + o add and remove devices + + o halt or reboot the machine + + o Send SysRq commands + + o Pause and resume the UML + + + You need the mconsole client (uml_mconsole) which is present in CVS + (/tools/mconsole) in 2.4.5-9um and later, and will be in the RPM in + 2.4.6. + + + You also need CONFIG_MCONSOLE (under 'General Setup') enabled in UML. + When you boot UML, you'll see a line like: + + + mconsole initialized on /home/jdike/.uml/umlNJ32yL/mconsole + + + + + If you specify a unique machine id on the UML command line, e.g. + + + umid=debian + + + + + you'll see this + + + mconsole initialized on /home/jdike/.uml/debian/mconsole + + + + + That file is the socket that uml_mconsole will use to communicate with + UML. Run it with either the umid or the full path as its argument: + + + host% uml_mconsole debian + + + + + or + + + host% uml_mconsole /home/jdike/.uml/debian/mconsole + + + + + You'll get a prompt, at which you can run one of these commands: + + o version + + o halt + + o reboot + + o config + + o remove + + o sysrq + + o help + + o cad + + o stop + + o go + + + 10.1. version + + This takes no arguments. It prints the UML version. + + + (mconsole) version + OK Linux usermode 2.4.5-9um #1 Wed Jun 20 22:47:08 EDT 2001 i686 + + + + + There are a couple of actual uses for this. It's a simple no-op which + can be used to check that a UML is running. It's also a way of + sending an interrupt to the UML. This is sometimes useful on SMP + hosts, where there's a bug which causes signals to UML to be lost, + often causing it to appear to hang. Sending such a UML the mconsole + version command is a good way to 'wake it up' before networking has + been enabled, as it does not do anything to the function of the UML. + + + + 10.2. halt and reboot + + These take no arguments.
They shut the machine down immediately, with + no syncing of disks and no clean shutdown of userspace. So, they are + pretty close to crashing the machine. + + + (mconsole) halt + OK + + + + + + + 10.3. config + + "config" adds a new device to the virtual machine. Currently the ubd + and network drivers support this. It takes one argument, which is the + device to add, with the same syntax as the kernel command line. + + + + + (mconsole) + config ubd3=/home/jdike/incoming/roots/root_fs_debian22 + + OK + (mconsole) config eth1=mcast + OK + + + + + + + 10.4. remove + + "remove" deletes a device from the system. Its argument is just the + name of the device to be removed. The device must be idle in whatever + sense the driver considers necessary. In the case of the ubd driver, + the removed block device must not be mounted, swapped on, or otherwise + open, and in the case of the network driver, the device must be down. + + + (mconsole) remove ubd3 + OK + (mconsole) remove eth1 + OK + + + + + + + 10.5. sysrq + + This takes one argument, which is a single letter. It calls the + generic kernel's SysRq driver, which does whatever is called for by + that argument. See the SysRq documentation in + Documentation/admin-guide/sysrq.rst in your favorite kernel tree to + see what letters are valid and what they do. + + + + 10.6. help + + "help" returns a string listing the valid commands and what each one + does. + + + + 10.7. cad + + This invokes the Ctl-Alt-Del action on init. What exactly this ends + up doing is up to /etc/inittab. Normally, it reboots the machine. + With UML, this is usually not desired, so if a halt would be better, + then find the section of inittab that looks like this + + + # What to do when CTRL-ALT-DEL is pressed. + ca:12345:ctrlaltdel:/sbin/shutdown -t1 -a -r now + + + + + and change the command to halt. + + + + 10.8. stop + + This puts the UML in a loop reading mconsole requests until a 'go' + mconsole command is received. 
This is very useful for making backups + of UML filesystems, as the UML can be stopped, then synced via 'sysrq + s', so that everything is written to the filesystem. You can then copy + the filesystem and then send the UML 'go' via mconsole. + + + Note that a UML running with more than one CPU will have problems + after you send the 'stop' command, as only one CPU will be held in a + mconsole loop and all others will continue as normal. This is a bug, + and will be fixed. + + + + 10.9. go + + This resumes a UML after being paused by a 'stop' command. Note that + when the UML has resumed, TCP connections may have timed out and if + the UML is paused for a long period of time, crond might go a little + crazy, running all the jobs it didn't do earlier. + + + + + + + + + 11. Kernel debugging + + + Note: The interface that makes debugging, as described here, possible + is present in 2.4.0-test6 kernels and later. + + + Since the user-mode kernel runs as a normal Linux process, it is + possible to debug it with gdb almost like any other process. It is + slightly different because the kernel's threads are already being + ptraced for system call interception, so gdb can't ptrace them. + However, a mechanism has been added to work around that problem. + + + In order to debug the kernel, you need to build it from source. See + ``Compiling the kernel and modules'' for information on doing that. + Make sure that you enable CONFIG_DEBUGSYM and CONFIG_PT_PROXY during + the config. These will compile the kernel with -g, and enable the + ptrace proxy so that gdb works with UML, respectively. + + + + + 11.1. Starting the kernel under gdb + + You can have the kernel running under the control of gdb from the + beginning by putting 'debug' on the command line. You will get an + xterm with gdb running inside it. The kernel will send some commands + to gdb which will leave it stopped at the beginning of start_kernel.
+ At this point, you can get things going with 'next', 'step', or + 'cont'. + + + There is a transcript of a debugging session here , with breakpoints being set in the scheduler and in an + interrupt handler. + 11.2. Examining sleeping processes + + Not every bug is evident in the currently running process. Sometimes, + processes hang in the kernel when they shouldn't because they've + deadlocked on a semaphore or something similar. In this case, when + you ^C gdb and get a backtrace, you will see the idle thread, which + isn't very relevant. + + + What you want is the stack of whatever process is sleeping when it + shouldn't be. You need to figure out which process that is, which is + generally fairly easy. Then you need to get its host process id, + which you can do either by looking at ps on the host or at + task.thread.extern_pid in gdb. + + + Now what you do is this: + + o detach from the current thread + + + (UML gdb) det + + + + + + o attach to the thread you are interested in + + + (UML gdb) att + + + + + + o look at its stack and anything else of interest + + + (UML gdb) bt + + + + + Note that you can't do anything at this point that requires that a + process execute, e.g. calling a function + + o when you're done looking at that process, reattach to the current + thread and continue it + + + (UML gdb) + att 1 + + + + + + + (UML gdb) + c + + + + + Here, specifying any pid which is not the process id of a UML thread + will cause gdb to reattach to the current thread. I commonly use 1, + but any other invalid pid would work. + + + + 11.3. Running ddd on UML + + ddd works on UML, but requires a special kludge. The process goes + like this: + + o Start ddd + + + host% ddd linux + + + + + + o With ps, get the pid of the gdb that ddd started. You can ask the + gdb to tell you, but for some reason that confuses things and + causes a hang. 
+ + o run UML with 'debug=parent gdb-pid=<pid>' added to the command line + - it will just sit there after you hit return + + o type 'att 1' to the ddd gdb and you will see something like + + + 0xa013dc51 in __kill () + + + (gdb) + + + + + + o At this point, type 'c', UML will boot up, and you can use ddd just + as you do on any other process. + + + + 11.4. Debugging modules + + gdb has support for debugging code which is dynamically loaded into + the process. This support is what is needed to debug kernel modules + under UML. + + + Using that support is somewhat complicated. You have to tell gdb what + object file you just loaded into UML and where in memory it is. Then, + it can read the symbol table, and figure out where all the symbols are + from the load address that you provided. It gets more interesting + when you load the module again (i.e. after an rmmod). You have to + tell gdb to forget about all its symbols, including the main UML ones + for some reason, then load them all back in again. + + + There's an easy way and a hard way to do this. The easy way is to use + the umlgdb expect script written by Chandan Kudige. It basically + automates the process for you. + + + First, you must tell it where your modules are. There is a list in + the script that looks like this: + set MODULE_PATHS { + "fat" "/usr/src/uml/linux-2.4.18/fs/fat/fat.o" + "isofs" "/usr/src/uml/linux-2.4.18/fs/isofs/isofs.o" + "minix" "/usr/src/uml/linux-2.4.18/fs/minix/minix.o" + } + + + + + You change that to list the names and paths of the modules that you + are going to debug. Then you run it from the toplevel directory of + your UML pool and it basically tells you what to do: + + + + + ******** GDB pid is 21903 ******** + Start UML as: ./linux debug gdb-pid=21903 + + + + GNU gdb 5.0rh-5 Red Hat Linux 7.1 + Copyright 2001 Free Software Foundation, Inc.
+ GDB is free software, covered by the GNU General Public License, and you are + welcome to change it and/or distribute copies of it under certain conditions. + Type "show copying" to see the conditions. + There is absolutely no warranty for GDB. Type "show warranty" for details. + This GDB was configured as "i386-redhat-linux"... + (gdb) b sys_init_module + Breakpoint 1 at 0xa0011923: file module.c, line 349. + (gdb) att 1 + + + + + After you run UML and it sits there doing nothing, you hit return at + the 'att 1' and continue it: + + + Attaching to program: /home/jdike/linux/2.4/um/./linux, process 1 + 0xa00f4221 in __kill () + (UML gdb) c + Continuing. + + + + + At this point, you debug normally. When you insmod something, the + expect magic will kick in and you'll see something like: + + + + + + + + + + + + + + + + + + *** Module hostfs loaded *** + Breakpoint 1, sys_init_module (name_user=0x805abb0 "hostfs", + mod_user=0x8070e00) at module.c:349 + 349 char *name, *n_name, *name_tmp = NULL; + (UML gdb) finish + Run till exit from #0 sys_init_module (name_user=0x805abb0 "hostfs", + mod_user=0x8070e00) at module.c:349 + 0xa00e2e23 in execute_syscall (r=0xa8140284) at syscall_kern.c:411 + 411 else res = EXECUTE_SYSCALL(syscall, regs); + Value returned is $1 = 0 + (UML gdb) + p/x (int)module_list + module_list->size_of_struct + + $2 = 0xa9021054 + (UML gdb) symbol-file ./linux + Load new symbol table from "./linux"? (y or n) y + Reading symbols from ./linux... + done. + (UML gdb) + add-symbol-file /home/jdike/linux/2.4/um/arch/um/fs/hostfs/hostfs.o 0xa9021054 + + add symbol table from file "/home/jdike/linux/2.4/um/arch/um/fs/hostfs/hostfs.o" at + .text_addr = 0xa9021054 + (y or n) y + + Reading symbols from /home/jdike/linux/2.4/um/arch/um/fs/hostfs/hostfs.o... + done. 
+ (UML gdb) p *module_list + $1 = {size_of_struct = 84, next = 0xa0178720, name = 0xa9022de0 "hostfs", + size = 9016, uc = {usecount = {counter = 0}, pad = 0}, flags = 1, + nsyms = 57, ndeps = 0, syms = 0xa9023170, deps = 0x0, refs = 0x0, + init = 0xa90221f0 , cleanup = 0xa902222c , + ex_table_start = 0x0, ex_table_end = 0x0, persist_start = 0x0, + persist_end = 0x0, can_unload = 0, runsize = 0, kallsyms_start = 0x0, + kallsyms_end = 0x0, + archdata_start = 0x1b855
, + archdata_end = 0xe5890000
, + kernel_data = 0xf689c35d
} + >> Finished loading symbols for hostfs ... + + + + + That's the easy way. It's highly recommended. The hard way is + described below in case you're interested in what's going on. + + + Boot the kernel under the debugger and load the module with insmod or + modprobe. With gdb, do: + + + (UML gdb) p module_list + + + + + This is a list of modules that have been loaded into the kernel, with + the most recently loaded module first. Normally, the module you want + is at module_list. If it's not, walk down the next links, looking at + the name fields until you find the module you want to debug. Take the + address of that structure, and add module.size_of_struct (which in + 2.4.10 kernels is 96 (0x60)) to it. Gdb can make this hard addition + for you :-): + + + + (UML gdb) + printf "%#x\n", (int)module_list + module_list->size_of_struct + + + + + The offset from the module start occasionally changes (before 2.4.0, + it was module.size_of_struct + 4), so it's a good idea to check the + init and cleanup addresses once in a while, as described below. Now + do: + + + (UML gdb) + add-symbol-file /path/to/module/on/host that_address + + + + + Tell gdb you really want to do it, and you're in business. + + + If there's any doubt that you got the offset right, like breakpoints + appear not to work, or they're appearing in the wrong place, you can + check it by looking at the module structure. The init and cleanup + fields should look like: + + + init = 0x588066b0 <init_hostfs>, cleanup = 0x588066c0 <exit_hostfs> + + + + + with no offsets on the symbol names. If the names are right, but they + are offset, then the offset tells you how much you need to add to the + address you gave to add-symbol-file. + + + When you want to load in a new version of the module, you need to get + gdb to forget about the old one.
The only way I've found to do that + is to tell gdb to forget about all symbols that it knows about: + + + (UML gdb) symbol-file + + + + + Then reload the symbols from the kernel binary: + + + (UML gdb) symbol-file /path/to/kernel + + + + + and repeat the process above. You'll also need to re-enable break- + points. They were disabled when you dumped all the symbols because + gdb couldn't figure out where they should go. + + + + 11.5. Attaching gdb to the kernel + + If you don't have the kernel running under gdb, you can attach gdb to + it later by sending the tracing thread a SIGUSR1. The first line of + the console output identifies its pid: + tracing thread pid = 20093 + + + + + When you send it the signal: + + + host% kill -USR1 20093 + + + + + you will get an xterm with gdb running in it. + + + If you have the mconsole compiled into UML, then the mconsole client + can be used to start gdb: + + + (mconsole) config gdb=xterm + + + + + will fire up an xterm with gdb running in it. + + + + 11.6. Using alternate debuggers + + UML has support for attaching to an already running debugger rather + than starting gdb itself. This is present in CVS as of 17 Apr 2001. + I sent it to Alan for inclusion in the ac tree, and it will be in my + 2.4.4 release. + + + This is useful when gdb is a subprocess of some UI, such as emacs or + ddd. It can also be used to run debuggers other than gdb on UML. + Below is an example of using strace as an alternate debugger. + + + To do this, you need to get the pid of the debugger and pass it in + with the 'gdb-pid=' switch, followed by the pid, along with the 'debug' switch. + + + If you are using gdb under some UI, then tell it to 'att 1', and + you'll find yourself attached to UML. + + + If you are using something other than gdb as your debugger, then + you'll need to get it to do the equivalent of 'att 1' if it doesn't do + it automatically. + + + An example of an alternate debugger is strace. 
You can strace the + actual kernel as follows: + + o Run the following in a shell + + + host% + sh -c 'echo pid=$$; echo -n hit return; read x; exec strace -p 1 -o strace.out' + + + + o Run UML with 'debug' and 'gdb-pid=' with the pid printed out + by the previous command + + o Hit return in the shell, and UML will start running, and strace + output will start accumulating in the output file. + + Note that this is different from running + + + host% strace ./linux + + + + + That will strace only the main UML thread, the tracing thread, which + doesn't do any of the actual kernel work. It just oversees the vir- + tual machine. In contrast, using strace as described above will show + you the low-level activity of the virtual machine. + + + + + + 12. Kernel debugging examples + + 12.1. The case of the hung fsck + + When booting up the kernel, fsck failed, and dropped me into a shell + to fix things up. I ran fsck -y, which hung: + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + Setting hostname uml [ OK ] + Checking root filesystem + /dev/fhd0 was not cleanly unmounted, check forced. + Error reading block 86894 (Attempt to read block from filesystem resulted in short read) while reading indirect blocks of inode 19780. + + /dev/fhd0: UNEXPECTED INCONSISTENCY; RUN fsck MANUALLY. + (i.e., without -a or -p options) + [ FAILED ] + + *** An error occurred during the file system check. + *** Dropping you to a shell; the system will reboot + *** when you leave the shell. + Give root password for maintenance + (or type Control-D for normal startup): + + [root@uml /root]# fsck -y /dev/fhd0 + fsck -y /dev/fhd0 + Parallelizing fsck version 1.14 (9-Jan-1999) + e2fsck 1.14, 9-Jan-1999 for EXT2 FS 0.5b, 95/08/09 + /dev/fhd0 contains a file system with errors, check forced. + Pass 1: Checking inodes, blocks, and sizes + Error reading block 86894 (Attempt to read block from filesystem resulted in short read) while reading indirect blocks of inode 19780. 
Ignore error? yes + + Inode 19780, i_blocks is 1548, should be 540. Fix? yes + + Pass 2: Checking directory structure + Error reading block 49405 (Attempt to read block from filesystem resulted in short read). Ignore error? yes + + Directory inode 11858, block 0, offset 0: directory corrupted + Salvage? yes + + Missing '.' in directory inode 11858. + Fix? yes + + Missing '..' in directory inode 11858. + Fix? yes + + + + + + The standard drill in this sort of situation is to fire up gdb on the + signal thread, which, in this case, was pid 1935. In another window, + I run gdb and attach pid 1935. + + + + + ~/linux/2.3.26/um 1016: gdb linux + GNU gdb 4.17.0.11 with Linux support + Copyright 1998 Free Software Foundation, Inc. + GDB is free software, covered by the GNU General Public License, and you are + welcome to change it and/or distribute copies of it under certain conditions. + Type "show copying" to see the conditions. + There is absolutely no warranty for GDB. Type "show warranty" for details. + This GDB was configured as "i386-redhat-linux"... + + (gdb) att 1935 + Attaching to program `/home/dike/linux/2.3.26/um/linux', Pid 1935 + 0x100756d9 in __wait4 () + + + + + + + Let's see what's currently running: + + + + (gdb) p current_task.pid + $1 = 0 + + + + + + It's the idle thread, which means that fsck went to sleep for some + reason and never woke up. 
+ + + Let's guess that the last process in the process list is fsck: + + + + (gdb) p current_task.prev_task.comm + $13 = "fsck.ext2\000\000\000\000\000\000" + + + + + + It is, so let's see what it thinks it's up to: + + + + (gdb) p current_task.prev_task.thread + $14 = {extern_pid = 1980, tracing = 0, want_tracing = 0, forking = 0, + kernel_stack_page = 0, signal_stack = 1342627840, syscall = {id = 4, args = { + 3, 134973440, 1024, 0, 1024}, have_result = 0, result = 50590720}, + request = {op = 2, u = {exec = {ip = 1350467584, sp = 2952789424}, fork = { + regs = {1350467584, 2952789424, 0 }, sigstack = 0, + pid = 0}, switch_to = 0x507e8000, thread = {proc = 0x507e8000, + arg = 0xaffffdb0, flags = 0, new_pid = 0}, input_request = { + op = 1350467584, fd = -1342177872, proc = 0, pid = 0}}}} + + + + + + The interesting things here are the fact that its .thread.syscall.id + is __NR_write (see the big switch in arch/um/kernel/syscall_kern.c or + the defines in include/asm-um/arch/unistd.h), and that it never + returned. Also, its .request.op is OP_SWITCH (see + arch/um/include/user_util.h). These mean that it went into a write, + and, for some reason, called schedule(). + + + The fact that it never returned from write means that its stack should + be fairly interesting. Its pid is 1980 (.thread.extern_pid). That + process is being ptraced by the signal thread, so it must be detached + before gdb can attach it: + + + + + + + + + + + (gdb) call detach(1980) + + Program received signal SIGSEGV, Segmentation fault. + + The program being debugged stopped while in a function called from GDB. + When the function (detach) is done executing, GDB will silently + stop (instead of continuing to evaluate the expression containing + the function call). + (gdb) call detach(1980) + $15 = 0 + + + + + + The first detach segfaults for some reason, and the second one + succeeds. 
+ + + Now I detach from the signal thread, attach to the fsck thread, and + look at its stack: + + + (gdb) det + Detaching from program: /home/dike/linux/2.3.26/um/linux Pid 1935 + (gdb) att 1980 + Attaching to program `/home/dike/linux/2.3.26/um/linux', Pid 1980 + 0x10070451 in __kill () + (gdb) bt + #0 0x10070451 in __kill () + #1 0x10068ccd in usr1_pid (pid=1980) at process.c:30 + #2 0x1006a03f in _switch_to (prev=0x50072000, next=0x507e8000) + at process_kern.c:156 + #3 0x1006a052 in switch_to (prev=0x50072000, next=0x507e8000, last=0x50072000) + at process_kern.c:161 + #4 0x10001d12 in schedule () at core.c:777 + #5 0x1006a744 in __down (sem=0x507d241c) at semaphore.c:71 + #6 0x1006aa10 in __down_failed () at semaphore.c:157 + #7 0x1006c5d8 in segv_handler (sc=0x5006e940) at trap_user.c:174 + #8 0x1006c5ec in kern_segv_handler (sig=11) at trap_user.c:182 + #9 + #10 0x10155404 in errno () + #11 0x1006c0aa in segv (address=1342179328, is_write=2) at trap_kern.c:50 + #12 0x1006c5d8 in segv_handler (sc=0x5006eaf8) at trap_user.c:174 + #13 0x1006c5ec in kern_segv_handler (sig=11) at trap_user.c:182 + #14 + #15 0xc0fd in ?? () + #16 0x10016647 in sys_write (fd=3, + buf=0x80b8800
, count=1024) + at read_write.c:159 + #17 0x1006d5b3 in execute_syscall (syscall=4, args=0x5006ef08) + at syscall_kern.c:254 + #18 0x1006af87 in really_do_syscall (sig=12) at syscall_user.c:35 + #19 + #20 0x400dc8b0 in ?? () + + + + + + The interesting things here are: + + o There are two segfaults on this stack (frames 9 and 14) + + o The first faulting address (frame 11) is 0x50000800 + + (gdb) p (void *)1342179328 + $16 = (void *) 0x50000800 + + + + + + The initial faulting address is interesting because it is on the idle + thread's stack. I had been seeing the idle thread segfault for no + apparent reason, and the cause looked like stack corruption. In hopes + of catching the culprit in the act, I had turned off all protections + to that stack while the idle thread wasn't running. This apparently + tripped that trap. + + + However, the more immediate problem is that second segfault and I'm + going to concentrate on that. First, I want to see where the fault + happened, so I have to go look at the sigcontext struct in frame 8: + + + + (gdb) up + #1 0x10068ccd in usr1_pid (pid=1980) at process.c:30 + 30 kill(pid, SIGUSR1); + (gdb) + #2 0x1006a03f in _switch_to (prev=0x50072000, next=0x507e8000) + at process_kern.c:156 + 156 usr1_pid(getpid()); + (gdb) + #3 0x1006a052 in switch_to (prev=0x50072000, next=0x507e8000, last=0x50072000) + at process_kern.c:161 + 161 _switch_to(prev, next); + (gdb) + #4 0x10001d12 in schedule () at core.c:777 + 777 switch_to(prev, next, prev); + (gdb) + #5 0x1006a744 in __down (sem=0x507d241c) at semaphore.c:71 + 71 schedule(); + (gdb) + #6 0x1006aa10 in __down_failed () at semaphore.c:157 + 157 } + (gdb) + #7 0x1006c5d8 in segv_handler (sc=0x5006e940) at trap_user.c:174 + 174 segv(sc->cr2, sc->err & 2); + (gdb) + #8 0x1006c5ec in kern_segv_handler (sig=11) at trap_user.c:182 + 182 segv_handler(sc); + (gdb) p *sc + Cannot access memory at address 0x0. 
+ + + + + That's not very useful, so I'll try a more manual method: + + + (gdb) p *((struct sigcontext *) (&sig + 1)) + $19 = {gs = 0, __gsh = 0, fs = 0, __fsh = 0, es = 43, __esh = 0, ds = 43, + __dsh = 0, edi = 1342179328, esi = 1350378548, ebp = 1342630440, + esp = 1342630420, ebx = 1348150624, edx = 1280, ecx = 0, eax = 0, + trapno = 14, err = 4, eip = 268480945, cs = 35, __csh = 0, eflags = 66118, + esp_at_signal = 1342630420, ss = 43, __ssh = 0, fpstate = 0x0, oldmask = 0, + cr2 = 1280} + + + + The ip is in handle_mm_fault: + + + (gdb) p (void *)268480945 + $20 = (void *) 0x1000b1b1 + (gdb) i sym $20 + handle_mm_fault + 57 in section .text + + + + + + Specifically, it's in pte_alloc: + + + (gdb) i line *$20 + Line 124 of "/home/dike/linux/2.3.26/um/include/asm/pgalloc.h" + starts at address 0x1000b1b1 + and ends at 0x1000b1b7 . + + + + + + To find where in handle_mm_fault this is, I'll jump forward in the + code until I see an address in that procedure: + + + + (gdb) i line *0x1000b1c0 + Line 126 of "/home/dike/linux/2.3.26/um/include/asm/pgalloc.h" + starts at address 0x1000b1b7 + and ends at 0x1000b1c3 . + (gdb) i line *0x1000b1d0 + Line 131 of "/home/dike/linux/2.3.26/um/include/asm/pgalloc.h" + starts at address 0x1000b1d0 + and ends at 0x1000b1da . + (gdb) i line *0x1000b1e0 + Line 61 of "/home/dike/linux/2.3.26/um/include/asm/pgalloc.h" + starts at address 0x1000b1da + and ends at 0x1000b1e1 . + (gdb) i line *0x1000b1f0 + Line 134 of "/home/dike/linux/2.3.26/um/include/asm/pgalloc.h" + starts at address 0x1000b1f0 + and ends at 0x1000b200 . + (gdb) i line *0x1000b200 + Line 135 of "/home/dike/linux/2.3.26/um/include/asm/pgalloc.h" + starts at address 0x1000b200 + and ends at 0x1000b208 . + (gdb) i line *0x1000b210 + Line 139 of "/home/dike/linux/2.3.26/um/include/asm/pgalloc.h" + starts at address 0x1000b210 + and ends at 0x1000b219 . + (gdb) i line *0x1000b220 + Line 1168 of "memory.c" starts at address 0x1000b21e + and ends at 0x1000b222 . 
+ + + + + + Something is apparently wrong with the page tables or vma_structs, so + let's go back to frame 11 and have a look at them: + + + + #11 0x1006c0aa in segv (address=1342179328, is_write=2) at trap_kern.c:50 + 50 handle_mm_fault(current, vma, address, is_write); + (gdb) call pgd_offset_proc(vma->vm_mm, address) + $22 = (pgd_t *) 0x80a548c + + + + + + That's pretty bogus. Page tables aren't supposed to be in process + text or data areas. Let's see what's in the vma: + + + (gdb) p *vma + $23 = {vm_mm = 0x507d2434, vm_start = 0, vm_end = 134512640, + vm_next = 0x80a4f8c, vm_page_prot = {pgprot = 0}, vm_flags = 31200, + vm_avl_height = 2058, vm_avl_left = 0x80a8c94, vm_avl_right = 0x80d1000, + vm_next_share = 0xaffffdb0, vm_pprev_share = 0xaffffe63, + vm_ops = 0xaffffe7a, vm_pgoff = 2952789626, vm_file = 0xafffffec, + vm_private_data = 0x62} + (gdb) p *vma.vm_mm + $24 = {mmap = 0x507d2434, mmap_avl = 0x0, mmap_cache = 0x8048000, + pgd = 0x80a4f8c, mm_users = {counter = 0}, mm_count = {counter = 134904288}, + map_count = 134909076, mmap_sem = {count = {counter = 135073792}, + sleepers = -1342177872, wait = {lock = , + task_list = {next = 0xaffffe63, prev = 0xaffffe7a}, + __magic = -1342177670, __creator = -1342177300}, __magic = 98}, + page_table_lock = {}, context = 138, start_code = 0, end_code = 0, + start_data = 0, end_data = 0, start_brk = 0, brk = 0, start_stack = 0, + arg_start = 0, arg_end = 0, env_start = 0, env_end = 0, rss = 1350381536, + total_vm = 0, locked_vm = 0, def_flags = 0, cpu_vm_mask = 0, swap_cnt = 0, + swap_address = 0, segments = 0x0} + + + + + + This is also pretty bogus. With all of the 0x80xxxxx and 0xaffffxxx + addresses, this is looking like a stack was plonked down on top of + these structures. 
Maybe it's a stack overflow from the next page: + + + + (gdb) p vma + $25 = (struct vm_area_struct *) 0x507d2434 + + + + + + That's towards the lower quarter of the page, so that would have to + have been pretty heavy stack overflow: + + + + + + + + + + + + + + + (gdb) x/100x $25 + 0x507d2434: 0x507d2434 0x00000000 0x08048000 0x080a4f8c + 0x507d2444: 0x00000000 0x080a79e0 0x080a8c94 0x080d1000 + 0x507d2454: 0xaffffdb0 0xaffffe63 0xaffffe7a 0xaffffe7a + 0x507d2464: 0xafffffec 0x00000062 0x0000008a 0x00000000 + 0x507d2474: 0x00000000 0x00000000 0x00000000 0x00000000 + 0x507d2484: 0x00000000 0x00000000 0x00000000 0x00000000 + 0x507d2494: 0x00000000 0x00000000 0x507d2fe0 0x00000000 + 0x507d24a4: 0x00000000 0x00000000 0x00000000 0x00000000 + 0x507d24b4: 0x00000000 0x00000000 0x00000000 0x00000000 + 0x507d24c4: 0x00000000 0x00000000 0x00000000 0x00000000 + 0x507d24d4: 0x00000000 0x00000000 0x00000000 0x00000000 + 0x507d24e4: 0x00000000 0x00000000 0x00000000 0x00000000 + 0x507d24f4: 0x00000000 0x00000000 0x00000000 0x00000000 + 0x507d2504: 0x00000000 0x00000000 0x00000000 0x00000000 + 0x507d2514: 0x00000000 0x00000000 0x00000000 0x00000000 + 0x507d2524: 0x00000000 0x00000000 0x00000000 0x00000000 + 0x507d2534: 0x00000000 0x00000000 0x507d25dc 0x00000000 + 0x507d2544: 0x00000000 0x00000000 0x00000000 0x00000000 + 0x507d2554: 0x00000000 0x00000000 0x00000000 0x00000000 + 0x507d2564: 0x00000000 0x00000000 0x00000000 0x00000000 + 0x507d2574: 0x00000000 0x00000000 0x00000000 0x00000000 + 0x507d2584: 0x00000000 0x00000000 0x00000000 0x00000000 + 0x507d2594: 0x00000000 0x00000000 0x00000000 0x00000000 + 0x507d25a4: 0x00000000 0x00000000 0x00000000 0x00000000 + 0x507d25b4: 0x00000000 0x00000000 0x00000000 0x00000000 + + + + + + It's not stack overflow. The only "stack-like" piece of this data is + the vma_struct itself. + + + At this point, I don't see any avenues to pursue, so I just have to + admit that I have no idea what's going on. 
What I will do, though, is + stick a trap on the segfault handler which will stop if it sees any + writes to the idle thread's stack. That was the thing that happened + first, and it may be that if I can catch it immediately, what's going + on will be somewhat clearer. + + + 12.2. Episode 2: The case of the hung fsck + + After setting a trap in the SEGV handler for accesses to the signal + thread's stack, I reran the kernel. + + + fsck hung again, this time by hitting the trap: + + + + + + + + + + + + + + + + + Setting hostname uml [ OK ] + Checking root filesystem + /dev/fhd0 contains a file system with errors, check forced. + Error reading block 86894 (Attempt to read block from filesystem resulted in short read) while reading indirect blocks of inode 19780. + + /dev/fhd0: UNEXPECTED INCONSISTENCY; RUN fsck MANUALLY. + (i.e., without -a or -p options) + [ FAILED ] + + *** An error occurred during the file system check. + *** Dropping you to a shell; the system will reboot + *** when you leave the shell. + Give root password for maintenance + (or type Control-D for normal startup): + + [root@uml /root]# fsck -y /dev/fhd0 + fsck -y /dev/fhd0 + Parallelizing fsck version 1.14 (9-Jan-1999) + e2fsck 1.14, 9-Jan-1999 for EXT2 FS 0.5b, 95/08/09 + /dev/fhd0 contains a file system with errors, check forced. + Pass 1: Checking inodes, blocks, and sizes + Error reading block 86894 (Attempt to read block from filesystem resulted in short read) while reading indirect blocks of inode 19780. Ignore error? yes + + Pass 2: Checking directory structure + Error reading block 49405 (Attempt to read block from filesystem resulted in short read). Ignore error? yes + + Directory inode 11858, block 0, offset 0: directory corrupted + Salvage? yes + + Missing '.' in directory inode 11858. + Fix? yes + + Missing '..' in directory inode 11858. + Fix? 
yes + + Untested (4127) [100fe44c]: trap_kern.c line 31 + + + + + + I need to get the signal thread to detach from pid 4127 so that I can + attach to it with gdb. This is done by sending it a SIGUSR1, which is + caught by the signal thread, which detaches the process: + + + kill -USR1 4127 + + + + + + Now I can run gdb on it: + + + + + + + + + + + + + + ~/linux/2.3.26/um 1034: gdb linux + GNU gdb 4.17.0.11 with Linux support + Copyright 1998 Free Software Foundation, Inc. + GDB is free software, covered by the GNU General Public License, and you are + welcome to change it and/or distribute copies of it under certain conditions. + Type "show copying" to see the conditions. + There is absolutely no warranty for GDB. Type "show warranty" for details. + This GDB was configured as "i386-redhat-linux"... + (gdb) att 4127 + Attaching to program `/home/dike/linux/2.3.26/um/linux', Pid 4127 + 0x10075891 in __libc_nanosleep () + + + + + + The backtrace shows that it was in a write and that the fault address + (address in frame 3) is 0x50000800, which is right in the middle of + the signal thread's stack page: + + + (gdb) bt + #0 0x10075891 in __libc_nanosleep () + #1 0x1007584d in __sleep (seconds=1000000) + at ../sysdeps/unix/sysv/linux/sleep.c:78 + #2 0x1006ce9a in stop () at user_util.c:191 + #3 0x1006bf88 in segv (address=1342179328, is_write=2) at trap_kern.c:31 + #4 0x1006c628 in segv_handler (sc=0x5006eaf8) at trap_user.c:174 + #5 0x1006c63c in kern_segv_handler (sig=11) at trap_user.c:182 + #6 + #7 0xc0fd in ?? () + #8 0x10016647 in sys_write (fd=3, buf=0x80b8800 "R.", count=1024) + at read_write.c:159 + #9 0x1006d603 in execute_syscall (syscall=4, args=0x5006ef08) + at syscall_kern.c:254 + #10 0x1006af87 in really_do_syscall (sig=12) at syscall_user.c:35 + #11 + #12 0x400dc8b0 in ?? () + #13 + #14 0x400dc8b0 in ?? () + #15 0x80545fd in ?? () + #16 0x804daae in ?? () + #17 0x8054334 in ?? () + #18 0x804d23e in ?? () + #19 0x8049632 in ?? () + #20 0x80491d2 in ?? 
() + #21 0x80596b5 in ?? () + (gdb) p (void *)1342179328 + $3 = (void *) 0x50000800 + + + + + + Going up the stack to the segv_handler frame and looking at where in + the code the access happened shows that it happened near line 110 of + block_dev.c: + + + + + + + + + + (gdb) up + #1 0x1007584d in __sleep (seconds=1000000) + at ../sysdeps/unix/sysv/linux/sleep.c:78 + ../sysdeps/unix/sysv/linux/sleep.c:78: No such file or directory. + (gdb) + #2 0x1006ce9a in stop () at user_util.c:191 + 191 while(1) sleep(1000000); + (gdb) + #3 0x1006bf88 in segv (address=1342179328, is_write=2) at trap_kern.c:31 + 31 KERN_UNTESTED(); + (gdb) + #4 0x1006c628 in segv_handler (sc=0x5006eaf8) at trap_user.c:174 + 174 segv(sc->cr2, sc->err & 2); + (gdb) p *sc + $1 = {gs = 0, __gsh = 0, fs = 0, __fsh = 0, es = 43, __esh = 0, ds = 43, + __dsh = 0, edi = 1342179328, esi = 134973440, ebp = 1342631484, + esp = 1342630864, ebx = 256, edx = 0, ecx = 256, eax = 1024, trapno = 14, + err = 6, eip = 268550834, cs = 35, __csh = 0, eflags = 66070, + esp_at_signal = 1342630864, ss = 43, __ssh = 0, fpstate = 0x0, oldmask = 0, + cr2 = 1342179328} + (gdb) p (void *)268550834 + $2 = (void *) 0x1001c2b2 + (gdb) i sym $2 + block_write + 1090 in section .text + (gdb) i line *$2 + Line 209 of "/home/dike/linux/2.3.26/um/include/asm/arch/string.h" + starts at address 0x1001c2a1 + and ends at 0x1001c2bf . + (gdb) i line *0x1001c2c0 + Line 110 of "block_dev.c" starts at address 0x1001c2bf + and ends at 0x1001c2e3 . + + + + + + Looking at the source shows that the fault happened during a call to + copy_from_user to copy the data into the kernel: + + + 107 count -= chars; + 108 copy_from_user(p,buf,chars); + 109 p += chars; + 110 buf += chars; + + + + + + p is the pointer which must contain 0x50000800, since buf contains + 0x80b8800 (frame 8 above). 
It is defined as: + + + p = offset + bh->b_data; + + + + + + I need to figure out what bh is, and it just so happens that bh is + passed as an argument to mark_buffer_uptodate and mark_buffer_dirty a + few lines later, so I do a little disassembly: + + + + + (gdb) disas 0x1001c2bf 0x1001c2e0 + Dump of assembler code from 0x1001c2bf to 0x1001c2d0: + 0x1001c2bf : addl %eax,0xc(%ebp) + 0x1001c2c2 : movl 0xfffffdd4(%ebp),%edx + 0x1001c2c8 : btsl $0x0,0x18(%edx) + 0x1001c2cd : btsl $0x1,0x18(%edx) + 0x1001c2d2 : sbbl %ecx,%ecx + 0x1001c2d4 : testl %ecx,%ecx + 0x1001c2d6 : jne 0x1001c2e3 + 0x1001c2d8 : pushl $0x0 + 0x1001c2da : pushl %edx + 0x1001c2db : call 0x1001819c <__mark_buffer_dirty> + End of assembler dump. + + + + + + At that point, bh is in %edx (address 0x1001c2da), which is calculated + at 0x1001c2c2 as %ebp + 0xfffffdd4, so I figure exactly what that is, + taking %ebp from the sigcontext_struct above: + + + (gdb) p (void *)1342631484 + $5 = (void *) 0x5006ee3c + (gdb) p 0x5006ee3c+0xfffffdd4 + $6 = 1342630928 + (gdb) p (void *)$6 + $7 = (void *) 0x5006ec10 + (gdb) p *((void **)$7) + $8 = (void *) 0x50100200 + + + + + + Now, I look at the structure to see what's in it, and particularly, + what its b_data field contains: + + + (gdb) p *((struct buffer_head *)0x50100200) + $13 = {b_next = 0x50289380, b_blocknr = 49405, b_size = 1024, b_list = 0, + b_dev = 15872, b_count = {counter = 1}, b_rdev = 15872, b_state = 24, + b_flushtime = 0, b_next_free = 0x501001a0, b_prev_free = 0x50100260, + b_this_page = 0x501001a0, b_reqnext = 0x0, b_pprev = 0x507fcf58, + b_data = 0x50000800 "", b_page = 0x50004000, + b_end_io = 0x10017f60 , b_dev_id = 0x0, + b_rsector = 98810, b_wait = {lock = , + task_list = {next = 0x50100248, prev = 0x50100248}, __magic = 1343226448, + __creator = 0}, b_kiobuf = 0x0} + + + + + + The b_data field is indeed 0x50000800, so the question becomes how + that happened. 
The rest of the structure looks fine, so this probably + is not a case of data corruption. It happened on purpose somehow. + + + The b_page field is a pointer to the page_struct representing the + 0x50000000 page. Looking at it shows the kernel's idea of the state + of that page: + + + + (gdb) p *$13.b_page + $17 = {list = {next = 0x50004a5c, prev = 0x100c5174}, mapping = 0x0, + index = 0, next_hash = 0x0, count = {counter = 1}, flags = 132, lru = { + next = 0x50008460, prev = 0x50019350}, wait = { + lock = , task_list = {next = 0x50004024, + prev = 0x50004024}, __magic = 1342193708, __creator = 0}, + pprev_hash = 0x0, buffers = 0x501002c0, virtual = 1342177280, + zone = 0x100c5160} + + + + + + Some sanity-checking: the virtual field shows the "virtual" address of + this page, which in this kernel is the same as its "physical" address, + and the page_struct itself should be mem_map[0], since it represents + the first page of memory: + + + + (gdb) p (void *)1342177280 + $18 = (void *) 0x50000000 + (gdb) p mem_map + $19 = (mem_map_t *) 0x50004000 + + + + + + These check out fine. + + + Now to check out the page_struct itself. In particular, the flags + field shows whether the page is considered free or not: + + + (gdb) p (void *)132 + $21 = (void *) 0x84 + + + + + + The "reserved" bit is the high bit, which is definitely not set, so + the kernel considers the signal stack page to be free and available to + be used. + + + At this point, I jump to conclusions and start looking at my early + boot code, because that's where that page is supposed to be reserved. + + + In my setup_arch procedure, I have the following code which looks just + fine: + + + + bootmap_size = init_bootmem(start_pfn, end_pfn - start_pfn); + free_bootmem(__pa(low_physmem) + bootmap_size, high_physmem - low_physmem); + + + + + + Two stack pages have already been allocated, and low_physmem points to + the third page, which is the beginning of free memory. 
+ The init_bootmem call declares the entire memory to the boot memory + manager, which marks it all reserved. The free_bootmem call frees up + all of it, except for the first two pages. This looks correct to me. + + + So, I decide to see init_bootmem run and make sure that it is marking + those first two pages as reserved. I never get that far. + + + Stepping into init_bootmem, and looking at bootmem_map before looking + at what it contains shows the following: + + + + (gdb) p bootmem_map + $3 = (void *) 0x50000000 + + + + + + Aha! The light dawns. That first page is doing double duty as a + stack and as the boot memory map. The last thing that the boot memory + manager does is to free the pages used by its memory map, so this page + is getting freed even though it's marked as reserved. + + + The fix was to initialize the boot memory manager before allocating + those two stack pages, and then allocate them through the boot memory + manager. After doing this, and fixing a couple of subsequent buglets, + the stack corruption problem disappeared. + + + + + + 13. What to do when UML doesn't work + + + + + 13.1. Strange compilation errors when you build from source + + As of test11, it is necessary to have "ARCH=um" in the environment or + on the make command line for all steps in building UML, including + clean, distclean, or mrproper, config, menuconfig, or xconfig, dep, + and linux. If you forget for any of them, the i386 build seems to + contaminate the UML build. If this happens, start from scratch with + + + host% + make mrproper ARCH=um + + + + + and repeat the build process with ARCH=um on all the steps. + + + See ``Compiling the kernel and modules'' for more details. + + + Another cause of strange compilation errors is building UML in + /usr/src/linux. If you do this, the first thing you need to do is + clean up the mess you made. The /usr/src/linux/asm link will now + point to /usr/src/linux/asm-um. Make it point back to + /usr/src/linux/asm-i386. 
Then, move your UML pool someplace else and + build it there. Also see below, where a more specific set of symptoms + is described. + + + + 13.3. A variety of panics and hangs with /tmp on a reiserfs filesys- + tem + + I saw this on reiserfs 3.5.21 and it seems to be fixed in 3.5.27. + Panics preceded by + + + Detaching pid nnnn + + + + are diagnostic of this problem. This is a reiserfs bug which causes a + thread to occasionally read stale data from a mmapped page shared with + another thread. The fix is to upgrade the filesystem or to have /tmp + be an ext2 filesystem. + + + + 13.4. The compile fails with errors about conflicting types for + 'open', 'dup', and 'waitpid' + + This happens when you build in /usr/src/linux. The UML build makes + the include/asm link point to include/asm-um. /usr/include/asm points + to /usr/src/linux/include/asm, so when that link gets moved, files + which need to include the asm-i386 versions of headers get the + incompatible asm-um versions. The fix is to move the include/asm link + back to include/asm-i386 and to do UML builds someplace else. + + + + 13.5. UML doesn't work when /tmp is an NFS filesystem + + This seems to be a similar situation to the ReiserFS problem above. + Some versions of NFS seem not to handle mmap correctly, which UML + depends on. The workaround is to have /tmp be a non-NFS directory. + + + 13.6. UML hangs on boot when compiled with gprof support + + If you build UML with gprof support and, early in the boot, it does + this + + + kernel BUG at page_alloc.c:100! + + + + + you have a buggy gcc. You can work around the problem by removing + UM_FASTCALL from CFLAGS in arch/um/Makefile-i386. This will open up + another bug, but that one is fairly hard to reproduce. + + + + 13.7. 
syslogd dies with a SIGTERM on startup + + The exact boot error depends on the distribution that you're booting, + but Debian produces this: + + + /etc/rc2.d/S10sysklogd: line 49: 93 Terminated + start-stop-daemon --start --quiet --exec /sbin/syslogd -- $SYSLOGD + + + + + This is a syslogd bug. There's a race between a parent process + installing a signal handler and its child sending the signal. See + this uml-devel post for the details. + + + + 13.8. TUN/TAP networking doesn't work on a 2.4 host + + There are a couple of problems which were pointed out by Tim Robinson: + + o It doesn't work on hosts running 2.4.7 (or thereabouts) or earlier. + The fix is to upgrade to something more recent and then read the + next item. + + o If you see + + + File descriptor in bad state + + + + when you bring up the device inside UML, you have a header mismatch + between the original kernel and the upgraded one. Make /usr/src/linux + point at the new headers. This will only be a problem if you build + uml_net yourself. + + + + 13.9. You can network to the host but not to other machines on the + net + + If you can connect to the host, and the host can connect to UML, but + you cannot connect to any other machines, then you may need to enable + IP Masquerading on the host. Usually this is only experienced when + using private IP addresses (192.168.x.x or 10.x.x.x) for host/UML + networking, rather than the public address space that your host is + connected to. UML does not enable IP Masquerading, so you will need + to create a static rule to enable it: + + + host% + iptables -t nat -A POSTROUTING -o eth0 -j MASQUERADE + + + + + Replace eth0 with the interface that you use to talk to the rest of + the world. + + + Documentation on IP Masquerading, and SNAT, can be found at + www.netfilter.org . + + + If you can reach the local net, but not the outside Internet, then + that is usually a routing problem. 
The UML needs a default route: + + + UML# + route add default gw gateway IP + + + + + The gateway IP can be any machine on the local net that knows how to + reach the outside world. Usually, this is the host or the local net- + work's gateway. + + + Occasionally, we hear from someone who can reach some machines, but + not others on the same net, or who can reach some ports on other + machines, but not others. These are usually caused by strange + firewalling somewhere between the UML and the other box. You track + this down by running tcpdump on every interface the packets travel + over and see where they disappear. When you find a machine that takes + the packets in, but does not send them onward, that's the culprit. + + + + 13.10. I have no root and I want to scream + + Thanks to Birgit Wahlich for telling me about this strange one. It + turns out that there's a limit of six environment variables on the + kernel command line. When that limit is reached or exceeded, argument + processing stops, which means that the 'root=' argument that UML + usually adds is not seen. So, the filesystem has no idea what the + root device is, so it panics. + + + The fix is to put less stuff on the command line. Glomming all your + setup variables into one is probably the best way to go. + + + + 13.11. UML build conflict between ptrace.h and ucontext.h + + On some older systems, /usr/include/asm/ptrace.h and + /usr/include/sys/ucontext.h define the same names. So, when they're + included together, the defines from one completely mess up the parsing + of the other, producing errors like: + /usr/include/sys/ucontext.h:47: parse error before + `10' + + + + + plus a pile of warnings. + + + This is a libc botch, which has since been fixed, and I don't see any + way around it besides upgrading. + + + + 13.12. 
The UML BogoMips is exactly half the host's BogoMips + + On i386 kernels, there are two ways of running the loop that is used + to calculate the BogoMips rating, using the TSC if it's there or using + a one-instruction loop. The TSC produces twice the BogoMips as the + loop. UML uses the loop, since it has nothing resembling a TSC, and + will get almost exactly the same BogoMips as a host using the loop. + However, on a host with a TSC, its BogoMips will be double the loop + BogoMips, and therefore double the UML BogoMips. + + + + 13.13. When you run UML, it immediately segfaults + + If the host is configured with the 2G/2G address space split, that's + why. See ``UML on 2G/2G hosts'' for the details on getting UML to + run on your host. + + + + 13.14. xterms appear, then immediately disappear + + If you're running an up to date kernel with an old release of + uml_utilities, the port-helper program will not work properly, so + xterms will exit straight after they appear. The solution is to + upgrade to the latest release of uml_utilities. Usually this problem + occurs when you have installed a packaged release of UML then compiled + your own development kernel without upgrading the uml_utilities from + the source distribution. + + + + 13.15. Any other panic, hang, or strange behavior + + If you're seeing truly strange behavior, such as hangs or panics that + happen in random places, or you try running the debugger to see what's + happening and it acts strangely, then it could be a problem in the + host kernel. If you're not running a stock Linus or -ac kernel, then + try that. An early version of the preemption patch and a 2.4.10 SuSE + kernel have caused very strange problems in UML. + + + Otherwise, let me know about it. 
Send a message to one of the UML + mailing lists - either the developer list - user-mode-linux-devel at + lists dot sourceforge dot net (subscription info) or the user list - + user-mode-linux-user at lists dot sourceforge dot net (subscription + info), whichever you prefer. Don't assume that everyone knows about + it and that a fix is imminent. + + + If you want to be super-helpful, read ``Diagnosing Problems'' and + follow the instructions contained therein. + 14. Diagnosing Problems + + + If you get UML to crash, hang, or otherwise misbehave, you should + report this on one of the project mailing lists, either the developer + list - user-mode-linux-devel at lists dot sourceforge dot net + (subscription info) or the user list - user-mode-linux-user at lists + dot sourceforge dot net (subscription info). When you do, it is + likely that I will want more information. So, it would be helpful to + read the stuff below, do whatever is applicable in your case, and + report the results to the list. + + + For any diagnosis, you're going to need to build a debugging kernel. + The binaries from this site aren't debuggable. If you haven't done + this before, read about ``Compiling the kernel and modules'' and + ``Kernel debugging'' UML first. + + + 14.1. Case 1 : Normal kernel panics + + The most common case is for a normal thread to panic. To debug this, + you will need to run it under the debugger (add 'debug' to the command + line). An xterm will start up with gdb running inside it. Continue + it when it stops in start_kernel and make it crash. Now ^C gdb and + + + If the panic was a "Kernel mode fault", then there will be a segv + frame on the stack and I'm going to want some more information. 
The + stack might look something like this: + + + (UML gdb) backtrace + #0 0x1009bf76 in __sigprocmask (how=1, set=0x5f347940, oset=0x0) + at ../sysdeps/unix/sysv/linux/sigprocmask.c:49 + #1 0x10091411 in change_sig (signal=10, on=1) at process.c:218 + #2 0x10094785 in timer_handler (sig=26) at time_kern.c:32 + #3 0x1009bf38 in __restore () + at ../sysdeps/unix/sysv/linux/i386/sigaction.c:125 + #4 0x1009534c in segv (address=8, ip=268849158, is_write=2, is_user=0) + at trap_kern.c:66 + #5 0x10095c04 in segv_handler (sig=11) at trap_user.c:285 + #6 0x1009bf38 in __restore () + + + + + I'm going to want to see the symbol and line information for the value + of ip in the segv frame. In this case, you would do the following: + + + (UML gdb) i sym 268849158 + + + + + and + + + (UML gdb) i line *268849158 + + + + + The reason for this is the __restore frame right above the segv_han- + dler frame is hiding the frame that actually segfaulted. So, I have + to get that information from the faulting ip. + + + 14.2. Case 2 : Tracing thread panics + + The less common and more painful case is when the tracing thread + panics. In this case, the kernel debugger will be useless because it + needs a healthy tracing thread in order to work. The first thing to + do is get a backtrace from the tracing thread. This is done by + figuring out what its pid is, firing up gdb, and attaching it to that + pid. You can figure out the tracing thread pid by looking at the + first line of the console output, which will look like this: + + + tracing thread pid = 15851 + + + + + or by running ps on the host and finding the line that looks like + this: + + + jdike 15851 4.5 0.4 132568 1104 pts/0 S 21:34 0:05 ./linux [(tracing thread)] + + + + + If the panic was 'segfault in signals', then follow the instructions + above for collecting information about the location of the seg fault. 
+ + + If the tracing thread flaked out all by itself, then send that + backtrace in and wait for our crack debugging team to fix the problem. + + + 14.3. Case 3 : Tracing thread panics caused by other threads + + However, there are cases where the misbehavior of another thread + caused the problem. The most common panic of this type is: + + + wait_for_stop failed to wait for to stop with + + + + + In this case, you'll need to get a backtrace from the process men- + tioned in the panic, which is complicated by the fact that the kernel + debugger is defunct and without some fancy footwork, another gdb can't + attach to it. So, this is how the fancy footwork goes: + + In a shell: + + + host% kill -STOP pid + + + + + Run gdb on the tracing thread as described in case 2 and do: + + + (host gdb) call detach(pid) + + + If you get a segfault, do it again. It always works the second time. + + Detach from the tracing thread and attach to that other thread: + + + (host gdb) detach + + + + + + + (host gdb) attach pid + + + + + If gdb hangs when attaching to that process, go back to a shell and + do: + + + host% + kill -CONT pid + + + + + And then get the backtrace: + + + (host gdb) backtrace + + + + + + 14.4. Case 4 : Hangs + + Hangs seem to be fairly rare, but they sometimes happen. When a hang + happens, we need a backtrace from the offending process. Run the + kernel debugger as described in case 1 and get a backtrace. If the + current process is not the idle thread, then send in the backtrace. 
+ You can tell that it's the idle thread if the stack looks like this: + + + #0 0x100b1401 in __libc_nanosleep () + #1 0x100a2885 in idle_sleep (secs=10) at time.c:122 + #2 0x100a546f in do_idle () at process_kern.c:445 + #3 0x100a5508 in cpu_idle () at process_kern.c:471 + #4 0x100ec18f in start_kernel () at init/main.c:592 + #5 0x100a3e10 in start_kernel_proc (unused=0x0) at um_arch.c:71 + #6 0x100a383f in signal_tramp (arg=0x100a3dd8) at trap_user.c:50 + + + + + If this is the case, then some other process is at fault, and went to + sleep when it shouldn't have. Run ps on the host and figure out which + process should not have gone to sleep and stayed asleep. Then attach + to it with gdb and get a backtrace as described in case 3. + + + + + + + 15. Thanks + + + A number of people have helped this project in various ways, and this + page gives recognition where recognition is due. + + + If you're listed here and you would prefer a real link on your name, + or no link at all, instead of the despammed email address pseudo-link, + let me know. + + + If you're not listed here and you think maybe you should be, please + let me know that as well. I try to get everyone, but sometimes my + bookkeeping lapses and I forget about contributions. + + + 15.1. 
Code and Documentation + + Rusty Russell - + + o wrote the HOWTO + + o prodded me into making this project official and putting it on + SourceForge + + o came up with the way cool UML logo + + o redid the config process + + + Peter Moulder - Fixed my config and build + processes, and added some useful code to the block driver + + + Bill Stearns - + + o HOWTO updates + + o lots of bug reports + + o lots of testing + + o dedicated a box (uml.ists.dartmouth.edu) to support UML development + + o wrote the mkrootfs script, which allows bootable filesystems of + RPM-based distributions to be cranked out + + o cranked out a large number of filesystems with said script + + + Jim Leu - Wrote the virtual ethernet driver + and associated usermode tools + + Lars Brinkhoff - Contributed the ptrace + proxy from his own project to allow easier + kernel debugging + + + Andrea Arcangeli - Redid some of the early boot + code so that it would work on machines with Large File Support + + + Chris Emerson - Did + the first UML port to Linux/ppc + + + Harald Welte - Wrote the multicast + transport for the network driver + + + Jorgen Cederlof - Added special file support to hostfs + + + Greg Lonnon - Changed the ubd driver + to allow it to layer a COW file on a shared read-only filesystem and + wrote the iomem emulation support + + + Henrik Nordstrom - Provided a variety + of patches, fixes, and clues + + + Lennert Buytenhek - Contributed various patches, a rewrite of the + network driver, the first implementation of the mconsole driver, and + did the bulk of the work needed to get SMP working again. + + + Yon Uriarte - Fixed the TUN/TAP network backend while I slept. + + + Adam Heath - Made a bunch of nice cleanups to the initialization code, + plus various other small patches. + + + Matt Zimmerman - Matt volunteered to be the UML Debian maintainer and + is doing a real nice job of it. He also noticed and fixed a number of + actually and potentially exploitable security holes in uml_net. 
Plus + the occasional patch. I like patches. + + + James McMechan - James seems to have taken over maintenance of the ubd + driver and is doing a nice job of it. + + + Chandan Kudige - wrote the umlgdb script which automates the reloading + of module symbols. + + + Steve Schmidtke - wrote the UML slirp transport and hostaudio drivers, + enabling UML processes to access audio devices on the host. He also + submitted patches for the slip transport and lots of other things. + + + David Coulson - + + o Set up the usermodelinux.org site, + which is a great way of keeping the UML user community on top of + UML goings-on. + + o Site documentation and updates + + o Nifty little UML management daemon UMLd + + + o Lots of testing and bug reports + + + + + 15.2. Flushing out bugs + + + + o Yuri Pudgorodsky + + o Gerald Britton + + o Ian Wehrman + + o Gord Lamb + + o Eugene Koontz + + o John H. Hartman + + o Anders Karlsson + + o Daniel Phillips + + o John Fremlin + + o Rainer Burgstaller + + o James Stevenson + + o Matt Clay + + o Cliff Jefferies + + o Geoff Hoff + + o Lennert Buytenhek + + o Al Viro + + o Frank Klingenhoefer + + o Livio Baldini Soares + + o Jon Burgess + + o Petru Paler + + o Paul + + o Chris Reahard + + o Sverker Nilsson + + o Gong Su + + o johan verrept + + o Bjorn Eriksson + + o Lorenzo Allegrucci + + o Muli Ben-Yehuda + + o David Mansfield + + o Howard Goff + + o Mike Anderson + + o John Byrne + + o Sapan J. Batia + + o Iris Huang + + o Jan Hudec + + o Voluspa + + + + + 15.3. Buglets and clean-ups + + + + o Dave Zarzycki + + o Adam Lazur + + o Boria Feigin + + o Brian J. 
Murrell + + o JS + + o Roman Zippel + + o Wil Cooley + + o Ayelet Shemesh + + o Will Dyson + + o Sverker Nilsson + + o dvorak + + o v.naga srinivas + + o Shlomi Fish + + o Roger Binns + + o johan verrept + + o MrChuoi + + o Peter Cleve + + o Vincent Guffens + + o Nathan Scott + + o Patrick Caulfield + + o jbearce + + o Catalin Marinas + + o Shane Spencer + + o Zou Min + + + o Ryan Boder + + o Lorenzo Colitti + + o Gwendal Grignou + + o Andre' Breiler + + o Tsutomu Yasuda + + + + 15.4. Case Studies + + + o Jon Wright + + o William McEwan + + o Michael Richardson + + + + 15.5. Other contributions + + + Bill Carr made the Red Hat mkrootfs script + work with RH 6.2. + + Michael Jennings sent in some material which + is now gracing the top of the index page of this site. + + SGI (and more specifically Ralf Baechle ) gave me an account on oss.sgi.com + . The bandwidth there made it possible to + produce most of the filesystems available on the project download + page. + + Laurent Bonnaud took the old grotty + Debian filesystem that I've been distributing and updated it to 2.2. + It is now available by itself here. + + Rik van Riel gave me some ftp space on ftp.nl.linux.org so I can make + releases even when Sourceforge is broken. + + Rodrigo de Castro looked at my broken pte code and told me what was + wrong with it, letting me fix a long-standing (several weeks) and + serious set of bugs. + + Chris Reahard built a specialized root filesystem for running a DNS + server jailed inside UML. It's available from the download + page in the Jail + Filesystems section. + + + + + + + + + + + + diff --git a/Documentation/virtual/index.rst b/Documentation/virtual/index.rst deleted file mode 100644 index 062ffb527043..000000000000 --- a/Documentation/virtual/index.rst +++ /dev/null @@ -1,18 +0,0 @@ -.. SPDX-License-Identifier: GPL-2.0 - -============================ -Linux Virtualization Support -============================ - -.. 
toctree:: - :maxdepth: 2 - - kvm/index - paravirt_ops - -.. only:: html and subproject - - Indices - ======= - - * :ref:`genindex` diff --git a/Documentation/virtual/kvm/amd-memory-encryption.rst b/Documentation/virtual/kvm/amd-memory-encryption.rst deleted file mode 100644 index d18c97b4e140..000000000000 --- a/Documentation/virtual/kvm/amd-memory-encryption.rst +++ /dev/null @@ -1,250 +0,0 @@ -====================================== -Secure Encrypted Virtualization (SEV) -====================================== - -Overview -======== - -Secure Encrypted Virtualization (SEV) is a feature found on AMD processors. - -SEV is an extension to the AMD-V architecture which supports running -virtual machines (VMs) under the control of a hypervisor. When enabled, -the memory contents of a VM will be transparently encrypted with a key -unique to that VM. - -The hypervisor can determine the SEV support through the CPUID -instruction. The CPUID function 0x8000001f reports information related -to SEV:: - - 0x8000001f[eax]: - Bit[1] indicates support for SEV - ... - [ecx]: - Bits[31:0] Number of encrypted guests supported simultaneously - -If support for SEV is present, MSR 0xc001_0010 (MSR_K8_SYSCFG) and MSR 0xc001_0015 -(MSR_K7_HWCR) can be used to determine if it can be enabled:: - - 0xc001_0010: - Bit[23] 1 = memory encryption can be enabled - 0 = memory encryption can not be enabled - - 0xc001_0015: - Bit[0] 1 = memory encryption can be enabled - 0 = memory encryption can not be enabled - -When SEV support is available, it can be enabled in a specific VM by -setting the SEV bit before executing VMRUN.:: - - VMCB[0x90]: - Bit[1] 1 = SEV is enabled - 0 = SEV is disabled - -SEV hardware uses ASIDs to associate a memory encryption key with a VM. -Hence, the ASID for the SEV-enabled guests must be from 1 to a maximum value -defined in the CPUID 0x8000001f[ecx] field. 
- -SEV Key Management -================== - -The SEV guest key management is handled by a separate processor called the AMD -Secure Processor (AMD-SP). Firmware running inside the AMD-SP provides a secure -key management interface to perform common hypervisor activities such as -encrypting bootstrap code, snapshot, migrating and debugging the guest. For more -information, see the SEV Key Management spec [api-spec]_ - -KVM implements the following commands to support common lifecycle events of SEV -guests, such as launching, running, snapshotting, migrating and decommissioning. - -1. KVM_SEV_INIT ---------------- - -The KVM_SEV_INIT command is used by the hypervisor to initialize the SEV platform -context. In a typical workflow, this command should be the first command issued. - -Returns: 0 on success, -negative on error - -2. KVM_SEV_LAUNCH_START ------------------------ - -The KVM_SEV_LAUNCH_START command is used for creating the memory encryption -context. To create the encryption context, user must provide a guest policy, -the owner's public Diffie-Hellman (PDH) key and session information. - -Parameters: struct kvm_sev_launch_start (in/out) - -Returns: 0 on success, -negative on error - -:: - - struct kvm_sev_launch_start { - __u32 handle; /* if zero then firmware creates a new handle */ - __u32 policy; /* guest's policy */ - - __u64 dh_uaddr; /* userspace address pointing to the guest owner's PDH key */ - __u32 dh_len; - - __u64 session_addr; /* userspace address which points to the guest session information */ - __u32 session_len; - }; - -On success, the 'handle' field contains a new handle and on error, a negative value. - -For more details, see SEV spec Section 6.2. - -3. KVM_SEV_LAUNCH_UPDATE_DATA ------------------------------ - -The KVM_SEV_LAUNCH_UPDATE_DATA is used for encrypting a memory region. It also -calculates a measurement of the memory contents. 
The measurement is a signature -of the memory contents that can be sent to the guest owner as an attestation -that the memory was encrypted correctly by the firmware. - -Parameters (in): struct kvm_sev_launch_update_data - -Returns: 0 on success, -negative on error - -:: - - struct kvm_sev_launch_update { - __u64 uaddr; /* userspace address to be encrypted (must be 16-byte aligned) */ - __u32 len; /* length of the data to be encrypted (must be 16-byte aligned) */ - }; - -For more details, see SEV spec Section 6.3. - -4. KVM_SEV_LAUNCH_MEASURE -------------------------- - -The KVM_SEV_LAUNCH_MEASURE command is used to retrieve the measurement of the -data encrypted by the KVM_SEV_LAUNCH_UPDATE_DATA command. The guest owner may -wait to provide the guest with confidential information until it can verify the -measurement. Since the guest owner knows the initial contents of the guest at -boot, the measurement can be verified by comparing it to what the guest owner -expects. - -Parameters (in): struct kvm_sev_launch_measure - -Returns: 0 on success, -negative on error - -:: - - struct kvm_sev_launch_measure { - __u64 uaddr; /* where to copy the measurement */ - __u32 len; /* length of measurement blob */ - }; - -For more details on the measurement verification flow, see SEV spec Section 6.4. - -5. KVM_SEV_LAUNCH_FINISH ------------------------- - -After completion of the launch flow, the KVM_SEV_LAUNCH_FINISH command can be -issued to make the guest ready for the execution. - -Returns: 0 on success, -negative on error - -6. KVM_SEV_GUEST_STATUS ------------------------ - -The KVM_SEV_GUEST_STATUS command is used to retrieve status information about a -SEV-enabled guest. 
- -Parameters (out): struct kvm_sev_guest_status - -Returns: 0 on success, -negative on error - -:: - - struct kvm_sev_guest_status { - __u32 handle; /* guest handle */ - __u32 policy; /* guest policy */ - __u8 state; /* guest state (see enum below) */ - }; - -SEV guest state: - -:: - - enum { - SEV_STATE_INVALID = 0; - SEV_STATE_LAUNCHING, /* guest is currently being launched */ - SEV_STATE_SECRET, /* guest is being launched and ready to accept the ciphertext data */ - SEV_STATE_RUNNING, /* guest is fully launched and running */ - SEV_STATE_RECEIVING, /* guest is being migrated in from another SEV machine */ - SEV_STATE_SENDING /* guest is getting migrated out to another SEV machine */ - }; - -7. KVM_SEV_DBG_DECRYPT ----------------------- - -The KVM_SEV_DEBUG_DECRYPT command can be used by the hypervisor to request the -firmware to decrypt the data at the given memory region. - -Parameters (in): struct kvm_sev_dbg - -Returns: 0 on success, -negative on error - -:: - - struct kvm_sev_dbg { - __u64 src_uaddr; /* userspace address of data to decrypt */ - __u64 dst_uaddr; /* userspace address of destination */ - __u32 len; /* length of memory region to decrypt */ - }; - -The command returns an error if the guest policy does not allow debugging. - -8. KVM_SEV_DBG_ENCRYPT ----------------------- - -The KVM_SEV_DEBUG_ENCRYPT command can be used by the hypervisor to request the -firmware to encrypt the data at the given memory region. - -Parameters (in): struct kvm_sev_dbg - -Returns: 0 on success, -negative on error - -:: - - struct kvm_sev_dbg { - __u64 src_uaddr; /* userspace address of data to encrypt */ - __u64 dst_uaddr; /* userspace address of destination */ - __u32 len; /* length of memory region to encrypt */ - }; - -The command returns an error if the guest policy does not allow debugging. - -9. 
KVM_SEV_LAUNCH_SECRET ------------------------- - -The KVM_SEV_LAUNCH_SECRET command can be used by the hypervisor to inject secret -data after the measurement has been validated by the guest owner. - -Parameters (in): struct kvm_sev_launch_secret - -Returns: 0 on success, -negative on error - -:: - - struct kvm_sev_launch_secret { - __u64 hdr_uaddr; /* userspace address containing the packet header */ - __u32 hdr_len; - - __u64 guest_uaddr; /* the guest memory region where the secret should be injected */ - __u32 guest_len; - - __u64 trans_uaddr; /* the hypervisor memory region which contains the secret */ - __u32 trans_len; - }; - -References -========== - - -See [white-paper]_, [api-spec]_, [amd-apm]_ and [kvm-forum]_ for more info. - -.. [white-paper] http://amd-dev.wpengine.netdna-cdn.com/wordpress/media/2013/12/AMD_Memory_Encryption_Whitepaper_v7-Public.pdf -.. [api-spec] http://support.amd.com/TechDocs/55766_SEV-KM_API_Specification.pdf -.. [amd-apm] http://support.amd.com/TechDocs/24593.pdf (section 15.34) -.. [kvm-forum] http://www.linux-kvm.org/images/7/74/02x08A-Thomas_Lendacky-AMDs_Virtualizatoin_Memory_Encryption_Technology.pdf diff --git a/Documentation/virtual/kvm/api.txt b/Documentation/virtual/kvm/api.txt deleted file mode 100644 index e54a3f51ddc5..000000000000 --- a/Documentation/virtual/kvm/api.txt +++ /dev/null @@ -1,5296 +0,0 @@ -The Definitive KVM (Kernel-based Virtual Machine) API Documentation -=================================================================== - -1. General description ----------------------- - -The kvm API is a set of ioctls that are issued to control various aspects -of a virtual machine. The ioctls belong to three classes: - - - System ioctls: These query and set global attributes which affect the - whole kvm subsystem. In addition a system ioctl is used to create - virtual machines. - - - VM ioctls: These query and set attributes that affect an entire virtual - machine, for example memory layout. 
In addition a VM ioctl is used to - create virtual cpus (vcpus) and devices. - - VM ioctls must be issued from the same process (address space) that was - used to create the VM. - - - vcpu ioctls: These query and set attributes that control the operation - of a single virtual cpu. - - vcpu ioctls should be issued from the same thread that was used to create - the vcpu, except for asynchronous vcpu ioctls that are marked as such in - the documentation. Otherwise, the first ioctl after switching threads - could see a performance impact. - - - device ioctls: These query and set attributes that control the operation - of a single device. - - device ioctls must be issued from the same process (address space) that - was used to create the VM. - -2. File descriptors -------------------- - -The kvm API is centered around file descriptors. An initial -open("/dev/kvm") obtains a handle to the kvm subsystem; this handle -can be used to issue system ioctls. A KVM_CREATE_VM ioctl on this -handle will create a VM file descriptor which can be used to issue VM -ioctls. A KVM_CREATE_VCPU or KVM_CREATE_DEVICE ioctl on a VM fd will -create a virtual cpu or device and return a file descriptor pointing to -the new resource. Finally, ioctls on a vcpu or device fd can be used -to control the vcpu or device. For vcpus, this includes the important -task of actually running guest code. - -In general file descriptors can be migrated among processes by means -of fork() and the SCM_RIGHTS facility of unix domain socket. These -kinds of tricks are explicitly not supported by kvm. While they will -not cause harm to the host, their actual behavior is not guaranteed by -the API. See "General description" for details on the ioctl usage -model that is supported by KVM. - -It is important to note that although VM ioctls may only be issued from -the process that created the VM, a VM's lifecycle is associated with its -file descriptor, not its creator (process). 
In other words, the VM and -its resources, *including the associated address space*, are not freed -until the last reference to the VM's file descriptor has been released. -For example, if fork() is issued after ioctl(KVM_CREATE_VM), the VM will -not be freed until both the parent (original) process and its child have -put their references to the VM's file descriptor. - -Because a VM's resources are not freed until the last reference to its -file descriptor is released, creating additional references to a VM -via fork(), dup(), etc... without careful consideration is strongly -discouraged and may have unwanted side effects, e.g. memory allocated -by and on behalf of the VM's process may not be freed/unaccounted when -the VM is shut down. - - -3. Extensions -------------- - -As of Linux 2.6.22, the KVM ABI has been stabilized: no backward -incompatible changes are allowed. However, there is an extension -facility that allows backward-compatible extensions to the API to be -queried and used. - -The extension mechanism is not based on the Linux version number. -Instead, kvm defines extension identifiers and a facility to query -whether a particular extension identifier is available. If it is, a -set of ioctls is available for application use. - - -4. API description ------------------- - -This section describes ioctls that can be used to control kvm guests. -For each ioctl, the following information is provided along with a -description: - - Capability: which KVM extension provides this ioctl. Can be 'basic', - which means that it will be provided by any kernel that supports - API version 12 (see section 4.1), a KVM_CAP_xyz constant, which - means availability needs to be checked with KVM_CHECK_EXTENSION - (see section 4.4), or 'none' which means that while not all kernels - support this ioctl, there's no capability bit to check its - availability: for kernels that don't support the ioctl, - the ioctl returns -ENOTTY. 
- - Architectures: which instruction set architectures provide this ioctl. - x86 includes both i386 and x86_64. - - Type: system, vm, or vcpu. - - Parameters: what parameters are accepted by the ioctl. - - Returns: the return value. General error numbers (EBADF, ENOMEM, EINVAL) - are not detailed, but errors with specific meanings are. - - -4.1 KVM_GET_API_VERSION - -Capability: basic -Architectures: all -Type: system ioctl -Parameters: none -Returns: the constant KVM_API_VERSION (=12) - -This identifies the API version as the stable kvm API. It is not -expected that this number will change. However, Linux 2.6.20 and -2.6.21 report earlier versions; these are not documented and not -supported. Applications should refuse to run if KVM_GET_API_VERSION -returns a value other than 12. If this check passes, all ioctls -described as 'basic' will be available. - - -4.2 KVM_CREATE_VM - -Capability: basic -Architectures: all -Type: system ioctl -Parameters: machine type identifier (KVM_VM_*) -Returns: a VM fd that can be used to control the new virtual machine. - -The new VM has no virtual cpus and no memory. -You probably want to use 0 as machine type. - -In order to create user controlled virtual machines on S390, check -KVM_CAP_S390_UCONTROL and use the flag KVM_VM_S390_UCONTROL as -privileged user (CAP_SYS_ADMIN). - -To use hardware assisted virtualization on MIPS (VZ ASE) rather than -the default trap & emulate implementation (which changes the virtual -memory layout to fit in user mode), check KVM_CAP_MIPS_VZ and use the -flag KVM_VM_MIPS_VZ. - - -On arm64, the physical address size for a VM (IPA Size limit) is limited -to 40bits by default. The limit can be configured if the host supports the -extension KVM_CAP_ARM_VM_IPA_SIZE. When supported, use -KVM_VM_TYPE_ARM_IPA_SIZE(IPA_Bits) to set the size in the machine type -identifier, where IPA_Bits is the maximum width of any physical -address used by the VM. 
The IPA_Bits is encoded in bits[7-0] of the -machine type identifier. - -e.g, to configure a guest to use 48bit physical address size : - - vm_fd = ioctl(dev_fd, KVM_CREATE_VM, KVM_VM_TYPE_ARM_IPA_SIZE(48)); - -The requested size (IPA_Bits) must be : - 0 - Implies default size, 40bits (for backward compatibility) - - or - - N - Implies N bits, where N is a positive integer such that, - 32 <= N <= Host_IPA_Limit - -Host_IPA_Limit is the maximum possible value for IPA_Bits on the host and -is dependent on the CPU capability and the kernel configuration. The limit can -be retrieved using KVM_CAP_ARM_VM_IPA_SIZE of the KVM_CHECK_EXTENSION -ioctl() at run-time. - -Please note that configuring the IPA size does not affect the capability -exposed by the guest CPUs in ID_AA64MMFR0_EL1[PARange]. It only affects -size of the address translated by the stage2 level (guest physical to -host physical address translations). - - -4.3 KVM_GET_MSR_INDEX_LIST, KVM_GET_MSR_FEATURE_INDEX_LIST - -Capability: basic, KVM_CAP_GET_MSR_FEATURES for KVM_GET_MSR_FEATURE_INDEX_LIST -Architectures: x86 -Type: system ioctl -Parameters: struct kvm_msr_list (in/out) -Returns: 0 on success; -1 on error -Errors: - EFAULT: the msr index list cannot be read from or written to - E2BIG: the msr index list is too big to fit in the array specified by - the user. - -struct kvm_msr_list { - __u32 nmsrs; /* number of msrs in entries */ - __u32 indices[0]; -}; - -The user fills in the size of the indices array in nmsrs, and in return -kvm adjusts nmsrs to reflect the actual number of msrs and fills in the -indices array with their numbers. - -KVM_GET_MSR_INDEX_LIST returns the guest msrs that are supported. The list -varies by kvm version and host processor, but does not change otherwise. - -Note: if kvm indicates support for MCE (KVM_CAP_MCE), then the MCE bank MSRs are -not returned in the MSR list, as different vcpus can have a different number -of banks, as set via the KVM_X86_SETUP_MCE ioctl. 
- -KVM_GET_MSR_FEATURE_INDEX_LIST returns the list of MSRs that can be passed -to the KVM_GET_MSRS system ioctl. This lets userspace probe host capabilities -and processor features that are exposed via MSRs (e.g., VMX capabilities). -This list also varies by kvm version and host processor, but does not change -otherwise. - - -4.4 KVM_CHECK_EXTENSION - -Capability: basic, KVM_CAP_CHECK_EXTENSION_VM for vm ioctl -Architectures: all -Type: system ioctl, vm ioctl -Parameters: extension identifier (KVM_CAP_*) -Returns: 0 if unsupported; 1 (or some other positive integer) if supported - -The API allows the application to query about extensions to the core -kvm API. Userspace passes an extension identifier (an integer) and -receives an integer that describes the extension availability. -Generally 0 means no and 1 means yes, but some extensions may report -additional information in the integer return value. - -Based on their initialization different VMs may have different capabilities. -It is thus encouraged to use the vm ioctl to query for capabilities (available -with KVM_CAP_CHECK_EXTENSION_VM on the vm fd) - -4.5 KVM_GET_VCPU_MMAP_SIZE - -Capability: basic -Architectures: all -Type: system ioctl -Parameters: none -Returns: size of vcpu mmap area, in bytes - -The KVM_RUN ioctl (cf.) communicates with userspace via a shared -memory region. This ioctl returns the size of that region. See the -KVM_RUN documentation for details. - - -4.6 KVM_SET_MEMORY_REGION - -Capability: basic -Architectures: all -Type: vm ioctl -Parameters: struct kvm_memory_region (in) -Returns: 0 on success, -1 on error - -This ioctl is obsolete and has been removed. - - -4.7 KVM_CREATE_VCPU - -Capability: basic -Architectures: all -Type: vm ioctl -Parameters: vcpu id (apic id on x86) -Returns: vcpu fd on success, -1 on error - -This API adds a vcpu to a virtual machine. No more than max_vcpus may be added. -The vcpu id is an integer in the range [0, max_vcpu_id). 
- -The recommended max_vcpus value can be retrieved using the KVM_CAP_NR_VCPUS of -the KVM_CHECK_EXTENSION ioctl() at run-time. -The maximum possible value for max_vcpus can be retrieved using the -KVM_CAP_MAX_VCPUS of the KVM_CHECK_EXTENSION ioctl() at run-time. - -If the KVM_CAP_NR_VCPUS does not exist, you should assume that max_vcpus is 4 -cpus max. -If the KVM_CAP_MAX_VCPUS does not exist, you should assume that max_vcpus is -same as the value returned from KVM_CAP_NR_VCPUS. - -The maximum possible value for max_vcpu_id can be retrieved using the -KVM_CAP_MAX_VCPU_ID of the KVM_CHECK_EXTENSION ioctl() at run-time. - -If the KVM_CAP_MAX_VCPU_ID does not exist, you should assume that max_vcpu_id -is the same as the value returned from KVM_CAP_MAX_VCPUS. - -On powerpc using book3s_hv mode, the vcpus are mapped onto virtual -threads in one or more virtual CPU cores. (This is because the -hardware requires all the hardware threads in a CPU core to be in the -same partition.) The KVM_CAP_PPC_SMT capability indicates the number -of vcpus per virtual core (vcore). The vcore id is obtained by -dividing the vcpu id by the number of vcpus per vcore. The vcpus in a -given vcore will always be in the same physical core as each other -(though that might be a different physical core from time to time). -Userspace can control the threading (SMT) mode of the guest by its -allocation of vcpu ids. For example, if userspace wants -single-threaded guest vcpus, it should make all vcpu ids be a multiple -of the number of vcpus per vcore. - -For virtual cpus that have been created with S390 user controlled virtual -machines, the resulting vcpu fd can be memory mapped at page offset -KVM_S390_SIE_PAGE_OFFSET in order to obtain a memory map of the virtual -cpu's hardware control block. 
- - -4.8 KVM_GET_DIRTY_LOG (vm ioctl) - -Capability: basic -Architectures: all -Type: vm ioctl -Parameters: struct kvm_dirty_log (in/out) -Returns: 0 on success, -1 on error - -/* for KVM_GET_DIRTY_LOG */ -struct kvm_dirty_log { - __u32 slot; - __u32 padding; - union { - void __user *dirty_bitmap; /* one bit per page */ - __u64 padding; - }; -}; - -Given a memory slot, return a bitmap containing any pages dirtied -since the last call to this ioctl. Bit 0 is the first page in the -memory slot. Ensure the entire structure is cleared to avoid padding -issues. - -If KVM_CAP_MULTI_ADDRESS_SPACE is available, bits 16-31 specifies -the address space for which you want to return the dirty bitmap. -They must be less than the value that KVM_CHECK_EXTENSION returns for -the KVM_CAP_MULTI_ADDRESS_SPACE capability. - -The bits in the dirty bitmap are cleared before the ioctl returns, unless -KVM_CAP_MANUAL_DIRTY_LOG_PROTECT2 is enabled. For more information, -see the description of the capability. - -4.9 KVM_SET_MEMORY_ALIAS - -Capability: basic -Architectures: x86 -Type: vm ioctl -Parameters: struct kvm_memory_alias (in) -Returns: 0 (success), -1 (error) - -This ioctl is obsolete and has been removed. - - -4.10 KVM_RUN - -Capability: basic -Architectures: all -Type: vcpu ioctl -Parameters: none -Returns: 0 on success, -1 on error -Errors: - EINTR: an unmasked signal is pending - -This ioctl is used to run a guest virtual cpu. While there are no -explicit parameters, there is an implicit parameter block that can be -obtained by mmap()ing the vcpu fd at offset 0, with the size given by -KVM_GET_VCPU_MMAP_SIZE. The parameter block is formatted as a 'struct -kvm_run' (see below). - - -4.11 KVM_GET_REGS - -Capability: basic -Architectures: all except ARM, arm64 -Type: vcpu ioctl -Parameters: struct kvm_regs (out) -Returns: 0 on success, -1 on error - -Reads the general purpose registers from the vcpu. 
- -/* x86 */ -struct kvm_regs { - /* out (KVM_GET_REGS) / in (KVM_SET_REGS) */ - __u64 rax, rbx, rcx, rdx; - __u64 rsi, rdi, rsp, rbp; - __u64 r8, r9, r10, r11; - __u64 r12, r13, r14, r15; - __u64 rip, rflags; -}; - -/* mips */ -struct kvm_regs { - /* out (KVM_GET_REGS) / in (KVM_SET_REGS) */ - __u64 gpr[32]; - __u64 hi; - __u64 lo; - __u64 pc; -}; - - -4.12 KVM_SET_REGS - -Capability: basic -Architectures: all except ARM, arm64 -Type: vcpu ioctl -Parameters: struct kvm_regs (in) -Returns: 0 on success, -1 on error - -Writes the general purpose registers into the vcpu. - -See KVM_GET_REGS for the data structure. - - -4.13 KVM_GET_SREGS - -Capability: basic -Architectures: x86, ppc -Type: vcpu ioctl -Parameters: struct kvm_sregs (out) -Returns: 0 on success, -1 on error - -Reads special registers from the vcpu. - -/* x86 */ -struct kvm_sregs { - struct kvm_segment cs, ds, es, fs, gs, ss; - struct kvm_segment tr, ldt; - struct kvm_dtable gdt, idt; - __u64 cr0, cr2, cr3, cr4, cr8; - __u64 efer; - __u64 apic_base; - __u64 interrupt_bitmap[(KVM_NR_INTERRUPTS + 63) / 64]; -}; - -/* ppc -- see arch/powerpc/include/uapi/asm/kvm.h */ - -interrupt_bitmap is a bitmap of pending external interrupts. At most -one bit may be set. This interrupt has been acknowledged by the APIC -but not yet injected into the cpu core. - - -4.14 KVM_SET_SREGS - -Capability: basic -Architectures: x86, ppc -Type: vcpu ioctl -Parameters: struct kvm_sregs (in) -Returns: 0 on success, -1 on error - -Writes special registers into the vcpu. See KVM_GET_SREGS for the -data structures. - - -4.15 KVM_TRANSLATE - -Capability: basic -Architectures: x86 -Type: vcpu ioctl -Parameters: struct kvm_translation (in/out) -Returns: 0 on success, -1 on error - -Translates a virtual address according to the vcpu's current address -translation mode. 
- -struct kvm_translation { - /* in */ - __u64 linear_address; - - /* out */ - __u64 physical_address; - __u8 valid; - __u8 writeable; - __u8 usermode; - __u8 pad[5]; -}; - - -4.16 KVM_INTERRUPT - -Capability: basic -Architectures: x86, ppc, mips -Type: vcpu ioctl -Parameters: struct kvm_interrupt (in) -Returns: 0 on success, negative on failure. - -Queues a hardware interrupt vector to be injected. - -/* for KVM_INTERRUPT */ -struct kvm_interrupt { - /* in */ - __u32 irq; -}; - -X86: - -Returns: 0 on success, - -EEXIST if an interrupt is already enqueued - -EINVAL if the irq number is invalid - -ENXIO if the PIC is in the kernel - -EFAULT if the pointer is invalid - -Note 'irq' is an interrupt vector, not an interrupt pin or line. This -ioctl is useful if the in-kernel PIC is not used. - -PPC: - -Queues an external interrupt to be injected. This ioctl is overloaded -with 3 different irq values: - -a) KVM_INTERRUPT_SET - - This injects an edge type external interrupt into the guest once it's ready - to receive interrupts. When injected, the interrupt is done. - -b) KVM_INTERRUPT_UNSET - - This unsets any pending interrupt. - - Only available with KVM_CAP_PPC_UNSET_IRQ. - -c) KVM_INTERRUPT_SET_LEVEL - - This injects a level type external interrupt into the guest context. The - interrupt stays pending until a specific ioctl with KVM_INTERRUPT_UNSET - is triggered. - - Only available with KVM_CAP_PPC_IRQ_LEVEL. - -Note that any value for 'irq' other than the ones stated above is invalid -and incurs unexpected behavior. - -This is an asynchronous vcpu ioctl and can be invoked from any thread. - -MIPS: - -Queues an external interrupt to be injected into the virtual CPU. A negative -interrupt number dequeues the interrupt. - -This is an asynchronous vcpu ioctl and can be invoked from any thread. - - -4.17 KVM_DEBUG_GUEST - -Capability: basic -Architectures: none -Type: vcpu ioctl -Parameters: none -Returns: -1 on error - -Support for this has been removed.
Use KVM_SET_GUEST_DEBUG instead. - - -4.18 KVM_GET_MSRS - -Capability: basic (vcpu), KVM_CAP_GET_MSR_FEATURES (system) -Architectures: x86 -Type: system ioctl, vcpu ioctl -Parameters: struct kvm_msrs (in/out) -Returns: number of msrs successfully returned; - -1 on error - -When used as a system ioctl: -Reads the values of MSR-based features that are available for the VM. This -is similar to KVM_GET_SUPPORTED_CPUID, but it returns MSR indices and values. -The list of msr-based features can be obtained using KVM_GET_MSR_FEATURE_INDEX_LIST -in a system ioctl. - -When used as a vcpu ioctl: -Reads model-specific registers from the vcpu. Supported msr indices can -be obtained using KVM_GET_MSR_INDEX_LIST in a system ioctl. - -struct kvm_msrs { - __u32 nmsrs; /* number of msrs in entries */ - __u32 pad; - - struct kvm_msr_entry entries[0]; -}; - -struct kvm_msr_entry { - __u32 index; - __u32 reserved; - __u64 data; -}; - -Application code should set the 'nmsrs' member (which indicates the -size of the entries array) and the 'index' member of each array entry. -kvm will fill in the 'data' member. - - -4.19 KVM_SET_MSRS - -Capability: basic -Architectures: x86 -Type: vcpu ioctl -Parameters: struct kvm_msrs (in) -Returns: 0 on success, -1 on error - -Writes model-specific registers to the vcpu. See KVM_GET_MSRS for the -data structures. - -Application code should set the 'nmsrs' member (which indicates the -size of the entries array), and the 'index' and 'data' members of each -array entry. - - -4.20 KVM_SET_CPUID - -Capability: basic -Architectures: x86 -Type: vcpu ioctl -Parameters: struct kvm_cpuid (in) -Returns: 0 on success, -1 on error - -Defines the vcpu responses to the cpuid instruction. Applications -should use the KVM_SET_CPUID2 ioctl if available. 
- - -struct kvm_cpuid_entry { - __u32 function; - __u32 eax; - __u32 ebx; - __u32 ecx; - __u32 edx; - __u32 padding; -}; - -/* for KVM_SET_CPUID */ -struct kvm_cpuid { - __u32 nent; - __u32 padding; - struct kvm_cpuid_entry entries[0]; -}; - - -4.21 KVM_SET_SIGNAL_MASK - -Capability: basic -Architectures: all -Type: vcpu ioctl -Parameters: struct kvm_signal_mask (in) -Returns: 0 on success, -1 on error - -Defines which signals are blocked during execution of KVM_RUN. This -signal mask temporarily overrides the thread's signal mask. Any -unblocked signal received (except SIGKILL and SIGSTOP, which retain -their traditional behaviour) will cause KVM_RUN to return with -EINTR. - -Note the signal will only be delivered if not blocked by the original -signal mask. - -/* for KVM_SET_SIGNAL_MASK */ -struct kvm_signal_mask { - __u32 len; - __u8 sigset[0]; -}; - - -4.22 KVM_GET_FPU - -Capability: basic -Architectures: x86 -Type: vcpu ioctl -Parameters: struct kvm_fpu (out) -Returns: 0 on success, -1 on error - -Reads the floating point state from the vcpu. - -/* for KVM_GET_FPU and KVM_SET_FPU */ -struct kvm_fpu { - __u8 fpr[8][16]; - __u16 fcw; - __u16 fsw; - __u8 ftwx; /* in fxsave format */ - __u8 pad1; - __u16 last_opcode; - __u64 last_ip; - __u64 last_dp; - __u8 xmm[16][16]; - __u32 mxcsr; - __u32 pad2; -}; - - -4.23 KVM_SET_FPU - -Capability: basic -Architectures: x86 -Type: vcpu ioctl -Parameters: struct kvm_fpu (in) -Returns: 0 on success, -1 on error - -Writes the floating point state to the vcpu.
- -/* for KVM_GET_FPU and KVM_SET_FPU */ -struct kvm_fpu { - __u8 fpr[8][16]; - __u16 fcw; - __u16 fsw; - __u8 ftwx; /* in fxsave format */ - __u8 pad1; - __u16 last_opcode; - __u64 last_ip; - __u64 last_dp; - __u8 xmm[16][16]; - __u32 mxcsr; - __u32 pad2; -}; - - -4.24 KVM_CREATE_IRQCHIP - -Capability: KVM_CAP_IRQCHIP, KVM_CAP_S390_IRQCHIP (s390) -Architectures: x86, ARM, arm64, s390 -Type: vm ioctl -Parameters: none -Returns: 0 on success, -1 on error - -Creates an interrupt controller model in the kernel. -On x86, creates a virtual ioapic, a virtual PIC (two PICs, nested), and sets up -future vcpus to have a local APIC. IRQ routing for GSIs 0-15 is set to both -PIC and IOAPIC; GSI 16-23 only go to the IOAPIC. -On ARM/arm64, a GICv2 is created. Any other GIC versions require the usage of -KVM_CREATE_DEVICE, which also supports creating a GICv2. Using -KVM_CREATE_DEVICE is preferred over KVM_CREATE_IRQCHIP for GICv2. -On s390, a dummy irq routing table is created. - -Note that on s390 the KVM_CAP_S390_IRQCHIP vm capability needs to be enabled -before KVM_CREATE_IRQCHIP can be used. - - -4.25 KVM_IRQ_LINE - -Capability: KVM_CAP_IRQCHIP -Architectures: x86, arm, arm64 -Type: vm ioctl -Parameters: struct kvm_irq_level -Returns: 0 on success, -1 on error - -Sets the level of a GSI input to the interrupt controller model in the kernel. -On some architectures it is required that an interrupt controller model has -been previously created with KVM_CREATE_IRQCHIP. Note that edge-triggered -interrupts require the level to be set to 1 and then back to 0. - -On real hardware, interrupt pins can be active-low or active-high. This -does not matter for the level field of struct kvm_irq_level: 1 always -means active (asserted), 0 means inactive (deasserted). - -x86 allows the operating system to program the interrupt polarity -(active-low/active-high) for level-triggered interrupts, and KVM used -to consider the polarity. 
However, due to bitrot in the handling of -active-low interrupts, the above convention is now valid on x86 too. -This is signaled by KVM_CAP_X86_IOAPIC_POLARITY_IGNORED. Userspace -should not present interrupts to the guest as active-low unless this -capability is present (or unless it is not using the in-kernel irqchip, -of course). - - -ARM/arm64 can signal an interrupt either at the CPU level, or at the -in-kernel irqchip (GIC), and for in-kernel irqchip can tell the GIC to -use PPIs designated for specific cpus. The irq field is interpreted -like this: - -  bits: | 31 ... 24 | 23 ... 16 | 15 ... 0 | - field: | irq_type | vcpu_index | irq_id | - -The irq_type field has the following values: -- irq_type[0]: out-of-kernel GIC: irq_id 0 is IRQ, irq_id 1 is FIQ -- irq_type[1]: in-kernel GIC: SPI, irq_id between 32 and 1019 (incl.) - (the vcpu_index field is ignored) -- irq_type[2]: in-kernel GIC: PPI, irq_id between 16 and 31 (incl.) - -(The irq_id field thus corresponds nicely to the IRQ ID in the ARM GIC specs) - -In both cases, level is used to assert/deassert the line. - -struct kvm_irq_level { - union { - __u32 irq; /* GSI */ - __s32 status; /* not used for KVM_IRQ_LEVEL */ - }; - __u32 level; /* 0 or 1 */ -}; - - -4.26 KVM_GET_IRQCHIP - -Capability: KVM_CAP_IRQCHIP -Architectures: x86 -Type: vm ioctl -Parameters: struct kvm_irqchip (in/out) -Returns: 0 on success, -1 on error - -Reads the state of a kernel interrupt controller created with -KVM_CREATE_IRQCHIP into a buffer provided by the caller. 
- -struct kvm_irqchip { - __u32 chip_id; /* 0 = PIC1, 1 = PIC2, 2 = IOAPIC */ - __u32 pad; - union { - char dummy[512]; /* reserving space */ - struct kvm_pic_state pic; - struct kvm_ioapic_state ioapic; - } chip; -}; - - -4.27 KVM_SET_IRQCHIP - -Capability: KVM_CAP_IRQCHIP -Architectures: x86 -Type: vm ioctl -Parameters: struct kvm_irqchip (in) -Returns: 0 on success, -1 on error - -Sets the state of a kernel interrupt controller created with -KVM_CREATE_IRQCHIP from a buffer provided by the caller. - -struct kvm_irqchip { - __u32 chip_id; /* 0 = PIC1, 1 = PIC2, 2 = IOAPIC */ - __u32 pad; - union { - char dummy[512]; /* reserving space */ - struct kvm_pic_state pic; - struct kvm_ioapic_state ioapic; - } chip; -}; - - -4.28 KVM_XEN_HVM_CONFIG - -Capability: KVM_CAP_XEN_HVM -Architectures: x86 -Type: vm ioctl -Parameters: struct kvm_xen_hvm_config (in) -Returns: 0 on success, -1 on error - -Sets the MSR that the Xen HVM guest uses to initialize its hypercall -page, and provides the starting address and size of the hypercall -blobs in userspace. When the guest writes the MSR, kvm copies one -page of a blob (32- or 64-bit, depending on the vcpu mode) to guest -memory. - -struct kvm_xen_hvm_config { - __u32 flags; - __u32 msr; - __u64 blob_addr_32; - __u64 blob_addr_64; - __u8 blob_size_32; - __u8 blob_size_64; - __u8 pad2[30]; -}; - - -4.29 KVM_GET_CLOCK - -Capability: KVM_CAP_ADJUST_CLOCK -Architectures: x86 -Type: vm ioctl -Parameters: struct kvm_clock_data (out) -Returns: 0 on success, -1 on error - -Gets the current timestamp of kvmclock as seen by the current guest. In -conjunction with KVM_SET_CLOCK, it is used to ensure monotonicity on scenarios -such as migration. - -When KVM_CAP_ADJUST_CLOCK is passed to KVM_CHECK_EXTENSION, it returns the -set of bits that KVM can return in struct kvm_clock_data's flag member. - -The only flag defined now is KVM_CLOCK_TSC_STABLE. 
If set, the returned -value is the exact kvmclock value seen by all VCPUs at the instant -when KVM_GET_CLOCK was called. If clear, the returned value is simply -CLOCK_MONOTONIC plus a constant offset; the offset can be modified -with KVM_SET_CLOCK. KVM will try to make all VCPUs follow this clock, -but the exact value read by each VCPU could differ, because the host -TSC is not stable. - -struct kvm_clock_data { - __u64 clock; /* kvmclock current value */ - __u32 flags; - __u32 pad[9]; -}; - - -4.30 KVM_SET_CLOCK - -Capability: KVM_CAP_ADJUST_CLOCK -Architectures: x86 -Type: vm ioctl -Parameters: struct kvm_clock_data (in) -Returns: 0 on success, -1 on error - -Sets the current timestamp of kvmclock to the value specified in its parameter. -In conjunction with KVM_GET_CLOCK, it is used to ensure monotonicity on scenarios -such as migration. - -struct kvm_clock_data { - __u64 clock; /* kvmclock current value */ - __u32 flags; - __u32 pad[9]; -}; - - -4.31 KVM_GET_VCPU_EVENTS - -Capability: KVM_CAP_VCPU_EVENTS -Extended by: KVM_CAP_INTR_SHADOW -Architectures: x86, arm, arm64 -Type: vcpu ioctl -Parameters: struct kvm_vcpu_event (out) -Returns: 0 on success, -1 on error - -X86: - -Gets currently pending exceptions, interrupts, and NMIs as well as related -states of the vcpu. - -struct kvm_vcpu_events { - struct { - __u8 injected; - __u8 nr; - __u8 has_error_code; - __u8 pending; - __u32 error_code; - } exception; - struct { - __u8 injected; - __u8 nr; - __u8 soft; - __u8 shadow; - } interrupt; - struct { - __u8 injected; - __u8 pending; - __u8 masked; - __u8 pad; - } nmi; - __u32 sipi_vector; - __u32 flags; - struct { - __u8 smm; - __u8 pending; - __u8 smm_inside_nmi; - __u8 latched_init; - } smi; - __u8 reserved[27]; - __u8 exception_has_payload; - __u64 exception_payload; -}; - -The following bits are defined in the flags field: - -- KVM_VCPUEVENT_VALID_SHADOW may be set to signal that - interrupt.shadow contains a valid state. 
- -- KVM_VCPUEVENT_VALID_SMM may be set to signal that smi contains a - valid state. - -- KVM_VCPUEVENT_VALID_PAYLOAD may be set to signal that the - exception_has_payload, exception_payload, and exception.pending - fields contain a valid state. This bit will be set whenever - KVM_CAP_EXCEPTION_PAYLOAD is enabled. - -ARM/ARM64: - -If the guest accesses a device that is being emulated by the host kernel in -such a way that a real device would generate a physical SError, KVM may make -a virtual SError pending for that VCPU. This system error interrupt remains -pending until the guest takes the exception by unmasking PSTATE.A. - -Running the VCPU may cause it to take a pending SError, or make an access that -causes an SError to become pending. The event's description is only valid while -the VCPU is not running. - -This API provides a way to read and write the pending 'event' state that is not -visible to the guest. To save, restore or migrate a VCPU the struct representing -the state can be read then written using this GET/SET API, along with the other -guest-visible registers. It is not possible to 'cancel' an SError that has been -made pending. - -A device being emulated in user-space may also wish to generate an SError. To do -this the events structure can be populated by user-space. The current state -should be read first, to ensure no existing SError is pending. If an existing -SError is pending, the architecture's 'Multiple SError interrupts' rules should -be followed. (2.5.3 of DDI0587.a "ARM Reliability, Availability, and -Serviceability (RAS) Specification"). - -SError exceptions always have an ESR value. Some CPUs have the ability to -specify what the virtual SError's ESR value should be. These systems will -advertise KVM_CAP_ARM_INJECT_SERROR_ESR. In this case exception.has_esr will -always have a non-zero value when read, and the agent making an SError pending -should specify the ISS field in the lower 24 bits of exception.serror_esr.
If -the system supports KVM_CAP_ARM_INJECT_SERROR_ESR, but user-space sets the events -with exception.has_esr as zero, KVM will choose an ESR. - -Specifying exception.has_esr on a system that does not support it will return --EINVAL. Setting anything other than the lower 24bits of exception.serror_esr -will return -EINVAL. - -struct kvm_vcpu_events { - struct { - __u8 serror_pending; - __u8 serror_has_esr; - /* Align it to 8 bytes */ - __u8 pad[6]; - __u64 serror_esr; - } exception; - __u32 reserved[12]; -}; - -4.32 KVM_SET_VCPU_EVENTS - -Capability: KVM_CAP_VCPU_EVENTS -Extended by: KVM_CAP_INTR_SHADOW -Architectures: x86, arm, arm64 -Type: vcpu ioctl -Parameters: struct kvm_vcpu_event (in) -Returns: 0 on success, -1 on error - -X86: - -Set pending exceptions, interrupts, and NMIs as well as related states of the -vcpu. - -See KVM_GET_VCPU_EVENTS for the data structure. - -Fields that may be modified asynchronously by running VCPUs can be excluded -from the update. These fields are nmi.pending, sipi_vector, smi.smm, -smi.pending. Keep the corresponding bits in the flags field cleared to -suppress overwriting the current in-kernel state. The bits are: - -KVM_VCPUEVENT_VALID_NMI_PENDING - transfer nmi.pending to the kernel -KVM_VCPUEVENT_VALID_SIPI_VECTOR - transfer sipi_vector -KVM_VCPUEVENT_VALID_SMM - transfer the smi sub-struct. - -If KVM_CAP_INTR_SHADOW is available, KVM_VCPUEVENT_VALID_SHADOW can be set in -the flags field to signal that interrupt.shadow contains a valid state and -shall be written into the VCPU. - -KVM_VCPUEVENT_VALID_SMM can only be set if KVM_CAP_X86_SMM is available. - -If KVM_CAP_EXCEPTION_PAYLOAD is enabled, KVM_VCPUEVENT_VALID_PAYLOAD -can be set in the flags field to signal that the -exception_has_payload, exception_payload, and exception.pending fields -contain a valid state and shall be written into the VCPU. - -ARM/ARM64: - -Set the pending SError exception state for this VCPU. 
It is not possible to -'cancel' an SError that has been made pending. - -See KVM_GET_VCPU_EVENTS for the data structure. - - -4.33 KVM_GET_DEBUGREGS - -Capability: KVM_CAP_DEBUGREGS -Architectures: x86 -Type: vm ioctl -Parameters: struct kvm_debugregs (out) -Returns: 0 on success, -1 on error - -Reads debug registers from the vcpu. - -struct kvm_debugregs { - __u64 db[4]; - __u64 dr6; - __u64 dr7; - __u64 flags; - __u64 reserved[9]; -}; - - -4.34 KVM_SET_DEBUGREGS - -Capability: KVM_CAP_DEBUGREGS -Architectures: x86 -Type: vm ioctl -Parameters: struct kvm_debugregs (in) -Returns: 0 on success, -1 on error - -Writes debug registers into the vcpu. - -See KVM_GET_DEBUGREGS for the data structure. The flags field is unused -yet and must be cleared on entry. - - -4.35 KVM_SET_USER_MEMORY_REGION - -Capability: KVM_CAP_USER_MEMORY -Architectures: all -Type: vm ioctl -Parameters: struct kvm_userspace_memory_region (in) -Returns: 0 on success, -1 on error - -struct kvm_userspace_memory_region { - __u32 slot; - __u32 flags; - __u64 guest_phys_addr; - __u64 memory_size; /* bytes */ - __u64 userspace_addr; /* start of the userspace allocated memory */ -}; - -/* for kvm_memory_region::flags */ -#define KVM_MEM_LOG_DIRTY_PAGES (1UL << 0) -#define KVM_MEM_READONLY (1UL << 1) - -This ioctl allows the user to create, modify or delete a guest physical -memory slot. Bits 0-15 of "slot" specify the slot id and this value -should be less than the maximum number of user memory slots supported per -VM. The maximum allowed slots can be queried using KVM_CAP_NR_MEMSLOTS. -Slots may not overlap in guest physical address space. - -If KVM_CAP_MULTI_ADDRESS_SPACE is available, bits 16-31 of "slot" -specify the address space which is being modified. They must be -less than the value that KVM_CHECK_EXTENSION returns for the -KVM_CAP_MULTI_ADDRESS_SPACE capability. Slots in separate address spaces -are unrelated; the restriction on overlapping slots only applies within -each address space.
- -Deleting a slot is done by passing zero for memory_size. When changing -an existing slot, it may be moved in the guest physical memory space, -or its flags may be modified, but it may not be resized. - -Memory for the region is taken starting at the address denoted by the -field userspace_addr, which must point at user addressable memory for -the entire memory slot size. Any object may back this memory, including -anonymous memory, ordinary files, and hugetlbfs. - -It is recommended that the lower 21 bits of guest_phys_addr and userspace_addr -be identical. This allows large pages in the guest to be backed by large -pages in the host. - -The flags field supports two flags: KVM_MEM_LOG_DIRTY_PAGES and -KVM_MEM_READONLY. The former can be set to instruct KVM to keep track of -writes to memory within the slot. See KVM_GET_DIRTY_LOG ioctl to know how to -use it. The latter can be set, if KVM_CAP_READONLY_MEM capability allows it, -to make a new slot read-only. In this case, writes to this memory will be -posted to userspace as KVM_EXIT_MMIO exits. - -When the KVM_CAP_SYNC_MMU capability is available, changes in the backing of -the memory region are automatically reflected into the guest. For example, an -mmap() that affects the region will be made visible immediately. Another -example is madvise(MADV_DROP). - -It is recommended to use this API instead of the KVM_SET_MEMORY_REGION ioctl. -The KVM_SET_MEMORY_REGION does not allow fine grained control over memory -allocation and is deprecated. - - -4.36 KVM_SET_TSS_ADDR - -Capability: KVM_CAP_SET_TSS_ADDR -Architectures: x86 -Type: vm ioctl -Parameters: unsigned long tss_address (in) -Returns: 0 on success, -1 on error - -This ioctl defines the physical address of a three-page region in the guest -physical address space. The region must be within the first 4GB of the -guest physical address space and must not conflict with any memory slot -or any mmio address. 
The guest may malfunction if it accesses this memory -region. - -This ioctl is required on Intel-based hosts. This is needed on Intel hardware -because of a quirk in the virtualization implementation (see the internals -documentation when it pops into existence). - - -4.37 KVM_ENABLE_CAP - -Capability: KVM_CAP_ENABLE_CAP -Architectures: mips, ppc, s390 -Type: vcpu ioctl -Parameters: struct kvm_enable_cap (in) -Returns: 0 on success; -1 on error - -Capability: KVM_CAP_ENABLE_CAP_VM -Architectures: all -Type: vm ioctl -Parameters: struct kvm_enable_cap (in) -Returns: 0 on success; -1 on error - -Not all extensions are enabled by default. Using this ioctl the application -can enable an extension, making it available to the guest. - -On systems that do not support this ioctl, it always fails. On systems that -do support it, it only works for extensions that are supported for enablement. - -To check if a capability can be enabled, the KVM_CHECK_EXTENSION ioctl should -be used. - -struct kvm_enable_cap { - /* in */ - __u32 cap; - -The capability that is supposed to get enabled. - - __u32 flags; - -A bitfield indicating future enhancements. Has to be 0 for now. - - __u64 args[4]; - -Arguments for enabling a feature. If a feature needs initial values to -function properly, this is the place to put them. - - __u8 pad[64]; -}; - -The vcpu ioctl should be used for vcpu-specific capabilities, the vm ioctl -for vm-wide capabilities. - -4.38 KVM_GET_MP_STATE - -Capability: KVM_CAP_MP_STATE -Architectures: x86, s390, arm, arm64 -Type: vcpu ioctl -Parameters: struct kvm_mp_state (out) -Returns: 0 on success; -1 on error - -struct kvm_mp_state { - __u32 mp_state; -}; - -Returns the vcpu's current "multiprocessing state" (though also valid on -uniprocessor guests).
- -Possible values are: - - - KVM_MP_STATE_RUNNABLE: the vcpu is currently running [x86,arm/arm64] - - KVM_MP_STATE_UNINITIALIZED: the vcpu is an application processor (AP) - which has not yet received an INIT signal [x86] - - KVM_MP_STATE_INIT_RECEIVED: the vcpu has received an INIT signal, and is - now ready for a SIPI [x86] - - KVM_MP_STATE_HALTED: the vcpu has executed a HLT instruction and - is waiting for an interrupt [x86] - - KVM_MP_STATE_SIPI_RECEIVED: the vcpu has just received a SIPI (vector - accessible via KVM_GET_VCPU_EVENTS) [x86] - - KVM_MP_STATE_STOPPED: the vcpu is stopped [s390,arm/arm64] - - KVM_MP_STATE_CHECK_STOP: the vcpu is in a special error state [s390] - - KVM_MP_STATE_OPERATING: the vcpu is operating (running or halted) - [s390] - - KVM_MP_STATE_LOAD: the vcpu is in a special load/startup state - [s390] - -On x86, this ioctl is only useful after KVM_CREATE_IRQCHIP. Without an -in-kernel irqchip, the multiprocessing state must be maintained by userspace on -these architectures. - -For arm/arm64: - -The only states that are valid are KVM_MP_STATE_STOPPED and -KVM_MP_STATE_RUNNABLE which reflect if the vcpu is paused or not. - -4.39 KVM_SET_MP_STATE - -Capability: KVM_CAP_MP_STATE -Architectures: x86, s390, arm, arm64 -Type: vcpu ioctl -Parameters: struct kvm_mp_state (in) -Returns: 0 on success; -1 on error - -Sets the vcpu's current "multiprocessing state"; see KVM_GET_MP_STATE for -arguments. - -On x86, this ioctl is only useful after KVM_CREATE_IRQCHIP. Without an -in-kernel irqchip, the multiprocessing state must be maintained by userspace on -these architectures. - -For arm/arm64: - -The only states that are valid are KVM_MP_STATE_STOPPED and -KVM_MP_STATE_RUNNABLE which reflect if the vcpu should be paused or not. 
- -4.40 KVM_SET_IDENTITY_MAP_ADDR - -Capability: KVM_CAP_SET_IDENTITY_MAP_ADDR -Architectures: x86 -Type: vm ioctl -Parameters: unsigned long identity (in) -Returns: 0 on success, -1 on error - -This ioctl defines the physical address of a one-page region in the guest -physical address space. The region must be within the first 4GB of the -guest physical address space and must not conflict with any memory slot -or any mmio address. The guest may malfunction if it accesses this memory -region. - -Setting the address to 0 will result in resetting the address to its default -(0xfffbc000). - -This ioctl is required on Intel-based hosts. This is needed on Intel hardware -because of a quirk in the virtualization implementation (see the internals -documentation when it pops into existence). - -Fails if any VCPU has already been created. - -4.41 KVM_SET_BOOT_CPU_ID - -Capability: KVM_CAP_SET_BOOT_CPU_ID -Architectures: x86 -Type: vm ioctl -Parameters: unsigned long vcpu_id -Returns: 0 on success, -1 on error - -Define which vcpu is the Bootstrap Processor (BSP). Values are the same -as the vcpu id in KVM_CREATE_VCPU. If this ioctl is not called, the default -is vcpu 0. - - -4.42 KVM_GET_XSAVE - -Capability: KVM_CAP_XSAVE -Architectures: x86 -Type: vcpu ioctl -Parameters: struct kvm_xsave (out) -Returns: 0 on success, -1 on error - -struct kvm_xsave { - __u32 region[1024]; -}; - -This ioctl would copy current vcpu's xsave struct to the userspace. - - -4.43 KVM_SET_XSAVE - -Capability: KVM_CAP_XSAVE -Architectures: x86 -Type: vcpu ioctl -Parameters: struct kvm_xsave (in) -Returns: 0 on success, -1 on error - -struct kvm_xsave { - __u32 region[1024]; -}; - -This ioctl would copy userspace's xsave struct to the kernel. 
- - -4.44 KVM_GET_XCRS - -Capability: KVM_CAP_XCRS -Architectures: x86 -Type: vcpu ioctl -Parameters: struct kvm_xcrs (out) -Returns: 0 on success, -1 on error - -struct kvm_xcr { - __u32 xcr; - __u32 reserved; - __u64 value; -}; - -struct kvm_xcrs { - __u32 nr_xcrs; - __u32 flags; - struct kvm_xcr xcrs[KVM_MAX_XCRS]; - __u64 padding[16]; -}; - -This ioctl would copy current vcpu's xcrs to the userspace. - - -4.45 KVM_SET_XCRS - -Capability: KVM_CAP_XCRS -Architectures: x86 -Type: vcpu ioctl -Parameters: struct kvm_xcrs (in) -Returns: 0 on success, -1 on error - -struct kvm_xcr { - __u32 xcr; - __u32 reserved; - __u64 value; -}; - -struct kvm_xcrs { - __u32 nr_xcrs; - __u32 flags; - struct kvm_xcr xcrs[KVM_MAX_XCRS]; - __u64 padding[16]; -}; - -This ioctl would set vcpu's xcr to the value userspace specified. - - -4.46 KVM_GET_SUPPORTED_CPUID - -Capability: KVM_CAP_EXT_CPUID -Architectures: x86 -Type: system ioctl -Parameters: struct kvm_cpuid2 (in/out) -Returns: 0 on success, -1 on error - -struct kvm_cpuid2 { - __u32 nent; - __u32 padding; - struct kvm_cpuid_entry2 entries[0]; -}; - -#define KVM_CPUID_FLAG_SIGNIFCANT_INDEX BIT(0) -#define KVM_CPUID_FLAG_STATEFUL_FUNC BIT(1) -#define KVM_CPUID_FLAG_STATE_READ_NEXT BIT(2) - -struct kvm_cpuid_entry2 { - __u32 function; - __u32 index; - __u32 flags; - __u32 eax; - __u32 ebx; - __u32 ecx; - __u32 edx; - __u32 padding[3]; -}; - -This ioctl returns x86 cpuid features which are supported by both the -hardware and kvm in its default configuration. Userspace can use the -information returned by this ioctl to construct cpuid information (for -KVM_SET_CPUID2) that is consistent with hardware, kernel, and -userspace capabilities, and with user requirements (for example, the -user may wish to constrain cpuid to emulate older hardware, or for -feature consistency across a cluster). - -Note that certain capabilities, such as KVM_CAP_X86_DISABLE_EXITS, may -expose cpuid features (e.g. 
MONITOR) which are not supported by kvm in -its default configuration. If userspace enables such capabilities, it -is responsible for modifying the results of this ioctl appropriately. - -Userspace invokes KVM_GET_SUPPORTED_CPUID by passing a kvm_cpuid2 structure -with the 'nent' field indicating the number of entries in the variable-size -array 'entries'. If the number of entries is too low to describe the cpu -capabilities, an error (E2BIG) is returned. If the number is too high, -the 'nent' field is adjusted and an error (ENOMEM) is returned. If the -number is just right, the 'nent' field is adjusted to the number of valid -entries in the 'entries' array, which is then filled. - -The entries returned are the host cpuid as returned by the cpuid instruction, -with unknown or unsupported features masked out. Some features (for example, -x2apic), may not be present in the host cpu, but are exposed by kvm if it can -emulate them efficiently. The fields in each entry are defined as follows: - - function: the eax value used to obtain the entry - index: the ecx value used to obtain the entry (for entries that are - affected by ecx) - flags: an OR of zero or more of the following: - KVM_CPUID_FLAG_SIGNIFCANT_INDEX: - if the index field is valid - KVM_CPUID_FLAG_STATEFUL_FUNC: - if cpuid for this function returns different values for successive - invocations; there will be several entries with the same function, - all with this flag set - KVM_CPUID_FLAG_STATE_READ_NEXT: - for KVM_CPUID_FLAG_STATEFUL_FUNC entries, set if this entry is - the first entry to be read by a cpu - eax, ebx, ecx, edx: the values returned by the cpuid instruction for - this function/index combination - -The TSC deadline timer feature (CPUID leaf 1, ecx[24]) is always returned -as false, since the feature depends on KVM_CREATE_IRQCHIP for local APIC -support. 
Instead it is reported via - - ioctl(KVM_CHECK_EXTENSION, KVM_CAP_TSC_DEADLINE_TIMER) - -if that returns true and you use KVM_CREATE_IRQCHIP, or if you emulate the -feature in userspace, then you can enable the feature for KVM_SET_CPUID2. - - -4.47 KVM_PPC_GET_PVINFO - -Capability: KVM_CAP_PPC_GET_PVINFO -Architectures: ppc -Type: vm ioctl -Parameters: struct kvm_ppc_pvinfo (out) -Returns: 0 on success, !0 on error - -struct kvm_ppc_pvinfo { - __u32 flags; - __u32 hcall[4]; - __u8 pad[108]; -}; - -This ioctl fetches PV specific information that needs to be passed to the guest -using the device tree or other means from vm context. - -The hcall array defines 4 instructions that make up a hypercall. - -If any additional field gets added to this structure later on, a bit for that -additional piece of information will be set in the flags bitmap. - -The flags bitmap is defined as: - - /* the host supports the ePAPR idle hcall */ - #define KVM_PPC_PVINFO_FLAGS_EV_IDLE (1<<0) - -4.52 KVM_SET_GSI_ROUTING - -Capability: KVM_CAP_IRQ_ROUTING -Architectures: x86 s390 arm arm64 -Type: vm ioctl -Parameters: struct kvm_irq_routing (in) -Returns: 0 on success, -1 on error - -Sets the GSI routing table entries, overwriting any previously set entries. - -On arm/arm64, GSI routing has the following limitation: -- GSI routing does not apply to KVM_IRQ_LINE but only to KVM_IRQFD. - -struct kvm_irq_routing { - __u32 nr; - __u32 flags; - struct kvm_irq_routing_entry entries[0]; -}; - -No flags are specified so far, the corresponding field must be set to zero.
- -struct kvm_irq_routing_entry { - __u32 gsi; - __u32 type; - __u32 flags; - __u32 pad; - union { - struct kvm_irq_routing_irqchip irqchip; - struct kvm_irq_routing_msi msi; - struct kvm_irq_routing_s390_adapter adapter; - struct kvm_irq_routing_hv_sint hv_sint; - __u32 pad[8]; - } u; -}; - -/* gsi routing entry types */ -#define KVM_IRQ_ROUTING_IRQCHIP 1 -#define KVM_IRQ_ROUTING_MSI 2 -#define KVM_IRQ_ROUTING_S390_ADAPTER 3 -#define KVM_IRQ_ROUTING_HV_SINT 4 - -flags: -- KVM_MSI_VALID_DEVID: used along with KVM_IRQ_ROUTING_MSI routing entry - type, specifies that the devid field contains a valid value. The per-VM - KVM_CAP_MSI_DEVID capability advertises the requirement to provide - the device ID. If this capability is not available, userspace should - never set the KVM_MSI_VALID_DEVID flag as the ioctl might fail. -- zero otherwise - -struct kvm_irq_routing_irqchip { - __u32 irqchip; - __u32 pin; -}; - -struct kvm_irq_routing_msi { - __u32 address_lo; - __u32 address_hi; - __u32 data; - union { - __u32 pad; - __u32 devid; - }; -}; - -If KVM_MSI_VALID_DEVID is set, devid contains a unique device identifier -for the device that wrote the MSI message. For PCI, this is usually a -BDF identifier in the lower 16 bits. - -On x86, address_hi is ignored unless the KVM_X2APIC_API_USE_32BIT_IDS -feature of KVM_CAP_X2APIC_API capability is enabled. If it is enabled, -address_hi bits 31-8 provide bits 31-8 of the destination id. Bits 7-0 of -address_hi must be zero. - -struct kvm_irq_routing_s390_adapter { - __u64 ind_addr; - __u64 summary_addr; - __u64 ind_offset; - __u32 summary_offset; - __u32 adapter_id; -}; - -struct kvm_irq_routing_hv_sint { - __u32 vcpu; - __u32 sint; -}; - - -4.55 KVM_SET_TSC_KHZ - -Capability: KVM_CAP_TSC_CONTROL -Architectures: x86 -Type: vcpu ioctl -Parameters: virtual tsc_khz -Returns: 0 on success, -1 on error - -Specifies the tsc frequency for the virtual machine. The unit of the -frequency is KHz.
- - -4.56 KVM_GET_TSC_KHZ - -Capability: KVM_CAP_GET_TSC_KHZ -Architectures: x86 -Type: vcpu ioctl -Parameters: none -Returns: virtual tsc-khz on success, negative value on error - -Returns the tsc frequency of the guest. The unit of the return value is -KHz. If the host has unstable tsc this ioctl returns -EIO instead as an -error. - - -4.57 KVM_GET_LAPIC - -Capability: KVM_CAP_IRQCHIP -Architectures: x86 -Type: vcpu ioctl -Parameters: struct kvm_lapic_state (out) -Returns: 0 on success, -1 on error - -#define KVM_APIC_REG_SIZE 0x400 -struct kvm_lapic_state { - char regs[KVM_APIC_REG_SIZE]; -}; - -Reads the Local APIC registers and copies them into the input argument. The -data format and layout are the same as documented in the architecture manual. - -If KVM_X2APIC_API_USE_32BIT_IDS feature of KVM_CAP_X2APIC_API is -enabled, then the format of APIC_ID register depends on the APIC mode -(reported by MSR_IA32_APICBASE) of its VCPU. x2APIC stores APIC ID in -the APIC_ID register (bytes 32-35). xAPIC only allows an 8-bit APIC ID -which is stored in bits 31-24 of the APIC_ID register, or equivalently in -byte 35 of struct kvm_lapic_state's regs field. KVM_GET_LAPIC must then -be called after MSR_IA32_APICBASE has been set with KVM_SET_MSRS. - -If KVM_X2APIC_API_USE_32BIT_IDS feature is disabled, struct kvm_lapic_state -always uses xAPIC format. - - -4.58 KVM_SET_LAPIC - -Capability: KVM_CAP_IRQCHIP -Architectures: x86 -Type: vcpu ioctl -Parameters: struct kvm_lapic_state (in) -Returns: 0 on success, -1 on error - -#define KVM_APIC_REG_SIZE 0x400 -struct kvm_lapic_state { - char regs[KVM_APIC_REG_SIZE]; -}; - -Copies the input argument into the Local APIC registers. The data format -and layout are the same as documented in the architecture manual. - -The format of the APIC ID register (bytes 32-35 of struct kvm_lapic_state's -regs field) depends on the state of the KVM_CAP_X2APIC_API capability. -See the note in KVM_GET_LAPIC.
- - -4.59 KVM_IOEVENTFD - -Capability: KVM_CAP_IOEVENTFD -Architectures: all -Type: vm ioctl -Parameters: struct kvm_ioeventfd (in) -Returns: 0 on success, !0 on error - -This ioctl attaches or detaches an ioeventfd to a legal pio/mmio address -within the guest. A guest write in the registered address will signal the -provided event instead of triggering an exit. - -struct kvm_ioeventfd { - __u64 datamatch; - __u64 addr; /* legal pio/mmio address */ - __u32 len; /* 0, 1, 2, 4, or 8 bytes */ - __s32 fd; - __u32 flags; - __u8 pad[36]; -}; - -For the special case of virtio-ccw devices on s390, the ioevent is matched -to a subchannel/virtqueue tuple instead. - -The following flags are defined: - -#define KVM_IOEVENTFD_FLAG_DATAMATCH (1 << kvm_ioeventfd_flag_nr_datamatch) -#define KVM_IOEVENTFD_FLAG_PIO (1 << kvm_ioeventfd_flag_nr_pio) -#define KVM_IOEVENTFD_FLAG_DEASSIGN (1 << kvm_ioeventfd_flag_nr_deassign) -#define KVM_IOEVENTFD_FLAG_VIRTIO_CCW_NOTIFY \ - (1 << kvm_ioeventfd_flag_nr_virtio_ccw_notify) - -If datamatch flag is set, the event will be signaled only if the written value -to the registered address is equal to datamatch in struct kvm_ioeventfd. - -For virtio-ccw devices, addr contains the subchannel id and datamatch the -virtqueue index. - -With KVM_CAP_IOEVENTFD_ANY_LENGTH, a zero length ioeventfd is allowed, and -the kernel will ignore the length of guest write and may get a faster vmexit. -The speedup may only apply to specific architectures, but the ioeventfd will -work anyway. - -4.60 KVM_DIRTY_TLB - -Capability: KVM_CAP_SW_TLB -Architectures: ppc -Type: vcpu ioctl -Parameters: struct kvm_dirty_tlb (in) -Returns: 0 on success, -1 on error - -struct kvm_dirty_tlb { - __u64 bitmap; - __u32 num_dirty; -}; - -This must be called whenever userspace has changed an entry in the shared -TLB, prior to calling KVM_RUN on the associated vcpu. - -The "bitmap" field is the userspace address of an array. 
This array -consists of a number of bits, equal to the total number of TLB entries as -determined by the last successful call to KVM_CONFIG_TLB, rounded up to the -nearest multiple of 64. - -Each bit corresponds to one TLB entry, ordered the same as in the shared TLB -array. - -The array is little-endian: the bit 0 is the least significant bit of the -first byte, bit 8 is the least significant bit of the second byte, etc. -This avoids any complications with differing word sizes. - -The "num_dirty" field is a performance hint for KVM to determine whether it -should skip processing the bitmap and just invalidate everything. It must -be set to the number of set bits in the bitmap. - - -4.62 KVM_CREATE_SPAPR_TCE - -Capability: KVM_CAP_SPAPR_TCE -Architectures: powerpc -Type: vm ioctl -Parameters: struct kvm_create_spapr_tce (in) -Returns: file descriptor for manipulating the created TCE table - -This creates a virtual TCE (translation control entry) table, which -is an IOMMU for PAPR-style virtual I/O. It is used to translate -logical addresses used in virtual I/O into guest physical addresses, -and provides a scatter/gather capability for PAPR virtual I/O. - -/* for KVM_CAP_SPAPR_TCE */ -struct kvm_create_spapr_tce { - __u64 liobn; - __u32 window_size; -}; - -The liobn field gives the logical IO bus number for which to create a -TCE table. The window_size field specifies the size of the DMA window -which this TCE table will translate - the table will contain one 64 -bit TCE entry for every 4kiB of the DMA window. - -When the guest issues an H_PUT_TCE hcall on a liobn for which a TCE -table has been created using this ioctl(), the kernel will handle it -in real mode, updating the TCE table. H_PUT_TCE calls for other -liobns will cause a vm exit and must be handled by userspace. - -The return value is a file descriptor which can be passed to mmap(2) -to map the created TCE table into userspace. 
This lets userspace read -the entries written by kernel-handled H_PUT_TCE calls, and also lets -userspace update the TCE table directly which is useful in some -circumstances. - - -4.63 KVM_ALLOCATE_RMA - -Capability: KVM_CAP_PPC_RMA -Architectures: powerpc -Type: vm ioctl -Parameters: struct kvm_allocate_rma (out) -Returns: file descriptor for mapping the allocated RMA - -This allocates a Real Mode Area (RMA) from the pool allocated at boot -time by the kernel. An RMA is a physically-contiguous, aligned region -of memory used on older POWER processors to provide the memory which -will be accessed by real-mode (MMU off) accesses in a KVM guest. -POWER processors support a set of sizes for the RMA that usually -includes 64MB, 128MB, 256MB and some larger powers of two. - -/* for KVM_ALLOCATE_RMA */ -struct kvm_allocate_rma { - __u64 rma_size; -}; - -The return value is a file descriptor which can be passed to mmap(2) -to map the allocated RMA into userspace. The mapped area can then be -passed to the KVM_SET_USER_MEMORY_REGION ioctl to establish it as the -RMA for a virtual machine. The size of the RMA in bytes (which is -fixed at host kernel boot time) is returned in the rma_size field of -the argument structure. - -The KVM_CAP_PPC_RMA capability is 1 or 2 if the KVM_ALLOCATE_RMA ioctl -is supported; 2 if the processor requires all virtual machines to have -an RMA, or 1 if the processor can use an RMA but doesn't require it, -because it supports the Virtual RMA (VRMA) facility. - - -4.64 KVM_NMI - -Capability: KVM_CAP_USER_NMI -Architectures: x86 -Type: vcpu ioctl -Parameters: none -Returns: 0 on success, -1 on error - -Queues an NMI on the thread's vcpu. Note this is well defined only -when KVM_CREATE_IRQCHIP has not been called, since this is an interface -between the virtual cpu core and virtual local APIC. After KVM_CREATE_IRQCHIP -has been called, this interface is completely emulated within the kernel. 
- -To use this to emulate the LINT1 input with KVM_CREATE_IRQCHIP, use the -following algorithm: - - - pause the vcpu - - read the local APIC's state (KVM_GET_LAPIC) - - check whether changing LINT1 will queue an NMI (see the LVT entry for LINT1) - - if so, issue KVM_NMI - - resume the vcpu - -Some guests configure the LINT1 NMI input to cause a panic, aiding in -debugging. - - -4.65 KVM_S390_UCAS_MAP - -Capability: KVM_CAP_S390_UCONTROL -Architectures: s390 -Type: vcpu ioctl -Parameters: struct kvm_s390_ucas_mapping (in) -Returns: 0 in case of success - -The parameter is defined like this: - struct kvm_s390_ucas_mapping { - __u64 user_addr; - __u64 vcpu_addr; - __u64 length; - }; - -This ioctl maps the memory at "user_addr" with the length "length" to -the vcpu's address space starting at "vcpu_addr". All parameters need to -be aligned by 1 megabyte. - - -4.66 KVM_S390_UCAS_UNMAP - -Capability: KVM_CAP_S390_UCONTROL -Architectures: s390 -Type: vcpu ioctl -Parameters: struct kvm_s390_ucas_mapping (in) -Returns: 0 in case of success - -The parameter is defined like this: - struct kvm_s390_ucas_mapping { - __u64 user_addr; - __u64 vcpu_addr; - __u64 length; - }; - -This ioctl unmaps the memory in the vcpu's address space starting at -"vcpu_addr" with the length "length". The field "user_addr" is ignored. -All parameters need to be aligned by 1 megabyte. - - -4.67 KVM_S390_VCPU_FAULT - -Capability: KVM_CAP_S390_UCONTROL -Architectures: s390 -Type: vcpu ioctl -Parameters: vcpu absolute address (in) -Returns: 0 in case of success - -This call creates a page table entry on the virtual cpu's address space -(for user controlled virtual machines) or the virtual machine's address -space (for regular virtual machines). This only works for minor faults, -thus it's recommended to access subject memory page via the user page -table upfront. 
This is useful to handle validity intercepts for user -controlled virtual machines to fault in the virtual cpu's lowcore pages -prior to calling the KVM_RUN ioctl. - - -4.68 KVM_SET_ONE_REG - -Capability: KVM_CAP_ONE_REG -Architectures: all -Type: vcpu ioctl -Parameters: struct kvm_one_reg (in) -Returns: 0 on success, negative value on failure -Errors: -  ENOENT:   no such register -  EINVAL:   invalid register ID, or no such register -  EPERM:    (arm64) register access not allowed before vcpu finalization -(These error codes are indicative only: do not rely on a specific error -code being returned in a specific situation.) - -struct kvm_one_reg { - __u64 id; - __u64 addr; -}; - -Using this ioctl, a single vcpu register can be set to a specific value -defined by user space with the passed in struct kvm_one_reg, where id -refers to the register identifier as described below and addr is a pointer -to a variable with the respective size. There can be architecture agnostic -and architecture specific registers. Each have their own range of operation -and their own constants and width. 
To keep track of the implemented -registers, find a list below: - - Arch | Register | Width (bits) - | | - PPC | KVM_REG_PPC_HIOR | 64 - PPC | KVM_REG_PPC_IAC1 | 64 - PPC | KVM_REG_PPC_IAC2 | 64 - PPC | KVM_REG_PPC_IAC3 | 64 - PPC | KVM_REG_PPC_IAC4 | 64 - PPC | KVM_REG_PPC_DAC1 | 64 - PPC | KVM_REG_PPC_DAC2 | 64 - PPC | KVM_REG_PPC_DABR | 64 - PPC | KVM_REG_PPC_DSCR | 64 - PPC | KVM_REG_PPC_PURR | 64 - PPC | KVM_REG_PPC_SPURR | 64 - PPC | KVM_REG_PPC_DAR | 64 - PPC | KVM_REG_PPC_DSISR | 32 - PPC | KVM_REG_PPC_AMR | 64 - PPC | KVM_REG_PPC_UAMOR | 64 - PPC | KVM_REG_PPC_MMCR0 | 64 - PPC | KVM_REG_PPC_MMCR1 | 64 - PPC | KVM_REG_PPC_MMCRA | 64 - PPC | KVM_REG_PPC_MMCR2 | 64 - PPC | KVM_REG_PPC_MMCRS | 64 - PPC | KVM_REG_PPC_SIAR | 64 - PPC | KVM_REG_PPC_SDAR | 64 - PPC | KVM_REG_PPC_SIER | 64 - PPC | KVM_REG_PPC_PMC1 | 32 - PPC | KVM_REG_PPC_PMC2 | 32 - PPC | KVM_REG_PPC_PMC3 | 32 - PPC | KVM_REG_PPC_PMC4 | 32 - PPC | KVM_REG_PPC_PMC5 | 32 - PPC | KVM_REG_PPC_PMC6 | 32 - PPC | KVM_REG_PPC_PMC7 | 32 - PPC | KVM_REG_PPC_PMC8 | 32 - PPC | KVM_REG_PPC_FPR0 | 64 - ... - PPC | KVM_REG_PPC_FPR31 | 64 - PPC | KVM_REG_PPC_VR0 | 128 - ... - PPC | KVM_REG_PPC_VR31 | 128 - PPC | KVM_REG_PPC_VSR0 | 128 - ... 
- PPC | KVM_REG_PPC_VSR31 | 128 - PPC | KVM_REG_PPC_FPSCR | 64 - PPC | KVM_REG_PPC_VSCR | 32 - PPC | KVM_REG_PPC_VPA_ADDR | 64 - PPC | KVM_REG_PPC_VPA_SLB | 128 - PPC | KVM_REG_PPC_VPA_DTL | 128 - PPC | KVM_REG_PPC_EPCR | 32 - PPC | KVM_REG_PPC_EPR | 32 - PPC | KVM_REG_PPC_TCR | 32 - PPC | KVM_REG_PPC_TSR | 32 - PPC | KVM_REG_PPC_OR_TSR | 32 - PPC | KVM_REG_PPC_CLEAR_TSR | 32 - PPC | KVM_REG_PPC_MAS0 | 32 - PPC | KVM_REG_PPC_MAS1 | 32 - PPC | KVM_REG_PPC_MAS2 | 64 - PPC | KVM_REG_PPC_MAS7_3 | 64 - PPC | KVM_REG_PPC_MAS4 | 32 - PPC | KVM_REG_PPC_MAS6 | 32 - PPC | KVM_REG_PPC_MMUCFG | 32 - PPC | KVM_REG_PPC_TLB0CFG | 32 - PPC | KVM_REG_PPC_TLB1CFG | 32 - PPC | KVM_REG_PPC_TLB2CFG | 32 - PPC | KVM_REG_PPC_TLB3CFG | 32 - PPC | KVM_REG_PPC_TLB0PS | 32 - PPC | KVM_REG_PPC_TLB1PS | 32 - PPC | KVM_REG_PPC_TLB2PS | 32 - PPC | KVM_REG_PPC_TLB3PS | 32 - PPC | KVM_REG_PPC_EPTCFG | 32 - PPC | KVM_REG_PPC_ICP_STATE | 64 - PPC | KVM_REG_PPC_VP_STATE | 128 - PPC | KVM_REG_PPC_TB_OFFSET | 64 - PPC | KVM_REG_PPC_SPMC1 | 32 - PPC | KVM_REG_PPC_SPMC2 | 32 - PPC | KVM_REG_PPC_IAMR | 64 - PPC | KVM_REG_PPC_TFHAR | 64 - PPC | KVM_REG_PPC_TFIAR | 64 - PPC | KVM_REG_PPC_TEXASR | 64 - PPC | KVM_REG_PPC_FSCR | 64 - PPC | KVM_REG_PPC_PSPB | 32 - PPC | KVM_REG_PPC_EBBHR | 64 - PPC | KVM_REG_PPC_EBBRR | 64 - PPC | KVM_REG_PPC_BESCR | 64 - PPC | KVM_REG_PPC_TAR | 64 - PPC | KVM_REG_PPC_DPDES | 64 - PPC | KVM_REG_PPC_DAWR | 64 - PPC | KVM_REG_PPC_DAWRX | 64 - PPC | KVM_REG_PPC_CIABR | 64 - PPC | KVM_REG_PPC_IC | 64 - PPC | KVM_REG_PPC_VTB | 64 - PPC | KVM_REG_PPC_CSIGR | 64 - PPC | KVM_REG_PPC_TACR | 64 - PPC | KVM_REG_PPC_TCSCR | 64 - PPC | KVM_REG_PPC_PID | 64 - PPC | KVM_REG_PPC_ACOP | 64 - PPC | KVM_REG_PPC_VRSAVE | 32 - PPC | KVM_REG_PPC_LPCR | 32 - PPC | KVM_REG_PPC_LPCR_64 | 64 - PPC | KVM_REG_PPC_PPR | 64 - PPC | KVM_REG_PPC_ARCH_COMPAT | 32 - PPC | KVM_REG_PPC_DABRX | 32 - PPC | KVM_REG_PPC_WORT | 64 - PPC | KVM_REG_PPC_SPRG9 | 64 - PPC | KVM_REG_PPC_DBSR | 32 - PPC | KVM_REG_PPC_TIDR | 
64 - PPC | KVM_REG_PPC_PSSCR | 64 - PPC | KVM_REG_PPC_DEC_EXPIRY | 64 - PPC | KVM_REG_PPC_PTCR | 64 - PPC | KVM_REG_PPC_TM_GPR0 | 64 - ... - PPC | KVM_REG_PPC_TM_GPR31 | 64 - PPC | KVM_REG_PPC_TM_VSR0 | 128 - ... - PPC | KVM_REG_PPC_TM_VSR63 | 128 - PPC | KVM_REG_PPC_TM_CR | 64 - PPC | KVM_REG_PPC_TM_LR | 64 - PPC | KVM_REG_PPC_TM_CTR | 64 - PPC | KVM_REG_PPC_TM_FPSCR | 64 - PPC | KVM_REG_PPC_TM_AMR | 64 - PPC | KVM_REG_PPC_TM_PPR | 64 - PPC | KVM_REG_PPC_TM_VRSAVE | 64 - PPC | KVM_REG_PPC_TM_VSCR | 32 - PPC | KVM_REG_PPC_TM_DSCR | 64 - PPC | KVM_REG_PPC_TM_TAR | 64 - PPC | KVM_REG_PPC_TM_XER | 64 - | | - MIPS | KVM_REG_MIPS_R0 | 64 - ... - MIPS | KVM_REG_MIPS_R31 | 64 - MIPS | KVM_REG_MIPS_HI | 64 - MIPS | KVM_REG_MIPS_LO | 64 - MIPS | KVM_REG_MIPS_PC | 64 - MIPS | KVM_REG_MIPS_CP0_INDEX | 32 - MIPS | KVM_REG_MIPS_CP0_ENTRYLO0 | 64 - MIPS | KVM_REG_MIPS_CP0_ENTRYLO1 | 64 - MIPS | KVM_REG_MIPS_CP0_CONTEXT | 64 - MIPS | KVM_REG_MIPS_CP0_CONTEXTCONFIG| 32 - MIPS | KVM_REG_MIPS_CP0_USERLOCAL | 64 - MIPS | KVM_REG_MIPS_CP0_XCONTEXTCONFIG| 64 - MIPS | KVM_REG_MIPS_CP0_PAGEMASK | 32 - MIPS | KVM_REG_MIPS_CP0_PAGEGRAIN | 32 - MIPS | KVM_REG_MIPS_CP0_SEGCTL0 | 64 - MIPS | KVM_REG_MIPS_CP0_SEGCTL1 | 64 - MIPS | KVM_REG_MIPS_CP0_SEGCTL2 | 64 - MIPS | KVM_REG_MIPS_CP0_PWBASE | 64 - MIPS | KVM_REG_MIPS_CP0_PWFIELD | 64 - MIPS | KVM_REG_MIPS_CP0_PWSIZE | 64 - MIPS | KVM_REG_MIPS_CP0_WIRED | 32 - MIPS | KVM_REG_MIPS_CP0_PWCTL | 32 - MIPS | KVM_REG_MIPS_CP0_HWRENA | 32 - MIPS | KVM_REG_MIPS_CP0_BADVADDR | 64 - MIPS | KVM_REG_MIPS_CP0_BADINSTR | 32 - MIPS | KVM_REG_MIPS_CP0_BADINSTRP | 32 - MIPS | KVM_REG_MIPS_CP0_COUNT | 32 - MIPS | KVM_REG_MIPS_CP0_ENTRYHI | 64 - MIPS | KVM_REG_MIPS_CP0_COMPARE | 32 - MIPS | KVM_REG_MIPS_CP0_STATUS | 32 - MIPS | KVM_REG_MIPS_CP0_INTCTL | 32 - MIPS | KVM_REG_MIPS_CP0_CAUSE | 32 - MIPS | KVM_REG_MIPS_CP0_EPC | 64 - MIPS | KVM_REG_MIPS_CP0_PRID | 32 - MIPS | KVM_REG_MIPS_CP0_EBASE | 64 - MIPS | KVM_REG_MIPS_CP0_CONFIG | 32 - MIPS | 
KVM_REG_MIPS_CP0_CONFIG1 | 32 - MIPS | KVM_REG_MIPS_CP0_CONFIG2 | 32 - MIPS | KVM_REG_MIPS_CP0_CONFIG3 | 32 - MIPS | KVM_REG_MIPS_CP0_CONFIG4 | 32 - MIPS | KVM_REG_MIPS_CP0_CONFIG5 | 32 - MIPS | KVM_REG_MIPS_CP0_CONFIG7 | 32 - MIPS | KVM_REG_MIPS_CP0_XCONTEXT | 64 - MIPS | KVM_REG_MIPS_CP0_ERROREPC | 64 - MIPS | KVM_REG_MIPS_CP0_KSCRATCH1 | 64 - MIPS | KVM_REG_MIPS_CP0_KSCRATCH2 | 64 - MIPS | KVM_REG_MIPS_CP0_KSCRATCH3 | 64 - MIPS | KVM_REG_MIPS_CP0_KSCRATCH4 | 64 - MIPS | KVM_REG_MIPS_CP0_KSCRATCH5 | 64 - MIPS | KVM_REG_MIPS_CP0_KSCRATCH6 | 64 - MIPS | KVM_REG_MIPS_CP0_MAAR(0..63) | 64 - MIPS | KVM_REG_MIPS_COUNT_CTL | 64 - MIPS | KVM_REG_MIPS_COUNT_RESUME | 64 - MIPS | KVM_REG_MIPS_COUNT_HZ | 64 - MIPS | KVM_REG_MIPS_FPR_32(0..31) | 32 - MIPS | KVM_REG_MIPS_FPR_64(0..31) | 64 - MIPS | KVM_REG_MIPS_VEC_128(0..31) | 128 - MIPS | KVM_REG_MIPS_FCR_IR | 32 - MIPS | KVM_REG_MIPS_FCR_CSR | 32 - MIPS | KVM_REG_MIPS_MSA_IR | 32 - MIPS | KVM_REG_MIPS_MSA_CSR | 32 - -ARM registers are mapped using the lower 32 bits. The upper 16 of that -is the register group type, or coprocessor number: - -ARM core registers have the following id bit patterns: - 0x4020 0000 0010 - -ARM 32-bit CP15 registers have the following id bit patterns: - 0x4020 0000 000F - -ARM 64-bit CP15 registers have the following id bit patterns: - 0x4030 0000 000F - -ARM CCSIDR registers are demultiplexed by CSSELR value: - 0x4020 0000 0011 00 - -ARM 32-bit VFP control registers have the following id bit patterns: - 0x4020 0000 0012 1 - -ARM 64-bit FP registers have the following id bit patterns: - 0x4030 0000 0012 0 - -ARM firmware pseudo-registers have the following bit pattern: - 0x4030 0000 0014 - - -arm64 registers are mapped using the lower 32 bits. The upper 16 of -that is the register group type, or coprocessor number: - -arm64 core/FP-SIMD registers have the following id bit patterns. 
Note -that the size of the access is variable, as the kvm_regs structure -contains elements ranging from 32 to 128 bits. The index is a 32bit -value in the kvm_regs structure seen as a 32bit array. - 0x60x0 0000 0010 <index into the kvm_regs struct:16> - -Specifically: - Encoding Register Bits kvm_regs member ----------------------------------------------------------------- - 0x6030 0000 0010 0000 X0 64 regs.regs[0] - 0x6030 0000 0010 0002 X1 64 regs.regs[1] - ... - 0x6030 0000 0010 003c X30 64 regs.regs[30] - 0x6030 0000 0010 003e SP 64 regs.sp - 0x6030 0000 0010 0040 PC 64 regs.pc - 0x6030 0000 0010 0042 PSTATE 64 regs.pstate - 0x6030 0000 0010 0044 SP_EL1 64 sp_el1 - 0x6030 0000 0010 0046 ELR_EL1 64 elr_el1 - 0x6030 0000 0010 0048 SPSR_EL1 64 spsr[KVM_SPSR_EL1] (alias SPSR_SVC) - 0x6030 0000 0010 004a SPSR_ABT 64 spsr[KVM_SPSR_ABT] - 0x6030 0000 0010 004c SPSR_UND 64 spsr[KVM_SPSR_UND] - 0x6030 0000 0010 004e SPSR_IRQ 64 spsr[KVM_SPSR_IRQ] - 0x6030 0000 0010 0050 SPSR_FIQ 64 spsr[KVM_SPSR_FIQ] - 0x6040 0000 0010 0054 V0 128 fp_regs.vregs[0] (*) - 0x6040 0000 0010 0058 V1 128 fp_regs.vregs[1] (*) - ... - 0x6040 0000 0010 00d0 V31 128 fp_regs.vregs[31] (*) - 0x6020 0000 0010 00d4 FPSR 32 fp_regs.fpsr - 0x6020 0000 0010 00d5 FPCR 32 fp_regs.fpcr - -(*) These encodings are not accepted for SVE-enabled vcpus. See - KVM_ARM_VCPU_INIT. - - The equivalent register content can be accessed via bits [127:0] of - the corresponding SVE Zn registers instead for vcpus that have SVE - enabled (see below).
- -arm64 CCSIDR registers are demultiplexed by CSSELR value: - 0x6020 0000 0011 00 - -arm64 system registers have the following id bit patterns: - 0x6030 0000 0013 - -arm64 firmware pseudo-registers have the following bit pattern: - 0x6030 0000 0014 - -arm64 SVE registers have the following bit patterns: - 0x6080 0000 0015 00 Zn bits[2048*slice + 2047 : 2048*slice] - 0x6050 0000 0015 04 Pn bits[256*slice + 255 : 256*slice] - 0x6050 0000 0015 060 FFR bits[256*slice + 255 : 256*slice] - 0x6060 0000 0015 ffff KVM_REG_ARM64_SVE_VLS pseudo-register - -Access to register IDs where 2048 * slice >= 128 * max_vq will fail with -ENOENT. max_vq is the vcpu's maximum supported vector length in 128-bit -quadwords: see (**) below. - -These registers are only accessible on vcpus for which SVE is enabled. -See KVM_ARM_VCPU_INIT for details. - -In addition, except for KVM_REG_ARM64_SVE_VLS, these registers are not -accessible until the vcpu's SVE configuration has been finalized -using KVM_ARM_VCPU_FINALIZE(KVM_ARM_VCPU_SVE). See KVM_ARM_VCPU_INIT -and KVM_ARM_VCPU_FINALIZE for more information about this procedure. - -KVM_REG_ARM64_SVE_VLS is a pseudo-register that allows the set of vector -lengths supported by the vcpu to be discovered and configured by -userspace. When transferred to or from user memory via KVM_GET_ONE_REG -or KVM_SET_ONE_REG, the value of this register is of type -__u64[KVM_ARM64_SVE_VLS_WORDS], and encodes the set of vector lengths as -follows: - -__u64 vector_lengths[KVM_ARM64_SVE_VLS_WORDS]; - -if (vq >= SVE_VQ_MIN && vq <= SVE_VQ_MAX && - ((vector_lengths[(vq - KVM_ARM64_SVE_VQ_MIN) / 64] >> - ((vq - KVM_ARM64_SVE_VQ_MIN) % 64)) & 1)) - /* Vector length vq * 16 bytes supported */ -else - /* Vector length vq * 16 bytes not supported */ - -(**) The maximum value vq for which the above condition is true is -max_vq. 
This is the maximum vector length available to the guest on -this vcpu, and determines which register slices are visible through -this ioctl interface. - -(See Documentation/arm64/sve.rst for an explanation of the "vq" -nomenclature.) - -KVM_REG_ARM64_SVE_VLS is only accessible after KVM_ARM_VCPU_INIT. -KVM_ARM_VCPU_INIT initialises it to the best set of vector lengths that -the host supports. - -Userspace may subsequently modify it if desired until the vcpu's SVE -configuration is finalized using KVM_ARM_VCPU_FINALIZE(KVM_ARM_VCPU_SVE). - -Apart from simply removing all vector lengths from the host set that -exceed some value, support for arbitrarily chosen sets of vector lengths -is hardware-dependent and may not be available. Attempting to configure -an invalid set of vector lengths via KVM_SET_ONE_REG will fail with -EINVAL. - -After the vcpu's SVE configuration is finalized, further attempts to -write this register will fail with EPERM. - - -MIPS registers are mapped using the lower 32 bits. The upper 16 of that is -the register group type: - -MIPS core registers (see above) have the following id bit patterns: - 0x7030 0000 0000 - -MIPS CP0 registers (see KVM_REG_MIPS_CP0_* above) have the following id bit -patterns depending on whether they're 32-bit or 64-bit registers: - 0x7020 0000 0001 00 (32-bit) - 0x7030 0000 0001 00 (64-bit) - -Note: KVM_REG_MIPS_CP0_ENTRYLO0 and KVM_REG_MIPS_CP0_ENTRYLO1 are the MIPS64 -versions of the EntryLo registers regardless of the word size of the host -hardware, host kernel, guest, and whether XPA is present in the guest, i.e. -with the RI and XI bits (if they exist) in bits 63 and 62 respectively, and -the PFNX field starting at bit 30. 
- -MIPS MAARs (see KVM_REG_MIPS_CP0_MAAR(*) above) have the following id bit -patterns: - 0x7030 0000 0001 01 - -MIPS KVM control registers (see above) have the following id bit patterns: - 0x7030 0000 0002 - -MIPS FPU registers (see KVM_REG_MIPS_FPR_{32,64}() above) have the following -id bit patterns depending on the size of the register being accessed. They are -always accessed according to the current guest FPU mode (Status.FR and -Config5.FRE), i.e. as the guest would see them, and they become unpredictable -if the guest FPU mode is changed. MIPS SIMD Architecture (MSA) vector -registers (see KVM_REG_MIPS_VEC_128() above) have similar patterns as they -overlap the FPU registers: - 0x7020 0000 0003 00 <0:3> (32-bit FPU registers) - 0x7030 0000 0003 00 <0:3> (64-bit FPU registers) - 0x7040 0000 0003 00 <0:3> (128-bit MSA vector registers) - -MIPS FPU control registers (see KVM_REG_MIPS_FCR_{IR,CSR} above) have the -following id bit patterns: - 0x7020 0000 0003 01 <0:3> - -MIPS MSA control registers (see KVM_REG_MIPS_MSA_{IR,CSR} above) have the -following id bit patterns: - 0x7020 0000 0003 02 <0:3> - - -4.69 KVM_GET_ONE_REG - -Capability: KVM_CAP_ONE_REG -Architectures: all -Type: vcpu ioctl -Parameters: struct kvm_one_reg (in and out) -Returns: 0 on success, negative value on failure -Errors include: -  ENOENT:   no such register -  EINVAL:   invalid register ID, or no such register -  EPERM:    (arm64) register access not allowed before vcpu finalization -(These error codes are indicative only: do not rely on a specific error -code being returned in a specific situation.) - -This ioctl allows to receive the value of a single register implemented -in a vcpu. The register to read is indicated by the "id" field of the -kvm_one_reg struct passed in. On success, the register value can be found -at the memory location pointed to by "addr". - -The list of registers accessible using this interface is identical to the -list in 4.68. 
- - -4.70 KVM_KVMCLOCK_CTRL - -Capability: KVM_CAP_KVMCLOCK_CTRL -Architectures: Any that implement pvclocks (currently x86 only) -Type: vcpu ioctl -Parameters: None -Returns: 0 on success, -1 on error - -This signals to the host kernel that the specified guest is being paused by -userspace. The host will set a flag in the pvclock structure that is checked -from the soft lockup watchdog. The flag is part of the pvclock structure that -is shared between guest and host, specifically the second bit of the flags -field of the pvclock_vcpu_time_info structure. It will be set exclusively by -the host and read/cleared exclusively by the guest. The guest operation of -checking and clearing the flag must be an atomic operation, so -load-link/store-conditional or equivalent must be used. There are two cases -where the guest will clear the flag: when the soft lockup watchdog timer resets -itself or when a soft lockup is detected. This ioctl can be called any time -after pausing the vcpu, but before it is resumed. - - -4.71 KVM_SIGNAL_MSI - -Capability: KVM_CAP_SIGNAL_MSI -Architectures: x86 arm arm64 -Type: vm ioctl -Parameters: struct kvm_msi (in) -Returns: >0 on delivery, 0 if guest blocked the MSI, and -1 on error - -Directly inject an MSI message. Only valid with in-kernel irqchip that handles -MSI messages. - -struct kvm_msi { - __u32 address_lo; - __u32 address_hi; - __u32 data; - __u32 flags; - __u32 devid; - __u8 pad[12]; -}; - -flags: KVM_MSI_VALID_DEVID: devid contains a valid value. The per-VM - KVM_CAP_MSI_DEVID capability advertises the requirement to provide - the device ID. If this capability is not available, userspace - should never set the KVM_MSI_VALID_DEVID flag as the ioctl might fail. - -If KVM_MSI_VALID_DEVID is set, devid contains a unique device identifier -for the device that wrote the MSI message. For PCI, this is usually a -BDF identifier in the lower 16 bits.
- -On x86, address_hi is ignored unless the KVM_X2APIC_API_USE_32BIT_IDS -feature of KVM_CAP_X2APIC_API capability is enabled. If it is enabled, -address_hi bits 31-8 provide bits 31-8 of the destination id. Bits 7-0 of -address_hi must be zero. - - -4.71 KVM_CREATE_PIT2 - -Capability: KVM_CAP_PIT2 -Architectures: x86 -Type: vm ioctl -Parameters: struct kvm_pit_config (in) -Returns: 0 on success, -1 on error - -Creates an in-kernel device model for the i8254 PIT. This call is only valid -after enabling in-kernel irqchip support via KVM_CREATE_IRQCHIP. The following -parameters have to be passed: - -struct kvm_pit_config { - __u32 flags; - __u32 pad[15]; -}; - -Valid flags are: - -#define KVM_PIT_SPEAKER_DUMMY 1 /* emulate speaker port stub */ - -PIT timer interrupts may use a per-VM kernel thread for injection. If it -exists, this thread will have a name of the following pattern: - -kvm-pit/ - -When running a guest with elevated priorities, the scheduling parameters of -this thread may have to be adjusted accordingly. - -This IOCTL replaces the obsolete KVM_CREATE_PIT. - - -4.72 KVM_GET_PIT2 - -Capability: KVM_CAP_PIT_STATE2 -Architectures: x86 -Type: vm ioctl -Parameters: struct kvm_pit_state2 (out) -Returns: 0 on success, -1 on error - -Retrieves the state of the in-kernel PIT model. Only valid after -KVM_CREATE_PIT2. The state is returned in the following structure: - -struct kvm_pit_state2 { - struct kvm_pit_channel_state channels[3]; - __u32 flags; - __u32 reserved[9]; -}; - -Valid flags are: - -/* disable PIT in HPET legacy mode */ -#define KVM_PIT_FLAGS_HPET_LEGACY 0x00000001 - -This IOCTL replaces the obsolete KVM_GET_PIT. - - -4.73 KVM_SET_PIT2 - -Capability: KVM_CAP_PIT_STATE2 -Architectures: x86 -Type: vm ioctl -Parameters: struct kvm_pit_state2 (in) -Returns: 0 on success, -1 on error - -Sets the state of the in-kernel PIT model. Only valid after KVM_CREATE_PIT2. -See KVM_GET_PIT2 for details on struct kvm_pit_state2. 
- -This IOCTL replaces the obsolete KVM_SET_PIT. - - -4.74 KVM_PPC_GET_SMMU_INFO - -Capability: KVM_CAP_PPC_GET_SMMU_INFO -Architectures: powerpc -Type: vm ioctl -Parameters: None -Returns: 0 on success, -1 on error - -This populates and returns a structure describing the features of -the "Server" class MMU emulation supported by KVM. -This can in turn be used by userspace to generate the appropriate -device-tree properties for the guest operating system. - -The structure contains some global information, followed by an -array of supported segment page sizes: - - struct kvm_ppc_smmu_info { - __u64 flags; - __u32 slb_size; - __u32 pad; - struct kvm_ppc_one_seg_page_size sps[KVM_PPC_PAGE_SIZES_MAX_SZ]; - }; - -The supported flags are: - - - KVM_PPC_PAGE_SIZES_REAL: - When that flag is set, guest page sizes must "fit" the backing - store page sizes. When not set, any page size in the list can - be used regardless of how they are backed by userspace. - - - KVM_PPC_1T_SEGMENTS - The emulated MMU supports 1T segments in addition to the - standard 256M ones. - - - KVM_PPC_NO_HASH - This flag indicates that HPT guests are not supported by KVM, - thus all guests must use radix MMU mode. - -The "slb_size" field indicates how many SLB entries are supported. - -The "sps" array contains 8 entries indicating the supported base -page sizes for a segment in increasing order. Each entry is defined -as follows: - - struct kvm_ppc_one_seg_page_size { - __u32 page_shift; /* Base page shift of segment (or 0) */ - __u32 slb_enc; /* SLB encoding for BookS */ - struct kvm_ppc_one_page_size enc[KVM_PPC_PAGE_SIZES_MAX_SZ]; - }; - -An entry with a "page_shift" of 0 is unused. Because the array is -organized in increasing order, a lookup can stop when encountering -such an entry. - -The "slb_enc" field provides the encoding to use in the SLB for the -page size. The bits are in positions such that the value can directly -be OR'ed into the "vsid" argument of the slbmte instruction.
- -The "enc" array is a list which for each of those segment base page -size provides the list of supported actual page sizes (which can -only be larger than or equal to the base page size), along with the -corresponding encoding in the hash PTE. Similarly, the array is -8 entries sorted by increasing sizes and an entry with a "0" shift -is an empty entry and a terminator: - - struct kvm_ppc_one_page_size { - __u32 page_shift; /* Page shift (or 0) */ - __u32 pte_enc; /* Encoding in the HPTE (>>12) */ - }; - -The "pte_enc" field provides a value that can be OR'ed into the hash -PTE's RPN field (ie, it needs to be shifted left by 12 to OR it -into the hash PTE second double word). - -4.75 KVM_IRQFD - -Capability: KVM_CAP_IRQFD -Architectures: x86 s390 arm arm64 -Type: vm ioctl -Parameters: struct kvm_irqfd (in) -Returns: 0 on success, -1 on error - -Allows setting an eventfd to directly trigger a guest interrupt. -kvm_irqfd.fd specifies the file descriptor to use as the eventfd and -kvm_irqfd.gsi specifies the irqchip pin toggled by this event. When -an event is triggered on the eventfd, an interrupt is injected into -the guest using the specified gsi pin. The irqfd is removed using -the KVM_IRQFD_FLAG_DEASSIGN flag, specifying both kvm_irqfd.fd -and kvm_irqfd.gsi. - -With KVM_CAP_IRQFD_RESAMPLE, KVM_IRQFD supports a de-assert and notify -mechanism allowing emulation of level-triggered, irqfd-based -interrupts. When KVM_IRQFD_FLAG_RESAMPLE is set the user must pass an -additional eventfd in the kvm_irqfd.resamplefd field. When operating -in resample mode, posting of an interrupt through kvm_irqfd.fd asserts -the specified gsi in the irqchip. When the irqchip is resampled, such -as from an EOI, the gsi is de-asserted and the user is notified via -kvm_irqfd.resamplefd. It is the user's responsibility to re-queue -the interrupt if the device making use of it still requires service. -Note that closing the resamplefd is not sufficient to disable the -irqfd.
The KVM_IRQFD_FLAG_RESAMPLE is only necessary on assignment -and need not be specified with KVM_IRQFD_FLAG_DEASSIGN. - -On arm/arm64, gsi routing being supported, the following can happen: -- in case no routing entry is associated to this gsi, injection fails -- in case the gsi is associated to an irqchip routing entry, - irqchip.pin + 32 corresponds to the injected SPI ID. -- in case the gsi is associated to an MSI routing entry, the MSI - message and device ID are translated into an LPI (support restricted - to GICv3 ITS in-kernel emulation). - -4.76 KVM_PPC_ALLOCATE_HTAB - -Capability: KVM_CAP_PPC_ALLOC_HTAB -Architectures: powerpc -Type: vm ioctl -Parameters: Pointer to u32 containing hash table order (in/out) -Returns: 0 on success, -1 on error - -This requests the host kernel to allocate an MMU hash table for a -guest using the PAPR paravirtualization interface. This only does -anything if the kernel is configured to use the Book 3S HV style of -virtualization. Otherwise the capability doesn't exist and the ioctl -returns an ENOTTY error. The rest of this description assumes Book 3S -HV. - -There must be no vcpus running when this ioctl is called; if there -are, it will do nothing and return an EBUSY error. - -The parameter is a pointer to a 32-bit unsigned integer variable -containing the order (log base 2) of the desired size of the hash -table, which must be between 18 and 46. On successful return from the -ioctl, the value will not be changed by the kernel. - -If no hash table has been allocated when any vcpu is asked to run -(with the KVM_RUN ioctl), the host kernel will allocate a -default-sized hash table (16 MB). - -If this ioctl is called when a hash table has already been allocated, -with a different order from the existing hash table, the existing hash -table will be freed and a new one allocated. 
If this ioctl is -called when a hash table has already been allocated of the same order -as specified, the kernel will clear out the existing hash table (zero -all HPTEs). In either case, if the guest is using the virtualized -real-mode area (VRMA) facility, the kernel will re-create the VRMA -HPTEs on the next KVM_RUN of any vcpu. - -4.77 KVM_S390_INTERRUPT - -Capability: basic -Architectures: s390 -Type: vm ioctl, vcpu ioctl -Parameters: struct kvm_s390_interrupt (in) -Returns: 0 on success, -1 on error - -Allows injecting an interrupt into the guest. Interrupts can be floating -(vm ioctl) or per cpu (vcpu ioctl), depending on the interrupt type. - -Interrupt parameters are passed via kvm_s390_interrupt: - -struct kvm_s390_interrupt { - __u32 type; - __u32 parm; - __u64 parm64; -}; - -type can be one of the following: - -KVM_S390_SIGP_STOP (vcpu) - sigp stop; optional flags in parm -KVM_S390_PROGRAM_INT (vcpu) - program check; code in parm -KVM_S390_SIGP_SET_PREFIX (vcpu) - sigp set prefix; prefix address in parm -KVM_S390_RESTART (vcpu) - restart -KVM_S390_INT_CLOCK_COMP (vcpu) - clock comparator interrupt -KVM_S390_INT_CPU_TIMER (vcpu) - CPU timer interrupt -KVM_S390_INT_VIRTIO (vm) - virtio external interrupt; external interrupt - parameters in parm and parm64 -KVM_S390_INT_SERVICE (vm) - sclp external interrupt; sclp parameter in parm -KVM_S390_INT_EMERGENCY (vcpu) - sigp emergency; source cpu in parm -KVM_S390_INT_EXTERNAL_CALL (vcpu) - sigp external call; source cpu in parm -KVM_S390_INT_IO(ai,cssid,ssid,schid) (vm) - compound value to indicate an - I/O interrupt (ai - adapter interrupt; cssid,ssid,schid - subchannel); - I/O interruption parameters in parm (subchannel) and parm64 (intparm, - interruption subclass) -KVM_S390_MCHK (vm, vcpu) - machine check interrupt; cr 14 bits in parm, - machine check interrupt code in parm64 (note that - machine checks needing further payload are not - supported by this ioctl) - -This is an asynchronous vcpu ioctl and can
be invoked from any thread. - -4.78 KVM_PPC_GET_HTAB_FD - -Capability: KVM_CAP_PPC_HTAB_FD -Architectures: powerpc -Type: vm ioctl -Parameters: Pointer to struct kvm_get_htab_fd (in) -Returns: file descriptor number (>= 0) on success, -1 on error - -This returns a file descriptor that can be used either to read out the -entries in the guest's hashed page table (HPT), or to write entries to -initialize the HPT. The returned fd can only be written to if the -KVM_GET_HTAB_WRITE bit is set in the flags field of the argument, and -can only be read if that bit is clear. The argument struct looks like -this: - -/* For KVM_PPC_GET_HTAB_FD */ -struct kvm_get_htab_fd { - __u64 flags; - __u64 start_index; - __u64 reserved[2]; -}; - -/* Values for kvm_get_htab_fd.flags */ -#define KVM_GET_HTAB_BOLTED_ONLY ((__u64)0x1) -#define KVM_GET_HTAB_WRITE ((__u64)0x2) - -The `start_index' field gives the index in the HPT of the entry at -which to start reading. It is ignored when writing. - -Reads on the fd will initially supply information about all -"interesting" HPT entries. Interesting entries are those with the -bolted bit set, if the KVM_GET_HTAB_BOLTED_ONLY bit is set, otherwise -all entries. When the end of the HPT is reached, the read() will -return. If read() is called again on the fd, it will start again from -the beginning of the HPT, but will only return HPT entries that have -changed since they were last read. - -Data read or written is structured as a header (8 bytes) followed by a -series of valid HPT entries (16 bytes) each. The header indicates how -many valid HPT entries there are and how many invalid entries follow -the valid entries. The invalid entries are not represented explicitly -in the stream. 
The header format is: - -struct kvm_get_htab_header { - __u32 index; - __u16 n_valid; - __u16 n_invalid; -}; - -Writes to the fd create HPT entries starting at the index given in the -header; first `n_valid' valid entries with contents from the data -written, then `n_invalid' invalid entries, invalidating any previously -valid entries found. - -4.79 KVM_CREATE_DEVICE - -Capability: KVM_CAP_DEVICE_CTRL -Type: vm ioctl -Parameters: struct kvm_create_device (in/out) -Returns: 0 on success, -1 on error -Errors: - ENODEV: The device type is unknown or unsupported - EEXIST: Device already created, and this type of device may not - be instantiated multiple times - - Other error conditions may be defined by individual device types or - have their standard meanings. - -Creates an emulated device in the kernel. The file descriptor returned -in fd can be used with KVM_SET/GET/HAS_DEVICE_ATTR. - -If the KVM_CREATE_DEVICE_TEST flag is set, only test whether the -device type is supported (not necessarily whether it can be created -in the current vm). - -Individual devices should not define flags. Attributes should be used -for specifying any behavior that is not implied by the device type -number. - -struct kvm_create_device { - __u32 type; /* in: KVM_DEV_TYPE_xxx */ - __u32 fd; /* out: device handle */ - __u32 flags; /* in: KVM_CREATE_DEVICE_xxx */ -}; - -4.80 KVM_SET_DEVICE_ATTR/KVM_GET_DEVICE_ATTR - -Capability: KVM_CAP_DEVICE_CTRL, KVM_CAP_VM_ATTRIBUTES for vm device, - KVM_CAP_VCPU_ATTRIBUTES for vcpu device -Type: device ioctl, vm ioctl, vcpu ioctl -Parameters: struct kvm_device_attr -Returns: 0 on success, -1 on error -Errors: - ENXIO: The group or attribute is unknown/unsupported for this device - or hardware support is missing. - EPERM: The attribute cannot (currently) be accessed this way - (e.g. read-only attribute, or attribute that only makes - sense when the device is in a different state) - - Other error conditions may be defined by individual device types. 
- -Gets/sets a specified piece of device configuration and/or state. The -semantics are device-specific. See individual device documentation in -the "devices" directory. As with ONE_REG, the size of the data -transferred is defined by the particular attribute. - -struct kvm_device_attr { - __u32 flags; /* no flags currently defined */ - __u32 group; /* device-defined */ - __u64 attr; /* group-defined */ - __u64 addr; /* userspace address of attr data */ -}; - -4.81 KVM_HAS_DEVICE_ATTR - -Capability: KVM_CAP_DEVICE_CTRL, KVM_CAP_VM_ATTRIBUTES for vm device, - KVM_CAP_VCPU_ATTRIBUTES for vcpu device -Type: device ioctl, vm ioctl, vcpu ioctl -Parameters: struct kvm_device_attr -Returns: 0 on success, -1 on error -Errors: - ENXIO: The group or attribute is unknown/unsupported for this device - or hardware support is missing. - -Tests whether a device supports a particular attribute. A successful -return indicates the attribute is implemented. It does not necessarily -indicate that the attribute can be read or written in the device's -current state. "addr" is ignored. - -4.82 KVM_ARM_VCPU_INIT - -Capability: basic -Architectures: arm, arm64 -Type: vcpu ioctl -Parameters: struct kvm_vcpu_init (in) -Returns: 0 on success; -1 on error -Errors: -  EINVAL:    the target is unknown, or the combination of features is invalid. -  ENOENT:    a features bit specified is unknown. - -This tells KVM what type of CPU to present to the guest, and what -optional features it should have.  This will cause a reset of the cpu -registers to their initial values.  If this is not called, KVM_RUN will -return ENOEXEC for that vcpu. - -Note that because some registers reflect machine topology, all vcpus -should be created before this ioctl is invoked. - -Userspace can call this function multiple times for a given vcpu, including -after the vcpu has been run. This will reset the vcpu to its initial -state. 
All calls to this function after the initial call must use the same -target and same set of feature flags, otherwise EINVAL will be returned. - -Possible features: - - KVM_ARM_VCPU_POWER_OFF: Starts the CPU in a power-off state. - Depends on KVM_CAP_ARM_PSCI. If not set, the CPU will be powered on - and execute guest code when KVM_RUN is called. - - KVM_ARM_VCPU_EL1_32BIT: Starts the CPU in a 32bit mode. - Depends on KVM_CAP_ARM_EL1_32BIT (arm64 only). - - KVM_ARM_VCPU_PSCI_0_2: Emulate PSCI v0.2 (or a future revision - backward compatible with v0.2) for the CPU. - Depends on KVM_CAP_ARM_PSCI_0_2. - - KVM_ARM_VCPU_PMU_V3: Emulate PMUv3 for the CPU. - Depends on KVM_CAP_ARM_PMU_V3. - - - KVM_ARM_VCPU_PTRAUTH_ADDRESS: Enables Address Pointer authentication - for arm64 only. - Depends on KVM_CAP_ARM_PTRAUTH_ADDRESS. - If KVM_CAP_ARM_PTRAUTH_ADDRESS and KVM_CAP_ARM_PTRAUTH_GENERIC are - both present, then both KVM_ARM_VCPU_PTRAUTH_ADDRESS and - KVM_ARM_VCPU_PTRAUTH_GENERIC must be requested or neither must be - requested. - - - KVM_ARM_VCPU_PTRAUTH_GENERIC: Enables Generic Pointer authentication - for arm64 only. - Depends on KVM_CAP_ARM_PTRAUTH_GENERIC. - If KVM_CAP_ARM_PTRAUTH_ADDRESS and KVM_CAP_ARM_PTRAUTH_GENERIC are - both present, then both KVM_ARM_VCPU_PTRAUTH_ADDRESS and - KVM_ARM_VCPU_PTRAUTH_GENERIC must be requested or neither must be - requested. - - - KVM_ARM_VCPU_SVE: Enables SVE for the CPU (arm64 only). - Depends on KVM_CAP_ARM_SVE. - Requires KVM_ARM_VCPU_FINALIZE(KVM_ARM_VCPU_SVE): - - * After KVM_ARM_VCPU_INIT: - - - KVM_REG_ARM64_SVE_VLS may be read using KVM_GET_ONE_REG: the - initial value of this pseudo-register indicates the best set of - vector lengths possible for a vcpu on this host. 
- - * Before KVM_ARM_VCPU_FINALIZE(KVM_ARM_VCPU_SVE): - - - KVM_RUN and KVM_GET_REG_LIST are not available; - - - KVM_GET_ONE_REG and KVM_SET_ONE_REG cannot be used to access - the scalable architectural SVE registers - KVM_REG_ARM64_SVE_ZREG(), KVM_REG_ARM64_SVE_PREG() or - KVM_REG_ARM64_SVE_FFR; - - - KVM_REG_ARM64_SVE_VLS may optionally be written using - KVM_SET_ONE_REG, to modify the set of vector lengths available - for the vcpu. - - * After KVM_ARM_VCPU_FINALIZE(KVM_ARM_VCPU_SVE): - - - the KVM_REG_ARM64_SVE_VLS pseudo-register is immutable, and can - no longer be written using KVM_SET_ONE_REG. - -4.83 KVM_ARM_PREFERRED_TARGET - -Capability: basic -Architectures: arm, arm64 -Type: vm ioctl -Parameters: struct kvm_vcpu_init (out) -Returns: 0 on success; -1 on error -Errors: - ENODEV: no preferred target available for the host - -This queries KVM for preferred CPU target type which can be emulated -by KVM on underlying host. - -The ioctl returns struct kvm_vcpu_init instance containing information -about preferred CPU target type and recommended features for it. The -kvm_vcpu_init->features bitmap returned will have feature bits set if -the preferred target recommends setting these features, but this is -not mandatory. - -The information returned by this ioctl can be used to prepare an instance -of struct kvm_vcpu_init for KVM_ARM_VCPU_INIT ioctl which will result in -a VCPU matching the underlying host. - - -4.84 KVM_GET_REG_LIST - -Capability: basic -Architectures: arm, arm64, mips -Type: vcpu ioctl -Parameters: struct kvm_reg_list (in/out) -Returns: 0 on success; -1 on error -Errors: -  E2BIG:     the reg index list is too big to fit in the array specified by -             the user (the number required will be written into n). - -struct kvm_reg_list { - __u64 n; /* number of registers in reg[] */ - __u64 reg[0]; -}; - -This ioctl returns the guest registers that are supported for the -KVM_GET_ONE_REG/KVM_SET_ONE_REG calls.
- - -4.85 KVM_ARM_SET_DEVICE_ADDR (deprecated) - -Capability: KVM_CAP_ARM_SET_DEVICE_ADDR -Architectures: arm, arm64 -Type: vm ioctl -Parameters: struct kvm_arm_device_addr (in) -Returns: 0 on success, -1 on error -Errors: - ENODEV: The device id is unknown - ENXIO: Device not supported on current system - EEXIST: Address already set - E2BIG: Address outside guest physical address space - EBUSY: Address overlaps with other device range - -struct kvm_arm_device_addr { - __u64 id; - __u64 addr; -}; - -Specify a device address in the guest's physical address space where guests -can access emulated or directly exposed devices, which the host kernel needs -to know about. The id field is an architecture specific identifier for a -specific device. - -ARM/arm64 divides the id field into two parts, a device id and an -address type id specific to the individual device. - -  bits: | 63 ... 32 | 31 ... 16 | 15 ... 0 | - field: | 0x00000000 | device id | addr type id | - -ARM/arm64 currently only require this when using the in-kernel GIC -support for the hardware VGIC features, using KVM_ARM_DEVICE_VGIC_V2 -as the device id. When setting the base address for the guest's -mapping of the VGIC virtual CPU and distributor interface, the ioctl -must be called after calling KVM_CREATE_IRQCHIP, but before calling -KVM_RUN on any of the VCPUs. Calling this ioctl twice for any of the -base addresses will return -EEXIST. - -Note, this IOCTL is deprecated and the more flexible SET/GET_DEVICE_ATTR API -should be used instead. - - -4.86 KVM_PPC_RTAS_DEFINE_TOKEN - -Capability: KVM_CAP_PPC_RTAS -Architectures: ppc -Type: vm ioctl -Parameters: struct kvm_rtas_token_args -Returns: 0 on success, -1 on error - -Defines a token value for a RTAS (Run Time Abstraction Services) -service in order to allow it to be handled in the kernel. The -argument struct gives the name of the service, which must be the name -of a service that has a kernel-side implementation.
If the token -value is non-zero, it will be associated with that service, and -subsequent RTAS calls by the guest specifying that token will be -handled by the kernel. If the token value is 0, then any token -associated with the service will be forgotten, and subsequent RTAS -calls by the guest for that service will be passed to userspace to be -handled. - -4.87 KVM_SET_GUEST_DEBUG - -Capability: KVM_CAP_SET_GUEST_DEBUG -Architectures: x86, s390, ppc, arm64 -Type: vcpu ioctl -Parameters: struct kvm_guest_debug (in) -Returns: 0 on success; -1 on error - -struct kvm_guest_debug { - __u32 control; - __u32 pad; - struct kvm_guest_debug_arch arch; -}; - -Set up the processor specific debug registers and configure vcpu for -handling guest debug events. There are two parts to the structure, the -first a control bitfield indicates the type of debug events to handle -when running. Common control bits are: - - - KVM_GUESTDBG_ENABLE: guest debugging is enabled - - KVM_GUESTDBG_SINGLESTEP: the next run should single-step - -The top 16 bits of the control field are architecture specific control -flags which can include the following: - - - KVM_GUESTDBG_USE_SW_BP: using software breakpoints [x86, arm64] - - KVM_GUESTDBG_USE_HW_BP: using hardware breakpoints [x86, s390, arm64] - - KVM_GUESTDBG_INJECT_DB: inject DB type exception [x86] - - KVM_GUESTDBG_INJECT_BP: inject BP type exception [x86] - - KVM_GUESTDBG_EXIT_PENDING: trigger an immediate guest exit [s390] - -For example KVM_GUESTDBG_USE_SW_BP indicates that software breakpoints -are enabled in memory so we need to ensure breakpoint exceptions are -correctly trapped and the KVM run loop exits at the breakpoint and not -running off into the normal guest vector. For KVM_GUESTDBG_USE_HW_BP -we need to ensure the guest vCPUs architecture specific registers are -updated to the correct (supplied) values. - -The second part of the structure is architecture specific and -typically contains a set of debug registers. 
- -For arm64 the number of debug registers is implementation defined and -can be determined by querying the KVM_CAP_GUEST_DEBUG_HW_BPS and -KVM_CAP_GUEST_DEBUG_HW_WPS capabilities which return a positive number -indicating the number of supported registers. - -Debug events exit the main run loop with the reason -KVM_EXIT_DEBUG, with the kvm_debug_exit_arch part of the kvm_run -structure containing architecture specific debug information. - -4.88 KVM_GET_EMULATED_CPUID - -Capability: KVM_CAP_EXT_EMUL_CPUID -Architectures: x86 -Type: system ioctl -Parameters: struct kvm_cpuid2 (in/out) -Returns: 0 on success, -1 on error - -struct kvm_cpuid2 { - __u32 nent; - __u32 flags; - struct kvm_cpuid_entry2 entries[0]; -}; - -The member 'flags' is used for passing flags from userspace. - -#define KVM_CPUID_FLAG_SIGNIFCANT_INDEX BIT(0) -#define KVM_CPUID_FLAG_STATEFUL_FUNC BIT(1) -#define KVM_CPUID_FLAG_STATE_READ_NEXT BIT(2) - -struct kvm_cpuid_entry2 { - __u32 function; - __u32 index; - __u32 flags; - __u32 eax; - __u32 ebx; - __u32 ecx; - __u32 edx; - __u32 padding[3]; -}; - -This ioctl returns x86 cpuid features which are emulated by -kvm. Userspace can use the information returned by this ioctl to query -which features are emulated by kvm instead of being present natively. - -Userspace invokes KVM_GET_EMULATED_CPUID by passing a kvm_cpuid2 -structure with the 'nent' field indicating the number of entries in -the variable-size array 'entries'. If the number of entries is too low -to describe the cpu capabilities, an error (E2BIG) is returned. If the -number is too high, the 'nent' field is adjusted and an error (ENOMEM) -is returned. If the number is just right, the 'nent' field is adjusted -to the number of valid entries in the 'entries' array, which is then -filled. - -The entries returned are the set CPUID bits of the respective features -which kvm emulates, as returned by the CPUID instruction, with unknown -or unsupported feature bits cleared.
- -Features like x2apic, for example, may not be present in the host cpu -but are exposed by kvm in KVM_GET_SUPPORTED_CPUID because they can be -emulated efficiently and thus not included here. - -The fields in each entry are defined as follows: - - function: the eax value used to obtain the entry - index: the ecx value used to obtain the entry (for entries that are - affected by ecx) - flags: an OR of zero or more of the following: - KVM_CPUID_FLAG_SIGNIFCANT_INDEX: - if the index field is valid - KVM_CPUID_FLAG_STATEFUL_FUNC: - if cpuid for this function returns different values for successive - invocations; there will be several entries with the same function, - all with this flag set - KVM_CPUID_FLAG_STATE_READ_NEXT: - for KVM_CPUID_FLAG_STATEFUL_FUNC entries, set if this entry is - the first entry to be read by a cpu - eax, ebx, ecx, edx: the values returned by the cpuid instruction for - this function/index combination - -4.89 KVM_S390_MEM_OP - -Capability: KVM_CAP_S390_MEM_OP -Architectures: s390 -Type: vcpu ioctl -Parameters: struct kvm_s390_mem_op (in) -Returns: = 0 on success, - < 0 on generic error (e.g. -EFAULT or -ENOMEM), - > 0 if an exception occurred while walking the page tables - -Read or write data from/to the logical (virtual) memory of a VCPU. - -Parameters are specified via the following structure: - -struct kvm_s390_mem_op { - __u64 gaddr; /* the guest address */ - __u64 flags; /* flags */ - __u32 size; /* amount of bytes */ - __u32 op; /* type of operation */ - __u64 buf; /* buffer in userspace */ - __u8 ar; /* the access register number */ - __u8 reserved[31]; /* should be set to 0 */ -}; - -The type of operation is specified in the "op" field. It is either -KVM_S390_MEMOP_LOGICAL_READ for reading from logical memory space or -KVM_S390_MEMOP_LOGICAL_WRITE for writing to logical memory space. 
The -KVM_S390_MEMOP_F_CHECK_ONLY flag can be set in the "flags" field to check -whether the corresponding memory access would create an access exception -(without touching the data in the memory at the destination). In case an -access exception occurred while walking the MMU tables of the guest, the -ioctl returns a positive error number to indicate the type of exception. -This exception is also raised directly at the corresponding VCPU if the -flag KVM_S390_MEMOP_F_INJECT_EXCEPTION is set in the "flags" field. - -The start address of the memory region has to be specified in the "gaddr" -field, and the length of the region in the "size" field. "buf" is the buffer -supplied by the userspace application where the read data should be written -to for KVM_S390_MEMOP_LOGICAL_READ, or where the data that should be written -is stored for a KVM_S390_MEMOP_LOGICAL_WRITE. "buf" is unused and can be NULL -when KVM_S390_MEMOP_F_CHECK_ONLY is specified. "ar" designates the access -register number to be used. - -The "reserved" field is meant for future extensions. It is not used by -KVM with the currently defined set of flags. - -4.90 KVM_S390_GET_SKEYS - -Capability: KVM_CAP_S390_SKEYS -Architectures: s390 -Type: vm ioctl -Parameters: struct kvm_s390_skeys -Returns: 0 on success, KVM_S390_GET_KEYS_NONE if guest is not using storage - keys, negative value on error - -This ioctl is used to get guest storage key values on the s390 -architecture. The ioctl takes parameters via the kvm_s390_skeys struct. - -struct kvm_s390_skeys { - __u64 start_gfn; - __u64 count; - __u64 skeydata_addr; - __u32 flags; - __u32 reserved[9]; -}; - -The start_gfn field is the number of the first guest frame whose storage keys -you want to get. - -The count field is the number of consecutive frames (starting from start_gfn) -whose storage keys to get. The count field must be at least 1 and the maximum -allowed value is defined as KVM_S390_SKEYS_ALLOC_MAX. 
Values outside this range -will cause the ioctl to return -EINVAL. - -The skeydata_addr field is the address to a buffer large enough to hold count -bytes. This buffer will be filled with storage key data by the ioctl. - -4.91 KVM_S390_SET_SKEYS - -Capability: KVM_CAP_S390_SKEYS -Architectures: s390 -Type: vm ioctl -Parameters: struct kvm_s390_skeys -Returns: 0 on success, negative value on error - -This ioctl is used to set guest storage key values on the s390 -architecture. The ioctl takes parameters via the kvm_s390_skeys struct. -See section on KVM_S390_GET_SKEYS for struct definition. - -The start_gfn field is the number of the first guest frame whose storage keys -you want to set. - -The count field is the number of consecutive frames (starting from start_gfn) -whose storage keys to set. The count field must be at least 1 and the maximum -allowed value is defined as KVM_S390_SKEYS_ALLOC_MAX. Values outside this range -will cause the ioctl to return -EINVAL. - -The skeydata_addr field is the address to a buffer containing count bytes of -storage keys. Each byte in the buffer will be set as the storage key for a -single frame starting at start_gfn for count frames. - -Note: If any architecturally invalid key value is found in the given data then -the ioctl will return -EINVAL. - -4.92 KVM_S390_IRQ - -Capability: KVM_CAP_S390_INJECT_IRQ -Architectures: s390 -Type: vcpu ioctl -Parameters: struct kvm_s390_irq (in) -Returns: 0 on success, -1 on error -Errors: - EINVAL: interrupt type is invalid - type is KVM_S390_SIGP_STOP and flag parameter is invalid value - type is KVM_S390_INT_EXTERNAL_CALL and code is bigger - than the maximum of VCPUs - EBUSY: type is KVM_S390_SIGP_SET_PREFIX and vcpu is not stopped - type is KVM_S390_SIGP_STOP and a stop irq is already pending - type is KVM_S390_INT_EXTERNAL_CALL and an external call interrupt - is already pending - -Allows injecting an interrupt into the guest.
- -Using struct kvm_s390_irq as a parameter allows -to inject additional payload which is not -possible via KVM_S390_INTERRUPT. - -Interrupt parameters are passed via kvm_s390_irq: - -struct kvm_s390_irq { - __u64 type; - union { - struct kvm_s390_io_info io; - struct kvm_s390_ext_info ext; - struct kvm_s390_pgm_info pgm; - struct kvm_s390_emerg_info emerg; - struct kvm_s390_extcall_info extcall; - struct kvm_s390_prefix_info prefix; - struct kvm_s390_stop_info stop; - struct kvm_s390_mchk_info mchk; - char reserved[64]; - } u; -}; - -type can be one of the following: - -KVM_S390_SIGP_STOP - sigp stop; parameter in .stop -KVM_S390_PROGRAM_INT - program check; parameters in .pgm -KVM_S390_SIGP_SET_PREFIX - sigp set prefix; parameters in .prefix -KVM_S390_RESTART - restart; no parameters -KVM_S390_INT_CLOCK_COMP - clock comparator interrupt; no parameters -KVM_S390_INT_CPU_TIMER - CPU timer interrupt; no parameters -KVM_S390_INT_EMERGENCY - sigp emergency; parameters in .emerg -KVM_S390_INT_EXTERNAL_CALL - sigp external call; parameters in .extcall -KVM_S390_MCHK - machine check interrupt; parameters in .mchk - -This is an asynchronous vcpu ioctl and can be invoked from any thread. - -4.94 KVM_S390_GET_IRQ_STATE - -Capability: KVM_CAP_S390_IRQ_STATE -Architectures: s390 -Type: vcpu ioctl -Parameters: struct kvm_s390_irq_state (out) -Returns: >= number of bytes copied into buffer, - -EINVAL if buffer size is 0, - -ENOBUFS if buffer size is too small to fit all pending interrupts, - -EFAULT if the buffer address was invalid - -This ioctl allows userspace to retrieve the complete state of all currently -pending interrupts in a single buffer. Use cases include migration -and introspection. 
The parameter structure contains the address of a -userspace buffer and its length: - -struct kvm_s390_irq_state { - __u64 buf; - __u32 flags; /* will stay unused for compatibility reasons */ - __u32 len; - __u32 reserved[4]; /* will stay unused for compatibility reasons */ -}; - -Userspace passes in the above struct and for each pending interrupt a -struct kvm_s390_irq is copied to the provided buffer. - -The structure contains a flags and a reserved field for future extensions. As -the kernel never checked for flags == 0 and QEMU never pre-zeroed flags and -reserved, these fields can not be used in the future without breaking -compatibility. - -If -ENOBUFS is returned the buffer provided was too small and userspace -may retry with a bigger buffer. - -4.95 KVM_S390_SET_IRQ_STATE - -Capability: KVM_CAP_S390_IRQ_STATE -Architectures: s390 -Type: vcpu ioctl -Parameters: struct kvm_s390_irq_state (in) -Returns: 0 on success, - -EFAULT if the buffer address was invalid, - -EINVAL for an invalid buffer length (see below), - -EBUSY if there were already interrupts pending, - errors occurring when actually injecting the - interrupt. See KVM_S390_IRQ. - -This ioctl allows userspace to set the complete state of all cpu-local -interrupts currently pending for the vcpu. It is intended for restoring -interrupt state after a migration. The input parameter is a userspace buffer -containing a struct kvm_s390_irq_state: - -struct kvm_s390_irq_state { - __u64 buf; - __u32 flags; /* will stay unused for compatibility reasons */ - __u32 len; - __u32 reserved[4]; /* will stay unused for compatibility reasons */ -}; - -The restrictions for flags and reserved apply as well. -(see KVM_S390_GET_IRQ_STATE) - -The userspace memory referenced by buf contains a struct kvm_s390_irq -for each interrupt to be injected into the guest. -If one of the interrupts could not be injected for some reason the -ioctl aborts. - -len must be a multiple of sizeof(struct kvm_s390_irq). 
It must be > 0 -and it must not exceed (max_vcpus + 32) * sizeof(struct kvm_s390_irq), -which is the maximum number of possibly pending cpu-local interrupts. - -4.96 KVM_SMI - -Capability: KVM_CAP_X86_SMM -Architectures: x86 -Type: vcpu ioctl -Parameters: none -Returns: 0 on success, -1 on error - -Queues an SMI on the thread's vcpu. - -4.97 KVM_CAP_PPC_MULTITCE - -Capability: KVM_CAP_PPC_MULTITCE -Architectures: ppc -Type: vm - -This capability means the kernel is capable of handling hypercalls -H_PUT_TCE_INDIRECT and H_STUFF_TCE without passing those into the user -space. This significantly accelerates DMA operations for PPC KVM guests. -User space should expect that its handlers for these hypercalls -are not going to be called if user space previously registered LIOBN -in KVM (via KVM_CREATE_SPAPR_TCE or similar calls). - -In order to enable H_PUT_TCE_INDIRECT and H_STUFF_TCE use in the guest, -user space might have to advertise it for the guest. For example, -IBM pSeries (sPAPR) guest starts using them if "hcall-multi-tce" is -present in the "ibm,hypertas-functions" device-tree property. - -The hypercalls mentioned above may or may not be processed successfully -in the kernel based fast path. If they can not be handled by the kernel, -they will get passed on to user space. So user space still has to have -an implementation for these despite the in kernel acceleration. - -This capability is always enabled. 
- -4.98 KVM_CREATE_SPAPR_TCE_64 - -Capability: KVM_CAP_SPAPR_TCE_64 -Architectures: powerpc -Type: vm ioctl -Parameters: struct kvm_create_spapr_tce_64 (in) -Returns: file descriptor for manipulating the created TCE table - -This is an extension for KVM_CAP_SPAPR_TCE which only supports 32bit -windows, described in 4.62 KVM_CREATE_SPAPR_TCE - -This capability uses extended struct in ioctl interface: - -/* for KVM_CAP_SPAPR_TCE_64 */ -struct kvm_create_spapr_tce_64 { - __u64 liobn; - __u32 page_shift; - __u32 flags; - __u64 offset; /* in pages */ - __u64 size; /* in pages */ -}; - -The aim of extension is to support an additional bigger DMA window with -a variable page size. -KVM_CREATE_SPAPR_TCE_64 receives a 64bit window size, an IOMMU page shift and -a bus offset of the corresponding DMA window, @size and @offset are numbers -of IOMMU pages. - -@flags are not used at the moment. - -The rest of functionality is identical to KVM_CREATE_SPAPR_TCE. - -4.99 KVM_REINJECT_CONTROL - -Capability: KVM_CAP_REINJECT_CONTROL -Architectures: x86 -Type: vm ioctl -Parameters: struct kvm_reinject_control (in) -Returns: 0 on success, - -EFAULT if struct kvm_reinject_control cannot be read, - -ENXIO if KVM_CREATE_PIT or KVM_CREATE_PIT2 didn't succeed earlier. - -i8254 (PIT) has two modes, reinject and !reinject. The default is reinject, -where KVM queues elapsed i8254 ticks and monitors completion of interrupt from -vector(s) that i8254 injects. Reinject mode dequeues a tick and injects its -interrupt whenever there isn't a pending interrupt from i8254. -!reinject mode injects an interrupt as soon as a tick arrives. - -struct kvm_reinject_control { - __u8 pit_reinject; - __u8 reserved[31]; -}; - -pit_reinject = 0 (!reinject mode) is recommended, unless running an old -operating system that uses the PIT for timing (e.g. Linux 2.4.x). 
- -4.100 KVM_PPC_CONFIGURE_V3_MMU - -Capability: KVM_CAP_PPC_RADIX_MMU or KVM_CAP_PPC_HASH_MMU_V3 -Architectures: ppc -Type: vm ioctl -Parameters: struct kvm_ppc_mmuv3_cfg (in) -Returns: 0 on success, - -EFAULT if struct kvm_ppc_mmuv3_cfg cannot be read, - -EINVAL if the configuration is invalid - -This ioctl controls whether the guest will use radix or HPT (hashed -page table) translation, and sets the pointer to the process table for -the guest. - -struct kvm_ppc_mmuv3_cfg { - __u64 flags; - __u64 process_table; -}; - -There are two bits that can be set in flags; KVM_PPC_MMUV3_RADIX and -KVM_PPC_MMUV3_GTSE. KVM_PPC_MMUV3_RADIX, if set, configures the guest -to use radix tree translation, and if clear, to use HPT translation. -KVM_PPC_MMUV3_GTSE, if set and if KVM permits it, configures the guest -to be able to use the global TLB and SLB invalidation instructions; -if clear, the guest may not use these instructions. - -The process_table field specifies the address and size of the guest -process table, which is in the guest's space. This field is formatted -as the second doubleword of the partition table entry, as defined in -the Power ISA V3.00, Book III section 5.7.6.1. - -4.101 KVM_PPC_GET_RMMU_INFO - -Capability: KVM_CAP_PPC_RADIX_MMU -Architectures: ppc -Type: vm ioctl -Parameters: struct kvm_ppc_rmmu_info (out) -Returns: 0 on success, - -EFAULT if struct kvm_ppc_rmmu_info cannot be written, - -EINVAL if no useful information can be returned - -This ioctl returns a structure containing two things: (a) a list -containing supported radix tree geometries, and (b) a list that maps -page sizes to put in the "AP" (actual page size) field for the tlbie -(TLB invalidate entry) instruction. 
- -struct kvm_ppc_rmmu_info { - struct kvm_ppc_radix_geom { - __u8 page_shift; - __u8 level_bits[4]; - __u8 pad[3]; - } geometries[8]; - __u32 ap_encodings[8]; -}; - -The geometries[] field gives up to 8 supported geometries for the -radix page table, in terms of the log base 2 of the smallest page -size, and the number of bits indexed at each level of the tree, from -the PTE level up to the PGD level in that order. Any unused entries -will have 0 in the page_shift field. - -The ap_encodings gives the supported page sizes and their AP field -encodings, encoded with the AP value in the top 3 bits and the log -base 2 of the page size in the bottom 6 bits. - -4.102 KVM_PPC_RESIZE_HPT_PREPARE - -Capability: KVM_CAP_SPAPR_RESIZE_HPT -Architectures: powerpc -Type: vm ioctl -Parameters: struct kvm_ppc_resize_hpt (in) -Returns: 0 on successful completion, - >0 if a new HPT is being prepared, the value is an estimated - number of milliseconds until preparation is complete - -EFAULT if struct kvm_ppc_resize_hpt cannot be read, - -EINVAL if the supplied shift or flags are invalid - -ENOMEM if unable to allocate the new HPT - -ENOSPC if there was a hash collision when moving existing - HPT entries to the new HPT - -EIO on other error conditions - -Used to implement the PAPR extension for runtime resizing of a guest's -Hashed Page Table (HPT). Specifically this starts, stops or monitors -the preparation of a new potential HPT for the guest, essentially -implementing the H_RESIZE_HPT_PREPARE hypercall. - -If called with shift > 0 when there is no pending HPT for the guest, -this begins preparation of a new pending HPT of size 2^(shift) bytes. -It then returns a positive integer with the estimated number of -milliseconds until preparation is complete. - -If called when there is a pending HPT whose size does not match that -requested in the parameters, discards the existing pending HPT and -creates a new one as above.
- -If called when there is a pending HPT of the size requested, will: - * If preparation of the pending HPT is already complete, return 0 - * If preparation of the pending HPT has failed, return an error - code, then discard the pending HPT. - * If preparation of the pending HPT is still in progress, return an - estimated number of milliseconds until preparation is complete. - -If called with shift == 0, discards any currently pending HPT and -returns 0 (i.e. cancels any in-progress preparation). - -flags is reserved for future expansion, currently setting any bits in -flags will result in an -EINVAL. - -Normally this will be called repeatedly with the same parameters until -it returns <= 0. The first call will initiate preparation, subsequent -ones will monitor preparation until it completes or fails. - -struct kvm_ppc_resize_hpt { - __u64 flags; - __u32 shift; - __u32 pad; -}; - -4.103 KVM_PPC_RESIZE_HPT_COMMIT - -Capability: KVM_CAP_SPAPR_RESIZE_HPT -Architectures: powerpc -Type: vm ioctl -Parameters: struct kvm_ppc_resize_hpt (in) -Returns: 0 on successful completion, - -EFAULT if struct kvm_ppc_resize_hpt cannot be read, - -EINVAL if the supplied shift or flags are invalid - -ENXIO if there is no pending HPT, or the pending HPT doesn't - have the requested size - -EBUSY if the pending HPT is not fully prepared - -ENOSPC if there was a hash collision when moving existing - HPT entries to the new HPT - -EIO on other error conditions - -Used to implement the PAPR extension for runtime resizing of a guest's -Hashed Page Table (HPT). Specifically this requests that the guest be -transferred to working with the new HPT, essentially implementing the -H_RESIZE_HPT_COMMIT hypercall. - -This should only be called after KVM_PPC_RESIZE_HPT_PREPARE has -returned 0 with the same parameters. In other cases -KVM_PPC_RESIZE_HPT_COMMIT will return an error (usually -ENXIO or --EBUSY, though others may be possible if the preparation was started, -but failed).
- -This will have undefined effects on the guest if it has not already -placed itself in a quiescent state where no vcpu will make MMU enabled -memory accesses. - -On successful completion, the pending HPT will become the guest's active -HPT and the previous HPT will be discarded. - -On failure, the guest will still be operating on its previous HPT. - -struct kvm_ppc_resize_hpt { - __u64 flags; - __u32 shift; - __u32 pad; -}; - -4.104 KVM_X86_GET_MCE_CAP_SUPPORTED - -Capability: KVM_CAP_MCE -Architectures: x86 -Type: system ioctl -Parameters: u64 mce_cap (out) -Returns: 0 on success, -1 on error - -Returns supported MCE capabilities. The u64 mce_cap parameter -has the same format as the MSR_IA32_MCG_CAP register. Supported -capabilities will have the corresponding bits set. - -4.105 KVM_X86_SETUP_MCE - -Capability: KVM_CAP_MCE -Architectures: x86 -Type: vcpu ioctl -Parameters: u64 mcg_cap (in) -Returns: 0 on success, - -EFAULT if u64 mcg_cap cannot be read, - -EINVAL if the requested number of banks is invalid, - -EINVAL if requested MCE capability is not supported. - -Initializes MCE support for use. The u64 mcg_cap parameter -has the same format as the MSR_IA32_MCG_CAP register and -specifies which capabilities should be enabled. The maximum -supported number of error-reporting banks can be retrieved when -checking for KVM_CAP_MCE. The supported capabilities can be -retrieved with KVM_X86_GET_MCE_CAP_SUPPORTED. - -4.106 KVM_X86_SET_MCE - -Capability: KVM_CAP_MCE -Architectures: x86 -Type: vcpu ioctl -Parameters: struct kvm_x86_mce (in) -Returns: 0 on success, - -EFAULT if struct kvm_x86_mce cannot be read, - -EINVAL if the bank number is invalid, - -EINVAL if VAL bit is not set in status field. - -Inject a machine check error (MCE) into the guest.
The input -parameter is: - -struct kvm_x86_mce { - __u64 status; - __u64 addr; - __u64 misc; - __u64 mcg_status; - __u8 bank; - __u8 pad1[7]; - __u64 pad2[3]; -}; - -If the MCE being reported is an uncorrected error, KVM will -inject it as an MCE exception into the guest. If the guest -MCG_STATUS register reports that an MCE is in progress, KVM -causes an KVM_EXIT_SHUTDOWN vmexit. - -Otherwise, if the MCE is a corrected error, KVM will just -store it in the corresponding bank (provided this bank is -not holding a previously reported uncorrected error). - -4.107 KVM_S390_GET_CMMA_BITS - -Capability: KVM_CAP_S390_CMMA_MIGRATION -Architectures: s390 -Type: vm ioctl -Parameters: struct kvm_s390_cmma_log (in, out) -Returns: 0 on success, a negative value on error - -This ioctl is used to get the values of the CMMA bits on the s390 -architecture. It is meant to be used in two scenarios: -- During live migration to save the CMMA values. Live migration needs - to be enabled via the KVM_REQ_START_MIGRATION VM property. -- To non-destructively peek at the CMMA values, with the flag - KVM_S390_CMMA_PEEK set. - -The ioctl takes parameters via the kvm_s390_cmma_log struct. The desired -values are written to a buffer whose location is indicated via the "values" -member in the kvm_s390_cmma_log struct. The values in the input struct are -also updated as needed. -Each CMMA value takes up one byte. - -struct kvm_s390_cmma_log { - __u64 start_gfn; - __u32 count; - __u32 flags; - union { - __u64 remaining; - __u64 mask; - }; - __u64 values; -}; - -start_gfn is the number of the first guest frame whose CMMA values are -to be retrieved, - -count is the length of the buffer in bytes, - -values points to the buffer where the result will be written to. - -If count is greater than KVM_S390_SKEYS_MAX, then it is considered to be -KVM_S390_SKEYS_MAX. KVM_S390_SKEYS_MAX is re-used for consistency with -other ioctls. 
- -The result is written in the buffer pointed to by the field values, and -the values of the input parameter are updated as follows. - -Depending on the flags, different actions are performed. The only -supported flag so far is KVM_S390_CMMA_PEEK. - -The default behaviour if KVM_S390_CMMA_PEEK is not set is: -start_gfn will indicate the first page frame whose CMMA bits were dirty. -It is not necessarily the same as the one passed as input, as clean pages -are skipped. - -count will indicate the number of bytes actually written in the buffer. -It can (and very often will) be smaller than the input value, since the -buffer is only filled until 16 bytes of clean values are found (which -are then not copied in the buffer). Since a CMMA migration block needs -the base address and the length, for a total of 16 bytes, we will send -back some clean data if there is some dirty data afterwards, as long as -the size of the clean data does not exceed the size of the header. This -allows to minimize the amount of data to be saved or transferred over -the network at the expense of more roundtrips to userspace. The next -invocation of the ioctl will skip over all the clean values, saving -potentially more than just the 16 bytes we found. - -If KVM_S390_CMMA_PEEK is set: -the existing storage attributes are read even when not in migration -mode, and no other action is performed; - -the output start_gfn will be equal to the input start_gfn, - -the output count will be equal to the input count, except if the end of -memory has been reached. - -In both cases: -the field "remaining" will indicate the total number of dirty CMMA values -still remaining, or 0 if KVM_S390_CMMA_PEEK is set and migration mode is -not enabled. - -mask is unused. - -values points to the userspace buffer where the result will be stored. 
- -This ioctl can fail with -ENOMEM if not enough memory can be allocated to -complete the task, with -ENXIO if CMMA is not enabled, with -EINVAL if -KVM_S390_CMMA_PEEK is not set but migration mode was not enabled, with --EFAULT if the userspace address is invalid or if no page table is -present for the addresses (e.g. when using hugepages). - -4.108 KVM_S390_SET_CMMA_BITS - -Capability: KVM_CAP_S390_CMMA_MIGRATION -Architectures: s390 -Type: vm ioctl -Parameters: struct kvm_s390_cmma_log (in) -Returns: 0 on success, a negative value on error - -This ioctl is used to set the values of the CMMA bits on the s390 -architecture. It is meant to be used during live migration to restore -the CMMA values, but there are no restrictions on its use. -The ioctl takes parameters via the kvm_s390_cmma_log struct. -Each CMMA value takes up one byte. - -struct kvm_s390_cmma_log { - __u64 start_gfn; - __u32 count; - __u32 flags; - union { - __u64 remaining; - __u64 mask; - }; - __u64 values; -}; - -start_gfn indicates the starting guest frame number, - -count indicates how many values are to be considered in the buffer, - -flags is not used and must be 0. - -mask indicates which PGSTE bits are to be considered. - -remaining is not used. - -values points to the buffer in userspace where to store the values. - -This ioctl can fail with -ENOMEM if not enough memory can be allocated to -complete the task, with -ENXIO if CMMA is not enabled, with -EINVAL if -the count field is too large (e.g. more than KVM_S390_CMMA_SIZE_MAX) or -if the flags field was not 0, with -EFAULT if the userspace address is -invalid, if invalid pages are written to (e.g. after the end of memory) -or if no page table is present for the addresses (e.g. when using -hugepages).
- -4.109 KVM_PPC_GET_CPU_CHAR - -Capability: KVM_CAP_PPC_GET_CPU_CHAR -Architectures: powerpc -Type: vm ioctl -Parameters: struct kvm_ppc_cpu_char (out) -Returns: 0 on successful completion - -EFAULT if struct kvm_ppc_cpu_char cannot be written - -This ioctl gives userspace information about certain characteristics -of the CPU relating to speculative execution of instructions and -possible information leakage resulting from speculative execution (see -CVE-2017-5715, CVE-2017-5753 and CVE-2017-5754). The information is -returned in struct kvm_ppc_cpu_char, which looks like this: - -struct kvm_ppc_cpu_char { - __u64 character; /* characteristics of the CPU */ - __u64 behaviour; /* recommended software behaviour */ - __u64 character_mask; /* valid bits in character */ - __u64 behaviour_mask; /* valid bits in behaviour */ -}; - -For extensibility, the character_mask and behaviour_mask fields -indicate which bits of character and behaviour have been filled in by -the kernel. If the set of defined bits is extended in future then -userspace will be able to tell whether it is running on a kernel that -knows about the new bits. - -The character field describes attributes of the CPU which can help -with preventing inadvertent information disclosure - specifically, -whether there is an instruction to flash-invalidate the L1 data cache -(ori 30,30,0 or mtspr SPRN_TRIG2,rN), whether the L1 data cache is set -to a mode where entries can only be used by the thread that created -them, whether the bcctr[l] instruction prevents speculation, and -whether a speculation barrier instruction (ori 31,31,0) is provided. 
- -The behaviour field describes actions that software should take to -prevent inadvertent information disclosure, and thus describes which -vulnerabilities the hardware is subject to; specifically whether the -L1 data cache should be flushed when returning to user mode from the -kernel, and whether a speculation barrier should be placed between an -array bounds check and the array access. - -These fields use the same bit definitions as the new -H_GET_CPU_CHARACTERISTICS hypercall. - -4.110 KVM_MEMORY_ENCRYPT_OP - -Capability: basic -Architectures: x86 -Type: system -Parameters: an opaque platform specific structure (in/out) -Returns: 0 on success; -1 on error - -If the platform supports creating encrypted VMs then this ioctl can be used -for issuing platform-specific memory encryption commands to manage those -encrypted VMs. - -Currently, this ioctl is used for issuing Secure Encrypted Virtualization -(SEV) commands on AMD Processors. The SEV commands are defined in -Documentation/virtual/kvm/amd-memory-encryption.rst. - -4.111 KVM_MEMORY_ENCRYPT_REG_REGION - -Capability: basic -Architectures: x86 -Type: system -Parameters: struct kvm_enc_region (in) -Returns: 0 on success; -1 on error - -This ioctl can be used to register a guest memory region which may -contain encrypted data (e.g. guest RAM, SMRAM etc). - -It is used in the SEV-enabled guest. When encryption is enabled, a guest -memory region may contain encrypted data. The SEV memory encryption -engine uses a tweak such that two identical plaintext pages, each at -different locations will have differing ciphertexts. So swapping or -moving ciphertext of those pages will not result in plaintext being -swapped. So relocating (or migrating) physical backing pages for the SEV -guest will require some additional steps. - -Note: The current SEV key management spec does not provide commands to -swap or migrate (move) ciphertext pages. Hence, for now we pin the guest -memory region registered with the ioctl. 
- -4.112 KVM_MEMORY_ENCRYPT_UNREG_REGION - -Capability: basic -Architectures: x86 -Type: system -Parameters: struct kvm_enc_region (in) -Returns: 0 on success; -1 on error - -This ioctl can be used to unregister the guest memory region registered -with KVM_MEMORY_ENCRYPT_REG_REGION ioctl above. - -4.113 KVM_HYPERV_EVENTFD - -Capability: KVM_CAP_HYPERV_EVENTFD -Architectures: x86 -Type: vm ioctl -Parameters: struct kvm_hyperv_eventfd (in) - -This ioctl (un)registers an eventfd to receive notifications from the guest on -the specified Hyper-V connection id through the SIGNAL_EVENT hypercall, without -causing a user exit. SIGNAL_EVENT hypercall with non-zero event flag number -(bits 24-31) still triggers a KVM_EXIT_HYPERV_HCALL user exit. - -struct kvm_hyperv_eventfd { - __u32 conn_id; - __s32 fd; - __u32 flags; - __u32 padding[3]; -}; - -The conn_id field should fit within 24 bits: - -#define KVM_HYPERV_CONN_ID_MASK 0x00ffffff - -The acceptable values for the flags field are: - -#define KVM_HYPERV_EVENTFD_DEASSIGN (1 << 0) - -Returns: 0 on success, - -EINVAL if conn_id or flags is outside the allowed range - -ENOENT on deassign if the conn_id isn't registered - -EEXIST on assign if the conn_id is already registered - -4.114 KVM_GET_NESTED_STATE - -Capability: KVM_CAP_NESTED_STATE -Architectures: x86 -Type: vcpu ioctl -Parameters: struct kvm_nested_state (in/out) -Returns: 0 on success, -1 on error -Errors: - E2BIG: the total state size exceeds the value of 'size' specified by - the user; the size required will be written into size. - -struct kvm_nested_state { - __u16 flags; - __u16 format; - __u32 size; - - union { - struct kvm_vmx_nested_state_hdr vmx; - struct kvm_svm_nested_state_hdr svm; - - /* Pad the header to 128 bytes. 
*/ - __u8 pad[120]; - } hdr; - - union { - struct kvm_vmx_nested_state_data vmx[0]; - struct kvm_svm_nested_state_data svm[0]; - } data; -}; - -#define KVM_STATE_NESTED_GUEST_MODE 0x00000001 -#define KVM_STATE_NESTED_RUN_PENDING 0x00000002 -#define KVM_STATE_NESTED_EVMCS 0x00000004 - -#define KVM_STATE_NESTED_FORMAT_VMX 0 -#define KVM_STATE_NESTED_FORMAT_SVM 1 - -#define KVM_STATE_NESTED_VMX_VMCS_SIZE 0x1000 - -#define KVM_STATE_NESTED_VMX_SMM_GUEST_MODE 0x00000001 -#define KVM_STATE_NESTED_VMX_SMM_VMXON 0x00000002 - -struct kvm_vmx_nested_state_hdr { - __u64 vmxon_pa; - __u64 vmcs12_pa; - - struct { - __u16 flags; - } smm; -}; - -struct kvm_vmx_nested_state_data { - __u8 vmcs12[KVM_STATE_NESTED_VMX_VMCS_SIZE]; - __u8 shadow_vmcs12[KVM_STATE_NESTED_VMX_VMCS_SIZE]; -}; - -This ioctl copies the vcpu's nested virtualization state from the kernel to -userspace. - -The maximum size of the state can be retrieved by passing KVM_CAP_NESTED_STATE -to the KVM_CHECK_EXTENSION ioctl(). - -4.115 KVM_SET_NESTED_STATE - -Capability: KVM_CAP_NESTED_STATE -Architectures: x86 -Type: vcpu ioctl -Parameters: struct kvm_nested_state (in) -Returns: 0 on success, -1 on error - -This copies the vcpu's kvm_nested_state struct from userspace to the kernel. -For the definition of struct kvm_nested_state, see KVM_GET_NESTED_STATE. - -4.116 KVM_(UN)REGISTER_COALESCED_MMIO - -Capability: KVM_CAP_COALESCED_MMIO (for coalesced mmio) - KVM_CAP_COALESCED_PIO (for coalesced pio) -Architectures: all -Type: vm ioctl -Parameters: struct kvm_coalesced_mmio_zone -Returns: 0 on success, < 0 on error - -Coalesced I/O is a performance optimization that defers hardware -register write emulation so that userspace exits are avoided. It is -typically used to reduce the overhead of emulating frequently accessed -hardware registers. 
- -When a hardware register is configured for coalesced I/O, write accesses -do not exit to userspace and their value is recorded in a ring buffer -that is shared between kernel and userspace. - -Coalesced I/O is used if one or more write accesses to a hardware -register can be deferred until a read or a write to another hardware -register on the same device. This last access will cause a vmexit and -userspace will process accesses from the ring buffer before emulating -it. That will avoid exiting to userspace on repeated writes. - -Coalesced pio is based on coalesced mmio. There is little difference -between coalesced mmio and pio except that coalesced pio records accesses -to I/O ports. - -4.117 KVM_CLEAR_DIRTY_LOG (vm ioctl) - -Capability: KVM_CAP_MANUAL_DIRTY_LOG_PROTECT2 -Architectures: x86, arm, arm64, mips -Type: vm ioctl -Parameters: struct kvm_clear_dirty_log (in) -Returns: 0 on success, -1 on error - -/* for KVM_CLEAR_DIRTY_LOG */ -struct kvm_clear_dirty_log { - __u32 slot; - __u32 num_pages; - __u64 first_page; - union { - void __user *dirty_bitmap; /* one bit per page */ - __u64 padding; - }; -}; - -The ioctl clears the dirty status of pages in a memory slot, according to -the bitmap that is passed in struct kvm_clear_dirty_log's dirty_bitmap -field. Bit 0 of the bitmap corresponds to page "first_page" in the -memory slot, and num_pages is the size in bits of the input bitmap. -first_page must be a multiple of 64; num_pages must also be a multiple of -64 unless first_page + num_pages is the size of the memory slot. For each -bit that is set in the input bitmap, the corresponding page is marked "clean" -in KVM's dirty bitmap, and dirty tracking is re-enabled for that page -(for example via write-protection, or by clearing the dirty bit in -a page table entry). - -If KVM_CAP_MULTI_ADDRESS_SPACE is available, bits 16-31 of the slot field specify -the address space for which you want to clear the dirty status.
-They must be less than the value that KVM_CHECK_EXTENSION returns for -the KVM_CAP_MULTI_ADDRESS_SPACE capability. - -This ioctl is mostly useful when KVM_CAP_MANUAL_DIRTY_LOG_PROTECT2 -is enabled; for more information, see the description of the capability. -However, it can always be used as long as KVM_CHECK_EXTENSION confirms -that KVM_CAP_MANUAL_DIRTY_LOG_PROTECT2 is present. - -4.118 KVM_GET_SUPPORTED_HV_CPUID - -Capability: KVM_CAP_HYPERV_CPUID -Architectures: x86 -Type: vcpu ioctl -Parameters: struct kvm_cpuid2 (in/out) -Returns: 0 on success, -1 on error - -struct kvm_cpuid2 { - __u32 nent; - __u32 padding; - struct kvm_cpuid_entry2 entries[0]; -}; - -struct kvm_cpuid_entry2 { - __u32 function; - __u32 index; - __u32 flags; - __u32 eax; - __u32 ebx; - __u32 ecx; - __u32 edx; - __u32 padding[3]; -}; - -This ioctl returns x86 cpuid feature leaves related to Hyper-V emulation in -KVM. Userspace can use the information returned by this ioctl to construct -cpuid information presented to guests consuming Hyper-V enlightenments (e.g. -Windows or Hyper-V guests). - -CPUID feature leaves returned by this ioctl are defined by Hyper-V Top Level -Functional Specification (TLFS). These leaves can't be obtained with -KVM_GET_SUPPORTED_CPUID ioctl because some of them intersect with KVM feature -leaves (0x40000000, 0x40000001). - -Currently, the following list of CPUID leaves are returned: - HYPERV_CPUID_VENDOR_AND_MAX_FUNCTIONS - HYPERV_CPUID_INTERFACE - HYPERV_CPUID_VERSION - HYPERV_CPUID_FEATURES - HYPERV_CPUID_ENLIGHTMENT_INFO - HYPERV_CPUID_IMPLEMENT_LIMITS - HYPERV_CPUID_NESTED_FEATURES - -HYPERV_CPUID_NESTED_FEATURES leaf is only exposed when Enlightened VMCS was -enabled on the corresponding vCPU (KVM_CAP_HYPERV_ENLIGHTENED_VMCS). - -Userspace invokes KVM_GET_SUPPORTED_HV_CPUID by passing a kvm_cpuid2 structure -with the 'nent' field indicating the number of entries in the variable-size -array 'entries'.
If the number of entries is too low to describe all Hyper-V -feature leaves, an error (E2BIG) is returned. If the number is greater than or equal -to the number of Hyper-V feature leaves, the 'nent' field is adjusted to the -number of valid entries in the 'entries' array, which is then filled. - -'index' and 'flags' fields in 'struct kvm_cpuid_entry2' are currently reserved, -userspace should not expect to get any particular value there. - -4.119 KVM_ARM_VCPU_FINALIZE - -Architectures: arm, arm64 -Type: vcpu ioctl -Parameters: int feature (in) -Returns: 0 on success, -1 on error -Errors: - EPERM: feature not enabled, needs configuration, or already finalized - EINVAL: feature unknown or not present - -Recognised values for feature: - arm64 KVM_ARM_VCPU_SVE (requires KVM_CAP_ARM_SVE) - -Finalizes the configuration of the specified vcpu feature. - -The vcpu must already have been initialised, enabling the affected feature, by -means of a successful KVM_ARM_VCPU_INIT call with the appropriate flag set in -features[]. - -For affected vcpu features, this is a mandatory step that must be performed -before the vcpu is fully usable. - -Between KVM_ARM_VCPU_INIT and KVM_ARM_VCPU_FINALIZE, the feature may be -configured by use of ioctls such as KVM_SET_ONE_REG. The exact configuration -that should be performed and how to do it are feature-dependent. - -Other calls that depend on a particular feature being finalized, such as -KVM_RUN, KVM_GET_REG_LIST, KVM_GET_ONE_REG and KVM_SET_ONE_REG, will fail with --EPERM unless the feature has already been finalized by means of a -KVM_ARM_VCPU_FINALIZE call. - -See KVM_ARM_VCPU_INIT for details of vcpu features that require finalization -using this ioctl.
- -4.120 KVM_SET_PMU_EVENT_FILTER - -Capability: KVM_CAP_PMU_EVENT_FILTER -Architectures: x86 -Type: vm ioctl -Parameters: struct kvm_pmu_event_filter (in) -Returns: 0 on success, -1 on error - -struct kvm_pmu_event_filter { - __u32 action; - __u32 nevents; - __u32 fixed_counter_bitmap; - __u32 flags; - __u32 pad[4]; - __u64 events[0]; -}; - -This ioctl restricts the set of PMU events that the guest can program. -The argument holds a list of events which will be allowed or denied. -The eventsel+umask of each event the guest attempts to program is compared -against the events field to determine whether the guest should have access. -The events field only controls general purpose counters; fixed purpose -counters are controlled by the fixed_counter_bitmap. - -No flags are defined yet, the field must be zero. - -Valid values for 'action': -#define KVM_PMU_EVENT_ALLOW 0 -#define KVM_PMU_EVENT_DENY 1 - - -5. The kvm_run structure ------------------------- - -Application code obtains a pointer to the kvm_run structure by -mmap()ing a vcpu fd. From that point, application code can control -execution by changing fields in kvm_run prior to calling the KVM_RUN -ioctl, and obtain information about the reason KVM_RUN returned by -looking up structure members. - -struct kvm_run { - /* in */ - __u8 request_interrupt_window; - -Request that KVM_RUN return when it becomes possible to inject external -interrupts into the guest. Useful in conjunction with KVM_INTERRUPT. - - __u8 immediate_exit; - -This field is polled once when KVM_RUN starts; if non-zero, KVM_RUN -exits immediately, returning -EINTR. In the common scenario where a -signal is used to "kick" a VCPU out of KVM_RUN, this field can be used -to avoid usage of KVM_SET_SIGNAL_MASK, which has worse scalability. -Rather than blocking the signal outside KVM_RUN, userspace can set up -a signal handler that sets run->immediate_exit to a non-zero value. - -This field is ignored if KVM_CAP_IMMEDIATE_EXIT is not available. 
- - __u8 padding1[6]; - - /* out */ - __u32 exit_reason; - -When KVM_RUN has returned successfully (return value 0), this informs -application code why KVM_RUN has returned. Allowable values for this -field are detailed below. - - __u8 ready_for_interrupt_injection; - -If request_interrupt_window has been specified, this field indicates -an interrupt can be injected now with KVM_INTERRUPT. - - __u8 if_flag; - -The value of the current interrupt flag. Only valid if in-kernel -local APIC is not used. - - __u16 flags; - -More architecture-specific flags detailing state of the VCPU that may -affect the device's behavior. The only currently defined flag is -KVM_RUN_X86_SMM, which is valid on x86 machines and is set if the -VCPU is in system management mode. - - /* in (pre_kvm_run), out (post_kvm_run) */ - __u64 cr8; - -The value of the cr8 register. Only valid if in-kernel local APIC is -not used. Both input and output. - - __u64 apic_base; - -The value of the APIC BASE msr. Only valid if in-kernel local -APIC is not used. Both input and output. - - union { - /* KVM_EXIT_UNKNOWN */ - struct { - __u64 hardware_exit_reason; - } hw; - -If exit_reason is KVM_EXIT_UNKNOWN, the vcpu has exited due to unknown -reasons. Further architecture-specific information is available in -hardware_exit_reason. - - /* KVM_EXIT_FAIL_ENTRY */ - struct { - __u64 hardware_entry_failure_reason; - } fail_entry; - -If exit_reason is KVM_EXIT_FAIL_ENTRY, the vcpu could not be run due -to unknown reasons. Further architecture-specific information is -available in hardware_entry_failure_reason. - - /* KVM_EXIT_EXCEPTION */ - struct { - __u32 exception; - __u32 error_code; - } ex; - -Unused. 
- - /* KVM_EXIT_IO */ - struct { -#define KVM_EXIT_IO_IN 0 -#define KVM_EXIT_IO_OUT 1 - __u8 direction; - __u8 size; /* bytes */ - __u16 port; - __u32 count; - __u64 data_offset; /* relative to kvm_run start */ - } io; - -If exit_reason is KVM_EXIT_IO, then the vcpu has -executed a port I/O instruction which could not be satisfied by kvm. -data_offset describes where the data is located (KVM_EXIT_IO_OUT) or -where kvm expects application code to place the data for the next -KVM_RUN invocation (KVM_EXIT_IO_IN). Data format is a packed array. - - /* KVM_EXIT_DEBUG */ - struct { - struct kvm_debug_exit_arch arch; - } debug; - -If the exit_reason is KVM_EXIT_DEBUG, then a vcpu is processing a debug event -for which architecture specific information is returned. - - /* KVM_EXIT_MMIO */ - struct { - __u64 phys_addr; - __u8 data[8]; - __u32 len; - __u8 is_write; - } mmio; - -If exit_reason is KVM_EXIT_MMIO, then the vcpu has -executed a memory-mapped I/O instruction which could not be satisfied -by kvm. The 'data' member contains the written data if 'is_write' is -true, and should be filled by application code otherwise. - -The 'data' member contains, in its first 'len' bytes, the value as it would -appear if the VCPU performed a load or store of the appropriate width directly -to the byte array. - -NOTE: For KVM_EXIT_IO, KVM_EXIT_MMIO, KVM_EXIT_OSI, KVM_EXIT_PAPR and - KVM_EXIT_EPR the corresponding -operations are complete (and guest state is consistent) only after userspace -has re-entered the kernel with KVM_RUN. The kernel side will first finish -incomplete operations and then check for pending signals. Userspace -can re-enter the guest with an unmasked signal pending to complete -pending operations. - - /* KVM_EXIT_HYPERCALL */ - struct { - __u64 nr; - __u64 args[6]; - __u64 ret; - __u32 longmode; - __u32 pad; - } hypercall; - -Unused. This was once used for 'hypercall to userspace'. 
To implement -such functionality, use KVM_EXIT_IO (x86) or KVM_EXIT_MMIO (all except s390). -Note KVM_EXIT_IO is significantly faster than KVM_EXIT_MMIO. - - /* KVM_EXIT_TPR_ACCESS */ - struct { - __u64 rip; - __u32 is_write; - __u32 pad; - } tpr_access; - -To be documented (KVM_TPR_ACCESS_REPORTING). - - /* KVM_EXIT_S390_SIEIC */ - struct { - __u8 icptcode; - __u64 mask; /* psw upper half */ - __u64 addr; /* psw lower half */ - __u16 ipa; - __u32 ipb; - } s390_sieic; - -s390 specific. - - /* KVM_EXIT_S390_RESET */ -#define KVM_S390_RESET_POR 1 -#define KVM_S390_RESET_CLEAR 2 -#define KVM_S390_RESET_SUBSYSTEM 4 -#define KVM_S390_RESET_CPU_INIT 8 -#define KVM_S390_RESET_IPL 16 - __u64 s390_reset_flags; - -s390 specific. - - /* KVM_EXIT_S390_UCONTROL */ - struct { - __u64 trans_exc_code; - __u32 pgm_code; - } s390_ucontrol; - -s390 specific. A page fault has occurred for a user controlled virtual -machine (KVM_VM_S390_UCONTROL) on its host page table that cannot be -resolved by the kernel. -The program code and the translation exception code that were placed -in the cpu's lowcore are presented here as defined by the z Architecture -Principles of Operation Book in the Chapter for Dynamic Address Translation -(DAT) - - /* KVM_EXIT_DCR */ - struct { - __u32 dcrn; - __u32 data; - __u8 is_write; - } dcr; - -Deprecated - was used for 440 KVM. - - /* KVM_EXIT_OSI */ - struct { - __u64 gprs[32]; - } osi; - -MOL uses a special hypercall interface it calls 'OSI'. To enable it, we catch -hypercalls and exit with this exit struct that contains all the guest gprs. - -If exit_reason is KVM_EXIT_OSI, then the vcpu has triggered such a hypercall. -Userspace can now handle the hypercall and when it's done modify the gprs as -necessary. Upon guest entry all guest GPRs will then be replaced by the values -in this struct.
- /* KVM_EXIT_PAPR_HCALL */ - struct { - __u64 nr; - __u64 ret; - __u64 args[9]; - } papr_hcall; - -This is used on 64-bit PowerPC when emulating a pSeries partition, -e.g. with the 'pseries' machine type in qemu. It occurs when the -guest does a hypercall using the 'sc 1' instruction. The 'nr' field -contains the hypercall number (from the guest R3), and 'args' contains -the arguments (from the guest R4 - R12). Userspace should put the -return code in 'ret' and any extra returned values in args[]. -The possible hypercalls are defined in the Power Architecture Platform -Requirements (PAPR) document available from www.power.org (free -developer registration required to access it). - - /* KVM_EXIT_S390_TSCH */ - struct { - __u16 subchannel_id; - __u16 subchannel_nr; - __u32 io_int_parm; - __u32 io_int_word; - __u32 ipb; - __u8 dequeued; - } s390_tsch; - -s390 specific. This exit occurs when KVM_CAP_S390_CSS_SUPPORT has been enabled -and TEST SUBCHANNEL was intercepted. If dequeued is set, a pending I/O -interrupt for the target subchannel has been dequeued and subchannel_id, -subchannel_nr, io_int_parm and io_int_word contain the parameters for that -interrupt. ipb is needed for instruction parameter decoding. - - /* KVM_EXIT_EPR */ - struct { - __u32 epr; - } epr; - -On FSL BookE PowerPC chips, the interrupt controller has a fast -interrupt acknowledge path to the core. When the core successfully -delivers an interrupt, it automatically populates the EPR register with -the interrupt vector number and acknowledges the interrupt inside -the interrupt controller. - -In case the interrupt controller lives in user space, we need to do -the interrupt acknowledge cycle through it to fetch the next to be -delivered interrupt vector using this exit. - -It gets triggered whenever KVM_CAP_PPC_EPR is enabled and an -external interrupt has just been delivered into the guest. User space -should put the acknowledged interrupt vector into the 'epr' field.
- - /* KVM_EXIT_SYSTEM_EVENT */ - struct { -#define KVM_SYSTEM_EVENT_SHUTDOWN 1 -#define KVM_SYSTEM_EVENT_RESET 2 -#define KVM_SYSTEM_EVENT_CRASH 3 - __u32 type; - __u64 flags; - } system_event; - -If exit_reason is KVM_EXIT_SYSTEM_EVENT then the vcpu has triggered -a system-level event using some architecture specific mechanism (hypercall -or some special instruction). In case of ARM/ARM64, this is triggered using -HVC instruction based PSCI call from the vcpu. The 'type' field describes -the system-level event type. The 'flags' field describes architecture -specific flags for the system-level event. - -Valid values for 'type' are: - KVM_SYSTEM_EVENT_SHUTDOWN -- the guest has requested a shutdown of the - VM. Userspace is not obliged to honour this, and if it does honour - this does not need to destroy the VM synchronously (ie it may call - KVM_RUN again before shutdown finally occurs). - KVM_SYSTEM_EVENT_RESET -- the guest has requested a reset of the VM. - As with SHUTDOWN, userspace can choose to ignore the request, or - to schedule the reset to occur in the future and may call KVM_RUN again. - KVM_SYSTEM_EVENT_CRASH -- the guest crash occurred and the guest - has requested a crash condition maintenance. Userspace can choose - to ignore the request, or to gather VM memory core dump and/or - reset/shutdown of the VM. - - /* KVM_EXIT_IOAPIC_EOI */ - struct { - __u8 vector; - } eoi; - -Indicates that the VCPU's in-kernel local APIC received an EOI for a -level-triggered IOAPIC interrupt. This exit only triggers when the -IOAPIC is implemented in userspace (i.e. KVM_CAP_SPLIT_IRQCHIP is enabled); -the userspace IOAPIC should process the EOI and retrigger the interrupt if -it is still asserted. Vector is the LAPIC interrupt vector for which the -EOI was received. 
- struct kvm_hyperv_exit { -#define KVM_EXIT_HYPERV_SYNIC 1 -#define KVM_EXIT_HYPERV_HCALL 2 - __u32 type; - union { - struct { - __u32 msr; - __u64 control; - __u64 evt_page; - __u64 msg_page; - } synic; - struct { - __u64 input; - __u64 result; - __u64 params[2]; - } hcall; - } u; - }; - /* KVM_EXIT_HYPERV */ - struct kvm_hyperv_exit hyperv; -Indicates that the VCPU exits into userspace to process some tasks -related to Hyper-V emulation. -Valid values for 'type' are: - KVM_EXIT_HYPERV_SYNIC -- synchronously notify user-space about -Hyper-V SynIC state change. Notification is used to remap SynIC -event/message pages and to enable/disable SynIC messages/events processing -in userspace. - - /* Fix the size of the union. */ - char padding[256]; - }; - - /* - * shared registers between kvm and userspace. - * kvm_valid_regs specifies the register classes set by the host - * kvm_dirty_regs specifies the register classes dirtied by userspace - * struct kvm_sync_regs is architecture specific, as well as the - * bits for kvm_valid_regs and kvm_dirty_regs - */ - __u64 kvm_valid_regs; - __u64 kvm_dirty_regs; - union { - struct kvm_sync_regs regs; - char padding[SYNC_REGS_SIZE_BYTES]; - } s; - -If KVM_CAP_SYNC_REGS is defined, these fields allow userspace to access -certain guest registers without having to call SET/GET_*REGS. Thus we can -avoid some system call overhead if userspace has to handle the exit. -Userspace can query the validity of the structure by checking -kvm_valid_regs for specific bits. These bits are architecture specific -and usually define the validity of a group of registers. (e.g. one bit - for general purpose registers) - -Please note that the kernel is allowed to use the kvm_run structure as the -primary storage for certain register types. Therefore, the kernel may use the -values in kvm_run even if the corresponding bit in kvm_dirty_regs is not set. - -}; - - - -6.
Capabilities that can be enabled on vCPUs --------------------------------------------- - -There are certain capabilities that change the behavior of the virtual CPU or -the virtual machine when enabled. To enable them, please see section 4.37. -Below you can find a list of capabilities and what their effect on the vCPU or -the virtual machine is when enabling them. - -The following information is provided along with the description: - - Architectures: which instruction set architectures provide this ioctl. - x86 includes both i386 and x86_64. - - Target: whether this is a per-vcpu or per-vm capability. - - Parameters: what parameters are accepted by the capability. - - Returns: the return value. General error numbers (EBADF, ENOMEM, EINVAL) - are not detailed, but errors with specific meanings are. - - -6.1 KVM_CAP_PPC_OSI - -Architectures: ppc -Target: vcpu -Parameters: none -Returns: 0 on success; -1 on error - -This capability enables interception of OSI hypercalls that otherwise would -be treated as normal system calls to be injected into the guest. OSI hypercalls -were invented by Mac-on-Linux to have a standardized communication mechanism -between the guest and the host. - -When this capability is enabled, KVM_EXIT_OSI can occur. - - -6.2 KVM_CAP_PPC_PAPR - -Architectures: ppc -Target: vcpu -Parameters: none -Returns: 0 on success; -1 on error - -This capability enables interception of PAPR hypercalls. PAPR hypercalls are -done using the hypercall instruction "sc 1". - -It also sets the guest privilege level to "supervisor" mode. Usually the guest -runs in "hypervisor" privilege mode with a few missing features. - -In addition to the above, it changes the semantics of SDR1. In this mode, the -HTAB address part of SDR1 contains an HVA instead of a GPA, as PAPR keeps the -HTAB invisible to the guest. - -When this capability is enabled, KVM_EXIT_PAPR_HCALL can occur. 
- -6.3 KVM_CAP_SW_TLB - -Architectures: ppc -Target: vcpu -Parameters: args[0] is the address of a struct kvm_config_tlb -Returns: 0 on success; -1 on error - -struct kvm_config_tlb { - __u64 params; - __u64 array; - __u32 mmu_type; - __u32 array_len; -}; - -Configures the virtual CPU's TLB array, establishing a shared memory area -between userspace and KVM. The "params" and "array" fields are userspace -addresses of mmu-type-specific data structures. The "array_len" field is a -safety mechanism, and should be set to the size in bytes of the memory that -userspace has reserved for the array. It must be at least the size dictated -by "mmu_type" and "params". - -While KVM_RUN is active, the shared region is under control of KVM. Its -contents are undefined, and any modification by userspace results in -boundedly undefined behavior. - -On return from KVM_RUN, the shared region will reflect the current state of -the guest's TLB. If userspace makes any changes, it must call KVM_DIRTY_TLB -to tell KVM which entries have been changed, prior to calling KVM_RUN again -on this vcpu. - -For mmu types KVM_MMU_FSL_BOOKE_NOHV and KVM_MMU_FSL_BOOKE_HV: - - The "params" field is of type "struct kvm_book3e_206_tlb_params". - - The "array" field points to an array of type "struct - kvm_book3e_206_tlb_entry". - - The array consists of all entries in the first TLB, followed by all - entries in the second TLB. - - Within a TLB, entries are ordered first by increasing set number. Within a - set, entries are ordered by way (increasing ESEL). - - The hash for determining set number in TLB0 is: (MAS2 >> 12) & (num_sets - 1) - where "num_sets" is the tlb_sizes[] value divided by the tlb_ways[] value. - - The tsize field of mas1 shall be set to 4K on TLB0, even though the - hardware ignores this value for TLB0.
- -6.4 KVM_CAP_S390_CSS_SUPPORT - -Architectures: s390 -Target: vcpu -Parameters: none -Returns: 0 on success; -1 on error - -This capability enables support for handling of channel I/O instructions. - -TEST PENDING INTERRUPTION and the interrupt portion of TEST SUBCHANNEL are -handled in-kernel, while the other I/O instructions are passed to userspace. - -When this capability is enabled, KVM_EXIT_S390_TSCH will occur on TEST -SUBCHANNEL intercepts. - -Note that even though this capability is enabled per-vcpu, the complete -virtual machine is affected. - -6.5 KVM_CAP_PPC_EPR - -Architectures: ppc -Target: vcpu -Parameters: args[0] defines whether the proxy facility is active -Returns: 0 on success; -1 on error - -This capability enables or disables the delivery of interrupts through the -external proxy facility. - -When enabled (args[0] != 0), every time the guest gets an external interrupt -delivered, it automatically exits into user space with a KVM_EXIT_EPR exit -to receive the topmost interrupt vector. - -When disabled (args[0] == 0), behavior is as if this facility is unsupported. - -When this capability is enabled, KVM_EXIT_EPR can occur. - -6.6 KVM_CAP_IRQ_MPIC - -Architectures: ppc -Parameters: args[0] is the MPIC device fd - args[1] is the MPIC CPU number for this vcpu - -This capability connects the vcpu to an in-kernel MPIC device. - -6.7 KVM_CAP_IRQ_XICS - -Architectures: ppc -Target: vcpu -Parameters: args[0] is the XICS device fd - args[1] is the XICS CPU number (server ID) for this vcpu - -This capability connects the vcpu to an in-kernel XICS device. - -6.8 KVM_CAP_S390_IRQCHIP - -Architectures: s390 -Target: vm -Parameters: none - -This capability enables the in-kernel irqchip for s390. Please refer to -"4.24 KVM_CREATE_IRQCHIP" for details. - -6.9 KVM_CAP_MIPS_FPU - -Architectures: mips -Target: vcpu -Parameters: args[0] is reserved for future use (should be 0). - -This capability allows the use of the host Floating Point Unit by the guest. 
It -allows the Config1.FP bit to be set to enable the FPU in the guest. Once this is -done the KVM_REG_MIPS_FPR_* and KVM_REG_MIPS_FCR_* registers can be accessed -(depending on the current guest FPU register mode), and the Status.FR, -Config5.FRE bits are accessible via the KVM API and also from the guest, -depending on them being supported by the FPU. - -6.10 KVM_CAP_MIPS_MSA - -Architectures: mips -Target: vcpu -Parameters: args[0] is reserved for future use (should be 0). - -This capability allows the use of the MIPS SIMD Architecture (MSA) by the guest. -It allows the Config3.MSAP bit to be set to enable the use of MSA by the guest. -Once this is done the KVM_REG_MIPS_VEC_* and KVM_REG_MIPS_MSA_* registers can be -accessed, and the Config5.MSAEn bit is accessible via the KVM API and also from -the guest. - -6.74 KVM_CAP_SYNC_REGS -Architectures: s390, x86 -Target: s390: always enabled, x86: vcpu -Parameters: none -Returns: x86: KVM_CHECK_EXTENSION returns a bit-array indicating which register -sets are supported (bitfields defined in arch/x86/include/uapi/asm/kvm.h). - -As described above in the kvm_sync_regs struct info in section 5 (kvm_run): -KVM_CAP_SYNC_REGS "allow[s] userspace to access certain guest registers -without having to call SET/GET_*REGS". This reduces overhead by eliminating -repeated ioctl calls for setting and/or getting register values. This is -particularly important when userspace is making synchronous guest state -modifications, e.g. when emulating and/or intercepting instructions in -userspace. - -For s390 specifics, please refer to the source code. - -For x86: -- the register sets to be copied out to kvm_run are selectable - by userspace (rather than all sets being copied out for every exit). -- vcpu_events are available in addition to regs and sregs.
- -For x86, the 'kvm_valid_regs' field of struct kvm_run is overloaded to -function as an input bit-array field set by userspace to indicate the -specific register sets to be copied out on the next exit. - -To indicate when userspace has modified values that should be copied into -the vCPU, the all architecture bitarray field, 'kvm_dirty_regs' must be set. -This is done using the same bitflags as for the 'kvm_valid_regs' field. -If the dirty bit is not set, then the register set values will not be copied -into the vCPU even if they've been modified. - -Unused bitfields in the bitarrays must be set to zero. - -struct kvm_sync_regs { - struct kvm_regs regs; - struct kvm_sregs sregs; - struct kvm_vcpu_events events; -}; - -6.75 KVM_CAP_PPC_IRQ_XIVE - -Architectures: ppc -Target: vcpu -Parameters: args[0] is the XIVE device fd - args[1] is the XIVE CPU number (server ID) for this vcpu - -This capability connects the vcpu to an in-kernel XIVE device. - -7. Capabilities that can be enabled on VMs ------------------------------------------- - -There are certain capabilities that change the behavior of the virtual -machine when enabled. To enable them, please see section 4.37. Below -you can find a list of capabilities and what their effect on the VM -is when enabling them. - -The following information is provided along with the description: - - Architectures: which instruction set architectures provide this ioctl. - x86 includes both i386 and x86_64. - - Parameters: what parameters are accepted by the capability. - - Returns: the return value. General error numbers (EBADF, ENOMEM, EINVAL) - are not detailed, but errors with specific meanings are. - - -7.1 KVM_CAP_PPC_ENABLE_HCALL - -Architectures: ppc -Parameters: args[0] is the sPAPR hcall number - args[1] is 0 to disable, 1 to enable in-kernel handling - -This capability controls whether individual sPAPR hypercalls (hcalls) -get handled by the kernel or not. 
Enabling or disabling in-kernel -handling of an hcall is effective across the VM. On creation, an -initial set of hcalls are enabled for in-kernel handling, which -consists of those hcalls for which in-kernel handlers were implemented -before this capability was implemented. If disabled, the kernel will -not attempt to handle the hcall, but will always exit to userspace -to handle it. Note that it may not make sense to enable some and -disable others of a group of related hcalls, but KVM does not prevent -userspace from doing that. - -If the hcall number specified is not one that has an in-kernel -implementation, the KVM_ENABLE_CAP ioctl will fail with an EINVAL -error. - -7.2 KVM_CAP_S390_USER_SIGP - -Architectures: s390 -Parameters: none - -This capability controls which SIGP orders will be handled completely in user -space. With this capability enabled, all fast orders will be handled completely -in the kernel: -- SENSE -- SENSE RUNNING -- EXTERNAL CALL -- EMERGENCY SIGNAL -- CONDITIONAL EMERGENCY SIGNAL - -All other orders will be handled completely in user space. - -Only privileged operation exceptions will be checked for in the kernel (or even -in the hardware prior to interception). If this capability is not enabled, the -old way of handling SIGP orders is used (partially in kernel and user space). - -7.3 KVM_CAP_S390_VECTOR_REGISTERS - -Architectures: s390 -Parameters: none -Returns: 0 on success, negative value on error - -Allows use of the vector registers introduced with z13 processor, and -provides for the synchronization between host and user space. Will -return -EINVAL if the machine does not support vectors. - -7.4 KVM_CAP_S390_USER_STSI - -Architectures: s390 -Parameters: none - -This capability allows post-handlers for the STSI instruction. After -initial handling in the kernel, KVM exits to user space with -KVM_EXIT_S390_STSI to allow user space to insert further data.
- -Before exiting to userspace, kvm handlers should fill in s390_stsi field of -vcpu->run: -struct { - __u64 addr; - __u8 ar; - __u8 reserved; - __u8 fc; - __u8 sel1; - __u16 sel2; -} s390_stsi; - -@addr - guest address of STSI SYSIB -@fc - function code -@sel1 - selector 1 -@sel2 - selector 2 -@ar - access register number - -KVM handlers should exit to userspace with rc = -EREMOTE. - -7.5 KVM_CAP_SPLIT_IRQCHIP - -Architectures: x86 -Parameters: args[0] - number of routes reserved for userspace IOAPICs -Returns: 0 on success, -1 on error - -Create a local apic for each processor in the kernel. This can be used -instead of KVM_CREATE_IRQCHIP if the userspace VMM wishes to emulate the -IOAPIC and PIC (and also the PIT, even though this has to be enabled -separately). - -This capability also enables in kernel routing of interrupt requests; -when KVM_CAP_SPLIT_IRQCHIP only routes of KVM_IRQ_ROUTING_MSI type are -used in the IRQ routing table. The first args[0] MSI routes are reserved -for the IOAPIC pins. Whenever the LAPIC receives an EOI for these routes, -a KVM_EXIT_IOAPIC_EOI vmexit will be reported to userspace. - -Fails if VCPU has already been created, or if the irqchip is already in the -kernel (i.e. KVM_CREATE_IRQCHIP has already been called). - -7.6 KVM_CAP_S390_RI - -Architectures: s390 -Parameters: none - -Allows use of runtime-instrumentation introduced with zEC12 processor. -Will return -EINVAL if the machine does not support runtime-instrumentation. -Will return -EBUSY if a VCPU has already been created. 
- -7.7 KVM_CAP_X2APIC_API - -Architectures: x86 -Parameters: args[0] - features that should be enabled -Returns: 0 on success, -EINVAL when args[0] contains invalid features - -Valid feature flags in args[0] are - -#define KVM_X2APIC_API_USE_32BIT_IDS (1ULL << 0) -#define KVM_X2APIC_API_DISABLE_BROADCAST_QUIRK (1ULL << 1) - -Enabling KVM_X2APIC_API_USE_32BIT_IDS changes the behavior of -KVM_SET_GSI_ROUTING, KVM_SIGNAL_MSI, KVM_SET_LAPIC, and KVM_GET_LAPIC, -allowing the use of 32-bit APIC IDs. See KVM_CAP_X2APIC_API in their -respective sections. - -KVM_X2APIC_API_DISABLE_BROADCAST_QUIRK must be enabled for x2APIC to work -in logical mode or with more than 255 VCPUs. Otherwise, KVM treats 0xff -as a broadcast even in x2APIC mode in order to support physical x2APIC -without interrupt remapping. This is undesirable in logical mode, -where 0xff represents CPUs 0-7 in cluster 0. - -7.8 KVM_CAP_S390_USER_INSTR0 - -Architectures: s390 -Parameters: none - -With this capability enabled, all illegal instructions 0x0000 (2 bytes) will -be intercepted and forwarded to user space. User space can use this -mechanism e.g. to realize 2-byte software breakpoints. The kernel will -not inject an operating exception for these instructions, user space has -to take care of that. - -This capability can be enabled dynamically even if VCPUs were already -created and are running. - -7.9 KVM_CAP_S390_GS - -Architectures: s390 -Parameters: none -Returns: 0 on success; -EINVAL if the machine does not support - guarded storage; -EBUSY if a VCPU has already been created. - -Allows use of guarded storage for the KVM guest. - -7.10 KVM_CAP_S390_AIS - -Architectures: s390 -Parameters: none - -Allow use of adapter-interruption suppression. -Returns: 0 on success; -EBUSY if a VCPU has already been created. - -7.11 KVM_CAP_PPC_SMT - -Architectures: ppc -Parameters: vsmt_mode, flags - -Enabling this capability on a VM provides userspace with a way to set -the desired virtual SMT mode (i.e. 
the number of virtual CPUs per -virtual core). The virtual SMT mode, vsmt_mode, must be a power of 2 -between 1 and 8. On POWER8, vsmt_mode must also be no greater than -the number of threads per subcore for the host. Currently flags must -be 0. A successful call to enable this capability will result in -vsmt_mode being returned when the KVM_CAP_PPC_SMT capability is -subsequently queried for the VM. This capability is only supported by -HV KVM, and can only be set before any VCPUs have been created. -The KVM_CAP_PPC_SMT_POSSIBLE capability indicates which virtual SMT -modes are available. - -7.12 KVM_CAP_PPC_FWNMI - -Architectures: ppc -Parameters: none - -With this capability a machine check exception in the guest address -space will cause KVM to exit the guest with NMI exit reason. This -enables QEMU to build error log and branch to guest kernel registered -machine check handling routine. Without this capability KVM will -branch to guests' 0x200 interrupt vector. - -7.13 KVM_CAP_X86_DISABLE_EXITS - -Architectures: x86 -Parameters: args[0] defines which exits are disabled -Returns: 0 on success, -EINVAL when args[0] contains invalid exits - -Valid bits in args[0] are - -#define KVM_X86_DISABLE_EXITS_MWAIT (1 << 0) -#define KVM_X86_DISABLE_EXITS_HLT (1 << 1) -#define KVM_X86_DISABLE_EXITS_PAUSE (1 << 2) -#define KVM_X86_DISABLE_EXITS_CSTATE (1 << 3) - -Enabling this capability on a VM provides userspace with a way to no -longer intercept some instructions for improved latency in some -workloads, and is suggested when vCPUs are associated to dedicated -physical CPUs. More bits can be added in the future; userspace can -just pass the KVM_CHECK_EXTENSION result to KVM_ENABLE_CAP to disable -all such vmexits. - -Do not enable KVM_FEATURE_PV_UNHALT if you disable HLT exits. 
- -7.14 KVM_CAP_S390_HPAGE_1M - -Architectures: s390 -Parameters: none -Returns: 0 on success, -EINVAL if hpage module parameter was not set - or cmma is enabled, or the VM has the KVM_VM_S390_UCONTROL - flag set - -With this capability the KVM support for memory backing with 1m pages -through hugetlbfs can be enabled for a VM. After the capability is -enabled, cmma can't be enabled anymore and pfmfi and the storage key -interpretation are disabled. If cmma has already been enabled or the -hpage module parameter is not set to 1, -EINVAL is returned. - -While it is generally possible to create a huge page backed VM without -this capability, the VM will not be able to run. - -7.15 KVM_CAP_MSR_PLATFORM_INFO - -Architectures: x86 -Parameters: args[0] whether feature should be enabled or not - -With this capability, a guest may read the MSR_PLATFORM_INFO MSR. Otherwise, -a #GP would be raised when the guest tries to access. Currently, this -capability does not enable write permissions of this MSR for the guest. - -7.16 KVM_CAP_PPC_NESTED_HV - -Architectures: ppc -Parameters: none -Returns: 0 on success, -EINVAL when the implementation doesn't support - nested-HV virtualization. - -HV-KVM on POWER9 and later systems allows for "nested-HV" -virtualization, which provides a way for a guest VM to run guests that -can run using the CPU's supervisor mode (privileged non-hypervisor -state). Enabling this capability on a VM depends on the CPU having -the necessary functionality and on the facility being enabled with a -kvm-hv module parameter. - -7.17 KVM_CAP_EXCEPTION_PAYLOAD - -Architectures: x86 -Parameters: args[0] whether feature should be enabled or not - -With this capability enabled, CR2 will not be modified prior to the -emulated VM-exit when L1 intercepts a #PF exception that occurs in -L2. Similarly, for kvm-intel only, DR6 will not be modified prior to -the emulated VM-exit when L1 intercepts a #DB exception that occurs in -L2. 
As a result, when KVM_GET_VCPU_EVENTS reports a pending #PF (or -#DB) exception for L2, exception.has_payload will be set and the -faulting address (or the new DR6 bits*) will be reported in the -exception_payload field. Similarly, when userspace injects a #PF (or -#DB) into L2 using KVM_SET_VCPU_EVENTS, it is expected to set -exception.has_payload and to put the faulting address (or the new DR6 -bits*) in the exception_payload field. - -This capability also enables exception.pending in struct -kvm_vcpu_events, which allows userspace to distinguish between pending -and injected exceptions. - - -* For the new DR6 bits, note that bit 16 is set iff the #DB exception - will clear DR6.RTM. - -7.18 KVM_CAP_MANUAL_DIRTY_LOG_PROTECT2 - -Architectures: x86, arm, arm64, mips -Parameters: args[0] whether feature should be enabled or not - -With this capability enabled, KVM_GET_DIRTY_LOG will not automatically -clear and write-protect all pages that are returned as dirty. -Rather, userspace will have to do this operation separately using -KVM_CLEAR_DIRTY_LOG. - -At the cost of a slightly more complicated operation, this provides better -scalability and responsiveness for two reasons. First, -KVM_CLEAR_DIRTY_LOG ioctl can operate on a 64-page granularity rather -than requiring to sync a full memslot; this ensures that KVM does not -take spinlocks for an extended period of time. Second, in some cases a -large amount of time can pass between a call to KVM_GET_DIRTY_LOG and -userspace actually using the data in the page. Pages can be modified -during this time, which is inefficient for both the guest and userspace: -the guest will incur a higher penalty due to write protection faults, -while userspace can see false reports of dirty pages. Manual reprotection -helps reduce this time, improving guest performance and reducing the -number of dirty log false positives.
- -KVM_CAP_MANUAL_DIRTY_LOG_PROTECT2 was previously available under the name -KVM_CAP_MANUAL_DIRTY_LOG_PROTECT, but the implementation had bugs that make -it hard or impossible to use it correctly. The availability of -KVM_CAP_MANUAL_DIRTY_LOG_PROTECT2 signals that those bugs are fixed. -Userspace should not try to use KVM_CAP_MANUAL_DIRTY_LOG_PROTECT. - -8. Other capabilities. ----------------------- - -This section lists capabilities that give information about other -features of the KVM implementation. - -8.1 KVM_CAP_PPC_HWRNG - -Architectures: ppc - -This capability, if KVM_CHECK_EXTENSION indicates that it is -available, means that the kernel has an implementation of the -H_RANDOM hypercall backed by a hardware random-number generator. -If present, the kernel H_RANDOM handler can be enabled for guest use -with the KVM_CAP_PPC_ENABLE_HCALL capability. - -8.2 KVM_CAP_HYPERV_SYNIC - -Architectures: x86 -This capability, if KVM_CHECK_EXTENSION indicates that it is -available, means that the kernel has an implementation of the -Hyper-V Synthetic interrupt controller(SynIC). Hyper-V SynIC is -used to support Windows Hyper-V based guest paravirt drivers(VMBus). - -In order to use SynIC, it has to be activated by setting this -capability via KVM_ENABLE_CAP ioctl on the vcpu fd. Note that this -will disable the use of APIC hardware virtualization even if supported -by the CPU, as it's incompatible with SynIC auto-EOI behavior. - -8.3 KVM_CAP_PPC_RADIX_MMU - -Architectures: ppc - -This capability, if KVM_CHECK_EXTENSION indicates that it is -available, means that the kernel can support guests using the -radix MMU defined in Power ISA V3.00 (as implemented in the POWER9 -processor).
- -8.4 KVM_CAP_PPC_HASH_MMU_V3 - -Architectures: ppc - -This capability, if KVM_CHECK_EXTENSION indicates that it is -available, means that the kernel can support guests using the -hashed page table MMU defined in Power ISA V3.00 (as implemented in -the POWER9 processor), including in-memory segment tables. - -8.5 KVM_CAP_MIPS_VZ - -Architectures: mips - -This capability, if KVM_CHECK_EXTENSION on the main kvm handle indicates that -it is available, means that full hardware assisted virtualization capabilities -of the hardware are available for use through KVM. An appropriate -KVM_VM_MIPS_* type must be passed to KVM_CREATE_VM to create a VM which -utilises it. - -If KVM_CHECK_EXTENSION on a kvm VM handle indicates that this capability is -available, it means that the VM is using full hardware assisted virtualization -capabilities of the hardware. This is useful to check after creating a VM with -KVM_VM_MIPS_DEFAULT. - -The value returned by KVM_CHECK_EXTENSION should be compared against known -values (see below). All other values are reserved. This is to allow for the -possibility of other hardware assisted virtualization implementations which -may be incompatible with the MIPS VZ ASE. - - 0: The trap & emulate implementation is in use to run guest code in user - mode. Guest virtual memory segments are rearranged to fit the guest in the - user mode address space. - - 1: The MIPS VZ ASE is in use, providing full hardware assisted - virtualization, including standard guest virtual memory segments. - -8.6 KVM_CAP_MIPS_TE - -Architectures: mips - -This capability, if KVM_CHECK_EXTENSION on the main kvm handle indicates that -it is available, means that the trap & emulate implementation is available to -run guest code in user mode, even if KVM_CAP_MIPS_VZ indicates that hardware -assisted virtualisation is also available. KVM_VM_MIPS_TE (0) must be passed -to KVM_CREATE_VM to create a VM which utilises it.
- -If KVM_CHECK_EXTENSION on a kvm VM handle indicates that this capability is -available, it means that the VM is using trap & emulate. - -8.7 KVM_CAP_MIPS_64BIT - -Architectures: mips - -This capability indicates the supported architecture type of the guest, i.e. the -supported register and address width. - -The values returned when this capability is checked by KVM_CHECK_EXTENSION on a -kvm VM handle correspond roughly to the CP0_Config.AT register field, and should -be checked specifically against known values (see below). All other values are -reserved. - - 0: MIPS32 or microMIPS32. - Both registers and addresses are 32-bits wide. - It will only be possible to run 32-bit guest code. - - 1: MIPS64 or microMIPS64 with access only to 32-bit compatibility segments. - Registers are 64-bits wide, but addresses are 32-bits wide. - 64-bit guest code may run but cannot access MIPS64 memory segments. - It will also be possible to run 32-bit guest code. - - 2: MIPS64 or microMIPS64 with access to all address segments. - Both registers and addresses are 64-bits wide. - It will be possible to run 64-bit or 32-bit guest code. - -8.9 KVM_CAP_ARM_USER_IRQ - -Architectures: arm, arm64 -This capability, if KVM_CHECK_EXTENSION indicates that it is available, means -that if userspace creates a VM without an in-kernel interrupt controller, it -will be notified of changes to the output level of in-kernel emulated devices, -which can generate virtual interrupts, presented to the VM. -For such VMs, on every return to userspace, the kernel -updates the vcpu's run->s.regs.device_irq_level field to represent the actual -output level of the device. - -Whenever kvm detects a change in the device output level, kvm guarantees at -least one return to userspace before running the VM. This exit could either -be a KVM_EXIT_INTR or any other exit event, like KVM_EXIT_MMIO. 
This way, -userspace can always sample the device output level and re-compute the state of -the userspace interrupt controller. Userspace should always check the state -of run->s.regs.device_irq_level on every kvm exit. -The value in run->s.regs.device_irq_level can represent both level and edge -triggered interrupt signals, depending on the device. Edge triggered interrupt -signals will exit to userspace with the bit in run->s.regs.device_irq_level -set exactly once per edge signal. - -The field run->s.regs.device_irq_level is available independent of -run->kvm_valid_regs or run->kvm_dirty_regs bits. - -If KVM_CAP_ARM_USER_IRQ is supported, the KVM_CHECK_EXTENSION ioctl returns a -number larger than 0 indicating the version of this capability that is implemented -and thereby which bits in run->s.regs.device_irq_level can signal values. - -Currently the following bits are defined for the device_irq_level bitmap: - - KVM_CAP_ARM_USER_IRQ >= 1: - - KVM_ARM_DEV_EL1_VTIMER - EL1 virtual timer - KVM_ARM_DEV_EL1_PTIMER - EL1 physical timer - KVM_ARM_DEV_PMU - ARM PMU overflow interrupt signal - -Future versions of kvm may implement additional events. These will get -indicated by returning a higher number from KVM_CHECK_EXTENSION and will be -listed above. - -8.10 KVM_CAP_PPC_SMT_POSSIBLE - -Architectures: ppc - -Querying this capability returns a bitmap indicating the possible -virtual SMT modes that can be set using KVM_CAP_PPC_SMT. If bit N -(counting from the right) is set, then a virtual SMT mode of 2^N is -available. - -8.11 KVM_CAP_HYPERV_SYNIC2 - -Architectures: x86 - -This capability enables a newer version of Hyper-V Synthetic interrupt -controller (SynIC). The only difference with KVM_CAP_HYPERV_SYNIC is that KVM -doesn't clear SynIC message and event flags pages when they are enabled by -writing to the respective MSRs. - -8.12 KVM_CAP_HYPERV_VP_INDEX - -Architectures: x86 - -This capability indicates that userspace can load HV_X64_MSR_VP_INDEX msr.
Its -value is used to denote the target vcpu for a SynIC interrupt. For -compatibility, KVM initializes this msr to KVM's internal vcpu index. When this -capability is absent, userspace can still query this msr's value. - -8.13 KVM_CAP_S390_AIS_MIGRATION - -Architectures: s390 -Parameters: none - -This capability indicates if the flic device will be able to get/set the -AIS states for migration via the KVM_DEV_FLIC_AISM_ALL attribute and allows -to discover this without having to create a flic device. - -8.14 KVM_CAP_S390_PSW - -Architectures: s390 - -This capability indicates that the PSW is exposed via the kvm_run structure. - -8.15 KVM_CAP_S390_GMAP - -Architectures: s390 - -This capability indicates that the user space memory used as guest mapping can -be anywhere in the user memory address space, as long as the memory slots are -aligned and sized to a segment (1MB) boundary. - -8.16 KVM_CAP_S390_COW - -Architectures: s390 - -This capability indicates that the user space memory used as guest mapping can -use copy-on-write semantics as well as dirty pages tracking via read-only page -tables. - -8.17 KVM_CAP_S390_BPB - -Architectures: s390 - -This capability indicates that kvm will implement the interfaces to handle -reset, migration and nested KVM for branch prediction blocking. The stfle -facility 82 should not be provided to the guest without this capability. - -8.18 KVM_CAP_HYPERV_TLBFLUSH - -Architectures: x86 - -This capability indicates that KVM supports paravirtualized Hyper-V TLB Flush -hypercalls: -HvFlushVirtualAddressSpace, HvFlushVirtualAddressSpaceEx, -HvFlushVirtualAddressList, HvFlushVirtualAddressListEx. - -8.19 KVM_CAP_ARM_INJECT_SERROR_ESR - -Architectures: arm, arm64 - -This capability indicates that userspace can specify (via the -KVM_SET_VCPU_EVENTS ioctl) the syndrome value reported to the guest when it -takes a virtual SError interrupt exception.
-If KVM advertises this capability, userspace can only specify the ISS field for -the ESR syndrome. Other parts of the ESR, such as the EC are generated by the -CPU when the exception is taken. If this virtual SError is taken to EL1 using -AArch64, this value will be reported in the ISS field of ESR_ELx. - -See KVM_CAP_VCPU_EVENTS for more details. -8.20 KVM_CAP_HYPERV_SEND_IPI - -Architectures: x86 - -This capability indicates that KVM supports paravirtualized Hyper-V IPI send -hypercalls: -HvCallSendSyntheticClusterIpi, HvCallSendSyntheticClusterIpiEx. diff --git a/Documentation/virtual/kvm/arm/hyp-abi.txt b/Documentation/virtual/kvm/arm/hyp-abi.txt deleted file mode 100644 index a20a0bee268d..000000000000 --- a/Documentation/virtual/kvm/arm/hyp-abi.txt +++ /dev/null @@ -1,53 +0,0 @@ -* Internal ABI between the kernel and HYP - -This file documents the interaction between the Linux kernel and the -hypervisor layer when running Linux as a hypervisor (for example -KVM). It doesn't cover the interaction of the kernel with the -hypervisor when running as a guest (under Xen, KVM or any other -hypervisor), or any hypervisor-specific interaction when the kernel is -used as a host. - -On arm and arm64 (without VHE), the kernel doesn't run in hypervisor -mode, but still needs to interact with it, allowing a built-in -hypervisor to be either installed or torn down. - -In order to achieve this, the kernel must be booted at HYP (arm) or -EL2 (arm64), allowing it to install a set of stubs before dropping to -SVC/EL1. These stubs are accessible by using a 'hvc #0' instruction, -and only act on individual CPUs. - -Unless specified otherwise, any built-in hypervisor must implement -these functions (see arch/arm{,64}/include/asm/virt.h): - -* r0/x0 = HVC_SET_VECTORS - r1/x1 = vectors - - Set HVBAR/VBAR_EL2 to 'vectors' to enable a hypervisor. 'vectors' - must be a physical address, and respect the alignment requirements - of the architecture. 
Only implemented by the initial stubs, not by - Linux hypervisors. - -* r0/x0 = HVC_RESET_VECTORS - - Turn HYP/EL2 MMU off, and reset HVBAR/VBAR_EL2 to the initials - stubs' exception vector value. This effectively disables an existing - hypervisor. - -* r0/x0 = HVC_SOFT_RESTART - r1/x1 = restart address - x2 = x0's value when entering the next payload (arm64) - x3 = x1's value when entering the next payload (arm64) - x4 = x2's value when entering the next payload (arm64) - - Mask all exceptions, disable the MMU, move the arguments into place - (arm64 only), and jump to the restart address while at HYP/EL2. This - hypercall is not expected to return to its caller. - -Any other value of r0/x0 triggers a hypervisor-specific handling, -which is not documented here. - -The return value of a stub hypercall is held by r0/x0, and is 0 on -success, and HVC_STUB_ERR on error. A stub hypercall is allowed to -clobber any of the caller-saved registers (x0-x18 on arm64, r0-r3 and -ip on arm). It is thus recommended to use a function call to perform -the hypercall. diff --git a/Documentation/virtual/kvm/arm/psci.txt b/Documentation/virtual/kvm/arm/psci.txt deleted file mode 100644 index 559586fc9d37..000000000000 --- a/Documentation/virtual/kvm/arm/psci.txt +++ /dev/null @@ -1,61 +0,0 @@ -KVM implements the PSCI (Power State Coordination Interface) -specification in order to provide services such as CPU on/off, reset -and power-off to the guest. - -The PSCI specification is regularly updated to provide new features, -and KVM implements these updates if they make sense from a virtualization -point of view. - -This means that a guest booted on two different versions of KVM can -observe two different "firmware" revisions. This could cause issues if -a given guest is tied to a particular PSCI revision (unlikely), or if -a migration causes a different PSCI version to be exposed out of the -blue to an unsuspecting guest. 
- -In order to remedy this situation, KVM exposes a set of "firmware -pseudo-registers" that can be manipulated using the GET/SET_ONE_REG -interface. These registers can be saved/restored by userspace, and set -to a convenient value if required. - -The following register is defined: - -* KVM_REG_ARM_PSCI_VERSION: - - - Only valid if the vcpu has the KVM_ARM_VCPU_PSCI_0_2 feature set - (and thus has already been initialized) - - Returns the current PSCI version on GET_ONE_REG (defaulting to the - highest PSCI version implemented by KVM and compatible with v0.2) - - Allows any PSCI version implemented by KVM and compatible with - v0.2 to be set with SET_ONE_REG - - Affects the whole VM (even if the register view is per-vcpu) - -* KVM_REG_ARM_SMCCC_ARCH_WORKAROUND_1: - Holds the state of the firmware support to mitigate CVE-2017-5715, as - offered by KVM to the guest via a HVC call. The workaround is described - under SMCCC_ARCH_WORKAROUND_1 in [1]. - Accepted values are: - KVM_REG_ARM_SMCCC_ARCH_WORKAROUND_1_NOT_AVAIL: KVM does not offer - firmware support for the workaround. The mitigation status for the - guest is unknown. - KVM_REG_ARM_SMCCC_ARCH_WORKAROUND_1_AVAIL: The workaround HVC call is - available to the guest and required for the mitigation. - KVM_REG_ARM_SMCCC_ARCH_WORKAROUND_1_NOT_REQUIRED: The workaround HVC call - is available to the guest, but it is not needed on this VCPU. - -* KVM_REG_ARM_SMCCC_ARCH_WORKAROUND_2: - Holds the state of the firmware support to mitigate CVE-2018-3639, as - offered by KVM to the guest via a HVC call. The workaround is described - under SMCCC_ARCH_WORKAROUND_2 in [1]. - Accepted values are: - KVM_REG_ARM_SMCCC_ARCH_WORKAROUND_2_NOT_AVAIL: A workaround is not - available. KVM does not offer firmware support for the workaround. - KVM_REG_ARM_SMCCC_ARCH_WORKAROUND_2_UNKNOWN: The workaround state is - unknown. KVM does not offer firmware support for the workaround. 
- KVM_REG_ARM_SMCCC_ARCH_WORKAROUND_2_AVAIL: The workaround is available, - and can be disabled by a vCPU. If - KVM_REG_ARM_SMCCC_ARCH_WORKAROUND_2_ENABLED is set, it is active for - this vCPU. - KVM_REG_ARM_SMCCC_ARCH_WORKAROUND_2_NOT_REQUIRED: The workaround is - always active on this vCPU or it is not needed. - -[1] https://developer.arm.com/-/media/developer/pdf/ARM_DEN_0070A_Firmware_interfaces_for_mitigating_CVE-2017-5715.pdf diff --git a/Documentation/virtual/kvm/cpuid.rst b/Documentation/virtual/kvm/cpuid.rst deleted file mode 100644 index 01b081f6e7ea..000000000000 --- a/Documentation/virtual/kvm/cpuid.rst +++ /dev/null @@ -1,107 +0,0 @@ -.. SPDX-License-Identifier: GPL-2.0 - -============== -KVM CPUID bits -============== - -:Author: Glauber Costa - -A guest running on a kvm host, can check some of its features using -cpuid. This is not always guaranteed to work, since userspace can -mask-out some, or even all KVM-related cpuid features before launching -a guest. - -KVM cpuid functions are: - -function: KVM_CPUID_SIGNATURE (0x40000000) - -returns:: - - eax = 0x40000001 - ebx = 0x4b4d564b - ecx = 0x564b4d56 - edx = 0x4d - -Note that this value in ebx, ecx and edx corresponds to the string "KVMKVMKVM". -The value in eax corresponds to the maximum cpuid function present in this leaf, -and will be updated if more functions are added in the future. -Note also that old hosts set eax value to 0x0. This should -be interpreted as if the value was 0x40000001. -This function queries the presence of KVM cpuid leafs. 
- -function: define KVM_CPUID_FEATURES (0x40000001) - -returns:: - - ebx, ecx - eax = an OR'ed group of (1 << flag) - -where ``flag`` is defined as below: - -================================= =========== ================================ -flag value meaning -================================= =========== ================================ -KVM_FEATURE_CLOCKSOURCE 0 kvmclock available at msrs - 0x11 and 0x12 - -KVM_FEATURE_NOP_IO_DELAY 1 not necessary to perform delays - on PIO operations - -KVM_FEATURE_MMU_OP 2 deprecated - -KVM_FEATURE_CLOCKSOURCE2 3 kvmclock available at msrs - - 0x4b564d00 and 0x4b564d01 -KVM_FEATURE_ASYNC_PF 4 async pf can be enabled by - writing to msr 0x4b564d02 - -KVM_FEATURE_STEAL_TIME 5 steal time can be enabled by - writing to msr 0x4b564d03 - -KVM_FEATURE_PV_EOI 6 paravirtualized end of interrupt - handler can be enabled by - writing to msr 0x4b564d04 - -KVM_FEATURE_PV_UNHAULT 7 guest checks this feature bit - before enabling paravirtualized - spinlock support - -KVM_FEATURE_PV_TLB_FLUSH 9 guest checks this feature bit - before enabling paravirtualized - tlb flush - -KVM_FEATURE_ASYNC_PF_VMEXIT 10 paravirtualized async PF VM EXIT - can be enabled by setting bit 2 - when writing to msr 0x4b564d02 - -KVM_FEATURE_PV_SEND_IPI 11 guest checks this feature bit - before enabling paravirtualized - send IPIs - -KVM_FEATURE_PV_POLL_CONTROL 12 host-side polling on HLT can - be disabled by writing - to msr 0x4b564d05. - -KVM_FEATURE_PV_SCHED_YIELD 13 guest checks this feature bit - before using paravirtualized - sched yield.
- -KVM_FEATURE_CLOCSOURCE_STABLE_BIT 24 host will warn if no guest-side - per-cpu warps are expected in - kvmclock -================================= =========== ================================ - -:: - - edx = an OR'ed group of (1 << flag) - -Where ``flag`` here is defined as below: - -================== ============ ================================= -flag value meaning -================== ============ ================================= -KVM_HINTS_REALTIME 0 guest checks this feature bit to - determine that vCPUs are never - preempted for an unlimited time - allowing optimizations -================== ============ ================================= diff --git a/Documentation/virtual/kvm/devices/README b/Documentation/virtual/kvm/devices/README deleted file mode 100644 index 34a69834124a..000000000000 --- a/Documentation/virtual/kvm/devices/README +++ /dev/null @@ -1 +0,0 @@ -This directory contains specific device bindings for KVM_CAP_DEVICE_CTRL. diff --git a/Documentation/virtual/kvm/devices/arm-vgic-its.txt b/Documentation/virtual/kvm/devices/arm-vgic-its.txt deleted file mode 100644 index eeaa95b893a8..000000000000 --- a/Documentation/virtual/kvm/devices/arm-vgic-its.txt +++ /dev/null @@ -1,181 +0,0 @@ -ARM Virtual Interrupt Translation Service (ITS) -=============================================== - -Device types supported: - KVM_DEV_TYPE_ARM_VGIC_ITS ARM Interrupt Translation Service Controller - -The ITS allows MSI(-X) interrupts to be injected into guests. This extension is -optional. Creating a virtual ITS controller also requires a host GICv3 (see -arm-vgic-v3.txt), but does not depend on having physical ITS controllers. - -There can be multiple ITS controllers per guest, each of them has to have -a separate, non-overlapping MMIO region. - - -Groups: - KVM_DEV_ARM_VGIC_GRP_ADDR - Attributes: - KVM_VGIC_ITS_ADDR_TYPE (rw, 64-bit) - Base address in the guest physical address space of the GICv3 ITS - control register frame.
- This address needs to be 64K aligned and the region covers 128K. - Errors: - -E2BIG: Address outside of addressable IPA range - -EINVAL: Incorrectly aligned address - -EEXIST: Address already configured - -EFAULT: Invalid user pointer for attr->addr. - -ENODEV: Incorrect attribute or the ITS is not supported. - - - KVM_DEV_ARM_VGIC_GRP_CTRL - Attributes: - KVM_DEV_ARM_VGIC_CTRL_INIT - request the initialization of the ITS, no additional parameter in - kvm_device_attr.addr. - - KVM_DEV_ARM_ITS_CTRL_RESET - reset the ITS, no additional parameter in kvm_device_attr.addr. - See "ITS Reset State" section. - - KVM_DEV_ARM_ITS_SAVE_TABLES - save the ITS table data into guest RAM, at the location provisioned - by the guest in corresponding registers/table entries. - - The layout of the tables in guest memory defines an ABI. The entries - are laid out in little endian format as described in the last paragraph. - - KVM_DEV_ARM_ITS_RESTORE_TABLES - restore the ITS tables from guest RAM to ITS internal structures. - - The GICV3 must be restored before the ITS and all ITS registers but - the GITS_CTLR must be restored before restoring the ITS tables. - - The GITS_IIDR read-only register must also be restored before - calling KVM_DEV_ARM_ITS_RESTORE_TABLES as the IIDR revision field - encodes the ABI revision. - - The expected ordering when restoring the GICv3/ITS is described in section - "ITS Restore Sequence". - - Errors: - -ENXIO: ITS not properly configured as required prior to setting - this attribute - -ENOMEM: Memory shortage when allocating ITS internal data - -EINVAL: Inconsistent restored data - -EFAULT: Invalid guest ram access - -EBUSY: One or more VCPUS are running - -EACCES: The virtual ITS is backed by a physical GICv4 ITS, and the - state is not available - - KVM_DEV_ARM_VGIC_GRP_ITS_REGS - Attributes: - The attr field of kvm_device_attr encodes the offset of the - ITS register, relative to the ITS control frame base address - (ITS_base). 
- - kvm_device_attr.addr points to a __u64 value whatever the width - of the addressed register (32/64 bits). 64 bit registers can only - be accessed with full length. - - Writes to read-only registers are ignored by the kernel except for: - - GITS_CREADR. It must be restored otherwise commands in the queue - will be re-executed after restoring CWRITER. GITS_CREADR must be - restored before restoring the GITS_CTLR which is likely to enable the - ITS. Also it must be restored after GITS_CBASER since a write to - GITS_CBASER resets GITS_CREADR. - - GITS_IIDR. The Revision field encodes the table layout ABI revision. - In the future we might implement direct injection of virtual LPIs. - This will require an upgrade of the table layout and an evolution of - the ABI. GITS_IIDR must be restored before calling - KVM_DEV_ARM_ITS_RESTORE_TABLES. - - For other registers, getting or setting a register has the same - effect as reading/writing the register on real hardware. - Errors: - -ENXIO: Offset does not correspond to any supported register - -EFAULT: Invalid user pointer for attr->addr - -EINVAL: Offset is not 64-bit aligned - -EBUSY: one or more VCPUS are running - - ITS Restore Sequence: - ------------------------- - -The following ordering must be followed when restoring the GIC and the ITS: -a) restore all guest memory and create vcpus -b) restore all redistributors -c) provide the ITS base address - (KVM_DEV_ARM_VGIC_GRP_ADDR) -d) restore the ITS in the following order: - 1. Restore GITS_CBASER - 2. Restore all other GITS_ registers, except GITS_CTLR! - 3. Load the ITS table data (KVM_DEV_ARM_ITS_RESTORE_TABLES) - 4. Restore GITS_CTLR - -Then vcpus can be started. - - ITS Table ABI REV0: - ------------------- - - Revision 0 of the ABI only supports the features of a virtual GICv3, and does - not support a virtual GICv4 with support for direct injection of virtual - interrupts for nested hypervisors. 
- - The device table and ITT are indexed by the DeviceID and EventID, - respectively. The collection table is not indexed by CollectionID, and the - entries in the collection are listed in no particular order. - All entries are 8 bytes. - - Device Table Entry (DTE): - - bits: | 63| 62 ... 49 | 48 ... 5 | 4 ... 0 | - values: | V | next | ITT_addr | Size | - - where; - - V indicates whether the entry is valid. If not, other fields - are not meaningful. - - next: equals to 0 if this entry is the last one; otherwise it - corresponds to the DeviceID offset to the next DTE, capped by - 2^14 -1. - - ITT_addr matches bits [51:8] of the ITT address (256 Byte aligned). - - Size specifies the supported number of bits for the EventID, - minus one - - Collection Table Entry (CTE): - - bits: | 63| 62 .. 52 | 51 ... 16 | 15 ... 0 | - values: | V | RES0 | RDBase | ICID | - - where: - - V indicates whether the entry is valid. If not, other fields are - not meaningful. - - RES0: reserved field with Should-Be-Zero-or-Preserved behavior. - - RDBase is the PE number (GICR_TYPER.Processor_Number semantic), - - ICID is the collection ID - - Interrupt Translation Entry (ITE): - - bits: | 63 ... 48 | 47 ... 16 | 15 ... 0 | - values: | next | pINTID | ICID | - - where: - - next: equals to 0 if this entry is the last one; otherwise it corresponds - to the EventID offset to the next ITE capped by 2^16 -1. - - pINTID is the physical LPI ID; if zero, it means the entry is not valid - and other fields are not meaningful. - - ICID is the collection ID - - ITS Reset State: - ---------------- - -RESET returns the ITS to the same state that it was when first created and -initialized. 
When the RESET command returns, the following things are -guaranteed: - -- The ITS is not enabled and quiescent - GITS_CTLR.Enabled = 0 .Quiescent=1 -- There is no internally cached state -- No collection or device table are used - GITS_BASER.Valid = 0 -- GITS_CBASER = 0, GITS_CREADR = 0, GITS_CWRITER = 0 -- The ABI version is unchanged and remains the one set when the ITS - device was first created. diff --git a/Documentation/virtual/kvm/devices/arm-vgic-v3.txt b/Documentation/virtual/kvm/devices/arm-vgic-v3.txt deleted file mode 100644 index ff290b43c8e5..000000000000 --- a/Documentation/virtual/kvm/devices/arm-vgic-v3.txt +++ /dev/null @@ -1,251 +0,0 @@ -ARM Virtual Generic Interrupt Controller v3 and later (VGICv3) -============================================================== - - -Device types supported: - KVM_DEV_TYPE_ARM_VGIC_V3 ARM Generic Interrupt Controller v3.0 - -Only one VGIC instance may be instantiated through this API. The created VGIC -will act as the VM interrupt controller, requiring emulated user-space devices -to inject interrupts to the VGIC instead of directly to CPUs. It is not -possible to create both a GICv3 and GICv2 on the same VM. - -Creating a guest GICv3 device requires a host GICv3 as well. - - -Groups: - KVM_DEV_ARM_VGIC_GRP_ADDR - Attributes: - KVM_VGIC_V3_ADDR_TYPE_DIST (rw, 64-bit) - Base address in the guest physical address space of the GICv3 distributor - register mappings. Only valid for KVM_DEV_TYPE_ARM_VGIC_V3. - This address needs to be 64K aligned and the region covers 64 KByte. - - KVM_VGIC_V3_ADDR_TYPE_REDIST (rw, 64-bit) - Base address in the guest physical address space of the GICv3 - redistributor register mappings. There are two 64K pages for each - VCPU and all of the redistributor pages are contiguous. - Only valid for KVM_DEV_TYPE_ARM_VGIC_V3. - This address needs to be 64K aligned. 
- - KVM_VGIC_V3_ADDR_TYPE_REDIST_REGION (rw, 64-bit) - The attribute data pointed to by kvm_device_attr.addr is a __u64 value: - bits: | 63 .... 52 | 51 .... 16 | 15 - 12 |11 - 0 - values: | count | base | flags | index - - index encodes the unique redistributor region index - - flags: reserved for future use, currently 0 - - base field encodes bits [51:16] of the guest physical base address - of the first redistributor in the region. - - count encodes the number of redistributors in the region. Must be - greater than 0. - There are two 64K pages for each redistributor in the region and - redistributors are laid out contiguously within the region. Regions - are filled with redistributors in the index order. The sum of all - region count fields must be greater than or equal to the number of - VCPUs. Redistributor regions must be registered in the incremental - index order, starting from index 0. - The characteristics of a specific redistributor region can be read - by presetting the index field in the attr data. - Only valid for KVM_DEV_TYPE_ARM_VGIC_V3. - - It is invalid to mix calls with KVM_VGIC_V3_ADDR_TYPE_REDIST and - KVM_VGIC_V3_ADDR_TYPE_REDIST_REGION attributes. - - Errors: - -E2BIG: Address outside of addressable IPA range - -EINVAL: Incorrectly aligned address, bad redistributor region - count/index, mixed redistributor region attribute usage - -EEXIST: Address already configured - -ENOENT: Attempt to read the characteristics of a non existing - redistributor region - -ENXIO: The group or attribute is unknown/unsupported for this device - or hardware support is missing. - -EFAULT: Invalid user pointer for attr->addr. - - - KVM_DEV_ARM_VGIC_GRP_DIST_REGS - KVM_DEV_ARM_VGIC_GRP_REDIST_REGS - Attributes: - The attr field of kvm_device_attr encodes two values: - bits: | 63 .... 32 | 31 .... 0 | - values: | mpidr | offset | - - All distributor regs are (rw, 32-bit) and kvm_device_attr.addr points to a - __u32 value. 
64-bit registers must be accessed by separately accessing the - lower and higher word. - - Writes to read-only registers are ignored by the kernel. - - KVM_DEV_ARM_VGIC_GRP_DIST_REGS accesses the main distributor registers. - KVM_DEV_ARM_VGIC_GRP_REDIST_REGS accesses the redistributor of the CPU - specified by the mpidr. - - The offset is relative to the "[Re]Distributor base address" as defined - in the GICv3/4 specs. Getting or setting such a register has the same - effect as reading or writing the register on real hardware, except for the - following registers: GICD_STATUSR, GICR_STATUSR, GICD_ISPENDR, - GICR_ISPENDR0, GICD_ICPENDR, and GICR_ICPENDR0. These registers behave - differently when accessed via this interface compared to their - architecturally defined behavior to allow software a full view of the - VGIC's internal state. - - The mpidr field is used to specify which - redistributor is accessed. The mpidr is ignored for the distributor. - - The mpidr encoding is based on the affinity information in the - architecture defined MPIDR, and the field is encoded as follows: - | 63 .... 56 | 55 .... 48 | 47 .... 40 | 39 .... 32 | - | Aff3 | Aff2 | Aff1 | Aff0 | - - Note that distributor fields are not banked, but return the same value - regardless of the mpidr used to access the register. - - GICD_IIDR.Revision is updated when the KVM implementation is changed in a - way directly observable by the guest or userspace. Userspace should read - GICD_IIDR from KVM and write back the read value to confirm its expected - behavior is aligned with the KVM implementation. Userspace should set - GICD_IIDR before setting any other registers to ensure the expected - behavior. - - - The GICD_STATUSR and GICR_STATUSR registers are architecturally defined such - that a write of a clear bit has no effect, whereas a write with a set bit - clears that value. 
To allow userspace to freely set the values of these two - registers, setting the attributes with the register offsets for these two - registers simply sets the non-reserved bits to the value written. - - - Accesses (reads and writes) to the GICD_ISPENDR register region and - GICR_ISPENDR0 registers get/set the value of the latched pending state for - the interrupts. - - This is identical to the value returned by a guest read from ISPENDR for an - edge triggered interrupt, but may differ for level triggered interrupts. - For edge triggered interrupts, once an interrupt becomes pending (whether - because of an edge detected on the input line or because of a guest write - to ISPENDR) this state is "latched", and only cleared when either the - interrupt is activated or when the guest writes to ICPENDR. A level - triggered interrupt may be pending either because the level input is held - high by a device, or because of a guest write to the ISPENDR register. Only - ISPENDR writes are latched; if the device lowers the line level then the - interrupt is no longer pending unless the guest also wrote to ISPENDR, and - conversely writes to ICPENDR or activations of the interrupt do not clear - the pending status if the line level is still being held high. (These - rules are documented in the GICv3 specification descriptions of the ICPENDR - and ISPENDR registers.) For a level triggered interrupt the value accessed - here is that of the latch which is set by ISPENDR and cleared by ICPENDR or - interrupt activation, whereas the value returned by a guest read from - ISPENDR is the logical OR of the latch value and the input line level. - - Raw access to the latch state is provided to userspace so that it can save - and restore the entire GIC internal state (which is defined by the - combination of the current input line level and the latch state, and cannot - be deduced from purely the line level and the value of the ISPENDR - registers). 
- - Accesses to GICD_ICPENDR register region and GICR_ICPENDR0 registers have - RAZ/WI semantics, meaning that reads always return 0 and writes are always - ignored. - - Errors: - -ENXIO: Getting or setting this register is not yet supported - -EBUSY: One or more VCPUs are running - - - KVM_DEV_ARM_VGIC_GRP_CPU_SYSREGS - Attributes: - The attr field of kvm_device_attr encodes two values: - bits: | 63 .... 32 | 31 .... 16 | 15 .... 0 | - values: | mpidr | RES | instr | - - The mpidr field encodes the CPU ID based on the affinity information in the - architecture defined MPIDR, and the field is encoded as follows: - | 63 .... 56 | 55 .... 48 | 47 .... 40 | 39 .... 32 | - | Aff3 | Aff2 | Aff1 | Aff0 | - - The instr field encodes the system register to access based on the fields - defined in the A64 instruction set encoding for system register access - (RES means the bits are reserved for future use and should be zero): - - | 15 ... 14 | 13 ... 11 | 10 ... 7 | 6 ... 3 | 2 ... 0 | - | Op 0 | Op1 | CRn | CRm | Op2 | - - All system regs accessed through this API are (rw, 64-bit) and - kvm_device_attr.addr points to a __u64 value. - - KVM_DEV_ARM_VGIC_GRP_CPU_SYSREGS accesses the CPU interface registers for the - CPU specified by the mpidr field. - - CPU interface registers access is not implemented for AArch32 mode. - Error -ENXIO is returned when accessed in AArch32 mode. - Errors: - -ENXIO: Getting or setting this register is not yet supported - -EBUSY: VCPU is running - -EINVAL: Invalid mpidr or register value supplied - - - KVM_DEV_ARM_VGIC_GRP_NR_IRQS - Attributes: - A value describing the number of interrupts (SGI, PPI and SPI) for - this GIC instance, ranging from 64 to 1024, in increments of 32. - - kvm_device_attr.addr points to a __u32 value. - - Errors: - -EINVAL: Value set is out of the expected range - -EBUSY: Value has already be set. 
- - - KVM_DEV_ARM_VGIC_GRP_CTRL - Attributes: - KVM_DEV_ARM_VGIC_CTRL_INIT - request the initialization of the VGIC, no additional parameter in - kvm_device_attr.addr. - KVM_DEV_ARM_VGIC_SAVE_PENDING_TABLES - save all LPI pending bits into guest RAM pending tables. - - The first kB of the pending table is not altered by this operation. - Errors: - -ENXIO: VGIC not properly configured as required prior to calling - this attribute - -ENODEV: no online VCPU - -ENOMEM: memory shortage when allocating vgic internal data - -EFAULT: Invalid guest ram access - -EBUSY: One or more VCPUS are running - - - KVM_DEV_ARM_VGIC_GRP_LEVEL_INFO - Attributes: - The attr field of kvm_device_attr encodes the following values: - bits: | 63 .... 32 | 31 .... 10 | 9 .... 0 | - values: | mpidr | info | vINTID | - - The vINTID specifies which set of IRQs is reported on. - - The info field specifies which information userspace wants to get or set - using this interface. Currently we support the following info values: - - VGIC_LEVEL_INFO_LINE_LEVEL: - Get/Set the input level of the IRQ line for a set of 32 contiguously - numbered interrupts. - vINTID must be a multiple of 32. - - kvm_device_attr.addr points to a __u32 value which will contain a - bitmap where a set bit means the interrupt level is asserted. - - Bit[n] indicates the status for interrupt vINTID + n. - - SGIs and any interrupt with a higher ID than the number of interrupts - supported, will be RAZ/WI. LPIs are always edge-triggered and are - therefore not supported by this interface. - - PPIs are reported per VCPU as specified in the mpidr field, and SPIs are - reported with the same value regardless of the mpidr specified. - - The mpidr field encodes the CPU ID based on the affinity information in the - architecture defined MPIDR, and the field is encoded as follows: - | 63 .... 56 | 55 .... 48 | 47 .... 40 | 39 .... 
32 | - | Aff3 | Aff2 | Aff1 | Aff0 | - Errors: - -EINVAL: vINTID is not multiple of 32 or - info field is not VGIC_LEVEL_INFO_LINE_LEVEL diff --git a/Documentation/virtual/kvm/devices/arm-vgic.txt b/Documentation/virtual/kvm/devices/arm-vgic.txt deleted file mode 100644 index 97b6518148f8..000000000000 --- a/Documentation/virtual/kvm/devices/arm-vgic.txt +++ /dev/null @@ -1,127 +0,0 @@ -ARM Virtual Generic Interrupt Controller v2 (VGIC) -================================================== - -Device types supported: - KVM_DEV_TYPE_ARM_VGIC_V2 ARM Generic Interrupt Controller v2.0 - -Only one VGIC instance may be instantiated through either this API or the -legacy KVM_CREATE_IRQCHIP API. The created VGIC will act as the VM interrupt -controller, requiring emulated user-space devices to inject interrupts to the -VGIC instead of directly to CPUs. - -GICv3 implementations with hardware compatibility support allow creating a -guest GICv2 through this interface. For information on creating a guest GICv3 -device and guest ITS devices, see arm-vgic-v3.txt. It is not possible to -create both a GICv3 and GICv2 device on the same VM. - - -Groups: - KVM_DEV_ARM_VGIC_GRP_ADDR - Attributes: - KVM_VGIC_V2_ADDR_TYPE_DIST (rw, 64-bit) - Base address in the guest physical address space of the GIC distributor - register mappings. Only valid for KVM_DEV_TYPE_ARM_VGIC_V2. - This address needs to be 4K aligned and the region covers 4 KByte. - - KVM_VGIC_V2_ADDR_TYPE_CPU (rw, 64-bit) - Base address in the guest physical address space of the GIC virtual cpu - interface register mappings. Only valid for KVM_DEV_TYPE_ARM_VGIC_V2. - This address needs to be 4K aligned and the region covers 4 KByte. - Errors: - -E2BIG: Address outside of addressable IPA range - -EINVAL: Incorrectly aligned address - -EEXIST: Address already configured - -ENXIO: The group or attribute is unknown/unsupported for this device - or hardware support is missing. - -EFAULT: Invalid user pointer for attr->addr. 
- - KVM_DEV_ARM_VGIC_GRP_DIST_REGS - Attributes: - The attr field of kvm_device_attr encodes two values: - bits: | 63 .... 40 | 39 .. 32 | 31 .... 0 | - values: | reserved | vcpu_index | offset | - - All distributor regs are (rw, 32-bit) - - The offset is relative to the "Distributor base address" as defined in the - GICv2 specs. Getting or setting such a register has the same effect as - reading or writing the register on the actual hardware from the cpu whose - index is specified with the vcpu_index field. Note that most distributor - fields are not banked, but return the same value regardless of the - vcpu_index used to access the register. - - GICD_IIDR.Revision is updated when the KVM implementation of an emulated - GICv2 is changed in a way directly observable by the guest or userspace. - Userspace should read GICD_IIDR from KVM and write back the read value to - confirm its expected behavior is aligned with the KVM implementation. - Userspace should set GICD_IIDR before setting any other registers (both - KVM_DEV_ARM_VGIC_GRP_DIST_REGS and KVM_DEV_ARM_VGIC_GRP_CPU_REGS) to ensure - the expected behavior. Unless GICD_IIDR has been set from userspace, writes - to the interrupt group registers (GICD_IGROUPR) are ignored. - Errors: - -ENXIO: Getting or setting this register is not yet supported - -EBUSY: One or more VCPUs are running - -EINVAL: Invalid vcpu_index supplied - - KVM_DEV_ARM_VGIC_GRP_CPU_REGS - Attributes: - The attr field of kvm_device_attr encodes two values: - bits: | 63 .... 40 | 39 .. 32 | 31 .... 0 | - values: | reserved | vcpu_index | offset | - - All CPU interface regs are (rw, 32-bit) - - The offset specifies the offset from the "CPU interface base address" as - defined in the GICv2 specs. Getting or setting such a register has the - same effect as reading or writing the register on the actual hardware. 
- - The Active Priorities Registers APRn are implementation defined, so we set a - fixed format for our implementation that fits with the model of a "GICv2 - implementation without the security extensions" which we present to the - guest. This interface always exposes four registers APR[0-3] describing the - maximum possible 128 preemption levels. The semantics of the register - indicate if any interrupts in a given preemption level are in the active - state by setting the corresponding bit. - - Thus, preemption level X has one or more active interrupts if and only if: - - APRn[X mod 32] == 0b1, where n = X / 32 - - Bits for undefined preemption levels are RAZ/WI. - - Note that this differs from a CPU's view of the APRs on hardware in which - a GIC without the security extensions exposes group 0 and group 1 active - priorities in separate register groups, whereas we show a combined view - similar to GICv2's GICH_APR. - - For historical reasons and to provide ABI compatibility with userspace we - export the GICC_PMR register in the format of the GICH_VMCR.VMPriMask - field in the lower 5 bits of a word, meaning that userspace must always - use the lower 5 bits to communicate with the KVM device and must shift the - value left by 3 places to obtain the actual priority mask level. - - Errors: - -ENXIO: Getting or setting this register is not yet supported - -EBUSY: One or more VCPUs are running - -EINVAL: Invalid vcpu_index supplied - - KVM_DEV_ARM_VGIC_GRP_NR_IRQS - Attributes: - A value describing the number of interrupts (SGI, PPI and SPI) for - this GIC instance, ranging from 64 to 1024, in increments of 32. - - Errors: - -EINVAL: Value set is out of the expected range - -EBUSY: Value has already been set, or GIC has already been initialized - with default values. - - KVM_DEV_ARM_VGIC_GRP_CTRL - Attributes: - KVM_DEV_ARM_VGIC_CTRL_INIT - request the initialization of the VGIC or ITS, no additional parameter - in kvm_device_attr.addr. 
- Errors: - -ENXIO: VGIC not properly configured as required prior to calling - this attribute - -ENODEV: no online VCPU - -ENOMEM: memory shortage when allocating vgic internal data diff --git a/Documentation/virtual/kvm/devices/mpic.txt b/Documentation/virtual/kvm/devices/mpic.txt deleted file mode 100644 index 8257397adc3c..000000000000 --- a/Documentation/virtual/kvm/devices/mpic.txt +++ /dev/null @@ -1,53 +0,0 @@ -MPIC interrupt controller -========================= - -Device types supported: - KVM_DEV_TYPE_FSL_MPIC_20 Freescale MPIC v2.0 - KVM_DEV_TYPE_FSL_MPIC_42 Freescale MPIC v4.2 - -Only one MPIC instance, of any type, may be instantiated. The created -MPIC will act as the system interrupt controller, connecting to each -vcpu's interrupt inputs. - -Groups: - KVM_DEV_MPIC_GRP_MISC - Attributes: - KVM_DEV_MPIC_BASE_ADDR (rw, 64-bit) - Base address of the 256 KiB MPIC register space. Must be - naturally aligned. A value of zero disables the mapping. - Reset value is zero. - - KVM_DEV_MPIC_GRP_REGISTER (rw, 32-bit) - Access an MPIC register, as if the access were made from the guest. - "attr" is the byte offset into the MPIC register space. Accesses - must be 4-byte aligned. - - MSIs may be signaled by using this attribute group to write - to the relevant MSIIR. - - KVM_DEV_MPIC_GRP_IRQ_ACTIVE (rw, 32-bit) - IRQ input line for each standard openpic source. 0 is inactive and 1 - is active, regardless of interrupt sense. - - For edge-triggered interrupts: Writing 1 is considered an activating - edge, and writing 0 is ignored. Reading returns 1 if a previously - signaled edge has not been acknowledged, and 0 otherwise. - - "attr" is the IRQ number. IRQ numbers for standard sources are the - byte offset of the relevant IVPR from EIVPR0, divided by 32. - -IRQ Routing: - - The MPIC emulation supports IRQ routing. Only a single MPIC device can - be instantiated. Once that device has been created, it's available as - irqchip id 0. 
- - This irqchip 0 has 256 interrupt pins, which expose the interrupts in - the main array of interrupt sources (a.k.a. "SRC" interrupts). - - The numbering is the same as the MPIC device tree binding -- based on - the register offset from the beginning of the sources array, without - regard to any subdivisions in chip documentation such as "internal" - or "external" interrupts. - - Access to non-SRC interrupts is not implemented through IRQ routing mechanisms. diff --git a/Documentation/virtual/kvm/devices/s390_flic.txt b/Documentation/virtual/kvm/devices/s390_flic.txt deleted file mode 100644 index a4e20a090174..000000000000 --- a/Documentation/virtual/kvm/devices/s390_flic.txt +++ /dev/null @@ -1,163 +0,0 @@ -FLIC (floating interrupt controller) -==================================== - -FLIC handles floating (non per-cpu) interrupts, i.e. I/O, service and some -machine check interruptions. All interrupts are stored in a per-vm list of -pending interrupts. FLIC performs operations on this list. - -Only one FLIC instance may be instantiated. - -FLIC provides support to -- add interrupts (KVM_DEV_FLIC_ENQUEUE) -- inspect currently pending interrupts (KVM_FLIC_GET_ALL_IRQS) -- purge all pending floating interrupts (KVM_DEV_FLIC_CLEAR_IRQS) -- purge one pending floating I/O interrupt (KVM_DEV_FLIC_CLEAR_IO_IRQ) -- enable/disable for the guest transparent async page faults -- register and modify adapter interrupt sources (KVM_DEV_FLIC_ADAPTER_*) -- modify AIS (adapter-interruption-suppression) mode state (KVM_DEV_FLIC_AISM) -- inject adapter interrupts on a specified adapter (KVM_DEV_FLIC_AIRQ_INJECT) -- get/set all AIS mode states (KVM_DEV_FLIC_AISM_ALL) - -Groups: - KVM_DEV_FLIC_ENQUEUE - Passes a buffer and length into the kernel which are then injected into - the list of pending interrupts. - attr->addr contains the pointer to the buffer and attr->attr contains - the length of the buffer. 
- The format of the data structure kvm_s390_irq as it is copied from userspace - is defined in usr/include/linux/kvm.h. - - KVM_DEV_FLIC_GET_ALL_IRQS - Copies all floating interrupts into a buffer provided by userspace. - When the buffer is too small it returns -ENOMEM, which is the indication - for userspace to try again with a bigger buffer. - -ENOBUFS is returned when the allocation of a kernelspace buffer has - failed. - -EFAULT is returned when copying data to userspace failed. - All interrupts remain pending, i.e. are not deleted from the list of - currently pending interrupts. - attr->addr contains the userspace address of the buffer into which all - interrupt data will be copied. - attr->attr contains the size of the buffer in bytes. - - KVM_DEV_FLIC_CLEAR_IRQS - Simply deletes all elements from the list of currently pending floating - interrupts. No interrupts are injected into the guest. - - KVM_DEV_FLIC_CLEAR_IO_IRQ - Deletes one (if any) I/O interrupt for a subchannel identified by the - subsystem identification word passed via the buffer specified by - attr->addr (address) and attr->attr (length). - - KVM_DEV_FLIC_APF_ENABLE - Enables async page faults for the guest. So in case of a major page fault - the host is allowed to handle this async and continues the guest. - - KVM_DEV_FLIC_APF_DISABLE_WAIT - Disables async page faults for the guest and waits until already pending - async page faults are done. This is necessary to trigger a completion interrupt - for every init interrupt before migrating the interrupt list. - - KVM_DEV_FLIC_ADAPTER_REGISTER - Register an I/O adapter interrupt source. 
Takes a kvm_s390_io_adapter - describing the adapter to register: - -struct kvm_s390_io_adapter { - __u32 id; - __u8 isc; - __u8 maskable; - __u8 swap; - __u8 flags; -}; - - id contains the unique id for the adapter, isc the I/O interruption subclass - to use, maskable whether this adapter may be masked (interrupts turned off), - swap whether the indicators need to be byte swapped, and flags contains - further characteristics of the adapter. - Currently defined values for 'flags' are: - - KVM_S390_ADAPTER_SUPPRESSIBLE: adapter is subject to AIS - (adapter-interrupt-suppression) facility. This flag only has an effect if - the AIS capability is enabled. - Unknown flag values are ignored. - - - KVM_DEV_FLIC_ADAPTER_MODIFY - Modifies attributes of an existing I/O adapter interrupt source. Takes - a kvm_s390_io_adapter_req specifying the adapter and the operation: - -struct kvm_s390_io_adapter_req { - __u32 id; - __u8 type; - __u8 mask; - __u16 pad0; - __u64 addr; -}; - - id specifies the adapter and type the operation. The supported operations - are: - - KVM_S390_IO_ADAPTER_MASK - mask or unmask the adapter, as specified in mask - - KVM_S390_IO_ADAPTER_MAP - perform a gmap translation for the guest address provided in addr, - pin a userspace page for the translated address and add it to the - list of mappings - Note: A new mapping will be created unconditionally; therefore, - the calling code should avoid making duplicate mappings. - - KVM_S390_IO_ADAPTER_UNMAP - release a userspace page for the translated address specified in addr - from the list of mappings - - KVM_DEV_FLIC_AISM - modify the adapter-interruption-suppression mode for a given isc if the - AIS capability is enabled. Takes a kvm_s390_ais_req describing: - -struct kvm_s390_ais_req { - __u8 isc; - __u16 mode; -}; - - isc contains the target I/O interruption subclass, mode the target - adapter-interruption-suppression mode. 
The following modes are - currently supported: - - KVM_S390_AIS_MODE_ALL: ALL-Interruptions Mode, i.e. airq injection - is always allowed; - - KVM_S390_AIS_MODE_SINGLE: SINGLE-Interruption Mode, i.e. airq - injection is only allowed once and the following adapter interrupts - will be suppressed until the mode is set again to ALL-Interruptions - or SINGLE-Interruption mode. - - KVM_DEV_FLIC_AIRQ_INJECT - Inject adapter interrupts on a specified adapter. - attr->attr contains the unique id for the adapter, which allows for - adapter-specific checks and actions. - For adapters subject to AIS, handle the airq injection suppression for - an isc according to the adapter-interruption-suppression mode on condition - that the AIS capability is enabled. - - KVM_DEV_FLIC_AISM_ALL - Gets or sets the adapter-interruption-suppression mode for all ISCs. Takes - a kvm_s390_ais_all describing: - -struct kvm_s390_ais_all { - __u8 simm; /* Single-Interruption-Mode mask */ - __u8 nimm; /* No-Interruption-Mode mask */ -}; - - simm contains Single-Interruption-Mode mask for all ISCs, nimm contains - No-Interruption-Mode mask for all ISCs. Each bit in simm and nimm corresponds - to an ISC (MSB0 bit 0 to ISC 0 and so on). The combination of simm bit and - nimm bit presents AIS mode for an ISC. - - KVM_DEV_FLIC_AISM_ALL is indicated by KVM_CAP_S390_AIS_MIGRATION. - -Note: The KVM_SET_DEVICE_ATTR/KVM_GET_DEVICE_ATTR device ioctls executed on -FLIC with an unknown group or attribute give the error code EINVAL (instead of -ENXIO, as specified in the API documentation). It is not possible to conclude -that a FLIC operation is unavailable based on the error code resulting from a -usage attempt. - -Note: The KVM_DEV_FLIC_CLEAR_IO_IRQ ioctl will return EINVAL in case a zero -schid is specified. 
diff --git a/Documentation/virtual/kvm/devices/vcpu.txt b/Documentation/virtual/kvm/devices/vcpu.txt deleted file mode 100644 index 2b5dab16c4f2..000000000000 --- a/Documentation/virtual/kvm/devices/vcpu.txt +++ /dev/null @@ -1,62 +0,0 @@ -Generic vcpu interface -==================================== - -The virtual cpu "device" also accepts the ioctls KVM_SET_DEVICE_ATTR, -KVM_GET_DEVICE_ATTR, and KVM_HAS_DEVICE_ATTR. The interface uses the same struct -kvm_device_attr as other devices, but targets VCPU-wide settings and controls. - -The groups and attributes per virtual cpu, if any, are architecture specific. - -1. GROUP: KVM_ARM_VCPU_PMU_V3_CTRL -Architectures: ARM64 - -1.1. ATTRIBUTE: KVM_ARM_VCPU_PMU_V3_IRQ -Parameters: in kvm_device_attr.addr the address for PMU overflow interrupt is a - pointer to an int -Returns: -EBUSY: The PMU overflow interrupt is already set - -ENXIO: The overflow interrupt not set when attempting to get it - -ENODEV: PMUv3 not supported - -EINVAL: Invalid PMU overflow interrupt number supplied or - trying to set the IRQ number without using an in-kernel - irqchip. - -A value describing the PMUv3 (Performance Monitor Unit v3) overflow interrupt -number for this vcpu. This interrupt could be a PPI or SPI, but the interrupt -type must be same for each vcpu. As a PPI, the interrupt number is the same for -all vcpus, while as an SPI it must be a separate number per vcpu. - -1.2 ATTRIBUTE: KVM_ARM_VCPU_PMU_V3_INIT -Parameters: no additional parameter in kvm_device_attr.addr -Returns: -ENODEV: PMUv3 not supported or GIC not initialized - -ENXIO: PMUv3 not properly configured or in-kernel irqchip not - configured as required prior to calling this attribute - -EBUSY: PMUv3 already initialized - -Request the initialization of the PMUv3. If using the PMUv3 with an in-kernel -virtual GIC implementation, this must be done after initializing the in-kernel -irqchip. - - -2. GROUP: KVM_ARM_VCPU_TIMER_CTRL -Architectures: ARM,ARM64 - -2.1. 
ATTRIBUTE: KVM_ARM_VCPU_TIMER_IRQ_VTIMER -2.2. ATTRIBUTE: KVM_ARM_VCPU_TIMER_IRQ_PTIMER -Parameters: in kvm_device_attr.addr the address for the timer interrupt is a - pointer to an int -Returns: -EINVAL: Invalid timer interrupt number - -EBUSY: One or more VCPUs has already run - -A value describing the architected timer interrupt number when connected to an -in-kernel virtual GIC. These must be a PPI (16 <= intid < 32). Setting the -attribute overrides the default values (see below). - -KVM_ARM_VCPU_TIMER_IRQ_VTIMER: The EL1 virtual timer intid (default: 27) -KVM_ARM_VCPU_TIMER_IRQ_PTIMER: The EL1 physical timer intid (default: 30) - -Setting the same PPI for different timers will prevent the VCPUs from running. -Setting the interrupt number on a VCPU configures all VCPUs created at that -time to use the number provided for a given timer, overwriting any previously -configured values on other VCPUs. Userspace should configure the interrupt -numbers on at least one VCPU after creating all VCPUs and before running any -VCPUs. diff --git a/Documentation/virtual/kvm/devices/vfio.txt b/Documentation/virtual/kvm/devices/vfio.txt deleted file mode 100644 index 528c77c8022c..000000000000 --- a/Documentation/virtual/kvm/devices/vfio.txt +++ /dev/null @@ -1,36 +0,0 @@ -VFIO virtual device -=================== - -Device types supported: - KVM_DEV_TYPE_VFIO - -Only one VFIO instance may be created per VM. The created device -tracks VFIO groups in use by the VM and features of those groups -important to the correctness and acceleration of the VM. As groups -are enabled and disabled for use by the VM, KVM should be updated -about their presence. When registered with KVM, a reference to the -VFIO-group is held by KVM. - -Groups: - KVM_DEV_VFIO_GROUP - -KVM_DEV_VFIO_GROUP attributes: - KVM_DEV_VFIO_GROUP_ADD: Add a VFIO group to VFIO-KVM device tracking - kvm_device_attr.addr points to an int32_t file descriptor - for the VFIO group. 
- KVM_DEV_VFIO_GROUP_DEL: Remove a VFIO group from VFIO-KVM device tracking - kvm_device_attr.addr points to an int32_t file descriptor - for the VFIO group. - KVM_DEV_VFIO_GROUP_SET_SPAPR_TCE: attaches a guest visible TCE table - allocated by sPAPR KVM. - kvm_device_attr.addr points to a struct: - - struct kvm_vfio_spapr_tce { - __s32 groupfd; - __s32 tablefd; - }; - - where - @groupfd is a file descriptor for a VFIO group; - @tablefd is a file descriptor for a TCE table allocated via - KVM_CREATE_SPAPR_TCE. diff --git a/Documentation/virtual/kvm/devices/vm.txt b/Documentation/virtual/kvm/devices/vm.txt deleted file mode 100644 index 4ffb82b02468..000000000000 --- a/Documentation/virtual/kvm/devices/vm.txt +++ /dev/null @@ -1,270 +0,0 @@ -Generic vm interface -==================================== - -The virtual machine "device" also accepts the ioctls KVM_SET_DEVICE_ATTR, -KVM_GET_DEVICE_ATTR, and KVM_HAS_DEVICE_ATTR. The interface uses the same -struct kvm_device_attr as other devices, but targets VM-wide settings -and controls. - -The groups and attributes per virtual machine, if any, are architecture -specific. - -1. GROUP: KVM_S390_VM_MEM_CTRL -Architectures: s390 - -1.1. ATTRIBUTE: KVM_S390_VM_MEM_ENABLE_CMMA -Parameters: none -Returns: -EBUSY if a vcpu is already defined, otherwise 0 - -Enables Collaborative Memory Management Assist (CMMA) for the virtual machine. - -1.2. ATTRIBUTE: KVM_S390_VM_MEM_CLR_CMMA -Parameters: none -Returns: -EINVAL if CMMA was not enabled - 0 otherwise - -Clear the CMMA status for all guest pages, so any pages the guest marked -as unused are again used and may not be reclaimed by the host. - -1.3. 
ATTRIBUTE: KVM_S390_VM_MEM_LIMIT_SIZE -Parameters: in attr->addr the address for the new limit of guest memory -Returns: -EFAULT if the given address is not accessible - -EINVAL if the virtual machine is of type UCONTROL - -E2BIG if the given guest memory is too big for that machine - -EBUSY if a vcpu is already defined - -ENOMEM if not enough memory is available for a new shadow guest mapping - 0 otherwise - -Allows userspace to query the actual limit and set a new limit for -the maximum guest memory size. The limit will be rounded up to -2048 MB, 4096 GB, 8192 TB respectively, as this limit is governed by -the number of page table levels. In the case that there is no limit we will set -the limit to KVM_S390_NO_MEM_LIMIT (U64_MAX). - -2. GROUP: KVM_S390_VM_CPU_MODEL -Architectures: s390 - -2.1. ATTRIBUTE: KVM_S390_VM_CPU_MACHINE (r/o) - -Allows user space to retrieve machine and kvm specific cpu related information: - -struct kvm_s390_vm_cpu_machine { - __u64 cpuid; # CPUID of host - __u32 ibc; # IBC level range offered by host - __u8 pad[4]; - __u64 fac_mask[256]; # set of cpu facilities enabled by KVM - __u64 fac_list[256]; # set of cpu facilities offered by host -} - -Parameters: address of buffer to store the machine related cpu data - of type struct kvm_s390_vm_cpu_machine* -Returns: -EFAULT if the given address is not accessible from kernel space - -ENOMEM if not enough memory is available to process the ioctl - 0 in case of success - -2.2. ATTRIBUTE: KVM_S390_VM_CPU_PROCESSOR (r/w) - -Allows user space to retrieve or request to change cpu related information for a vcpu: - -struct kvm_s390_vm_cpu_processor { - __u64 cpuid; # CPUID currently (to be) used by this vcpu - __u16 ibc; # IBC level currently (to be) used by this vcpu - __u8 pad[6]; - __u64 fac_list[256]; # set of cpu facilities currently (to be) used - # by this vcpu -} - -KVM does not enforce or limit the cpu model data in any form. 
Take the information -retrieved by means of KVM_S390_VM_CPU_MACHINE as a hint for reasonable configuration -setups. Instruction interceptions triggered by additionally set facility bits that -are not handled by KVM need to be implemented in the VM driver code. - -Parameters: address of buffer to store/set the processor related cpu - data of type struct kvm_s390_vm_cpu_processor*. -Returns: -EBUSY in case 1 or more vcpus are already activated (only in write case) - -EFAULT if the given address is not accessible from kernel space - -ENOMEM if not enough memory is available to process the ioctl - 0 in case of success - -2.3. ATTRIBUTE: KVM_S390_VM_CPU_MACHINE_FEAT (r/o) - -Allows user space to retrieve available cpu features. A feature is available if -provided by the hardware and supported by kvm. In theory, cpu features could -even be completely emulated by kvm. - -struct kvm_s390_vm_cpu_feat { - __u64 feat[16]; # Bitmap (1 = feature available), MSB 0 bit numbering -}; - -Parameters: address of a buffer to load the feature list from. -Returns: -EFAULT if the given address is not accessible from kernel space. - 0 in case of success. - -2.4. ATTRIBUTE: KVM_S390_VM_CPU_PROCESSOR_FEAT (r/w) - -Allows user space to retrieve or change enabled cpu features for all VCPUs of a -VM. Features that are not available cannot be enabled. - -See 2.3. for a description of the parameter struct. - -Parameters: address of a buffer to store/load the feature list from. -Returns: -EFAULT if the given address is not accessible from kernel space. - -EINVAL if a cpu feature that is not available is to be enabled. - -EBUSY if at least one VCPU has already been defined. - 0 in case of success. - -2.5. ATTRIBUTE: KVM_S390_VM_CPU_MACHINE_SUBFUNC (r/o) - -Allows user space to retrieve available cpu subfunctions without any filtering -done by a set IBC. These subfunctions are indicated to the guest VCPU via -query or "test bit" subfunctions and used e.g. by cpacf functions, plo and ptff. 
- -A subfunction block is only valid if KVM_S390_VM_CPU_MACHINE contains the -STFL(E) bit introducing the affected instruction. If the affected instruction -indicates subfunctions via a "query subfunction", the response block is -contained in the returned struct. If the affected instruction -indicates subfunctions via a "test bit" mechanism, the subfunction codes are -contained in the returned struct in MSB 0 bit numbering. - -struct kvm_s390_vm_cpu_subfunc { - u8 plo[32]; # always valid (ESA/390 feature) - u8 ptff[16]; # valid with TOD-clock steering - u8 kmac[16]; # valid with Message-Security-Assist - u8 kmc[16]; # valid with Message-Security-Assist - u8 km[16]; # valid with Message-Security-Assist - u8 kimd[16]; # valid with Message-Security-Assist - u8 klmd[16]; # valid with Message-Security-Assist - u8 pckmo[16]; # valid with Message-Security-Assist-Extension 3 - u8 kmctr[16]; # valid with Message-Security-Assist-Extension 4 - u8 kmf[16]; # valid with Message-Security-Assist-Extension 4 - u8 kmo[16]; # valid with Message-Security-Assist-Extension 4 - u8 pcc[16]; # valid with Message-Security-Assist-Extension 4 - u8 ppno[16]; # valid with Message-Security-Assist-Extension 5 - u8 kma[16]; # valid with Message-Security-Assist-Extension 8 - u8 kdsa[16]; # valid with Message-Security-Assist-Extension 9 - u8 reserved[1792]; # reserved for future instructions -}; - -Parameters: address of a buffer to load the subfunction blocks from. -Returns: -EFAULT if the given address is not accessible from kernel space. - 0 in case of success. - -2.6. ATTRIBUTE: KVM_S390_VM_CPU_PROCESSOR_SUBFUNC (r/w) - -Allows user space to retrieve or change cpu subfunctions to be indicated for -all VCPUs of a VM. This attribute will only be available if kernel and -hardware support are in place. - -The kernel uses the configured subfunction blocks for indication to -the guest. 
A subfunction block will only be used if the associated STFL(E) bit -has not been disabled by user space (so the instruction to be queried is -actually available for the guest). - -As long as no data has been written, a read will fail. The IBC will be used -to determine available subfunctions in this case, this will guarantee backward -compatibility. - -See 2.5. for a description of the parameter struct. - -Parameters: address of a buffer to store/load the subfunction blocks from. -Returns: -EFAULT if the given address is not accessible from kernel space. - -EINVAL when reading, if there was no write yet. - -EBUSY if at least one VCPU has already been defined. - 0 in case of success. - -3. GROUP: KVM_S390_VM_TOD -Architectures: s390 - -3.1. ATTRIBUTE: KVM_S390_VM_TOD_HIGH - -Allows user space to set/get the TOD clock extension (u8) (superseded by -KVM_S390_VM_TOD_EXT). - -Parameters: address of a buffer in user space to store the data (u8) to -Returns: -EFAULT if the given address is not accessible from kernel space - -EINVAL if setting the TOD clock extension to != 0 is not supported - -3.2. ATTRIBUTE: KVM_S390_VM_TOD_LOW - -Allows user space to set/get bits 0-63 of the TOD clock register as defined in -the POP (u64). - -Parameters: address of a buffer in user space to store the data (u64) to -Returns: -EFAULT if the given address is not accessible from kernel space - -3.3. ATTRIBUTE: KVM_S390_VM_TOD_EXT -Allows user space to set/get bits 0-63 of the TOD clock register as defined in -the POP (u64). If the guest CPU model supports the TOD clock extension (u8), it -also allows user space to get/set it. If the guest CPU model does not support -it, it is stored as 0 and not allowed to be set to a value != 0. - -Parameters: address of a buffer in user space to store the data - (kvm_s390_vm_tod_clock) to -Returns: -EFAULT if the given address is not accessible from kernel space - -EINVAL if setting the TOD clock extension to != 0 is not supported - -4. 
GROUP: KVM_S390_VM_CRYPTO -Architectures: s390 - -4.1. ATTRIBUTE: KVM_S390_VM_CRYPTO_ENABLE_AES_KW (w/o) - -Allows user space to enable aes key wrapping, including generating a new -wrapping key. - -Parameters: none -Returns: 0 - -4.2. ATTRIBUTE: KVM_S390_VM_CRYPTO_ENABLE_DEA_KW (w/o) - -Allows user space to enable dea key wrapping, including generating a new -wrapping key. - -Parameters: none -Returns: 0 - -4.3. ATTRIBUTE: KVM_S390_VM_CRYPTO_DISABLE_AES_KW (w/o) - -Allows user space to disable aes key wrapping, clearing the wrapping key. - -Parameters: none -Returns: 0 - -4.4. ATTRIBUTE: KVM_S390_VM_CRYPTO_DISABLE_DEA_KW (w/o) - -Allows user space to disable dea key wrapping, clearing the wrapping key. - -Parameters: none -Returns: 0 - -5. GROUP: KVM_S390_VM_MIGRATION -Architectures: s390 - -5.1. ATTRIBUTE: KVM_S390_VM_MIGRATION_STOP (w/o) - -Allows userspace to stop migration mode, needed for PGSTE migration. -Setting this attribute when migration mode is not active will have no -effects. - -Parameters: none -Returns: 0 - -5.2. ATTRIBUTE: KVM_S390_VM_MIGRATION_START (w/o) - -Allows userspace to start migration mode, needed for PGSTE migration. -Setting this attribute when migration mode is already active will have -no effects. - -Parameters: none -Returns: -ENOMEM if there is not enough free memory to start migration mode - -EINVAL if the state of the VM is invalid (e.g. no memory defined) - 0 in case of success. - -5.3. ATTRIBUTE: KVM_S390_VM_MIGRATION_STATUS (r/o) - -Allows userspace to query the status of migration mode. - -Parameters: address of a buffer in user space to store the data (u64) to; - the data itself is either 0 if migration mode is disabled or 1 - if it is enabled -Returns: -EFAULT if the given address is not accessible from kernel space - 0 in case of success. 
diff --git a/Documentation/virtual/kvm/devices/xics.txt b/Documentation/virtual/kvm/devices/xics.txt deleted file mode 100644 index 42864935ac5d..000000000000 --- a/Documentation/virtual/kvm/devices/xics.txt +++ /dev/null @@ -1,66 +0,0 @@ -XICS interrupt controller - -Device type supported: KVM_DEV_TYPE_XICS - -Groups: - KVM_DEV_XICS_SOURCES - Attributes: One per interrupt source, indexed by the source number. - -This device emulates the XICS (eXternal Interrupt Controller -Specification) defined in PAPR. The XICS has a set of interrupt -sources, each identified by a 20-bit source number, and a set of -Interrupt Control Presentation (ICP) entities, also called "servers", -each associated with a virtual CPU. - -The ICP entities are created by enabling the KVM_CAP_IRQ_ARCH -capability for each vcpu, specifying KVM_CAP_IRQ_XICS in args[0] and -the interrupt server number (i.e. the vcpu number from the XICS's -point of view) in args[1] of the kvm_enable_cap struct. Each ICP has -64 bits of state which can be read and written using the -KVM_GET_ONE_REG and KVM_SET_ONE_REG ioctls on the vcpu. The 64 bit -state word has the following bitfields, starting at the -least-significant end of the word: - -* Unused, 16 bits - -* Pending interrupt priority, 8 bits - Zero is the highest priority, 255 means no interrupt is pending. - -* Pending IPI (inter-processor interrupt) priority, 8 bits - Zero is the highest priority, 255 means no IPI is pending. - -* Pending interrupt source number, 24 bits - Zero means no interrupt pending, 2 means an IPI is pending - -* Current processor priority, 8 bits - Zero is the highest priority, meaning no interrupts can be - delivered, and 255 is the lowest priority. - -Each source has 64 bits of state that can be read and written using -the KVM_GET_DEVICE_ATTR and KVM_SET_DEVICE_ATTR ioctls, specifying the -KVM_DEV_XICS_SOURCES attribute group, with the attribute number being -the interrupt source number. 
The 64 bit state word has the following -bitfields, starting from the least-significant end of the word: - -* Destination (server number), 32 bits - This specifies where the interrupt should be sent, and is the - interrupt server number specified for the destination vcpu. - -* Priority, 8 bits - This is the priority specified for this interrupt source, where 0 is - the highest priority and 255 is the lowest. An interrupt with a - priority of 255 will never be delivered. - -* Level sensitive flag, 1 bit - This bit is 1 for a level-sensitive interrupt source, or 0 for - edge-sensitive (or MSI). - -* Masked flag, 1 bit - This bit is set to 1 if the interrupt is masked (cannot be delivered - regardless of its priority), for example by the ibm,int-off RTAS - call, or 0 if it is not masked. - -* Pending flag, 1 bit - This bit is 1 if the source has a pending interrupt, otherwise 0. - -Only one XICS instance may be created per VM. diff --git a/Documentation/virtual/kvm/devices/xive.txt b/Documentation/virtual/kvm/devices/xive.txt deleted file mode 100644 index 9a24a4525253..000000000000 --- a/Documentation/virtual/kvm/devices/xive.txt +++ /dev/null @@ -1,197 +0,0 @@ -POWER9 eXternal Interrupt Virtualization Engine (XIVE Gen1) -========================================================== - -Device types supported: - KVM_DEV_TYPE_XIVE POWER9 XIVE Interrupt Controller generation 1 - -This device acts as a VM interrupt controller. It provides the KVM -interface to configure the interrupt sources of a VM in the underlying -POWER9 XIVE interrupt controller. - -Only one XIVE instance may be instantiated. A guest XIVE device -requires a POWER9 host and the guest OS should have support for the -XIVE native exploitation interrupt mode. If not, it should run using -the legacy interrupt mode, referred as XICS (POWER7/8). - -* Device Mappings - - The KVM device exposes different MMIO ranges of the XIVE HW which - are required for interrupt management. 
These are exposed to the - guest in VMAs populated with a custom VM fault handler. - - 1. Thread Interrupt Management Area (TIMA) - - Each thread has an associated Thread Interrupt Management context - composed of a set of registers. These registers let the thread - handle priority management and interrupt acknowledgment. The most - important are : - - - Interrupt Pending Buffer (IPB) - - Current Processor Priority (CPPR) - - Notification Source Register (NSR) - - They are exposed to software in four different pages each proposing - a view with a different privilege. The first page is for the - physical thread context and the second for the hypervisor. Only the - third (operating system) and the fourth (user level) are exposed to the - guest. - - 2. Event State Buffer (ESB) - - Each source is associated with an Event State Buffer (ESB) with - a pair of even/odd pages which provides commands to - manage the source: to trigger, to EOI, to turn off the source for - instance. - - 3. Device pass-through - - When a device is passed-through into the guest, the source - interrupts are from a different HW controller (PHB4) and the ESB - pages exposed to the guest should accommodate this change. - - The passthru_irq helpers, kvmppc_xive_set_mapped() and - kvmppc_xive_clr_mapped() are called when the device HW irqs are - mapped into or unmapped from the guest IRQ number space. The KVM - device extends these helpers to clear the ESB pages of the guest IRQ - number being mapped and then lets the VM fault handler repopulate. - The handler will insert the ESB page corresponding to the HW - interrupt of the device being passed-through or the initial IPI ESB - page if the device has been removed. - - The ESB remapping is fully transparent to the guest and the OS - device driver. All handling is done within VFIO and the above - helpers in KVM-PPC. - -* Groups: - - 1.
KVM_DEV_XIVE_GRP_CTRL - Provides global controls on the device - Attributes: - 1.1 KVM_DEV_XIVE_RESET (write only) - Resets the interrupt controller configuration for sources and event - queues. To be used by kexec and kdump. - Errors: none - - 1.2 KVM_DEV_XIVE_EQ_SYNC (write only) - Sync all the sources and queues and mark the EQ pages dirty. This - to make sure that a consistent memory state is captured when - migrating the VM. - Errors: none - - 2. KVM_DEV_XIVE_GRP_SOURCE (write only) - Initializes a new source in the XIVE device and mask it. - Attributes: - Interrupt source number (64-bit) - The kvm_device_attr.addr points to a __u64 value: - bits: | 63 .... 2 | 1 | 0 - values: | unused | level | type - - type: 0:MSI 1:LSI - - level: assertion level in case of an LSI. - Errors: - -E2BIG: Interrupt source number is out of range - -ENOMEM: Could not create a new source block - -EFAULT: Invalid user pointer for attr->addr. - -ENXIO: Could not allocate underlying HW interrupt - - 3. KVM_DEV_XIVE_GRP_SOURCE_CONFIG (write only) - Configures source targeting - Attributes: - Interrupt source number (64-bit) - The kvm_device_attr.addr points to a __u64 value: - bits: | 63 .... 33 | 32 | 31 .. 3 | 2 .. 0 - values: | eisn | mask | server | priority - - priority: 0-7 interrupt priority level - - server: CPU number chosen to handle the interrupt - - mask: mask flag (unused) - - eisn: Effective Interrupt Source Number - Errors: - -ENOENT: Unknown source number - -EINVAL: Not initialized source number - -EINVAL: Invalid priority - -EINVAL: Invalid CPU number. - -EFAULT: Invalid user pointer for attr->addr. - -ENXIO: CPU event queues not configured or configuration of the - underlying HW interrupt failed - -EBUSY: No CPU available to serve interrupt - - 4. KVM_DEV_XIVE_GRP_EQ_CONFIG (read-write) - Configures an event queue of a CPU - Attributes: - EQ descriptor identifier (64-bit) - The EQ descriptor identifier is a tuple (server, priority) : - bits: | 63 .... 32 | 31 .. 
3 | 2 .. 0 - values: | unused | server | priority - The kvm_device_attr.addr points to : - struct kvm_ppc_xive_eq { - __u32 flags; - __u32 qshift; - __u64 qaddr; - __u32 qtoggle; - __u32 qindex; - __u8 pad[40]; - }; - - flags: queue flags - KVM_XIVE_EQ_ALWAYS_NOTIFY (required) - forces notification without using the coalescing mechanism - provided by the XIVE END ESBs. - - qshift: queue size (power of 2) - - qaddr: real address of queue - - qtoggle: current queue toggle bit - - qindex: current queue index - - pad: reserved for future use - Errors: - -ENOENT: Invalid CPU number - -EINVAL: Invalid priority - -EINVAL: Invalid flags - -EINVAL: Invalid queue size - -EINVAL: Invalid queue address - -EFAULT: Invalid user pointer for attr->addr. - -EIO: Configuration of the underlying HW failed - - 5. KVM_DEV_XIVE_GRP_SOURCE_SYNC (write only) - Synchronize the source to flush event notifications - Attributes: - Interrupt source number (64-bit) - Errors: - -ENOENT: Unknown source number - -EINVAL: Not initialized source number - -* VCPU state - - The XIVE IC maintains VP interrupt state in an internal structure - called the NVT. When a VP is not dispatched on a HW processor - thread, this structure can be updated by HW if the VP is the target - of an event notification. - - It is important for migration to capture the cached IPB from the NVT - as it synthesizes the priorities of the pending interrupts. We - capture a bit more to report debug information. - - KVM_REG_PPC_VP_STATE (2 * 64bits) - bits: | 63 .... 32 | 31 .... 0 | - values: | TIMA word0 | TIMA word1 | - bits: | 127 .......... 64 | - values: | unused | - -* Migration: - - Saving the state of a VM using the XIVE native exploitation mode - should follow a specific sequence. When the VM is stopped : - - 1. Mask all sources (PQ=01) to stop the flow of events. - - 2. Sync the XIVE device with the KVM control KVM_DEV_XIVE_EQ_SYNC to - flush any in-flight event notification and to stabilize the EQs. 
At - this stage, the EQ pages are marked dirty to make sure they are - transferred in the migration sequence. - - 3. Capture the state of the source targeting, the EQs configuration - and the state of thread interrupt context registers. - - Restore is similar : - - 1. Restore the EQ configuration, as targeting depends on it. - 2. Restore targeting - 3. Restore the thread interrupt contexts - 4. Restore the source states - 5. Let the vCPU run diff --git a/Documentation/virtual/kvm/halt-polling.txt b/Documentation/virtual/kvm/halt-polling.txt deleted file mode 100644 index 4f791b128dd2..000000000000 --- a/Documentation/virtual/kvm/halt-polling.txt +++ /dev/null @@ -1,136 +0,0 @@ -The KVM halt polling system -=========================== - -The KVM halt polling system provides a feature within KVM whereby the latency -of a guest can, under some circumstances, be reduced by polling in the host -for some time period after the guest has elected to no longer run by ceding. -That is, when a guest vcpu has ceded, or in the case of powerpc when all of the -vcpus of a single vcore have ceded, the host kernel polls for wakeup conditions -before giving up the cpu to the scheduler in order to let something else run. - -Polling provides a latency advantage in cases where the guest can be run again -very quickly by at least saving us a trip through the scheduler, normally on -the order of a few micro-seconds, although performance benefits are workload -dependent. In the event that no wakeup source arrives during the polling -interval or some other task on the runqueue is runnable the scheduler is -invoked. Thus halt polling is especially useful on workloads with very short -wakeup periods where the time spent halt polling is minimised and the time -savings of not invoking the scheduler are distinguishable.
- -The generic halt polling code is implemented in: - - virt/kvm/kvm_main.c: kvm_vcpu_block() - -The powerpc kvm-hv specific case is implemented in: - - arch/powerpc/kvm/book3s_hv.c: kvmppc_vcore_blocked() - -Halt Polling Interval -===================== - -The maximum time for which to poll before invoking the scheduler, referred to -as the halt polling interval, is increased and decreased based on the perceived -effectiveness of the polling in an attempt to limit pointless polling. -This value is stored in either the vcpu struct: - - kvm_vcpu->halt_poll_ns - -or in the case of powerpc kvm-hv, in the vcore struct: - - kvmppc_vcore->halt_poll_ns - -Thus this is a per vcpu (or vcore) value. - -During polling if a wakeup source is received within the halt polling interval, -the interval is left unchanged. In the event that a wakeup source isn't -received during the polling interval (and thus schedule is invoked) there are -two options, either the polling interval and total block time[0] were less than -the global max polling interval (see module params below), or the total block -time was greater than the global max polling interval. - -In the event that both the polling interval and total block time were less than -the global max polling interval then the polling interval can be increased in -the hope that next time during the longer polling interval the wake up source -will be received while the host is polling and the latency benefits will be -received. The polling interval is grown in the function grow_halt_poll_ns() and -is multiplied by the module parameters halt_poll_ns_grow and -halt_poll_ns_grow_start. - -In the event that the total block time was greater than the global max polling -interval then the host will never poll for long enough (limited by the global -max) to wakeup during the polling interval so it may as well be shrunk in order -to avoid pointless polling. 
The polling interval is shrunk in the function -shrink_halt_poll_ns() and is divided by the module parameter -halt_poll_ns_shrink, or set to 0 iff halt_poll_ns_shrink == 0. - -It is worth noting that this adjustment process attempts to hone in on some -steady state polling interval but will only really do a good job for wakeups -which come at an approximately constant rate, otherwise there will be constant -adjustment of the polling interval. - -[0] total block time: the time between when the halt polling function is - invoked and a wakeup source received (irrespective of - whether the scheduler is invoked within that function). - -Module Parameters -================= - -The kvm module has 3 tuneable module parameters to adjust the global max -polling interval as well as the rate at which the polling interval is grown and -shrunk. These variables are defined in include/linux/kvm_host.h and as module -parameters in virt/kvm/kvm_main.c, or arch/powerpc/kvm/book3s_hv.c in the -powerpc kvm-hv case. - -Module Parameter | Description | Default Value --------------------------------------------------------------------------------- -halt_poll_ns | The global max polling | KVM_HALT_POLL_NS_DEFAULT - | interval which defines | - | the ceiling value of the | - | polling interval for | (per arch value) - | each vcpu. | --------------------------------------------------------------------------------- -halt_poll_ns_grow | The value by which the | 2 - | halt polling interval is | - | multiplied in the | - | grow_halt_poll_ns() | - | function. | --------------------------------------------------------------------------------- -halt_poll_ns_grow_start | The initial value to grow | 10000 - | to from zero in the | - | grow_halt_poll_ns() | - | function. | --------------------------------------------------------------------------------- -halt_poll_ns_shrink | The value by which the | 0 - | halt polling interval is | - | divided in the | - | shrink_halt_poll_ns() | - | function. 
| --------------------------------------------------------------------------------- -These module parameters can be set from the sysfs files in: - - /sys/module/kvm/parameters/ - -Note that these module parameters are system wide values and are not able to - be tuned on a per vm basis. - -Further Notes -============= - -- Care should be taken when setting the halt_poll_ns module parameter as a -large value has the potential to drive the cpu usage to 100% on a machine which -would be almost entirely idle otherwise. This is because even if a guest has -wakeups during which very little work is done and which are quite far apart, if -the period is shorter than the global max polling interval (halt_poll_ns) then -the host will always poll for the entire block time and thus cpu utilisation -will go to 100%. - -- Halt polling essentially presents a trade off between power usage and latency -and the module parameters should be used to tune the affinity for this. Idle -cpu time is essentially converted to host kernel time with the aim of decreasing -latency when entering the guest. - -- Halt polling will only be conducted by the host when no other tasks are -runnable on that cpu, otherwise the polling will cease immediately and -schedule will be invoked to allow that other task to run. Thus this doesn't -allow a guest to mount a denial of service on the cpu. diff --git a/Documentation/virtual/kvm/hypercalls.txt b/Documentation/virtual/kvm/hypercalls.txt deleted file mode 100644 index da210651f714..000000000000 --- a/Documentation/virtual/kvm/hypercalls.txt +++ /dev/null @@ -1,154 +0,0 @@ -Linux KVM Hypercall: -=================== -X86: - KVM Hypercalls have a three-byte sequence of either the vmcall or the vmmcall - instruction. The hypervisor can replace it with instructions that are - guaranteed to be supported. - - Up to four arguments may be passed in rbx, rcx, rdx, and rsi respectively.
- The hypercall number should be placed in rax and the return value will be - placed in rax. No other registers will be clobbered unless explicitly stated - by the particular hypercall. - -S390: - R2-R7 are used for parameters 1-6. In addition, R1 is used for hypercall - number. The return value is written to R2. - - S390 uses diagnose instruction as hypercall (0x500) along with hypercall - number in R1. - - For further information on the S390 diagnose call as supported by KVM, - refer to Documentation/virtual/kvm/s390-diag.txt. - - PowerPC: - It uses R3-R10 and hypercall number in R11. R4-R11 are used as output registers. - Return value is placed in R3. - - KVM hypercalls uses 4 byte opcode, that are patched with 'hypercall-instructions' - property inside the device tree's /hypervisor node. - For more information refer to Documentation/virtual/kvm/ppc-pv.txt - -MIPS: - KVM hypercalls use the HYPCALL instruction with code 0 and the hypercall - number in $2 (v0). Up to four arguments may be placed in $4-$7 (a0-a3) and - the return value is placed in $2 (v0). - -KVM Hypercalls Documentation -=========================== -The template for each hypercall is: -1. Hypercall name. -2. Architecture(s) -3. Status (deprecated, obsolete, active) -4. Purpose - -1. KVM_HC_VAPIC_POLL_IRQ ------------------------- -Architecture: x86 -Status: active -Purpose: Trigger guest exit so that the host can check for pending -interrupts on reentry. - -2. KVM_HC_MMU_OP ------------------------- -Architecture: x86 -Status: deprecated. -Purpose: Support MMU operations such as writing to PTE, -flushing TLB, release PT. - -3. KVM_HC_FEATURES ------------------------- -Architecture: PPC -Status: active -Purpose: Expose hypercall availability to the guest. On x86 platforms, cpuid -used to enumerate which hypercalls are available. On PPC, either device tree -based lookup ( which is also what EPAPR dictates) OR KVM specific enumeration -mechanism (which is this hypercall) can be used. - -4. 
KVM_HC_PPC_MAP_MAGIC_PAGE ------------------------- -Architecture: PPC -Status: active -Purpose: To enable communication between the hypervisor and guest there is a -shared page that contains parts of supervisor visible register state. -The guest can map this shared page to access its supervisor register through -memory using this hypercall. - -5. KVM_HC_KICK_CPU ------------------------- -Architecture: x86 -Status: active -Purpose: Hypercall used to wake up a vcpu from HLT state -Usage example : A vcpu of a paravirtualized guest that is busywaiting in guest -kernel mode for an event to occur (ex: a spinlock to become available) can -execute HLT instruction once it has busy-waited for more than a threshold -time-interval. Execution of HLT instruction would cause the hypervisor to put -the vcpu to sleep until occurrence of an appropriate event. Another vcpu of the -same guest can wake up the sleeping vcpu by issuing KVM_HC_KICK_CPU hypercall, -specifying APIC ID (a1) of the vcpu to be woken up. An additional argument (a0) -is used in the hypercall for future use. - - -6. KVM_HC_CLOCK_PAIRING ------------------------- -Architecture: x86 -Status: active -Purpose: Hypercall used to synchronize host and guest clocks. -Usage: - -a0: guest physical address where host copies -"struct kvm_clock_pairing" structure. - -a1: clock_type, ATM only KVM_CLOCK_PAIRING_WALLCLOCK (0) -is supported (corresponding to the host's CLOCK_REALTIME clock). - - struct kvm_clock_pairing { - __s64 sec; - __s64 nsec; - __u64 tsc; - __u32 flags; - __u32 pad[9]; - }; - - Where: - * sec: seconds from clock_type clock. - * nsec: nanoseconds from clock_type clock. - * tsc: guest TSC value used to calculate sec/nsec pair - * flags: flags, unused (0) at the moment. - -The hypercall lets a guest compute a precise timestamp across -host and guest. The guest can use the returned TSC value to -compute the CLOCK_REALTIME for its clock, at the same instant.
- -Returns KVM_EOPNOTSUPP if the host does not use TSC clocksource, -or if clock type is different than KVM_CLOCK_PAIRING_WALLCLOCK. - -7. KVM_HC_SEND_IPI ------------------------- -Architecture: x86 -Status: active -Purpose: Send IPIs to multiple vCPUs. - -a0: lower part of the bitmap of destination APIC IDs -a1: higher part of the bitmap of destination APIC IDs -a2: the lowest APIC ID in bitmap -a3: APIC ICR - -The hypercall lets a guest send multicast IPIs, with at most 128 -destinations per hypercall in 64-bit mode and 64 vCPUs per -hypercall in 32-bit mode. The destinations are represented by a -bitmap contained in the first two arguments (a0 and a1). Bit 0 of -a0 corresponds to the APIC ID in the third argument (a2), bit 1 -corresponds to the APIC ID a2+1, and so on. - -Returns the number of CPUs to which the IPIs were delivered successfully. - -8. KVM_HC_SCHED_YIELD ------------------------- -Architecture: x86 -Status: active -Purpose: Hypercall used to yield if the IPI target vCPU is preempted - -a0: destination APIC ID - -Usage example: When sending a call-function IPI-many to vCPUs, yield if -any of the IPI target vCPUs was preempted. diff --git a/Documentation/virtual/kvm/index.rst b/Documentation/virtual/kvm/index.rst deleted file mode 100644 index 0b206a06f5be..000000000000 --- a/Documentation/virtual/kvm/index.rst +++ /dev/null @@ -1,11 +0,0 @@ -.. SPDX-License-Identifier: GPL-2.0 - -=== -KVM -=== - -.. toctree:: - :maxdepth: 2 - - amd-memory-encryption - cpuid diff --git a/Documentation/virtual/kvm/locking.txt b/Documentation/virtual/kvm/locking.txt deleted file mode 100644 index 635cd6eaf714..000000000000 --- a/Documentation/virtual/kvm/locking.txt +++ /dev/null @@ -1,215 +0,0 @@ -KVM Lock Overview -================= - -1.
Acquisition Orders ---------------------- - -The acquisition orders for mutexes are as follows: - -- kvm->lock is taken outside vcpu->mutex - -- kvm->lock is taken outside kvm->slots_lock and kvm->irq_lock - -- kvm->slots_lock is taken outside kvm->irq_lock, though acquiring - them together is quite rare. - -On x86, vcpu->mutex is taken outside kvm->arch.hyperv.hv_lock. - -Everything else is a leaf: no other lock is taken inside the critical -sections. - -2: Exception ------------- - -Fast page fault: - -Fast page fault is the fast path which fixes the guest page fault out of -the mmu-lock on x86. Currently, the page fault can be fast in one of the -following two cases: - -1. Access Tracking: The SPTE is not present, but it is marked for access -tracking i.e. the SPTE_SPECIAL_MASK is set. That means we need to -restore the saved R/X bits. This is described in more detail later below. - -2. Write-Protection: The SPTE is present and the fault is -caused by write-protect. That means we just need to change the W bit of the -spte. - -What we use to avoid all the race is the SPTE_HOST_WRITEABLE bit and -SPTE_MMU_WRITEABLE bit on the spte: -- SPTE_HOST_WRITEABLE means the gfn is writable on host. -- SPTE_MMU_WRITEABLE means the gfn is writable on mmu. The bit is set when - the gfn is writable on guest mmu and it is not write-protected by shadow - page write-protection. - -On fast page fault path, we will use cmpxchg to atomically set the spte W -bit if spte.SPTE_HOST_WRITEABLE = 1 and spte.SPTE_WRITE_PROTECT = 1, or -restore the saved R/X bits if VMX_EPT_TRACK_ACCESS mask is set, or both. This -is safe because whenever changing these bits can be detected by cmpxchg. - -But we need carefully check these cases: -1): The mapping from gfn to pfn -The mapping from gfn to pfn may be changed since we can only ensure the pfn -is not changed during cmpxchg. 
This is an ABA problem; for example, the case below -can happen: - -At the beginning: -gpte = gfn1 -gfn1 is mapped to pfn1 on host -spte is the shadow page table entry corresponding with gpte and -spte = pfn1 - - VCPU 0 VCPU 1 -on fast page fault path: - - old_spte = *spte; - pfn1 is swapped out: - spte = 0; - - pfn1 is re-alloced for gfn2. - - gpte is changed to point to - gfn2 by the guest: - spte = pfn1; - - if (cmpxchg(spte, old_spte, old_spte+W) - mark_page_dirty(vcpu->kvm, gfn1) - OOPS!!! - -We dirty-log for gfn1, that means gfn2 is lost in dirty-bitmap. - -For direct sp, we can easily avoid it since the spte of direct sp is fixed -to gfn. For indirect sp, before we do cmpxchg, we call gfn_to_pfn_atomic() -to pin gfn to pfn, because after gfn_to_pfn_atomic(): -- We have held the refcount of pfn that means the pfn can not be freed and - be reused for another gfn. -- The pfn is writable that means it can not be shared between different gfns - by KSM. - -Then, we can ensure the dirty bitmap is correctly set for a gfn. - -Currently, to simplify things, we disable fast page fault for -indirect shadow page. - -2): Dirty bit tracking -In the original code, the spte can be fast updated (non-atomically) if the -spte is read-only and the Accessed bit has already been set since the -Accessed bit and Dirty bit can not be lost. - -But it is not true after fast page fault since the spte can be marked -writable between reading spte and updating spte. Consider the case below: - -At the beginning: -spte.W = 0 -spte.Accessed = 1 - - VCPU 0 VCPU 1 -In mmu_spte_clear_track_bits(): - - old_spte = *spte; - - /* 'if' condition is satisfied. */ - if (old_spte.Accessed == 1 && - old_spte.W == 0) - spte = 0ull; - on fast page fault path: - spte.W = 1 - memory write on the spte: - spte.Dirty = 1 - - - else - old_spte = xchg(spte, 0ull) - - - if (old_spte.Accessed == 1) - kvm_set_pfn_accessed(spte.pfn); - if (old_spte.Dirty == 1) - kvm_set_pfn_dirty(spte.pfn); - OOPS!!!
- -The Dirty bit is lost in this case. - -In order to avoid this kind of issue, we always treat the spte as "volatile" -if it can be updated out of mmu-lock, see spte_has_volatile_bits(), it means, -the spte is always atomically updated in this case. - -3): flush tlbs due to spte updated -If the spte is updated from writable to readonly, we should flush all TLBs, -otherwise rmap_write_protect will find a read-only spte, even though the -writable spte might be cached on a CPU's TLB. - -As mentioned before, the spte can be updated to writable out of mmu-lock on -fast page fault path, in order to easily audit the path, we see if TLBs need -be flushed caused by this reason in mmu_spte_update() since this is a common -function to update spte (present -> present). - -Since the spte is "volatile" if it can be updated out of mmu-lock, we always -atomically update the spte, the race caused by fast page fault can be avoided, -See the comments in spte_has_volatile_bits() and mmu_spte_update(). - -Lockless Access Tracking: - -This is used for Intel CPUs that are using EPT but do not support the EPT A/D -bits. In this case, when the KVM MMU notifier is called to track accesses to a -page (via kvm_mmu_notifier_clear_flush_young), it marks the PTE as not-present -by clearing the RWX bits in the PTE and storing the original R & X bits in -some unused/ignored bits. In addition, the SPTE_SPECIAL_MASK is also set on the -PTE (using the ignored bit 62). When the VM tries to access the page later on, -a fault is generated and the fast page fault mechanism described above is used -to atomically restore the PTE to a Present state. The W bit is not saved when -the PTE is marked for access tracking and during restoration to the Present -state, the W bit is set depending on whether or not it was a write access. If -it wasn't, then the W bit will remain clear until a write access happens, at -which time it will be set using the Dirty tracking mechanism described above. - -3. 
Reference ------------- - -Name: kvm_lock -Type: mutex -Arch: any -Protects: - vm_list - -Name: kvm_count_lock -Type: raw_spinlock_t -Arch: any -Protects: - hardware virtualization enable/disable -Comment: 'raw' because hardware enabling/disabling must be atomic /wrt - migration. - -Name: kvm_arch::tsc_write_lock -Type: raw_spinlock -Arch: x86 -Protects: - kvm_arch::{last_tsc_write,last_tsc_nsec,last_tsc_offset} - - tsc offset in vmcb -Comment: 'raw' because updating the tsc offsets must not be preempted. - -Name: kvm->mmu_lock -Type: spinlock_t -Arch: any -Protects: -shadow page/shadow tlb entry -Comment: it is a spinlock since it is used in mmu notifier. - -Name: kvm->srcu -Type: srcu lock -Arch: any -Protects: - kvm->memslots - - kvm->buses -Comment: The srcu read lock must be held while accessing memslots (e.g. - when using gfn_to_* functions) and while accessing in-kernel - MMIO/PIO address->device structure mapping (kvm->buses). - The srcu index can be stored in kvm_vcpu->srcu_idx per vcpu - if it is needed by multiple functions. - -Name: blocked_vcpu_on_cpu_lock -Type: spinlock_t -Arch: x86 -Protects: blocked_vcpu_on_cpu -Comment: This is a per-CPU lock and it is used for VT-d posted-interrupts. - When VT-d posted-interrupts is supported and the VM has assigned - devices, we put the blocked vCPU on the list blocked_vcpu_on_cpu - protected by blocked_vcpu_on_cpu_lock, when VT-d hardware issues - wakeup notification event since external interrupts from the - assigned devices happens, we will find the vCPU on the list to - wakeup. 
diff --git a/Documentation/virtual/kvm/mmu.txt b/Documentation/virtual/kvm/mmu.txt deleted file mode 100644 index 2efe0efc516e..000000000000 --- a/Documentation/virtual/kvm/mmu.txt +++ /dev/null @@ -1,449 +0,0 @@ -The x86 kvm shadow mmu -====================== - -The mmu (in arch/x86/kvm, files mmu.[ch] and paging_tmpl.h) is responsible -for presenting a standard x86 mmu to the guest, while translating guest -physical addresses to host physical addresses. - -The mmu code attempts to satisfy the following requirements: - -- correctness: the guest should not be able to determine that it is running - on an emulated mmu except for timing (we attempt to comply - with the specification, not emulate the characteristics of - a particular implementation such as tlb size) -- security: the guest must not be able to touch host memory not assigned - to it -- performance: minimize the performance penalty imposed by the mmu -- scaling: need to scale to large memory and large vcpu guests -- hardware: support the full range of x86 virtualization hardware -- integration: Linux memory management code must be in control of guest memory - so that swapping, page migration, page merging, transparent - hugepages, and similar features work without change -- dirty tracking: report writes to guest memory to enable live migration - and framebuffer-based displays -- footprint: keep the amount of pinned kernel memory low (most memory - should be shrinkable) -- reliability: avoid multipage or GFP_ATOMIC allocations - -Acronyms -======== - -pfn host page frame number -hpa host physical address -hva host virtual address -gfn guest frame number -gpa guest physical address -gva guest virtual address -ngpa nested guest physical address -ngva nested guest virtual address -pte page table entry (used also to refer generically to paging structure - entries) -gpte guest pte (referring to gfns) -spte shadow pte (referring to pfns) -tdp two dimensional paging (vendor neutral term for NPT and EPT) - -Virtual 
and real hardware supported -=================================== - -The mmu supports first-generation mmu hardware, which allows an atomic switch -of the current paging mode and cr3 during guest entry, as well as -two-dimensional paging (AMD's NPT and Intel's EPT). The emulated hardware -it exposes is the traditional 2/3/4 level x86 mmu, with support for global -pages, pae, pse, pse36, cr0.wp, and 1GB pages. The emulated hardware is also -able to expose NPT capable hardware on NPT capable hosts. - -Translation -=========== - -The primary job of the mmu is to program the processor's mmu to translate -addresses for the guest. Different translations are required at different -times: - -- when guest paging is disabled, we translate guest physical addresses to - host physical addresses (gpa->hpa) -- when guest paging is enabled, we translate guest virtual addresses, to - guest physical addresses, to host physical addresses (gva->gpa->hpa) -- when the guest launches a guest of its own, we translate nested guest - virtual addresses, to nested guest physical addresses, to guest physical - addresses, to host physical addresses (ngva->ngpa->gpa->hpa) - -The primary challenge is to encode between 1 and 3 translations into hardware -that supports only 1 (traditional) and 2 (tdp) translations. When the -number of required translations matches the hardware, the mmu operates in -direct mode; otherwise it operates in shadow mode (see below). - -Memory -====== - -Guest memory (gpa) is part of the user address space of the process that is -using kvm. Userspace defines the translation between guest addresses and user -addresses (gpa->hva); note that two gpas may alias to the same hva, but not -vice versa. - -These hvas may be backed using any method available to the host: anonymous -memory, file backed memory, and device memory. Memory might be paged by the -host at any time. - -Events -====== - -The mmu is driven by events, some from the guest, some from the host.
- -Guest generated events: -- writes to control registers (especially cr3) -- invlpg/invlpga instruction execution -- access to missing or protected translations - -Host generated events: -- changes in the gpa->hpa translation (either through gpa->hva changes or - through hva->hpa changes) -- memory pressure (the shrinker) - -Shadow pages -============ - -The principal data structure is the shadow page, 'struct kvm_mmu_page'. A -shadow page contains 512 sptes, which can be either leaf or nonleaf sptes. A -shadow page may contain a mix of leaf and nonleaf sptes. - -A nonleaf spte allows the hardware mmu to reach the leaf pages and -is not related to a translation directly. It points to other shadow pages. - -A leaf spte corresponds to either one or two translations encoded into -one paging structure entry. These are always the lowest level of the -translation stack, with optional higher level translations left to NPT/EPT. -Leaf ptes point at guest pages. - -The following table shows translations encoded by leaf ptes, with higher-level -translations in parentheses: - - Non-nested guests: - nonpaging: gpa->hpa - paging: gva->gpa->hpa - paging, tdp: (gva->)gpa->hpa - Nested guests: - non-tdp: ngva->gpa->hpa (*) - tdp: (ngva->)ngpa->gpa->hpa - -(*) the guest hypervisor will encode the ngva->gpa translation into its page - tables if npt is not present - -Shadow pages contain the following information: - role.level: - The level in the shadow paging hierarchy that this shadow page belongs to. - 1=4k sptes, 2=2M sptes, 3=1G sptes, etc. - role.direct: - If set, leaf sptes reachable from this page are for a linear range. - Examples include real mode translation, large guest pages backed by small - host pages, and gpa->hpa translations when NPT or EPT is active. 
- The linear range starts at (gfn << PAGE_SHIFT) and its size is determined - by role.level (2MB for first level, 1GB for second level, 0.5TB for third - level, 256TB for fourth level) - If clear, this page corresponds to a guest page table denoted by the gfn - field. - role.quadrant: - When role.gpte_is_8_bytes=0, the guest uses 32-bit gptes while the host uses 64-bit - sptes. That means a guest page table contains more ptes than the host, - so multiple shadow pages are needed to shadow one guest page. - For first-level shadow pages, role.quadrant can be 0 or 1 and denotes the - first or second 512-gpte block in the guest page table. For second-level - page tables, each 32-bit gpte is converted to two 64-bit sptes - (since each first-level guest page is shadowed by two first-level - shadow pages) so role.quadrant takes values in the range 0..3. Each - quadrant maps 1GB virtual address space. - role.access: - Inherited guest access permissions in the form uwx. Note execute - permission is positive, not negative. - role.invalid: - The page is invalid and should not be used. It is a root page that is - currently pinned (by a cpu hardware register pointing to it); once it is - unpinned it will be destroyed. - role.gpte_is_8_bytes: - Reflects the size of the guest PTE for which the page is valid, i.e. '1' - if 64-bit gptes are in use, '0' if 32-bit gptes are in use. - role.nxe: - Contains the value of efer.nxe for which the page is valid. - role.cr0_wp: - Contains the value of cr0.wp for which the page is valid. - role.smep_andnot_wp: - Contains the value of cr4.smep && !cr0.wp for which the page is valid - (pages for which this is true are different from other pages; see the - treatment of cr0.wp=0 below). - role.smap_andnot_wp: - Contains the value of cr4.smap && !cr0.wp for which the page is valid - (pages for which this is true are different from other pages; see the - treatment of cr0.wp=0 below). 
- role.ept_sp: - This is a virtual flag to denote a shadowed nested EPT page. ept_sp - is true if "cr0_wp && smap_andnot_wp", an otherwise invalid combination. - role.smm: - Is 1 if the page is valid in system management mode. This field - determines which of the kvm_memslots array was used to build this - shadow page; it is also used to go back from a struct kvm_mmu_page - to a memslot, through the kvm_memslots_for_spte_role macro and - __gfn_to_memslot. - role.ad_disabled: - Is 1 if the MMU instance cannot use A/D bits. EPT did not have A/D - bits before Haswell; shadow EPT page tables also cannot use A/D bits - if the L1 hypervisor does not enable them. - gfn: - Either the guest page table containing the translations shadowed by this - page, or the base page frame for linear translations. See role.direct. - spt: - A pageful of 64-bit sptes containing the translations for this page. - Accessed by both kvm and hardware. - The page pointed to by spt will have its page->private pointing back - at the shadow page structure. - sptes in spt point either at guest pages, or at lower-level shadow pages. - Specifically, if sp1 and sp2 are shadow pages, then sp1->spt[n] may point - at __pa(sp2->spt). sp2 will point back at sp1 through parent_pte. - The spt array forms a DAG structure with the shadow page as a node, and - guest pages as leaves. - gfns: - An array of 512 guest frame numbers, one for each present pte. Used to - perform a reverse map from a pte to a gfn. When role.direct is set, any - element of this array can be calculated from the gfn field when used, in - this case, the array of gfns is not allocated. See role.direct and gfn. - root_count: - A counter keeping track of how many hardware registers (guest cr3 or - pdptrs) are now pointing at the page. While this counter is nonzero, the - page cannot be destroyed. See role.invalid. - parent_ptes: - The reverse mapping for the pte/ptes pointing at this page's spt. 
If - parent_ptes bit 0 is zero, only one spte points at this page and - parent_ptes points at this single spte, otherwise, there exist multiple - sptes pointing at this page and (parent_ptes & ~0x1) points at a data - structure with a list of parent sptes. - unsync: - If true, then the translations in this page may not match the guest's - translation. This is equivalent to the state of the tlb when a pte is - changed but before the tlb entry is flushed. Accordingly, unsync ptes - are synchronized when the guest executes invlpg or flushes its tlb by - other means. Valid for leaf pages. - unsync_children: - How many sptes in the page point at pages that are unsync (or have - unsynchronized children). - unsync_child_bitmap: - A bitmap indicating which sptes in spt point (directly or indirectly) at - pages that may be unsynchronized. Used to quickly locate all unsynchronized - pages reachable from a given page. - clear_spte_count: - Only present on 32-bit hosts, where a 64-bit spte cannot be written - atomically. The reader uses this while running out of the MMU lock - to detect in-progress updates and retry them until the writer has - finished the write. - write_flooding_count: - A guest may write to a page table many times, causing a lot of - emulations if the page needs to be write-protected (see "Synchronized - and unsynchronized pages" below). Leaf pages can be unsynchronized - so that they do not trigger frequent emulation, but this is not - possible for non-leafs. This field counts the number of emulations - since the last time the page table was actually used; if emulation - is triggered too frequently on this page, KVM will unmap the page - to avoid emulation in the future. - -Reverse map -=========== - -The mmu maintains a reverse mapping whereby all ptes mapping a page can be -reached given its gfn. This is used, for example, when swapping out a page.
- -Synchronized and unsynchronized pages -===================================== - -The guest uses two events to synchronize its tlb and page tables: tlb flushes -and page invalidations (invlpg). - -A tlb flush means that we need to synchronize all sptes reachable from the -guest's cr3. This is expensive, so we keep all guest page tables write -protected, and synchronize sptes to gptes when a gpte is written. - -A special case is when a guest page table is reachable from the current -guest cr3. In this case, the guest is obliged to issue an invlpg instruction -before using the translation. We take advantage of that by removing write -protection from the guest page, and allowing the guest to modify it freely. -We synchronize modified gptes when the guest invokes invlpg. This reduces -the amount of emulation we have to do when the guest modifies multiple gptes, -or when a guest page is no longer used as a page table and is used for -random guest data. - -As a side effect we have to resynchronize all reachable unsynchronized shadow -pages on a tlb flush. - - -Reaction to events -================== - -- guest page fault (or npt page fault, or ept violation) - -This is the most complicated event. The cause of a page fault can be: - - - a true guest fault (the guest translation won't allow the access) (*) - - access to a missing translation - - access to a protected translation - - when logging dirty pages, memory is write protected - - synchronized shadow pages are write protected (*) - - access to untranslatable memory (mmio) - - (*) not applicable in direct mode - -Handling a page fault is performed as follows: - - - if the RSV bit of the error code is set, the page fault is caused by guest - accessing MMIO and cached MMIO information is available.
- - walk shadow page table - - check for valid generation number in the spte (see "Fast invalidation of - MMIO sptes" below) - - cache the information to vcpu->arch.mmio_gva, vcpu->arch.access and - vcpu->arch.mmio_gfn, and call the emulator - - If both P bit and R/W bit of error code are set, this could possibly - be handled as a "fast page fault" (fixed without taking the MMU lock). See - the description in Documentation/virtual/kvm/locking.txt. - - if needed, walk the guest page tables to determine the guest translation - (gva->gpa or ngpa->gpa) - - if permissions are insufficient, reflect the fault back to the guest - - determine the host page - - if this is an mmio request, there is no host page; cache the info to - vcpu->arch.mmio_gva, vcpu->arch.access and vcpu->arch.mmio_gfn - - walk the shadow page table to find the spte for the translation, - instantiating missing intermediate page tables as necessary - - If this is an mmio request, cache the mmio info to the spte and set some - reserved bit on the spte (see callers of kvm_mmu_set_mmio_spte_mask) - - try to unsynchronize the page - - if successful, we can let the guest continue and modify the gpte - - emulate the instruction - - if failed, unshadow the page and let the guest continue - - update any translations that were modified by the instruction - -invlpg handling: - - - walk the shadow page hierarchy and drop affected translations - - try to reinstantiate the indicated translation in the hope that the - guest will use it in the near future - -Guest control register updates: - -- mov to cr3 - - look up new shadow roots - - synchronize newly reachable shadow pages - -- mov to cr0/cr4/efer - - set up mmu context for new paging mode - - look up new shadow roots - - synchronize newly reachable shadow pages - -Host translation updates: - - - mmu notifier called with updated hva - - look up affected sptes through reverse map - - drop (or update) translations - -Emulating cr0.wp -================ - -If tdp is 
not enabled, the host must keep cr0.wp=1 so page write protection -works for the guest kernel, not guest userspace. When the guest -cr0.wp=1, this does not present a problem. However when the guest cr0.wp=0, -we cannot map the permissions for gpte.u=1, gpte.w=0 to any spte (the -semantics require allowing any guest kernel access plus user read access). - -We handle this by mapping the permissions to two possible sptes, depending -on fault type: - -- kernel write fault: spte.u=0, spte.w=1 (allows full kernel access, - disallows user access) -- read fault: spte.u=1, spte.w=0 (allows full read access, disallows kernel - write access) - -(user write faults generate a #PF) - -In the first case there are two additional complications: -- if CR4.SMEP is enabled: since we've turned the page into a kernel page, - the kernel may now execute it. We handle this by also setting spte.nx. - If we get a user fetch or read fault, we'll change spte.u=1 and - spte.nx=gpte.nx back. For this to work, KVM forces EFER.NX to 1 when - shadow paging is in use. -- if CR4.SMAP is disabled: since the page has been changed to a kernel - page, it can not be reused when CR4.SMAP is enabled. We set - CR4.SMAP && !CR0.WP into shadow page's role to avoid this case. Note, - here we do not care about the case that CR4.SMAP is enabled since KVM will - directly inject #PF to guest due to failed permission check. - -To prevent an spte that was converted into a kernel page with cr0.wp=0 -from being written by the kernel after cr0.wp has changed to 1, we make -the value of cr0.wp part of the page role. This means that an spte created -with one value of cr0.wp cannot be used when cr0.wp has a different value - -it will simply be missed by the shadow page lookup code. A similar issue -exists when an spte created with cr0.wp=0 and cr4.smep=0 is used after -changing cr4.smep to 1. To avoid this, the value of !cr0.wp && cr4.smep -is also made a part of the page role.
- -Large pages -=========== - -The mmu supports all combinations of large and small guest and host pages. -Supported page sizes include 4k, 2M, 4M, and 1G. 4M pages are treated as -two separate 2M pages, on both guest and host, since the mmu always uses PAE -paging. - -To instantiate a large spte, four constraints must be satisfied: - -- the spte must point to a large host page -- the guest pte must be a large pte of at least equivalent size (if tdp is - enabled, there is no guest pte and this condition is satisfied) -- if the spte will be writeable, the large page frame may not overlap any - write-protected pages -- the guest page must be wholly contained by a single memory slot - -To check the last two conditions, the mmu maintains a ->disallow_lpage set of -arrays for each memory slot and large page size. Every write protected page -causes its disallow_lpage to be incremented, thus preventing instantiation of -a large spte. The frames at the end of an unaligned memory slot have -artificially inflated ->disallow_lpages so they can never be instantiated. - -Fast invalidation of MMIO sptes -=============================== - -As mentioned in "Reaction to events" above, kvm will cache MMIO -information in leaf sptes. When a new memslot is added or an existing -memslot is changed, this information may become stale and needs to be -invalidated. This also needs to hold the MMU lock while walking all -shadow pages, and is made more scalable with a similar technique. - -MMIO sptes have a few spare bits, which are used to store a -generation number. The global generation number is stored in -kvm_memslots(kvm)->generation, and increased whenever guest memory info -changes. - -When KVM finds an MMIO spte, it checks the generation number of the spte. -If the generation number of the spte does not equal the global generation -number, it will ignore the cached MMIO information and handle the page -fault through the slow path. 
- -Since only 19 bits are used to store generation-number on mmio spte, all -pages are zapped when there is an overflow. - -Unfortunately, a single memory access might access kvm_memslots(kvm) multiple -times, the last one happening when the generation number is retrieved and -stored into the MMIO spte. Thus, the MMIO spte might be created based on -out-of-date information, but with an up-to-date generation number. - -To avoid this, the generation number is incremented again after synchronize_srcu -returns; thus, bit 63 of kvm_memslots(kvm)->generation is set to 1 only during a -memslot update, while some SRCU readers might be using the old copy. We do not -want to use an MMIO spte created with an odd generation number, and we can do -this without losing a bit in the MMIO spte. The "update in-progress" bit of the -generation is not stored in MMIO spte, and so is implicitly zero when the -generation is extracted out of the spte. If KVM is unlucky and creates an MMIO -spte while an update is in-progress, the next access to the spte will always be -a cache miss. For example, a subsequent access during the update window will -miss due to the in-progress flag diverging, while an access after the update -window closes will have a higher generation number (as compared to the spte). - - -Further reading -=============== - -- NPT presentation from KVM Forum 2008 - http://www.linux-kvm.org/images/c/c8/KvmForum2008%24kdf2008_21.pdf - diff --git a/Documentation/virtual/kvm/msr.txt b/Documentation/virtual/kvm/msr.txt deleted file mode 100644 index df1f4338b3ca..000000000000 --- a/Documentation/virtual/kvm/msr.txt +++ /dev/null @@ -1,284 +0,0 @@ -KVM-specific MSRs. -Glauber Costa, Red Hat Inc, 2010 -===================================================== - -KVM makes use of some custom MSRs to service some requests. - -Custom MSRs have a range reserved for them, that goes from -0x4b564d00 to 0x4b564dff.
There are MSRs outside this area, -but they are deprecated and their use is discouraged. - -Custom MSR list --------- - -The current supported Custom MSR list is: - -MSR_KVM_WALL_CLOCK_NEW: 0x4b564d00 - - data: 4-byte aligned physical address of a memory area which must be - in guest RAM. This memory is expected to hold a copy of the following - structure: - - struct pvclock_wall_clock { - u32 version; - u32 sec; - u32 nsec; - } __attribute__((__packed__)); - - whose data will be filled in by the hypervisor. The hypervisor is only - guaranteed to update this data at the moment of MSR write. - Users that want to reliably query this information more than once have - to write more than once to this MSR. Fields have the following meanings: - - version: guest has to check version before and after grabbing - time information and check that they are both equal and even. - An odd version indicates an in-progress update. - - sec: number of seconds for wallclock at time of boot. - - nsec: number of nanoseconds for wallclock at time of boot. - - In order to get the current wallclock time, the system_time from - MSR_KVM_SYSTEM_TIME_NEW needs to be added. - - Note that although MSRs are per-CPU entities, the effect of this - particular MSR is global. - - Availability of this MSR must be checked via bit 3 in 0x40000001 cpuid - leaf prior to usage. - -MSR_KVM_SYSTEM_TIME_NEW: 0x4b564d01 - - data: 4-byte aligned physical address of a memory area which must be in - guest RAM, plus an enable bit in bit 0. This memory is expected to hold - a copy of the following structure: - - struct pvclock_vcpu_time_info { - u32 version; - u32 pad0; - u64 tsc_timestamp; - u64 system_time; - u32 tsc_to_system_mul; - s8 tsc_shift; - u8 flags; - u8 pad[2]; - } __attribute__((__packed__)); /* 32 bytes */ - - whose data will be filled in by the hypervisor periodically. Only one - write, or registration, is needed for each VCPU.
The interval between - updates of this structure is arbitrary and implementation-dependent. - The hypervisor may update this structure at any time it sees fit until - anything with bit0 == 0 is written to it. - - Fields have the following meanings: - - version: guest has to check version before and after grabbing - time information and check that they are both equal and even. - An odd version indicates an in-progress update. - - tsc_timestamp: the tsc value at the current VCPU at the time - of the update of this structure. Guests can subtract this value - from current tsc to derive a notion of elapsed time since the - structure update. - - system_time: a host notion of monotonic time, including sleep - time at the time this structure was last updated. Unit is - nanoseconds. - - tsc_to_system_mul: multiplier to be used when converting - tsc-related quantity to nanoseconds - - tsc_shift: shift to be used when converting tsc-related - quantity to nanoseconds. This shift will ensure that - multiplication with tsc_to_system_mul does not overflow. - A positive value denotes a left shift, a negative value - a right shift. - - The conversion from tsc to nanoseconds involves an additional - right shift by 32 bits. With this information, guests can - derive per-CPU time by doing: - - time = (current_tsc - tsc_timestamp) - if (tsc_shift >= 0) - time <<= tsc_shift; - else - time >>= -tsc_shift; - time = (time * tsc_to_system_mul) >> 32 - time = time + system_time - - flags: bits in this field indicate extended capabilities - coordinated between the guest and the hypervisor. Availability - of specific flags has to be checked in 0x40000001 cpuid leaf. 
- Current flags are: - - flag bit | cpuid bit | meaning - ------------------------------------------------------------- - | | time measures taken across - 0 | 24 | multiple cpus are guaranteed to - | | be monotonic - ------------------------------------------------------------- - | | guest vcpu has been paused by - 1 | N/A | the host - | | See 4.70 in api.txt - ------------------------------------------------------------- - - Availability of this MSR must be checked via bit 3 in 0x40000001 cpuid - leaf prior to usage. - - -MSR_KVM_WALL_CLOCK: 0x11 - - data and functioning: same as MSR_KVM_WALL_CLOCK_NEW. Use that instead. - - This MSR falls outside the reserved KVM range and may be removed in the - future. Its usage is deprecated. - - Availability of this MSR must be checked via bit 0 in 0x40000001 cpuid - leaf prior to usage. - -MSR_KVM_SYSTEM_TIME: 0x12 - - data and functioning: same as MSR_KVM_SYSTEM_TIME_NEW. Use that instead. - - This MSR falls outside the reserved KVM range and may be removed in the - future. Its usage is deprecated. - - Availability of this MSR must be checked via bit 0 in 0x40000001 cpuid - leaf prior to usage. - - The suggested algorithm for detecting kvmclock presence is then: - - if (!kvm_para_available()) /* refer to cpuid.txt */ - return NON_PRESENT; - - flags = cpuid_eax(0x40000001); - if (flags & (1 << 3)) { - msr_kvm_system_time = MSR_KVM_SYSTEM_TIME_NEW; - msr_kvm_wall_clock = MSR_KVM_WALL_CLOCK_NEW; - return PRESENT; - } else if (flags & (1 << 0)) { - msr_kvm_system_time = MSR_KVM_SYSTEM_TIME; - msr_kvm_wall_clock = MSR_KVM_WALL_CLOCK; - return PRESENT; - } else - return NON_PRESENT; - -MSR_KVM_ASYNC_PF_EN: 0x4b564d02 - data: Bits 63-6 hold 64-byte aligned physical address of a - 64 byte memory area which must be in guest RAM and must be - zeroed. Bits 5-3 are reserved and should be zero. Bit 0 is 1 - when asynchronous page faults are enabled on the vcpu, 0 when - disabled.
Bit 1 is 1 if asynchronous page faults can be injected - when vcpu is in cpl == 0. Bit 2 is 1 if asynchronous page faults - are delivered to L1 as #PF vmexits. Bit 2 can be set only if - KVM_FEATURE_ASYNC_PF_VMEXIT is present in CPUID. - - The first 4 bytes of the 64-byte memory location will be written to by - the hypervisor at the time of asynchronous page fault (APF) - injection to indicate type of asynchronous page fault. Value - of 1 means that the page referred to by the page fault is not - present. Value 2 means that the page is now available. Disabling - interrupt inhibits APFs. Guest must not enable interrupt - before the reason is read, or it may be overwritten by another - APF. Since APF uses the same exception vector as regular page - fault guest must reset the reason to 0 before it does - something that can generate normal page fault. If during page - fault APF reason is 0 it means that this is regular page - fault. - - During delivery of type 1 APF cr2 contains a token that will - be used to notify a guest when missing page becomes - available. When page becomes available type 2 APF is sent with - cr2 set to the token associated with the page. There is special - kind of token 0xffffffff which tells vcpu that it should wake - up all processes waiting for APFs and no individual type 2 APFs - will be sent. - - If APF is disabled while there are outstanding APFs, they will - not be delivered. - - Currently type 2 APF will be always delivered on the same vcpu as - type 1 was, but guest should not rely on that. - -MSR_KVM_STEAL_TIME: 0x4b564d03 - - data: 64-byte aligned physical address of a memory area which must be - in guest RAM, plus an enable bit in bit 0. This memory is expected to - hold a copy of the following structure: - - struct kvm_steal_time { - __u64 steal; - __u32 version; - __u32 flags; - __u8 preempted; - __u8 u8_pad[3]; - __u32 pad[11]; - } - - whose data will be filled in by the hypervisor periodically.
Only one - write, or registration, is needed for each VCPU. The interval between - updates of this structure is arbitrary and implementation-dependent. - The hypervisor may update this structure at any time it sees fit until - anything with bit0 == 0 is written to it. Guest is required to make sure - this structure is initialized to zero. - - Fields have the following meanings: - - version: a sequence counter. In other words, guest has to check - this field before and after grabbing time information and make - sure they are both equal and even. An odd version indicates an - in-progress update. - - flags: At this point, always zero. May be used to indicate - changes in this structure in the future. - - steal: the amount of time in which this vCPU did not run, in - nanoseconds. Time during which the vcpu is idle, will not be - reported as steal time. - - preempted: indicates whether the vCPU that owns this struct is running or - not. Non-zero values mean the vCPU has been preempted. Zero - means the vCPU is not preempted. NOTE, it is always zero if the - hypervisor doesn't support this field. - -MSR_KVM_EOI_EN: 0x4b564d04 - data: Bit 0 is 1 when PV end of interrupt is enabled on the vcpu; 0 - when disabled. Bit 1 is reserved and must be zero. When PV end of - interrupt is enabled (bit 0 set), bits 63-2 hold a 4-byte aligned - physical address of a 4 byte memory area which must be in guest RAM and - must be zeroed. - - The first, least significant bit of 4 byte memory location will be - written to by the hypervisor, typically at the time of interrupt - injection. Value of 1 means that guest can skip writing EOI to the apic - (using MSR or MMIO write); instead, it is sufficient to signal - EOI by clearing the bit in guest memory - this location will - later be polled by the hypervisor. - Value of 0 means that the EOI write is required. - - It is always safe for the guest to ignore the optimization and perform - the APIC EOI write anyway.
- - Hypervisor is guaranteed to only modify this least - significant bit while in the current VCPU context, this means that - guest does not need to use either lock prefix or memory ordering - primitives to synchronise with the hypervisor. - - However, hypervisor can set and clear this memory bit at any time: - therefore to make sure hypervisor does not interrupt the - guest and clear the least significant bit in the memory area - in the window between guest testing it to detect - whether it can skip EOI apic write and between guest - clearing it to signal EOI to the hypervisor, - guest must both read the least significant bit in the memory area and - clear it using a single CPU instruction, such as test and clear, or - compare and exchange. - -MSR_KVM_POLL_CONTROL: 0x4b564d05 - Control host-side polling. - - data: Bit 0 enables (1) or disables (0) host-side HLT polling logic. - - KVM guests can request the host not to poll on HLT, for example if - they are performing polling themselves. - diff --git a/Documentation/virtual/kvm/nested-vmx.txt b/Documentation/virtual/kvm/nested-vmx.txt deleted file mode 100644 index 97eb1353e962..000000000000 --- a/Documentation/virtual/kvm/nested-vmx.txt +++ /dev/null @@ -1,240 +0,0 @@ -Nested VMX -========== - -Overview ---------- - -On Intel processors, KVM uses Intel's VMX (Virtual-Machine eXtensions) -to easily and efficiently run guest operating systems. Normally, these guests -*cannot* themselves be hypervisors running their own guests, because in VMX, -guests cannot use VMX instructions. - -The "Nested VMX" feature adds this missing capability - of running guest -hypervisors (which use VMX) with their own nested guests. It does so by -allowing a guest to use VMX instructions, and correctly and efficiently -emulating them using the single level of VMX available in the hardware. 
- -We describe in much greater detail the theory behind the nested VMX feature, -its implementation and its performance characteristics, in the OSDI 2010 paper -"The Turtles Project: Design and Implementation of Nested Virtualization", -available at: - - http://www.usenix.org/events/osdi10/tech/full_papers/Ben-Yehuda.pdf - - -Terminology ------------ - -Single-level virtualization has two levels - the host (KVM) and the guests. -In nested virtualization, we have three levels: The host (KVM), which we call -L0, the guest hypervisor, which we call L1, and its nested guest, which we -call L2. - - -Running nested VMX ------------------- - -The nested VMX feature is disabled by default. It can be enabled by giving -the "nested=1" option to the kvm-intel module. - -No modifications are required to user space (qemu). However, qemu's default -emulated CPU type (qemu64) does not list the "VMX" CPU feature, so it must be -explicitly enabled, by giving qemu one of the following options: - - -cpu host (emulated CPU has all features of the real CPU) - - -cpu qemu64,+vmx (add just the vmx feature to a named CPU type) - - -ABIs ----- - -Nested VMX aims to present a standard and (eventually) fully-functional VMX -implementation for a guest hypervisor to use. As such, the official -specification of the ABI that it provides is Intel's VMX specification, -namely volume 3B of their "Intel 64 and IA-32 Architectures Software -Developer's Manual". Not all of VMX's features are currently fully supported, -but the goal is to eventually support them all, starting with the VMX features -which are used in practice by popular hypervisors (KVM and others).
-Still, for debugging purposes, KVM developers might be interested to know the -internals of this structure; This is struct vmcs12 from arch/x86/kvm/vmx.c. - -The name "vmcs12" refers to the VMCS that L1 builds for L2. In the code we -also have "vmcs01", the VMCS that L0 built for L1, and "vmcs02" is the VMCS -which L0 builds to actually run L2 - how this is done is explained in the -aforementioned paper. - -For convenience, we repeat the content of struct vmcs12 here. If the internals -of this structure changes, this can break live migration across KVM versions. -VMCS12_REVISION (from vmx.c) should be changed if struct vmcs12 or its inner -struct shadow_vmcs is ever changed. - - typedef u64 natural_width; - struct __packed vmcs12 { - /* According to the Intel spec, a VMCS region must start with - * these two user-visible fields */ - u32 revision_id; - u32 abort; - - u32 launch_state; /* set to 0 by VMCLEAR, to 1 by VMLAUNCH */ - u32 padding[7]; /* room for future expansion */ - - u64 io_bitmap_a; - u64 io_bitmap_b; - u64 msr_bitmap; - u64 vm_exit_msr_store_addr; - u64 vm_exit_msr_load_addr; - u64 vm_entry_msr_load_addr; - u64 tsc_offset; - u64 virtual_apic_page_addr; - u64 apic_access_addr; - u64 ept_pointer; - u64 guest_physical_address; - u64 vmcs_link_pointer; - u64 guest_ia32_debugctl; - u64 guest_ia32_pat; - u64 guest_ia32_efer; - u64 guest_pdptr0; - u64 guest_pdptr1; - u64 guest_pdptr2; - u64 guest_pdptr3; - u64 host_ia32_pat; - u64 host_ia32_efer; - u64 padding64[8]; /* room for future expansion */ - natural_width cr0_guest_host_mask; - natural_width cr4_guest_host_mask; - natural_width cr0_read_shadow; - natural_width cr4_read_shadow; - natural_width cr3_target_value0; - natural_width cr3_target_value1; - natural_width cr3_target_value2; - natural_width cr3_target_value3; - natural_width exit_qualification; - natural_width guest_linear_address; - natural_width guest_cr0; - natural_width guest_cr3; - natural_width guest_cr4; - natural_width guest_es_base; - 
natural_width guest_cs_base; - natural_width guest_ss_base; - natural_width guest_ds_base; - natural_width guest_fs_base; - natural_width guest_gs_base; - natural_width guest_ldtr_base; - natural_width guest_tr_base; - natural_width guest_gdtr_base; - natural_width guest_idtr_base; - natural_width guest_dr7; - natural_width guest_rsp; - natural_width guest_rip; - natural_width guest_rflags; - natural_width guest_pending_dbg_exceptions; - natural_width guest_sysenter_esp; - natural_width guest_sysenter_eip; - natural_width host_cr0; - natural_width host_cr3; - natural_width host_cr4; - natural_width host_fs_base; - natural_width host_gs_base; - natural_width host_tr_base; - natural_width host_gdtr_base; - natural_width host_idtr_base; - natural_width host_ia32_sysenter_esp; - natural_width host_ia32_sysenter_eip; - natural_width host_rsp; - natural_width host_rip; - natural_width paddingl[8]; /* room for future expansion */ - u32 pin_based_vm_exec_control; - u32 cpu_based_vm_exec_control; - u32 exception_bitmap; - u32 page_fault_error_code_mask; - u32 page_fault_error_code_match; - u32 cr3_target_count; - u32 vm_exit_controls; - u32 vm_exit_msr_store_count; - u32 vm_exit_msr_load_count; - u32 vm_entry_controls; - u32 vm_entry_msr_load_count; - u32 vm_entry_intr_info_field; - u32 vm_entry_exception_error_code; - u32 vm_entry_instruction_len; - u32 tpr_threshold; - u32 secondary_vm_exec_control; - u32 vm_instruction_error; - u32 vm_exit_reason; - u32 vm_exit_intr_info; - u32 vm_exit_intr_error_code; - u32 idt_vectoring_info_field; - u32 idt_vectoring_error_code; - u32 vm_exit_instruction_len; - u32 vmx_instruction_info; - u32 guest_es_limit; - u32 guest_cs_limit; - u32 guest_ss_limit; - u32 guest_ds_limit; - u32 guest_fs_limit; - u32 guest_gs_limit; - u32 guest_ldtr_limit; - u32 guest_tr_limit; - u32 guest_gdtr_limit; - u32 guest_idtr_limit; - u32 guest_es_ar_bytes; - u32 guest_cs_ar_bytes; - u32 guest_ss_ar_bytes; - u32 guest_ds_ar_bytes; - u32 guest_fs_ar_bytes; - 
u32 guest_gs_ar_bytes; - u32 guest_ldtr_ar_bytes; - u32 guest_tr_ar_bytes; - u32 guest_interruptibility_info; - u32 guest_activity_state; - u32 guest_sysenter_cs; - u32 host_ia32_sysenter_cs; - u32 padding32[8]; /* room for future expansion */ - u16 virtual_processor_id; - u16 guest_es_selector; - u16 guest_cs_selector; - u16 guest_ss_selector; - u16 guest_ds_selector; - u16 guest_fs_selector; - u16 guest_gs_selector; - u16 guest_ldtr_selector; - u16 guest_tr_selector; - u16 host_es_selector; - u16 host_cs_selector; - u16 host_ss_selector; - u16 host_ds_selector; - u16 host_fs_selector; - u16 host_gs_selector; - u16 host_tr_selector; - }; - - -Authors -------- - -These patches were written by: - Abel Gordon, abelg il.ibm.com - Nadav Har'El, nyh il.ibm.com - Orit Wasserman, oritw il.ibm.com - Ben-Ami Yassor, benami il.ibm.com - Muli Ben-Yehuda, muli il.ibm.com - -With contributions by: - Anthony Liguori, aliguori us.ibm.com - Mike Day, mdday us.ibm.com - Michael Factor, factor il.ibm.com - Zvi Dubitzky, dubi il.ibm.com - -And valuable reviews by: - Avi Kivity, avi redhat.com - Gleb Natapov, gleb redhat.com - Marcelo Tosatti, mtosatti redhat.com - Kevin Tian, kevin.tian intel.com - and others. diff --git a/Documentation/virtual/kvm/ppc-pv.txt b/Documentation/virtual/kvm/ppc-pv.txt deleted file mode 100644 index e26115ce4258..000000000000 --- a/Documentation/virtual/kvm/ppc-pv.txt +++ /dev/null @@ -1,212 +0,0 @@ -The PPC KVM paravirtual interface -================================= - -The basic execution principle by which KVM on PowerPC works is to run all kernel -space code in PR=1 which is user space. This way we trap all privileged -instructions and can emulate them accordingly. - -Unfortunately that is also the downfall. There are quite some privileged -instructions that needlessly return us to the hypervisor even though they -could be handled differently. - -This is what the PPC PV interface helps with. 
It takes privileged instructions -and transforms them into unprivileged ones with some help from the hypervisor. -This cuts down virtualization costs by about 50% on some of my benchmarks. - -The code for that interface can be found in arch/powerpc/kernel/kvm* - -Querying for existence -====================== - -To find out if we're running on KVM or not, we leverage the device tree. When -Linux is running on KVM, a node /hypervisor exists. That node contains a -compatible property with the value "linux,kvm". - -Once you determined you're running under a PV capable KVM, you can now use -hypercalls as described below. - -KVM hypercalls -============== - -Inside the device tree's /hypervisor node there's a property called -'hypercall-instructions'. This property contains at most 4 opcodes that make -up the hypercall. To call a hypercall, just call these instructions. - -The parameters are as follows: - - Register IN OUT - - r0 - volatile - r3 1st parameter Return code - r4 2nd parameter 1st output value - r5 3rd parameter 2nd output value - r6 4th parameter 3rd output value - r7 5th parameter 4th output value - r8 6th parameter 5th output value - r9 7th parameter 6th output value - r10 8th parameter 7th output value - r11 hypercall number 8th output value - r12 - volatile - -Hypercall definitions are shared in generic code, so the same hypercall numbers -apply for x86 and powerpc alike with the exception that each KVM hypercall -also needs to be ORed with the KVM vendor code which is (42 << 16). - -Return codes can be as follows: - - Code Meaning - - 0 Success - 12 Hypercall not implemented - <0 Error - -The magic page -============== - -To enable communication between the hypervisor and guest there is a new shared -page that contains parts of supervisor visible register state. The guest can -map this shared page using the KVM hypercall KVM_HC_PPC_MAP_MAGIC_PAGE. - -With this hypercall issued the guest always gets the magic page mapped at the -desired location. 
The first parameter indicates the effective address when the -MMU is enabled. The second parameter indicates the address in real mode, if -applicable to the target. For now, we always map the page to -4096. This way we -can access it using absolute load and store functions. The following -instruction reads the first field of the magic page: - - ld rX, -4096(0) - -The interface is designed to be extensible should there be a need later to add -additional registers to the magic page. If you add fields to the magic page, -also define a new hypercall feature to indicate that the host can give you more -registers. Only if the host supports the additional features, make use of them. - -The magic page layout is described by struct kvm_vcpu_arch_shared -in arch/powerpc/include/asm/kvm_para.h. - -Magic page features -=================== - -When mapping the magic page using the KVM hypercall KVM_HC_PPC_MAP_MAGIC_PAGE, -a second return value is passed to the guest. This second return value contains -a bitmap of available features inside the magic page. - -The following enhancements to the magic page are currently available: - - KVM_MAGIC_FEAT_SR Maps SR registers r/w in the magic page - KVM_MAGIC_FEAT_MAS0_TO_SPRG7 Maps MASn, ESR, PIR and high SPRGs - -For enhanced features in the magic page, please check for the existence of the -feature before using them! - -Magic page flags -================ - -In addition to features that indicate whether a host is capable of a particular -feature we also have a channel for a guest to tell the host whether it's capable -of something. This is what we call "flags". - -Flags are passed to the host in the low 12 bits of the Effective Address.
- -The following flags are currently available for a guest to expose: - - MAGIC_PAGE_FLAG_NOT_MAPPED_NX Guest handles NX bits correctly wrt magic page - -MSR bits -======== - -The MSR contains bits that require hypervisor intervention and bits that do -not require direct hypervisor intervention because they only get interpreted -when entering the guest or don't have any impact on the hypervisor's behavior. - -The following bits are safe to be set inside the guest: - - MSR_EE - MSR_RI - -If any other bit changes in the MSR, please still use mtmsr(d). - -Patched instructions -==================== - -The "ld" and "std" instructions are transformed to "lwz" and "stw" instructions -respectively on 32 bit systems with an added offset of 4 to accommodate for big -endianness. - -The following is a list of mapping the Linux kernel performs when running as -guest. Implementing any of those mappings is optional, as the instruction traps -also act on the shared page. So calling privileged instructions still works as -before. - -From To -==== == - -mfmsr rX ld rX, magic_page->msr -mfsprg rX, 0 ld rX, magic_page->sprg0 -mfsprg rX, 1 ld rX, magic_page->sprg1 -mfsprg rX, 2 ld rX, magic_page->sprg2 -mfsprg rX, 3 ld rX, magic_page->sprg3 -mfsrr0 rX ld rX, magic_page->srr0 -mfsrr1 rX ld rX, magic_page->srr1 -mfdar rX ld rX, magic_page->dar -mfdsisr rX lwz rX, magic_page->dsisr - -mtmsr rX std rX, magic_page->msr -mtsprg 0, rX std rX, magic_page->sprg0 -mtsprg 1, rX std rX, magic_page->sprg1 -mtsprg 2, rX std rX, magic_page->sprg2 -mtsprg 3, rX std rX, magic_page->sprg3 -mtsrr0 rX std rX, magic_page->srr0 -mtsrr1 rX std rX, magic_page->srr1 -mtdar rX std rX, magic_page->dar -mtdsisr rX stw rX, magic_page->dsisr - -tlbsync nop - -mtmsrd rX, 0 b -mtmsr rX b - -mtmsrd rX, 1 b - -[Book3S only] -mtsrin rX, rY b - -[BookE only] -wrteei [0|1] b - - -Some instructions require more logic to determine what's going on than a load -or store instruction can deliver. 
To enable patching of those, we keep some -RAM around where we can live translate instructions to. What happens is the -following: - - 1) copy emulation code to memory - 2) patch that code to fit the emulated instruction - 3) patch that code to return to the original pc + 4 - 4) patch the original instruction to branch to the new code - -That way we can inject an arbitrary amount of code as replacement for a single -instruction. This allows us to check for pending interrupts when setting EE=1 -for example. - -Hypercall ABIs in KVM on PowerPC -================================= -1) KVM hypercalls (ePAPR) - -These are ePAPR compliant hypercall implementations (mentioned above). Even -generic hypercalls are implemented here, like the ePAPR idle hcall. These are -available on all targets. - -2) PAPR hypercalls - -PAPR hypercalls are needed to run server PowerPC PAPR guests (-M pseries in QEMU). -These are the same hypercalls that pHyp, the POWER hypervisor, implements. Some of -them are handled in the kernel, some are handled in user space. This is only -available on book3s_64. - -3) OSI hypercalls - -Mac-on-Linux is another user of KVM on PowerPC, which has its own hypercall (long -before KVM). This is supported to maintain compatibility. All these hypercalls get -forwarded to user space. This is only useful on book3s_32, but can be used with -book3s_64 as well. diff --git a/Documentation/virtual/kvm/review-checklist.txt b/Documentation/virtual/kvm/review-checklist.txt deleted file mode 100644 index a83b27635fdd..000000000000 --- a/Documentation/virtual/kvm/review-checklist.txt +++ /dev/null @@ -1,38 +0,0 @@ -Review checklist for kvm patches -================================ - -1. The patch must follow Documentation/process/coding-style.rst and - Documentation/process/submitting-patches.rst. - -2. Patches should be against kvm.git master branch. - -3.
If the patch introduces a new userspace API or modifies an existing one: - - the API must be documented in Documentation/virtual/kvm/api.txt - - the API must be discoverable using KVM_CHECK_EXTENSION - -4. New state must include support for save/restore. - -5. New features must default to off (userspace should explicitly request them). - Performance improvements can and should default to on. - -6. New cpu features should be exposed via KVM_GET_SUPPORTED_CPUID - -7. Emulator changes should be accompanied by unit tests for qemu-kvm.git - kvm/test directory. - -8. Changes should be vendor neutral when possible. Changes to common code - are better than duplicating changes to vendor code. - -9. Similarly, prefer changes to arch independent code over changes to arch dependent - code. - -10. User/kernel interfaces and guest/host interfaces must be 64-bit clean - (all variables and sizes naturally aligned on 64-bit; use specific types - only - u64 rather than ulong). - -11. New guest visible features must either be documented in a hardware manual - or be accompanied by documentation. - -12. Features must be robust against reset and kexec - for example, shared - host/guest memory must be unshared to prevent the host from writing to - guest memory that the guest has not reserved for this purpose. diff --git a/Documentation/virtual/kvm/s390-diag.txt b/Documentation/virtual/kvm/s390-diag.txt deleted file mode 100644 index 7c52e5f8b210..000000000000 --- a/Documentation/virtual/kvm/s390-diag.txt +++ /dev/null @@ -1,83 +0,0 @@ -The s390 DIAGNOSE call on KVM -============================= - -KVM on s390 supports the DIAGNOSE call for making hypercalls, both for -native hypercalls and for selected hypercalls found on other s390 -hypervisors. - -Note that bits are numbered as by the usual s390 convention (most significant -bit on the left). - - -General remarks ---------------- - -DIAGNOSE calls by the guest cause a mandatory intercept.
This implies -all supported DIAGNOSE calls need to be handled by either KVM or its -userspace. - -All DIAGNOSE calls supported by KVM use the RS-a format: - --------------------------------------- -| '83' | R1 | R3 | B2 | D2 | --------------------------------------- -0 8 12 16 20 31 - -The second-operand address (obtained by the base/displacement calculation) -is not used to address data. Instead, bits 48-63 of this address specify -the function code, and bits 0-47 are ignored. - -The supported DIAGNOSE function codes vary by the userspace used. For -DIAGNOSE function codes not specific to KVM, please refer to the -documentation for the s390 hypervisors defining them. - - -DIAGNOSE function code 'X'500' - KVM virtio functions ------------------------------------------------------ - -If the function code specifies 0x500, various virtio-related functions -are performed. - -General register 1 contains the virtio subfunction code. Supported -virtio subfunctions depend on KVM's userspace. Generally, userspace -provides either s390-virtio (subcodes 0-2) or virtio-ccw (subcode 3). - -Upon completion of the DIAGNOSE instruction, general register 2 contains -the function's return code, which is either a return code or a subcode -specific value. - -Subcode 0 - s390-virtio notification and early console printk - Handled by userspace. - -Subcode 1 - s390-virtio reset - Handled by userspace. - -Subcode 2 - s390-virtio set status - Handled by userspace. - -Subcode 3 - virtio-ccw notification - Handled by either userspace or KVM (ioeventfd case). - - General register 2 contains a subchannel-identification word denoting - the subchannel of the virtio-ccw proxy device to be notified. - - General register 3 contains the number of the virtqueue to be notified. - - General register 4 contains a 64bit identifier for KVM usage (the - kvm_io_bus cookie). If general register 4 does not contain a valid - identifier, it is ignored. 
- - After completion of the DIAGNOSE call, general register 2 may contain - a 64bit identifier (in the kvm_io_bus cookie case), or a negative - error value, if an internal error occurred. - - See also the virtio standard for a discussion of this hypercall. - - -DIAGNOSE function code 'X'501 - KVM breakpoint ----------------------------------------------- - -If the function code specifies 0x501, breakpoint functions may be performed. -This function code is handled by userspace. - -This diagnose function code has no subfunctions and uses no parameters. diff --git a/Documentation/virtual/kvm/timekeeping.txt b/Documentation/virtual/kvm/timekeeping.txt deleted file mode 100644 index 76808a17ad84..000000000000 --- a/Documentation/virtual/kvm/timekeeping.txt +++ /dev/null @@ -1,612 +0,0 @@ - - Timekeeping Virtualization for X86-Based Architectures - - Zachary Amsden - Copyright (c) 2010, Red Hat. All rights reserved. - -1) Overview -2) Timing Devices -3) TSC Hardware -4) Virtualization Problems - -========================================================================= - -1) Overview - -One of the most complicated parts of the X86 platform, and specifically, -the virtualization of this platform is the plethora of timing devices available -and the complexity of emulating those devices. In addition, virtualization of -time introduces a new set of challenges because it introduces a multiplexed -division of time beyond the control of the guest CPU. - -First, we will describe the various timekeeping hardware available, then -present some of the problems which arise and solutions available, giving -specific recommendations for certain classes of KVM guests. - -The purpose of this document is to collect data and information relevant to -timekeeping which may be difficult to find elsewhere, specifically, -information relevant to KVM and hardware-based virtualization. 
- -========================================================================= - -2) Timing Devices - -First we discuss the basic hardware devices available. TSC and the related -KVM clock are special enough to warrant a full exposition and are described in -the following section. - -2.1) i8254 - PIT - -One of the first timer devices available is the programmable interrupt timer, -or PIT. The PIT has a fixed frequency 1.193182 MHz base clock and three -channels which can be programmed to deliver periodic or one-shot interrupts. -These three channels can be configured in different modes and have individual -counters. Channel 1 and 2 were not available for general use in the original -IBM PC, and historically were connected to control RAM refresh and the PC -speaker. Now the PIT is typically integrated as part of an emulated chipset -and a separate physical PIT is not used. - -The PIT uses I/O ports 0x40 - 0x43. Access to the 16-bit counters is done -using single or multiple byte access to the I/O ports. There are 6 modes -available, but not all modes are available to all timers, as only timer 2 -has a connected gate input, required for modes 1 and 5. The gate line is -controlled by port 61h, bit 0, as illustrated in the following diagram. - - -------------- ---------------- -| | | | -| 1.1932 MHz |---------->| CLOCK OUT | ---------> IRQ 0 -| Clock | | | | - -------------- | +->| GATE TIMER 0 | - | ---------------- - | - | ---------------- - | | | - |------>| CLOCK OUT | ---------> 66.3 KHZ DRAM - | | | (aka /dev/null) - | +->| GATE TIMER 1 | - | ---------------- - | - | ---------------- - | | | - |------>| CLOCK OUT | ---------> Port 61h, bit 5 - | | | -Port 61h, bit 0 ---------->| GATE TIMER 2 | \_.---- ____ - ---------------- _| )--|LPF|---Speaker - / *---- \___/ -Port 61h, bit 1 -----------------------------------/ - -The timer modes are now described. - -Mode 0: Single Timeout. 
This is a one-shot software timeout that counts down - when the gate is high (always true for timers 0 and 1). When the count - reaches zero, the output goes high. - -Mode 1: Triggered One-shot. The output is initially set high. When the gate - line is set high, a countdown is initiated (which does not stop if the gate is - lowered), during which the output is set low. When the count reaches zero, - the output goes high. - -Mode 2: Rate Generator. The output is initially set high. When the countdown - reaches 1, the output goes low for one count and then returns high. The value - is reloaded and the countdown automatically resumes. If the gate line goes - low, the count is halted. If the output is low when the gate is lowered, the - output automatically goes high (this only affects timer 2). - -Mode 3: Square Wave. This generates a high / low square wave. The count - determines the length of the pulse, which alternates between high and low - when zero is reached. The count only proceeds when gate is high and is - automatically reloaded on reaching zero. The count is decremented twice at - each clock to generate a full high / low cycle at the full periodic rate. - If the count is even, the output remains high for N/2 counts and low for N/2 - counts; if the count is odd, the output is high for (N+1)/2 counts and low - for (N-1)/2 counts. Only even values are latched by the counter, so odd - values are not observed when reading. This is the intended mode for timer 2, - which generates sine-like tones by low-pass filtering the square wave output. - -Mode 4: Software Strobe. After programming this mode and loading the counter, - the output remains high until the counter reaches zero. Then the output - goes low for 1 clock cycle and returns high. The counter is not reloaded. - Counting only occurs when gate is high. - -Mode 5: Hardware Strobe. After programming and loading the counter, the - output remains high.
When the gate is raised, a countdown is initiated - (which does not stop if the gate is lowered). When the counter reaches zero, - the output goes low for 1 clock cycle and then returns high. The counter is - not reloaded. - -In addition to normal binary counting, the PIT supports BCD counting. The -command port, 0x43 is used to set the counter and mode for each of the three -timers. - -PIT commands, issued to port 0x43, using the following bit encoding: - -Bit 7-4: Command (See table below) -Bit 3-1: Mode (000 = Mode 0, 101 = Mode 5, 11X = undefined) -Bit 0 : Binary (0) / BCD (1) - -Command table: - -0000 - Latch Timer 0 count for port 0x40 - sample and hold the count to be read in port 0x40; - additional commands ignored until counter is read; - mode bits ignored. - -0001 - Set Timer 0 LSB mode for port 0x40 - set timer to read LSB only and force MSB to zero; - mode bits set timer mode - -0010 - Set Timer 0 MSB mode for port 0x40 - set timer to read MSB only and force LSB to zero; - mode bits set timer mode - -0011 - Set Timer 0 16-bit mode for port 0x40 - set timer to read / write LSB first, then MSB; - mode bits set timer mode - -0100 - Latch Timer 1 count for port 0x41 - as described above -0101 - Set Timer 1 LSB mode for port 0x41 - as described above -0110 - Set Timer 1 MSB mode for port 0x41 - as described above -0111 - Set Timer 1 16-bit mode for port 0x41 - as described above - -1000 - Latch Timer 2 count for port 0x42 - as described above -1001 - Set Timer 2 LSB mode for port 0x42 - as described above -1010 - Set Timer 2 MSB mode for port 0x42 - as described above -1011 - Set Timer 2 16-bit mode for port 0x42 as described above - -1101 - General counter latch - Latch combination of counters into corresponding ports - Bit 3 = Counter 2 - Bit 2 = Counter 1 - Bit 1 = Counter 0 - Bit 0 = Unused - -1110 - Latch timer status - Latch combination of counter mode into corresponding ports - Bit 3 = Counter 2 - Bit 2 = Counter 1 - Bit 1 = Counter 0 - - The output 
of ports 0x40-0x42 following this command will be: - - Bit 7 = Output pin - Bit 6 = Count loaded (0 if timer has expired) - Bit 5-4 = Read / Write mode - 01 = LSB only - 10 = MSB only - 11 = LSB / MSB (16-bit) - Bit 3-1 = Mode - Bit 0 = Binary (0) / BCD mode (1) - -2.2) RTC - -The second device which was available in the original PC was the MC146818 real -time clock. The original device is now obsolete, and usually emulated by the -system chipset, sometimes by an HPET and some frankenstein IRQ routing. - -The RTC is accessed through CMOS variables, which use an index register to -control which bytes are read. Since there is only one index register, read -of the CMOS and read of the RTC require lock protection (in addition, it is -dangerous to allow userspace utilities such as hwclock to have direct RTC -access, as they could corrupt kernel reads and writes of CMOS memory). - -The RTC generates an interrupt which is usually routed to IRQ 8. The interrupt -can function as a periodic timer, an additional once a day alarm, and can issue -interrupts after an update of the CMOS registers by the MC146818 is complete. -The type of interrupt is signalled in the RTC status registers. - -The RTC will update the current time fields by battery power even while the -system is off. The current time fields should not be read while an update is -in progress, as indicated in the status register. - -The clock uses a 32.768kHz crystal, so bits 6-4 of register A should be -programmed to a 32kHz divider if the RTC is to count seconds.
- -This is the RAM map originally used for the RTC/CMOS: - -Location Size Description ------------------------------------------- -00h byte Current second (BCD) -01h byte Seconds alarm (BCD) -02h byte Current minute (BCD) -03h byte Minutes alarm (BCD) -04h byte Current hour (BCD) -05h byte Hours alarm (BCD) -06h byte Current day of week (BCD) -07h byte Current day of month (BCD) -08h byte Current month (BCD) -09h byte Current year (BCD) -0Ah byte Register A - bit 7 = Update in progress - bit 6-4 = Divider for clock - 000 = 4.194 MHz - 001 = 1.049 MHz - 010 = 32 kHz - 10X = test modes - 110 = reset / disable - 111 = reset / disable - bit 3-0 = Rate selection for periodic interrupt - 0000 = periodic timer disabled - 0001 = 3.90625 mS - 0010 = 7.8125 mS - 0011 = .122070 mS - 0100 = .244141 mS - ... - 1101 = 125 mS - 1110 = 250 mS - 1111 = 500 mS -0Bh byte Register B - bit 7 = Run (0) / Halt (1) - bit 6 = Periodic interrupt enable - bit 5 = Alarm interrupt enable - bit 4 = Update-ended interrupt enable - bit 3 = Square wave interrupt enable - bit 2 = BCD calendar (0) / Binary (1) - bit 1 = 12-hour mode (0) / 24-hour mode (1) - bit 0 = 0 (DST off) / 1 (DST enabled) -0Ch byte Register C (read only) - bit 7 = interrupt request flag (IRQF) - bit 6 = periodic interrupt flag (PF) - bit 5 = alarm interrupt flag (AF) - bit 4 = update interrupt flag (UF) - bit 3-0 = reserved -0Dh byte Register D (read only) - bit 7 = RTC has power - bit 6-0 = reserved -32h byte Current century BCD (*) - (*) location vendor specific and now determined from ACPI global tables - -2.3) APIC - -On Pentium and later processors, an on-board timer is available to each CPU -as part of the Advanced Programmable Interrupt Controller. The APIC is -accessed through memory-mapped registers and provides interrupt service to each -CPU, used for IPIs and local timer interrupts.
- -Although in theory the APIC is a safe and stable source for local interrupts, -in practice, many bugs and glitches have occurred due to the special nature of -the APIC CPU-local memory-mapped hardware. Beware that CPU errata may affect -the use of the APIC and that workarounds may be required. In addition, some of -these workarounds pose unique constraints for virtualization - requiring either -extra overhead incurred from extra reads of memory-mapped I/O or additional -functionality that may be more computationally expensive to implement. - -Since the APIC is documented quite well in the Intel and AMD manuals, we will -avoid repetition of the detail here. It should be pointed out that the APIC -timer is programmed through the LVT (local vector table) timer register, is capable -of one-shot or periodic operation, and is based on the bus clock divided down -by the programmable divider register. - -2.4) HPET - -HPET is quite complex, and was originally intended to replace the PIT / RTC -support of the X86 PC. It remains to be seen whether that will be the case, as -the de facto standard of PC hardware is to emulate these older devices. Some -systems designated as legacy free may support only the HPET as a hardware timer -device. - -The HPET spec is rather loose and vague, requiring at least 3 hardware timers, -but allowing implementation freedom to support many more. It also imposes no -fixed rate on the timer frequency, but does impose some extremal values on -frequency, error and slew. - -In general, the HPET is recommended as a high precision (compared to PIT / RTC) -time source which is independent of local variation (as there is only one HPET -in any given system). The HPET is also memory-mapped, and its presence is -indicated through ACPI tables by the BIOS. - -Detailed specification of the HPET is beyond the current scope of this -document, as it is also very well documented elsewhere.
- -2.5) Offboard Timers - -Several cards, both proprietary (watchdog boards) and commonplace (e1000) have -timing chips built into the cards which may have registers which are accessible -to kernel or user drivers. To the author's knowledge, using these to generate -a clocksource for a Linux or other kernel has not yet been attempted and is in -general frowned upon as not playing by the agreed rules of the game. Such a -timer device would require additional support to be virtualized properly and is -not considered important at this time as no known operating system does this. - -========================================================================= - -3) TSC Hardware - -The TSC or time stamp counter is relatively simple in theory; it counts -instruction cycles issued by the processor, which can be used as a measure of -time. In practice, due to a number of problems, it is the most complicated -timekeeping device to use. - -The TSC is represented internally as a 64-bit MSR which can be read with the -RDMSR, RDTSC, or RDTSCP (when available) instructions. In the past, hardware -limitations made it possible to write the TSC, but generally on old hardware it -was only possible to write the low 32-bits of the 64-bit counter, and the upper -32-bits of the counter were cleared. Now, however, on Intel processors family -0Fh, for models 3, 4 and 6, and family 06h, models e and f, this restriction -has been lifted and all 64-bits are writable. On AMD systems, the ability to -write the TSC MSR is not an architectural guarantee. - -The TSC is accessible from CPL-0 and conditionally, for CPL > 0 software by -means of the CR4.TSD bit, which when enabled, disables CPL > 0 TSC access. - -Some vendors have implemented an additional instruction, RDTSCP, which returns -atomically not just the TSC, but an indicator which corresponds to the -processor number. 
This can be used to index into an array of TSC variables to -determine offset information in SMP systems where TSCs are not synchronized. -The presence of this instruction must be determined by consulting CPUID feature -bits. - -Both VMX and SVM provide extension fields in the virtualization hardware which -allows the guest visible TSC to be offset by a constant. Newer implementations -promise to allow the TSC to additionally be scaled, but this hardware is not -yet widely available. - -3.1) TSC synchronization - -The TSC is a CPU-local clock in most implementations. This means, on SMP -platforms, the TSCs of different CPUs may start at different times depending -on when the CPUs are powered on. Generally, CPUs on the same die will share -the same clock, however, this is not always the case. - -The BIOS may attempt to resynchronize the TSCs during the poweron process and -the operating system or other system software may attempt to do this as well. -Several hardware limitations make the problem worse - if it is not possible to -write the full 64-bits of the TSC, it may be impossible to match the TSC in -newly arriving CPUs to that of the rest of the system, resulting in -unsynchronized TSCs. This may be done by BIOS or system software, but in -practice, getting a perfectly synchronized TSC will not be possible unless all -values are read from the same clock, which generally only is possible on single -socket systems or those with special hardware support. - -3.2) TSC and CPU hotplug - -As touched on already, CPUs which arrive later than the boot time of the system -may not have a TSC value that is synchronized with the rest of the system. -Either system software, BIOS, or SMM code may actually try to establish the TSC -to a value matching the rest of the system, but a perfect match is usually not -a guarantee. 
This can have the effect of bringing a system from a state where -TSC is synchronized back to a state where TSC synchronization flaws, however -small, may be exposed to the OS and any virtualization environment. - -3.3) TSC and multi-socket / NUMA - -Multi-socket systems, especially large multi-socket systems are likely to have -individual clocksources rather than a single, universally distributed clock. -Since these clocks are driven by different crystals, they will not have -perfectly matched frequency, and temperature and electrical variations will -cause the CPU clocks, and thus the TSCs to drift over time. Depending on the -exact clock and bus design, the drift may or may not be fixed in absolute -error, and may accumulate over time. - -In addition, very large systems may deliberately slew the clocks of individual -cores. This technique, known as spread-spectrum clocking, reduces EMI at the -clock frequency and harmonics of it, which may be required to pass FCC -standards for telecommunications and computer equipment. - -It is recommended not to trust the TSCs to remain synchronized on NUMA or -multiple socket systems for these reasons. - -3.4) TSC and C-states - -C-states, or idling states of the processor, especially C1E and deeper sleep -states may be problematic for TSC as well. The TSC may stop advancing in such -a state, resulting in a TSC which is behind that of other CPUs when execution -is resumed. Such CPUs must be detected and flagged by the operating system -based on CPU and chipset identifications. - -The TSC in such a case may be corrected by catching it up to a known external -clocksource. - -3.5) TSC frequency change / P-states - -To make things slightly more interesting, some CPUs may change frequency. They -may or may not run the TSC at the same rate, and because the frequency change -may be staggered or slewed, at some points in time, the TSC rate may not be -known other than falling within a range of values. 
In this case, the TSC will -not be a stable time source, and must be calibrated against a known, stable, -external clock to be a usable source of time. - -Whether the TSC runs at a constant rate or scales with the P-state is model -dependent and must be determined by inspecting CPUID, chipset or vendor -specific MSR fields. - -In addition, some vendors have known bugs where the P-state is actually -compensated for properly during normal operation, but when the processor is -inactive, the P-state may be raised temporarily to service cache misses from -other processors. In such cases, the TSC on halted CPUs could advance faster -than that of non-halted processors. AMD Turion processors are known to have -this problem. - -3.6) TSC and STPCLK / T-states - -External signals given to the processor may also have the effect of stopping -the TSC. This is typically done for thermal emergency power control to prevent -an overheating condition, and typically, there is no way to detect that this -condition has happened. - -3.7) TSC virtualization - VMX - -VMX provides conditional trapping of RDTSC, RDMSR, WRMSR and RDTSCP -instructions, which is enough for full virtualization of TSC in any manner. In -addition, VMX allows passing through the host TSC plus an additional TSC_OFFSET -field specified in the VMCS. Special instructions must be used to read and -write the VMCS field. - -3.8) TSC virtualization - SVM - -SVM provides conditional trapping of RDTSC, RDMSR, WRMSR and RDTSCP -instructions, which is enough for full virtualization of TSC in any manner. In -addition, SVM allows passing through the host TSC plus an additional offset -field specified in the SVM control block. - -3.9) TSC feature bits in Linux - -In summary, there is no way to guarantee the TSC remains in perfect -synchronization unless it is explicitly guaranteed by the architecture. Even -if so, the TSCs in multi-sockets or NUMA systems may still run independently -despite being locally consistent. 
- -The following feature bits are used by Linux to signal various TSC attributes, -but they can only be taken to be meaningful for UP or single node systems. - -X86_FEATURE_TSC : The TSC is available in hardware -X86_FEATURE_RDTSCP : The RDTSCP instruction is available -X86_FEATURE_CONSTANT_TSC : The TSC rate is unchanged with P-states -X86_FEATURE_NONSTOP_TSC : The TSC does not stop in C-states -X86_FEATURE_TSC_RELIABLE : TSC sync checks are skipped (VMware) - -4) Virtualization Problems - -Timekeeping is especially problematic for virtualization because a number of -challenges arise. The most obvious problem is that time is now shared between -the host and, potentially, a number of virtual machines. Thus the virtual -operating system does not run with 100% usage of the CPU, despite the fact that -it may very well make that assumption. It may expect it to remain true to very -exacting bounds when interrupt sources are disabled, but in reality only its -virtual interrupt sources are disabled, and the machine may still be preempted -at any time. This causes problems as the passage of real time, the injection -of machine interrupts and the associated clock sources are no longer completely -synchronized with real time. - -This same problem can occur on native hardware to a degree, as SMM mode may -steal cycles from the operating system on X86 systems when SMM mode is used by the -BIOS, but not in such an extreme fashion. However, the fact that SMM mode may -cause similar problems to virtualization makes it a good justification for -solving many of these problems on bare metal. - -4.1) Interrupt clocking - -One of the most immediate problems that occurs with legacy operating systems -is that the system timekeeping routines are often designed to keep track of -time by counting periodic interrupts.
These interrupts may come from the PIT -or the RTC, but the problem is the same: the host virtualization engine may not -be able to deliver the proper number of interrupts per second, and so guest -time may fall behind. This is especially problematic if a high interrupt rate -is selected, such as 1000 HZ, which is unfortunately the default for many Linux -guests. - -There are three approaches to solving this problem; first, it may be possible -to simply ignore it. Guests which have a separate time source for tracking -'wall clock' or 'real time' may not need any adjustment of their interrupts to -maintain proper time. If this is not sufficient, it may be necessary to inject -additional interrupts into the guest in order to increase the effective -interrupt rate. This approach leads to complications in extreme conditions, -where host load or guest lag is too much to compensate for, and thus another -solution to the problem has risen: the guest may need to become aware of lost -ticks and compensate for them internally. Although promising in theory, the -implementation of this policy in Linux has been extremely error prone, and a -number of buggy variants of lost tick compensation are distributed across -commonly used Linux systems. - -Windows uses periodic RTC clocking as a means of keeping time internally, and -thus requires interrupt slewing to keep proper time. It does use a low enough -rate (ed: is it 18.2 Hz?) however that it has not yet been a problem in -practice. - -4.2) TSC sampling and serialization - -As the highest precision time source available, the cycle counter of the CPU -has aroused much interest from developers. As explained above, this timer has -many problems unique to its nature as a local, potentially unstable and -potentially unsynchronized source. One issue which is not unique to the TSC, -but is highlighted because of its very precise nature is sampling delay. By -definition, the counter, once read is already old. 
However, it is also -possible for the counter to be read ahead of the actual use of the result. -This is a consequence of the superscalar execution of the instruction stream, -which may execute instructions out of order. Such execution is called -non-serialized. Forcing serialized execution is necessary for precise -measurement with the TSC, and requires a serializing instruction, such as CPUID -or an MSR read. - -Since CPUID may actually be virtualized by a trap and emulate mechanism, this -serialization can pose a performance issue for hardware virtualization. An -accurate time stamp counter reading may therefore not always be available, and -it may be necessary for an implementation to guard against "backwards" reads of -the TSC as seen from other CPUs, even in an otherwise perfectly synchronized -system. - -4.3) Timespec aliasing - -Additionally, this lack of serialization from the TSC poses another challenge -when using results of the TSC when measured against another time source. As -the TSC is much higher precision, many possible values of the TSC may be read -while another clock is still expressing the same value. - -That is, you may read (T,T+10) while external clock C maintains the same value. -Due to non-serialized reads, you may actually end up with a range which -fluctuates - from (T-1.. T+10). Thus, any time calculated from a TSC, but -calibrated against an external value may have a range of valid values. -Re-calibrating this computation may actually cause time, as computed after the -calibration, to go backwards, compared with time computed before the -calibration. - -This problem is particularly pronounced with an internal time source in Linux, -the kernel time, which is expressed in the theoretically high resolution -timespec - but which advances in much larger granularity intervals, sometimes -at the rate of jiffies, and possibly in catchup modes, at a much larger step. 
- -This aliasing requires care in the computation and recalibration of kvmclock -and any other values derived from TSC computation (such as TSC virtualization -itself). - -4.4) Migration - -Migration of a virtual machine raises problems for timekeeping in two ways. -First, the migration itself may take time, during which interrupts cannot be -delivered, and after which, the guest time may need to be caught up. NTP may -be able to help to some degree here, as the clock correction required is -typically small enough to fall in the NTP-correctable window. - -An additional concern is that timers based off the TSC (or HPET, if the raw bus -clock is exposed) may now be running at different rates, requiring compensation -in some way in the hypervisor by virtualizing these timers. In addition, -migrating to a faster machine may preclude the use of a passthrough TSC, as a -faster clock cannot be made visible to a guest without the potential of time -advancing faster than usual. A slower clock is less of a problem, as it can -always be caught up to the original rate. KVM clock avoids these problems by -simply storing multipliers and offsets against the TSC for the guest to convert -back into nanosecond resolution values. - -4.5) Scheduling - -Since scheduling may be based on precise timing and firing of interrupts, the -scheduling algorithms of an operating system may be adversely affected by -virtualization. In theory, the effect is random and should be uniformly -distributed, but in contrived as well as real scenarios (guest device access, -causes of virtualization exits, possible context switch), this may not always -be the case. The effect of this has not been well studied. - -In an attempt to work around this, several implementations have provided a -paravirtualized scheduler clock, which reveals the true amount of CPU time for -which a virtual machine has been running.
- -4.6) Watchdogs - -Watchdog timers, such as the lock detector in Linux may fire accidentally when -running under hardware virtualization due to timer interrupts being delayed or -misinterpretation of the passage of real time. Usually, these warnings are -spurious and can be ignored, but in some circumstances it may be necessary to -disable such detection. - -4.7) Delays and precision timing - -Precise timing and delays may not be possible in a virtualized system. This -can happen if the system is controlling physical hardware, or issues delays to -compensate for slower I/O to and from devices. The first issue is not solvable -in general for a virtualized system; hardware control software can't be -adequately virtualized without a full real-time operating system, which would -require an RT aware virtualization platform. - -The second issue may cause performance problems, but this is unlikely to be a -significant issue. In many cases these delays may be eliminated through -configuration or paravirtualization. - -4.8) Covert channels and leaks - -In addition to the above problems, time information will inevitably leak to the -guest about the host in anything but a perfect implementation of virtualized -time. This may allow the guest to infer the presence of a hypervisor (as in a -red-pill type detection), and it may allow information to leak between guests -by using CPU utilization itself as a signalling channel. Preventing such -problems would require completely isolated virtual time which may not track -real time any longer. This may be useful in certain security or QA contexts, -but in general isn't recommended for real-world deployment scenarios. 
diff --git a/Documentation/virtual/kvm/vcpu-requests.rst b/Documentation/virtual/kvm/vcpu-requests.rst deleted file mode 100644 index 5feb3706a7ae..000000000000 --- a/Documentation/virtual/kvm/vcpu-requests.rst +++ /dev/null @@ -1,307 +0,0 @@ -================= -KVM VCPU Requests -================= - -Overview -======== - -KVM supports an internal API enabling threads to request a VCPU thread to -perform some activity. For example, a thread may request a VCPU to flush -its TLB with a VCPU request. The API consists of the following functions:: - - /* Check if any requests are pending for VCPU @vcpu. */ - bool kvm_request_pending(struct kvm_vcpu *vcpu); - - /* Check if VCPU @vcpu has request @req pending. */ - bool kvm_test_request(int req, struct kvm_vcpu *vcpu); - - /* Clear request @req for VCPU @vcpu. */ - void kvm_clear_request(int req, struct kvm_vcpu *vcpu); - - /* - * Check if VCPU @vcpu has request @req pending. When the request is - * pending it will be cleared and a memory barrier, which pairs with - * another in kvm_make_request(), will be issued. - */ - bool kvm_check_request(int req, struct kvm_vcpu *vcpu); - - /* - * Make request @req of VCPU @vcpu. Issues a memory barrier, which pairs - * with another in kvm_check_request(), prior to setting the request. - */ - void kvm_make_request(int req, struct kvm_vcpu *vcpu); - - /* Make request @req of all VCPUs of the VM with struct kvm @kvm. */ - bool kvm_make_all_cpus_request(struct kvm *kvm, unsigned int req); - -Typically a requester wants the VCPU to perform the activity as soon -as possible after making the request. This means most requests -(kvm_make_request() calls) are followed by a call to kvm_vcpu_kick(), -and kvm_make_all_cpus_request() has the kicking of all VCPUs built -into it. - -VCPU Kicks ----------- - -The goal of a VCPU kick is to bring a VCPU thread out of guest mode in -order to perform some KVM maintenance. To do so, an IPI is sent, forcing -a guest mode exit. 
However, a VCPU thread may not be in guest mode at the -time of the kick. Therefore, depending on the mode and state of the VCPU -thread, there are two other actions a kick may take. All three actions -are listed below: - -1) Send an IPI. This forces a guest mode exit. -2) Waking a sleeping VCPU. Sleeping VCPUs are VCPU threads outside guest - mode that wait on waitqueues. Waking them removes the threads from - the waitqueues, allowing the threads to run again. This behavior - may be suppressed, see KVM_REQUEST_NO_WAKEUP below. -3) Nothing. When the VCPU is not in guest mode and the VCPU thread is not - sleeping, then there is nothing to do. - -VCPU Mode ---------- - -VCPUs have a mode state, ``vcpu->mode``, that is used to track whether the -guest is running in guest mode or not, as well as some specific -outside guest mode states. The architecture may use ``vcpu->mode`` to -ensure VCPU requests are seen by VCPUs (see "Ensuring Requests Are Seen"), -as well as to avoid sending unnecessary IPIs (see "IPI Reduction"), and -even to ensure IPI acknowledgements are waited upon (see "Waiting for -Acknowledgements"). The following modes are defined: - -OUTSIDE_GUEST_MODE - - The VCPU thread is outside guest mode. - -IN_GUEST_MODE - - The VCPU thread is in guest mode. - -EXITING_GUEST_MODE - - The VCPU thread is transitioning from IN_GUEST_MODE to - OUTSIDE_GUEST_MODE. - -READING_SHADOW_PAGE_TABLES - - The VCPU thread is outside guest mode, but it wants the sender of - certain VCPU requests, namely KVM_REQ_TLB_FLUSH, to wait until the VCPU - thread is done reading the page tables. - -VCPU Request Internals -====================== - -VCPU requests are simply bit indices of the ``vcpu->requests`` bitmap. -This means general bitops, like those documented in [atomic-ops]_ could -also be used, e.g. :: - - clear_bit(KVM_REQ_UNHALT & KVM_REQUEST_MASK, &vcpu->requests); - -However, VCPU request users should refrain from doing so, as it would -break the abstraction. 
The first 8 bits are reserved for architecture -independent requests; all additional bits are available for architecture -dependent requests. - -Architecture Independent Requests ---------------------------------- - -KVM_REQ_TLB_FLUSH - - KVM's common MMU notifier may need to flush all of a guest's TLB - entries, calling kvm_flush_remote_tlbs() to do so. Architectures that - choose to use the common kvm_flush_remote_tlbs() implementation will - need to handle this VCPU request. - -KVM_REQ_MMU_RELOAD - - When shadow page tables are used and memory slots are removed it's - necessary to inform each VCPU to completely refresh the tables. This - request is used for that. - -KVM_REQ_PENDING_TIMER - - This request may be made from a timer handler run on the host on behalf - of a VCPU. It informs the VCPU thread to inject a timer interrupt. - -KVM_REQ_UNHALT - - This request may be made from the KVM common function kvm_vcpu_block(), - which is used to emulate an instruction that causes a CPU to halt until - one of an architecture-specific set of events and/or interrupts is - received (determined by checking kvm_arch_vcpu_runnable()). When that - event or interrupt arrives kvm_vcpu_block() makes the request. This is - in contrast to when kvm_vcpu_block() returns due to any other reason, - such as a pending signal, which does not indicate the VCPU's halt - emulation should stop, and therefore does not make the request. - -KVM_REQUEST_MASK ----------------- - -VCPU requests should be masked by KVM_REQUEST_MASK before using them with -bitops. This is because only the lower 8 bits are used to represent the -request's number. The upper bits are used as flags. Currently only two -flags are defined. - -VCPU Request Flags ------------------- - -KVM_REQUEST_NO_WAKEUP - - This flag is applied to requests that only need immediate attention - from VCPUs running in guest mode. That is, sleeping VCPUs do not need - to be awakened for these requests.
Sleeping VCPUs will handle the - requests when they are awakened later for some other reason. - -KVM_REQUEST_WAIT - - When requests with this flag are made with kvm_make_all_cpus_request(), - then the caller will wait for each VCPU to acknowledge its IPI before - proceeding. This flag only applies to VCPUs that would receive IPIs. - If, for example, the VCPU is sleeping, so no IPI is necessary, then - the requesting thread does not wait. This means that this flag may be - safely combined with KVM_REQUEST_NO_WAKEUP. See "Waiting for - Acknowledgements" for more information about requests with - KVM_REQUEST_WAIT. - -VCPU Requests with Associated State -=================================== - -Requesters that want the receiving VCPU to handle new state need to ensure -the newly written state is observable to the receiving VCPU thread's CPU -by the time it observes the request. This means a write memory barrier -must be inserted after writing the new state and before setting the VCPU -request bit. Additionally, on the receiving VCPU thread's side, a -corresponding read barrier must be inserted after reading the request bit -and before proceeding to read the new state associated with it. See -scenario 3, Message and Flag, of [lwn-mb]_ and the kernel documentation -[memory-barriers]_. - -The pair of functions, kvm_check_request() and kvm_make_request(), provide -the memory barriers, allowing this requirement to be handled internally by -the API. - -Ensuring Requests Are Seen -========================== - -When making requests to VCPUs, we want to avoid the receiving VCPU -executing in guest mode for an arbitrarily long time without handling the -request. We can be sure this won't happen as long as we ensure the VCPU -thread checks kvm_request_pending() before entering guest mode and that a -kick will send an IPI to force an exit from guest mode when necessary.
-Extra care must be taken to cover the period after the VCPU thread's last -kvm_request_pending() check and before it has entered guest mode, as kick -IPIs will only trigger guest mode exits for VCPU threads that are in guest -mode or at least have already disabled interrupts in order to prepare to -enter guest mode. This means that an optimized implementation (see "IPI -Reduction") must be certain when it's safe to not send the IPI. One -solution, which all architectures except s390 apply, is to: - -- set ``vcpu->mode`` to IN_GUEST_MODE between disabling the interrupts and - the last kvm_request_pending() check; -- enable interrupts atomically when entering the guest. - -This solution also requires memory barriers to be placed carefully in both -the requesting thread and the receiving VCPU. With the memory barriers we -can exclude the possibility of a VCPU thread observing -!kvm_request_pending() on its last check and then not receiving an IPI for -the next request made of it, even if the request is made immediately after -the check. This is done by way of the Dekker memory barrier pattern -(scenario 10 of [lwn-mb]_). As the Dekker pattern requires two variables, -this solution pairs ``vcpu->mode`` with ``vcpu->requests``. Substituting -them into the pattern gives:: - - CPU1 CPU2 - ================= ================= - local_irq_disable(); - WRITE_ONCE(vcpu->mode, IN_GUEST_MODE); kvm_make_request(REQ, vcpu); - smp_mb(); smp_mb(); - if (kvm_request_pending(vcpu)) { if (READ_ONCE(vcpu->mode) == - IN_GUEST_MODE) { - ...abort guest entry... ...send IPI... - } } - -As stated above, the IPI is only useful for VCPU threads in guest mode or -that have already disabled interrupts. This is why this specific case of -the Dekker pattern has been extended to disable interrupts before setting -``vcpu->mode`` to IN_GUEST_MODE. 
WRITE_ONCE() and READ_ONCE() are used to -pedantically implement the memory barrier pattern, guaranteeing the -compiler doesn't interfere with ``vcpu->mode``'s carefully planned -accesses. - -IPI Reduction -------------- - -As only one IPI is needed to get a VCPU to check for any/all requests, -then they may be coalesced. This is easily done by having the first IPI -sending kick also change the VCPU mode to something !IN_GUEST_MODE. The -transitional state, EXITING_GUEST_MODE, is used for this purpose. - -Waiting for Acknowledgements ----------------------------- - -Some requests, those with the KVM_REQUEST_WAIT flag set, require IPIs to -be sent, and the acknowledgements to be waited upon, even when the target -VCPU threads are in modes other than IN_GUEST_MODE. For example, one case -is when a target VCPU thread is in READING_SHADOW_PAGE_TABLES mode, which -is set after disabling interrupts. To support these cases, the -KVM_REQUEST_WAIT flag changes the condition for sending an IPI from -checking that the VCPU is IN_GUEST_MODE to checking that it is not -OUTSIDE_GUEST_MODE. - -Request-less VCPU Kicks ------------------------ - -As the determination of whether or not to send an IPI depends on the -two-variable Dekker memory barrier pattern, then it's clear that -request-less VCPU kicks are almost never correct. Without the assurance -that a non-IPI generating kick will still result in an action by the -receiving VCPU, as the final kvm_request_pending() check does for -request-accompanying kicks, then the kick may not do anything useful at -all. If, for instance, a request-less kick was made to a VCPU that was -just about to set its mode to IN_GUEST_MODE, meaning no IPI is sent, then -the VCPU thread may continue its entry without actually having done -whatever it was the kick was meant to initiate. - -One exception is x86's posted interrupt mechanism. 
In this case, however, -even the request-less VCPU kick is coupled with the same -local_irq_disable() + smp_mb() pattern described above; the ON bit -(Outstanding Notification) in the posted interrupt descriptor takes the -role of ``vcpu->requests``. When sending a posted interrupt, PIR.ON is -set before reading ``vcpu->mode``; dually, in the VCPU thread, -vmx_sync_pir_to_irr() reads PIR after setting ``vcpu->mode`` to -IN_GUEST_MODE. - -Additional Considerations -========================= - -Sleeping VCPUs --------------- - -VCPU threads may need to consider requests before and/or after calling -functions that may put them to sleep, e.g. kvm_vcpu_block(). Whether they -do or not, and, if they do, which requests need consideration, is -architecture dependent. kvm_vcpu_block() calls kvm_arch_vcpu_runnable() -to check if it should awaken. One reason to do so is to provide -architectures a function where requests may be checked if necessary. - -Clearing Requests ------------------ - -Generally it only makes sense for the receiving VCPU thread to clear a -request. However, in some circumstances, such as when the requesting -thread and the receiving VCPU thread are executed serially, such as when -they are the same thread, or when they are using some form of concurrency -control to temporarily execute synchronously, then it's possible to know -that the request may be cleared immediately, rather than waiting for the -receiving VCPU thread to handle the request in VCPU RUN. The only current -examples of this are kvm_vcpu_block() calls made by VCPUs to block -themselves. A possible side-effect of that call is to make the -KVM_REQ_UNHALT request, which may then be cleared immediately when the -VCPU returns from the call. - -References -========== - -.. [atomic-ops] Documentation/core-api/atomic_ops.rst -.. [memory-barriers] Documentation/memory-barriers.txt -.. 
[lwn-mb] https://lwn.net/Articles/573436/ diff --git a/Documentation/virtual/paravirt_ops.rst b/Documentation/virtual/paravirt_ops.rst deleted file mode 100644 index 6b789d27cead..000000000000 --- a/Documentation/virtual/paravirt_ops.rst +++ /dev/null @@ -1,35 +0,0 @@ -.. SPDX-License-Identifier: GPL-2.0 - -============ -Paravirt_ops -============ - -Linux provides support for different hypervisor virtualization technologies. -Historically, different binary kernels would be required in order to support -different hypervisors; this restriction was removed with pv_ops. -Linux pv_ops is a virtualization API which enables support for different -hypervisors. It allows each hypervisor to override critical operations and -allows a single kernel binary to run on all supported execution environments -including native machine -- without any hypervisors. - -pv_ops provides a set of function pointers which represent operations -corresponding to low level critical instructions and high level -functionalities in various areas. pv_ops allows for optimizations at run -time by enabling binary patching of the low-level critical operations -at boot time. - -pv_ops operations are classified into three categories: - -- simple indirect call - These operations correspond to high level functionality where it is - known that the overhead of indirect call isn't very important. - -- indirect call which allows optimization with binary patch - Usually these operations correspond to low level critical instructions. They - are called frequently and are performance critical. The overhead is - very important. - -- a set of macros for hand written assembly code - Hand written assembly codes (.S files) also need paravirtualization - because they include sensitive instructions or some of the code paths in - them are very performance critical.
diff --git a/Documentation/virtual/uml/UserModeLinux-HOWTO.txt b/Documentation/virtual/uml/UserModeLinux-HOWTO.txt deleted file mode 100644 index 87b80f589e1c..000000000000 --- a/Documentation/virtual/uml/UserModeLinux-HOWTO.txt +++ /dev/null @@ -1,4589 +0,0 @@ - User Mode Linux HOWTO - User Mode Linux Core Team - Mon Nov 18 14:16:16 EST 2002 - - This document describes the use and abuse of Jeff Dike's User Mode - Linux: a port of the Linux kernel as a normal Intel Linux process. - ______________________________________________________________________ - - Table of Contents - - 1. Introduction - - 1.1 How is User Mode Linux Different? - 1.2 Why Would I Want User Mode Linux? - - 2. Compiling the kernel and modules - - 2.1 Compiling the kernel - 2.2 Compiling and installing kernel modules - 2.3 Compiling and installing uml_utilities - - 3. Running UML and logging in - - 3.1 Running UML - 3.2 Logging in - 3.3 Examples - - 4. UML on 2G/2G hosts - - 4.1 Introduction - 4.2 The problem - 4.3 The solution - - 5. Setting up serial lines and consoles - - 5.1 Specifying the device - 5.2 Specifying the channel - 5.3 Examples - - 6. Setting up the network - - 6.1 General setup - 6.2 Userspace daemons - 6.3 Specifying ethernet addresses - 6.4 UML interface setup - 6.5 Multicast - 6.6 TUN/TAP with the uml_net helper - 6.7 TUN/TAP with a preconfigured tap device - 6.8 Ethertap - 6.9 The switch daemon - 6.10 Slip - 6.11 Slirp - 6.12 pcap - 6.13 Setting up the host yourself - - 7. Sharing Filesystems between Virtual Machines - - 7.1 A warning - 7.2 Using layered block devices - 7.3 Note! - 7.4 Another warning - 7.5 uml_moo : Merging a COW file with its backing file - - 8. Creating filesystems - - 8.1 Create the filesystem file - 8.2 Assign the file to a UML device - 8.3 Creating and mounting the filesystem - - 9. Host file access - - 9.1 Using hostfs - 9.2 hostfs as the root filesystem - 9.3 Building hostfs - - 10. 
The Management Console - 10.1 version - 10.2 halt and reboot - 10.3 config - 10.4 remove - 10.5 sysrq - 10.6 help - 10.7 cad - 10.8 stop - 10.9 go - - 11. Kernel debugging - - 11.1 Starting the kernel under gdb - 11.2 Examining sleeping processes - 11.3 Running ddd on UML - 11.4 Debugging modules - 11.5 Attaching gdb to the kernel - 11.6 Using alternate debuggers - - 12. Kernel debugging examples - - 12.1 The case of the hung fsck - 12.2 Episode 2: The case of the hung fsck - - 13. What to do when UML doesn't work - - 13.1 Strange compilation errors when you build from source - 13.2 (obsolete) - 13.3 A variety of panics and hangs with /tmp on a reiserfs filesystem - 13.4 The compile fails with errors about conflicting types for 'open', 'dup', and 'waitpid' - 13.5 UML doesn't work when /tmp is an NFS filesystem - 13.6 UML hangs on boot when compiled with gprof support - 13.7 syslogd dies with a SIGTERM on startup - 13.8 TUN/TAP networking doesn't work on a 2.4 host - 13.9 You can network to the host but not to other machines on the net - 13.10 I have no root and I want to scream - 13.11 UML build conflict between ptrace.h and ucontext.h - 13.12 The UML BogoMips is exactly half the host's BogoMips - 13.13 When you run UML, it immediately segfaults - 13.14 xterms appear, then immediately disappear - 13.15 Any other panic, hang, or strange behavior - - 14. Diagnosing Problems - - 14.1 Case 1 : Normal kernel panics - 14.2 Case 2 : Tracing thread panics - 14.3 Case 3 : Tracing thread panics caused by other threads - 14.4 Case 4 : Hangs - - 15. Thanks - - 15.1 Code and Documentation - 15.2 Flushing out bugs - 15.3 Buglets and clean-ups - 15.4 Case Studies - 15.5 Other contributions - - - ______________________________________________________________________ - - 1. Introduction - - Welcome to User Mode Linux. It's going to be fun. - - - - 1.1. How is User Mode Linux Different? 
- - Normally, the Linux Kernel talks straight to your hardware (video - card, keyboard, hard drives, etc), and any programs which run ask the - kernel to operate the hardware, like so: - - - - +-----------+-----------+----+ - | Process 1 | Process 2 | ...| - +-----------+-----------+----+ - | Linux Kernel | - +----------------------------+ - | Hardware | - +----------------------------+ - - - - - The User Mode Linux Kernel is different; instead of talking to the - hardware, it talks to a `real' Linux kernel (called the `host kernel' - from now on), like any other program. Programs can then run inside - User-Mode Linux as if they were running under a normal kernel, like - so: - - - - +----------------+ - | Process 2 | ...| - +-----------+----------------+ - | Process 1 | User-Mode Linux| - +----------------------------+ - | Linux Kernel | - +----------------------------+ - | Hardware | - +----------------------------+ - - - - - - 1.2. Why Would I Want User Mode Linux? - - - 1. If User Mode Linux crashes, your host kernel is still fine. - - 2. You can run a usermode kernel as a non-root user. - - 3. You can debug the User Mode Linux like any normal process. - - 4. You can run gprof (profiling) and gcov (coverage testing). - - 5. You can play with your kernel without breaking things. - - 6. You can use it as a sandbox for testing new apps. - - 7. You can try new development kernels safely. - - 8. You can run different distributions simultaneously. - - 9. It's extremely fun. - - - - - - 2. Compiling the kernel and modules - - - - - 2.1. Compiling the kernel - - - Compiling the user mode kernel is just like compiling any other - kernel. Let's go through the steps, using 2.4.0-prerelease (current - as of this writing) as an example: - - - 1. Download the latest UML patch from - - the download page - . - - - 3. Make a directory and unpack the kernel into it. 
- - - - host% - mkdir ~/uml - - - - - - - host% - cd ~/uml - - - - - - - host% - tar -xzvf linux-2.4.0-prerelease.tar.bz2 - - - - - - - 4. Apply the patch using - - - - host% - cd ~/uml/linux - - - - host% - bzcat uml-patch-2.4.0-prerelease.bz2 | patch -p1 - - - - - - - 5. Run your favorite config; `make xconfig ARCH=um' is the most - convenient. `make config ARCH=um' and 'make menuconfig ARCH=um' - will work as well. The defaults will give you a useful kernel. If - you want to change something, go ahead, it probably won't hurt - anything. - - - Note: If the host is configured with a 2G/2G address space split - rather than the usual 3G/1G split, then the packaged UML binaries - will not run. They will immediately segfault. See ``UML on 2G/2G - hosts'' for the scoop on running UML on your system. - - - - 6. Finish with `make linux ARCH=um': the result is a file called - `linux' in the top directory of your source tree. - - Make sure that you don't build this kernel in /usr/src/linux. On some - distributions, /usr/include/asm is a link into this pool. The user- - mode build changes the other end of that link, and things that include - stop compiling. - - The sources are also available from cvs at the project's cvs page, - which has directions on getting the sources. You can also browse the - CVS pool from there. - - If you get the CVS sources, you will have to check them out into an - empty directory. You will then have to copy each file into the - corresponding directory in the appropriate kernel pool. - - If you don't have the latest kernel pool, you can get the - corresponding user-mode sources with - - - host% cvs co -r v_2_3_x linux - - - - - where 'x' is the version in your pool. Note that you will not get the - bug fixes and enhancements that have gone into subsequent releases. - - - 2.2. 
Compiling and installing kernel modules - - UML modules are built in the same way as the native kernel (with the - exception of the 'ARCH=um' that you always need for UML): - - - host% make modules ARCH=um - - - - - Any modules that you want to load into this kernel need to be built in - the user-mode pool. Modules from the native kernel won't work. - - You can install them by using ftp or something to copy them into the - virtual machine and dropping them into /lib/modules/`uname -r`. - - You can also get the kernel build process to install them as follows: - - 1. with the kernel not booted, mount the root filesystem in the top - level of the kernel pool: - - - host% mount root_fs mnt -o loop - - - - - - - 2. run - - - host% - make modules_install INSTALL_MOD_PATH=`pwd`/mnt ARCH=um - - - - - - - 3. unmount the filesystem - - - host% umount mnt - - - - - - - 4. boot the kernel on it - - - When the system is booted, you can use insmod as usual to get the - modules into the kernel. A number of things have been loaded into UML - as modules, especially filesystems and network protocols and filters, - so most symbols which need to be exported probably already are. - However, if you do find symbols that need exporting, let us - know, and - they'll be "taken care of". - - - - 2.3. Compiling and installing uml_utilities - - Many features of the UML kernel require a user-space helper program, - so a uml_utilities package is distributed separately from the kernel - patch which provides these helpers. 
Included within this is: - - o port-helper - Used by consoles which connect to xterms or ports - - o tunctl - Configuration tool to create and delete tap devices - - o uml_net - Setuid binary for automatic tap device configuration - - o uml_switch - User-space virtual switch required for daemon - transport - - The uml_utilities tree is compiled with: - - - host# - make && make install - - - - - Note that UML kernel patches may require a specific version of the - uml_utilities distribution. If you don't keep up with the mailing - lists, ensure that you have the latest release of uml_utilities if you - are experiencing problems with your UML kernel, particularly when - dealing with consoles or command-line switches to the helper programs - - - - - - - - - 3. Running UML and logging in - - - - 3.1. Running UML - - It runs on 2.2.15 or later, and all 2.4 kernels. - - - Booting UML is straightforward. Simply run 'linux': it will try to - mount the file `root_fs' in the current directory. You do not need to - run it as root. If your root filesystem is not named `root_fs', then - you need to put a `ubd0=root_fs_whatever' switch on the linux command - line. - - - You will need a filesystem to boot UML from. There are a number - available for download from here . There are also several tools - which can be - used to generate UML-compatible filesystem images from media. - The kernel will boot up and present you with a login prompt. - - - Note: If the host is configured with a 2G/2G address space split - rather than the usual 3G/1G split, then the packaged UML binaries will - not run. They will immediately segfault. See ``UML on 2G/2G hosts'' - for the scoop on running UML on your system. - - - - 3.2. Logging in - - - - The prepackaged filesystems have a root account with password 'root' - and a user account with password 'user'. The login banner will - generally tell you how to log in. So, you log in and you will find - yourself inside a little virtual machine. 
Our filesystems have a - variety of commands and utilities installed (and it is fairly easy to - add more), so you will have a lot of tools with which to poke around - the system. - - There are a couple of other ways to log in: - - o On a virtual console - - - - Each virtual console that is configured (i.e. the device exists in - /dev and /etc/inittab runs a getty on it) will come up in its own - xterm. If you get tired of the xterms, read ``Setting up serial - lines and consoles'' to see how to attach the consoles to - something else, like host ptys. - - - - o Over the serial line - - - In the boot output, find a line that looks like: - - - - serial line 0 assigned pty /dev/ptyp1 - - - - - Attach your favorite terminal program to the corresponding tty. I.e. - for minicom, the command would be - - - host% minicom -o -p /dev/ttyp1 - - - - - - - o Over the net - - - If the network is running, then you can telnet to the virtual - machine and log in to it. See ``Setting up the network'' to learn - about setting up a virtual network. - - When you're done using it, run halt, and the kernel will bring itself - down and the process will exit. - - - 3.3. Examples - - Here are some examples of UML in action: - - o A login session - - o A virtual network - - - - - - - - 4. UML on 2G/2G hosts - - - - - 4.1. Introduction - - - Most Linux machines are configured so that the kernel occupies the - upper 1G (0xc0000000 - 0xffffffff) of the 4G address space and - processes use the lower 3G (0x00000000 - 0xbfffffff). However, some - machines are configured with a 2G/2G split, with the kernel occupying - the upper 2G (0x80000000 - 0xffffffff) and processes using the lower - 2G (0x00000000 - 0x7fffffff). - - - - - 4.2. The problem - - - The prebuilt UML binaries on this site will not run on 2G/2G hosts - because UML occupies the upper .5G of the 3G process address space - (0xa0000000 - 0xbfffffff).
Obviously, on 2G/2G hosts, this is right - in the middle of the kernel address space, so UML won't even load - it - will immediately segfault. - - - - - 4.3. The solution - - - The fix for this is to rebuild UML from source after enabling - CONFIG_HOST_2G_2G (under 'General Setup'). This will cause UML to - load itself in the top .5G of that smaller process address space, - where it will run fine. See ``Compiling the kernel and modules'' if - you need help building UML from source. - - - - - - - - - - - 5. Setting up serial lines and consoles - - - It is possible to attach UML serial lines and consoles to many types - of host I/O channels by specifying them on the command line. - - - You can attach them to host ptys, ttys, file descriptors, and ports. - This allows you to do things like - - o have a UML console appear on an unused host console, - - o hook two virtual machines together by having one attach to a pty - and having the other attach to the corresponding tty - - o make a virtual machine accessible from the net by attaching a - console to a port on the host. - - - The general format of the command line option is device=channel. - - - - 5.1. Specifying the device - - Devices are specified with "con" or "ssl" (console or serial line, - respectively), optionally with a device number if you are talking - about a specific device. - - - Using just "con" or "ssl" describes all of the consoles or serial - lines. If you want to talk about console #3 or serial line #10, they - would be "con3" and "ssl10", respectively. - - - A specific device name will override a less general "con=" or "ssl=". - So, for example, you can assign a pty to each of the serial lines - except for the first two like this: - - - ssl=pty ssl0=tty:/dev/tty0 ssl1=tty:/dev/tty1 - - - - - The specificity of the device name is all that matters; order on the - command line is irrelevant. - - - - 5.2. 
Specifying the channel - - There are a number of different types of channels to attach a UML - device to, each with a different way of specifying exactly what to - attach to. - - o pseudo-terminals - device=pty pts terminals - device=pts - - - This will cause UML to allocate a free host pseudo-terminal for the - device. The terminal that it got will be announced in the boot - log. You access it by attaching a terminal program to the - corresponding tty: - - o screen /dev/pts/n - - o screen /dev/ttyxx - - o minicom -o -p /dev/ttyxx - minicom seems not able to handle pts - devices - - o kermit - start it up, 'open' the device, then 'connect' - - - - - - o terminals - device=tty:tty device file - - - This will make UML attach the device to the specified tty (i.e - - - con1=tty:/dev/tty3 - - - - - will attach UML's console 1 to the host's /dev/tty3). If the tty that - you specify is the slave end of a tty/pty pair, something else must - have already opened the corresponding pty in order for this to work. - - - - - - o xterms - device=xterm - - - UML will run an xterm and the device will be attached to it. - - - - - - o Port - device=port:port number - - - This will attach the UML devices to the specified host port. - Attaching console 1 to the host's port 9000 would be done like - this: - - - con1=port:9000 - - - - - Attaching all the serial lines to that port would be done similarly: - - - ssl=port:9000 - - - - - You access these devices by telnetting to that port. Each active tel- - net session gets a different device. If there are more telnets to a - port than UML devices attached to it, then the extra telnet sessions - will block until an existing telnet detaches, or until another device - becomes active (i.e. by being activated in /etc/inittab). - - This channel has the advantage that you can both attach multiple UML - devices to it and know how to access them without reading the UML boot - log. 
It is also unique in allowing access to a UML from remote - machines without requiring that the UML be networked. This could be - useful in allowing public access to UMLs because they would be - accessible from the net, but wouldn't need any kind of network - filtering or access control because they would have no network access. - - - If you attach the main console to a portal, then the UML boot will - appear to hang. In reality, it's waiting for a telnet to connect, at - which point the boot will proceed. - - - - - - o already-existing file descriptors - device=file descriptor - - - If you set up a file descriptor on the UML command line, you can - attach a UML device to it. This is most commonly used to put the - main console back on stdin and stdout after assigning all the other - consoles to something else: - - - con0=fd:0,fd:1 con=pts - - - - - - - - - o Nothing - device=null - - - This allows the device to be opened, in contrast to 'none', but - reads will block, and writes will succeed and the data will be - thrown out. - - - - - - o None - device=none - - - This causes the device to disappear. - - - - You can also specify different input and output channels for a device - by putting a comma between them: - - - ssl3=tty:/dev/tty2,xterm - - - - - will cause serial line 3 to accept input on the host's /dev/tty2 and - display output on an xterm. That's a silly example - the most common - use of this syntax is to reattach the main console to stdin and stdout - as shown above. - - - If you decide to move the main console away from stdin/stdout, the - initial boot output will appear in the terminal that you're running - UML in. However, once the console driver has been officially - initialized, then the boot output will start appearing wherever you - specified that console 0 should be. That device will receive all - subsequent output. - - - - 5.3. Examples - - There are a number of interesting things you can do with this - capability. 
- - - First, this is how you get rid of those bleeding console xterms by - attaching them to host ptys: - - - con=pty con0=fd:0,fd:1 - - - - - This will make a UML console take over an unused host virtual console, - so that when you switch to it, you will see the UML login prompt - rather than the host login prompt: - - - con1=tty:/dev/tty6 - - - - - You can attach two virtual machines together with what amounts to a - serial line as follows: - - Run one UML with a serial line attached to a pty - - - - ssl1=pty - - - - - Look at the boot log to see what pty it got (this example will assume - that it got /dev/ptyp1). - - Boot the other UML with a serial line attached to the corresponding - tty - - - - ssl1=tty:/dev/ttyp1 - - - - - Log in, make sure that it has no getty on that serial line, attach a - terminal program like minicom to it, and you should see the login - prompt of the other virtual machine. - - - 6. Setting up the network - - - - This page describes how to set up the various transports and to - provide a UML instance with network access to the host, other machines - on the local net, and the rest of the net. - - - As of 2.4.5, UML networking has been completely redone to make it much - easier to set up, fix bugs, and add new features. - - - There is a new helper, uml_net, which does the host setup that - requires root privileges. - - - There are currently seven transport types available for a UML virtual - machine to exchange packets with other hosts: - - o ethertap - - o TUN/TAP - - o Multicast - - o a switch daemon - - o slip - - o slirp - - o pcap - - The TUN/TAP, ethertap, slip, and slirp transports allow a UML - instance to exchange packets with the host. They may be directed - to the host or the host may just act as a router to provide access - to other physical or virtual machines. - - - The pcap transport is a synthetic read-only interface, using the - libpcap binary to collect packets from interfaces on the host and - filter them.
This is useful for building preconfigured traffic - monitors or sniffers. - - - The daemon and multicast transports provide a completely virtual - network to other virtual machines. This network is completely - disconnected from the physical network unless one of the virtual - machines on it is acting as a gateway. - - - With so many host transports, which one should you use? Here's when - you should use each one: - - o ethertap - if you want access to the host networking and it is - running 2.2 - - o TUN/TAP - if you want access to the host networking and it is - running 2.4. Also, the TUN/TAP transport is able to use a - preconfigured device, allowing it to avoid using the setuid uml_net - helper, which is a security advantage. - - o Multicast - if you want a purely virtual network and you don't want - to set up anything but the UML - - o a switch daemon - if you want a purely virtual network and you - don't mind running the daemon in order to get somewhat better - performance - - o slip - there is no particular reason to run the slip backend unless - ethertap and TUN/TAP are just not available for some reason - - o slirp - if you don't have root access on the host to set up - networking, or if you don't want to allocate an IP to your UML - - o pcap - not much use for actual network connectivity, but great for - monitoring traffic on the host - - Ethertap is available on 2.4 and works fine. TUN/TAP is preferred - to it because it has better performance and ethertap is officially - considered obsolete in 2.4. Also, the root helper only needs to - run occasionally for TUN/TAP, rather than handling every packet, as - it does with ethertap. This is a slight security advantage since - it provides fewer opportunities for a nasty UML user to somehow - exploit the helper's root privileges. - - - 6.1. General setup - - First, you must have the virtual network enabled in your UML. If you are - running a prebuilt kernel from this site, everything is already - enabled.
If you build the kernel yourself, under the "Network device - support" menu, enable "Network device support", and then the three - transports. - - - The next step is to provide a network device to the virtual machine. - This is done by describing it on the kernel command line. - - The general format is - - - eth<n>=<transport>,<transport args> - - - - - For example, a virtual ethernet device may be attached to a host - ethertap device as follows: - - - eth0=ethertap,tap0,fe:fd:0:0:0:1,192.168.0.254 - - - - - This sets up eth0 inside the virtual machine to attach itself to the - host /dev/tap0, assigns it an ethernet address, and assigns the host - tap0 interface an IP address. - - - - Note that the IP address you assign to the host end of the tap device - must be different than the IP you assign to the eth device inside UML. - If you are short on IPs and don't want to consume two per UML, then - you can reuse the host's eth IP address for the host ends of the tap - devices. Internally, the UMLs must still get unique IPs for their eth - devices. You can also give the UMLs non-routable IPs (192.168.x.x or - 10.x.x.x) and have the host masquerade them. This will let outgoing - connections work, but incoming connections won't without more work, - such as port forwarding from the host. - Also note that when you configure the host side of an interface, it is - only acting as a gateway. It will respond to pings sent to it - locally, but it is not useful to do that since it's a host interface. - You are not talking to the UML when you ping that interface and get a - response. - - - You can also add devices to a UML and remove them at runtime. See the - ``The Management Console'' page for details. - - - The sections below describe this in more detail. - - - Once you've decided how you're going to set up the devices, you boot - UML, log in, configure the UML side of the devices, and set up routes - to the outside world.
At that point, you will be able to talk to any - other machines, physical or virtual, on the net. - - - If ifconfig inside UML fails and the network refuses to come up, run - 'dmesg' to see what ended up in the kernel log. That will usually - tell you what went wrong. - - - - 6.2. Userspace daemons - - You will likely need the setuid helper, or the switch daemon, or both. - They are both installed with the RPM and deb, so if you've installed - either, you can skip the rest of this section. - - - If not, then you need to check them out of CVS, build them, and - install them. The helper is uml_net, in CVS /tools/uml_net, and the - daemon is uml_switch, in CVS /tools/uml_router. They are both built - with a plain 'make'. Both need to be installed in a directory that's - in your path - /usr/bin is recommended. On top of that, uml_net needs - to be setuid root. - - - - 6.3. Specifying ethernet addresses - - Below, you will see that the TUN/TAP, ethertap, and daemon interfaces - allow you to specify hardware addresses for the virtual ethernet - devices. This is generally not necessary. If you don't have a - specific reason to do it, you probably shouldn't. If one is not - specified on the command line, the driver will assign one based on the - device IP address. It will provide the address fe:fd:nn:nn:nn:nn - where nn.nn.nn.nn is the device IP address. This is nearly always - sufficient to guarantee a unique hardware address for the device. A - couple of exceptions are: - - o Another set of virtual ethernet devices are on the same network and - they are assigned hardware addresses using a different scheme which - may conflict with the UML IP address-based scheme - - o You aren't going to use the device for IP networking, so you don't - assign the device an IP address - - If you let the driver provide the hardware address, you should make - sure that the device IP address is known before the interface is - brought up.
So, inside UML, this will guarantee that: - - - - UML# - ifconfig eth0 192.168.0.250 up - - - - - If you decide to assign the hardware address yourself, make sure that - the first byte of the address is even. Addresses with an odd first - byte are broadcast addresses, which you don't want assigned to a - device. - - - - 6.4. UML interface setup - - Once the network devices have been described on the command line, you - should boot UML and log in. - - - The first thing to do is bring the interface up: - - - UML# ifconfig ethn ip-address up - - - - - You should be able to ping the host at this point. - - - To reach the rest of the world, you should set a default route to the - host: - - - UML# route add default gw host ip - - - - - Again, with host ip of 192.168.0.4: - - - UML# route add default gw 192.168.0.4 - - - - - This page used to recommend setting a network route to your local net. - This is wrong, because it will cause UML to try to figure out hardware - addresses of the local machines by arping on the interface to the - host. Since that interface is basically a single strand of ethernet - with two nodes on it (UML and the host) and arp requests don't cross - networks, they will fail to elicit any responses. So, what you want - is for UML to just blindly throw all packets at the host and let it - figure out what to do with them, which is what leaving out the network - route and adding the default route does. - - - Note: If you can't communicate with other hosts on your physical - ethernet, it's probably because of a network route that's - automatically set up. 
If you run 'route -n' and see a route that - looks like this: - - - - - Destination Gateway Genmask Flags Metric Ref Use Iface - 192.168.0.0 0.0.0.0 255.255.255.0 U 0 0 0 eth0 - - - - - with a mask that's not 255.255.255.255, then replace it with a route - to your host: - - - UML# - route del -net 192.168.0.0 dev eth0 netmask 255.255.255.0 - - - - - - - UML# - route add -host 192.168.0.4 dev eth0 - - - - - This, plus the default route to the host, will allow UML to exchange - packets with any machine on your ethernet. - - - - 6.5. Multicast - - The simplest way to set up a virtual network between multiple UMLs is - to use the mcast transport. This was written by Harald Welte and is - present in UML version 2.4.5-5um and later. Your system must have - multicast enabled in the kernel and there must be a multicast-capable - network device on the host. Normally, this is eth0, but if there is - no ethernet card on the host, then you will likely get strange error - messages when you bring the device up inside UML. - - - To use it, run two UMLs with - - - eth0=mcast - - - - - on their command lines. Log in, configure the ethernet device in each - machine with different IP addresses: - - - UML1# ifconfig eth0 192.168.0.254 - - - - - - - UML2# ifconfig eth0 192.168.0.253 - - - - - and they should be able to talk to each other. - - The full set of command line options for this transport are - - - - ethn=mcast,ethernet address,multicast - address,multicast port,ttl - - - - - Harald's original README is here and explains these in detail, as well as - some other issues. - - There is also a related point-to-point only "ucast" transport. - This is useful when your network does not support multicast, and - all network connections are simple point to point links. - - The full set of command line options for this transport are - - - ethn=ucast,ethernet address,remote address,listen port,remote port - - - - - 6.6. 
TUN/TAP with the uml_net helper - - TUN/TAP is the preferred mechanism on 2.4 to exchange packets with the - host. The TUN/TAP backend has been in UML since 2.4.9-3um. - - - The easiest way to get up and running is to let the setuid uml_net - helper do the host setup for you. This involves insmod-ing the tun.o - module if necessary, configuring the device, and setting up IP - forwarding, routing, and proxy arp. If you are new to UML networking, - do this first. If you're concerned about the security implications of - the setuid helper, use it to get up and running, then read the next - section to see how to have UML use a preconfigured tap device, which - avoids the use of uml_net. - - - If you specify an IP address for the host side of the device, the - uml_net helper will do all necessary setup on the host - the only - requirement is that TUN/TAP be available, either built in to the host - kernel or as the tun.o module. - - The format of the command line switch to attach a device to a TUN/TAP - device is - - - eth =tuntap,,, - - - - - For example, this argument will attach the UML's eth0 to the next - available tap device and assign an ethernet address to it based on its - IP address - - - eth0=tuntap,,,192.168.0.254 - - - - - - - Note that the IP address that must be used for the eth device inside - UML is fixed by the routing and proxy arp that is set up on the - TUN/TAP device on the host. You can use a different one, but it won't - work because reply packets won't reach the UML. This is a feature. - It prevents a nasty UML user from doing things like setting the UML IP - to the same as the network's nameserver or mail server. - - - There are a couple potential problems with running the TUN/TAP - transport on a 2.4 host kernel - - o TUN/TAP seems not to work on 2.4.3 and earlier. Upgrade the host - kernel or use the ethertap transport. 
- - o With an upgraded kernel, TUN/TAP may fail with - - - File descriptor in bad state - - - - - This is due to a header mismatch between the upgraded kernel and the - kernel that was originally installed on the machine. The fix is to - make sure that /usr/src/linux points to the headers for the running - kernel. - - These were pointed out by Tim Robinson in - name="this uml- - user post"> . - - - - 6.7. TUN/TAP with a preconfigured tap device - - If you prefer not to have UML use uml_net (which is somewhat - insecure), with UML 2.4.17-11, you can set up a TUN/TAP device - beforehand. The setup needs to be done as root, but once that's done, - there is no need for root assistance. Setting up the device is done - as follows: - - o Create the device with tunctl (available from the UML utilities - tarball) - - - - - host# tunctl -u uid - - - - - where uid is the user id or username that UML will be run as. This - will tell you what device was created. - - o Configure the device IP (change IP addresses and device name to - suit) - - - - - host# ifconfig tap0 192.168.0.254 up - - - - - - o Set up routing and arping if desired - this is my recipe, there are - other ways of doing the same thing - - - host# - bash -c 'echo 1 > /proc/sys/net/ipv4/ip_forward' - - host# - route add -host 192.168.0.253 dev tap0 - - - - - - - host# - bash -c 'echo 1 > /proc/sys/net/ipv4/conf/tap0/proxy_arp' - - - - - - - host# - arp -Ds 192.168.0.253 eth0 pub - - - - - Note that this must be done every time the host boots - this configu- - ration is not stored across host reboots. So, it's probably a good - idea to stick it in an rc file. An even better idea would be a little - utility which reads the information from a config file and sets up - devices at boot time. - - o Rather than using up two IPs and ARPing for one of them, you can - also provide direct access to your LAN by the UML by using a - bridge. 
- - - host# - brctl addbr br0 - - - - - - - host# - ifconfig eth0 0.0.0.0 promisc up - - - - - - - host# - ifconfig tap0 0.0.0.0 promisc up - - - - - - - host# - ifconfig br0 192.168.0.1 netmask 255.255.255.0 up - - - - - - - - host# - brctl stp br0 off - - - - - - - host# - brctl setfd br0 1 - - - - - - - host# - brctl sethello br0 1 - - - - - - - host# - brctl addif br0 eth0 - - - - - - - host# - brctl addif br0 tap0 - - - - - Note that 'br0' should be setup using ifconfig with the existing IP - address of eth0, as eth0 no longer has its own IP. - - o - - - Also, the /dev/net/tun device must be writable by the user running - UML in order for the UML to use the device that's been configured - for it. The simplest thing to do is - - - host# chmod 666 /dev/net/tun - - - - - Making it world-writable looks bad, but it seems not to be - exploitable as a security hole. However, it does allow anyone to cre- - ate useless tap devices (useless because they can't configure them), - which is a DOS attack. A somewhat more secure alternative would to be - to create a group containing all the users who have preconfigured tap - devices and chgrp /dev/net/tun to that group with mode 664 or 660. - - - o Once the device is set up, run UML with 'eth0=tuntap,device name' - (i.e. 'eth0=tuntap,tap0') on the command line (or do it with the - mconsole config command). - - o Bring the eth device up in UML and you're in business. - - If you don't want that tap device any more, you can make it non- - persistent with - - - host# tunctl -d tap device - - - - - Finally, tunctl has a -b (for brief mode) switch which causes it to - output only the name of the tap device it created. This makes it - suitable for capture by a script: - - - host# TAP=`tunctl -u 1000 -b` - - - - - - - 6.8. Ethertap - - Ethertap is the general mechanism on 2.2 for userspace processes to - exchange packets with the kernel. 
- - - - To use this transport, you need to describe the virtual network device - on the UML command line. The general format for this is - - - eth<n>=ethertap,<device>,<ethernet address>,<tap IP address> - - - - - So, the previous example - - - eth0=ethertap,tap0,fe:fd:0:0:0:1,192.168.0.254 - - - - - attaches the UML eth0 device to the host /dev/tap0, assigns it the - ethernet address fe:fd:0:0:0:1, and assigns the IP address - 192.168.0.254 to the tap device. - - - - The tap device is mandatory, but the others are optional. If the - ethernet address is omitted, one will be assigned to it. - - - The presence of the tap IP address will cause the helper to run and do - whatever host setup is needed to allow the virtual machine to - communicate with the outside world. If you're not sure you know what - you're doing, this is the way to go. - - - If it is absent, then you must configure the tap device and whatever - arping and routing you will need on the host. However, even in this - case, the uml_net helper still needs to be in your path and it must be - setuid root if you're not running UML as root. This is because the - tap device doesn't support SIGIO, which UML needs in order to use - something as a source of input. So, the helper is used as a - convenient asynchronous IO thread. - - If you're using the uml_net helper, you can ignore the following host - setup - uml_net will do it for you. You just need to make sure you - have ethertap available, either built in to the host kernel or - available as a module. - - - If you want to set things up yourself, you need to make sure that the - appropriate /dev entry exists. If it doesn't, become root and create - it as follows: - - - mknod /dev/tap<minor> c 36 <minor> + 16 - - - - - For example, this is how to create /dev/tap0: - - - mknod /dev/tap0 c 36 0 + 16 - - - - - You also need to make sure that the host kernel has ethertap support. - If ethertap is enabled as a module, you apparently need to insmod - ethertap once for each ethertap device you want to enable.
So, - - - host# - insmod ethertap - - - - - will give you the tap0 interface. To get the tap1 interface, you need - to run - - - host# - insmod ethertap unit=1 -o ethertap1 - - - - - - - - 6.9. The switch daemon - - Note: This is the daemon formerly known as uml_router, but which was - renamed so the network weenies of the world would stop growling at me. - - - The switch daemon, uml_switch, provides a mechanism for creating a - totally virtual network. By default, it provides no connection to the - host network (but see -tap, below). - - - The first thing you need to do is run the daemon. Running it with no - arguments will make it listen on a default pair of unix domain - sockets. - - - If you want it to listen on a different pair of sockets, use - - - -unix control socket data socket - - - - - - If you want it to act as a hub rather than a switch, use - - - -hub - - - - - - If you want the switch to be connected to host networking (allowing - the umls to get access to the outside world through the host), use - - - -tap tap0 - - - - - - Note that the tap device must be preconfigured (see "TUN/TAP with a - preconfigured tap device", above). If you're using a different tap - device than tap0, specify that instead of tap0. - - - uml_switch can be backgrounded as follows - - - host% - uml_switch [ options ] < /dev/null > /dev/null - - - - - The reason it doesn't background by default is that it listens to - stdin for EOF. When it sees that, it exits. - - - The general format of the kernel command line switch is - - - - ethn=daemon,ethernet address,socket - type,control socket,data socket - - - - - You can leave off everything except the 'daemon'. You only need to - specify the ethernet address if the one that will be assigned to it - isn't acceptable for some reason. The rest of the arguments describe - how to communicate with the daemon. You should only specify them if - you told the daemon to use different sockets than the default. 
So, if - you ran the daemon with no arguments, running the UML on the same - machine with - eth0=daemon - - - - - will cause the eth0 driver to attach itself to the daemon correctly. - - - - 6.10. Slip - - Slip is another, less general, mechanism for a process to communicate - with the host networking. In contrast to the ethertap interface, - which exchanges ethernet frames with the host and can be used to - transport any higher-level protocol, it can only be used to transport - IP. - - - The general format of the command line switch is - - - - ethn=slip,slip IP - - - - - The slip IP argument is the IP address that will be assigned to the - host end of the slip device. If it is specified, the helper will run - and will set up the host so that the virtual machine can reach it and - the rest of the network. - - - There are some oddities with this interface that you should be aware - of. You should only specify one slip device on a given virtual - machine, and its name inside UML will be 'umn', not 'eth0' or whatever - you specified on the command line. These problems will be fixed at - some point. - - - - 6.11. Slirp - - slirp uses an external program, usually /usr/bin/slirp, to provide IP - only networking connectivity through the host. This is similar to IP - masquerading with a firewall, although the translation is performed in - user-space, rather than by the kernel. As slirp does not set up any - interfaces on the host, or change routing, slirp does not require - root access or setuid binaries on the host. - - - The general format of the command line switch for slirp is: - - - - ethn=slirp,ethernet address,slirp path - - - - - The ethernet address is optional, as UML will set up the interface - with an ethernet address based upon the initial IP address of the - interface. The slirp path is generally /usr/bin/slirp, although it - will depend on distribution.
- - - The slirp program can have a number of options passed to the command - line and we can't add them to the UML command line, as they will be - parsed incorrectly. Instead, a wrapper shell script can be written or - the options inserted into the ~/.slirprc file. More information on - all of the slirp options can be found in its man pages. - - - The eth0 interface on UML should be set up with the IP 10.2.0.15, - although you can use anything as long as it is not used by a network - you will be connecting to. The default route on UML should be set to - use - - - UML# - route add default dev eth0 - - - - - slirp provides a number of useful IP addresses which can be used by - UML, such as 10.0.2.3 which is an alias for the DNS server specified - in /etc/resolv.conf on the host or the IP given in the 'dns' option - for slirp. - - - Even with a baudrate setting higher than 115200, the slirp connection - is limited to 115200. If you need it to go faster, the slirp binary - needs to be compiled with FULL_BOLT defined in config.h. - - - - 6.12. pcap - - The pcap transport is attached to a UML ethernet device on the command - line or with uml_mconsole with the following syntax: - - - - ethn=pcap,host interface,filter - expression,option1,option2 - - - - - The expression and options are optional. - - - The interface is whatever network device on the host you want to - sniff. The expression is a pcap filter expression, which is also what - tcpdump uses, so if you know how to specify tcpdump filters, you will - use the same expressions here. The options are up to two of - 'promisc', 'optimize', and 'nooptimize'. 'promisc' controls whether pcap puts the host interface into - promiscuous mode. 'optimize' and 'nooptimize' control whether the pcap - expression optimizer is used. - - - Example: - - - - eth0=pcap,eth0,tcp - - eth1=pcap,eth0,!tcp - - - - will cause the UML eth0 to emit all tcp packets on the host eth0 and - the UML eth1 to emit all non-tcp packets on the host eth0. - - - - 6.13.
Setting up the host yourself - - If you don't specify an address for the host side of the ethertap or - slip device, UML won't do any setup on the host. So this is what is - needed to get things working (the examples use a host-side IP of - 192.168.0.251 and a UML-side IP of 192.168.0.250 - adjust to suit your - own network): - - o The device needs to be configured with its IP address. Tap devices - are also configured with an mtu of 1484. Slip devices are - configured with a point-to-point address pointing at the UML ip - address. - - - host# ifconfig tap0 arp mtu 1484 192.168.0.251 up - - - - - - - host# - ifconfig sl0 192.168.0.251 pointopoint 192.168.0.250 up - - - - - - o If a tap device is being set up, a route is set to the UML IP. - - - UML# route add -host 192.168.0.250 gw 192.168.0.251 - - - - - - o To allow other hosts on your network to see the virtual machine, - proxy arp is set up for it. - - - host# arp -Ds 192.168.0.250 eth0 pub - - - - - - o Finally, the host is set up to route packets. - - - host# echo 1 > /proc/sys/net/ipv4/ip_forward - - - - - - - - - - - 7. Sharing Filesystems between Virtual Machines - - - - - 7.1. A warning - - Don't attempt to share filesystems simply by booting two UMLs from the - same file. That's the same thing as booting two physical machines - from a shared disk. It will result in filesystem corruption. - - - - 7.2. Using layered block devices - - The way to share a filesystem between two virtual machines is to use - the copy-on-write (COW) layering capability of the ubd block driver. - As of 2.4.6-2um, the driver supports layering a read-write private - device over a read-only shared device. A machine's writes are stored - in the private device, while reads come from either device - the - private one if the requested block is valid in it, the shared one if - not. 
Using this scheme, the majority of data which is unchanged is - shared between an arbitrary number of virtual machines, each of which - has a much smaller file containing the changes that it has made. With - a large number of UMLs booting from a large root filesystem, this - leads to a huge disk space saving. It will also help performance, - since the host will be able to cache the shared data using a much - smaller amount of memory, so UML disk requests will be served from the - host's memory rather than its disks. - - - - - To add a copy-on-write layer to an existing block device file, simply - add the name of the COW file to the appropriate ubd switch: - - - ubd0=root_fs_cow,root_fs_debian_22 - - - - - where 'root_fs_cow' is the private COW file and 'root_fs_debian_22' is - the existing shared filesystem. The COW file need not exist. If it - doesn't, the driver will create and initialize it. Once the COW file - has been initialized, it can be used on its own on the command line: - - - ubd0=root_fs_cow - - - - - The name of the backing file is stored in the COW file header, so it - would be redundant to continue specifying it on the command line. - - - - 7.3. Note! - - When checking the size of the COW file in order to see the gobs of - space that you're saving, make sure you use 'ls -ls' to see the actual - disk consumption rather than the length of the file. The COW file is - sparse, so the length will be very different from the disk usage. - Here is a 'ls -l' of a COW file and backing file from one boot and - shutdown: - host% ls -l cow.debian debian2.2 - -rw-r--r-- 1 jdike jdike 492504064 Aug 6 21:16 cow.debian - -rwxrw-rw- 1 jdike jdike 537919488 Aug 6 20:42 debian2.2 - - - - - Doesn't look like much saved space, does it? 
Well, here's 'ls -ls': - - - host% ls -ls cow.debian debian2.2 - 880 -rw-r--r-- 1 jdike jdike 492504064 Aug 6 21:16 cow.debian - 525832 -rwxrw-rw- 1 jdike jdike 537919488 Aug 6 20:42 debian2.2 - - - - - Now, you can see that the COW file has less than a meg of disk, rather - than 492 meg. - - - - 7.4. Another warning - - Once a filesystem is being used as a readonly backing file for a COW - file, do not boot directly from it or modify it in any way. Doing so - will invalidate any COW files that are using it. The mtime and size - of the backing file are stored in the COW file header at its creation, - and they must continue to match. If they don't, the driver will - refuse to use the COW file. - - - - - If you attempt to evade this restriction by changing either the - backing file or the COW header by hand, you will get a corrupted - filesystem. - - - - - Among other things, this means that upgrading the distribution in a - backing file and expecting that all of the COW files using it will see - the upgrade will not work. - - - - - 7.5. uml_moo : Merging a COW file with its backing file - - Depending on how you use UML and COW devices, it may be advisable to - merge the changes in the COW file into the backing file every once in - a while. - - - - - The utility that does this is uml_moo. Its usage is - - - host% uml_moo COW file new backing file - - - - - There's no need to specify the backing file since that information is - already in the COW file header. If you're paranoid, boot the new - merged file, and if you're happy with it, move it over the old backing - file. - - - - - uml_moo creates a new backing file by default as a safety measure. It - also has a destructive merge option which will merge the COW file - directly into its current backing file. This is really only usable - when the backing file only has one COW file associated with it. 
If - there are multiple COWs associated with a backing file, a -d merge of - one of them will invalidate all of the others. However, it is - convenient if you're short of disk space, and it should also be - noticeably faster than a non-destructive merge. - - - - - uml_moo is installed with the UML deb and RPM. If you didn't install - UML from one of those packages, you can also get it from the UML - utilities tar file in tools/moo. - - - - - - - - - 8. Creating filesystems - - - You may want to create and mount new UML filesystems, either because - your root filesystem isn't large enough or because you want to use a - filesystem other than ext2. - - - This was written on the occasion of reiserfs being included in the - 2.4.1 kernel pool, and therefore the 2.4.1 UML, so the examples will - talk about reiserfs. This information is generic, and the examples - should be easy to translate to the filesystem of your choice. - - - 8.1. Create the filesystem file - - dd is your friend. All you need to do is tell dd to create an empty - file of the appropriate size. I usually make it sparse to save time - and to avoid allocating disk space until it's actually used. For - example, the following command will create a sparse 100 meg file full - of zeroes. - - - host% - dd if=/dev/zero of=new_filesystem seek=100 count=1 bs=1M - - - - - - - 8.2. Assign the file to a UML device - - Add an argument like the following to the UML command line: - - ubd4=new_filesystem - - - - - making sure that you use an unassigned ubd device number. - - - - 8.3. Creating and mounting the filesystem - - Make sure that the filesystem is available, either by being built into - the kernel, or available as a module, then boot up UML and log in. If - the root filesystem doesn't have the filesystem utilities (mkfs, fsck, - etc), then get them into UML by way of the net or hostfs. 
- - - Make the new filesystem on the device assigned to the new file: - - - host# mkreiserfs /dev/ubd/4 - - - <----------- MKREISERFSv2 -----------> - - ReiserFS version 3.6.25 - Block size 4096 bytes - Block count 25856 - Used blocks 8212 - Journal - 8192 blocks (18-8209), journal header is in block 8210 - Bitmaps: 17 - Root block 8211 - Hash function "r5" - ATTENTION: ALL DATA WILL BE LOST ON '/dev/ubd/4'! (y/n)y - journal size 8192 (from 18) - Initializing journal - 0%....20%....40%....60%....80%....100% - Syncing..done. - - - - - Now, mount it: - - - UML# - mount /dev/ubd/4 /mnt - - - - - and you're in business. - - - - - - - - - - 9. Host file access - - - If you want to access files on the host machine from inside UML, you - can treat it as a separate machine and either nfs mount directories - from the host or copy files into the virtual machine with scp or rcp. - However, since UML is running on the host, it can access those - files just like any other process and make them available inside the - virtual machine without needing to use the network. - - - This is now possible with the hostfs virtual filesystem. With it, you - can mount a host directory into the UML filesystem and access the - files contained in it just as you would on the host. - - - 9.1. Using hostfs - - To begin with, make sure that hostfs is available inside the virtual - machine with - - - UML# cat /proc/filesystems - - - - . hostfs should be listed. If it's not, either rebuild the kernel - with hostfs configured into it or make sure that hostfs is built as a - module and available inside the virtual machine, and insmod it. - - - Now all you need to do is run mount: - - - UML# mount none /mnt/host -t hostfs - - - - - will mount the host's / on the virtual machine's /mnt/host. 
- - - If you don't want to mount the host root directory, then you can - specify a subdirectory to mount with the -o switch to mount: - - - UML# mount none /mnt/home -t hostfs -o /home - - - - - will mount the host's /home on the virtual machine's /mnt/home. - - - - 9.2. hostfs as the root filesystem - - It's possible to boot from a directory hierarchy on the host using - hostfs rather than using the standard filesystem in a file. - - To start, you need that hierarchy. The easiest way is to loop mount - an existing root_fs file: - - - host# mount root_fs uml_root_dir -o loop - - - - - You need to change the filesystem type of / in etc/fstab to be - 'hostfs', so that line looks like this: - - /dev/ubd/0 / hostfs defaults 1 1 - - - - - Then you need to chown to yourself all the files in that directory - that are owned by root. This worked for me: - - - host# find . -uid 0 -exec chown jdike {} \; - - - - - Next, make sure that your UML kernel has hostfs compiled in, not as a - module. Then run UML with the boot device pointing at that directory: - - - ubd0=/path/to/uml/root/directory - - - - - UML should then boot as it does normally. - - - 9.3. Building hostfs - - If you need to build hostfs because it's not in your kernel, you have - two choices: - - - - o Compiling hostfs into the kernel: - - - Reconfigure the kernel and set the 'Host filesystem' option under - - - o Compiling hostfs as a module: - - - Reconfigure the kernel and set the 'Host filesystem' option under - be in arch/um/fs/hostfs/hostfs.o. Install that in - /lib/modules/`uname -r`/fs in the virtual machine, boot it up, and - - - UML# insmod hostfs - - - - - - - - - - - - - 10. The Management Console - - - - The UML management console is a low-level interface to the kernel, - somewhat like the i386 SysRq interface. Since there is a full-blown - operating system under UML, there is much greater flexibility possible - than with the SysRq mechanism.
- - - There are a number of things you can do with the mconsole interface: - - o get the kernel version - - o add and remove devices - - o halt or reboot the machine - - o Send SysRq commands - - o Pause and resume the UML - - - You need the mconsole client (uml_mconsole) which is present in CVS - (/tools/mconsole) in 2.4.5-9um and later, and will be in the RPM in - 2.4.6. - - - You also need CONFIG_MCONSOLE (under 'General Setup') enabled in UML. - When you boot UML, you'll see a line like: - - - mconsole initialized on /home/jdike/.uml/umlNJ32yL/mconsole - - - - - If you specify a unique machine id on the UML command line, i.e. - - - umid=debian - - - - - you'll see this - - - mconsole initialized on /home/jdike/.uml/debian/mconsole - - - - - That file is the socket that uml_mconsole will use to communicate with - UML. Run it with either the umid or the full path as its argument: - - - host% uml_mconsole debian - - - - - or - - - host% uml_mconsole /home/jdike/.uml/debian/mconsole - - - - - You'll get a prompt, at which you can run one of these commands: - - o version - - o halt - - o reboot - - o config - - o remove - - o sysrq - - o help - - o cad - - o stop - - o go - - - 10.1. version - - This takes no arguments. It prints the UML version. - - - (mconsole) version - OK Linux usermode 2.4.5-9um #1 Wed Jun 20 22:47:08 EDT 2001 i686 - - - - - There are a couple of actual uses for this. It's a simple no-op which - can be used to check that a UML is running. It's also a way of - sending an interrupt to the UML. This is sometimes useful on SMP - hosts, where there's a bug which causes signals to UML to be lost, - often causing it to appear to hang. Sending such a UML the mconsole - version command is a good way to 'wake it up' before networking has - been enabled, as it does not do anything to the function of the UML. - - - - 10.2. halt and reboot - - These take no arguments.
They shut the machine down immediately, with - no syncing of disks and no clean shutdown of userspace. So, they are - pretty close to crashing the machine. - - - (mconsole) halt - OK - - - - - - - 10.3. config - - "config" adds a new device to the virtual machine. Currently the ubd - and network drivers support this. It takes one argument, which is the - device to add, with the same syntax as the kernel command line. - - - - - (mconsole) - config ubd3=/home/jdike/incoming/roots/root_fs_debian22 - - OK - (mconsole) config eth1=mcast - OK - - - - - - - 10.4. remove - - "remove" deletes a device from the system. Its argument is just the - name of the device to be removed. The device must be idle in whatever - sense the driver considers necessary. In the case of the ubd driver, - the removed block device must not be mounted, swapped on, or otherwise - open, and in the case of the network driver, the device must be down. - - - (mconsole) remove ubd3 - OK - (mconsole) remove eth1 - OK - - - - - - - 10.5. sysrq - - This takes one argument, which is a single letter. It calls the - generic kernel's SysRq driver, which does whatever is called for by - that argument. See the SysRq documentation in - Documentation/admin-guide/sysrq.rst in your favorite kernel tree to - see what letters are valid and what they do. - - - - 10.6. help - - "help" returns a string listing the valid commands and what each one - does. - - - - 10.7. cad - - This invokes the Ctl-Alt-Del action on init. What exactly this ends - up doing is up to /etc/inittab. Normally, it reboots the machine. - With UML, this is usually not desired, so if a halt would be better, - then find the section of inittab that looks like this - - - # What to do when CTRL-ALT-DEL is pressed. - ca:12345:ctrlaltdel:/sbin/shutdown -t1 -a -r now - - - - - and change the command to halt. - - - - 10.8. stop - - This puts the UML in a loop reading mconsole requests until a 'go' - mconsole command is received. 
This is very useful for making backups - of UML filesystems, as the UML can be stopped, then synced via 'sysrq - s', so that everything is written to the filesystem. You can then copy - the filesystem and then send the UML 'go' via mconsole. - - - Note that a UML running with more than one CPU will have problems - after you send the 'stop' command, as only one CPU will be held in a - mconsole loop and all others will continue as normal. This is a bug, - and will be fixed. - - - - 10.9. go - - This resumes a UML after being paused by a 'stop' command. Note that - when the UML has resumed, TCP connections may have timed out and if - the UML is paused for a long period of time, crond might go a little - crazy, running all the jobs it didn't do earlier. - - - - - - - - - 11. Kernel debugging - - - Note: The interface that makes debugging, as described here, possible - is present in 2.4.0-test6 kernels and later. - - - Since the user-mode kernel runs as a normal Linux process, it is - possible to debug it with gdb almost like any other process. It is - slightly different because the kernel's threads are already being - ptraced for system call interception, so gdb can't ptrace them. - However, a mechanism has been added to work around that problem. - - - In order to debug the kernel, you need to build it from source. See - ``Compiling the kernel and modules'' for information on doing that. - Make sure that you enable CONFIG_DEBUGSYM and CONFIG_PT_PROXY during - the config. These will compile the kernel with -g, and enable the - ptrace proxy so that gdb works with UML, respectively. - - - - - 11.1. Starting the kernel under gdb - - You can have the kernel running under the control of gdb from the - beginning by putting 'debug' on the command line. You will get an - xterm with gdb running inside it. The kernel will send some commands - to gdb which will leave it stopped at the beginning of start_kernel.
- At this point, you can get things going with 'next', 'step', or - 'cont'. - - - There is a transcript of a debugging session here , with breakpoints being set in the scheduler and in an - interrupt handler. - 11.2. Examining sleeping processes - - Not every bug is evident in the currently running process. Sometimes, - processes hang in the kernel when they shouldn't because they've - deadlocked on a semaphore or something similar. In this case, when - you ^C gdb and get a backtrace, you will see the idle thread, which - isn't very relevant. - - - What you want is the stack of whatever process is sleeping when it - shouldn't be. You need to figure out which process that is, which is - generally fairly easy. Then you need to get its host process id, - which you can do either by looking at ps on the host or at - task.thread.extern_pid in gdb. - - - Now what you do is this: - - o detach from the current thread - - - (UML gdb) det - - - - - - o attach to the thread you are interested in - - - (UML gdb) att - - - - - - o look at its stack and anything else of interest - - - (UML gdb) bt - - - - - Note that you can't do anything at this point that requires that a - process execute, e.g. calling a function - - o when you're done looking at that process, reattach to the current - thread and continue it - - - (UML gdb) - att 1 - - - - - - - (UML gdb) - c - - - - - Here, specifying any pid which is not the process id of a UML thread - will cause gdb to reattach to the current thread. I commonly use 1, - but any other invalid pid would work. - - - - 11.3. Running ddd on UML - - ddd works on UML, but requires a special kludge. The process goes - like this: - - o Start ddd - - - host% ddd linux - - - - - - o With ps, get the pid of the gdb that ddd started. You can ask the - gdb to tell you, but for some reason that confuses things and - causes a hang. 
- - o run UML with 'debug=parent gdb-pid=<pid>' added to the command line - - it will just sit there after you hit return - - o type 'att 1' to the ddd gdb and you will see something like - - - 0xa013dc51 in __kill () - - - (gdb) - - - - - - o At this point, type 'c', UML will boot up, and you can use ddd just - as you do on any other process. - - - - 11.4. Debugging modules - - gdb has support for debugging code which is dynamically loaded into - the process. This support is what is needed to debug kernel modules - under UML. - - - Using that support is somewhat complicated. You have to tell gdb what - object file you just loaded into UML and where in memory it is. Then, - it can read the symbol table, and figure out where all the symbols are - from the load address that you provided. It gets more interesting - when you load the module again (i.e. after an rmmod). You have to - tell gdb to forget about all its symbols, including the main UML ones - for some reason, then load them all back in again. - - - There's an easy way and a hard way to do this. The easy way is to use - the umlgdb expect script written by Chandan Kudige. It basically - automates the process for you. - - - First, you must tell it where your modules are. There is a list in - the script that looks like this: - set MODULE_PATHS { - "fat" "/usr/src/uml/linux-2.4.18/fs/fat/fat.o" - "isofs" "/usr/src/uml/linux-2.4.18/fs/isofs/isofs.o" - "minix" "/usr/src/uml/linux-2.4.18/fs/minix/minix.o" - } - - - - - You change that to list the names and paths of the modules that you - are going to debug. Then you run it from the toplevel directory of - your UML pool and it basically tells you what to do: - - - - - ******** GDB pid is 21903 ******** - Start UML as: ./linux debug gdb-pid=21903 - - - - GNU gdb 5.0rh-5 Red Hat Linux 7.1 - Copyright 2001 Free Software Foundation, Inc.
- GDB is free software, covered by the GNU General Public License, and you are - welcome to change it and/or distribute copies of it under certain conditions. - Type "show copying" to see the conditions. - There is absolutely no warranty for GDB. Type "show warranty" for details. - This GDB was configured as "i386-redhat-linux"... - (gdb) b sys_init_module - Breakpoint 1 at 0xa0011923: file module.c, line 349. - (gdb) att 1 - - - - - After you run UML and it sits there doing nothing, you hit return at - the 'att 1' and continue it: - - - Attaching to program: /home/jdike/linux/2.4/um/./linux, process 1 - 0xa00f4221 in __kill () - (UML gdb) c - Continuing. - - - - - At this point, you debug normally. When you insmod something, the - expect magic will kick in and you'll see something like: - - - - - - - - - - - - - - - - - - *** Module hostfs loaded *** - Breakpoint 1, sys_init_module (name_user=0x805abb0 "hostfs", - mod_user=0x8070e00) at module.c:349 - 349 char *name, *n_name, *name_tmp = NULL; - (UML gdb) finish - Run till exit from #0 sys_init_module (name_user=0x805abb0 "hostfs", - mod_user=0x8070e00) at module.c:349 - 0xa00e2e23 in execute_syscall (r=0xa8140284) at syscall_kern.c:411 - 411 else res = EXECUTE_SYSCALL(syscall, regs); - Value returned is $1 = 0 - (UML gdb) - p/x (int)module_list + module_list->size_of_struct - - $2 = 0xa9021054 - (UML gdb) symbol-file ./linux - Load new symbol table from "./linux"? (y or n) y - Reading symbols from ./linux... - done. - (UML gdb) - add-symbol-file /home/jdike/linux/2.4/um/arch/um/fs/hostfs/hostfs.o 0xa9021054 - - add symbol table from file "/home/jdike/linux/2.4/um/arch/um/fs/hostfs/hostfs.o" at - .text_addr = 0xa9021054 - (y or n) y - - Reading symbols from /home/jdike/linux/2.4/um/arch/um/fs/hostfs/hostfs.o... - done. 
- (UML gdb) p *module_list - $1 = {size_of_struct = 84, next = 0xa0178720, name = 0xa9022de0 "hostfs", - size = 9016, uc = {usecount = {counter = 0}, pad = 0}, flags = 1, - nsyms = 57, ndeps = 0, syms = 0xa9023170, deps = 0x0, refs = 0x0, - init = 0xa90221f0 <init_hostfs>, cleanup = 0xa902222c <exit_hostfs>, - ex_table_start = 0x0, ex_table_end = 0x0, persist_start = 0x0, - persist_end = 0x0, can_unload = 0, runsize = 0, kallsyms_start = 0x0, - kallsyms_end = 0x0, - archdata_start = 0x1b855 <Address 0x1b855 out of bounds>
, - archdata_end = 0xe5890000 <Address 0xe5890000 out of bounds>
, - kernel_data = 0xf689c35d <Address 0xf689c35d out of bounds>
} - >> Finished loading symbols for hostfs ... - - - - - That's the easy way. It's highly recommended. The hard way is - described below in case you're interested in what's going on. - - - Boot the kernel under the debugger and load the module with insmod or - modprobe. With gdb, do: - - - (UML gdb) p module_list - - - - - This is a list of modules that have been loaded into the kernel, with - the most recently loaded module first. Normally, the module you want - is at module_list. If it's not, walk down the next links, looking at - the name fields until you find the module you want to debug. Take the - address of that structure, and add module.size_of_struct (which in - 2.4.10 kernels is 96 (0x60)) to it. Gdb can make this hard addition - for you :-): - - - - (UML gdb) - printf "%#x\n", (int)module_list + module_list->size_of_struct - - - - - The offset from the module start occasionally changes (before 2.4.0, - it was module.size_of_struct + 4), so it's a good idea to check the - init and cleanup addresses once in a while, as described below. Now - do: - - - (UML gdb) - add-symbol-file /path/to/module/on/host that_address - - - - - Tell gdb you really want to do it, and you're in business. - - - If there's any doubt that you got the offset right, like breakpoints - appear not to work, or they're appearing in the wrong place, you can - check it by looking at the module structure. The init and cleanup - fields should look like: - - - init = 0x588066b0 <init_hostfs>, cleanup = 0x588066c0 <exit_hostfs> - - - - - with no offsets on the symbol names. If the names are right, but they - are offset, then the offset tells you how much you need to add to the - address you gave to add-symbol-file. - - - When you want to load in a new version of the module, you need to get - gdb to forget about the old one.
The only way I've found to do that - is to tell gdb to forget about all symbols that it knows about: - - - (UML gdb) symbol-file - - - - - Then reload the symbols from the kernel binary: - - - (UML gdb) symbol-file /path/to/kernel - - - - - and repeat the process above. You'll also need to re-enable break- - points. They were disabled when you dumped all the symbols because - gdb couldn't figure out where they should go. - - - - 11.5. Attaching gdb to the kernel - - If you don't have the kernel running under gdb, you can attach gdb to - it later by sending the tracing thread a SIGUSR1. The first line of - the console output identifies its pid: - tracing thread pid = 20093 - - - - - When you send it the signal: - - - host% kill -USR1 20093 - - - - - you will get an xterm with gdb running in it. - - - If you have the mconsole compiled into UML, then the mconsole client - can be used to start gdb: - - - (mconsole) config gdb=xterm - - - - - will fire up an xterm with gdb running in it. - - - - 11.6. Using alternate debuggers - - UML has support for attaching to an already running debugger rather - than starting gdb itself. This is present in CVS as of 17 Apr 2001. - I sent it to Alan for inclusion in the ac tree, and it will be in my - 2.4.4 release. - - - This is useful when gdb is a subprocess of some UI, such as emacs or - ddd. It can also be used to run debuggers other than gdb on UML. - Below is an example of using strace as an alternate debugger. - - - To do this, you need to get the pid of the debugger and pass it in - with the 'gdb-pid=<pid>' switch along with the 'debug' switch. - - - If you are using gdb under some UI, then tell it to 'att 1', and - you'll find yourself attached to UML. - - - If you are using something other than gdb as your debugger, then - you'll need to get it to do the equivalent of 'att 1' if it doesn't do - it automatically. - - - An example of an alternate debugger is strace.
You can strace the - actual kernel as follows: - - o Run the following in a shell - - - host% - sh -c 'echo pid=$$; echo -n hit return; read x; exec strace -p 1 -o strace.out' - - - - o Run UML with 'debug' and 'gdb-pid=' with the pid printed out - by the previous command - - o Hit return in the shell, and UML will start running, and strace - output will start accumulating in the output file. - - Note that this is different from running - - - host% strace ./linux - - - - - That will strace only the main UML thread, the tracing thread, which - doesn't do any of the actual kernel work. It just oversees the vir- - tual machine. In contrast, using strace as described above will show - you the low-level activity of the virtual machine. - - - - - - 12. Kernel debugging examples - - 12.1. The case of the hung fsck - - When booting up the kernel, fsck failed, and dropped me into a shell - to fix things up. I ran fsck -y, which hung: - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - Setting hostname uml [ OK ] - Checking root filesystem - /dev/fhd0 was not cleanly unmounted, check forced. - Error reading block 86894 (Attempt to read block from filesystem resulted in short read) while reading indirect blocks of inode 19780. - - /dev/fhd0: UNEXPECTED INCONSISTENCY; RUN fsck MANUALLY. - (i.e., without -a or -p options) - [ FAILED ] - - *** An error occurred during the file system check. - *** Dropping you to a shell; the system will reboot - *** when you leave the shell. - Give root password for maintenance - (or type Control-D for normal startup): - - [root@uml /root]# fsck -y /dev/fhd0 - fsck -y /dev/fhd0 - Parallelizing fsck version 1.14 (9-Jan-1999) - e2fsck 1.14, 9-Jan-1999 for EXT2 FS 0.5b, 95/08/09 - /dev/fhd0 contains a file system with errors, check forced. - Pass 1: Checking inodes, blocks, and sizes - Error reading block 86894 (Attempt to read block from filesystem resulted in short read) while reading indirect blocks of inode 19780. 
Ignore error? yes - - Inode 19780, i_blocks is 1548, should be 540. Fix? yes - - Pass 2: Checking directory structure - Error reading block 49405 (Attempt to read block from filesystem resulted in short read). Ignore error? yes - - Directory inode 11858, block 0, offset 0: directory corrupted - Salvage? yes - - Missing '.' in directory inode 11858. - Fix? yes - - Missing '..' in directory inode 11858. - Fix? yes - - - - - - The standard drill in this sort of situation is to fire up gdb on the - signal thread, which, in this case, was pid 1935. In another window, - I run gdb and attach pid 1935. - - - - - ~/linux/2.3.26/um 1016: gdb linux - GNU gdb 4.17.0.11 with Linux support - Copyright 1998 Free Software Foundation, Inc. - GDB is free software, covered by the GNU General Public License, and you are - welcome to change it and/or distribute copies of it under certain conditions. - Type "show copying" to see the conditions. - There is absolutely no warranty for GDB. Type "show warranty" for details. - This GDB was configured as "i386-redhat-linux"... - - (gdb) att 1935 - Attaching to program `/home/dike/linux/2.3.26/um/linux', Pid 1935 - 0x100756d9 in __wait4 () - - - - - - - Let's see what's currently running: - - - - (gdb) p current_task.pid - $1 = 0 - - - - - - It's the idle thread, which means that fsck went to sleep for some - reason and never woke up. 
- - - Let's guess that the last process in the process list is fsck: - - - - (gdb) p current_task.prev_task.comm - $13 = "fsck.ext2\000\000\000\000\000\000" - - - - - - It is, so let's see what it thinks it's up to: - - - - (gdb) p current_task.prev_task.thread - $14 = {extern_pid = 1980, tracing = 0, want_tracing = 0, forking = 0, - kernel_stack_page = 0, signal_stack = 1342627840, syscall = {id = 4, args = { - 3, 134973440, 1024, 0, 1024}, have_result = 0, result = 50590720}, - request = {op = 2, u = {exec = {ip = 1350467584, sp = 2952789424}, fork = { - regs = {1350467584, 2952789424, 0 }, sigstack = 0, - pid = 0}, switch_to = 0x507e8000, thread = {proc = 0x507e8000, - arg = 0xaffffdb0, flags = 0, new_pid = 0}, input_request = { - op = 1350467584, fd = -1342177872, proc = 0, pid = 0}}}} - - - - - - The interesting things here are the fact that its .thread.syscall.id - is __NR_write (see the big switch in arch/um/kernel/syscall_kern.c or - the defines in include/asm-um/arch/unistd.h), and that it never - returned. Also, its .request.op is OP_SWITCH (see - arch/um/include/user_util.h). These mean that it went into a write, - and, for some reason, called schedule(). - - - The fact that it never returned from write means that its stack should - be fairly interesting. Its pid is 1980 (.thread.extern_pid). That - process is being ptraced by the signal thread, so it must be detached - before gdb can attach it: - - - - - - - - - - - (gdb) call detach(1980) - - Program received signal SIGSEGV, Segmentation fault. - - The program being debugged stopped while in a function called from GDB. - When the function (detach) is done executing, GDB will silently - stop (instead of continuing to evaluate the expression containing - the function call). - (gdb) call detach(1980) - $15 = 0 - - - - - - The first detach segfaults for some reason, and the second one - succeeds. 
- - - Now I detach from the signal thread, attach to the fsck thread, and - look at its stack: - - - (gdb) det - Detaching from program: /home/dike/linux/2.3.26/um/linux Pid 1935 - (gdb) att 1980 - Attaching to program `/home/dike/linux/2.3.26/um/linux', Pid 1980 - 0x10070451 in __kill () - (gdb) bt - #0 0x10070451 in __kill () - #1 0x10068ccd in usr1_pid (pid=1980) at process.c:30 - #2 0x1006a03f in _switch_to (prev=0x50072000, next=0x507e8000) - at process_kern.c:156 - #3 0x1006a052 in switch_to (prev=0x50072000, next=0x507e8000, last=0x50072000) - at process_kern.c:161 - #4 0x10001d12 in schedule () at core.c:777 - #5 0x1006a744 in __down (sem=0x507d241c) at semaphore.c:71 - #6 0x1006aa10 in __down_failed () at semaphore.c:157 - #7 0x1006c5d8 in segv_handler (sc=0x5006e940) at trap_user.c:174 - #8 0x1006c5ec in kern_segv_handler (sig=11) at trap_user.c:182 - #9 <signal handler called> - #10 0x10155404 in errno () - #11 0x1006c0aa in segv (address=1342179328, is_write=2) at trap_kern.c:50 - #12 0x1006c5d8 in segv_handler (sc=0x5006eaf8) at trap_user.c:174 - #13 0x1006c5ec in kern_segv_handler (sig=11) at trap_user.c:182 - #14 <signal handler called> - #15 0xc0fd in ?? () - #16 0x10016647 in sys_write (fd=3, - buf=0x80b8800 <Address 0x80b8800 out of bounds>
, count=1024) - at read_write.c:159 - #17 0x1006d5b3 in execute_syscall (syscall=4, args=0x5006ef08) - at syscall_kern.c:254 - #18 0x1006af87 in really_do_syscall (sig=12) at syscall_user.c:35 - #19 <signal handler called> - #20 0x400dc8b0 in ?? () - - - - - - The interesting things here are: - - o There are two segfaults on this stack (frames 9 and 14) - - o The first faulting address (frame 11) is 0x50000800 - - (gdb) p (void *)1342179328 - $16 = (void *) 0x50000800 - - - - - - The initial faulting address is interesting because it is on the idle - thread's stack. I had been seeing the idle thread segfault for no - apparent reason, and the cause looked like stack corruption. In hopes - of catching the culprit in the act, I had turned off all protections - to that stack while the idle thread wasn't running. This apparently - tripped that trap. - - - However, the more immediate problem is that second segfault and I'm - going to concentrate on that. First, I want to see where the fault - happened, so I have to go look at the sigcontext struct in frame 8: - - - - (gdb) up - #1 0x10068ccd in usr1_pid (pid=1980) at process.c:30 - 30 kill(pid, SIGUSR1); - (gdb) - #2 0x1006a03f in _switch_to (prev=0x50072000, next=0x507e8000) - at process_kern.c:156 - 156 usr1_pid(getpid()); - (gdb) - #3 0x1006a052 in switch_to (prev=0x50072000, next=0x507e8000, last=0x50072000) - at process_kern.c:161 - 161 _switch_to(prev, next); - (gdb) - #4 0x10001d12 in schedule () at core.c:777 - 777 switch_to(prev, next, prev); - (gdb) - #5 0x1006a744 in __down (sem=0x507d241c) at semaphore.c:71 - 71 schedule(); - (gdb) - #6 0x1006aa10 in __down_failed () at semaphore.c:157 - 157 } - (gdb) - #7 0x1006c5d8 in segv_handler (sc=0x5006e940) at trap_user.c:174 - 174 segv(sc->cr2, sc->err & 2); - (gdb) - #8 0x1006c5ec in kern_segv_handler (sig=11) at trap_user.c:182 - 182 segv_handler(sc); - (gdb) p *sc - Cannot access memory at address 0x0.
- - - - - That's not very useful, so I'll try a more manual method: - - - (gdb) p *((struct sigcontext *) (&sig + 1)) - $19 = {gs = 0, __gsh = 0, fs = 0, __fsh = 0, es = 43, __esh = 0, ds = 43, - __dsh = 0, edi = 1342179328, esi = 1350378548, ebp = 1342630440, - esp = 1342630420, ebx = 1348150624, edx = 1280, ecx = 0, eax = 0, - trapno = 14, err = 4, eip = 268480945, cs = 35, __csh = 0, eflags = 66118, - esp_at_signal = 1342630420, ss = 43, __ssh = 0, fpstate = 0x0, oldmask = 0, - cr2 = 1280} - - - - The ip is in handle_mm_fault: - - - (gdb) p (void *)268480945 - $20 = (void *) 0x1000b1b1 - (gdb) i sym $20 - handle_mm_fault + 57 in section .text - - - - - - Specifically, it's in pte_alloc: - - - (gdb) i line *$20 - Line 124 of "/home/dike/linux/2.3.26/um/include/asm/pgalloc.h" - starts at address 0x1000b1b1 - and ends at 0x1000b1b7 . - - - - - - To find where in handle_mm_fault this is, I'll jump forward in the - code until I see an address in that procedure: - - - - (gdb) i line *0x1000b1c0 - Line 126 of "/home/dike/linux/2.3.26/um/include/asm/pgalloc.h" - starts at address 0x1000b1b7 - and ends at 0x1000b1c3 . - (gdb) i line *0x1000b1d0 - Line 131 of "/home/dike/linux/2.3.26/um/include/asm/pgalloc.h" - starts at address 0x1000b1d0 - and ends at 0x1000b1da . - (gdb) i line *0x1000b1e0 - Line 61 of "/home/dike/linux/2.3.26/um/include/asm/pgalloc.h" - starts at address 0x1000b1da - and ends at 0x1000b1e1 . - (gdb) i line *0x1000b1f0 - Line 134 of "/home/dike/linux/2.3.26/um/include/asm/pgalloc.h" - starts at address 0x1000b1f0 - and ends at 0x1000b200 . - (gdb) i line *0x1000b200 - Line 135 of "/home/dike/linux/2.3.26/um/include/asm/pgalloc.h" - starts at address 0x1000b200 - and ends at 0x1000b208 . - (gdb) i line *0x1000b210 - Line 139 of "/home/dike/linux/2.3.26/um/include/asm/pgalloc.h" - starts at address 0x1000b210 - and ends at 0x1000b219 . - (gdb) i line *0x1000b220 - Line 1168 of "memory.c" starts at address 0x1000b21e - and ends at 0x1000b222 . 
- - - - - - Something is apparently wrong with the page tables or vma_structs, so - let's go back to frame 11 and have a look at them: - - - - #11 0x1006c0aa in segv (address=1342179328, is_write=2) at trap_kern.c:50 - 50 handle_mm_fault(current, vma, address, is_write); - (gdb) call pgd_offset_proc(vma->vm_mm, address) - $22 = (pgd_t *) 0x80a548c - - - - - - That's pretty bogus. Page tables aren't supposed to be in process - text or data areas. Let's see what's in the vma: - - - (gdb) p *vma - $23 = {vm_mm = 0x507d2434, vm_start = 0, vm_end = 134512640, - vm_next = 0x80a4f8c, vm_page_prot = {pgprot = 0}, vm_flags = 31200, - vm_avl_height = 2058, vm_avl_left = 0x80a8c94, vm_avl_right = 0x80d1000, - vm_next_share = 0xaffffdb0, vm_pprev_share = 0xaffffe63, - vm_ops = 0xaffffe7a, vm_pgoff = 2952789626, vm_file = 0xafffffec, - vm_private_data = 0x62} - (gdb) p *vma.vm_mm - $24 = {mmap = 0x507d2434, mmap_avl = 0x0, mmap_cache = 0x8048000, - pgd = 0x80a4f8c, mm_users = {counter = 0}, mm_count = {counter = 134904288}, - map_count = 134909076, mmap_sem = {count = {counter = 135073792}, - sleepers = -1342177872, wait = {lock = <optimized out or zero length>, - task_list = {next = 0xaffffe63, prev = 0xaffffe7a}, - __magic = -1342177670, __creator = -1342177300}, __magic = 98}, - page_table_lock = {}, context = 138, start_code = 0, end_code = 0, - start_data = 0, end_data = 0, start_brk = 0, brk = 0, start_stack = 0, - arg_start = 0, arg_end = 0, env_start = 0, env_end = 0, rss = 1350381536, - total_vm = 0, locked_vm = 0, def_flags = 0, cpu_vm_mask = 0, swap_cnt = 0, - swap_address = 0, segments = 0x0} - - - - - - This is also pretty bogus. With all of the 0x80xxxxx and 0xaffffxxx - addresses, this is looking like a stack was plonked down on top of - these structures.
Maybe it's a stack overflow from the next page: - - - - (gdb) p vma - $25 = (struct vm_area_struct *) 0x507d2434 - - - - - - That's towards the lower quarter of the page, so that would have to - have been pretty heavy stack overflow: - - - - - - - - - - - - - - - (gdb) x/100x $25 - 0x507d2434: 0x507d2434 0x00000000 0x08048000 0x080a4f8c - 0x507d2444: 0x00000000 0x080a79e0 0x080a8c94 0x080d1000 - 0x507d2454: 0xaffffdb0 0xaffffe63 0xaffffe7a 0xaffffe7a - 0x507d2464: 0xafffffec 0x00000062 0x0000008a 0x00000000 - 0x507d2474: 0x00000000 0x00000000 0x00000000 0x00000000 - 0x507d2484: 0x00000000 0x00000000 0x00000000 0x00000000 - 0x507d2494: 0x00000000 0x00000000 0x507d2fe0 0x00000000 - 0x507d24a4: 0x00000000 0x00000000 0x00000000 0x00000000 - 0x507d24b4: 0x00000000 0x00000000 0x00000000 0x00000000 - 0x507d24c4: 0x00000000 0x00000000 0x00000000 0x00000000 - 0x507d24d4: 0x00000000 0x00000000 0x00000000 0x00000000 - 0x507d24e4: 0x00000000 0x00000000 0x00000000 0x00000000 - 0x507d24f4: 0x00000000 0x00000000 0x00000000 0x00000000 - 0x507d2504: 0x00000000 0x00000000 0x00000000 0x00000000 - 0x507d2514: 0x00000000 0x00000000 0x00000000 0x00000000 - 0x507d2524: 0x00000000 0x00000000 0x00000000 0x00000000 - 0x507d2534: 0x00000000 0x00000000 0x507d25dc 0x00000000 - 0x507d2544: 0x00000000 0x00000000 0x00000000 0x00000000 - 0x507d2554: 0x00000000 0x00000000 0x00000000 0x00000000 - 0x507d2564: 0x00000000 0x00000000 0x00000000 0x00000000 - 0x507d2574: 0x00000000 0x00000000 0x00000000 0x00000000 - 0x507d2584: 0x00000000 0x00000000 0x00000000 0x00000000 - 0x507d2594: 0x00000000 0x00000000 0x00000000 0x00000000 - 0x507d25a4: 0x00000000 0x00000000 0x00000000 0x00000000 - 0x507d25b4: 0x00000000 0x00000000 0x00000000 0x00000000 - - - - - - It's not stack overflow. The only "stack-like" piece of this data is - the vma_struct itself. - - - At this point, I don't see any avenues to pursue, so I just have to - admit that I have no idea what's going on. 
What I will do, though, is - stick a trap on the segfault handler which will stop if it sees any - writes to the idle thread's stack. That was the thing that happened - first, and it may be that if I can catch it immediately, what's going - on will be somewhat clearer. - - - 12.2. Episode 2: The case of the hung fsck - - After setting a trap in the SEGV handler for accesses to the signal - thread's stack, I reran the kernel. - - - fsck hung again, this time by hitting the trap: - - - - - - - - - - - - - - - - - Setting hostname uml [ OK ] - Checking root filesystem - /dev/fhd0 contains a file system with errors, check forced. - Error reading block 86894 (Attempt to read block from filesystem resulted in short read) while reading indirect blocks of inode 19780. - - /dev/fhd0: UNEXPECTED INCONSISTENCY; RUN fsck MANUALLY. - (i.e., without -a or -p options) - [ FAILED ] - - *** An error occurred during the file system check. - *** Dropping you to a shell; the system will reboot - *** when you leave the shell. - Give root password for maintenance - (or type Control-D for normal startup): - - [root@uml /root]# fsck -y /dev/fhd0 - fsck -y /dev/fhd0 - Parallelizing fsck version 1.14 (9-Jan-1999) - e2fsck 1.14, 9-Jan-1999 for EXT2 FS 0.5b, 95/08/09 - /dev/fhd0 contains a file system with errors, check forced. - Pass 1: Checking inodes, blocks, and sizes - Error reading block 86894 (Attempt to read block from filesystem resulted in short read) while reading indirect blocks of inode 19780. Ignore error? yes - - Pass 2: Checking directory structure - Error reading block 49405 (Attempt to read block from filesystem resulted in short read). Ignore error? yes - - Directory inode 11858, block 0, offset 0: directory corrupted - Salvage? yes - - Missing '.' in directory inode 11858. - Fix? yes - - Missing '..' in directory inode 11858. - Fix? 
yes - - Untested (4127) [100fe44c]: trap_kern.c line 31 - - - - - - I need to get the signal thread to detach from pid 4127 so that I can - attach to it with gdb. This is done by sending it a SIGUSR1, which is - caught by the signal thread, which detaches the process: - - - kill -USR1 4127 - - - - - - Now I can run gdb on it: - - - - - - - - - - - - - - ~/linux/2.3.26/um 1034: gdb linux - GNU gdb 4.17.0.11 with Linux support - Copyright 1998 Free Software Foundation, Inc. - GDB is free software, covered by the GNU General Public License, and you are - welcome to change it and/or distribute copies of it under certain conditions. - Type "show copying" to see the conditions. - There is absolutely no warranty for GDB. Type "show warranty" for details. - This GDB was configured as "i386-redhat-linux"... - (gdb) att 4127 - Attaching to program `/home/dike/linux/2.3.26/um/linux', Pid 4127 - 0x10075891 in __libc_nanosleep () - - - - - - The backtrace shows that it was in a write and that the fault address - (address in frame 3) is 0x50000800, which is right in the middle of - the signal thread's stack page: - - - (gdb) bt - #0 0x10075891 in __libc_nanosleep () - #1 0x1007584d in __sleep (seconds=1000000) - at ../sysdeps/unix/sysv/linux/sleep.c:78 - #2 0x1006ce9a in stop () at user_util.c:191 - #3 0x1006bf88 in segv (address=1342179328, is_write=2) at trap_kern.c:31 - #4 0x1006c628 in segv_handler (sc=0x5006eaf8) at trap_user.c:174 - #5 0x1006c63c in kern_segv_handler (sig=11) at trap_user.c:182 - #6 - #7 0xc0fd in ?? () - #8 0x10016647 in sys_write (fd=3, buf=0x80b8800 "R.", count=1024) - at read_write.c:159 - #9 0x1006d603 in execute_syscall (syscall=4, args=0x5006ef08) - at syscall_kern.c:254 - #10 0x1006af87 in really_do_syscall (sig=12) at syscall_user.c:35 - #11 - #12 0x400dc8b0 in ?? () - #13 - #14 0x400dc8b0 in ?? () - #15 0x80545fd in ?? () - #16 0x804daae in ?? () - #17 0x8054334 in ?? () - #18 0x804d23e in ?? () - #19 0x8049632 in ?? () - #20 0x80491d2 in ?? 
() - #21 0x80596b5 in ?? () - (gdb) p (void *)1342179328 - $3 = (void *) 0x50000800 - - - - - - Going up the stack to the segv_handler frame and looking at where in - the code the access happened shows that it happened near line 110 of - block_dev.c: - - - - - - - - - - (gdb) up - #1 0x1007584d in __sleep (seconds=1000000) - at ../sysdeps/unix/sysv/linux/sleep.c:78 - ../sysdeps/unix/sysv/linux/sleep.c:78: No such file or directory. - (gdb) - #2 0x1006ce9a in stop () at user_util.c:191 - 191 while(1) sleep(1000000); - (gdb) - #3 0x1006bf88 in segv (address=1342179328, is_write=2) at trap_kern.c:31 - 31 KERN_UNTESTED(); - (gdb) - #4 0x1006c628 in segv_handler (sc=0x5006eaf8) at trap_user.c:174 - 174 segv(sc->cr2, sc->err & 2); - (gdb) p *sc - $1 = {gs = 0, __gsh = 0, fs = 0, __fsh = 0, es = 43, __esh = 0, ds = 43, - __dsh = 0, edi = 1342179328, esi = 134973440, ebp = 1342631484, - esp = 1342630864, ebx = 256, edx = 0, ecx = 256, eax = 1024, trapno = 14, - err = 6, eip = 268550834, cs = 35, __csh = 0, eflags = 66070, - esp_at_signal = 1342630864, ss = 43, __ssh = 0, fpstate = 0x0, oldmask = 0, - cr2 = 1342179328} - (gdb) p (void *)268550834 - $2 = (void *) 0x1001c2b2 - (gdb) i sym $2 - block_write + 1090 in section .text - (gdb) i line *$2 - Line 209 of "/home/dike/linux/2.3.26/um/include/asm/arch/string.h" - starts at address 0x1001c2a1 - and ends at 0x1001c2bf . - (gdb) i line *0x1001c2c0 - Line 110 of "block_dev.c" starts at address 0x1001c2bf - and ends at 0x1001c2e3 . - - - - - - Looking at the source shows that the fault happened during a call to - copy_from_user to copy the data into the kernel: - - - 107 count -= chars; - 108 copy_from_user(p,buf,chars); - 109 p += chars; - 110 buf += chars; - - - - - - p is the pointer which must contain 0x50000800, since buf contains - 0x80b8800 (frame 8 above). 
It is defined as: - - - p = offset + bh->b_data; - - - - - - I need to figure out what bh is, and it just so happens that bh is - passed as an argument to mark_buffer_uptodate and mark_buffer_dirty a - few lines later, so I do a little disassembly: - - - - - (gdb) disas 0x1001c2bf 0x1001c2e0 - Dump of assembler code from 0x1001c2bf to 0x1001c2d0: - 0x1001c2bf : addl %eax,0xc(%ebp) - 0x1001c2c2 : movl 0xfffffdd4(%ebp),%edx - 0x1001c2c8 : btsl $0x0,0x18(%edx) - 0x1001c2cd : btsl $0x1,0x18(%edx) - 0x1001c2d2 : sbbl %ecx,%ecx - 0x1001c2d4 : testl %ecx,%ecx - 0x1001c2d6 : jne 0x1001c2e3 - 0x1001c2d8 : pushl $0x0 - 0x1001c2da : pushl %edx - 0x1001c2db : call 0x1001819c <__mark_buffer_dirty> - End of assembler dump. - - - - - - At that point, bh is in %edx (address 0x1001c2da), which is calculated - at 0x1001c2c2 as %ebp + 0xfffffdd4, so I figure exactly what that is, - taking %ebp from the sigcontext_struct above: - - - (gdb) p (void *)1342631484 - $5 = (void *) 0x5006ee3c - (gdb) p 0x5006ee3c+0xfffffdd4 - $6 = 1342630928 - (gdb) p (void *)$6 - $7 = (void *) 0x5006ec10 - (gdb) p *((void **)$7) - $8 = (void *) 0x50100200 - - - - - - Now, I look at the structure to see what's in it, and particularly, - what its b_data field contains: - - - (gdb) p *((struct buffer_head *)0x50100200) - $13 = {b_next = 0x50289380, b_blocknr = 49405, b_size = 1024, b_list = 0, - b_dev = 15872, b_count = {counter = 1}, b_rdev = 15872, b_state = 24, - b_flushtime = 0, b_next_free = 0x501001a0, b_prev_free = 0x50100260, - b_this_page = 0x501001a0, b_reqnext = 0x0, b_pprev = 0x507fcf58, - b_data = 0x50000800 "", b_page = 0x50004000, - b_end_io = 0x10017f60 , b_dev_id = 0x0, - b_rsector = 98810, b_wait = {lock = , - task_list = {next = 0x50100248, prev = 0x50100248}, __magic = 1343226448, - __creator = 0}, b_kiobuf = 0x0} - - - - - - The b_data field is indeed 0x50000800, so the question becomes how - that happened. 
The rest of the structure looks fine, so this probably - is not a case of data corruption. It happened on purpose somehow. - - - The b_page field is a pointer to the page_struct representing the - 0x50000000 page. Looking at it shows the kernel's idea of the state - of that page: - - - - (gdb) p *$13.b_page - $17 = {list = {next = 0x50004a5c, prev = 0x100c5174}, mapping = 0x0, - index = 0, next_hash = 0x0, count = {counter = 1}, flags = 132, lru = { - next = 0x50008460, prev = 0x50019350}, wait = { - lock = , task_list = {next = 0x50004024, - prev = 0x50004024}, __magic = 1342193708, __creator = 0}, - pprev_hash = 0x0, buffers = 0x501002c0, virtual = 1342177280, - zone = 0x100c5160} - - - - - - Some sanity-checking: the virtual field shows the "virtual" address of - this page, which in this kernel is the same as its "physical" address, - and the page_struct itself should be mem_map[0], since it represents - the first page of memory: - - - - (gdb) p (void *)1342177280 - $18 = (void *) 0x50000000 - (gdb) p mem_map - $19 = (mem_map_t *) 0x50004000 - - - - - - These check out fine. - - - Now to check out the page_struct itself. In particular, the flags - field shows whether the page is considered free or not: - - - (gdb) p (void *)132 - $21 = (void *) 0x84 - - - - - - The "reserved" bit is the high bit, which is definitely not set, so - the kernel considers the signal stack page to be free and available to - be used. - - - At this point, I jump to conclusions and start looking at my early - boot code, because that's where that page is supposed to be reserved. - - - In my setup_arch procedure, I have the following code which looks just - fine: - - - - bootmap_size = init_bootmem(start_pfn, end_pfn - start_pfn); - free_bootmem(__pa(low_physmem) + bootmap_size, high_physmem - low_physmem); - - - - - - Two stack pages have already been allocated, and low_physmem points to - the third page, which is the beginning of free memory. 
- The init_bootmem call declares the entire memory to the boot memory - manager, which marks it all reserved. The free_bootmem call frees up - all of it, except for the first two pages. This looks correct to me. - - - So, I decide to see init_bootmem run and make sure that it is marking - those first two pages as reserved. I never get that far. - - - Stepping into init_bootmem, and looking at bootmem_map before looking - at what it contains shows the following: - - - - (gdb) p bootmem_map - $3 = (void *) 0x50000000 - - - - - - Aha! The light dawns. That first page is doing double duty as a - stack and as the boot memory map. The last thing that the boot memory - manager does is to free the pages used by its memory map, so this page - is getting freed even though it's marked as reserved. - - - The fix was to initialize the boot memory manager before allocating - those two stack pages, and then allocate them through the boot memory - manager. After doing this, and fixing a couple of subsequent buglets, - the stack corruption problem disappeared. - - - - - - 13. What to do when UML doesn't work - - - - - 13.1. Strange compilation errors when you build from source - - As of test11, it is necessary to have "ARCH=um" in the environment or - on the make command line for all steps in building UML, including - clean, distclean, or mrproper, config, menuconfig, or xconfig, dep, - and linux. If you forget for any of them, the i386 build seems to - contaminate the UML build. If this happens, start from scratch with - - - host% - make mrproper ARCH=um - - - - - and repeat the build process with ARCH=um on all the steps. - - - See ``Compiling the kernel and modules'' for more details. - - - Another cause of strange compilation errors is building UML in - /usr/src/linux. If you do this, the first thing you need to do is - clean up the mess you made. The /usr/src/linux/asm link will now - point to /usr/src/linux/asm-um. Make it point back to - /usr/src/linux/asm-i386.
Then, move your UML pool someplace else and - build it there. Also see below, where a more specific set of symptoms - is described. - - - - 13.3. A variety of panics and hangs with /tmp on a reiserfs filesys- - tem - - I saw this on reiserfs 3.5.21 and it seems to be fixed in 3.5.27. - Panics preceded by - - - Detaching pid nnnn - - - - are diagnostic of this problem. This is a reiserfs bug which causes a - thread to occasionally read stale data from a mmapped page shared with - another thread. The fix is to upgrade the filesystem or to have /tmp - be an ext2 filesystem. - - - - 13.4. The compile fails with errors about conflicting types for - 'open', 'dup', and 'waitpid' - - This happens when you build in /usr/src/linux. The UML build makes - the include/asm link point to include/asm-um. /usr/include/asm points - to /usr/src/linux/include/asm, so when that link gets moved, files - which need to include the asm-i386 versions of headers get the - incompatible asm-um versions. The fix is to move the include/asm link - back to include/asm-i386 and to do UML builds someplace else. - - - - 13.5. UML doesn't work when /tmp is an NFS filesystem - - This seems to be a similar situation to the ReiserFS problem above. - Some versions of NFS seem not to handle mmap correctly, which UML - depends on. The workaround is to have /tmp be a non-NFS directory. - - - 13.6. UML hangs on boot when compiled with gprof support - - If you build UML with gprof support and, early in the boot, it does - this - - - kernel BUG at page_alloc.c:100! - - - - - you have a buggy gcc. You can work around the problem by removing - UM_FASTCALL from CFLAGS in arch/um/Makefile-i386. This will open up - another bug, but that one is fairly hard to reproduce. - - - - 13.7.
syslogd dies with a SIGTERM on startup - - The exact boot error depends on the distribution that you're booting, - but Debian produces this: - - - /etc/rc2.d/S10sysklogd: line 49: 93 Terminated - start-stop-daemon --start --quiet --exec /sbin/syslogd -- $SYSLOGD - - - - - This is a syslogd bug. There's a race between a parent process - installing a signal handler and its child sending the signal. See - this uml-devel post for the details. - - - - 13.8. TUN/TAP networking doesn't work on a 2.4 host - - There are a couple of problems which were - pointed out by Tim Robinson - - o It doesn't work on hosts running 2.4.7 (or thereabouts) or earlier. - The fix is to upgrade to something more recent and then read the - next item. - - o If you see - - - File descriptor in bad state - - - - when you bring up the device inside UML, you have a header mismatch - between the original kernel and the upgraded one. Make /usr/src/linux - point at the new headers. This will only be a problem if you build - uml_net yourself. - - - - 13.9. You can network to the host but not to other machines on the - net - - If you can connect to the host, and the host can connect to UML, but - you cannot connect to any other machines, then you may need to enable - IP Masquerading on the host. Usually this is only experienced when - using private IP addresses (192.168.x.x or 10.x.x.x) for host/UML - networking, rather than the public address space that your host is - connected to. UML does not enable IP Masquerading, so you will need - to create a static rule to enable it: - - - host% - iptables -t nat -A POSTROUTING -o eth0 -j MASQUERADE - - - - - Replace eth0 with the interface that you use to talk to the rest of - the world. - - - Documentation on IP Masquerading, and SNAT, can be found at - www.netfilter.org. - - - If you can reach the local net, but not the outside Internet, then - that is usually a routing problem.
The UML needs a default route: - - - UML# - route add default gw gateway IP - - - - - The gateway IP can be any machine on the local net that knows how to - reach the outside world. Usually, this is the host or the local net- - work's gateway. - - - Occasionally, we hear from someone who can reach some machines, but - not others on the same net, or who can reach some ports on other - machines, but not others. These are usually caused by strange - firewalling somewhere between the UML and the other box. You track - this down by running tcpdump on every interface the packets travel - over and see where they disappear. When you find a machine that takes - the packets in, but does not send them onward, that's the culprit. - - - - 13.10. I have no root and I want to scream - - Thanks to Birgit Wahlich for telling me about this strange one. It - turns out that there's a limit of six environment variables on the - kernel command line. When that limit is reached or exceeded, argument - processing stops, which means that the 'root=' argument that UML - usually adds is not seen. So, the filesystem has no idea what the - root device is, so it panics. - - - The fix is to put less stuff on the command line. Glomming all your - setup variables into one is probably the best way to go. - - - - 13.11. UML build conflict between ptrace.h and ucontext.h - - On some older systems, /usr/include/asm/ptrace.h and - /usr/include/sys/ucontext.h define the same names. So, when they're - included together, the defines from one completely mess up the parsing - of the other, producing errors like: - /usr/include/sys/ucontext.h:47: parse error before - `10' - - - - - plus a pile of warnings. - - - This is a libc botch, which has since been fixed, and I don't see any - way around it besides upgrading. - - - - 13.12. 
The UML BogoMips is exactly half the host's BogoMips - - On i386 kernels, there are two ways of running the loop that is used - to calculate the BogoMips rating, using the TSC if it's there or using - a one-instruction loop. The TSC produces twice the BogoMips as the - loop. UML uses the loop, since it has nothing resembling a TSC, and - will get almost exactly the same BogoMips as a host using the loop. - However, on a host with a TSC, its BogoMips will be double the loop - BogoMips, and therefore double the UML BogoMips. - - - - 13.13. When you run UML, it immediately segfaults - - If the host is configured with the 2G/2G address space split, that's - why. See ``UML on 2G/2G hosts'' for the details on getting UML to - run on your host. - - - - 13.14. xterms appear, then immediately disappear - - If you're running an up to date kernel with an old release of - uml_utilities, the port-helper program will not work properly, so - xterms will exit straight after they appear. The solution is to - upgrade to the latest release of uml_utilities. Usually this problem - occurs when you have installed a packaged release of UML then compiled - your own development kernel without upgrading the uml_utilities from - the source distribution. - - - - 13.15. Any other panic, hang, or strange behavior - - If you're seeing truly strange behavior, such as hangs or panics that - happen in random places, or you try running the debugger to see what's - happening and it acts strangely, then it could be a problem in the - host kernel. If you're not running a stock Linus or -ac kernel, then - try that. An early version of the preemption patch and a 2.4.10 SuSE - kernel have caused very strange problems in UML. - - - Otherwise, let me know about it. 
Send a message to one of the UML - mailing lists - either the developer list - user-mode-linux-devel at - lists dot sourceforge dot net (subscription info) or the user list - - user-mode-linux-user at lists dot sourceforge dot net (subscription - info), whichever you prefer. Don't assume that everyone knows about - it and that a fix is imminent. - - - If you want to be super-helpful, read ``Diagnosing Problems'' and - follow the instructions contained therein. - 14. Diagnosing Problems - - - If you get UML to crash, hang, or otherwise misbehave, you should - report this on one of the project mailing lists, either the developer - list - user-mode-linux-devel at lists dot sourceforge dot net - (subscription info) or the user list - user-mode-linux-user at lists - dot sourceforge dot net (subscription info). When you do, it is - likely that I will want more information. So, it would be helpful to - read the stuff below, do whatever is applicable in your case, and - report the results to the list. - - - For any diagnosis, you're going to need to build a debugging kernel. - The binaries from this site aren't debuggable. If you haven't done - this before, read about ``Compiling the kernel and modules'' and - ``Kernel debugging'' UML first. - - - 14.1. Case 1 : Normal kernel panics - - The most common case is for a normal thread to panic. To debug this, - you will need to run it under the debugger (add 'debug' to the command - line). An xterm will start up with gdb running inside it. Continue - it when it stops in start_kernel and make it crash. Now ^C gdb and - get a backtrace. - - - If the panic was a "Kernel mode fault", then there will be a segv - frame on the stack and I'm going to want some more information. 
The - stack might look something like this: - - - (UML gdb) backtrace - #0 0x1009bf76 in __sigprocmask (how=1, set=0x5f347940, oset=0x0) - at ../sysdeps/unix/sysv/linux/sigprocmask.c:49 - #1 0x10091411 in change_sig (signal=10, on=1) at process.c:218 - #2 0x10094785 in timer_handler (sig=26) at time_kern.c:32 - #3 0x1009bf38 in __restore () - at ../sysdeps/unix/sysv/linux/i386/sigaction.c:125 - #4 0x1009534c in segv (address=8, ip=268849158, is_write=2, is_user=0) - at trap_kern.c:66 - #5 0x10095c04 in segv_handler (sig=11) at trap_user.c:285 - #6 0x1009bf38 in __restore () - - - - - I'm going to want to see the symbol and line information for the value - of ip in the segv frame. In this case, you would do the following: - - - (UML gdb) i sym 268849158 - - - - - and - - - (UML gdb) i line *268849158 - - - - - The reason for this is the __restore frame right above the segv_han- - dler frame is hiding the frame that actually segfaulted. So, I have - to get that information from the faulting ip. - - - 14.2. Case 2 : Tracing thread panics - - The less common and more painful case is when the tracing thread - panics. In this case, the kernel debugger will be useless because it - needs a healthy tracing thread in order to work. The first thing to - do is get a backtrace from the tracing thread. This is done by - figuring out what its pid is, firing up gdb, and attaching it to that - pid. You can figure out the tracing thread pid by looking at the - first line of the console output, which will look like this: - - - tracing thread pid = 15851 - - - - - or by running ps on the host and finding the line that looks like - this: - - - jdike 15851 4.5 0.4 132568 1104 pts/0 S 21:34 0:05 ./linux [(tracing thread)] - - - - - If the panic was 'segfault in signals', then follow the instructions - above for collecting information about the location of the seg fault. 
- - - If the tracing thread flaked out all by itself, then send that - backtrace in and wait for our crack debugging team to fix the problem. - - - 14.3. Case 3 : Tracing thread panics caused by other threads - - However, there are cases where the misbehavior of another thread - caused the problem. The most common panic of this type is: - - - wait_for_stop failed to wait for <pid> to stop with <signal number> - - - - - In this case, you'll need to get a backtrace from the process men- - tioned in the panic, which is complicated by the fact that the kernel - debugger is defunct and without some fancy footwork, another gdb can't - attach to it. So, this is how the fancy footwork goes: - - In a shell: - - - host% kill -STOP pid - - - - - Run gdb on the tracing thread as described in case 2 and do: - - - (host gdb) call detach(pid) - - - If you get a segfault, do it again. It always works the second time. - - Detach from the tracing thread and attach to that other thread: - - - (host gdb) detach - - - - - - - (host gdb) attach pid - - - - - If gdb hangs when attaching to that process, go back to a shell and - do: - - - host% - kill -CONT pid - - - - - And then get the backtrace: - - - (host gdb) backtrace - - - - - - 14.4. Case 4 : Hangs - - Hangs seem to be fairly rare, but they sometimes happen. When a hang - happens, we need a backtrace from the offending process. Run the - kernel debugger as described in case 1 and get a backtrace. If the - current process is not the idle thread, then send in the backtrace. 
- You can tell that it's the idle thread if the stack looks like this: - - - #0 0x100b1401 in __libc_nanosleep () - #1 0x100a2885 in idle_sleep (secs=10) at time.c:122 - #2 0x100a546f in do_idle () at process_kern.c:445 - #3 0x100a5508 in cpu_idle () at process_kern.c:471 - #4 0x100ec18f in start_kernel () at init/main.c:592 - #5 0x100a3e10 in start_kernel_proc (unused=0x0) at um_arch.c:71 - #6 0x100a383f in signal_tramp (arg=0x100a3dd8) at trap_user.c:50 - - - - - If this is the case, then some other process is at fault, and went to - sleep when it shouldn't have. Run ps on the host and figure out which - process should not have gone to sleep and stayed asleep. Then attach - to it with gdb and get a backtrace as described in case 3. - - - - - - - 15. Thanks - - - A number of people have helped this project in various ways, and this - page gives recognition where recognition is due. - - - If you're listed here and you would prefer a real link on your name, - or no link at all, instead of the despammed email address pseudo-link, - let me know. - - - If you're not listed here and you think maybe you should be, please - let me know that as well. I try to get everyone, but sometimes my - bookkeeping lapses and I forget about contributions. - - - 15.1. 
Code and Documentation - - Rusty Russell - - - o wrote the HOWTO - - o prodded me into making this project official and putting it on - SourceForge - - o came up with the way cool UML logo - - o redid the config process - - - Peter Moulder - Fixed my config and build - processes, and added some useful code to the block driver - - - Bill Stearns - - - o HOWTO updates - - o lots of bug reports - - o lots of testing - - o dedicated a box (uml.ists.dartmouth.edu) to support UML development - - o wrote the mkrootfs script, which allows bootable filesystems of - RPM-based distributions to be cranked out - - o cranked out a large number of filesystems with said script - - - Jim Leu - Wrote the virtual ethernet driver - and associated usermode tools - - Lars Brinkhoff - Contributed the ptrace - proxy from his own project to allow easier - kernel debugging - - - Andrea Arcangeli - Redid some of the early boot - code so that it would work on machines with Large File Support - - - Chris Emerson - Did - the first UML port to Linux/ppc - - - Harald Welte - Wrote the multicast - transport for the network driver - - - Jorgen Cederlof - Added special file support to hostfs - - - Greg Lonnon - Changed the ubd driver - to allow it to layer a COW file on a shared read-only filesystem and - wrote the iomem emulation support - - - Henrik Nordstrom - Provided a variety - of patches, fixes, and clues - - - Lennert Buytenhek - Contributed various patches, a rewrite of the - network driver, the first implementation of the mconsole driver, and - did the bulk of the work needed to get SMP working again. - - - Yon Uriarte - Fixed the TUN/TAP network backend while I slept. - - - Adam Heath - Made a bunch of nice cleanups to the initialization code, - plus various other small patches. - - - Matt Zimmerman - Matt volunteered to be the UML Debian maintainer and - is doing a real nice job of it. He also noticed and fixed a number of - actually and potentially exploitable security holes in uml_net. 
Plus - the occasional patch. I like patches. - - - James McMechan - James seems to have taken over maintenance of the ubd - driver and is doing a nice job of it. - - - Chandan Kudige - wrote the umlgdb script which automates the reloading - of module symbols. - - - Steve Schmidtke - wrote the UML slirp transport and hostaudio drivers, - enabling UML processes to access audio devices on the host. He also - submitted patches for the slip transport and lots of other things. - - - David Coulson - - - o Set up the usermodelinux.org site, - which is a great way of keeping the UML user community on top of - UML goings-on. - - o Site documentation and updates - - o Nifty little UML management daemon UMLd - - - o Lots of testing and bug reports - - - - - 15.2. Flushing out bugs - - - - o Yuri Pudgorodsky - - o Gerald Britton - - o Ian Wehrman - - o Gord Lamb - - o Eugene Koontz - - o John H. Hartman - - o Anders Karlsson - - o Daniel Phillips - - o John Fremlin - - o Rainer Burgstaller - - o James Stevenson - - o Matt Clay - - o Cliff Jefferies - - o Geoff Hoff - - o Lennert Buytenhek - - o Al Viro - - o Frank Klingenhoefer - - o Livio Baldini Soares - - o Jon Burgess - - o Petru Paler - - o Paul - - o Chris Reahard - - o Sverker Nilsson - - o Gong Su - - o johan verrept - - o Bjorn Eriksson - - o Lorenzo Allegrucci - - o Muli Ben-Yehuda - - o David Mansfield - - o Howard Goff - - o Mike Anderson - - o John Byrne - - o Sapan J. Batia - - o Iris Huang - - o Jan Hudec - - o Voluspa - - - - - 15.3. Buglets and clean-ups - - - - o Dave Zarzycki - - o Adam Lazur - - o Boria Feigin - - o Brian J. 
Murrell - - o JS - - o Roman Zippel - - o Wil Cooley - - o Ayelet Shemesh - - o Will Dyson - - o Sverker Nilsson - - o dvorak - - o v.naga srinivas - - o Shlomi Fish - - o Roger Binns - - o johan verrept - - o MrChuoi - - o Peter Cleve - - o Vincent Guffens - - o Nathan Scott - - o Patrick Caulfield - - o jbearce - - o Catalin Marinas - - o Shane Spencer - - o Zou Min - - - o Ryan Boder - - o Lorenzo Colitti - - o Gwendal Grignou - - o Andre' Breiler - - o Tsutomu Yasuda - - - - 15.4. Case Studies - - - o Jon Wright - - o William McEwan - - o Michael Richardson - - - - 15.5. Other contributions - - - Bill Carr made the Red Hat mkrootfs script - work with RH 6.2. - - Michael Jennings sent in some material which - is now gracing the top of the index page of this site. - - SGI (and more specifically Ralf Baechle ) gave me an account on oss.sgi.com - . The bandwidth there made it possible to - produce most of the filesystems available on the project download - page. - - Laurent Bonnaud took the old grotty - Debian filesystem that I've been distributing and updated it to 2.2. - It is now available by itself here. - - Rik van Riel gave me some ftp space on ftp.nl.linux.org so I can make - releases even when Sourceforge is broken. - - Rodrigo de Castro looked at my broken pte code and told me what was - wrong with it, letting me fix a long-standing (several weeks) and - serious set of bugs. - - Chris Reahard built a specialized root filesystem for running a DNS - server jailed inside UML. It's available from the download - page in the Jail - Filesystems section. 
- - - - - - - - - - - - diff --git a/MAINTAINERS b/MAINTAINERS index debbb7b97c98..1aec93695040 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -8727,7 +8727,7 @@ L: kvm@vger.kernel.org W: http://www.linux-kvm.org T: git git://git.kernel.org/pub/scm/virt/kvm/kvm.git S: Supported -F: Documentation/virtual/kvm/ +F: Documentation/virt/kvm/ F: include/trace/events/kvm.h F: include/uapi/asm-generic/kvm* F: include/uapi/linux/kvm* @@ -12054,7 +12054,7 @@ M: Juergen Gross M: Alok Kataria L: virtualization@lists.linux-foundation.org S: Supported -F: Documentation/virtual/paravirt_ops.txt +F: Documentation/virt/paravirt_ops.txt F: arch/*/kernel/paravirt* F: arch/*/include/asm/paravirt*.h F: include/linux/hypervisor.h @@ -16745,7 +16745,7 @@ W: http://user-mode-linux.sourceforge.net Q: https://patchwork.ozlabs.org/project/linux-um/list/ T: git git://git.kernel.org/pub/scm/linux/kernel/git/rw/uml.git S: Maintained -F: Documentation/virtual/uml/ +F: Documentation/virt/uml/ F: arch/um/ F: arch/x86/um/ F: fs/hostfs/ diff --git a/arch/powerpc/include/uapi/asm/kvm_para.h b/arch/powerpc/include/uapi/asm/kvm_para.h index 01555c6ae0f5..be48c2215fa2 100644 --- a/arch/powerpc/include/uapi/asm/kvm_para.h +++ b/arch/powerpc/include/uapi/asm/kvm_para.h @@ -31,7 +31,7 @@ * Struct fields are always 32 or 64 bit aligned, depending on them being 32 * or 64 bit wide respectively. * - * See Documentation/virtual/kvm/ppc-pv.txt + * See Documentation/virt/kvm/ppc-pv.txt */ struct kvm_vcpu_arch_shared { __u64 scratch1; diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c index 8f72526e2f68..24843cf49579 100644 --- a/arch/x86/kvm/mmu.c +++ b/arch/x86/kvm/mmu.c @@ -3466,7 +3466,7 @@ static bool fast_page_fault(struct kvm_vcpu *vcpu, gva_t gva, int level, /* * Currently, fast page fault only works for direct mapping * since the gfn is not stable for indirect shadow page. See - * Documentation/virtual/kvm/locking.txt to get more detail. + * Documentation/virt/kvm/locking.txt to get more detail. 
*/ fault_handled = fast_pf_fix_direct_spte(vcpu, sp, iterator.sptep, spte, diff --git a/include/uapi/linux/kvm.h b/include/uapi/linux/kvm.h index a7c19540ce21..5e3f12d5359e 100644 --- a/include/uapi/linux/kvm.h +++ b/include/uapi/linux/kvm.h @@ -116,7 +116,7 @@ struct kvm_irq_level { * ACPI gsi notion of irq. * For IA-64 (APIC model) IOAPIC0: irq 0-23; IOAPIC1: irq 24-47.. * For X86 (standard AT mode) PIC0/1: irq 0-15. IOAPIC0: 0-23.. - * For ARM: See Documentation/virtual/kvm/api.txt + * For ARM: See Documentation/virt/kvm/api.txt */ union { __u32 irq; @@ -1086,7 +1086,7 @@ struct kvm_xen_hvm_config { * * KVM_IRQFD_FLAG_RESAMPLE indicates resamplefd is valid and specifies * the irqfd to operate in resampling mode for level triggered interrupt - * emulation. See Documentation/virtual/kvm/api.txt. + * emulation. See Documentation/virt/kvm/api.txt. */ #define KVM_IRQFD_FLAG_RESAMPLE (1 << 1) diff --git a/tools/include/uapi/linux/kvm.h b/tools/include/uapi/linux/kvm.h index c2152f3dd02d..e7c67be7c15f 100644 --- a/tools/include/uapi/linux/kvm.h +++ b/tools/include/uapi/linux/kvm.h @@ -116,7 +116,7 @@ struct kvm_irq_level { * ACPI gsi notion of irq. * For IA-64 (APIC model) IOAPIC0: irq 0-23; IOAPIC1: irq 24-47.. * For X86 (standard AT mode) PIC0/1: irq 0-15. IOAPIC0: 0-23.. - * For ARM: See Documentation/virtual/kvm/api.txt + * For ARM: See Documentation/virt/kvm/api.txt */ union { __u32 irq; @@ -1085,7 +1085,7 @@ struct kvm_xen_hvm_config { * * KVM_IRQFD_FLAG_RESAMPLE indicates resamplefd is valid and specifies * the irqfd to operate in resampling mode for level triggered interrupt - * emulation. See Documentation/virtual/kvm/api.txt. + * emulation. See Documentation/virt/kvm/api.txt. 
*/ #define KVM_IRQFD_FLAG_RESAMPLE (1 << 1) diff --git a/virt/kvm/arm/arm.c b/virt/kvm/arm/arm.c index f645c0fbf7ec..acc43242a310 100644 --- a/virt/kvm/arm/arm.c +++ b/virt/kvm/arm/arm.c @@ -727,7 +727,7 @@ int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu, struct kvm_run *run) * Ensure we set mode to IN_GUEST_MODE after we disable * interrupts and before the final VCPU requests check. * See the comment in kvm_vcpu_exiting_guest_mode() and - * Documentation/virtual/kvm/vcpu-requests.rst + * Documentation/virt/kvm/vcpu-requests.rst */ smp_store_mb(vcpu->mode, IN_GUEST_MODE); diff --git a/virt/kvm/arm/vgic/vgic-mmio-v3.c b/virt/kvm/arm/vgic/vgic-mmio-v3.c index 936962abc38d..c45e2d7e942f 100644 --- a/virt/kvm/arm/vgic/vgic-mmio-v3.c +++ b/virt/kvm/arm/vgic/vgic-mmio-v3.c @@ -250,7 +250,7 @@ static unsigned long vgic_v3_uaccess_read_pending(struct kvm_vcpu *vcpu, * pending state of interrupt is latched in pending_latch variable. * Userspace will save and restore pending state and line_level * separately. - * Refer to Documentation/virtual/kvm/devices/arm-vgic-v3.txt + * Refer to Documentation/virt/kvm/devices/arm-vgic-v3.txt * for handling of ISPENDR and ICPENDR. */ for (i = 0; i < len * 8; i++) { diff --git a/virt/kvm/arm/vgic/vgic.h b/virt/kvm/arm/vgic/vgic.h index 57205beaa981..3b7525deec80 100644 --- a/virt/kvm/arm/vgic/vgic.h +++ b/virt/kvm/arm/vgic/vgic.h @@ -42,7 +42,7 @@ VGIC_AFFINITY_LEVEL(val, 3)) /* - * As per Documentation/virtual/kvm/devices/arm-vgic-v3.txt, + * As per Documentation/virt/kvm/devices/arm-vgic-v3.txt, * below macros are defined for CPUREG encoding. */ #define KVM_REG_ARM_VGIC_SYSREG_OP0_MASK 0x000000000000c000 @@ -63,7 +63,7 @@ KVM_REG_ARM_VGIC_SYSREG_OP2_MASK) /* - * As per Documentation/virtual/kvm/devices/arm-vgic-its.txt, + * As per Documentation/virt/kvm/devices/arm-vgic-its.txt, * below macros are defined for ITS table entry encoding. 
*/ #define KVM_ITS_CTE_VALID_SHIFT 63 -- cgit v1.2.3-71-gd317 From d9c5252295218df4cfe64353aa860d7b5c8700ef Mon Sep 17 00:00:00 2001 From: Masahiro Yamada Date: Thu, 25 Jul 2019 16:58:31 +0900 Subject: treewide: add "WITH Linux-syscall-note" to SPDX tag of uapi headers UAPI headers licensed under GPL are supposed to have exception "WITH Linux-syscall-note" so that they can be included into non-GPL user space application code. The exception note is missing in some UAPI headers. Some of them slipped in by the treewide conversion commit b24413180f56 ("License cleanup: add SPDX GPL-2.0 license identifier to files with no license"). Just run: $ git show --oneline b24413180f56 -- arch/x86/include/uapi/asm/ I believe they are not intentional, and should be fixed too. This patch was generated by the following script: git grep -l --not -e Linux-syscall-note --and -e SPDX-License-Identifier \ -- :arch/*/include/uapi/asm/*.h :include/uapi/ :^*/Kbuild | while read file do sed -i -e '/[[:space:]]OR[[:space:]]/s/\(GPL-[^[:space:]]*\)/(\1 WITH Linux-syscall-note)/g' \ -e '/[[:space:]]or[[:space:]]/s/\(GPL-[^[:space:]]*\)/(\1 WITH Linux-syscall-note)/g' \ -e '/[[:space:]]OR[[:space:]]/!{/[[:space:]]or[[:space:]]/!s/\(GPL-[^[:space:]]*\)/\1 WITH Linux-syscall-note/g}' $file done After this patch is applied, there are 5 UAPI headers that do not contain "WITH Linux-syscall-note". They are kept untouched since this exception applies only to GPL variants. 
$ git grep --not -e Linux-syscall-note --and -e SPDX-License-Identifier \ -- :arch/*/include/uapi/asm/*.h :include/uapi/ :^*/Kbuild include/uapi/drm/panfrost_drm.h:/* SPDX-License-Identifier: MIT */ include/uapi/linux/batman_adv.h:/* SPDX-License-Identifier: MIT */ include/uapi/linux/qemu_fw_cfg.h:/* SPDX-License-Identifier: BSD-3-Clause */ include/uapi/linux/vbox_err.h:/* SPDX-License-Identifier: MIT */ include/uapi/linux/virtio_iommu.h:/* SPDX-License-Identifier: BSD-3-Clause */ Signed-off-by: Masahiro Yamada Reviewed-by: Thomas Gleixner Signed-off-by: Greg Kroah-Hartman --- arch/arm64/include/uapi/asm/bpf_perf_event.h | 2 +- arch/csky/include/uapi/asm/byteorder.h | 2 +- arch/csky/include/uapi/asm/cachectl.h | 2 +- arch/csky/include/uapi/asm/perf_regs.h | 2 +- arch/csky/include/uapi/asm/ptrace.h | 2 +- arch/csky/include/uapi/asm/sigcontext.h | 2 +- arch/csky/include/uapi/asm/unistd.h | 2 +- arch/nds32/include/uapi/asm/auxvec.h | 2 +- arch/nds32/include/uapi/asm/byteorder.h | 2 +- arch/nds32/include/uapi/asm/cachectl.h | 2 +- arch/nds32/include/uapi/asm/fp_udfiex_crtl.h | 2 +- arch/nds32/include/uapi/asm/param.h | 2 +- arch/nds32/include/uapi/asm/ptrace.h | 2 +- arch/nds32/include/uapi/asm/sigcontext.h | 2 +- arch/nds32/include/uapi/asm/unistd.h | 2 +- arch/powerpc/include/uapi/asm/bpf_perf_event.h | 2 +- arch/riscv/include/uapi/asm/auxvec.h | 2 +- arch/riscv/include/uapi/asm/bitsperlong.h | 2 +- arch/riscv/include/uapi/asm/byteorder.h | 2 +- arch/riscv/include/uapi/asm/hwcap.h | 2 +- arch/riscv/include/uapi/asm/ptrace.h | 2 +- arch/riscv/include/uapi/asm/sigcontext.h | 2 +- arch/riscv/include/uapi/asm/ucontext.h | 2 +- arch/s390/include/uapi/asm/bpf_perf_event.h | 2 +- arch/s390/include/uapi/asm/ipl.h | 2 +- arch/sh/include/uapi/asm/setup.h | 2 +- arch/sh/include/uapi/asm/types.h | 2 +- arch/sparc/include/uapi/asm/oradax.h | 2 +- arch/x86/include/uapi/asm/byteorder.h | 2 +- arch/x86/include/uapi/asm/hwcap2.h | 2 +- arch/x86/include/uapi/asm/sigcontext32.h | 2 +- 
arch/x86/include/uapi/asm/types.h | 2 +- include/uapi/linux/bpfilter.h | 2 +- include/uapi/linux/ipmi_bmc.h | 2 +- include/uapi/linux/isst_if.h | 2 +- include/uapi/linux/netfilter/nf_synproxy.h | 2 +- include/uapi/linux/psp-sev.h | 2 +- include/uapi/linux/rxrpc.h | 2 +- include/uapi/linux/usb/g_uvc.h | 2 +- include/uapi/linux/vbox_vmmdev_types.h | 2 +- include/uapi/linux/vboxguest.h | 2 +- include/uapi/linux/virtio_pmem.h | 2 +- include/uapi/linux/vmcore.h | 2 +- include/uapi/linux/wmi.h | 2 +- include/uapi/misc/fastrpc.h | 2 +- include/uapi/rdma/rvt-abi.h | 2 +- include/uapi/rdma/siw-abi.h | 2 +- include/uapi/scsi/scsi_bsg_ufs.h | 2 +- include/uapi/sound/skl-tplg-interface.h | 2 +- 49 files changed, 49 insertions(+), 49 deletions(-) (limited to 'include/uapi/linux') diff --git a/arch/arm64/include/uapi/asm/bpf_perf_event.h b/arch/arm64/include/uapi/asm/bpf_perf_event.h index b551b741653d..5e1e648aeec4 100644 --- a/arch/arm64/include/uapi/asm/bpf_perf_event.h +++ b/arch/arm64/include/uapi/asm/bpf_perf_event.h @@ -1,4 +1,4 @@ -/* SPDX-License-Identifier: GPL-2.0 */ +/* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note */ #ifndef _UAPI__ASM_BPF_PERF_EVENT_H__ #define _UAPI__ASM_BPF_PERF_EVENT_H__ diff --git a/arch/csky/include/uapi/asm/byteorder.h b/arch/csky/include/uapi/asm/byteorder.h index b079ec715cdf..d150cd664873 100644 --- a/arch/csky/include/uapi/asm/byteorder.h +++ b/arch/csky/include/uapi/asm/byteorder.h @@ -1,4 +1,4 @@ -/* SPDX-License-Identifier: GPL-2.0 */ +/* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note */ // Copyright (C) 2018 Hangzhou C-SKY Microsystems co.,ltd. 
#ifndef __ASM_CSKY_BYTEORDER_H diff --git a/arch/csky/include/uapi/asm/cachectl.h b/arch/csky/include/uapi/asm/cachectl.h index ddf2f39aa925..ed7fad1ea20d 100644 --- a/arch/csky/include/uapi/asm/cachectl.h +++ b/arch/csky/include/uapi/asm/cachectl.h @@ -1,4 +1,4 @@ -/* SPDX-License-Identifier: GPL-2.0 */ +/* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note */ #ifndef __ASM_CSKY_CACHECTL_H #define __ASM_CSKY_CACHECTL_H diff --git a/arch/csky/include/uapi/asm/perf_regs.h b/arch/csky/include/uapi/asm/perf_regs.h index ee323d818592..49d4e147a559 100644 --- a/arch/csky/include/uapi/asm/perf_regs.h +++ b/arch/csky/include/uapi/asm/perf_regs.h @@ -1,4 +1,4 @@ -/* SPDX-License-Identifier: GPL-2.0 */ +/* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note */ // Copyright (C) 2019 Hangzhou C-SKY Microsystems co.,ltd. #ifndef _ASM_CSKY_PERF_REGS_H diff --git a/arch/csky/include/uapi/asm/ptrace.h b/arch/csky/include/uapi/asm/ptrace.h index 4e248d5b86ef..66b2268e324e 100644 --- a/arch/csky/include/uapi/asm/ptrace.h +++ b/arch/csky/include/uapi/asm/ptrace.h @@ -1,4 +1,4 @@ -/* SPDX-License-Identifier: GPL-2.0 */ +/* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note */ // Copyright (C) 2018 Hangzhou C-SKY Microsystems co.,ltd. #ifndef _CSKY_PTRACE_H diff --git a/arch/csky/include/uapi/asm/sigcontext.h b/arch/csky/include/uapi/asm/sigcontext.h index e81e7ff11e36..670c020f2cb8 100644 --- a/arch/csky/include/uapi/asm/sigcontext.h +++ b/arch/csky/include/uapi/asm/sigcontext.h @@ -1,4 +1,4 @@ -/* SPDX-License-Identifier: GPL-2.0 */ +/* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note */ // Copyright (C) 2018 Hangzhou C-SKY Microsystems co.,ltd. 
#ifndef __ASM_CSKY_SIGCONTEXT_H diff --git a/arch/csky/include/uapi/asm/unistd.h b/arch/csky/include/uapi/asm/unistd.h index ec60e49cea66..211c983c7282 100644 --- a/arch/csky/include/uapi/asm/unistd.h +++ b/arch/csky/include/uapi/asm/unistd.h @@ -1,4 +1,4 @@ -/* SPDX-License-Identifier: GPL-2.0 */ +/* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note */ // Copyright (C) 2018 Hangzhou C-SKY Microsystems co.,ltd. #define __ARCH_WANT_SYS_CLONE diff --git a/arch/nds32/include/uapi/asm/auxvec.h b/arch/nds32/include/uapi/asm/auxvec.h index b5d58ea8decb..bc0b92ab8c15 100644 --- a/arch/nds32/include/uapi/asm/auxvec.h +++ b/arch/nds32/include/uapi/asm/auxvec.h @@ -1,4 +1,4 @@ -/* SPDX-License-Identifier: GPL-2.0 */ +/* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note */ // Copyright (C) 2005-2017 Andes Technology Corporation #ifndef __ASM_AUXVEC_H diff --git a/arch/nds32/include/uapi/asm/byteorder.h b/arch/nds32/include/uapi/asm/byteorder.h index 511e653c709d..c264ef12c49c 100644 --- a/arch/nds32/include/uapi/asm/byteorder.h +++ b/arch/nds32/include/uapi/asm/byteorder.h @@ -1,4 +1,4 @@ -/* SPDX-License-Identifier: GPL-2.0 */ +/* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note */ // Copyright (C) 2005-2017 Andes Technology Corporation #ifndef __NDS32_BYTEORDER_H__ diff --git a/arch/nds32/include/uapi/asm/cachectl.h b/arch/nds32/include/uapi/asm/cachectl.h index 73793662815c..31b9b439d819 100644 --- a/arch/nds32/include/uapi/asm/cachectl.h +++ b/arch/nds32/include/uapi/asm/cachectl.h @@ -1,4 +1,4 @@ -/* SPDX-License-Identifier: GPL-2.0 */ +/* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note */ // Copyright (C) 1994, 1995, 1996 by Ralf Baechle // Copyright (C) 2005-2017 Andes Technology Corporation #ifndef _ASM_CACHECTL diff --git a/arch/nds32/include/uapi/asm/fp_udfiex_crtl.h b/arch/nds32/include/uapi/asm/fp_udfiex_crtl.h index d54a5d6c6538..f17396db16ec 100644 --- a/arch/nds32/include/uapi/asm/fp_udfiex_crtl.h +++ 
b/arch/nds32/include/uapi/asm/fp_udfiex_crtl.h @@ -1,4 +1,4 @@ -/* SPDX-License-Identifier: GPL-2.0 */ +/* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note */ /* Copyright (C) 2005-2019 Andes Technology Corporation */ #ifndef _FP_UDF_IEX_CRTL_H #define _FP_UDF_IEX_CRTL_H diff --git a/arch/nds32/include/uapi/asm/param.h b/arch/nds32/include/uapi/asm/param.h index 2977534a6bd3..48d00328d328 100644 --- a/arch/nds32/include/uapi/asm/param.h +++ b/arch/nds32/include/uapi/asm/param.h @@ -1,4 +1,4 @@ -/* SPDX-License-Identifier: GPL-2.0 */ +/* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note */ // Copyright (C) 2005-2017 Andes Technology Corporation #ifndef __ASM_NDS32_PARAM_H diff --git a/arch/nds32/include/uapi/asm/ptrace.h b/arch/nds32/include/uapi/asm/ptrace.h index 1a6e01c00e6f..d76217c7c010 100644 --- a/arch/nds32/include/uapi/asm/ptrace.h +++ b/arch/nds32/include/uapi/asm/ptrace.h @@ -1,4 +1,4 @@ -/* SPDX-License-Identifier: GPL-2.0 */ +/* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note */ // Copyright (C) 2005-2017 Andes Technology Corporation #ifndef __UAPI_ASM_NDS32_PTRACE_H diff --git a/arch/nds32/include/uapi/asm/sigcontext.h b/arch/nds32/include/uapi/asm/sigcontext.h index dc89af7ddcc3..6c1e6648878f 100644 --- a/arch/nds32/include/uapi/asm/sigcontext.h +++ b/arch/nds32/include/uapi/asm/sigcontext.h @@ -1,4 +1,4 @@ -/* SPDX-License-Identifier: GPL-2.0 */ +/* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note */ // Copyright (C) 2005-2017 Andes Technology Corporation #ifndef _ASMNDS32_SIGCONTEXT_H diff --git a/arch/nds32/include/uapi/asm/unistd.h b/arch/nds32/include/uapi/asm/unistd.h index a0b2f7b9c0f2..410795e280fe 100644 --- a/arch/nds32/include/uapi/asm/unistd.h +++ b/arch/nds32/include/uapi/asm/unistd.h @@ -1,4 +1,4 @@ -/* SPDX-License-Identifier: GPL-2.0 */ +/* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note */ // Copyright (C) 2005-2017 Andes Technology Corporation #define __ARCH_WANT_STAT64 diff --git 
a/arch/powerpc/include/uapi/asm/bpf_perf_event.h b/arch/powerpc/include/uapi/asm/bpf_perf_event.h index b551b741653d..5e1e648aeec4 100644 --- a/arch/powerpc/include/uapi/asm/bpf_perf_event.h +++ b/arch/powerpc/include/uapi/asm/bpf_perf_event.h @@ -1,4 +1,4 @@ -/* SPDX-License-Identifier: GPL-2.0 */ +/* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note */ #ifndef _UAPI__ASM_BPF_PERF_EVENT_H__ #define _UAPI__ASM_BPF_PERF_EVENT_H__ diff --git a/arch/riscv/include/uapi/asm/auxvec.h b/arch/riscv/include/uapi/asm/auxvec.h index 62716653554b..d86cb17bbabe 100644 --- a/arch/riscv/include/uapi/asm/auxvec.h +++ b/arch/riscv/include/uapi/asm/auxvec.h @@ -1,4 +1,4 @@ -/* SPDX-License-Identifier: GPL-2.0-only */ +/* SPDX-License-Identifier: GPL-2.0-only WITH Linux-syscall-note */ /* * Copyright (C) 2012 ARM Ltd. * Copyright (C) 2015 Regents of the University of California diff --git a/arch/riscv/include/uapi/asm/bitsperlong.h b/arch/riscv/include/uapi/asm/bitsperlong.h index 0b9b58b57ff6..7d0b32e3b701 100644 --- a/arch/riscv/include/uapi/asm/bitsperlong.h +++ b/arch/riscv/include/uapi/asm/bitsperlong.h @@ -1,4 +1,4 @@ -/* SPDX-License-Identifier: GPL-2.0-only */ +/* SPDX-License-Identifier: GPL-2.0-only WITH Linux-syscall-note */ /* * Copyright (C) 2012 ARM Ltd. * Copyright (C) 2015 Regents of the University of California diff --git a/arch/riscv/include/uapi/asm/byteorder.h b/arch/riscv/include/uapi/asm/byteorder.h index 1920debc09c0..f671e16bf6af 100644 --- a/arch/riscv/include/uapi/asm/byteorder.h +++ b/arch/riscv/include/uapi/asm/byteorder.h @@ -1,4 +1,4 @@ -/* SPDX-License-Identifier: GPL-2.0-only */ +/* SPDX-License-Identifier: GPL-2.0-only WITH Linux-syscall-note */ /* * Copyright (C) 2012 ARM Ltd. 
* Copyright (C) 2015 Regents of the University of California diff --git a/arch/riscv/include/uapi/asm/hwcap.h b/arch/riscv/include/uapi/asm/hwcap.h index 7d786145183b..4e7646077056 100644 --- a/arch/riscv/include/uapi/asm/hwcap.h +++ b/arch/riscv/include/uapi/asm/hwcap.h @@ -1,4 +1,4 @@ -/* SPDX-License-Identifier: GPL-2.0-only */ +/* SPDX-License-Identifier: GPL-2.0-only WITH Linux-syscall-note */ /* * Copied from arch/arm64/include/asm/hwcap.h * diff --git a/arch/riscv/include/uapi/asm/ptrace.h b/arch/riscv/include/uapi/asm/ptrace.h index 92d8f7cd8f84..882547f6bd5c 100644 --- a/arch/riscv/include/uapi/asm/ptrace.h +++ b/arch/riscv/include/uapi/asm/ptrace.h @@ -1,4 +1,4 @@ -/* SPDX-License-Identifier: GPL-2.0-only */ +/* SPDX-License-Identifier: GPL-2.0-only WITH Linux-syscall-note */ /* * Copyright (C) 2012 Regents of the University of California */ diff --git a/arch/riscv/include/uapi/asm/sigcontext.h b/arch/riscv/include/uapi/asm/sigcontext.h index 053f809e52ce..84f2dfcfdbce 100644 --- a/arch/riscv/include/uapi/asm/sigcontext.h +++ b/arch/riscv/include/uapi/asm/sigcontext.h @@ -1,4 +1,4 @@ -/* SPDX-License-Identifier: GPL-2.0-only */ +/* SPDX-License-Identifier: GPL-2.0-only WITH Linux-syscall-note */ /* * Copyright (C) 2012 Regents of the University of California */ diff --git a/arch/riscv/include/uapi/asm/ucontext.h b/arch/riscv/include/uapi/asm/ucontext.h index b58e00cee2ec..411dd7b52ed6 100644 --- a/arch/riscv/include/uapi/asm/ucontext.h +++ b/arch/riscv/include/uapi/asm/ucontext.h @@ -1,4 +1,4 @@ -/* SPDX-License-Identifier: GPL-2.0-only */ +/* SPDX-License-Identifier: GPL-2.0-only WITH Linux-syscall-note */ /* * Copyright (C) 2012 ARM Ltd. * Copyright (C) 2017 SiFive, Inc. 
diff --git a/arch/s390/include/uapi/asm/bpf_perf_event.h b/arch/s390/include/uapi/asm/bpf_perf_event.h index cefe7c7cd4f6..3ed42ff6da94 100644 --- a/arch/s390/include/uapi/asm/bpf_perf_event.h +++ b/arch/s390/include/uapi/asm/bpf_perf_event.h @@ -1,4 +1,4 @@ -/* SPDX-License-Identifier: GPL-2.0 */ +/* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note */ #ifndef _UAPI__ASM_BPF_PERF_EVENT_H__ #define _UAPI__ASM_BPF_PERF_EVENT_H__ diff --git a/arch/s390/include/uapi/asm/ipl.h b/arch/s390/include/uapi/asm/ipl.h index fd32b1cd80d2..451ba7d08905 100644 --- a/arch/s390/include/uapi/asm/ipl.h +++ b/arch/s390/include/uapi/asm/ipl.h @@ -1,4 +1,4 @@ -/* SPDX-License-Identifier: GPL-2.0 */ +/* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note */ #ifndef _ASM_S390_UAPI_IPL_H #define _ASM_S390_UAPI_IPL_H diff --git a/arch/sh/include/uapi/asm/setup.h b/arch/sh/include/uapi/asm/setup.h index 1170dd2fb998..4bd19f80f9b0 100644 --- a/arch/sh/include/uapi/asm/setup.h +++ b/arch/sh/include/uapi/asm/setup.h @@ -1,2 +1,2 @@ -/* SPDX-License-Identifier: GPL-2.0 */ +/* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note */ #include diff --git a/arch/sh/include/uapi/asm/types.h b/arch/sh/include/uapi/asm/types.h index f83795fdc0da..68100e108ea6 100644 --- a/arch/sh/include/uapi/asm/types.h +++ b/arch/sh/include/uapi/asm/types.h @@ -1,2 +1,2 @@ -/* SPDX-License-Identifier: GPL-2.0 */ +/* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note */ #include diff --git a/arch/sparc/include/uapi/asm/oradax.h b/arch/sparc/include/uapi/asm/oradax.h index 64c67f2ea33f..0dace69058ab 100644 --- a/arch/sparc/include/uapi/asm/oradax.h +++ b/arch/sparc/include/uapi/asm/oradax.h @@ -1,4 +1,4 @@ -/* SPDX-License-Identifier: GPL-2.0-or-later */ +/* SPDX-License-Identifier: GPL-2.0-or-later WITH Linux-syscall-note */ /* * Copyright (c) 2017, Oracle and/or its affiliates. All rights reserved. 
*/ diff --git a/arch/x86/include/uapi/asm/byteorder.h b/arch/x86/include/uapi/asm/byteorder.h index 484e3cfd7ef2..149143cab9ff 100644 --- a/arch/x86/include/uapi/asm/byteorder.h +++ b/arch/x86/include/uapi/asm/byteorder.h @@ -1,4 +1,4 @@ -/* SPDX-License-Identifier: GPL-2.0 */ +/* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note */ #ifndef _ASM_X86_BYTEORDER_H #define _ASM_X86_BYTEORDER_H diff --git a/arch/x86/include/uapi/asm/hwcap2.h b/arch/x86/include/uapi/asm/hwcap2.h index 6ebaae90e207..8b2effe6efb8 100644 --- a/arch/x86/include/uapi/asm/hwcap2.h +++ b/arch/x86/include/uapi/asm/hwcap2.h @@ -1,4 +1,4 @@ -/* SPDX-License-Identifier: GPL-2.0 */ +/* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note */ #ifndef _ASM_X86_HWCAP2_H #define _ASM_X86_HWCAP2_H diff --git a/arch/x86/include/uapi/asm/sigcontext32.h b/arch/x86/include/uapi/asm/sigcontext32.h index 6b18e88de8a6..7114801d0499 100644 --- a/arch/x86/include/uapi/asm/sigcontext32.h +++ b/arch/x86/include/uapi/asm/sigcontext32.h @@ -1,4 +1,4 @@ -/* SPDX-License-Identifier: GPL-2.0 */ +/* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note */ #ifndef _ASM_X86_SIGCONTEXT32_H #define _ASM_X86_SIGCONTEXT32_H diff --git a/arch/x86/include/uapi/asm/types.h b/arch/x86/include/uapi/asm/types.h index df55e1ddb0c9..9d5c11a24279 100644 --- a/arch/x86/include/uapi/asm/types.h +++ b/arch/x86/include/uapi/asm/types.h @@ -1,4 +1,4 @@ -/* SPDX-License-Identifier: GPL-2.0 */ +/* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note */ #ifndef _ASM_X86_TYPES_H #define _ASM_X86_TYPES_H diff --git a/include/uapi/linux/bpfilter.h b/include/uapi/linux/bpfilter.h index 2ec3cc99ea4c..cbc1f5813f50 100644 --- a/include/uapi/linux/bpfilter.h +++ b/include/uapi/linux/bpfilter.h @@ -1,4 +1,4 @@ -/* SPDX-License-Identifier: GPL-2.0 */ +/* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note */ #ifndef _UAPI_LINUX_BPFILTER_H #define _UAPI_LINUX_BPFILTER_H diff --git a/include/uapi/linux/ipmi_bmc.h 
b/include/uapi/linux/ipmi_bmc.h index 1670f0944227..782a03eb1086 100644 --- a/include/uapi/linux/ipmi_bmc.h +++ b/include/uapi/linux/ipmi_bmc.h @@ -1,4 +1,4 @@ -/* SPDX-License-Identifier: GPL-2.0 */ +/* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note */ /* * Copyright (c) 2015-2018, Intel Corporation. */ diff --git a/include/uapi/linux/isst_if.h b/include/uapi/linux/isst_if.h index d10b832c58c5..0a52b7b093d3 100644 --- a/include/uapi/linux/isst_if.h +++ b/include/uapi/linux/isst_if.h @@ -1,4 +1,4 @@ -/* SPDX-License-Identifier: GPL-2.0 */ +/* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note */ /* * Intel Speed Select Interface: OS to hardware Interface * Copyright (c) 2019, Intel Corporation. diff --git a/include/uapi/linux/netfilter/nf_synproxy.h b/include/uapi/linux/netfilter/nf_synproxy.h index 6f3791c8946f..00d787f0260e 100644 --- a/include/uapi/linux/netfilter/nf_synproxy.h +++ b/include/uapi/linux/netfilter/nf_synproxy.h @@ -1,4 +1,4 @@ -/* SPDX-License-Identifier: GPL-2.0 */ +/* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note */ #ifndef _NF_SYNPROXY_H #define _NF_SYNPROXY_H diff --git a/include/uapi/linux/psp-sev.h b/include/uapi/linux/psp-sev.h index 8654b2442f6a..592a0c1b77c9 100644 --- a/include/uapi/linux/psp-sev.h +++ b/include/uapi/linux/psp-sev.h @@ -1,4 +1,4 @@ -/* SPDX-License-Identifier: GPL-2.0-only */ +/* SPDX-License-Identifier: GPL-2.0-only WITH Linux-syscall-note */ /* * Userspace interface for AMD Secure Encrypted Virtualization (SEV) * platform management commands. diff --git a/include/uapi/linux/rxrpc.h b/include/uapi/linux/rxrpc.h index 782069dcf607..4accfa7e266d 100644 --- a/include/uapi/linux/rxrpc.h +++ b/include/uapi/linux/rxrpc.h @@ -1,4 +1,4 @@ -/* SPDX-License-Identifier: GPL-2.0-or-later */ +/* SPDX-License-Identifier: GPL-2.0-or-later WITH Linux-syscall-note */ /* Types and definitions for AF_RXRPC. * * Copyright (C) 2007 Red Hat, Inc. All Rights Reserved. 
diff --git a/include/uapi/linux/usb/g_uvc.h b/include/uapi/linux/usb/g_uvc.h index 3c9ee3020cbb..652f169a019e 100644 --- a/include/uapi/linux/usb/g_uvc.h +++ b/include/uapi/linux/usb/g_uvc.h @@ -1,4 +1,4 @@ -/* SPDX-License-Identifier: GPL-2.0+ */ +/* SPDX-License-Identifier: GPL-2.0+ WITH Linux-syscall-note */ /* * g_uvc.h -- USB Video Class Gadget driver API * diff --git a/include/uapi/linux/vbox_vmmdev_types.h b/include/uapi/linux/vbox_vmmdev_types.h index 26f39816af14..c27289fd619a 100644 --- a/include/uapi/linux/vbox_vmmdev_types.h +++ b/include/uapi/linux/vbox_vmmdev_types.h @@ -1,4 +1,4 @@ -/* SPDX-License-Identifier: (GPL-2.0 OR CDDL-1.0) */ +/* SPDX-License-Identifier: ((GPL-2.0 WITH Linux-syscall-note) OR CDDL-1.0) */ /* * Virtual Device for Guest <-> VMM/Host communication, type definitions * which are also used for the vboxguest ioctl interface / by vboxsf diff --git a/include/uapi/linux/vboxguest.h b/include/uapi/linux/vboxguest.h index 612f0c7d3558..9cec58a6a5ea 100644 --- a/include/uapi/linux/vboxguest.h +++ b/include/uapi/linux/vboxguest.h @@ -1,4 +1,4 @@ -/* SPDX-License-Identifier: (GPL-2.0 OR CDDL-1.0) */ +/* SPDX-License-Identifier: ((GPL-2.0 WITH Linux-syscall-note) OR CDDL-1.0) */ /* * VBoxGuest - VirtualBox Guest Additions Driver Interface. * diff --git a/include/uapi/linux/virtio_pmem.h b/include/uapi/linux/virtio_pmem.h index 9a63ed6d062f..b022787ffb94 100644 --- a/include/uapi/linux/virtio_pmem.h +++ b/include/uapi/linux/virtio_pmem.h @@ -1,4 +1,4 @@ -/* SPDX-License-Identifier: GPL-2.0 OR BSD-3-Clause */ +/* SPDX-License-Identifier: (GPL-2.0 WITH Linux-syscall-note) OR BSD-3-Clause */ /* * Definitions for virtio-pmem devices. 
* diff --git a/include/uapi/linux/vmcore.h b/include/uapi/linux/vmcore.h index 022619668e0e..3e9da91866ff 100644 --- a/include/uapi/linux/vmcore.h +++ b/include/uapi/linux/vmcore.h @@ -1,4 +1,4 @@ -/* SPDX-License-Identifier: GPL-2.0 */ +/* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note */ #ifndef _UAPI_VMCORE_H #define _UAPI_VMCORE_H diff --git a/include/uapi/linux/wmi.h b/include/uapi/linux/wmi.h index c36f2d7675a4..7085c5dca9fa 100644 --- a/include/uapi/linux/wmi.h +++ b/include/uapi/linux/wmi.h @@ -1,4 +1,4 @@ -/* SPDX-License-Identifier: GPL-2.0-only */ +/* SPDX-License-Identifier: GPL-2.0-only WITH Linux-syscall-note */ /* * User API methods for ACPI-WMI mapping driver * diff --git a/include/uapi/misc/fastrpc.h b/include/uapi/misc/fastrpc.h index 6d701af9fc42..fb792e882cef 100644 --- a/include/uapi/misc/fastrpc.h +++ b/include/uapi/misc/fastrpc.h @@ -1,4 +1,4 @@ -/* SPDX-License-Identifier: GPL-2.0 */ +/* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note */ #ifndef __QCOM_FASTRPC_H__ #define __QCOM_FASTRPC_H__ diff --git a/include/uapi/rdma/rvt-abi.h b/include/uapi/rdma/rvt-abi.h index 7328293c715c..7c05a02d2be5 100644 --- a/include/uapi/rdma/rvt-abi.h +++ b/include/uapi/rdma/rvt-abi.h @@ -1,4 +1,4 @@ -/* SPDX-License-Identifier: (GPL-2.0 OR BSD-3-Clause) */ +/* SPDX-License-Identifier: ((GPL-2.0 WITH Linux-syscall-note) OR BSD-3-Clause) */ /* * This file contains defines, structures, etc. 
that are used diff --git a/include/uapi/rdma/siw-abi.h b/include/uapi/rdma/siw-abi.h index 3dd8071ace7b..7de68f1dc707 100644 --- a/include/uapi/rdma/siw-abi.h +++ b/include/uapi/rdma/siw-abi.h @@ -1,4 +1,4 @@ -/* SPDX-License-Identifier: GPL-2.0 or BSD-3-Clause */ +/* SPDX-License-Identifier: (GPL-2.0 WITH Linux-syscall-note) or BSD-3-Clause */ /* Authors: Bernard Metzler */ /* Copyright (c) 2008-2019, IBM Corporation */ diff --git a/include/uapi/scsi/scsi_bsg_ufs.h b/include/uapi/scsi/scsi_bsg_ufs.h index 17c7abd0803a..9988db6ad244 100644 --- a/include/uapi/scsi/scsi_bsg_ufs.h +++ b/include/uapi/scsi/scsi_bsg_ufs.h @@ -1,4 +1,4 @@ -/* SPDX-License-Identifier: GPL-2.0 */ +/* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note */ /* * UFS Transport SGIO v4 BSG Message Support * diff --git a/include/uapi/sound/skl-tplg-interface.h b/include/uapi/sound/skl-tplg-interface.h index f39352cef382..9eee32f5e407 100644 --- a/include/uapi/sound/skl-tplg-interface.h +++ b/include/uapi/sound/skl-tplg-interface.h @@ -1,4 +1,4 @@ -/* SPDX-License-Identifier: GPL-2.0 */ +/* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note */ /* * skl-tplg-interface.h - Intel DSP FW private data interface * -- cgit v1.2.3-71-gd317 From 4a2b8560e3dff8637ccb09524650864f60ebab7f Mon Sep 17 00:00:00 2001 From: Linus Walleij Date: Mon, 22 Jul 2019 08:51:46 +0200 Subject: tty: serial: netx: Delete driver The Netx ARM machine was deleted from the kernel. This driver had no users and has to go. 
Cc: Robert Schwebel Cc: Sascha Hauer Signed-off-by: Linus Walleij Signed-off-by: Arnd Bergmann Link: https://lore.kernel.org/r/20190722065146.4844-1-linus.walleij@linaro.org Signed-off-by: Greg Kroah-Hartman --- drivers/tty/serial/Kconfig | 19 - drivers/tty/serial/Makefile | 1 - drivers/tty/serial/netx-serial.c | 733 --------------------------------------- include/uapi/linux/serial_core.h | 3 - 4 files changed, 756 deletions(-) delete mode 100644 drivers/tty/serial/netx-serial.c (limited to 'include/uapi/linux') diff --git a/drivers/tty/serial/Kconfig b/drivers/tty/serial/Kconfig index fd385c8c53a5..3083dbae35f7 100644 --- a/drivers/tty/serial/Kconfig +++ b/drivers/tty/serial/Kconfig @@ -1035,25 +1035,6 @@ config SERIAL_VT8500_CONSOLE depends on SERIAL_VT8500=y select SERIAL_CORE_CONSOLE -config SERIAL_NETX - tristate "NetX serial port support" - depends on ARCH_NETX - select SERIAL_CORE - help - If you have a machine based on a Hilscher NetX SoC you - can enable its onboard serial port by enabling this option. - - To compile this driver as a module, choose M here: the - module will be called netx-serial. - -config SERIAL_NETX_CONSOLE - bool "Console on NetX serial port" - depends on SERIAL_NETX=y - select SERIAL_CORE_CONSOLE - help - If you have enabled the serial port on the Hilscher NetX SoC - you can make it the console by answering Y to this option. 
- config SERIAL_OMAP tristate "OMAP serial port support" depends on ARCH_OMAP2PLUS diff --git a/drivers/tty/serial/Makefile b/drivers/tty/serial/Makefile index 7cd7cabfa6c4..15a0fccadf7e 100644 --- a/drivers/tty/serial/Makefile +++ b/drivers/tty/serial/Makefile @@ -59,7 +59,6 @@ obj-$(CONFIG_SERIAL_ATMEL) += atmel_serial.o obj-$(CONFIG_SERIAL_UARTLITE) += uartlite.o obj-$(CONFIG_SERIAL_MSM) += msm_serial.o obj-$(CONFIG_SERIAL_QCOM_GENI) += qcom_geni_serial.o -obj-$(CONFIG_SERIAL_NETX) += netx-serial.o obj-$(CONFIG_SERIAL_KS8695) += serial_ks8695.o obj-$(CONFIG_SERIAL_OMAP) += omap-serial.o obj-$(CONFIG_SERIAL_ALTERA_UART) += altera_uart.o diff --git a/drivers/tty/serial/netx-serial.c b/drivers/tty/serial/netx-serial.c deleted file mode 100644 index b3556863491f..000000000000 --- a/drivers/tty/serial/netx-serial.c +++ /dev/null @@ -1,733 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0 -/* - * Copyright (c) 2005 Sascha Hauer , Pengutronix - */ - -#if defined(CONFIG_SERIAL_NETX_CONSOLE) && defined(CONFIG_MAGIC_SYSRQ) -#define SUPPORT_SYSRQ -#endif - -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -#include -#include -#include -#include - -/* We've been assigned a range on the "Low-density serial ports" major */ -#define SERIAL_NX_MAJOR 204 -#define MINOR_START 170 - -enum uart_regs { - UART_DR = 0x00, - UART_SR = 0x04, - UART_LINE_CR = 0x08, - UART_BAUDDIV_MSB = 0x0c, - UART_BAUDDIV_LSB = 0x10, - UART_CR = 0x14, - UART_FR = 0x18, - UART_IIR = 0x1c, - UART_ILPR = 0x20, - UART_RTS_CR = 0x24, - UART_RTS_LEAD = 0x28, - UART_RTS_TRAIL = 0x2c, - UART_DRV_ENABLE = 0x30, - UART_BRM_CR = 0x34, - UART_RXFIFO_IRQLEVEL = 0x38, - UART_TXFIFO_IRQLEVEL = 0x3c, -}; - -#define SR_FE (1<<0) -#define SR_PE (1<<1) -#define SR_BE (1<<2) -#define SR_OE (1<<3) - -#define LINE_CR_BRK (1<<0) -#define LINE_CR_PEN (1<<1) -#define LINE_CR_EPS (1<<2) -#define LINE_CR_STP2 (1<<3) -#define LINE_CR_FEN (1<<4) -#define LINE_CR_5BIT (0<<5) 
-#define LINE_CR_6BIT (1<<5) -#define LINE_CR_7BIT (2<<5) -#define LINE_CR_8BIT (3<<5) -#define LINE_CR_BITS_MASK (3<<5) - -#define CR_UART_EN (1<<0) -#define CR_SIREN (1<<1) -#define CR_SIRLP (1<<2) -#define CR_MSIE (1<<3) -#define CR_RIE (1<<4) -#define CR_TIE (1<<5) -#define CR_RTIE (1<<6) -#define CR_LBE (1<<7) - -#define FR_CTS (1<<0) -#define FR_DSR (1<<1) -#define FR_DCD (1<<2) -#define FR_BUSY (1<<3) -#define FR_RXFE (1<<4) -#define FR_TXFF (1<<5) -#define FR_RXFF (1<<6) -#define FR_TXFE (1<<7) - -#define IIR_MIS (1<<0) -#define IIR_RIS (1<<1) -#define IIR_TIS (1<<2) -#define IIR_RTIS (1<<3) -#define IIR_MASK 0xf - -#define RTS_CR_AUTO (1<<0) -#define RTS_CR_RTS (1<<1) -#define RTS_CR_COUNT (1<<2) -#define RTS_CR_MOD2 (1<<3) -#define RTS_CR_RTS_POL (1<<4) -#define RTS_CR_CTS_CTR (1<<5) -#define RTS_CR_CTS_POL (1<<6) -#define RTS_CR_STICK (1<<7) - -#define UART_PORT_SIZE 0x40 -#define DRIVER_NAME "netx-uart" - -struct netx_port { - struct uart_port port; -}; - -static void netx_stop_tx(struct uart_port *port) -{ - unsigned int val; - val = readl(port->membase + UART_CR); - writel(val & ~CR_TIE, port->membase + UART_CR); -} - -static void netx_stop_rx(struct uart_port *port) -{ - unsigned int val; - val = readl(port->membase + UART_CR); - writel(val & ~CR_RIE, port->membase + UART_CR); -} - -static void netx_enable_ms(struct uart_port *port) -{ - unsigned int val; - val = readl(port->membase + UART_CR); - writel(val | CR_MSIE, port->membase + UART_CR); -} - -static inline void netx_transmit_buffer(struct uart_port *port) -{ - struct circ_buf *xmit = &port->state->xmit; - - if (port->x_char) { - writel(port->x_char, port->membase + UART_DR); - port->icount.tx++; - port->x_char = 0; - return; - } - - if (uart_tx_stopped(port) || uart_circ_empty(xmit)) { - netx_stop_tx(port); - return; - } - - do { - /* send xmit->buf[xmit->tail] - * out the port here */ - writel(xmit->buf[xmit->tail], port->membase + UART_DR); - xmit->tail = (xmit->tail + 1) & - (UART_XMIT_SIZE 
- 1); - port->icount.tx++; - if (uart_circ_empty(xmit)) - break; - } while (!(readl(port->membase + UART_FR) & FR_TXFF)); - - if (uart_circ_empty(xmit)) - netx_stop_tx(port); -} - -static void netx_start_tx(struct uart_port *port) -{ - writel( - readl(port->membase + UART_CR) | CR_TIE, port->membase + UART_CR); - - if (!(readl(port->membase + UART_FR) & FR_TXFF)) - netx_transmit_buffer(port); -} - -static unsigned int netx_tx_empty(struct uart_port *port) -{ - return readl(port->membase + UART_FR) & FR_BUSY ? 0 : TIOCSER_TEMT; -} - -static void netx_txint(struct uart_port *port) -{ - struct circ_buf *xmit = &port->state->xmit; - - if (uart_circ_empty(xmit) || uart_tx_stopped(port)) { - netx_stop_tx(port); - return; - } - - netx_transmit_buffer(port); - - if (uart_circ_chars_pending(xmit) < WAKEUP_CHARS) - uart_write_wakeup(port); -} - -static void netx_rxint(struct uart_port *port, unsigned long *flags) -{ - unsigned char rx, flg, status; - - while (!(readl(port->membase + UART_FR) & FR_RXFE)) { - rx = readl(port->membase + UART_DR); - flg = TTY_NORMAL; - port->icount.rx++; - status = readl(port->membase + UART_SR); - if (status & SR_BE) { - writel(0, port->membase + UART_SR); - if (uart_handle_break(port)) - continue; - } - - if (unlikely(status & (SR_FE | SR_PE | SR_OE))) { - - if (status & SR_PE) - port->icount.parity++; - else if (status & SR_FE) - port->icount.frame++; - if (status & SR_OE) - port->icount.overrun++; - - status &= port->read_status_mask; - - if (status & SR_BE) - flg = TTY_BREAK; - else if (status & SR_PE) - flg = TTY_PARITY; - else if (status & SR_FE) - flg = TTY_FRAME; - } - - if (uart_handle_sysrq_char(port, rx)) - continue; - - uart_insert_char(port, status, SR_OE, rx, flg); - } - - spin_unlock_irqrestore(&port->lock, *flags); - tty_flip_buffer_push(&port->state->port); - spin_lock_irqsave(&port->lock, *flags); -} - -static irqreturn_t netx_int(int irq, void *dev_id) -{ - struct uart_port *port = dev_id; - unsigned long flags; - unsigned 
char status; - - spin_lock_irqsave(&port->lock,flags); - - status = readl(port->membase + UART_IIR) & IIR_MASK; - while (status) { - if (status & IIR_RIS) - netx_rxint(port, &flags); - if (status & IIR_TIS) - netx_txint(port); - if (status & IIR_MIS) { - if (readl(port->membase + UART_FR) & FR_CTS) - uart_handle_cts_change(port, 1); - else - uart_handle_cts_change(port, 0); - } - writel(0, port->membase + UART_IIR); - status = readl(port->membase + UART_IIR) & IIR_MASK; - } - - spin_unlock_irqrestore(&port->lock,flags); - return IRQ_HANDLED; -} - -static unsigned int netx_get_mctrl(struct uart_port *port) -{ - unsigned int ret = TIOCM_DSR | TIOCM_CAR; - - if (readl(port->membase + UART_FR) & FR_CTS) - ret |= TIOCM_CTS; - - return ret; -} - -static void netx_set_mctrl(struct uart_port *port, unsigned int mctrl) -{ - unsigned int val; - - /* FIXME: Locking needed ? */ - if (mctrl & TIOCM_RTS) { - val = readl(port->membase + UART_RTS_CR); - writel(val | RTS_CR_RTS, port->membase + UART_RTS_CR); - } -} - -static void netx_break_ctl(struct uart_port *port, int break_state) -{ - unsigned int line_cr; - spin_lock_irq(&port->lock); - - line_cr = readl(port->membase + UART_LINE_CR); - if (break_state != 0) - line_cr |= LINE_CR_BRK; - else - line_cr &= ~LINE_CR_BRK; - writel(line_cr, port->membase + UART_LINE_CR); - - spin_unlock_irq(&port->lock); -} - -static int netx_startup(struct uart_port *port) -{ - int ret; - - ret = request_irq(port->irq, netx_int, 0, - DRIVER_NAME, port); - if (ret) { - dev_err(port->dev, "unable to grab irq%d\n",port->irq); - goto exit; - } - - writel(readl(port->membase + UART_LINE_CR) | LINE_CR_FEN, - port->membase + UART_LINE_CR); - - writel(CR_MSIE | CR_RIE | CR_TIE | CR_RTIE | CR_UART_EN, - port->membase + UART_CR); - -exit: - return ret; -} - -static void netx_shutdown(struct uart_port *port) -{ - writel(0, port->membase + UART_CR) ; - - free_irq(port->irq, port); -} - -static void -netx_set_termios(struct uart_port *port, struct ktermios 
*termios, - struct ktermios *old) -{ - unsigned int baud, quot; - unsigned char old_cr; - unsigned char line_cr = LINE_CR_FEN; - unsigned char rts_cr = 0; - - switch (termios->c_cflag & CSIZE) { - case CS5: - line_cr |= LINE_CR_5BIT; - break; - case CS6: - line_cr |= LINE_CR_6BIT; - break; - case CS7: - line_cr |= LINE_CR_7BIT; - break; - case CS8: - line_cr |= LINE_CR_8BIT; - break; - } - - if (termios->c_cflag & CSTOPB) - line_cr |= LINE_CR_STP2; - - if (termios->c_cflag & PARENB) { - line_cr |= LINE_CR_PEN; - if (!(termios->c_cflag & PARODD)) - line_cr |= LINE_CR_EPS; - } - - if (termios->c_cflag & CRTSCTS) - rts_cr = RTS_CR_AUTO | RTS_CR_CTS_CTR | RTS_CR_RTS_POL; - - baud = uart_get_baud_rate(port, termios, old, 0, port->uartclk/16); - quot = baud * 4096; - quot /= 1000; - quot *= 256; - quot /= 100000; - - spin_lock_irq(&port->lock); - - uart_update_timeout(port, termios->c_cflag, baud); - - old_cr = readl(port->membase + UART_CR); - - /* disable interrupts */ - writel(old_cr & ~(CR_MSIE | CR_RIE | CR_TIE | CR_RTIE), - port->membase + UART_CR); - - /* drain transmitter */ - while (readl(port->membase + UART_FR) & FR_BUSY); - - /* disable UART */ - writel(old_cr & ~CR_UART_EN, port->membase + UART_CR); - - /* modem status interrupts */ - old_cr &= ~CR_MSIE; - if (UART_ENABLE_MS(port, termios->c_cflag)) - old_cr |= CR_MSIE; - - writel((quot>>8) & 0xff, port->membase + UART_BAUDDIV_MSB); - writel(quot & 0xff, port->membase + UART_BAUDDIV_LSB); - writel(line_cr, port->membase + UART_LINE_CR); - - writel(rts_cr, port->membase + UART_RTS_CR); - - /* - * Characters to ignore - */ - port->ignore_status_mask = 0; - if (termios->c_iflag & IGNPAR) - port->ignore_status_mask |= SR_PE; - if (termios->c_iflag & IGNBRK) { - port->ignore_status_mask |= SR_BE; - /* - * If we're ignoring parity and break indicators, - * ignore overruns too (for real raw support). 
- */ - if (termios->c_iflag & IGNPAR) - port->ignore_status_mask |= SR_PE; - } - - port->read_status_mask = 0; - if (termios->c_iflag & (IGNBRK | BRKINT | PARMRK)) - port->read_status_mask |= SR_BE; - if (termios->c_iflag & INPCK) - port->read_status_mask |= SR_PE | SR_FE; - - writel(old_cr, port->membase + UART_CR); - - spin_unlock_irq(&port->lock); -} - -static const char *netx_type(struct uart_port *port) -{ - return port->type == PORT_NETX ? "NETX" : NULL; -} - -static void netx_release_port(struct uart_port *port) -{ - release_mem_region(port->mapbase, UART_PORT_SIZE); -} - -static int netx_request_port(struct uart_port *port) -{ - return request_mem_region(port->mapbase, UART_PORT_SIZE, - DRIVER_NAME) != NULL ? 0 : -EBUSY; -} - -static void netx_config_port(struct uart_port *port, int flags) -{ - if (flags & UART_CONFIG_TYPE && netx_request_port(port) == 0) - port->type = PORT_NETX; -} - -static int -netx_verify_port(struct uart_port *port, struct serial_struct *ser) -{ - int ret = 0; - - if (ser->type != PORT_UNKNOWN && ser->type != PORT_NETX) - ret = -EINVAL; - - return ret; -} - -static struct uart_ops netx_pops = { - .tx_empty = netx_tx_empty, - .set_mctrl = netx_set_mctrl, - .get_mctrl = netx_get_mctrl, - .stop_tx = netx_stop_tx, - .start_tx = netx_start_tx, - .stop_rx = netx_stop_rx, - .enable_ms = netx_enable_ms, - .break_ctl = netx_break_ctl, - .startup = netx_startup, - .shutdown = netx_shutdown, - .set_termios = netx_set_termios, - .type = netx_type, - .release_port = netx_release_port, - .request_port = netx_request_port, - .config_port = netx_config_port, - .verify_port = netx_verify_port, -}; - -static struct netx_port netx_ports[] = { - { - .port = { - .type = PORT_NETX, - .iotype = UPIO_MEM, - .membase = (char __iomem *)io_p2v(NETX_PA_UART0), - .mapbase = NETX_PA_UART0, - .irq = NETX_IRQ_UART0, - .uartclk = 100000000, - .fifosize = 16, - .flags = UPF_BOOT_AUTOCONF, - .ops = &netx_pops, - .line = 0, - }, - }, { - .port = { - .type = PORT_NETX, - 
.iotype = UPIO_MEM, - .membase = (char __iomem *)io_p2v(NETX_PA_UART1), - .mapbase = NETX_PA_UART1, - .irq = NETX_IRQ_UART1, - .uartclk = 100000000, - .fifosize = 16, - .flags = UPF_BOOT_AUTOCONF, - .ops = &netx_pops, - .line = 1, - }, - }, { - .port = { - .type = PORT_NETX, - .iotype = UPIO_MEM, - .membase = (char __iomem *)io_p2v(NETX_PA_UART2), - .mapbase = NETX_PA_UART2, - .irq = NETX_IRQ_UART2, - .uartclk = 100000000, - .fifosize = 16, - .flags = UPF_BOOT_AUTOCONF, - .ops = &netx_pops, - .line = 2, - }, - } -}; - -#ifdef CONFIG_SERIAL_NETX_CONSOLE - -static void netx_console_putchar(struct uart_port *port, int ch) -{ - while (readl(port->membase + UART_FR) & FR_BUSY); - writel(ch, port->membase + UART_DR); -} - -static void -netx_console_write(struct console *co, const char *s, unsigned int count) -{ - struct uart_port *port = &netx_ports[co->index].port; - unsigned char cr_save; - - cr_save = readl(port->membase + UART_CR); - writel(cr_save | CR_UART_EN, port->membase + UART_CR); - - uart_console_write(port, s, count, netx_console_putchar); - - while (readl(port->membase + UART_FR) & FR_BUSY); - writel(cr_save, port->membase + UART_CR); -} - -static void __init -netx_console_get_options(struct uart_port *port, int *baud, - int *parity, int *bits, int *flow) -{ - unsigned char line_cr; - - *baud = (readl(port->membase + UART_BAUDDIV_MSB) << 8) | - readl(port->membase + UART_BAUDDIV_LSB); - *baud *= 1000; - *baud /= 4096; - *baud *= 1000; - *baud /= 256; - *baud *= 100; - - line_cr = readl(port->membase + UART_LINE_CR); - *parity = 'n'; - if (line_cr & LINE_CR_PEN) { - if (line_cr & LINE_CR_EPS) - *parity = 'e'; - else - *parity = 'o'; - } - - switch (line_cr & LINE_CR_BITS_MASK) { - case LINE_CR_8BIT: - *bits = 8; - break; - case LINE_CR_7BIT: - *bits = 7; - break; - case LINE_CR_6BIT: - *bits = 6; - break; - case LINE_CR_5BIT: - *bits = 5; - break; - } - - if (readl(port->membase + UART_RTS_CR) & RTS_CR_AUTO) - *flow = 'r'; -} - -static int __init 
-netx_console_setup(struct console *co, char *options) -{ - struct netx_port *sport; - int baud = 9600; - int bits = 8; - int parity = 'n'; - int flow = 'n'; - - /* - * Check whether an invalid uart number has been specified, and - * if so, search for the first available port that does have - * console support. - */ - if (co->index == -1 || co->index >= ARRAY_SIZE(netx_ports)) - co->index = 0; - sport = &netx_ports[co->index]; - - if (options) { - uart_parse_options(options, &baud, &parity, &bits, &flow); - } else { - /* if the UART is enabled, assume it has been correctly setup - * by the bootloader and get the options - */ - if (readl(sport->port.membase + UART_CR) & CR_UART_EN) { - netx_console_get_options(&sport->port, &baud, - &parity, &bits, &flow); - } - - } - - return uart_set_options(&sport->port, co, baud, parity, bits, flow); -} - -static struct uart_driver netx_reg; -static struct console netx_console = { - .name = "ttyNX", - .write = netx_console_write, - .device = uart_console_device, - .setup = netx_console_setup, - .flags = CON_PRINTBUFFER, - .index = -1, - .data = &netx_reg, -}; - -static int __init netx_console_init(void) -{ - register_console(&netx_console); - return 0; -} -console_initcall(netx_console_init); - -#define NETX_CONSOLE &netx_console -#else -#define NETX_CONSOLE NULL -#endif - -static struct uart_driver netx_reg = { - .owner = THIS_MODULE, - .driver_name = DRIVER_NAME, - .dev_name = "ttyNX", - .major = SERIAL_NX_MAJOR, - .minor = MINOR_START, - .nr = ARRAY_SIZE(netx_ports), - .cons = NETX_CONSOLE, -}; - -static int serial_netx_suspend(struct platform_device *pdev, pm_message_t state) -{ - struct netx_port *sport = platform_get_drvdata(pdev); - - if (sport) - uart_suspend_port(&netx_reg, &sport->port); - - return 0; -} - -static int serial_netx_resume(struct platform_device *pdev) -{ - struct netx_port *sport = platform_get_drvdata(pdev); - - if (sport) - uart_resume_port(&netx_reg, &sport->port); - - return 0; -} - -static int 
serial_netx_probe(struct platform_device *pdev) -{ - struct uart_port *port = &netx_ports[pdev->id].port; - - dev_info(&pdev->dev, "initialising\n"); - - port->dev = &pdev->dev; - - writel(1, port->membase + UART_RXFIFO_IRQLEVEL); - uart_add_one_port(&netx_reg, &netx_ports[pdev->id].port); - platform_set_drvdata(pdev, &netx_ports[pdev->id]); - - return 0; -} - -static int serial_netx_remove(struct platform_device *pdev) -{ - struct netx_port *sport = platform_get_drvdata(pdev); - - if (sport) - uart_remove_one_port(&netx_reg, &sport->port); - - return 0; -} - -static struct platform_driver serial_netx_driver = { - .probe = serial_netx_probe, - .remove = serial_netx_remove, - - .suspend = serial_netx_suspend, - .resume = serial_netx_resume, - - .driver = { - .name = DRIVER_NAME, - }, -}; - -static int __init netx_serial_init(void) -{ - int ret; - - printk(KERN_INFO "Serial: NetX driver\n"); - - ret = uart_register_driver(&netx_reg); - if (ret) - return ret; - - ret = platform_driver_register(&serial_netx_driver); - if (ret != 0) - uart_unregister_driver(&netx_reg); - - return 0; -} - -static void __exit netx_serial_exit(void) -{ - platform_driver_unregister(&serial_netx_driver); - uart_unregister_driver(&netx_reg); -} - -module_init(netx_serial_init); -module_exit(netx_serial_exit); - -MODULE_AUTHOR("Sascha Hauer"); -MODULE_DESCRIPTION("NetX serial port driver"); -MODULE_LICENSE("GPL"); -MODULE_ALIAS("platform:" DRIVER_NAME); diff --git a/include/uapi/linux/serial_core.h b/include/uapi/linux/serial_core.h index 5642c05e0da0..3cc3af1c2ee1 100644 --- a/include/uapi/linux/serial_core.h +++ b/include/uapi/linux/serial_core.h @@ -150,9 +150,6 @@ #define PORT_PNX8XXX 70 -/* Hilscher netx */ -#define PORT_NETX 71 - /* SUN4V Hypervisor Console */ #define PORT_SUNHV 72 -- cgit v1.2.3-71-gd317 From 91826ba13855f73e252fef68369b3b0e1ed25253 Mon Sep 17 00:00:00 2001 From: Masahiro Yamada Date: Mon, 29 Jul 2019 00:51:38 +0900 Subject: netfilter: add include guard to 
xt_connlabel.h Add a header include guard just in case. Signed-off-by: Masahiro Yamada Acked-by: Florian Westphal Signed-off-by: Pablo Neira Ayuso --- include/uapi/linux/netfilter/xt_connlabel.h | 6 ++++++ 1 file changed, 6 insertions(+) (limited to 'include/uapi/linux') diff --git a/include/uapi/linux/netfilter/xt_connlabel.h b/include/uapi/linux/netfilter/xt_connlabel.h index 2312f0ec07b2..323f0dfc2a4e 100644 --- a/include/uapi/linux/netfilter/xt_connlabel.h +++ b/include/uapi/linux/netfilter/xt_connlabel.h @@ -1,4 +1,8 @@ /* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note */ + +#ifndef _UAPI_XT_CONNLABEL_H +#define _UAPI_XT_CONNLABEL_H + #include #define XT_CONNLABEL_MAXBIT 127 @@ -11,3 +15,5 @@ struct xt_connlabel_mtinfo { __u16 bit; __u16 options; }; + +#endif /* _UAPI_XT_CONNLABEL_H */ -- cgit v1.2.3-71-gd317 From 5e5412c365a32e452daa762eac36121cb8a370bb Mon Sep 17 00:00:00 2001 From: Qian Cai Date: Tue, 30 Jul 2019 11:30:33 -0400 Subject: net/socket: fix GCC8+ Wpacked-not-aligned warnings There are a lot of those warnings with GCC8+ 64-bit, In file included from ./include/linux/sctp.h:42, from net/core/skbuff.c:47: ./include/uapi/linux/sctp.h:395:1: warning: alignment 4 of 'struct sctp_paddr_change' is less than 8 [-Wpacked-not-aligned] } __attribute__((packed, aligned(4))); ^ ./include/uapi/linux/sctp.h:728:1: warning: alignment 4 of 'struct sctp_setpeerprim' is less than 8 [-Wpacked-not-aligned] } __attribute__((packed, aligned(4))); ^ ./include/uapi/linux/sctp.h:727:26: warning: 'sspp_addr' offset 4 in 'struct sctp_setpeerprim' isn't aligned to 8 [-Wpacked-not-aligned] struct sockaddr_storage sspp_addr; ^~~~~~~~~ ./include/uapi/linux/sctp.h:741:1: warning: alignment 4 of 'struct sctp_prim' is less than 8 [-Wpacked-not-aligned] } __attribute__((packed, aligned(4))); ^ ./include/uapi/linux/sctp.h:740:26: warning: 'ssp_addr' offset 4 in 'struct sctp_prim' isn't aligned to 8 [-Wpacked-not-aligned] struct sockaddr_storage ssp_addr; ^~~~~~~~ 
./include/uapi/linux/sctp.h:792:1: warning: alignment 4 of 'struct sctp_paddrparams' is less than 8 [-Wpacked-not-aligned] } __attribute__((packed, aligned(4))); ^ ./include/uapi/linux/sctp.h:784:26: warning: 'spp_address' offset 4 in 'struct sctp_paddrparams' isn't aligned to 8 [-Wpacked-not-aligned] struct sockaddr_storage spp_address; ^~~~~~~~~~~ ./include/uapi/linux/sctp.h:905:1: warning: alignment 4 of 'struct sctp_paddrinfo' is less than 8 [-Wpacked-not-aligned] } __attribute__((packed, aligned(4))); ^ ./include/uapi/linux/sctp.h:899:26: warning: 'spinfo_address' offset 4 in 'struct sctp_paddrinfo' isn't aligned to 8 [-Wpacked-not-aligned] struct sockaddr_storage spinfo_address; ^~~~~~~~~~~~~~ This is because the commit 20c9c825b12f ("[SCTP] Fix SCTP socket options to work with 32-bit apps on 64-bit kernels.") added "packed, aligned(4)" GCC attributes to some structures, but one of the members, i.e., "struct sockaddr_storage", in those structures has the attribute "aligned(__alignof__ (struct sockaddr *))", which is 8-byte on 64-bit systems, so the commit overrides the designed alignment for "sockaddr_storage". To fix this, "struct sockaddr_storage" needs to be 4-byte aligned, as it is only used in those packed SCTP structures, which are part of the UAPI, while "struct __kernel_sockaddr_storage" is used in some other places in the UAPI whose alignment must not change, in order not to break userspace. Use an implicit alignment for "struct __kernel_sockaddr_storage" so it can keep the same alignment as a member in both packed and un-packed structures without breaking UAPI. Suggested-by: David Laight Signed-off-by: Qian Cai Signed-off-by: David S. 
Miller --- include/uapi/linux/socket.h | 19 +++++++++++++------ 1 file changed, 13 insertions(+), 6 deletions(-) (limited to 'include/uapi/linux') diff --git a/include/uapi/linux/socket.h b/include/uapi/linux/socket.h index 8eb96021709c..c3409c8ec0dd 100644 --- a/include/uapi/linux/socket.h +++ b/include/uapi/linux/socket.h @@ -6,17 +6,24 @@ * Desired design of maximum size and alignment (see RFC2553) */ #define _K_SS_MAXSIZE 128 /* Implementation specific max size */ -#define _K_SS_ALIGNSIZE (__alignof__ (struct sockaddr *)) - /* Implementation specific desired alignment */ typedef unsigned short __kernel_sa_family_t; +/* + * The definition uses anonymous union and struct in order to control the + * default alignment. + */ struct __kernel_sockaddr_storage { - __kernel_sa_family_t ss_family; /* address family */ - /* Following field(s) are implementation specific */ - char __data[_K_SS_MAXSIZE - sizeof(unsigned short)]; + union { + struct { + __kernel_sa_family_t ss_family; /* address family */ + /* Following field(s) are implementation specific */ + char __data[_K_SS_MAXSIZE - sizeof(unsigned short)]; /* space to achieve desired size, */ /* _SS_MAXSIZE value minus size of ss_family */ -} __attribute__ ((aligned(_K_SS_ALIGNSIZE))); /* force desired alignment */ + }; + void *__align; /* implementation specific desired alignment */ + }; +}; #endif /* _UAPI_LINUX_SOCKET_H */ -- cgit v1.2.3-71-gd317 From 4b3e30ed3ec7864e798403a63ff2e96bd0c19ab0 Mon Sep 17 00:00:00 2001 From: Alex Deucher Date: Wed, 7 Aug 2019 00:23:07 -0500 Subject: Revert "drm/amdkfd: New IOCTL to allocate queue GWS" This reverts commit 1a058c3376765ee31d65e28cbbb9d4ff15120056. This interface is still in too much flux. Revert until it's sorted out. 
Acked-by: Oak Zeng Signed-off-by: Alex Deucher --- drivers/gpu/drm/amd/amdkfd/kfd_chardev.c | 28 ---------------------------- include/uapi/linux/kfd_ioctl.h | 20 +------------------- 2 files changed, 1 insertion(+), 47 deletions(-) (limited to 'include/uapi/linux') diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c b/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c index 26b15cc56c31..1d3cd5c50d5f 100644 --- a/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c +++ b/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c @@ -1567,32 +1567,6 @@ copy_from_user_failed: return err; } -static int kfd_ioctl_alloc_queue_gws(struct file *filep, - struct kfd_process *p, void *data) -{ - int retval; - struct kfd_ioctl_alloc_queue_gws_args *args = data; - struct kfd_dev *dev; - - if (!hws_gws_support) - return -ENODEV; - - dev = kfd_device_by_id(args->gpu_id); - if (!dev) { - pr_debug("Could not find gpu id 0x%x\n", args->gpu_id); - return -ENODEV; - } - if (dev->dqm->sched_policy == KFD_SCHED_POLICY_NO_HWS) - return -ENODEV; - - mutex_lock(&p->mutex); - retval = pqm_set_gws(&p->pqm, args->queue_id, args->num_gws ? 
dev->gws : NULL); - mutex_unlock(&p->mutex); - - args->first_gws = 0; - return retval; -} - static int kfd_ioctl_get_dmabuf_info(struct file *filep, struct kfd_process *p, void *data) { @@ -1795,8 +1769,6 @@ static const struct amdkfd_ioctl_desc amdkfd_ioctls[] = { AMDKFD_IOCTL_DEF(AMDKFD_IOC_IMPORT_DMABUF, kfd_ioctl_import_dmabuf, 0), - AMDKFD_IOCTL_DEF(AMDKFD_IOC_ALLOC_QUEUE_GWS, - kfd_ioctl_alloc_queue_gws, 0), }; #define AMDKFD_CORE_IOCTL_COUNT ARRAY_SIZE(amdkfd_ioctls) diff --git a/include/uapi/linux/kfd_ioctl.h b/include/uapi/linux/kfd_ioctl.h index 070d1bc7e725..20917c59f39c 100644 --- a/include/uapi/linux/kfd_ioctl.h +++ b/include/uapi/linux/kfd_ioctl.h @@ -410,21 +410,6 @@ struct kfd_ioctl_unmap_memory_from_gpu_args { __u32 n_success; /* to/from KFD */ }; -/* Allocate GWS for specific queue - * - * @gpu_id: device identifier - * @queue_id: queue's id that GWS is allocated for - * @num_gws: how many GWS to allocate - * @first_gws: index of the first GWS allocated. - * only support contiguous GWS allocation - */ -struct kfd_ioctl_alloc_queue_gws_args { - __u32 gpu_id; /* to KFD */ - __u32 queue_id; /* to KFD */ - __u32 num_gws; /* to KFD */ - __u32 first_gws; /* from KFD */ -}; - struct kfd_ioctl_get_dmabuf_info_args { __u64 size; /* from KFD */ __u64 metadata_ptr; /* to KFD */ @@ -544,10 +529,7 @@ enum kfd_mmio_remap { #define AMDKFD_IOC_IMPORT_DMABUF \ AMDKFD_IOWR(0x1D, struct kfd_ioctl_import_dmabuf_args) -#define AMDKFD_IOC_ALLOC_QUEUE_GWS \ - AMDKFD_IOWR(0x1E, struct kfd_ioctl_alloc_queue_gws_args) - #define AMDKFD_COMMAND_START 0x01 -#define AMDKFD_COMMAND_END 0x1F +#define AMDKFD_COMMAND_END 0x1E #endif -- cgit v1.2.3-71-gd317