From 501db511397fd6efff3aa5b4e8de415b55559550 Mon Sep 17 00:00:00 2001 From: Rolf Neugebauer Date: Tue, 17 Jan 2017 18:13:51 +0000 Subject: virtio: don't set VIRTIO_NET_HDR_F_DATA_VALID on xmit This patch part reverts fd2a0437dc33 and e858fae2b0b8 which introduced a subtle change in how the virtio_net flags are derived from the SKBs ip_summed field. With the above commits, the flags are set to VIRTIO_NET_HDR_F_DATA_VALID when ip_summed == CHECKSUM_UNNECESSARY, thus treating it differently to ip_summed == CHECKSUM_NONE, which should be the same. Further, the virtio spec 1.0 / CS04 explicitly says that VIRTIO_NET_HDR_F_DATA_VALID must not be set by the driver. Fixes: fd2a0437dc33 ("virtio_net: introduce virtio_net_hdr_{from,to}_skb") Fixes: e858fae2b0b8 (" virtio_net: use common code for virtio_net_hdr and skb GSO conversion") Signed-off-by: Rolf Neugebauer Acked-by: Michael S. Tsirkin Signed-off-by: David S. Miller --- include/linux/virtio_net.h | 2 -- 1 file changed, 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/virtio_net.h b/include/linux/virtio_net.h index 66204007d7ac..56436472ccc7 100644 --- a/include/linux/virtio_net.h +++ b/include/linux/virtio_net.h @@ -91,8 +91,6 @@ static inline int virtio_net_hdr_from_skb(const struct sk_buff *skb, skb_checksum_start_offset(skb)); hdr->csum_offset = __cpu_to_virtio16(little_endian, skb->csum_offset); - } else if (skb->ip_summed == CHECKSUM_UNNECESSARY) { - hdr->flags = VIRTIO_NET_HDR_F_DATA_VALID; } /* else everything is zero */ return 0; -- cgit v1.2.3-71-gd317 From d407bd25a204bd66b7346dde24bd3d37ef0e0b05 Mon Sep 17 00:00:00 2001 From: Daniel Borkmann Date: Wed, 18 Jan 2017 15:14:17 +0100 Subject: bpf: don't trigger OOM killer under pressure with map alloc This patch adds two helpers, bpf_map_area_alloc() and bpf_map_area_free(), that are to be used for map allocations. Using kmalloc() for very large allocations can cause excessive work within the page allocator, so i) fall back earlier to vmalloc() when the attempt is considered costly anyway, and even more importantly ii) don't trigger OOM killer with any of the allocators. Since this is based on a user space request, for example, when creating maps with element pre-allocation, we really want such requests to fail instead of killing other user space processes. Also, don't spam the kernel log with warnings should any of the allocations fail under pressure. Given that, we can make backend selection in bpf_map_area_alloc() generic, and convert all maps over to use this API for spots with potentially large allocation requests. Note, replacing the one kmalloc_array() is fine as overflow checks happen earlier in htab_map_alloc(), since it must also protect the multiplication for vmalloc() should kmalloc_array() fail. Signed-off-by: Daniel Borkmann Acked-by: Alexei Starovoitov Signed-off-by: David S. Miller --- include/linux/bpf.h | 2 ++ kernel/bpf/arraymap.c | 18 +++++++----------- kernel/bpf/hashtab.c | 22 +++++++++------------- kernel/bpf/stackmap.c | 20 ++++++++------------ kernel/bpf/syscall.c | 26 ++++++++++++++++++++++++++ 5 files changed, 52 insertions(+), 36 deletions(-) (limited to 'include/linux') diff --git a/include/linux/bpf.h b/include/linux/bpf.h index 05cf951df3fe..3ed1f3b1d594 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -247,6 +247,8 @@ struct bpf_map * __must_check bpf_map_inc(struct bpf_map *map, bool uref); void bpf_map_put_with_uref(struct bpf_map *map); void bpf_map_put(struct bpf_map *map); int bpf_map_precharge_memlock(u32 pages); +void *bpf_map_area_alloc(size_t size); +void bpf_map_area_free(void *base); extern int sysctl_unprivileged_bpf_disabled; diff --git a/kernel/bpf/arraymap.c b/kernel/bpf/arraymap.c index 229a5d5df977..3d55d95dcf49 100644 --- a/kernel/bpf/arraymap.c +++ b/kernel/bpf/arraymap.c @@ -11,7 +11,6 @@ */ #include #include -#include #include #include #include @@ -74,14 +73,10 @@ static struct bpf_map *array_map_alloc(union bpf_attr *attr) if (array_size >= U32_MAX - PAGE_SIZE) return ERR_PTR(-ENOMEM); - /* allocate all map elements and zero-initialize them */ - array = kzalloc(array_size, GFP_USER | __GFP_NOWARN); - if (!array) { - array = vzalloc(array_size); - if (!array) - return ERR_PTR(-ENOMEM); - } + array = bpf_map_area_alloc(array_size); + if (!array) + return ERR_PTR(-ENOMEM); /* copy mandatory map attributes */ array->map.map_type = attr->map_type; @@ -97,7 +92,7 @@ static struct bpf_map *array_map_alloc(union bpf_attr *attr) if (array_size >= U32_MAX - PAGE_SIZE || elem_size > PCPU_MIN_UNIT_SIZE || bpf_array_alloc_percpu(array)) { - kvfree(array); + bpf_map_area_free(array); return ERR_PTR(-ENOMEM); } out: @@ -262,7 +257,7 @@ static void array_map_free(struct bpf_map *map) if (array->map.map_type == BPF_MAP_TYPE_PERCPU_ARRAY) bpf_array_free_percpu(array); - kvfree(array); + bpf_map_area_free(array); } static const struct bpf_map_ops array_ops = { @@ -319,7 +314,8 @@ static void fd_array_map_free(struct bpf_map *map) /* make sure it's empty */ for (i = 0; i < array->map.max_entries; i++) BUG_ON(array->ptrs[i] != NULL); - kvfree(array); + + bpf_map_area_free(array); } static void *fd_array_map_lookup_elem(struct bpf_map *map, void *key) diff --git a/kernel/bpf/hashtab.c b/kernel/bpf/hashtab.c index 3f2bb58952d8..a753bbe7df0a 100644 --- a/kernel/bpf/hashtab.c +++ b/kernel/bpf/hashtab.c @@ -13,7 +13,6 @@ #include #include #include -#include #include "percpu_freelist.h" #include "bpf_lru_list.h" @@ -103,7 +102,7 @@ static void htab_free_elems(struct bpf_htab *htab) free_percpu(pptr); } free_elems: - vfree(htab->elems); + bpf_map_area_free(htab->elems); } static struct htab_elem *prealloc_lru_pop(struct bpf_htab *htab, void *key, @@ -125,7 +124,8 @@ static int prealloc_init(struct bpf_htab *htab) { int err = -ENOMEM, i; - htab->elems = vzalloc(htab->elem_size * htab->map.max_entries); + htab->elems = bpf_map_area_alloc(htab->elem_size * + htab->map.max_entries); if (!htab->elems) return -ENOMEM; @@ -320,14 +320,10 @@ static struct bpf_map *htab_map_alloc(union bpf_attr *attr) goto free_htab; err = -ENOMEM; - htab->buckets = kmalloc_array(htab->n_buckets, sizeof(struct bucket), - GFP_USER | __GFP_NOWARN); - - if (!htab->buckets) { - htab->buckets = vmalloc(htab->n_buckets * sizeof(struct bucket)); - if (!htab->buckets) - goto free_htab; - } + htab->buckets = bpf_map_area_alloc(htab->n_buckets * + sizeof(struct bucket)); + if (!htab->buckets) + goto free_htab; for (i = 0; i < htab->n_buckets; i++) { INIT_HLIST_HEAD(&htab->buckets[i].head); @@ -354,7 +350,7 @@ static struct bpf_map *htab_map_alloc(union bpf_attr *attr) free_extra_elems: free_percpu(htab->extra_elems); free_buckets: - kvfree(htab->buckets); + bpf_map_area_free(htab->buckets); free_htab: kfree(htab); return ERR_PTR(err); @@ -1014,7 +1010,7 @@ static void htab_map_free(struct bpf_map *map) prealloc_destroy(htab); free_percpu(htab->extra_elems); - kvfree(htab->buckets); + bpf_map_area_free(htab->buckets); kfree(htab); } diff --git a/kernel/bpf/stackmap.c b/kernel/bpf/stackmap.c index 732ae16d12b7..be8519148c25 100644 --- a/kernel/bpf/stackmap.c +++ b/kernel/bpf/stackmap.c @@ -7,7 +7,6 @@ #include #include #include -#include #include #include #include "percpu_freelist.h" @@ -32,7 +31,7 @@ static int prealloc_elems_and_freelist(struct bpf_stack_map *smap) u32 elem_size = sizeof(struct stack_map_bucket) + smap->map.value_size; int err; - smap->elems = vzalloc(elem_size * smap->map.max_entries); + smap->elems = bpf_map_area_alloc(elem_size * smap->map.max_entries); if (!smap->elems) return -ENOMEM; @@ -45,7 +44,7 @@ static int prealloc_elems_and_freelist(struct bpf_stack_map *smap) return 0; free_elems: - vfree(smap->elems); + bpf_map_area_free(smap->elems); return err; } @@ -76,12 +75,9 @@ static struct bpf_map *stack_map_alloc(union bpf_attr *attr) if (cost >= U32_MAX - PAGE_SIZE) return ERR_PTR(-E2BIG); - smap = kzalloc(cost, GFP_USER | __GFP_NOWARN); - if (!smap) { - smap = vzalloc(cost); - if (!smap) - return ERR_PTR(-ENOMEM); - } + smap = bpf_map_area_alloc(cost); + if (!smap) + return ERR_PTR(-ENOMEM); err = -E2BIG; cost += n_buckets * (value_size + sizeof(struct stack_map_bucket)); @@ -112,7 +108,7 @@ static struct bpf_map *stack_map_alloc(union bpf_attr *attr) put_buffers: put_callchain_buffers(); free_smap: - kvfree(smap); + bpf_map_area_free(smap); return ERR_PTR(err); } @@ -262,9 +258,9 @@ static void stack_map_free(struct bpf_map *map) /* wait for bpf programs to complete before freeing stack map */ synchronize_rcu(); - vfree(smap->elems); + bpf_map_area_free(smap->elems); pcpu_freelist_destroy(&smap->freelist); - kvfree(smap); + bpf_map_area_free(smap); put_callchain_buffers(); } diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c index 1d6b29e4e2c3..19b6129eab23 100644 --- a/kernel/bpf/syscall.c +++ b/kernel/bpf/syscall.c @@ -12,6 +12,8 @@ #include #include #include +#include +#include #include #include #include @@ -49,6 +51,30 @@ void bpf_register_map_type(struct bpf_map_type_list *tl) list_add(&tl->list_node, &bpf_map_types); } +void *bpf_map_area_alloc(size_t size) +{ + /* We definitely need __GFP_NORETRY, so OOM killer doesn't + * trigger under memory pressure as we really just want to + * fail instead. + */ + const gfp_t flags = __GFP_NOWARN | __GFP_NORETRY | __GFP_ZERO; + void *area; + + if (size <= (PAGE_SIZE << PAGE_ALLOC_COSTLY_ORDER)) { + area = kmalloc(size, GFP_USER | flags); + if (area != NULL) + return area; + } + + return __vmalloc(size, GFP_KERNEL | __GFP_HIGHMEM | flags, + PAGE_KERNEL); +} + +void bpf_map_area_free(void *area) +{ + kvfree(area); +} + int bpf_map_precharge_memlock(u32 pages) { struct user_struct *user = get_current_user(); -- cgit v1.2.3-71-gd317 From e326ce013a8e851193eb337aafb1aa396c533a61 Mon Sep 17 00:00:00 2001 From: "Rafael J. Wysocki" Date: Fri, 20 Jan 2017 03:25:34 +0100 Subject: Revert "PM / sleep / ACPI: Use the ACPI_FADT_LOW_POWER_S0 flag" Revert commit 08b98d329165 (PM / sleep / ACPI: Use the ACPI_FADT_LOW_POWER_S0 flag) as it caused system suspend (in the default configuration) to fail on Dell XPS13 (9360) with the Kaby Lake processor. Fixes: 08b98d329165 (PM / sleep / ACPI: Use the ACPI_FADT_LOW_POWER_S0 flag) Reported-by: Paul Menzel Signed-off-by: Rafael J. Wysocki --- Documentation/power/states.txt | 4 +--- drivers/acpi/sleep.c | 8 -------- include/linux/suspend.h | 2 -- kernel/power/suspend.c | 4 ++-- 4 files changed, 3 insertions(+), 15 deletions(-) (limited to 'include/linux') diff --git a/Documentation/power/states.txt b/Documentation/power/states.txt index 8a39ce45d8a0..008ecb588317 100644 --- a/Documentation/power/states.txt +++ b/Documentation/power/states.txt @@ -35,9 +35,7 @@ only one way to cause the system to go into the Suspend-To-RAM state (write The default suspend mode (ie. the one to be used without writing anything into /sys/power/mem_sleep) is either "deep" (if Suspend-To-RAM is supported) or "s2idle", but it can be overridden by the value of the "mem_sleep_default" -parameter in the kernel command line. On some ACPI-based systems, depending on -the information in the FADT, the default may be "s2idle" even if Suspend-To-RAM -is supported. +parameter in the kernel command line. The properties of all of the sleep states are described below. diff --git a/drivers/acpi/sleep.c b/drivers/acpi/sleep.c index ce1855fd584b..deb0ff78eba8 100644 --- a/drivers/acpi/sleep.c +++ b/drivers/acpi/sleep.c @@ -691,14 +691,6 @@ static void acpi_sleep_suspend_setup(void) if (acpi_sleep_state_supported(i)) sleep_states[i] = 1; - /* - * Use suspend-to-idle by default if ACPI_FADT_LOW_POWER_S0 is set and - * the default suspend mode was not selected from the command line. - */ - if (acpi_gbl_FADT.flags & ACPI_FADT_LOW_POWER_S0 && - mem_sleep_default > PM_SUSPEND_MEM) - mem_sleep_default = PM_SUSPEND_FREEZE; - suspend_set_ops(old_suspend_ordering ? &acpi_suspend_ops_old : &acpi_suspend_ops); freeze_set_ops(&acpi_freeze_ops); diff --git a/include/linux/suspend.h b/include/linux/suspend.h index 0c729c3c8549..d9718378a8be 100644 --- a/include/linux/suspend.h +++ b/include/linux/suspend.h @@ -194,8 +194,6 @@ struct platform_freeze_ops { }; #ifdef CONFIG_SUSPEND -extern suspend_state_t mem_sleep_default; - /** * suspend_set_ops - set platform dependent suspend operations * @ops: The new suspend operations to set. diff --git a/kernel/power/suspend.c b/kernel/power/suspend.c index f67ceb7768b8..15e6baef5c73 100644 --- a/kernel/power/suspend.c +++ b/kernel/power/suspend.c @@ -46,7 +46,7 @@ static const char * const mem_sleep_labels[] = { const char *mem_sleep_states[PM_SUSPEND_MAX]; suspend_state_t mem_sleep_current = PM_SUSPEND_FREEZE; -suspend_state_t mem_sleep_default = PM_SUSPEND_MAX; +static suspend_state_t mem_sleep_default = PM_SUSPEND_MEM; unsigned int pm_suspend_global_flags; EXPORT_SYMBOL_GPL(pm_suspend_global_flags); @@ -168,7 +168,7 @@ void suspend_set_ops(const struct platform_suspend_ops *ops) } if (valid_state(PM_SUSPEND_MEM)) { mem_sleep_states[PM_SUSPEND_MEM] = mem_sleep_labels[PM_SUSPEND_MEM]; - if (mem_sleep_default >= PM_SUSPEND_MEM) + if (mem_sleep_default == PM_SUSPEND_MEM) mem_sleep_current = PM_SUSPEND_MEM; } -- cgit v1.2.3-71-gd317 From 6391a4481ba0796805d6581e42f9f0418c099e34 Mon Sep 17 00:00:00 2001 From: Jason Wang Date: Fri, 20 Jan 2017 14:32:42 +0800 Subject: virtio-net: restore VIRTIO_HDR_F_DATA_VALID on receiving Commit 501db511397f ("virtio: don't set VIRTIO_NET_HDR_F_DATA_VALID on xmit") in fact disables VIRTIO_HDR_F_DATA_VALID on receiving path too, fixing this by adding a hint (has_data_valid) and set it only on the receiving path. Cc: Rolf Neugebauer Signed-off-by: Jason Wang Acked-by: Rolf Neugebauer Signed-off-by: David S. Miller --- drivers/net/macvtap.c | 2 +- drivers/net/tun.c | 2 +- drivers/net/virtio_net.c | 2 +- include/linux/virtio_net.h | 6 +++++- net/packet/af_packet.c | 4 ++-- 5 files changed, 10 insertions(+), 6 deletions(-) (limited to 'include/linux') diff --git a/drivers/net/macvtap.c b/drivers/net/macvtap.c index 5c26653eceb5..402618565838 100644 --- a/drivers/net/macvtap.c +++ b/drivers/net/macvtap.c @@ -825,7 +825,7 @@ static ssize_t macvtap_put_user(struct macvtap_queue *q, return -EINVAL; if (virtio_net_hdr_from_skb(skb, &vnet_hdr, - macvtap_is_little_endian(q))) + macvtap_is_little_endian(q), true)) BUG(); if (copy_to_iter(&vnet_hdr, sizeof(vnet_hdr), iter) != diff --git a/drivers/net/tun.c b/drivers/net/tun.c index cd8e02c94be0..2cd10b26b650 100644 --- a/drivers/net/tun.c +++ b/drivers/net/tun.c @@ -1360,7 +1360,7 @@ static ssize_t tun_put_user(struct tun_struct *tun, return -EINVAL; if (virtio_net_hdr_from_skb(skb, &gso, - tun_is_little_endian(tun))) { + tun_is_little_endian(tun), true)) { struct skb_shared_info *sinfo = skb_shinfo(skb); pr_err("unexpected GSO type: " "0x%x, gso_size %d, hdr_len %d\n", diff --git a/drivers/net/virtio_net.c b/drivers/net/virtio_net.c index 4a105006ca63..347424351ade 100644 --- a/drivers/net/virtio_net.c +++ b/drivers/net/virtio_net.c @@ -1104,7 +1104,7 @@ static int xmit_skb(struct send_queue *sq, struct sk_buff *skb) hdr = skb_vnet_hdr(skb); if (virtio_net_hdr_from_skb(skb, &hdr->hdr, - virtio_is_little_endian(vi->vdev))) + virtio_is_little_endian(vi->vdev), false)) BUG(); if (vi->mergeable_rx_bufs) diff --git a/include/linux/virtio_net.h b/include/linux/virtio_net.h index 56436472ccc7..5209b5ed2a64 100644 --- a/include/linux/virtio_net.h +++ b/include/linux/virtio_net.h @@ -56,7 +56,8 @@ static inline int virtio_net_hdr_to_skb(struct sk_buff *skb, static inline int virtio_net_hdr_from_skb(const struct sk_buff *skb, struct virtio_net_hdr *hdr, - bool little_endian) + bool little_endian, + bool has_data_valid) { memset(hdr, 0, sizeof(*hdr)); /* no info leak */ @@ -91,6 +92,9 @@ static inline int virtio_net_hdr_from_skb(const struct sk_buff *skb, skb_checksum_start_offset(skb)); hdr->csum_offset = __cpu_to_virtio16(little_endian, skb->csum_offset); + } else if (has_data_valid && + skb->ip_summed == CHECKSUM_UNNECESSARY) { + hdr->flags = VIRTIO_NET_HDR_F_DATA_VALID; } /* else everything is zero */ return 0; diff --git a/net/packet/af_packet.c b/net/packet/af_packet.c index b9e1a13b4ba3..3d555c79a7b5 100644 --- a/net/packet/af_packet.c +++ b/net/packet/af_packet.c @@ -1976,7 +1976,7 @@ static int packet_rcv_vnet(struct msghdr *msg, const struct sk_buff *skb, return -EINVAL; *len -= sizeof(vnet_hdr); - if (virtio_net_hdr_from_skb(skb, &vnet_hdr, vio_le())) + if (virtio_net_hdr_from_skb(skb, &vnet_hdr, vio_le(), true)) return -EINVAL; return memcpy_to_msg(msg, (void *)&vnet_hdr, sizeof(vnet_hdr)); @@ -2237,7 +2237,7 @@ static int tpacket_rcv(struct sk_buff *skb, struct net_device *dev, if (po->has_vnet_hdr) { if (virtio_net_hdr_from_skb(skb, h.raw + macoff - sizeof(struct virtio_net_hdr), - vio_le())) { + vio_le(), true)) { spin_lock(&sk->sk_receive_queue.lock); goto drop_n_account; } -- cgit v1.2.3-71-gd317 From 059aa734824165507c65fd30a55ff000afd14983 Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Sun, 22 Jan 2017 14:04:29 -0500 Subject: nfs: Don't increment lock sequence ID after NFS4ERR_MOVED Xuan Qi reports that the Linux NFSv4 client failed to lock a file that was migrated. The steps he observed on the wire: 1. The client sent a LOCK request to the source server 2. The source server replied NFS4ERR_MOVED 3. The client switched to the destination server 4. The client sent the same LOCK request to the destination server with a bumped lock sequence ID 5. The destination server rejected the LOCK request with NFS4ERR_BAD_SEQID RFC 3530 section 8.1.5 provides a list of NFS errors which do not bump a lock sequence ID. However, RFC 3530 is now obsoleted by RFC 7530. In RFC 7530 section 9.1.7, this list has been updated by the addition of NFS4ERR_MOVED. Reported-by: Xuan Qi Signed-off-by: Chuck Lever Cc: stable@vger.kernel.org # v3.7+ Signed-off-by: Trond Myklebust --- include/linux/nfs4.h | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/nfs4.h b/include/linux/nfs4.h index bca536341d1a..1b1ca04820a3 100644 --- a/include/linux/nfs4.h +++ b/include/linux/nfs4.h @@ -282,7 +282,7 @@ enum nfsstat4 { static inline bool seqid_mutating_err(u32 err) { - /* rfc 3530 section 8.1.5: */ + /* See RFC 7530, section 9.1.7 */ switch (err) { case NFS4ERR_STALE_CLIENTID: case NFS4ERR_STALE_STATEID: @@ -291,6 +291,7 @@ static inline bool seqid_mutating_err(u32 err) case NFS4ERR_BADXDR: case NFS4ERR_RESOURCE: case NFS4ERR_NOFILEHANDLE: + case NFS4ERR_MOVED: return false; }; return true; -- cgit v1.2.3-71-gd317 From c929ea0b910355e1876c64431f3d5802f95b3d75 Mon Sep 17 00:00:00 2001 From: Kinglong Mee Date: Fri, 20 Jan 2017 16:48:39 +0800 Subject: SUNRPC: cleanup ida information when removing sunrpc module After removing sunrpc module, I get many kmemleak information as, unreferenced object 0xffff88003316b1e0 (size 544): comm "gssproxy", pid 2148, jiffies 4294794465 (age 4200.081s) hex dump (first 32 bytes): 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 ................ 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 ................ backtrace: [] kmemleak_alloc+0x4a/0xa0 [] kmem_cache_alloc+0x15e/0x1f0 [] ida_pre_get+0xaa/0x150 [] ida_simple_get+0xad/0x180 [] nlmsvc_lookup_host+0x4ab/0x7f0 [lockd] [] lockd+0x4d/0x270 [lockd] [] param_set_timeout+0x55/0x100 [lockd] [] svc_defer+0x114/0x3f0 [sunrpc] [] svc_defer+0x2d7/0x3f0 [sunrpc] [] rpc_show_info+0x8a/0x110 [sunrpc] [] proc_reg_write+0x7f/0xc0 [] __vfs_write+0xdf/0x3c0 [] vfs_write+0xef/0x240 [] SyS_write+0xad/0x130 [] entry_SYSCALL_64_fastpath+0x1a/0xa9 [] 0xffffffffffffffff I found, the ida information (dynamic memory) isn't cleanup. Signed-off-by: Kinglong Mee Fixes: 2f048db4680a ("SUNRPC: Add an identifier for struct rpc_clnt") Cc: stable@vger.kernel.org # v3.12+ Signed-off-by: Trond Myklebust --- include/linux/sunrpc/clnt.h | 1 + net/sunrpc/clnt.c | 5 +++++ net/sunrpc/sunrpc_syms.c | 1 + 3 files changed, 7 insertions(+) (limited to 'include/linux') diff --git a/include/linux/sunrpc/clnt.h b/include/linux/sunrpc/clnt.h index 85cc819676e8..333ad11b3dd9 100644 --- a/include/linux/sunrpc/clnt.h +++ b/include/linux/sunrpc/clnt.h @@ -216,5 +216,6 @@ void rpc_clnt_xprt_switch_put(struct rpc_clnt *); void rpc_clnt_xprt_switch_add_xprt(struct rpc_clnt *, struct rpc_xprt *); bool rpc_clnt_xprt_switch_has_addr(struct rpc_clnt *clnt, const struct sockaddr *sap); +void rpc_cleanup_clids(void); #endif /* __KERNEL__ */ #endif /* _LINUX_SUNRPC_CLNT_H */ diff --git a/net/sunrpc/clnt.c b/net/sunrpc/clnt.c index 1efbe48e794f..1dc9f3bac099 100644 --- a/net/sunrpc/clnt.c +++ b/net/sunrpc/clnt.c @@ -336,6 +336,11 @@ out: static DEFINE_IDA(rpc_clids); +void rpc_cleanup_clids(void) +{ + ida_destroy(&rpc_clids); +} + static int rpc_alloc_clid(struct rpc_clnt *clnt) { int clid; diff --git a/net/sunrpc/sunrpc_syms.c b/net/sunrpc/sunrpc_syms.c index d1c330a7953a..c73de181467a 100644 --- a/net/sunrpc/sunrpc_syms.c +++ b/net/sunrpc/sunrpc_syms.c @@ -119,6 +119,7 @@ out: static void __exit cleanup_sunrpc(void) { + rpc_cleanup_clids(); rpcauth_remove_module(); cleanup_socket_xprt(); svc_cleanup_xprt_sock(); -- cgit v1.2.3-71-gd317 From d6f8cfa3dea294eabf8f302e90176dd6381fb66e Mon Sep 17 00:00:00 2001 From: Geert Uytterhoeven Date: Wed, 25 Jan 2017 11:39:49 +0100 Subject: net: phy: leds: Break dependency of phy.h on phy_led_triggers.h includes , which is not really needed. Drop the include from , and add it to all users that didn't include it explicitly. Suggested-by: Andrew Lunn Signed-off-by: Geert Uytterhoeven Reviewed-by: Andrew Lunn Signed-off-by: David S. Miller --- drivers/net/phy/phy.c | 1 + drivers/net/phy/phy_led_triggers.c | 1 + include/linux/phy.h | 1 - 3 files changed, 2 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/drivers/net/phy/phy.c b/drivers/net/phy/phy.c index e687a9cb4a37..7cc1b7dcfe05 100644 --- a/drivers/net/phy/phy.c +++ b/drivers/net/phy/phy.c @@ -29,6 +29,7 @@ #include #include #include +#include #include #include #include diff --git a/drivers/net/phy/phy_led_triggers.c b/drivers/net/phy/phy_led_triggers.c index 3f619e7371e9..94ca42e630bb 100644 --- a/drivers/net/phy/phy_led_triggers.c +++ b/drivers/net/phy/phy_led_triggers.c @@ -12,6 +12,7 @@ */ #include #include +#include #include static struct phy_led_trigger *phy_speed_to_led_trigger(struct phy_device *phy, diff --git a/include/linux/phy.h b/include/linux/phy.h index f7d95f644eed..7fc1105605bf 100644 --- a/include/linux/phy.h +++ b/include/linux/phy.h @@ -25,7 +25,6 @@ #include #include #include -#include #include -- cgit v1.2.3-71-gd317 From 3c880eb0205222bb062970085ebedc73ec8dfd14 Mon Sep 17 00:00:00 2001 From: Geert Uytterhoeven Date: Wed, 25 Jan 2017 11:39:50 +0100 Subject: net: phy: leds: Fix truncated LED trigger names Commit 4567d686f5c6d955 ("phy: increase size of MII_BUS_ID_SIZE and bus_id") increased the size of MII bus IDs, but forgot to update the private definition in . This may cause: 1. Truncation of LED trigger names, 2. Duplicate LED trigger names, 3. Failures registering LED triggers, 4. Crashes due to bad error handling in the LED trigger failure path. To fix this, and prevent the definitions going out of sync again in the future, let the PHY LED trigger code use the existing MII_BUS_ID_SIZE definition. Example: - Before I had triggers "ee700000.etherne:01:100Mbps" and "ee700000.etherne:01:10Mbps", - After the increase of MII_BUS_ID_SIZE, both became "ee700000.ethernet-ffffffff:01:" => FAIL, - Now, the triggers are "ee700000.ethernet-ffffffff:01:100Mbps" and "ee700000.ethernet-ffffffff:01:10Mbps", which are unique again. Fixes: 4567d686f5c6d955 ("phy: increase size of MII_BUS_ID_SIZE and bus_id") Fixes: 2e0bc452f4721520 ("net: phy: leds: add support for led triggers on phy link state change") Signed-off-by: Geert Uytterhoeven Reviewed-by: Andrew Lunn Signed-off-by: David S. Miller --- include/linux/phy_led_triggers.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/phy_led_triggers.h b/include/linux/phy_led_triggers.h index a2daea0a37d2..b37b05bfd1a6 100644 --- a/include/linux/phy_led_triggers.h +++ b/include/linux/phy_led_triggers.h @@ -18,11 +18,11 @@ struct phy_device; #ifdef CONFIG_LED_TRIGGER_PHY #include +#include #define PHY_LED_TRIGGER_SPEED_SUFFIX_SIZE 10 -#define PHY_MII_BUS_ID_SIZE (20 - 3) -#define PHY_LINK_LED_TRIGGER_NAME_SIZE (PHY_MII_BUS_ID_SIZE + \ +#define PHY_LINK_LED_TRIGGER_NAME_SIZE (MII_BUS_ID_SIZE + \ FIELD_SIZEOF(struct mdio_device, addr)+\ PHY_LED_TRIGGER_SPEED_SUFFIX_SIZE) -- cgit v1.2.3-71-gd317 From 9d162ed69f51cbd9ee5a0c7e82aba7acc96362ff Mon Sep 17 00:00:00 2001 From: Sean Nyekjaer Date: Fri, 27 Jan 2017 08:46:23 +0100 Subject: net: phy: micrel: add support for KSZ8795 This is adds support for the PHYs in the KSZ8795 5port managed switch. It will allow to detect the link between the switch and the soc and uses the same read_status functions as the KSZ8873MLL switch. Signed-off-by: Sean Nyekjaer Signed-off-by: David S. Miller --- drivers/net/phy/micrel.c | 14 ++++++++++++++ include/linux/micrel_phy.h | 2 ++ 2 files changed, 16 insertions(+) (limited to 'include/linux') diff --git a/drivers/net/phy/micrel.c b/drivers/net/phy/micrel.c index 9a77289109b7..e55809c5beb7 100644 --- a/drivers/net/phy/micrel.c +++ b/drivers/net/phy/micrel.c @@ -1008,6 +1008,20 @@ static struct phy_driver ksphy_driver[] = { .get_stats = kszphy_get_stats, .suspend = genphy_suspend, .resume = genphy_resume, +}, { + .phy_id = PHY_ID_KSZ8795, + .phy_id_mask = MICREL_PHY_ID_MASK, + .name = "Micrel KSZ8795", + .features = (SUPPORTED_Pause | SUPPORTED_Asym_Pause), + .flags = PHY_HAS_MAGICANEG | PHY_HAS_INTERRUPT, + .config_init = kszphy_config_init, + .config_aneg = ksz8873mll_config_aneg, + .read_status = ksz8873mll_read_status, + .get_sset_count = kszphy_get_sset_count, + .get_strings = kszphy_get_strings, + .get_stats = kszphy_get_stats, + .suspend = genphy_suspend, + .resume = genphy_resume, } }; module_phy_driver(ksphy_driver); diff --git a/include/linux/micrel_phy.h b/include/linux/micrel_phy.h index 257173e0095e..f541da68d1e7 100644 --- a/include/linux/micrel_phy.h +++ b/include/linux/micrel_phy.h @@ -35,6 +35,8 @@ #define PHY_ID_KSZ886X 0x00221430 #define PHY_ID_KSZ8863 0x00221435 +#define PHY_ID_KSZ8795 0x00221550 + /* struct phy_device dev_flags definitions */ #define MICREL_PHY_50MHZ_CLK 0x00000001 #define MICREL_PHY_FXEN 0x00000002 -- cgit v1.2.3-71-gd317 From 966d2b04e070bc040319aaebfec09e0144dc3341 Mon Sep 17 00:00:00 2001 From: Douglas Miller Date: Sat, 28 Jan 2017 06:42:20 -0600 Subject: percpu-refcount: fix reference leak during percpu-atomic transition percpu_ref_tryget() and percpu_ref_tryget_live() should return "true" IFF they acquire a reference. But the return value from atomic_long_inc_not_zero() is a long and may have high bits set, e.g. PERCPU_COUNT_BIAS, and the return value of the tryget routines is bool so the reference may actually be acquired but the routines return "false" which results in a reference leak since the caller assumes it does not need to do a corresponding percpu_ref_put(). This was seen when performing CPU hotplug during I/O, as hangs in blk_mq_freeze_queue_wait where percpu_ref_kill (blk_mq_freeze_queue_start) raced with percpu_ref_tryget (blk_mq_timeout_work). Sample stack trace: __switch_to+0x2c0/0x450 __schedule+0x2f8/0x970 schedule+0x48/0xc0 blk_mq_freeze_queue_wait+0x94/0x120 blk_mq_queue_reinit_work+0xb8/0x180 blk_mq_queue_reinit_prepare+0x84/0xa0 cpuhp_invoke_callback+0x17c/0x600 cpuhp_up_callbacks+0x58/0x150 _cpu_up+0xf0/0x1c0 do_cpu_up+0x120/0x150 cpu_subsys_online+0x64/0xe0 device_online+0xb4/0x120 online_store+0xb4/0xc0 dev_attr_store+0x68/0xa0 sysfs_kf_write+0x80/0xb0 kernfs_fop_write+0x17c/0x250 __vfs_write+0x6c/0x1e0 vfs_write+0xd0/0x270 SyS_write+0x6c/0x110 system_call+0x38/0xe0 Examination of the queue showed a single reference (no PERCPU_COUNT_BIAS, and __PERCPU_REF_DEAD, __PERCPU_REF_ATOMIC set) and no requests. However, conditions at the time of the race are count of PERCPU_COUNT_BIAS + 0 and __PERCPU_REF_DEAD and __PERCPU_REF_ATOMIC set. The fix is to make the tryget routines use an actual boolean internally instead of the atomic long result truncated to a int. Fixes: e625305b3907 percpu-refcount: make percpu_ref based on longs instead of ints Link: https://bugzilla.kernel.org/show_bug.cgi?id=190751 Signed-off-by: Douglas Miller Reviewed-by: Jens Axboe Signed-off-by: Tejun Heo Fixes: e625305b3907 ("percpu-refcount: make percpu_ref based on longs instead of ints") Cc: stable@vger.kernel.org # v3.18+ --- include/linux/percpu-refcount.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/percpu-refcount.h b/include/linux/percpu-refcount.h index 1c7eec09e5eb..3a481a49546e 100644 --- a/include/linux/percpu-refcount.h +++ b/include/linux/percpu-refcount.h @@ -204,7 +204,7 @@ static inline void percpu_ref_get(struct percpu_ref *ref) static inline bool percpu_ref_tryget(struct percpu_ref *ref) { unsigned long __percpu *percpu_count; - int ret; + bool ret; rcu_read_lock_sched(); @@ -238,7 +238,7 @@ static inline bool percpu_ref_tryget(struct percpu_ref *ref) static inline bool percpu_ref_tryget_live(struct percpu_ref *ref) { unsigned long __percpu *percpu_count; - int ret = false; + bool ret = false; rcu_read_lock_sched(); -- cgit v1.2.3-71-gd317 From f1712c73714088a7252d276a57126d56c7d37e64 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Fri, 27 Jan 2017 08:11:44 -0800 Subject: can: Fix kernel panic at security_sock_rcv_skb Zhang Yanmin reported crashes [1] and provided a patch adding a synchronize_rcu() call in can_rx_unregister() The main problem seems that the sockets themselves are not RCU protected. If CAN uses RCU for delivery, then sockets should be freed only after one RCU grace period. Recent kernels could use sock_set_flag(sk, SOCK_RCU_FREE), but let's ease stable backports with the following fix instead. [1] BUG: unable to handle kernel NULL pointer dereference at (null) IP: [] selinux_socket_sock_rcv_skb+0x65/0x2a0 Call Trace: [] security_sock_rcv_skb+0x4c/0x60 [] sk_filter+0x41/0x210 [] sock_queue_rcv_skb+0x53/0x3a0 [] raw_rcv+0x2a3/0x3c0 [] can_rcv_filter+0x12b/0x370 [] can_receive+0xd9/0x120 [] can_rcv+0xab/0x100 [] __netif_receive_skb_core+0xd8c/0x11f0 [] __netif_receive_skb+0x24/0xb0 [] process_backlog+0x127/0x280 [] net_rx_action+0x33b/0x4f0 [] __do_softirq+0x184/0x440 [] do_softirq_own_stack+0x1c/0x30 [] do_softirq.part.18+0x3b/0x40 [] do_softirq+0x1d/0x20 [] netif_rx_ni+0xe5/0x110 [] slcan_receive_buf+0x507/0x520 [] flush_to_ldisc+0x21c/0x230 [] process_one_work+0x24f/0x670 [] worker_thread+0x9d/0x6f0 [] ? rescuer_thread+0x480/0x480 [] kthread+0x12c/0x150 [] ret_from_fork+0x3f/0x70 Reported-by: Zhang Yanmin Signed-off-by: Eric Dumazet Acked-by: Oliver Hartkopp Signed-off-by: David S. Miller --- include/linux/can/core.h | 7 +++---- net/can/af_can.c | 12 ++++++++++-- net/can/af_can.h | 3 ++- net/can/bcm.c | 4 ++-- net/can/gw.c | 2 +- net/can/raw.c | 4 ++-- 6 files changed, 20 insertions(+), 12 deletions(-) (limited to 'include/linux') diff --git a/include/linux/can/core.h b/include/linux/can/core.h index a0875001b13c..df08a41d5be5 100644 --- a/include/linux/can/core.h +++ b/include/linux/can/core.h @@ -45,10 +45,9 @@ struct can_proto { extern int can_proto_register(const struct can_proto *cp); extern void can_proto_unregister(const struct can_proto *cp); -extern int can_rx_register(struct net_device *dev, canid_t can_id, - canid_t mask, - void (*func)(struct sk_buff *, void *), - void *data, char *ident); +int can_rx_register(struct net_device *dev, canid_t can_id, canid_t mask, + void (*func)(struct sk_buff *, void *), + void *data, char *ident, struct sock *sk); extern void can_rx_unregister(struct net_device *dev, canid_t can_id, canid_t mask, diff --git a/net/can/af_can.c b/net/can/af_can.c index 1108079d934f..5488e4a6ccd0 100644 --- a/net/can/af_can.c +++ b/net/can/af_can.c @@ -445,6 +445,7 @@ static struct hlist_head *find_rcv_list(canid_t *can_id, canid_t *mask, * @func: callback function on filter match * @data: returned parameter for callback function * @ident: string for calling module identification + * @sk: socket pointer (might be NULL) * * Description: * Invokes the callback function with the received sk_buff and the given @@ -468,7 +469,7 @@ static struct hlist_head *find_rcv_list(canid_t *can_id, canid_t *mask, */ int can_rx_register(struct net_device *dev, canid_t can_id, canid_t mask, void (*func)(struct sk_buff *, void *), void *data, - char *ident) + char *ident, struct sock *sk) { struct receiver *r; struct hlist_head *rl; @@ -496,6 +497,7 @@ int can_rx_register(struct net_device *dev, canid_t can_id, canid_t mask, r->func = func; r->data = data; r->ident = ident; + r->sk = sk; hlist_add_head_rcu(&r->list, rl); d->entries++; @@ -520,8 +522,11 @@ EXPORT_SYMBOL(can_rx_register); static void can_rx_delete_receiver(struct rcu_head *rp) { struct receiver *r = container_of(rp, struct receiver, rcu); + struct sock *sk = r->sk; kmem_cache_free(rcv_cache, r); + if (sk) + sock_put(sk); } /** @@ -596,8 +601,11 @@ void can_rx_unregister(struct net_device *dev, canid_t can_id, canid_t mask, spin_unlock(&can_rcvlists_lock); /* schedule the receiver item for deletion */ - if (r) + if (r) { + if (r->sk) + sock_hold(r->sk); call_rcu(&r->rcu, can_rx_delete_receiver); + } } EXPORT_SYMBOL(can_rx_unregister); diff --git a/net/can/af_can.h b/net/can/af_can.h index fca0fe9fc45a..b86f5129e838 100644 --- a/net/can/af_can.h +++ b/net/can/af_can.h @@ -50,13 +50,14 @@ struct receiver { struct hlist_node list; - struct rcu_head rcu; canid_t can_id; canid_t mask; unsigned long matches; void (*func)(struct sk_buff *, void *); void *data; char *ident; + struct sock *sk; + struct rcu_head rcu; }; #define CAN_SFF_RCV_ARRAY_SZ (1 << CAN_SFF_ID_BITS) diff --git a/net/can/bcm.c b/net/can/bcm.c index 21ac75390e3d..5c9407181918 100644 --- a/net/can/bcm.c +++ b/net/can/bcm.c @@ -1216,7 +1216,7 @@ static int bcm_rx_setup(struct bcm_msg_head *msg_head, struct msghdr *msg, err = can_rx_register(dev, op->can_id, REGMASK(op->can_id), bcm_rx_handler, op, - "bcm"); + "bcm", sk); op->rx_reg_dev = dev; dev_put(dev); @@ -1225,7 +1225,7 @@ static int bcm_rx_setup(struct bcm_msg_head *msg_head, struct msghdr *msg, } else err = can_rx_register(NULL, op->can_id, REGMASK(op->can_id), - bcm_rx_handler, op, "bcm"); + bcm_rx_handler, op, "bcm", sk); if (err) { /* this bcm rx op is broken -> remove it */ list_del(&op->list); diff --git a/net/can/gw.c b/net/can/gw.c index a54ab0c82104..7056a1a2bb70 100644 --- a/net/can/gw.c +++ b/net/can/gw.c @@ -442,7 +442,7 @@ static inline int cgw_register_filter(struct cgw_job *gwj) { return can_rx_register(gwj->src.dev, gwj->ccgw.filter.can_id, gwj->ccgw.filter.can_mask, can_can_gw_rcv, - gwj, "gw"); + gwj, "gw", NULL); } static inline void cgw_unregister_filter(struct cgw_job *gwj) diff --git a/net/can/raw.c b/net/can/raw.c index b075f028d7e2..6dc546a06673 100644 --- a/net/can/raw.c +++ b/net/can/raw.c @@ -190,7 +190,7 @@ static int raw_enable_filters(struct net_device *dev, struct sock *sk, for (i = 0; i < count; i++) { err = can_rx_register(dev, filter[i].can_id, filter[i].can_mask, - raw_rcv, sk, "raw"); + raw_rcv, sk, "raw", sk); if (err) { /* clean up successfully registered filters */ while (--i >= 0) @@ -211,7 +211,7 @@ static int raw_enable_errfilter(struct net_device *dev, struct sock *sk, if (err_mask) err = can_rx_register(dev, 0, err_mask | CAN_ERR_FLAG, - raw_rcv, sk, "raw"); + raw_rcv, sk, "raw", sk); return err; } -- cgit v1.2.3-71-gd317 From 08d85f3ea99f1eeafc4e8507936190e86a16ee8c Mon Sep 17 00:00:00 2001 From: Marc Zyngier Date: Tue, 17 Jan 2017 16:00:48 +0000 Subject: irqdomain: Avoid activating interrupts more than once Since commit f3b0946d629c ("genirq/msi: Make sure PCI MSIs are activated early"), we can end-up activating a PCI/MSI twice (once at allocation time, and once at startup time). This is normally of no consequences, except that there is some HW out there that may misbehave if activate is used more than once (the GICv3 ITS, for example, uses the activate callback to issue the MAPVI command, and the architecture spec says that "If there is an existing mapping for the EventID-DeviceID combination, behavior is UNPREDICTABLE"). While this could be worked around in each individual driver, it may make more sense to tackle the issue at the core level. In order to avoid getting in that situation, let's have a per-interrupt flag to remember if we have already activated that interrupt or not. Fixes: f3b0946d629c ("genirq/msi: Make sure PCI MSIs are activated early") Reported-and-tested-by: Andre Przywara Signed-off-by: Marc Zyngier Cc: stable@vger.kernel.org Link: http://lkml.kernel.org/r/1484668848-24361-1-git-send-email-marc.zyngier@arm.com Signed-off-by: Thomas Gleixner --- include/linux/irq.h | 17 +++++++++++++++++ kernel/irq/irqdomain.c | 44 ++++++++++++++++++++++++++++++-------------- 2 files changed, 47 insertions(+), 14 deletions(-) (limited to 'include/linux') diff --git a/include/linux/irq.h b/include/linux/irq.h index e79875574b39..39e3254e5769 100644 --- a/include/linux/irq.h +++ b/include/linux/irq.h @@ -184,6 +184,7 @@ struct irq_data { * * IRQD_TRIGGER_MASK - Mask for the trigger type bits * IRQD_SETAFFINITY_PENDING - Affinity setting is pending + * IRQD_ACTIVATED - Interrupt has already been activated * IRQD_NO_BALANCING - Balancing disabled for this IRQ * IRQD_PER_CPU - Interrupt is per cpu * IRQD_AFFINITY_SET - Interrupt affinity was set @@ -202,6 +203,7 @@ struct irq_data { enum { IRQD_TRIGGER_MASK = 0xf, IRQD_SETAFFINITY_PENDING = (1 << 8), + IRQD_ACTIVATED = (1 << 9), IRQD_NO_BALANCING = (1 << 10), IRQD_PER_CPU = (1 << 11), IRQD_AFFINITY_SET = (1 << 12), @@ -312,6 +314,21 @@ static inline bool irqd_affinity_is_managed(struct irq_data *d) return __irqd_to_state(d) & IRQD_AFFINITY_MANAGED; } +static inline bool irqd_is_activated(struct irq_data *d) +{ + return __irqd_to_state(d) & IRQD_ACTIVATED; +} + +static inline void irqd_set_activated(struct irq_data *d) +{ + __irqd_to_state(d) |= IRQD_ACTIVATED; +} + +static inline void irqd_clr_activated(struct irq_data *d) +{ + __irqd_to_state(d) &= ~IRQD_ACTIVATED; +} + #undef __irqd_to_state static inline irq_hw_number_t irqd_to_hwirq(struct irq_data *d) diff --git a/kernel/irq/irqdomain.c b/kernel/irq/irqdomain.c index 8c0a0ae43521..b59e6768c5e9 100644 --- a/kernel/irq/irqdomain.c +++ b/kernel/irq/irqdomain.c @@ -1346,6 +1346,30 @@ void irq_domain_free_irqs_parent(struct irq_domain *domain, } EXPORT_SYMBOL_GPL(irq_domain_free_irqs_parent); +static void __irq_domain_activate_irq(struct irq_data *irq_data) +{ + if (irq_data && irq_data->domain) { + struct irq_domain *domain = irq_data->domain; + + if (irq_data->parent_data) + __irq_domain_activate_irq(irq_data->parent_data); + if (domain->ops->activate) + domain->ops->activate(domain, irq_data); + } +} + +static void __irq_domain_deactivate_irq(struct irq_data *irq_data) +{ + if (irq_data && irq_data->domain) { + struct irq_domain *domain = irq_data->domain; + + if (domain->ops->deactivate) + domain->ops->deactivate(domain, irq_data); + if (irq_data->parent_data) + __irq_domain_deactivate_irq(irq_data->parent_data); + } +} + /** * irq_domain_activate_irq - Call domain_ops->activate recursively to activate * interrupt @@ -1356,13 +1380,9 @@ EXPORT_SYMBOL_GPL(irq_domain_free_irqs_parent); */ void irq_domain_activate_irq(struct irq_data *irq_data) { - if (irq_data && irq_data->domain) { - struct irq_domain *domain = irq_data->domain; - - if (irq_data->parent_data) - irq_domain_activate_irq(irq_data->parent_data); - if (domain->ops->activate) - domain->ops->activate(domain, irq_data); + if (!irqd_is_activated(irq_data)) { + __irq_domain_activate_irq(irq_data); + irqd_set_activated(irq_data); } } @@ -1376,13 +1396,9 @@ void irq_domain_activate_irq(struct irq_data *irq_data) */ void irq_domain_deactivate_irq(struct irq_data *irq_data) { - if (irq_data && irq_data->domain) { - struct irq_domain *domain = irq_data->domain; - - if (domain->ops->deactivate) - domain->ops->deactivate(domain, irq_data); - if (irq_data->parent_data) - irq_domain_deactivate_irq(irq_data->parent_data); + if (irqd_is_activated(irq_data)) { + __irq_domain_deactivate_irq(irq_data); + irqd_clr_activated(irq_data); } } -- cgit v1.2.3-71-gd317 From 433e19cf33d34bb6751c874a9c00980552fe508c Mon Sep 17 00:00:00 2001 From: Dexuan Cui Date: Sat, 28 Jan 2017 11:46:02 -0700 Subject: Drivers: hv: vmbus: finally fix hv_need_to_signal_on_read() Commit a389fcfd2cb5 ("Drivers: hv: vmbus: Fix signaling logic in hv_need_to_signal_on_read()") added the proper mb(), but removed the test "prev_write_sz < pending_sz" when making the signal decision. As a result, the guest can signal the host unnecessarily, and then the host can throttle the guest because the host thinks the guest is buggy or malicious; finally the user running stress test can perceive intermittent freeze of the guest. This patch brings back the test, and properly handles the in-place consumption APIs used by NetVSC (see get_next_pkt_raw(), put_pkt_raw() and commit_rd_index()). Fixes: a389fcfd2cb5 ("Drivers: hv: vmbus: Fix signaling logic in hv_need_to_signal_on_read()") Signed-off-by: Dexuan Cui Reported-by: Rolf Neugebauer Tested-by: Rolf Neugebauer Cc: "K. Y. Srinivasan" Cc: Haiyang Zhang Cc: Stephen Hemminger Cc: Signed-off-by: K. Y. Srinivasan Signed-off-by: Greg Kroah-Hartman --- drivers/hv/ring_buffer.c | 1 + drivers/net/hyperv/netvsc.c | 6 ++++++ include/linux/hyperv.h | 32 ++++++++++++++++++++++++++++++-- 3 files changed, 37 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/drivers/hv/ring_buffer.c b/drivers/hv/ring_buffer.c index cd49cb17eb7f..308dbda700eb 100644 --- a/drivers/hv/ring_buffer.c +++ b/drivers/hv/ring_buffer.c @@ -383,6 +383,7 @@ int hv_ringbuffer_read(struct vmbus_channel *channel, return ret; } + init_cached_read_index(channel); next_read_location = hv_get_next_read_location(inring_info); next_read_location = hv_copyfrom_ringbuffer(inring_info, &desc, sizeof(desc), diff --git a/drivers/net/hyperv/netvsc.c b/drivers/net/hyperv/netvsc.c index 5a1cc089acb7..86e5749226ef 100644 --- a/drivers/net/hyperv/netvsc.c +++ b/drivers/net/hyperv/netvsc.c @@ -1295,6 +1295,9 @@ void netvsc_channel_cb(void *context) ndev = hv_get_drvdata(device); buffer = get_per_channel_state(channel); + /* commit_rd_index() -> hv_signal_on_read() needs this. */ + init_cached_read_index(channel); + do { desc = get_next_pkt_raw(channel); if (desc != NULL) { @@ -1347,6 +1350,9 @@ void netvsc_channel_cb(void *context) bufferlen = bytes_recvd; } + + init_cached_read_index(channel); + } while (1); if (bufferlen > NETVSC_PACKET_SIZE) diff --git a/include/linux/hyperv.h b/include/linux/hyperv.h index 42fe43fb0c80..183efde54269 100644 --- a/include/linux/hyperv.h +++ b/include/linux/hyperv.h @@ -128,6 +128,7 @@ struct hv_ring_buffer_info { u32 ring_data_startoffset; u32 priv_write_index; u32 priv_read_index; + u32 cached_read_index; }; /* @@ -180,6 +181,19 @@ static inline u32 hv_get_bytes_to_write(struct hv_ring_buffer_info *rbi) return write; } +static inline u32 hv_get_cached_bytes_to_write( + const struct hv_ring_buffer_info *rbi) +{ + u32 read_loc, write_loc, dsize, write; + + dsize = rbi->ring_datasize; + read_loc = rbi->cached_read_index; + write_loc = rbi->ring_buffer->write_index; + + write = write_loc >= read_loc ? dsize - (write_loc - read_loc) : + read_loc - write_loc; + return write; +} /* * VMBUS version is 32 bit entity broken up into * two 16 bit quantities: major_number. minor_number. @@ -1488,7 +1502,7 @@ hv_get_ring_buffer(struct hv_ring_buffer_info *ring_info) static inline void hv_signal_on_read(struct vmbus_channel *channel) { - u32 cur_write_sz; + u32 cur_write_sz, cached_write_sz; u32 pending_sz; struct hv_ring_buffer_info *rbi = &channel->inbound; @@ -1512,12 +1526,24 @@ static inline void hv_signal_on_read(struct vmbus_channel *channel) cur_write_sz = hv_get_bytes_to_write(rbi); - if (cur_write_sz >= pending_sz) + if (cur_write_sz < pending_sz) + return; + + cached_write_sz = hv_get_cached_bytes_to_write(rbi); + if (cached_write_sz < pending_sz) vmbus_setevent(channel); return; } +static inline void +init_cached_read_index(struct vmbus_channel *channel) +{ + struct hv_ring_buffer_info *rbi = &channel->inbound; + + rbi->cached_read_index = rbi->ring_buffer->read_index; +} + /* * An API to support in-place processing of incoming VMBUS packets. */ @@ -1569,6 +1595,8 @@ static inline void put_pkt_raw(struct vmbus_channel *channel, * This call commits the read index and potentially signals the host. * Here is the pattern for using the "in-place" consumption APIs: * + * init_cached_read_index(); + * * while (get_next_pkt_raw() { * process the packet "in-place"; * put_pkt_raw(); -- cgit v1.2.3-71-gd317 From e26bfebdfc0d212d366de9990a096665d5c0209a Mon Sep 17 00:00:00 2001 From: David Howells Date: Tue, 31 Jan 2017 09:45:28 +0000 Subject: fscache: Fix dead object requeue Under some circumstances, an fscache object can become queued such that it fscache_object_work_func() can be called once the object is in the OBJECT_DEAD state. This results in the kernel oopsing when it tries to invoke the handler for the state (which is hard coded to 0x2). The way this comes about is something like the following: (1) The object dispatcher is processing a work state for an object. This is done in workqueue context. (2) An out-of-band event comes in that isn't masked, causing the object to be queued, say EV_KILL. (3) The object dispatcher finishes processing the current work state on that object and then sees there's another event to process, so, without returning to the workqueue core, it processes that event too. It then follows the chain of events that initiates until we reach OBJECT_DEAD without going through a wait state (such as WAIT_FOR_CLEARANCE). At this point, object->events may be 0, object->event_mask will be 0 and oob_event_mask will be 0. (4) The object dispatcher returns to the workqueue processor, and in due course, this sees that the object's work item is still queued and invokes it again. (5) The current state is a work state (OBJECT_DEAD), so the dispatcher jumps to it - resulting in an OOPS. When I'm seeing this, the work state in (1) appears to have been either LOOK_UP_OBJECT or CREATE_OBJECT (object->oob_table is fscache_osm_lookup_oob). The window for (2) is very small: (A) object->event_mask is cleared whilst the event dispatch process is underway - though there's no memory barrier to force this to the top of the function. The window, therefore is from the time the object was selected by the workqueue processor and made requeueable to the time the mask was cleared. (B) fscache_raise_event() will only queue the object if it manages to set the event bit and the corresponding event_mask bit was set. The enqueuement is then deferred slightly whilst we get a ref on the object and get the per-CPU variable for workqueue congestion. This slight deferral slightly increases the probability by allowing extra time for the workqueue to make the item requeueable. Handle this by giving the dead state a processor function and checking the for the dead state address rather than seeing if the processor function is address 0x2. The dead state processor function can then set a flag to indicate that it's occurred and give a warning if it occurs more than once per object. If this race occurs, an oops similar to the following is seen (note the RIP value): BUG: unable to handle kernel NULL pointer dereference at 0000000000000002 IP: [<0000000000000002>] 0x1 PGD 0 Oops: 0010 [#1] SMP Modules linked in: ... CPU: 17 PID: 16077 Comm: kworker/u48:9 Not tainted 3.10.0-327.18.2.el7.x86_64 #1 Hardware name: HP ProLiant DL380 Gen9/ProLiant DL380 Gen9, BIOS P89 12/27/2015 Workqueue: fscache_object fscache_object_work_func [fscache] task: ffff880302b63980 ti: ffff880717544000 task.ti: ffff880717544000 RIP: 0010:[<0000000000000002>] [<0000000000000002>] 0x1 RSP: 0018:ffff880717547df8 EFLAGS: 00010202 RAX: ffffffffa0368640 RBX: ffff880edf7a4480 RCX: dead000000200200 RDX: 0000000000000002 RSI: 00000000ffffffff RDI: ffff880edf7a4480 RBP: ffff880717547e18 R08: 0000000000000000 R09: dfc40a25cb3a4510 R10: dfc40a25cb3a4510 R11: 0000000000000400 R12: 0000000000000000 R13: ffff880edf7a4510 R14: ffff8817f6153400 R15: 0000000000000600 FS: 0000000000000000(0000) GS:ffff88181f420000(0000) knlGS:0000000000000000 CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 CR2: 0000000000000002 CR3: 000000000194a000 CR4: 00000000001407e0 DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000 DR3: 0000000000000000 DR6: 00000000ffff0ff0 DR7: 0000000000000400 Stack: ffffffffa0363695 ffff880edf7a4510 ffff88093f16f900 ffff8817faa4ec00 ffff880717547e60 ffffffff8109d5db 00000000faa4ec18 0000000000000000 ffff8817faa4ec18 ffff88093f16f930 ffff880302b63980 ffff88093f16f900 Call Trace: [] ? fscache_object_work_func+0xa5/0x200 [fscache] [] process_one_work+0x17b/0x470 [] worker_thread+0x21c/0x400 [] ? rescuer_thread+0x400/0x400 [] kthread+0xcf/0xe0 [] ? kthread_create_on_node+0x140/0x140 [] ret_from_fork+0x58/0x90 [] ? kthread_create_on_node+0x140/0x140 Signed-off-by: David Howells Acked-by: Jeremy McNicoll Tested-by: Frank Sorenson Tested-by: Benjamin Coddington Reviewed-by: Benjamin Coddington Signed-off-by: Al Viro --- fs/fscache/object.c | 26 ++++++++++++++++++++++++-- include/linux/fscache-cache.h | 1 + 2 files changed, 25 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/fs/fscache/object.c b/fs/fscache/object.c index be02a086ed9b..7a182c87f378 100644 --- a/fs/fscache/object.c +++ b/fs/fscache/object.c @@ -30,6 +30,7 @@ static const struct fscache_state *fscache_look_up_object(struct fscache_object static const struct fscache_state *fscache_object_available(struct fscache_object *, int); static const struct fscache_state *fscache_parent_ready(struct fscache_object *, int); static const struct fscache_state *fscache_update_object(struct fscache_object *, int); +static const struct fscache_state *fscache_object_dead(struct fscache_object *, int); #define __STATE_NAME(n) fscache_osm_##n #define STATE(n) (&__STATE_NAME(n)) @@ -91,7 +92,7 @@ static WORK_STATE(LOOKUP_FAILURE, "LCFL", fscache_lookup_failure); static WORK_STATE(KILL_OBJECT, "KILL", fscache_kill_object); static WORK_STATE(KILL_DEPENDENTS, "KDEP", fscache_kill_dependents); static WORK_STATE(DROP_OBJECT, "DROP", fscache_drop_object); -static WORK_STATE(OBJECT_DEAD, "DEAD", (void*)2UL); +static WORK_STATE(OBJECT_DEAD, "DEAD", fscache_object_dead); static WAIT_STATE(WAIT_FOR_INIT, "?INI", TRANSIT_TO(INIT_OBJECT, 1 << FSCACHE_OBJECT_EV_NEW_CHILD)); @@ -229,6 +230,10 @@ execute_work_state: event = -1; if (new_state == NO_TRANSIT) { _debug("{OBJ%x} %s notrans", object->debug_id, state->name); + if (unlikely(state == STATE(OBJECT_DEAD))) { + _leave(" [dead]"); + return; + } fscache_enqueue_object(object); event_mask = object->oob_event_mask; goto unmask_events; @@ -239,7 +244,7 @@ execute_work_state: object->state = state = new_state; if (state->work) { - if (unlikely(state->work == ((void *)2UL))) { + if (unlikely(state == STATE(OBJECT_DEAD))) { _leave(" [dead]"); return; } @@ -1083,3 +1088,20 @@ void fscache_object_mark_killed(struct fscache_object *object, } } EXPORT_SYMBOL(fscache_object_mark_killed); + +/* + * The object is dead. We can get here if an object gets queued by an event + * that would lead to its death (such as EV_KILL) when the dispatcher is + * already running (and so can be requeued) but hasn't yet cleared the event + * mask. + */ +static const struct fscache_state *fscache_object_dead(struct fscache_object *object, + int event) +{ + if (!test_and_set_bit(FSCACHE_OBJECT_RUN_AFTER_DEAD, + &object->flags)) + return NO_TRANSIT; + + WARN(true, "FS-Cache object redispatched after death"); + return NO_TRANSIT; +} diff --git a/include/linux/fscache-cache.h b/include/linux/fscache-cache.h index 13ba552e6c09..4c467ef50159 100644 --- a/include/linux/fscache-cache.h +++ b/include/linux/fscache-cache.h @@ -360,6 +360,7 @@ struct fscache_object { #define FSCACHE_OBJECT_IS_AVAILABLE 5 /* T if object has become active */ #define FSCACHE_OBJECT_RETIRED 6 /* T if object was retired on relinquishment */ #define FSCACHE_OBJECT_KILLED_BY_CACHE 7 /* T if object was killed by the cache */ +#define FSCACHE_OBJECT_RUN_AFTER_DEAD 8 /* T if object has been dispatched after death */ struct list_head cache_link; /* link in cache->object_list */ struct hlist_node cookie_link; /* link in cookie->backing_objects */ -- cgit v1.2.3-71-gd317 From dd86e373e09fb16b83e8adf5c48c421a4ca76468 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Tue, 31 Jan 2017 23:58:38 +0100 Subject: perf/x86/intel/rapl: Make package handling more robust The package management code in RAPL relies on package mapping being available before a CPU is started. This changed with: 9d85eb9119f4 ("x86/smpboot: Make logical package management more robust") because the ACPI/BIOS information turned out to be unreliable, but that left RAPL in broken state. This was not noticed because on a regular boot all CPUs are online before RAPL is initialized. A possible fix would be to reintroduce the mess which allocates a package data structure in CPU prepare and when it turns out to already exist in starting throw it away later in the CPU online callback. But that's a horrible hack and not required at all because RAPL becomes functional for perf only in the CPU online callback. That's correct because user space is not yet informed about the CPU being onlined, so nothing caan rely on RAPL being available on that particular CPU. Move the allocation to the CPU online callback and simplify the hotplug handling. At this point the package mapping is established and correct. This also adds a missing check for available package data in the event_init() function. Reported-by: Yasuaki Ishimatsu Signed-off-by: Thomas Gleixner Cc: Alexander Shishkin Cc: Arnaldo Carvalho de Melo Cc: Jiri Olsa Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Sebastian Siewior Cc: Stephane Eranian Cc: Vince Weaver Fixes: 9d85eb9119f4 ("x86/smpboot: Make logical package management more robust") Link: http://lkml.kernel.org/r/20170131230141.212593966@linutronix.de Signed-off-by: Ingo Molnar --- arch/x86/events/intel/rapl.c | 60 +++++++++++++++++++------------------------- include/linux/cpuhotplug.h | 1 - 2 files changed, 26 insertions(+), 35 deletions(-) (limited to 'include/linux') diff --git a/arch/x86/events/intel/rapl.c b/arch/x86/events/intel/rapl.c index 17c3564d087a..22ef4f72cf32 100644 --- a/arch/x86/events/intel/rapl.c +++ b/arch/x86/events/intel/rapl.c @@ -161,7 +161,13 @@ static u64 rapl_timer_ms; static inline struct rapl_pmu *cpu_to_rapl_pmu(unsigned int cpu) { - return rapl_pmus->pmus[topology_logical_package_id(cpu)]; + unsigned int pkgid = topology_logical_package_id(cpu); + + /* + * The unsigned check also catches the '-1' return value for non + * existent mappings in the topology map. + */ + return pkgid < rapl_pmus->maxpkg ? rapl_pmus->pmus[pkgid] : NULL; } static inline u64 rapl_read_counter(struct perf_event *event) @@ -402,6 +408,8 @@ static int rapl_pmu_event_init(struct perf_event *event) /* must be done before validate_group */ pmu = cpu_to_rapl_pmu(event->cpu); + if (!pmu) + return -EINVAL; event->cpu = pmu->cpu; event->pmu_private = pmu; event->hw.event_base = msr; @@ -585,6 +593,20 @@ static int rapl_cpu_online(unsigned int cpu) struct rapl_pmu *pmu = cpu_to_rapl_pmu(cpu); int target; + if (!pmu) { + pmu = kzalloc_node(sizeof(*pmu), GFP_KERNEL, cpu_to_node(cpu)); + if (!pmu) + return -ENOMEM; + + raw_spin_lock_init(&pmu->lock); + INIT_LIST_HEAD(&pmu->active_list); + pmu->pmu = &rapl_pmus->pmu; + pmu->timer_interval = ms_to_ktime(rapl_timer_ms); + rapl_hrtimer_init(pmu); + + rapl_pmus->pmus[topology_logical_package_id(cpu)] = pmu; + } + /* * Check if there is an online cpu in the package which collects rapl * events already. @@ -598,27 +620,6 @@ static int rapl_cpu_online(unsigned int cpu) return 0; } -static int rapl_cpu_prepare(unsigned int cpu) -{ - struct rapl_pmu *pmu = cpu_to_rapl_pmu(cpu); - - if (pmu) - return 0; - - pmu = kzalloc_node(sizeof(*pmu), GFP_KERNEL, cpu_to_node(cpu)); - if (!pmu) - return -ENOMEM; - - raw_spin_lock_init(&pmu->lock); - INIT_LIST_HEAD(&pmu->active_list); - pmu->pmu = &rapl_pmus->pmu; - pmu->timer_interval = ms_to_ktime(rapl_timer_ms); - pmu->cpu = -1; - rapl_hrtimer_init(pmu); - rapl_pmus->pmus[topology_logical_package_id(cpu)] = pmu; - return 0; -} - static int rapl_check_hw_unit(bool apply_quirk) { u64 msr_rapl_power_unit_bits; @@ -803,29 +804,21 @@ static int __init rapl_pmu_init(void) /* * Install callbacks. Core will call them for each online cpu. */ - - ret = cpuhp_setup_state(CPUHP_PERF_X86_RAPL_PREP, "perf/x86/rapl:prepare", - rapl_cpu_prepare, NULL); - if (ret) - goto out; - ret = cpuhp_setup_state(CPUHP_AP_PERF_X86_RAPL_ONLINE, "perf/x86/rapl:online", rapl_cpu_online, rapl_cpu_offline); if (ret) - goto out1; + goto out; ret = perf_pmu_register(&rapl_pmus->pmu, "power", -1); if (ret) - goto out2; + goto out1; rapl_advertise(); return 0; -out2: - cpuhp_remove_state(CPUHP_AP_PERF_X86_RAPL_ONLINE); out1: - cpuhp_remove_state(CPUHP_PERF_X86_RAPL_PREP); + cpuhp_remove_state(CPUHP_AP_PERF_X86_RAPL_ONLINE); out: pr_warn("Initialization failed (%d), disabled\n", ret); cleanup_rapl_pmus(); @@ -836,7 +829,6 @@ module_init(rapl_pmu_init); static void __exit intel_rapl_exit(void) { cpuhp_remove_state_nocalls(CPUHP_AP_PERF_X86_RAPL_ONLINE); - cpuhp_remove_state_nocalls(CPUHP_PERF_X86_RAPL_PREP); perf_pmu_unregister(&rapl_pmus->pmu); cleanup_rapl_pmus(); } diff --git a/include/linux/cpuhotplug.h b/include/linux/cpuhotplug.h index d936a0021839..8329f3dc592c 100644 --- a/include/linux/cpuhotplug.h +++ b/include/linux/cpuhotplug.h @@ -10,7 +10,6 @@ enum cpuhp_state { CPUHP_PERF_X86_PREPARE, CPUHP_PERF_X86_UNCORE_PREP, CPUHP_PERF_X86_AMD_UNCORE_PREP, - CPUHP_PERF_X86_RAPL_PREP, CPUHP_PERF_BFIN, CPUHP_PERF_POWER, CPUHP_PERF_SUPERH, -- cgit v1.2.3-71-gd317 From fff4b87e594ad3d2e4f51e8d3d86a6f9d3d8b654 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Tue, 31 Jan 2017 23:58:40 +0100 Subject: perf/x86/intel/uncore: Make package handling more robust The package management code in uncore relies on package mapping being available before a CPU is started. This changed with: 9d85eb9119f4 ("x86/smpboot: Make logical package management more robust") because the ACPI/BIOS information turned out to be unreliable, but that left uncore in broken state. This was not noticed because on a regular boot all CPUs are online before uncore is initialized. Move the allocation to the CPU online callback and simplify the hotplug handling. At this point the package mapping is established and correct. Signed-off-by: Thomas Gleixner Cc: Alexander Shishkin Cc: Arnaldo Carvalho de Melo Cc: Jiri Olsa Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Sebastian Siewior Cc: Stephane Eranian Cc: Vince Weaver Cc: Yasuaki Ishimatsu Fixes: 9d85eb9119f4 ("x86/smpboot: Make logical package management more robust") Link: http://lkml.kernel.org/r/20170131230141.377156255@linutronix.de Signed-off-by: Ingo Molnar --- arch/x86/events/intel/uncore.c | 196 +++++++++++++++++++---------------------- include/linux/cpuhotplug.h | 2 - 2 files changed, 91 insertions(+), 107 deletions(-) (limited to 'include/linux') diff --git a/arch/x86/events/intel/uncore.c b/arch/x86/events/intel/uncore.c index 56c5235dcc29..1ab45976474d 100644 --- a/arch/x86/events/intel/uncore.c +++ b/arch/x86/events/intel/uncore.c @@ -100,7 +100,13 @@ ssize_t uncore_event_show(struct kobject *kobj, struct intel_uncore_box *uncore_pmu_to_box(struct intel_uncore_pmu *pmu, int cpu) { - return pmu->boxes[topology_logical_package_id(cpu)]; + unsigned int pkgid = topology_logical_package_id(cpu); + + /* + * The unsigned check also catches the '-1' return value for non + * existent mappings in the topology map. + */ + return pkgid < max_packages ? pmu->boxes[pkgid] : NULL; } u64 uncore_msr_read_counter(struct intel_uncore_box *box, struct perf_event *event) @@ -1034,76 +1040,6 @@ static void uncore_pci_exit(void) } } -static int uncore_cpu_dying(unsigned int cpu) -{ - struct intel_uncore_type *type, **types = uncore_msr_uncores; - struct intel_uncore_pmu *pmu; - struct intel_uncore_box *box; - int i, pkg; - - pkg = topology_logical_package_id(cpu); - for (; *types; types++) { - type = *types; - pmu = type->pmus; - for (i = 0; i < type->num_boxes; i++, pmu++) { - box = pmu->boxes[pkg]; - if (box && atomic_dec_return(&box->refcnt) == 0) - uncore_box_exit(box); - } - } - return 0; -} - -static int uncore_cpu_starting(unsigned int cpu) -{ - struct intel_uncore_type *type, **types = uncore_msr_uncores; - struct intel_uncore_pmu *pmu; - struct intel_uncore_box *box; - int i, pkg; - - pkg = topology_logical_package_id(cpu); - for (; *types; types++) { - type = *types; - pmu = type->pmus; - for (i = 0; i < type->num_boxes; i++, pmu++) { - box = pmu->boxes[pkg]; - if (!box) - continue; - /* The first cpu on a package activates the box */ - if (atomic_inc_return(&box->refcnt) == 1) - uncore_box_init(box); - } - } - - return 0; -} - -static int uncore_cpu_prepare(unsigned int cpu) -{ - struct intel_uncore_type *type, **types = uncore_msr_uncores; - struct intel_uncore_pmu *pmu; - struct intel_uncore_box *box; - int i, pkg; - - pkg = topology_logical_package_id(cpu); - for (; *types; types++) { - type = *types; - pmu = type->pmus; - for (i = 0; i < type->num_boxes; i++, pmu++) { - if (pmu->boxes[pkg]) - continue; - /* First cpu of a package allocates the box */ - box = uncore_alloc_box(type, cpu_to_node(cpu)); - if (!box) - return -ENOMEM; - box->pmu = pmu; - box->pkgid = pkg; - pmu->boxes[pkg] = box; - } - } - return 0; -} - static void uncore_change_type_ctx(struct intel_uncore_type *type, int old_cpu, int new_cpu) { @@ -1143,12 +1079,14 @@ static void uncore_change_context(struct intel_uncore_type **uncores, static int uncore_event_cpu_offline(unsigned int cpu) { - int target; + struct intel_uncore_type *type, **types = uncore_msr_uncores; + struct intel_uncore_pmu *pmu; + struct intel_uncore_box *box; + int i, pkg, target; /* Check if exiting cpu is used for collecting uncore events */ if (!cpumask_test_and_clear_cpu(cpu, &uncore_cpu_mask)) - return 0; - + goto unref; /* Find a new cpu to collect uncore events */ target = cpumask_any_but(topology_core_cpumask(cpu), cpu); @@ -1160,12 +1098,82 @@ static int uncore_event_cpu_offline(unsigned int cpu) uncore_change_context(uncore_msr_uncores, cpu, target); uncore_change_context(uncore_pci_uncores, cpu, target); + +unref: + /* Clear the references */ + pkg = topology_logical_package_id(cpu); + for (; *types; types++) { + type = *types; + pmu = type->pmus; + for (i = 0; i < type->num_boxes; i++, pmu++) { + box = pmu->boxes[pkg]; + if (box && atomic_dec_return(&box->refcnt) == 0) + uncore_box_exit(box); + } + } return 0; } +static int allocate_boxes(struct intel_uncore_type **types, + unsigned int pkg, unsigned int cpu) +{ + struct intel_uncore_box *box, *tmp; + struct intel_uncore_type *type; + struct intel_uncore_pmu *pmu; + LIST_HEAD(allocated); + int i; + + /* Try to allocate all required boxes */ + for (; *types; types++) { + type = *types; + pmu = type->pmus; + for (i = 0; i < type->num_boxes; i++, pmu++) { + if (pmu->boxes[pkg]) + continue; + box = uncore_alloc_box(type, cpu_to_node(cpu)); + if (!box) + goto cleanup; + box->pmu = pmu; + box->pkgid = pkg; + list_add(&box->active_list, &allocated); + } + } + /* Install them in the pmus */ + list_for_each_entry_safe(box, tmp, &allocated, active_list) { + list_del_init(&box->active_list); + box->pmu->boxes[pkg] = box; + } + return 0; + +cleanup: + list_for_each_entry_safe(box, tmp, &allocated, active_list) { + list_del_init(&box->active_list); + kfree(box); + } + return -ENOMEM; +} + static int uncore_event_cpu_online(unsigned int cpu) { - int target; + struct intel_uncore_type *type, **types = uncore_msr_uncores; + struct intel_uncore_pmu *pmu; + struct intel_uncore_box *box; + int i, ret, pkg, target; + + pkg = topology_logical_package_id(cpu); + ret = allocate_boxes(types, pkg, cpu); + if (ret) + return ret; + + for (; *types; types++) { + type = *types; + pmu = type->pmus; + for (i = 0; i < type->num_boxes; i++, pmu++) { + box = pmu->boxes[pkg]; + if (!box && atomic_inc_return(&box->refcnt) == 1) + uncore_box_init(box); + } + } /* * Check if there is an online cpu in the package @@ -1355,33 +1363,13 @@ static int __init intel_uncore_init(void) if (cret && pret) return -ENODEV; - /* - * Install callbacks. Core will call them for each online cpu. - * - * The first online cpu of each package allocates and takes - * the refcounts for all other online cpus in that package. - * If msrs are not enabled no allocation is required and - * uncore_cpu_prepare() is not called for each online cpu. - */ - if (!cret) { - ret = cpuhp_setup_state(CPUHP_PERF_X86_UNCORE_PREP, - "perf/x86/intel/uncore:prepare", - uncore_cpu_prepare, NULL); - if (ret) - goto err; - } else { - cpuhp_setup_state_nocalls(CPUHP_PERF_X86_UNCORE_PREP, - "perf/x86/intel/uncore:prepare", - uncore_cpu_prepare, NULL); - } - - cpuhp_setup_state(CPUHP_AP_PERF_X86_UNCORE_STARTING, - "perf/x86/uncore:starting", - uncore_cpu_starting, uncore_cpu_dying); - - cpuhp_setup_state(CPUHP_AP_PERF_X86_UNCORE_ONLINE, - "perf/x86/uncore:online", - uncore_event_cpu_online, uncore_event_cpu_offline); + /* Install hotplug callbacks to setup the targets for each package */ + ret = cpuhp_setup_state(CPUHP_AP_PERF_X86_UNCORE_ONLINE, + "perf/x86/intel/uncore:online", + uncore_event_cpu_online, + uncore_event_cpu_offline); + if (ret) + goto err; return 0; err: @@ -1393,9 +1381,7 @@ module_init(intel_uncore_init); static void __exit intel_uncore_exit(void) { - cpuhp_remove_state_nocalls(CPUHP_AP_PERF_X86_UNCORE_ONLINE); - cpuhp_remove_state_nocalls(CPUHP_AP_PERF_X86_UNCORE_STARTING); - cpuhp_remove_state_nocalls(CPUHP_PERF_X86_UNCORE_PREP); + cpuhp_remove_state(CPUHP_AP_PERF_X86_UNCORE_ONLINE); uncore_types_exit(uncore_msr_uncores); uncore_pci_exit(); } diff --git a/include/linux/cpuhotplug.h b/include/linux/cpuhotplug.h index 8329f3dc592c..921acaaa1601 100644 --- a/include/linux/cpuhotplug.h +++ b/include/linux/cpuhotplug.h @@ -8,7 +8,6 @@ enum cpuhp_state { CPUHP_CREATE_THREADS, CPUHP_PERF_PREPARE, CPUHP_PERF_X86_PREPARE, - CPUHP_PERF_X86_UNCORE_PREP, CPUHP_PERF_X86_AMD_UNCORE_PREP, CPUHP_PERF_BFIN, CPUHP_PERF_POWER, @@ -85,7 +84,6 @@ enum cpuhp_state { CPUHP_AP_IRQ_ARMADA_XP_STARTING, CPUHP_AP_IRQ_BCM2836_STARTING, CPUHP_AP_ARM_MVEBU_COHERENCY, - CPUHP_AP_PERF_X86_UNCORE_STARTING, CPUHP_AP_PERF_X86_AMD_UNCORE_STARTING, CPUHP_AP_PERF_X86_STARTING, CPUHP_AP_PERF_X86_AMD_IBS_STARTING, -- cgit v1.2.3-71-gd317 From 1a2a14444d32b89b28116daea86f63ced1716668 Mon Sep 17 00:00:00 2001 From: Dimitris Michailidis Date: Tue, 31 Jan 2017 16:03:13 -0800 Subject: net: fix ndo_features_check/ndo_fix_features comment ordering Commit cdba756f5803a2 ("net: move ndo_features_check() close to ndo_start_xmit()") inadvertently moved the doc comment for .ndo_fix_features instead of .ndo_features_check. Fix the comment ordering. Fixes: cdba756f5803a2 ("net: move ndo_features_check() close to ndo_start_xmit()") Signed-off-by: Dimitris Michailidis Acked-by: Eric Dumazet Signed-off-by: David S. Miller --- include/linux/netdevice.h | 29 +++++++++++++++-------------- 1 file changed, 15 insertions(+), 14 deletions(-) (limited to 'include/linux') diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index 9bde9558b596..70ad0291d517 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -866,11 +866,15 @@ struct netdev_xdp { * of useless work if you return NETDEV_TX_BUSY. * Required; cannot be NULL. * - * netdev_features_t (*ndo_fix_features)(struct net_device *dev, - * netdev_features_t features); - * Adjusts the requested feature flags according to device-specific - * constraints, and returns the resulting flags. Must not modify - * the device state. + * netdev_features_t (*ndo_features_check)(struct sk_buff *skb, + * struct net_device *dev + * netdev_features_t features); + * Called by core transmit path to determine if device is capable of + * performing offload operations on a given packet. This is to give + * the device an opportunity to implement any restrictions that cannot + * be otherwise expressed by feature flags. The check is called with + * the set of features that the stack has calculated and it returns + * those the driver believes to be appropriate. * * u16 (*ndo_select_queue)(struct net_device *dev, struct sk_buff *skb, * void *accel_priv, select_queue_fallback_t fallback); @@ -1028,6 +1032,12 @@ struct netdev_xdp { * Called to release previously enslaved netdev. * * Feature/offload setting functions. + * netdev_features_t (*ndo_fix_features)(struct net_device *dev, + * netdev_features_t features); + * Adjusts the requested feature flags according to device-specific + * constraints, and returns the resulting flags. Must not modify + * the device state. + * * int (*ndo_set_features)(struct net_device *dev, netdev_features_t features); * Called to update device configuration to new features. Passed * feature set might be less than what was returned by ndo_fix_features()). @@ -1100,15 +1110,6 @@ struct netdev_xdp { * Callback to use for xmit over the accelerated station. This * is used in place of ndo_start_xmit on accelerated net * devices. - * netdev_features_t (*ndo_features_check)(struct sk_buff *skb, - * struct net_device *dev - * netdev_features_t features); - * Called by core transmit path to determine if device is capable of - * performing offload operations on a given packet. This is to give - * the device an opportunity to implement any restrictions that cannot - * be otherwise expressed by feature flags. The check is called with - * the set of features that the stack has calculated and it returns - * those the driver believes to be appropriate. * int (*ndo_set_tx_maxrate)(struct net_device *dev, * int queue_index, u32 maxrate); * Called when a user wants to set a max-rate limitation of specific -- cgit v1.2.3-71-gd317 From 71810db27c1c853b335675bee335d893bc3d324b Mon Sep 17 00:00:00 2001 From: Ard Biesheuvel Date: Fri, 3 Feb 2017 09:54:06 +0000 Subject: modversions: treat symbol CRCs as 32 bit quantities The modversion symbol CRCs are emitted as ELF symbols, which allows us to easily populate the kcrctab sections by relying on the linker to associate each kcrctab slot with the correct value. This has a couple of downsides: - Given that the CRCs are treated as memory addresses, we waste 4 bytes for each CRC on 64 bit architectures, - On architectures that support runtime relocation, a R__RELATIVE relocation entry is emitted for each CRC value, which identifies it as a quantity that requires fixing up based on the actual runtime load offset of the kernel. This results in corrupted CRCs unless we explicitly undo the fixup (and this is currently being handled in the core module code) - Such runtime relocation entries take up 24 bytes of __init space each, resulting in a x8 overhead in [uncompressed] kernel size for CRCs. Switching to explicit 32 bit values on 64 bit architectures fixes most of these issues, given that 32 bit values are not treated as quantities that require fixing up based on the actual runtime load offset. Note that on some ELF64 architectures [such as PPC64], these 32-bit values are still emitted as [absolute] runtime relocatable quantities, even if the value resolves to a build time constant. Since relative relocations are always resolved at build time, this patch enables MODULE_REL_CRCS on powerpc when CONFIG_RELOCATABLE=y, which turns the absolute CRC references into relative references into .rodata where the actual CRC value is stored. So redefine all CRC fields and variables as u32, and redefine the __CRC_SYMBOL() macro for 64 bit builds to emit the CRC reference using inline assembler (which is necessary since 64-bit C code cannot use 32-bit types to hold memory addresses, even if they are ultimately resolved using values that do not exceed 0xffffffff). To avoid potential problems with legacy 32-bit architectures using legacy toolchains, the equivalent C definition of the kcrctab entry is retained for 32-bit architectures. Note that this mostly reverts commit d4703aefdbc8 ("module: handle ppc64 relocating kcrctabs when CONFIG_RELOCATABLE=y") Acked-by: Rusty Russell Signed-off-by: Ard Biesheuvel Signed-off-by: Linus Torvalds --- arch/powerpc/Kconfig | 1 + arch/powerpc/include/asm/module.h | 4 --- arch/powerpc/kernel/module_64.c | 8 ------ include/asm-generic/export.h | 11 ++++---- include/linux/export.h | 14 +++++++++++ include/linux/module.h | 14 +++++------ kernel/module.c | 53 ++++++++++++++++++--------------------- 7 files changed, 53 insertions(+), 52 deletions(-) (limited to 'include/linux') diff --git a/arch/powerpc/Kconfig b/arch/powerpc/Kconfig index a8ee573fe610..db8a1ef6bfaf 100644 --- a/arch/powerpc/Kconfig +++ b/arch/powerpc/Kconfig @@ -484,6 +484,7 @@ config RELOCATABLE bool "Build a relocatable kernel" depends on (PPC64 && !COMPILE_TEST) || (FLATMEM && (44x || FSL_BOOKE)) select NONSTATIC_KERNEL + select MODULE_REL_CRCS if MODVERSIONS help This builds a kernel image that is capable of running at the location the kernel is loaded at. For ppc32, there is no any diff --git a/arch/powerpc/include/asm/module.h b/arch/powerpc/include/asm/module.h index cc12c61ef315..53885512b8d3 100644 --- a/arch/powerpc/include/asm/module.h +++ b/arch/powerpc/include/asm/module.h @@ -90,9 +90,5 @@ static inline int module_finalize_ftrace(struct module *mod, const Elf_Shdr *sec } #endif -#if defined(CONFIG_MODVERSIONS) && defined(CONFIG_PPC64) -#define ARCH_RELOCATES_KCRCTAB -#define reloc_start PHYSICAL_START -#endif #endif /* __KERNEL__ */ #endif /* _ASM_POWERPC_MODULE_H */ diff --git a/arch/powerpc/kernel/module_64.c b/arch/powerpc/kernel/module_64.c index bb1807184bad..0b0f89685b67 100644 --- a/arch/powerpc/kernel/module_64.c +++ b/arch/powerpc/kernel/module_64.c @@ -286,14 +286,6 @@ static void dedotify_versions(struct modversion_info *vers, for (end = (void *)vers + size; vers < end; vers++) if (vers->name[0] == '.') { memmove(vers->name, vers->name+1, strlen(vers->name)); -#ifdef ARCH_RELOCATES_KCRCTAB - /* The TOC symbol has no CRC computed. To avoid CRC - * check failing, we must force it to the expected - * value (see CRC check in module.c). - */ - if (!strcmp(vers->name, "TOC.")) - vers->crc = -(unsigned long)reloc_start; -#endif } } diff --git a/include/asm-generic/export.h b/include/asm-generic/export.h index 63554e9f6e0c..719db1968d81 100644 --- a/include/asm-generic/export.h +++ b/include/asm-generic/export.h @@ -9,18 +9,15 @@ #ifndef KSYM_ALIGN #define KSYM_ALIGN 8 #endif -#ifndef KCRC_ALIGN -#define KCRC_ALIGN 8 -#endif #else #define __put .long #ifndef KSYM_ALIGN #define KSYM_ALIGN 4 #endif +#endif #ifndef KCRC_ALIGN #define KCRC_ALIGN 4 #endif -#endif #ifdef CONFIG_HAVE_UNDERSCORE_SYMBOL_PREFIX #define KSYM(name) _##name @@ -52,7 +49,11 @@ KSYM(__kstrtab_\name): .section ___kcrctab\sec+\name,"a" .balign KCRC_ALIGN KSYM(__kcrctab_\name): - __put KSYM(__crc_\name) +#if defined(CONFIG_MODULE_REL_CRCS) + .long KSYM(__crc_\name) - . +#else + .long KSYM(__crc_\name) +#endif .weak KSYM(__crc_\name) .previous #endif diff --git a/include/linux/export.h b/include/linux/export.h index 2a0f61fbc731..7473fba6a60c 100644 --- a/include/linux/export.h +++ b/include/linux/export.h @@ -43,6 +43,13 @@ extern struct module __this_module; #ifdef CONFIG_MODVERSIONS /* Mark the CRC weak since genksyms apparently decides not to * generate a checksums for some symbols */ +#if defined(CONFIG_MODULE_REL_CRCS) +#define __CRC_SYMBOL(sym, sec) \ + asm(" .section \"___kcrctab" sec "+" #sym "\", \"a\" \n" \ + " .weak " VMLINUX_SYMBOL_STR(__crc_##sym) " \n" \ + " .long " VMLINUX_SYMBOL_STR(__crc_##sym) " - . \n" \ + " .previous \n"); +#elif !defined(CONFIG_64BIT) #define __CRC_SYMBOL(sym, sec) \ extern __visible void *__crc_##sym __attribute__((weak)); \ static const unsigned long __kcrctab_##sym \ @@ -50,6 +57,13 @@ extern struct module __this_module; __attribute__((section("___kcrctab" sec "+" #sym), used)) \ = (unsigned long) &__crc_##sym; #else +#define __CRC_SYMBOL(sym, sec) \ + asm(" .section \"___kcrctab" sec "+" #sym "\", \"a\" \n" \ + " .weak " VMLINUX_SYMBOL_STR(__crc_##sym) " \n" \ + " .long " VMLINUX_SYMBOL_STR(__crc_##sym) " \n" \ + " .previous \n"); +#endif +#else #define __CRC_SYMBOL(sym, sec) #endif diff --git a/include/linux/module.h b/include/linux/module.h index 7c84273d60b9..cc7cba219b20 100644 --- a/include/linux/module.h +++ b/include/linux/module.h @@ -346,7 +346,7 @@ struct module { /* Exported symbols */ const struct kernel_symbol *syms; - const unsigned long *crcs; + const s32 *crcs; unsigned int num_syms; /* Kernel parameters. */ @@ -359,18 +359,18 @@ struct module { /* GPL-only exported symbols. */ unsigned int num_gpl_syms; const struct kernel_symbol *gpl_syms; - const unsigned long *gpl_crcs; + const s32 *gpl_crcs; #ifdef CONFIG_UNUSED_SYMBOLS /* unused exported symbols. */ const struct kernel_symbol *unused_syms; - const unsigned long *unused_crcs; + const s32 *unused_crcs; unsigned int num_unused_syms; /* GPL-only, unused exported symbols. */ unsigned int num_unused_gpl_syms; const struct kernel_symbol *unused_gpl_syms; - const unsigned long *unused_gpl_crcs; + const s32 *unused_gpl_crcs; #endif #ifdef CONFIG_MODULE_SIG @@ -382,7 +382,7 @@ struct module { /* symbols that will be GPL-only in the near future. */ const struct kernel_symbol *gpl_future_syms; - const unsigned long *gpl_future_crcs; + const s32 *gpl_future_crcs; unsigned int num_gpl_future_syms; /* Exception table */ @@ -523,7 +523,7 @@ struct module *find_module(const char *name); struct symsearch { const struct kernel_symbol *start, *stop; - const unsigned long *crcs; + const s32 *crcs; enum { NOT_GPL_ONLY, GPL_ONLY, @@ -539,7 +539,7 @@ struct symsearch { */ const struct kernel_symbol *find_symbol(const char *name, struct module **owner, - const unsigned long **crc, + const s32 **crc, bool gplok, bool warn); diff --git a/kernel/module.c b/kernel/module.c index 38d4270925d4..3d8f126208e3 100644 --- a/kernel/module.c +++ b/kernel/module.c @@ -389,16 +389,16 @@ extern const struct kernel_symbol __start___ksymtab_gpl[]; extern const struct kernel_symbol __stop___ksymtab_gpl[]; extern const struct kernel_symbol __start___ksymtab_gpl_future[]; extern const struct kernel_symbol __stop___ksymtab_gpl_future[]; -extern const unsigned long __start___kcrctab[]; -extern const unsigned long __start___kcrctab_gpl[]; -extern const unsigned long __start___kcrctab_gpl_future[]; +extern const s32 __start___kcrctab[]; +extern const s32 __start___kcrctab_gpl[]; +extern const s32 __start___kcrctab_gpl_future[]; #ifdef CONFIG_UNUSED_SYMBOLS extern const struct kernel_symbol __start___ksymtab_unused[]; extern const struct kernel_symbol __stop___ksymtab_unused[]; extern const struct kernel_symbol __start___ksymtab_unused_gpl[]; extern const struct kernel_symbol __stop___ksymtab_unused_gpl[]; -extern const unsigned long __start___kcrctab_unused[]; -extern const unsigned long __start___kcrctab_unused_gpl[]; +extern const s32 __start___kcrctab_unused[]; +extern const s32 __start___kcrctab_unused_gpl[]; #endif #ifndef CONFIG_MODVERSIONS @@ -497,7 +497,7 @@ struct find_symbol_arg { /* Output */ struct module *owner; - const unsigned long *crc; + const s32 *crc; const struct kernel_symbol *sym; }; @@ -563,7 +563,7 @@ static bool find_symbol_in_section(const struct symsearch *syms, * (optional) module which owns it. Needs preempt disabled or module_mutex. */ const struct kernel_symbol *find_symbol(const char *name, struct module **owner, - const unsigned long **crc, + const s32 **crc, bool gplok, bool warn) { @@ -1249,23 +1249,17 @@ static int try_to_force_load(struct module *mod, const char *reason) } #ifdef CONFIG_MODVERSIONS -/* If the arch applies (non-zero) relocations to kernel kcrctab, unapply it. */ -static unsigned long maybe_relocated(unsigned long crc, - const struct module *crc_owner) + +static u32 resolve_rel_crc(const s32 *crc) { -#ifdef ARCH_RELOCATES_KCRCTAB - if (crc_owner == NULL) - return crc - (unsigned long)reloc_start; -#endif - return crc; + return *(u32 *)((void *)crc + *crc); } static int check_version(Elf_Shdr *sechdrs, unsigned int versindex, const char *symname, struct module *mod, - const unsigned long *crc, - const struct module *crc_owner) + const s32 *crc) { unsigned int i, num_versions; struct modversion_info *versions; @@ -1283,13 +1277,19 @@ static int check_version(Elf_Shdr *sechdrs, / sizeof(struct modversion_info); for (i = 0; i < num_versions; i++) { + u32 crcval; + if (strcmp(versions[i].name, symname) != 0) continue; - if (versions[i].crc == maybe_relocated(*crc, crc_owner)) + if (IS_ENABLED(CONFIG_MODULE_REL_CRCS)) + crcval = resolve_rel_crc(crc); + else + crcval = *crc; + if (versions[i].crc == crcval) return 1; - pr_debug("Found checksum %lX vs module %lX\n", - maybe_relocated(*crc, crc_owner), versions[i].crc); + pr_debug("Found checksum %X vs module %lX\n", + crcval, versions[i].crc); goto bad_version; } @@ -1307,7 +1307,7 @@ static inline int check_modstruct_version(Elf_Shdr *sechdrs, unsigned int versindex, struct module *mod) { - const unsigned long *crc; + const s32 *crc; /* * Since this should be found in kernel (which can't be removed), no @@ -1321,8 +1321,7 @@ static inline int check_modstruct_version(Elf_Shdr *sechdrs, } preempt_enable(); return check_version(sechdrs, versindex, - VMLINUX_SYMBOL_STR(module_layout), mod, crc, - NULL); + VMLINUX_SYMBOL_STR(module_layout), mod, crc); } /* First part is kernel version, which we ignore if module has crcs. */ @@ -1340,8 +1339,7 @@ static inline int check_version(Elf_Shdr *sechdrs, unsigned int versindex, const char *symname, struct module *mod, - const unsigned long *crc, - const struct module *crc_owner) + const s32 *crc) { return 1; } @@ -1368,7 +1366,7 @@ static const struct kernel_symbol *resolve_symbol(struct module *mod, { struct module *owner; const struct kernel_symbol *sym; - const unsigned long *crc; + const s32 *crc; int err; /* @@ -1383,8 +1381,7 @@ static const struct kernel_symbol *resolve_symbol(struct module *mod, if (!sym) goto unlock; - if (!check_version(info->sechdrs, info->index.vers, name, mod, crc, - owner)) { + if (!check_version(info->sechdrs, info->index.vers, name, mod, crc)) { sym = ERR_PTR(-EINVAL); goto getname; } -- cgit v1.2.3-71-gd317 From 4b9eee96fcb361a5e16a8d2619825e8a048f81f7 Mon Sep 17 00:00:00 2001 From: Ard Biesheuvel Date: Fri, 3 Feb 2017 09:54:07 +0000 Subject: module: unify absolute krctab definitions for 32-bit and 64-bit The previous patch introduced a separate inline asm version of the krcrctab declaration template for use with 64-bit architectures, which cannot refer to ELF symbols using 32-bit quantities. This declaration should be equivalent to the C one for 32-bit architectures, but just in case - unify them in a separate patch, which can simply be dropped if it turns out to break anything. Signed-off-by: Ard Biesheuvel Signed-off-by: Linus Torvalds --- include/linux/export.h | 7 ------- 1 file changed, 7 deletions(-) (limited to 'include/linux') diff --git a/include/linux/export.h b/include/linux/export.h index 7473fba6a60c..1a1dfdb2a5c6 100644 --- a/include/linux/export.h +++ b/include/linux/export.h @@ -49,13 +49,6 @@ extern struct module __this_module; " .weak " VMLINUX_SYMBOL_STR(__crc_##sym) " \n" \ " .long " VMLINUX_SYMBOL_STR(__crc_##sym) " - . \n" \ " .previous \n"); -#elif !defined(CONFIG_64BIT) -#define __CRC_SYMBOL(sym, sec) \ - extern __visible void *__crc_##sym __attribute__((weak)); \ - static const unsigned long __kcrctab_##sym \ - __used \ - __attribute__((section("___kcrctab" sec "+" #sym), used)) \ - = (unsigned long) &__crc_##sym; #else #define __CRC_SYMBOL(sym, sec) \ asm(" .section \"___kcrctab" sec "+" #sym "\", \"a\" \n" \ -- cgit v1.2.3-71-gd317 From 29905b52fad0854351f57bab867647e4982285bf Mon Sep 17 00:00:00 2001 From: Ard Biesheuvel Date: Thu, 2 Feb 2017 18:05:26 +0000 Subject: log2: make order_base_2() behave correctly on const input value zero The function order_base_2() is defined (according to the comment block) as returning zero on input zero, but subsequently passes the input into roundup_pow_of_two(), which is explicitly undefined for input zero. This has gone unnoticed until now, but optimization passes in GCC 7 may produce constant folded function instances where a constant value of zero is passed into order_base_2(), resulting in link errors against the deliberately undefined '____ilog2_NaN'. So update order_base_2() to adhere to its own documented interface. [ See http://marc.info/?l=linux-kernel&m=147672952517795&w=2 and follow-up discussion for more background. The gcc "optimization pass" is really just broken, but now the GCC trunk problem seems to have escaped out of just specially built daily images, so we need to work around it in mainline. - Linus ] Signed-off-by: Ard Biesheuvel Signed-off-by: Linus Torvalds --- include/linux/log2.h | 13 ++++++++++++- 1 file changed, 12 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/log2.h b/include/linux/log2.h index fd7ff3d91e6a..ef3d4f67118c 100644 --- a/include/linux/log2.h +++ b/include/linux/log2.h @@ -203,6 +203,17 @@ unsigned long __rounddown_pow_of_two(unsigned long n) * ... and so on. */ -#define order_base_2(n) ilog2(roundup_pow_of_two(n)) +static inline __attribute_const__ +int __order_base_2(unsigned long n) +{ + return n > 1 ? ilog2(n - 1) + 1 : 0; +} +#define order_base_2(n) \ +( \ + __builtin_constant_p(n) ? ( \ + ((n) == 0 || (n) == 1) ? 0 : \ + ilog2((n) - 1) + 1) : \ + __order_base_2(n) \ +) #endif /* _LINUX_LOG2_H */ -- cgit v1.2.3-71-gd317 From a96dfddbcc04336bbed50dc2b24823e45e09e80c Mon Sep 17 00:00:00 2001 From: Toshi Kani Date: Fri, 3 Feb 2017 13:13:23 -0800 Subject: base/memory, hotplug: fix a kernel oops in show_valid_zones() Reading a sysfs "memoryN/valid_zones" file leads to the following oops when the first page of a range is not backed by struct page. show_valid_zones() assumes that 'start_pfn' is always valid for page_zone(). BUG: unable to handle kernel paging request at ffffea017a000000 IP: show_valid_zones+0x6f/0x160 This issue may happen on x86-64 systems with 64GiB or more memory since their memory block size is bumped up to 2GiB. [1] An example of such systems is desribed below. 0x3240000000 is only aligned by 1GiB and this memory block starts from 0x3200000000, which is not backed by struct page. BIOS-e820: [mem 0x0000003240000000-0x000000603fffffff] usable Since test_pages_in_a_zone() already checks holes, fix this issue by extending this function to return 'valid_start' and 'valid_end' for a given range. show_valid_zones() then proceeds with the valid range. [1] 'Commit bdee237c0343 ("x86: mm: Use 2GB memory block size on large-memory x86-64 systems")' Link: http://lkml.kernel.org/r/20170127222149.30893-3-toshi.kani@hpe.com Signed-off-by: Toshi Kani Cc: Greg Kroah-Hartman Cc: Zhang Zhen Cc: Reza Arbab Cc: David Rientjes Cc: Dan Williams Cc: [4.4+] Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- drivers/base/memory.c | 12 ++++++------ include/linux/memory_hotplug.h | 3 ++- mm/memory_hotplug.c | 20 +++++++++++++++----- 3 files changed, 23 insertions(+), 12 deletions(-) (limited to 'include/linux') diff --git a/drivers/base/memory.c b/drivers/base/memory.c index dacb6a8418aa..fa26ffd25fa6 100644 --- a/drivers/base/memory.c +++ b/drivers/base/memory.c @@ -389,33 +389,33 @@ static ssize_t show_valid_zones(struct device *dev, { struct memory_block *mem = to_memory_block(dev); unsigned long start_pfn, end_pfn; + unsigned long valid_start, valid_end, valid_pages; unsigned long nr_pages = PAGES_PER_SECTION * sections_per_block; - struct page *first_page; struct zone *zone; int zone_shift = 0; start_pfn = section_nr_to_pfn(mem->start_section_nr); end_pfn = start_pfn + nr_pages; - first_page = pfn_to_page(start_pfn); /* The block contains more than one zone can not be offlined. */ - if (!test_pages_in_a_zone(start_pfn, end_pfn)) + if (!test_pages_in_a_zone(start_pfn, end_pfn, &valid_start, &valid_end)) return sprintf(buf, "none\n"); - zone = page_zone(first_page); + zone = page_zone(pfn_to_page(valid_start)); + valid_pages = valid_end - valid_start; /* MMOP_ONLINE_KEEP */ sprintf(buf, "%s", zone->name); /* MMOP_ONLINE_KERNEL */ - zone_can_shift(start_pfn, nr_pages, ZONE_NORMAL, &zone_shift); + zone_can_shift(valid_start, valid_pages, ZONE_NORMAL, &zone_shift); if (zone_shift) { strcat(buf, " "); strcat(buf, (zone + zone_shift)->name); } /* MMOP_ONLINE_MOVABLE */ - zone_can_shift(start_pfn, nr_pages, ZONE_MOVABLE, &zone_shift); + zone_can_shift(valid_start, valid_pages, ZONE_MOVABLE, &zone_shift); if (zone_shift) { strcat(buf, " "); strcat(buf, (zone + zone_shift)->name); diff --git a/include/linux/memory_hotplug.h b/include/linux/memory_hotplug.h index c1784c0b4f35..134a2f69c21a 100644 --- a/include/linux/memory_hotplug.h +++ b/include/linux/memory_hotplug.h @@ -85,7 +85,8 @@ extern int zone_grow_waitqueues(struct zone *zone, unsigned long nr_pages); extern int add_one_highpage(struct page *page, int pfn, int bad_ppro); /* VM interface that may be used by firmware interface */ extern int online_pages(unsigned long, unsigned long, int); -extern int test_pages_in_a_zone(unsigned long, unsigned long); +extern int test_pages_in_a_zone(unsigned long start_pfn, unsigned long end_pfn, + unsigned long *valid_start, unsigned long *valid_end); extern void __offline_isolated_pages(unsigned long, unsigned long); typedef void (*online_page_callback_t)(struct page *page); diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c index 1218f73890b6..b8c11e063ff0 100644 --- a/mm/memory_hotplug.c +++ b/mm/memory_hotplug.c @@ -1484,10 +1484,13 @@ bool is_mem_section_removable(unsigned long start_pfn, unsigned long nr_pages) /* * Confirm all pages in a range [start, end) belong to the same zone. + * When true, return its valid [start, end). */ -int test_pages_in_a_zone(unsigned long start_pfn, unsigned long end_pfn) +int test_pages_in_a_zone(unsigned long start_pfn, unsigned long end_pfn, + unsigned long *valid_start, unsigned long *valid_end) { unsigned long pfn, sec_end_pfn; + unsigned long start, end; struct zone *zone = NULL; struct page *page; int i; @@ -1509,14 +1512,20 @@ int test_pages_in_a_zone(unsigned long start_pfn, unsigned long end_pfn) page = pfn_to_page(pfn + i); if (zone && page_zone(page) != zone) return 0; + if (!zone) + start = pfn + i; zone = page_zone(page); + end = pfn + MAX_ORDER_NR_PAGES; } } - if (zone) + if (zone) { + *valid_start = start; + *valid_end = end; return 1; - else + } else { return 0; + } } /* @@ -1843,6 +1852,7 @@ static int __ref __offline_pages(unsigned long start_pfn, long offlined_pages; int ret, drain, retry_max, node; unsigned long flags; + unsigned long valid_start, valid_end; struct zone *zone; struct memory_notify arg; @@ -1853,10 +1863,10 @@ static int __ref __offline_pages(unsigned long start_pfn, return -EINVAL; /* This makes hotplug much easier...and readable. we assume this for now. .*/ - if (!test_pages_in_a_zone(start_pfn, end_pfn)) + if (!test_pages_in_a_zone(start_pfn, end_pfn, &valid_start, &valid_end)) return -EINVAL; - zone = page_zone(pfn_to_page(start_pfn)); + zone = page_zone(pfn_to_page(valid_start)); node = zone_to_nid(zone); nr_pages = end_pfn - start_pfn; -- cgit v1.2.3-71-gd317