From 5cc415001bca8fe0e3f0ee6d58a953a314dd9751 Mon Sep 17 00:00:00 2001 From: Vitaly Kuznetsov Date: Mon, 6 Apr 2020 12:41:52 +0200 Subject: Drivers: hv: avoid passing opaque pointer to vmbus_onmessage() vmbus_onmessage() doesn't need the header of the message, it only uses it to get to the payload, we can pass the pointer to the payload directly. Signed-off-by: Vitaly Kuznetsov Reviewed-by: Michael Kelley Link: https://lore.kernel.org/r/20200406104154.45010-4-vkuznets@redhat.com Signed-off-by: Wei Liu --- include/linux/hyperv.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/hyperv.h b/include/linux/hyperv.h index 692c89ccf5df..cbd24f4e68d1 100644 --- a/include/linux/hyperv.h +++ b/include/linux/hyperv.h @@ -1017,7 +1017,7 @@ static inline void clear_low_latency_mode(struct vmbus_channel *c) c->low_latency = false; } -void vmbus_onmessage(void *context); +void vmbus_onmessage(struct vmbus_channel_message_header *hdr); int vmbus_request_offers(void); -- cgit v1.2.3-71-gd317 From 8b6a877c060ed6b86878fe66c7c6493a6054cf23 Mon Sep 17 00:00:00 2001 From: "Andrea Parri (Microsoft)" Date: Mon, 6 Apr 2020 02:15:06 +0200 Subject: Drivers: hv: vmbus: Replace the per-CPU channel lists with a global array of channels When Hyper-V sends an interrupt to the guest, the guest has to figure out which channel the interrupt is associated with. Hyper-V sets a bit in a memory page that is shared with the guest, indicating a particular "relid" that the interrupt is associated with. The current Linux code then uses a set of per-CPU linked lists to map a given "relid" to a pointer to a channel structure. This design introduces a synchronization problem if the CPU that Hyper-V will interrupt for a certain channel is changed. If the interrupt comes on the "old CPU" and the channel was already moved to the per-CPU list of the "new CPU", then the relid -> channel mapping will fail and the interrupt is dropped. Similarly, if the interrupt comes on the new CPU but the channel was not moved to the per-CPU list of the new CPU, then the mapping will fail and the interrupt is dropped. Relids are integers ranging from 0 to 2047. The mapping from relids to channel structures can be done by setting up an array with 2048 entries, each entry being a pointer to a channel structure (hence total size ~16K bytes, which is not a problem). The array is global, so there are no per-CPU linked lists to update. The array can be searched and updated by loading from/storing to the array at the specified index. With no per-CPU data structures, the above mentioned synchronization problem is avoided and the relid2channel() function gets simpler. 
Suggested-by: Michael Kelley Signed-off-by: Andrea Parri (Microsoft) Link: https://lore.kernel.org/r/20200406001514.19876-4-parri.andrea@gmail.com Reviewed-by: Michael Kelley Signed-off-by: Wei Liu --- drivers/hv/channel_mgmt.c | 179 +++++++++++++++++++++++++++++----------------- drivers/hv/connection.c | 38 +++------- drivers/hv/hv.c | 2 - drivers/hv/hyperv_vmbus.h | 14 ++-- drivers/hv/vmbus_drv.c | 48 ++++++++----- include/linux/hyperv.h | 5 -- 6 files changed, 160 insertions(+), 126 deletions(-) (limited to 'include/linux') diff --git a/drivers/hv/channel_mgmt.c b/drivers/hv/channel_mgmt.c index 0ced32ef2715..bbd0eee4362f 100644 --- a/drivers/hv/channel_mgmt.c +++ b/drivers/hv/channel_mgmt.c @@ -319,7 +319,6 @@ static struct vmbus_channel *alloc_channel(void) init_completion(&channel->rescind_event); INIT_LIST_HEAD(&channel->sc_list); - INIT_LIST_HEAD(&channel->percpu_list); tasklet_init(&channel->callback_event, vmbus_on_event, (unsigned long)channel); @@ -340,23 +339,49 @@ static void free_channel(struct vmbus_channel *channel) kobject_put(&channel->kobj); } -static void percpu_channel_enq(void *arg) +void vmbus_channel_map_relid(struct vmbus_channel *channel) { - struct vmbus_channel *channel = arg; - struct hv_per_cpu_context *hv_cpu - = this_cpu_ptr(hv_context.cpu_context); - - list_add_tail_rcu(&channel->percpu_list, &hv_cpu->chan_list); + if (WARN_ON(channel->offermsg.child_relid >= MAX_CHANNEL_RELIDS)) + return; + /* + * The mapping of the channel's relid is visible from the CPUs that + * execute vmbus_chan_sched() by the time that vmbus_chan_sched() will + * execute: + * + * (a) In the "normal (i.e., not resuming from hibernation)" path, + * the full barrier in smp_store_mb() guarantees that the store + * is propagated to all CPUs before the add_channel_work work + * is queued. In turn, add_channel_work is queued before the + * channel's ring buffer is allocated/initialized and the + * OPENCHANNEL message for the channel is sent in vmbus_open(). + * Hyper-V won't start sending the interrupts for the channel + * before the OPENCHANNEL message is acked. The memory barrier + * in vmbus_chan_sched() -> sync_test_and_clear_bit() ensures + * that vmbus_chan_sched() must find the channel's relid in + * recv_int_page before retrieving the channel pointer from the + * array of channels. + * + * (b) In the "resuming from hibernation" path, the smp_store_mb() + * guarantees that the store is propagated to all CPUs before + * the VMBus connection is marked as ready for the resume event + * (cf. check_ready_for_resume_event()). The interrupt handler + * of the VMBus driver and vmbus_chan_sched() can not run before + * vmbus_bus_resume() has completed execution (cf. resume_noirq). 
+ */ + smp_store_mb( + vmbus_connection.channels[channel->offermsg.child_relid], + channel); } -static void percpu_channel_deq(void *arg) +void vmbus_channel_unmap_relid(struct vmbus_channel *channel) { - struct vmbus_channel *channel = arg; - - list_del_rcu(&channel->percpu_list); + if (WARN_ON(channel->offermsg.child_relid >= MAX_CHANNEL_RELIDS)) + return; + WRITE_ONCE( + vmbus_connection.channels[channel->offermsg.child_relid], + NULL); } - static void vmbus_release_relid(u32 relid) { struct vmbus_channel_relid_released msg; @@ -376,17 +401,25 @@ void hv_process_channel_removal(struct vmbus_channel *channel) struct vmbus_channel *primary_channel; unsigned long flags; - BUG_ON(!mutex_is_locked(&vmbus_connection.channel_mutex)); + lockdep_assert_held(&vmbus_connection.channel_mutex); BUG_ON(!channel->rescind); - if (channel->target_cpu != get_cpu()) { - put_cpu(); - smp_call_function_single(channel->target_cpu, - percpu_channel_deq, channel, true); - } else { - percpu_channel_deq(channel); - put_cpu(); - } + /* + * hv_process_channel_removal() could find INVALID_RELID only for + * hv_sock channels. See the inline comments in vmbus_onoffer(). + */ + WARN_ON(channel->offermsg.child_relid == INVALID_RELID && + !is_hvsock_channel(channel)); + + /* + * Upon suspend, an in-use hv_sock channel is removed from the array of + * channels and the relid is invalidated. After hibernation, when the + * user-space appplication destroys the channel, it's unnecessary and + * unsafe to remove the channel from the array of channels. See also + * the inline comments before the call of vmbus_release_relid() below. + */ + if (channel->offermsg.child_relid != INVALID_RELID) + vmbus_channel_unmap_relid(channel); if (channel->primary_channel == NULL) { list_del(&channel->listentry); @@ -447,16 +480,6 @@ static void vmbus_add_channel_work(struct work_struct *work) init_vp_index(newchannel, dev_type); - if (newchannel->target_cpu != get_cpu()) { - put_cpu(); - smp_call_function_single(newchannel->target_cpu, - percpu_channel_enq, - newchannel, true); - } else { - percpu_channel_enq(newchannel); - put_cpu(); - } - /* * This state is used to indicate a successful open * so that when we do close the channel normally, we @@ -523,17 +546,10 @@ err_deq_chan: spin_unlock_irqrestore(&primary_channel->lock, flags); } - mutex_unlock(&vmbus_connection.channel_mutex); + /* vmbus_process_offer() has mapped the channel. */ + vmbus_channel_unmap_relid(newchannel); - if (newchannel->target_cpu != get_cpu()) { - put_cpu(); - smp_call_function_single(newchannel->target_cpu, - percpu_channel_deq, - newchannel, true); - } else { - percpu_channel_deq(newchannel); - put_cpu(); - } + mutex_unlock(&vmbus_connection.channel_mutex); vmbus_release_relid(newchannel->offermsg.child_relid); @@ -599,6 +615,8 @@ static void vmbus_process_offer(struct vmbus_channel *newchannel) spin_unlock_irqrestore(&channel->lock, flags); } + vmbus_channel_map_relid(newchannel); + mutex_unlock(&vmbus_connection.channel_mutex); /* @@ -940,8 +958,6 @@ static void vmbus_onoffer(struct vmbus_channel_message_header *hdr) oldchannel = find_primary_channel_by_offer(offer); if (oldchannel != NULL) { - atomic_dec(&vmbus_connection.offer_in_progress); - /* * We're resuming from hibernation: all the sub-channel and * hv_sock channels we had before the hibernation should have @@ -949,36 +965,65 @@ static void vmbus_onoffer(struct vmbus_channel_message_header *hdr) * primary channel that we had before the hibernation. 
*/ + /* + * { Initially: channel relid = INVALID_RELID, + * channels[valid_relid] = NULL } + * + * CPU1 CPU2 + * + * [vmbus_onoffer()] [vmbus_device_release()] + * + * LOCK channel_mutex LOCK channel_mutex + * STORE channel relid = valid_relid LOAD r1 = channel relid + * MAP_RELID channel if (r1 != INVALID_RELID) + * UNLOCK channel_mutex UNMAP_RELID channel + * UNLOCK channel_mutex + * + * Forbids: r1 == valid_relid && + * channels[valid_relid] == channel + * + * Note. r1 can be INVALID_RELID only for an hv_sock channel. + * None of the hv_sock channels which were present before the + * suspend are re-offered upon the resume. See the WARN_ON() + * in hv_process_channel_removal(). + */ + mutex_lock(&vmbus_connection.channel_mutex); + + atomic_dec(&vmbus_connection.offer_in_progress); + WARN_ON(oldchannel->offermsg.child_relid != INVALID_RELID); /* Fix up the relid. */ oldchannel->offermsg.child_relid = offer->child_relid; offer_sz = sizeof(*offer); - if (memcmp(offer, &oldchannel->offermsg, offer_sz) == 0) { - check_ready_for_resume_event(); - return; + if (memcmp(offer, &oldchannel->offermsg, offer_sz) != 0) { + /* + * This is not an error, since the host can also change + * the other field(s) of the offer, e.g. on WS RS5 + * (Build 17763), the offer->connection_id of the + * Mellanox VF vmbus device can change when the host + * reoffers the device upon resume. + */ + pr_debug("vmbus offer changed: relid=%d\n", + offer->child_relid); + + print_hex_dump_debug("Old vmbus offer: ", + DUMP_PREFIX_OFFSET, 16, 4, + &oldchannel->offermsg, offer_sz, + false); + print_hex_dump_debug("New vmbus offer: ", + DUMP_PREFIX_OFFSET, 16, 4, + offer, offer_sz, false); + + /* Fix up the old channel. */ + vmbus_setup_channel_state(oldchannel, offer); } - /* - * This is not an error, since the host can also change the - * other field(s) of the offer, e.g. on WS RS5 (Build 17763), - * the offer->connection_id of the Mellanox VF vmbus device - * can change when the host reoffers the device upon resume. - */ - pr_debug("vmbus offer changed: relid=%d\n", - offer->child_relid); - - print_hex_dump_debug("Old vmbus offer: ", DUMP_PREFIX_OFFSET, - 16, 4, &oldchannel->offermsg, offer_sz, - false); - print_hex_dump_debug("New vmbus offer: ", DUMP_PREFIX_OFFSET, - 16, 4, offer, offer_sz, false); - - /* Fix up the old channel. */ - vmbus_setup_channel_state(oldchannel, offer); - + /* Add the channel back to the array of channels. 
*/ + vmbus_channel_map_relid(oldchannel); check_ready_for_resume_event(); + mutex_unlock(&vmbus_connection.channel_mutex); return; } @@ -1036,14 +1081,14 @@ static void vmbus_onoffer_rescind(struct vmbus_channel_message_header *hdr) * * CPU1 CPU2 * - * [vmbus_process_offer()] [vmbus_onoffer_rescind()] + * [vmbus_onoffer()] [vmbus_onoffer_rescind()] * * LOCK channel_mutex WAIT_ON offer_in_progress == 0 * DECREMENT offer_in_progress LOCK channel_mutex - * INSERT chn_list SEARCH chn_list + * STORE channels[] LOAD channels[] * UNLOCK channel_mutex UNLOCK channel_mutex * - * Forbids: CPU2's SEARCH from *not* seeing CPU1's INSERT + * Forbids: CPU2's LOAD from *not* seeing CPU1's STORE */ while (atomic_read(&vmbus_connection.offer_in_progress) != 0) { diff --git a/drivers/hv/connection.c b/drivers/hv/connection.c index f4bd306d2cef..11170d9a2e1a 100644 --- a/drivers/hv/connection.c +++ b/drivers/hv/connection.c @@ -248,6 +248,14 @@ int vmbus_connect(void) pr_info("Vmbus version:%d.%d\n", version >> 16, version & 0xFFFF); + vmbus_connection.channels = kcalloc(MAX_CHANNEL_RELIDS, + sizeof(struct vmbus_channel *), + GFP_KERNEL); + if (vmbus_connection.channels == NULL) { + ret = -ENOMEM; + goto cleanup; + } + kfree(msginfo); return 0; @@ -295,33 +303,9 @@ void vmbus_disconnect(void) */ struct vmbus_channel *relid2channel(u32 relid) { - struct vmbus_channel *channel; - struct vmbus_channel *found_channel = NULL; - struct list_head *cur, *tmp; - struct vmbus_channel *cur_sc; - - BUG_ON(!mutex_is_locked(&vmbus_connection.channel_mutex)); - - list_for_each_entry(channel, &vmbus_connection.chn_list, listentry) { - if (channel->offermsg.child_relid == relid) { - found_channel = channel; - break; - } else if (!list_empty(&channel->sc_list)) { - /* - * Deal with sub-channels. - */ - list_for_each_safe(cur, tmp, &channel->sc_list) { - cur_sc = list_entry(cur, struct vmbus_channel, - sc_list); - if (cur_sc->offermsg.child_relid == relid) { - found_channel = cur_sc; - break; - } - } - } - } - - return found_channel; + if (WARN_ON(relid >= MAX_CHANNEL_RELIDS)) + return NULL; + return READ_ONCE(vmbus_connection.channels[relid]); } /* diff --git a/drivers/hv/hv.c b/drivers/hv/hv.c index e2b331045464..17bf1f229152 100644 --- a/drivers/hv/hv.c +++ b/drivers/hv/hv.c @@ -117,8 +117,6 @@ int hv_synic_alloc(void) pr_err("Unable to allocate post msg page\n"); goto err; } - - INIT_LIST_HEAD(&hv_cpu->chan_list); } return 0; diff --git a/drivers/hv/hyperv_vmbus.h b/drivers/hv/hyperv_vmbus.h index 000740d053a2..3edf993b0fd9 100644 --- a/drivers/hv/hyperv_vmbus.h +++ b/drivers/hv/hyperv_vmbus.h @@ -132,12 +132,6 @@ struct hv_per_cpu_context { * basis. */ struct tasklet_struct msg_dpc; - - /* - * To optimize the mapping of relid to channel, maintain - * per-cpu list of the channels based on their CPU affinity. 
- */ - struct list_head chan_list; }; struct hv_context { @@ -202,6 +196,8 @@ int hv_ringbuffer_read(struct vmbus_channel *channel, /* TODO: Need to make this configurable */ #define MAX_NUM_CHANNELS_SUPPORTED 256 +#define MAX_CHANNEL_RELIDS \ + max(MAX_NUM_CHANNELS_SUPPORTED, HV_EVENT_FLAGS_COUNT) enum vmbus_connect_state { DISCONNECTED, @@ -251,6 +247,9 @@ struct vmbus_connection { struct list_head chn_list; struct mutex channel_mutex; + /* Array of channels */ + struct vmbus_channel **channels; + /* * An offer message is handled first on the work_queue, and then * is further handled on handle_primary_chan_wq or @@ -338,6 +337,9 @@ int vmbus_add_channel_kobj(struct hv_device *device_obj, void vmbus_remove_channel_attr_group(struct vmbus_channel *channel); +void vmbus_channel_map_relid(struct vmbus_channel *channel); +void vmbus_channel_unmap_relid(struct vmbus_channel *channel); + struct vmbus_channel *relid2channel(u32 relid); void vmbus_free_channels(void); diff --git a/drivers/hv/vmbus_drv.c b/drivers/hv/vmbus_drv.c index 1f3e69d18d35..21a01bca7fcd 100644 --- a/drivers/hv/vmbus_drv.c +++ b/drivers/hv/vmbus_drv.c @@ -1252,33 +1252,39 @@ static void vmbus_chan_sched(struct hv_per_cpu_context *hv_cpu) if (relid == 0) continue; + /* + * Pairs with the kfree_rcu() in vmbus_chan_release(). + * Guarantees that the channel data structure doesn't + * get freed while the channel pointer below is being + * dereferenced. + */ rcu_read_lock(); /* Find channel based on relid */ - list_for_each_entry_rcu(channel, &hv_cpu->chan_list, percpu_list) { - if (channel->offermsg.child_relid != relid) - continue; + channel = relid2channel(relid); + if (channel == NULL) + goto sched_unlock_rcu; - if (channel->rescind) - continue; + if (channel->rescind) + goto sched_unlock_rcu; - trace_vmbus_chan_sched(channel); + trace_vmbus_chan_sched(channel); - ++channel->interrupts; + ++channel->interrupts; - switch (channel->callback_mode) { - case HV_CALL_ISR: - vmbus_channel_isr(channel); - break; + switch (channel->callback_mode) { + case HV_CALL_ISR: + vmbus_channel_isr(channel); + break; - case HV_CALL_BATCHED: - hv_begin_read(&channel->inbound); - /* fallthrough */ - case HV_CALL_DIRECT: - tasklet_schedule(&channel->callback_event); - } + case HV_CALL_BATCHED: + hv_begin_read(&channel->inbound); + fallthrough; + case HV_CALL_DIRECT: + tasklet_schedule(&channel->callback_event); } +sched_unlock_rcu: rcu_read_unlock(); } } @@ -2264,9 +2270,12 @@ static int vmbus_bus_suspend(struct device *dev) list_for_each_entry(channel, &vmbus_connection.chn_list, listentry) { /* - * Invalidate the field. Upon resume, vmbus_onoffer() will fix - * up the field, and the other fields (if necessary). + * Remove the channel from the array of channels and invalidate + * the channel's relid. Upon resume, vmbus_onoffer() will fix + * up the relid (and other fields, if necessary) and add the + * channel back to the array. */ + vmbus_channel_unmap_relid(channel); channel->offermsg.child_relid = INVALID_RELID; if (is_hvsock_channel(channel)) { @@ -2502,6 +2511,7 @@ static void __exit vmbus_exit(void) hv_debug_rm_all_dir(); vmbus_free_channels(); + kfree(vmbus_connection.channels); if (ms_hyperv.misc_features & HV_FEATURE_GUEST_CRASH_MSR_AVAILABLE) { kmsg_dump_unregister(&hv_kmsg_dumper); diff --git a/include/linux/hyperv.h b/include/linux/hyperv.h index cbd24f4e68d1..f5b3f008c55a 100644 --- a/include/linux/hyperv.h +++ b/include/linux/hyperv.h @@ -854,11 +854,6 @@ struct vmbus_channel { * Support per-channel state for use by vmbus drivers. 
*/ void *per_channel_state; - /* - * To support per-cpu lookup mapping of relid to channel, - * link up channels based on their CPU affinity. - */ - struct list_head percpu_list; /* * Defer freeing channel until after all cpu's have -- cgit v1.2.3-71-gd317 From 9403b66e6161130ebae7e55a97491c84c1ad6f9f Mon Sep 17 00:00:00 2001 From: "Andrea Parri (Microsoft)" Date: Mon, 6 Apr 2020 02:15:09 +0200 Subject: Drivers: hv: vmbus: Use a spin lock for synchronizing channel scheduling vs. channel removal Since vmbus_chan_sched() dereferences the ring buffer pointer, we have to make sure that the ring buffer data structures don't get freed while such dereferencing is happening. Current code does this by sending an IPI to the CPU that is allowed to access that ring buffer from interrupt level, cf., vmbus_reset_channel_cb(). But with the new functionality to allow changing the CPU that a channel will interrupt, we can't be sure what CPU will be running the vmbus_chan_sched() function for a particular channel, so the current IPI mechanism is infeasible. Instead synchronize vmbus_chan_sched() and vmbus_reset_channel_cb() by using the (newly introduced) per-channel spin lock "sched_lock". Move the test for onchannel_callback being NULL before the "switch" control statement in vmbus_chan_sched(), in order to not access the ring buffer if the vmbus_reset_channel_cb() has been completed on the channel. Suggested-by: Michael Kelley Signed-off-by: Andrea Parri (Microsoft) Link: https://lore.kernel.org/r/20200406001514.19876-7-parri.andrea@gmail.com Reviewed-by: Michael Kelley Signed-off-by: Wei Liu --- drivers/hv/channel.c | 24 +++++++----------------- drivers/hv/channel_mgmt.c | 1 + drivers/hv/vmbus_drv.c | 30 +++++++++++++++++------------- include/linux/hyperv.h | 6 ++++++ 4 files changed, 31 insertions(+), 30 deletions(-) (limited to 'include/linux') diff --git a/drivers/hv/channel.c b/drivers/hv/channel.c index 256ee90c7446..132e476f87b2 100644 --- a/drivers/hv/channel.c +++ b/drivers/hv/channel.c @@ -594,15 +594,10 @@ post_msg_err: } EXPORT_SYMBOL_GPL(vmbus_teardown_gpadl); -static void reset_channel_cb(void *arg) -{ - struct vmbus_channel *channel = arg; - - channel->onchannel_callback = NULL; -} - void vmbus_reset_channel_cb(struct vmbus_channel *channel) { + unsigned long flags; + /* * vmbus_on_event(), running in the per-channel tasklet, can race * with vmbus_close_internal() in the case of SMP guest, e.g., when @@ -618,17 +613,12 @@ void vmbus_reset_channel_cb(struct vmbus_channel *channel) */ tasklet_disable(&channel->callback_event); - channel->sc_creation_callback = NULL; + /* See the inline comments in vmbus_chan_sched(). 
*/ + spin_lock_irqsave(&channel->sched_lock, flags); + channel->onchannel_callback = NULL; + spin_unlock_irqrestore(&channel->sched_lock, flags); - /* Stop the callback asap */ - if (channel->target_cpu != get_cpu()) { - put_cpu(); - smp_call_function_single(channel->target_cpu, reset_channel_cb, - channel, true); - } else { - reset_channel_cb(channel); - put_cpu(); - } + channel->sc_creation_callback = NULL; /* Re-enable tasklet for use on re-open */ tasklet_enable(&channel->callback_event); diff --git a/drivers/hv/channel_mgmt.c b/drivers/hv/channel_mgmt.c index bbd0eee4362f..2e730f84654b 100644 --- a/drivers/hv/channel_mgmt.c +++ b/drivers/hv/channel_mgmt.c @@ -315,6 +315,7 @@ static struct vmbus_channel *alloc_channel(void) if (!channel) return NULL; + spin_lock_init(&channel->sched_lock); spin_lock_init(&channel->lock); init_completion(&channel->rescind_event); diff --git a/drivers/hv/vmbus_drv.c b/drivers/hv/vmbus_drv.c index 21a01bca7fcd..0f7dfa507a40 100644 --- a/drivers/hv/vmbus_drv.c +++ b/drivers/hv/vmbus_drv.c @@ -1201,18 +1201,6 @@ static void vmbus_force_channel_rescinded(struct vmbus_channel *channel) } #endif /* CONFIG_PM_SLEEP */ -/* - * Direct callback for channels using other deferred processing - */ -static void vmbus_channel_isr(struct vmbus_channel *channel) -{ - void (*callback_fn)(void *); - - callback_fn = READ_ONCE(channel->onchannel_callback); - if (likely(callback_fn != NULL)) - (*callback_fn)(channel->channel_callback_context); -} - /* * Schedule all channels with events pending */ @@ -1243,6 +1231,7 @@ static void vmbus_chan_sched(struct hv_per_cpu_context *hv_cpu) return; for_each_set_bit(relid, recv_int_page, maxbits) { + void (*callback_fn)(void *context); struct vmbus_channel *channel; if (!sync_test_and_clear_bit(relid, recv_int_page)) @@ -1268,13 +1257,26 @@ static void vmbus_chan_sched(struct hv_per_cpu_context *hv_cpu) if (channel->rescind) goto sched_unlock_rcu; + /* + * Make sure that the ring buffer data structure doesn't get + * freed while we dereference the ring buffer pointer. Test + * for the channel's onchannel_callback being NULL within a + * sched_lock critical section. See also the inline comments + * in vmbus_reset_channel_cb(). + */ + spin_lock(&channel->sched_lock); + + callback_fn = channel->onchannel_callback; + if (unlikely(callback_fn == NULL)) + goto sched_unlock; + trace_vmbus_chan_sched(channel); ++channel->interrupts; switch (channel->callback_mode) { case HV_CALL_ISR: - vmbus_channel_isr(channel); + (*callback_fn)(channel->channel_callback_context); break; case HV_CALL_BATCHED: @@ -1284,6 +1286,8 @@ static void vmbus_chan_sched(struct hv_per_cpu_context *hv_cpu) tasklet_schedule(&channel->callback_event); } +sched_unlock: + spin_unlock(&channel->sched_lock); sched_unlock_rcu: rcu_read_unlock(); } diff --git a/include/linux/hyperv.h b/include/linux/hyperv.h index f5b3f008c55a..62b2c11d0875 100644 --- a/include/linux/hyperv.h +++ b/include/linux/hyperv.h @@ -771,6 +771,12 @@ struct vmbus_channel { void (*onchannel_callback)(void *context); void *channel_callback_context; + /* + * Synchronize channel scheduling and channel removal; see the inline + * comments in vmbus_chan_sched() and vmbus_reset_channel_cb(). 
+ */ + spinlock_t sched_lock; + /* * A channel can be marked for one of three modes of reading: * BATCHED - callback called from taslket and should read -- cgit v1.2.3-71-gd317 From 8ef4c4abbbcdcd9d4bc0fd9454df03e6dac24b73 Mon Sep 17 00:00:00 2001 From: "Andrea Parri (Microsoft)" Date: Mon, 6 Apr 2020 02:15:11 +0200 Subject: Drivers: hv: vmbus: Remove the unused HV_LOCALIZED channel affinity logic The logic is unused since commit 509879bdb30b8 ("Drivers: hv: Introduce a policy for controlling channel affinity"). This logic assumes that a channel target_cpu doesn't change during the lifetime of a channel, but this assumption is incompatible with the new functionality that allows changing the vCPU a channel will interrupt. Signed-off-by: Andrea Parri (Microsoft) Link: https://lore.kernel.org/r/20200406001514.19876-9-parri.andrea@gmail.com Reviewed-by: Michael Kelley Signed-off-by: Wei Liu --- drivers/hv/channel_mgmt.c | 105 +++++++++++----------------------------------- include/linux/hyperv.h | 27 ------------ 2 files changed, 25 insertions(+), 107 deletions(-) (limited to 'include/linux') diff --git a/drivers/hv/channel_mgmt.c b/drivers/hv/channel_mgmt.c index 2e730f84654b..a19dc28fdf77 100644 --- a/drivers/hv/channel_mgmt.c +++ b/drivers/hv/channel_mgmt.c @@ -433,14 +433,6 @@ void hv_process_channel_removal(struct vmbus_channel *channel) spin_unlock_irqrestore(&primary_channel->lock, flags); } - /* - * We need to free the bit for init_vp_index() to work in the case - * of sub-channel, when we reload drivers like hv_netvsc. - */ - if (channel->affinity_policy == HV_LOCALIZED) - cpumask_clear_cpu(channel->target_cpu, - &primary_channel->alloced_cpus_in_node); - /* * Upon suspend, an in-use hv_sock channel is marked as "rescinded" and * the relid is invalidated; after hibernation, when the user-space app @@ -662,20 +654,21 @@ static DEFINE_SPINLOCK(bind_channel_to_cpu_lock); /* * Starting with Win8, we can statically distribute the incoming * channel interrupt load by binding a channel to VCPU. - * We distribute the interrupt loads to one or more NUMA nodes based on - * the channel's affinity_policy. * * For pre-win8 hosts or non-performance critical channels we assign the * first CPU in the first NUMA node. + * + * Starting with win8, performance critical channels will be distributed + * evenly among all the available NUMA nodes. Once the node is assigned, + * we will assign the CPU based on a simple round robin scheme. */ static void init_vp_index(struct vmbus_channel *channel, u16 dev_type) { - u32 cur_cpu; bool perf_chn = vmbus_devs[dev_type].perf_device; - struct vmbus_channel *primary = channel->primary_channel; - int next_node; cpumask_var_t available_mask; struct cpumask *alloced_mask; + u32 target_cpu; + int numa_node; if ((vmbus_proto_version == VERSION_WS2008) || (vmbus_proto_version == VERSION_WIN7) || (!perf_chn) || @@ -693,31 +686,27 @@ static void init_vp_index(struct vmbus_channel *channel, u16 dev_type) return; } - spin_lock(&bind_channel_to_cpu_lock); - /* - * Based on the channel affinity policy, we will assign the NUMA - * nodes. + * Serializes the accesses to the global variable next_numa_node_id. + * See also the header comment of the spin lock declaration. 
*/ + spin_lock(&bind_channel_to_cpu_lock); - if ((channel->affinity_policy == HV_BALANCED) || (!primary)) { - while (true) { - next_node = next_numa_node_id++; - if (next_node == nr_node_ids) { - next_node = next_numa_node_id = 0; - continue; - } - if (cpumask_empty(cpumask_of_node(next_node))) - continue; - break; + while (true) { + numa_node = next_numa_node_id++; + if (numa_node == nr_node_ids) { + next_numa_node_id = 0; + continue; } - channel->numa_node = next_node; - primary = channel; + if (cpumask_empty(cpumask_of_node(numa_node))) + continue; + break; } - alloced_mask = &hv_context.hv_numa_map[primary->numa_node]; + channel->numa_node = numa_node; + alloced_mask = &hv_context.hv_numa_map[numa_node]; if (cpumask_weight(alloced_mask) == - cpumask_weight(cpumask_of_node(primary->numa_node))) { + cpumask_weight(cpumask_of_node(numa_node))) { /* * We have cycled through all the CPUs in the node; * reset the alloced map. @@ -725,57 +714,13 @@ static void init_vp_index(struct vmbus_channel *channel, u16 dev_type) cpumask_clear(alloced_mask); } - cpumask_xor(available_mask, alloced_mask, - cpumask_of_node(primary->numa_node)); + cpumask_xor(available_mask, alloced_mask, cpumask_of_node(numa_node)); - cur_cpu = -1; - - if (primary->affinity_policy == HV_LOCALIZED) { - /* - * Normally Hyper-V host doesn't create more subchannels - * than there are VCPUs on the node but it is possible when not - * all present VCPUs on the node are initialized by guest. - * Clear the alloced_cpus_in_node to start over. - */ - if (cpumask_equal(&primary->alloced_cpus_in_node, - cpumask_of_node(primary->numa_node))) - cpumask_clear(&primary->alloced_cpus_in_node); - } - - while (true) { - cur_cpu = cpumask_next(cur_cpu, available_mask); - if (cur_cpu >= nr_cpu_ids) { - cur_cpu = -1; - cpumask_copy(available_mask, - cpumask_of_node(primary->numa_node)); - continue; - } - - if (primary->affinity_policy == HV_LOCALIZED) { - /* - * NOTE: in the case of sub-channel, we clear the - * sub-channel related bit(s) in - * primary->alloced_cpus_in_node in - * hv_process_channel_removal(), so when we - * reload drivers like hv_netvsc in SMP guest, here - * we're able to re-allocate - * bit from primary->alloced_cpus_in_node. - */ - if (!cpumask_test_cpu(cur_cpu, - &primary->alloced_cpus_in_node)) { - cpumask_set_cpu(cur_cpu, - &primary->alloced_cpus_in_node); - cpumask_set_cpu(cur_cpu, alloced_mask); - break; - } - } else { - cpumask_set_cpu(cur_cpu, alloced_mask); - break; - } - } + target_cpu = cpumask_first(available_mask); + cpumask_set_cpu(target_cpu, alloced_mask); - channel->target_cpu = cur_cpu; - channel->target_vp = hv_cpu_number_to_vp_number(cur_cpu); + channel->target_cpu = target_cpu; + channel->target_vp = hv_cpu_number_to_vp_number(target_cpu); spin_unlock(&bind_channel_to_cpu_lock); diff --git a/include/linux/hyperv.h b/include/linux/hyperv.h index 62b2c11d0875..247356dbd742 100644 --- a/include/linux/hyperv.h +++ b/include/linux/hyperv.h @@ -689,11 +689,6 @@ union hv_connection_id { } u; }; -enum hv_numa_policy { - HV_BALANCED = 0, - HV_LOCALIZED, -}; - enum vmbus_device_type { HV_IDE = 0, HV_SCSI, @@ -808,10 +803,6 @@ struct vmbus_channel { u32 target_vp; /* The corresponding CPUID in the guest */ u32 target_cpu; - /* - * State to manage the CPU affiliation of channels. - */ - struct cpumask alloced_cpus_in_node; int numa_node; /* * Support for sub-channels. 
For high performance devices, @@ -898,18 +889,6 @@ struct vmbus_channel { */ bool low_latency; - /* - * NUMA distribution policy: - * We support two policies: - * 1) Balanced: Here all performance critical channels are - * distributed evenly amongst all the NUMA nodes. - * This policy will be the default policy. - * 2) Localized: All channels of a given instance of a - * performance critical service will be assigned CPUs - * within a selected NUMA node. - */ - enum hv_numa_policy affinity_policy; - bool probe_done; /* @@ -965,12 +944,6 @@ static inline bool is_sub_channel(const struct vmbus_channel *c) return c->offermsg.offer.sub_channel_index != 0; } -static inline void set_channel_affinity_state(struct vmbus_channel *c, - enum hv_numa_policy policy) -{ - c->affinity_policy = policy; -} - static inline void set_channel_read_mode(struct vmbus_channel *c, enum hv_callback_mode mode) { -- cgit v1.2.3-71-gd317 From 7527810573436f00e582d3d5ef2eb3c027c98d7d Mon Sep 17 00:00:00 2001 From: "Andrea Parri (Microsoft)" Date: Mon, 6 Apr 2020 02:15:13 +0200 Subject: Drivers: hv: vmbus: Introduce the CHANNELMSG_MODIFYCHANNEL message type VMBus version 4.1 and later support the CHANNELMSG_MODIFYCHANNEL(22) message type which can be used to request Hyper-V to change the vCPU that a channel will interrupt. Introduce the CHANNELMSG_MODIFYCHANNEL message type, and define the vmbus_send_modifychannel() function to send CHANNELMSG_MODIFYCHANNEL requests to the host via a hypercall. The function is then used to define a sysfs "store" operation, which allows to change the (v)CPU the channel will interrupt by using the sysfs interface. The feature can be used for load balancing or other purposes. One interesting catch here is that Hyper-V can *not* currently ACK CHANNELMSG_MODIFYCHANNEL messages with the promise that (after the ACK is sent) the channel won't send any more interrupts to the "old" CPU. The peculiarity of the CHANNELMSG_MODIFYCHANNEL messages is problematic if the user want to take a CPU offline, since we don't want to take a CPU offline (and, potentially, "lose" channel interrupts on such CPU) if the host is still processing a CHANNELMSG_MODIFYCHANNEL message associated to that CPU. It is worth mentioning, however, that we have been unable to observe the above mentioned "race": in all our tests, CHANNELMSG_MODIFYCHANNEL requests appeared *as if* they were processed synchronously by the host. Suggested-by: Michael Kelley Signed-off-by: Andrea Parri (Microsoft) Link: https://lore.kernel.org/r/20200406001514.19876-11-parri.andrea@gmail.com Reviewed-by: Michael Kelley [ wei: fix conflict in channel_mgmt.c ] Signed-off-by: Wei Liu --- drivers/hv/channel.c | 28 ++++++++++++ drivers/hv/channel_mgmt.c | 2 +- drivers/hv/hv_trace.h | 19 ++++++++ drivers/hv/vmbus_drv.c | 108 +++++++++++++++++++++++++++++++++++++++++++++- include/linux/hyperv.h | 10 ++++- 5 files changed, 163 insertions(+), 4 deletions(-) (limited to 'include/linux') diff --git a/drivers/hv/channel.c b/drivers/hv/channel.c index 132e476f87b2..90070b337c10 100644 --- a/drivers/hv/channel.c +++ b/drivers/hv/channel.c @@ -289,6 +289,34 @@ int vmbus_send_tl_connect_request(const guid_t *shv_guest_servie_id, } EXPORT_SYMBOL_GPL(vmbus_send_tl_connect_request); +/* + * Set/change the vCPU (@target_vp) the channel (@child_relid) will interrupt. + * + * CHANNELMSG_MODIFYCHANNEL messages are aynchronous. Also, Hyper-V does not + * ACK such messages. 
IOW we can't know when the host will stop interrupting + * the "old" vCPU and start interrupting the "new" vCPU for the given channel. + * + * The CHANNELMSG_MODIFYCHANNEL message type is supported since VMBus version + * VERSION_WIN10_V4_1. + */ +int vmbus_send_modifychannel(u32 child_relid, u32 target_vp) +{ + struct vmbus_channel_modifychannel conn_msg; + int ret; + + memset(&conn_msg, 0, sizeof(conn_msg)); + conn_msg.header.msgtype = CHANNELMSG_MODIFYCHANNEL; + conn_msg.child_relid = child_relid; + conn_msg.target_vp = target_vp; + + ret = vmbus_post_msg(&conn_msg, sizeof(conn_msg), true); + + trace_vmbus_send_modifychannel(&conn_msg, ret); + + return ret; +} +EXPORT_SYMBOL_GPL(vmbus_send_modifychannel); + /* * create_gpadl_header - Creates a gpadl for the specified buffer */ diff --git a/drivers/hv/channel_mgmt.c b/drivers/hv/channel_mgmt.c index 2db3823f0e59..ffd7fffa5f83 100644 --- a/drivers/hv/channel_mgmt.c +++ b/drivers/hv/channel_mgmt.c @@ -1383,7 +1383,7 @@ channel_message_table[CHANNELMSG_COUNT] = { { CHANNELMSG_19, 0, NULL, 0}, { CHANNELMSG_20, 0, NULL, 0}, { CHANNELMSG_TL_CONNECT_REQUEST, 0, NULL, 0}, - { CHANNELMSG_22, 0, NULL, 0}, + { CHANNELMSG_MODIFYCHANNEL, 0, NULL, 0}, { CHANNELMSG_TL_CONNECT_RESULT, 0, NULL, 0}, }; diff --git a/drivers/hv/hv_trace.h b/drivers/hv/hv_trace.h index e70783e33680..a43bc76c2d5d 100644 --- a/drivers/hv/hv_trace.h +++ b/drivers/hv/hv_trace.h @@ -296,6 +296,25 @@ TRACE_EVENT(vmbus_send_tl_connect_request, ) ); +TRACE_EVENT(vmbus_send_modifychannel, + TP_PROTO(const struct vmbus_channel_modifychannel *msg, + int ret), + TP_ARGS(msg, ret), + TP_STRUCT__entry( + __field(u32, child_relid) + __field(u32, target_vp) + __field(int, ret) + ), + TP_fast_assign( + __entry->child_relid = msg->child_relid; + __entry->target_vp = msg->target_vp; + __entry->ret = ret; + ), + TP_printk("binding child_relid 0x%x to target_vp 0x%x, ret %d", + __entry->child_relid, __entry->target_vp, __entry->ret + ) + ); + DECLARE_EVENT_CLASS(vmbus_channel, TP_PROTO(const struct vmbus_channel *channel), TP_ARGS(channel), diff --git a/drivers/hv/vmbus_drv.c b/drivers/hv/vmbus_drv.c index 0f7dfa507a40..5d24b25fb5aa 100644 --- a/drivers/hv/vmbus_drv.c +++ b/drivers/hv/vmbus_drv.c @@ -1606,8 +1606,24 @@ static ssize_t vmbus_chan_attr_show(struct kobject *kobj, return attribute->show(chan, buf); } +static ssize_t vmbus_chan_attr_store(struct kobject *kobj, + struct attribute *attr, const char *buf, + size_t count) +{ + const struct vmbus_chan_attribute *attribute + = container_of(attr, struct vmbus_chan_attribute, attr); + struct vmbus_channel *chan + = container_of(kobj, struct vmbus_channel, kobj); + + if (!attribute->store) + return -EIO; + + return attribute->store(chan, buf, count); +} + static const struct sysfs_ops vmbus_chan_sysfs_ops = { .show = vmbus_chan_attr_show, + .store = vmbus_chan_attr_store, }; static ssize_t out_mask_show(struct vmbus_channel *channel, char *buf) @@ -1678,11 +1694,99 @@ static ssize_t write_avail_show(struct vmbus_channel *channel, char *buf) } static VMBUS_CHAN_ATTR_RO(write_avail); -static ssize_t show_target_cpu(struct vmbus_channel *channel, char *buf) +static ssize_t target_cpu_show(struct vmbus_channel *channel, char *buf) { return sprintf(buf, "%u\n", channel->target_cpu); } -static VMBUS_CHAN_ATTR(cpu, S_IRUGO, show_target_cpu, NULL); +static ssize_t target_cpu_store(struct vmbus_channel *channel, + const char *buf, size_t count) +{ + ssize_t ret = count; + u32 target_cpu; + + if (vmbus_proto_version < VERSION_WIN10_V4_1) + return -EIO; 
+ + if (sscanf(buf, "%uu", &target_cpu) != 1) + return -EIO; + + /* Validate target_cpu for the cpumask_test_cpu() operation below. */ + if (target_cpu >= nr_cpumask_bits) + return -EINVAL; + + /* No CPUs should come up or down during this. */ + cpus_read_lock(); + + if (!cpumask_test_cpu(target_cpu, cpu_online_mask)) { + cpus_read_unlock(); + return -EINVAL; + } + + /* + * Synchronizes target_cpu_store() and channel closure: + * + * { Initially: state = CHANNEL_OPENED } + * + * CPU1 CPU2 + * + * [target_cpu_store()] [vmbus_disconnect_ring()] + * + * LOCK channel_mutex LOCK channel_mutex + * LOAD r1 = state LOAD r2 = state + * IF (r1 == CHANNEL_OPENED) IF (r2 == CHANNEL_OPENED) + * SEND MODIFYCHANNEL STORE state = CHANNEL_OPEN + * [...] SEND CLOSECHANNEL + * UNLOCK channel_mutex UNLOCK channel_mutex + * + * Forbids: r1 == r2 == CHANNEL_OPENED (i.e., CPU1's LOCK precedes + * CPU2's LOCK) && CPU2's SEND precedes CPU1's SEND + * + * Note. The host processes the channel messages "sequentially", in + * the order in which they are received on a per-partition basis. + */ + mutex_lock(&vmbus_connection.channel_mutex); + + /* + * Hyper-V will ignore MODIFYCHANNEL messages for "non-open" channels; + * avoid sending the message and fail here for such channels. + */ + if (channel->state != CHANNEL_OPENED_STATE) { + ret = -EIO; + goto cpu_store_unlock; + } + + if (channel->target_cpu == target_cpu) + goto cpu_store_unlock; + + if (vmbus_send_modifychannel(channel->offermsg.child_relid, + hv_cpu_number_to_vp_number(target_cpu))) { + ret = -EIO; + goto cpu_store_unlock; + } + + /* + * Warning. At this point, there is *no* guarantee that the host will + * have successfully processed the vmbus_send_modifychannel() request. + * See the header comment of vmbus_send_modifychannel() for more info. + * + * Lags in the processing of the above vmbus_send_modifychannel() can + * result in missed interrupts if the "old" target CPU is taken offline + * before Hyper-V starts sending interrupts to the "new" target CPU. + * But apart from this offlining scenario, the code tolerates such + * lags. It will function correctly even if a channel interrupt comes + * in on a CPU that is different from the channel target_cpu value. + */ + + channel->target_cpu = target_cpu; + channel->target_vp = hv_cpu_number_to_vp_number(target_cpu); + channel->numa_node = cpu_to_node(target_cpu); + +cpu_store_unlock: + mutex_unlock(&vmbus_connection.channel_mutex); + cpus_read_unlock(); + return ret; +} +static VMBUS_CHAN_ATTR(cpu, 0644, target_cpu_show, target_cpu_store); static ssize_t channel_pending_show(struct vmbus_channel *channel, char *buf) diff --git a/include/linux/hyperv.h b/include/linux/hyperv.h index 247356dbd742..b85d7580f2c1 100644 --- a/include/linux/hyperv.h +++ b/include/linux/hyperv.h @@ -425,7 +425,7 @@ enum vmbus_channel_message_type { CHANNELMSG_19 = 19, CHANNELMSG_20 = 20, CHANNELMSG_TL_CONNECT_REQUEST = 21, - CHANNELMSG_22 = 22, + CHANNELMSG_MODIFYCHANNEL = 22, CHANNELMSG_TL_CONNECT_RESULT = 23, CHANNELMSG_COUNT }; @@ -620,6 +620,13 @@ struct vmbus_channel_tl_connect_request { guid_t host_service_id; } __packed; +/* Modify Channel parameters, cf. 
vmbus_send_modifychannel() */ +struct vmbus_channel_modifychannel { + struct vmbus_channel_message_header header; + u32 child_relid; + u32 target_vp; +} __packed; + struct vmbus_channel_version_response { struct vmbus_channel_message_header header; u8 version_supported; @@ -1505,6 +1512,7 @@ extern __u32 vmbus_proto_version; int vmbus_send_tl_connect_request(const guid_t *shv_guest_servie_id, const guid_t *shv_host_servie_id); +int vmbus_send_modifychannel(u32 child_relid, u32 target_vp); void vmbus_set_event(struct vmbus_channel *channel); /* Get the start of the ring buffer. */ -- cgit v1.2.3-71-gd317 From 7769e18c201aa88eade5556faf9da7f2bc15bb8a Mon Sep 17 00:00:00 2001 From: "Andrea Parri (Microsoft)" Date: Mon, 6 Apr 2020 02:15:14 +0200 Subject: scsi: storvsc: Re-init stor_chns when a channel interrupt is re-assigned For each storvsc_device, storvsc keeps track of the channel target CPUs associated to the device (alloced_cpus) and it uses this information to fill a "cache" (stor_chns) mapping CPU->channel according to a certain heuristic. Update the alloced_cpus mask and the stor_chns array when a channel of the storvsc device is re-assigned to a different CPU. Signed-off-by: Andrea Parri (Microsoft) Cc: "James E.J. Bottomley" Cc: "Martin K. Petersen" Cc: Link: https://lore.kernel.org/r/20200406001514.19876-12-parri.andrea@gmail.com Reviewed-by; Long Li Reviewed-by: Michael Kelley [ wei: fix a small issue reported by kbuild test robot ] Signed-off-by: Wei Liu --- drivers/hv/vmbus_drv.c | 4 ++ drivers/scsi/storvsc_drv.c | 96 ++++++++++++++++++++++++++++++++++++++++++---- include/linux/hyperv.h | 3 ++ 3 files changed, 95 insertions(+), 8 deletions(-) (limited to 'include/linux') diff --git a/drivers/hv/vmbus_drv.c b/drivers/hv/vmbus_drv.c index 5d24b25fb5aa..28c009c7a9cd 100644 --- a/drivers/hv/vmbus_drv.c +++ b/drivers/hv/vmbus_drv.c @@ -1777,6 +1777,10 @@ static ssize_t target_cpu_store(struct vmbus_channel *channel, * in on a CPU that is different from the channel target_cpu value. */ + if (channel->change_target_cpu_callback) + (*channel->change_target_cpu_callback)(channel, + channel->target_cpu, target_cpu); + channel->target_cpu = target_cpu; channel->target_vp = hv_cpu_number_to_vp_number(target_cpu); channel->numa_node = cpu_to_node(target_cpu); diff --git a/drivers/scsi/storvsc_drv.c b/drivers/scsi/storvsc_drv.c index fb41636519ee..072ed8728657 100644 --- a/drivers/scsi/storvsc_drv.c +++ b/drivers/scsi/storvsc_drv.c @@ -621,6 +621,64 @@ get_in_err: } +static void storvsc_change_target_cpu(struct vmbus_channel *channel, u32 old, + u32 new) +{ + struct storvsc_device *stor_device; + struct vmbus_channel *cur_chn; + bool old_is_alloced = false; + struct hv_device *device; + unsigned long flags; + int cpu; + + device = channel->primary_channel ? + channel->primary_channel->device_obj + : channel->device_obj; + stor_device = get_out_stor_device(device); + if (!stor_device) + return; + + /* See storvsc_do_io() -> get_og_chn(). */ + spin_lock_irqsave(&device->channel->lock, flags); + + /* + * Determines if the storvsc device has other channels assigned to + * the "old" CPU to update the alloced_cpus mask and the stor_chns + * array. 
+ */ + if (device->channel != channel && device->channel->target_cpu == old) { + cur_chn = device->channel; + old_is_alloced = true; + goto old_is_alloced; + } + list_for_each_entry(cur_chn, &device->channel->sc_list, sc_list) { + if (cur_chn == channel) + continue; + if (cur_chn->target_cpu == old) { + old_is_alloced = true; + goto old_is_alloced; + } + } + +old_is_alloced: + if (old_is_alloced) + WRITE_ONCE(stor_device->stor_chns[old], cur_chn); + else + cpumask_clear_cpu(old, &stor_device->alloced_cpus); + + /* "Flush" the stor_chns array. */ + for_each_possible_cpu(cpu) { + if (stor_device->stor_chns[cpu] && !cpumask_test_cpu( + cpu, &stor_device->alloced_cpus)) + WRITE_ONCE(stor_device->stor_chns[cpu], NULL); + } + + WRITE_ONCE(stor_device->stor_chns[new], channel); + cpumask_set_cpu(new, &stor_device->alloced_cpus); + + spin_unlock_irqrestore(&device->channel->lock, flags); +} + static void handle_sc_creation(struct vmbus_channel *new_sc) { struct hv_device *device = new_sc->primary_channel->device_obj; @@ -648,6 +706,8 @@ static void handle_sc_creation(struct vmbus_channel *new_sc) return; } + new_sc->change_target_cpu_callback = storvsc_change_target_cpu; + /* Add the sub-channel to the array of available channels. */ stor_device->stor_chns[new_sc->target_cpu] = new_sc; cpumask_set_cpu(new_sc->target_cpu, &stor_device->alloced_cpus); @@ -876,6 +936,8 @@ static int storvsc_channel_init(struct hv_device *device, bool is_fc) if (stor_device->stor_chns == NULL) return -ENOMEM; + device->channel->change_target_cpu_callback = storvsc_change_target_cpu; + stor_device->stor_chns[device->channel->target_cpu] = device->channel; cpumask_set_cpu(device->channel->target_cpu, &stor_device->alloced_cpus); @@ -1248,8 +1310,10 @@ static struct vmbus_channel *get_og_chn(struct storvsc_device *stor_device, const struct cpumask *node_mask; int num_channels, tgt_cpu; - if (stor_device->num_sc == 0) + if (stor_device->num_sc == 0) { + stor_device->stor_chns[q_num] = stor_device->device->channel; return stor_device->device->channel; + } /* * Our channel array is sparsley populated and we @@ -1258,7 +1322,6 @@ static struct vmbus_channel *get_og_chn(struct storvsc_device *stor_device, * The strategy is simple: * I. Ensure NUMA locality * II. Distribute evenly (best effort) - * III. Mapping is persistent. */ node_mask = cpumask_of_node(cpu_to_node(q_num)); @@ -1268,8 +1331,10 @@ static struct vmbus_channel *get_og_chn(struct storvsc_device *stor_device, if (cpumask_test_cpu(tgt_cpu, node_mask)) num_channels++; } - if (num_channels == 0) + if (num_channels == 0) { + stor_device->stor_chns[q_num] = stor_device->device->channel; return stor_device->device->channel; + } hash_qnum = q_num; while (hash_qnum >= num_channels) @@ -1295,6 +1360,7 @@ static int storvsc_do_io(struct hv_device *device, struct storvsc_device *stor_device; struct vstor_packet *vstor_packet; struct vmbus_channel *outgoing_channel, *channel; + unsigned long flags; int ret = 0; const struct cpumask *node_mask; int tgt_cpu; @@ -1308,10 +1374,11 @@ static int storvsc_do_io(struct hv_device *device, request->device = device; /* - * Select an an appropriate channel to send the request out. + * Select an appropriate channel to send the request out. */ - if (stor_device->stor_chns[q_num] != NULL) { - outgoing_channel = stor_device->stor_chns[q_num]; + /* See storvsc_change_target_cpu(). 
*/ + outgoing_channel = READ_ONCE(stor_device->stor_chns[q_num]); + if (outgoing_channel != NULL) { if (outgoing_channel->target_cpu == q_num) { /* * Ideally, we want to pick a different channel if @@ -1324,7 +1391,10 @@ static int storvsc_do_io(struct hv_device *device, continue; if (tgt_cpu == q_num) continue; - channel = stor_device->stor_chns[tgt_cpu]; + channel = READ_ONCE( + stor_device->stor_chns[tgt_cpu]); + if (channel == NULL) + continue; if (hv_get_avail_to_write_percent( &channel->outbound) > ring_avail_percent_lowater) { @@ -1350,7 +1420,10 @@ static int storvsc_do_io(struct hv_device *device, for_each_cpu(tgt_cpu, &stor_device->alloced_cpus) { if (cpumask_test_cpu(tgt_cpu, node_mask)) continue; - channel = stor_device->stor_chns[tgt_cpu]; + channel = READ_ONCE( + stor_device->stor_chns[tgt_cpu]); + if (channel == NULL) + continue; if (hv_get_avail_to_write_percent( &channel->outbound) > ring_avail_percent_lowater) { @@ -1360,7 +1433,14 @@ static int storvsc_do_io(struct hv_device *device, } } } else { + spin_lock_irqsave(&device->channel->lock, flags); + outgoing_channel = stor_device->stor_chns[q_num]; + if (outgoing_channel != NULL) { + spin_unlock_irqrestore(&device->channel->lock, flags); + goto found_channel; + } outgoing_channel = get_og_chn(stor_device, q_num); + spin_unlock_irqrestore(&device->channel->lock, flags); } found_channel: diff --git a/include/linux/hyperv.h b/include/linux/hyperv.h index b85d7580f2c1..cd64ab7bb3b7 100644 --- a/include/linux/hyperv.h +++ b/include/linux/hyperv.h @@ -773,6 +773,9 @@ struct vmbus_channel { void (*onchannel_callback)(void *context); void *channel_callback_context; + void (*change_target_cpu_callback)(struct vmbus_channel *channel, + u32 old, u32 new); + /* * Synchronize channel scheduling and channel removal; see the inline * comments in vmbus_chan_sched() and vmbus_reset_channel_cb(). -- cgit v1.2.3-71-gd317 From b7d18c57c94a38a81c1dbcfee0d63153ffaf1856 Mon Sep 17 00:00:00 2001 From: Andy Shevchenko Date: Thu, 23 Apr 2020 16:45:05 +0300 Subject: hyper-v: Switch to use UUID types directly uuid_le is an alias for guid_t and is going to be removed in the future. Replace it with original type. Signed-off-by: Andy Shevchenko Link: https://lore.kernel.org/r/20200423134505.78221-4-andriy.shevchenko@linux.intel.com Signed-off-by: Wei Liu --- include/linux/mod_devicetable.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/mod_devicetable.h b/include/linux/mod_devicetable.h index 4c2ddd0941a7..f36b1cf5ffe3 100644 --- a/include/linux/mod_devicetable.h +++ b/include/linux/mod_devicetable.h @@ -434,7 +434,7 @@ struct virtio_device_id { * For Hyper-V devices we use the device guid as the id. */ struct hv_vmbus_device_id { - uuid_le guid; + guid_t guid; kernel_ulong_t driver_data; /* Data private to the driver */ }; -- cgit v1.2.3-71-gd317 From db5871e85533f06ffe1795c1cb9b9153860d1e4f Mon Sep 17 00:00:00 2001 From: "Gustavo A. R. 
Silva" Date: Thu, 7 May 2020 13:53:23 -0500 Subject: vmbus: Replace zero-length array with flexible-array The current codebase makes use of the zero-length array language extension to the C90 standard, but the preferred mechanism to declare variable-length types such as these ones is a flexible array member[1][2], introduced in C99: struct foo { int stuff; struct boo array[]; }; By making use of the mechanism above, we will get a compiler warning in case the flexible array does not occur last in the structure, which will help us prevent some kind of undefined behavior bugs from being inadvertently introduced[3] to the codebase from now on. Also, notice that, dynamic memory allocations won't be affected by this change: "Flexible array members have incomplete type, and so the sizeof operator may not be applied. As a quirk of the original implementation of zero-length arrays, sizeof evaluates to zero."[1] sizeof(flexible-array-member) triggers a warning because flexible array members have incomplete type[1]. There are some instances of code in which the sizeof operator is being incorrectly/erroneously applied to zero-length arrays and the result is zero. Such instances may be hiding some bugs. So, this work (flexible-array member conversions) will also help to get completely rid of those sorts of issues. This issue was found with the help of Coccinelle. [1] https://gcc.gnu.org/onlinedocs/gcc/Zero-Length.html [2] https://github.com/KSPP/linux/issues/21 [3] commit 76497732932f ("cxgb3/l2t: Fix undefined behaviour") Signed-off-by: Gustavo A. R. Silva Link: https://lore.kernel.org/r/20200507185323.GA14416@embeddedor Signed-off-by: Wei Liu --- include/linux/hyperv.h | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) (limited to 'include/linux') diff --git a/include/linux/hyperv.h b/include/linux/hyperv.h index cd64ab7bb3b7..d783847d8cb4 100644 --- a/include/linux/hyperv.h +++ b/include/linux/hyperv.h @@ -117,7 +117,7 @@ struct hv_ring_buffer { * Ring data starts here + RingDataStartOffset * !!! DO NOT place any fields below this !!! */ - u8 buffer[0]; + u8 buffer[]; } __packed; struct hv_ring_buffer_info { @@ -313,7 +313,7 @@ struct vmadd_remove_transfer_page_set { struct gpa_range { u32 byte_count; u32 byte_offset; - u64 pfn_array[0]; + u64 pfn_array[]; }; /* @@ -563,7 +563,7 @@ struct vmbus_channel_gpadl_header { u32 gpadl; u16 range_buflen; u16 rangecount; - struct gpa_range range[0]; + struct gpa_range range[]; } __packed; /* This is the followup packet that contains more PFNs. */ @@ -571,7 +571,7 @@ struct vmbus_channel_gpadl_body { struct vmbus_channel_message_header header; u32 msgnumber; u32 gpadl; - u64 pfn[0]; + u64 pfn[]; } __packed; struct vmbus_channel_gpadl_created { @@ -679,7 +679,7 @@ struct vmbus_channel_msginfo { * The channel message that goes out on the "wire". 
* It will contain at minimum the VMBUS_CHANNEL_MESSAGE_HEADER header */ - unsigned char msg[0]; + unsigned char msg[]; }; struct vmbus_close_msg { -- cgit v1.2.3-71-gd317 From afaa33da08abd10be8978781d7c99a9e67d2bbff Mon Sep 17 00:00:00 2001 From: "Andrea Parri (Microsoft)" Date: Fri, 22 May 2020 19:19:01 +0200 Subject: Drivers: hv: vmbus: Resolve more races involving init_vp_index() init_vp_index() uses the (per-node) hv_numa_map[] masks to record the CPUs allocated for channel interrupts at a given time, and distribute the performance-critical channels across the available CPUs: in part., the mask of "candidate" target CPUs in a given NUMA node, for a newly offered channel, is determined by XOR-ing the node's CPU mask and the node's hv_numa_map. This operation/mechanism assumes that no offline CPUs is set in the hv_numa_map mask, an assumption that does not hold since such mask is currently not updated when a channel is removed or assigned to a different CPU. To address the issues described above, this adds hooks in the channel removal path (hv_process_channel_removal()) and in target_cpu_store() in order to clear, resp. to update, the hv_numa_map[] masks as needed. This also adds a (missed) update of the masks in init_vp_index() (cf., e.g., the memory-allocation failure path in this function). Like in the case of init_vp_index(), such hooks require to determine if the given channel is performance critical. init_vp_index() does this by parsing the channel's offer, it can not rely on the device data structure (device_obj) to retrieve such information because the device data structure has not been allocated/linked with the channel by the time that init_vp_index() executes. A similar situation may hold in hv_is_alloced_cpu() (defined below); the adopted approach is to "cache" the device type of the channel, as computed by parsing the channel's offer, in the channel structure itself. Fixes: 7527810573436f ("Drivers: hv: vmbus: Introduce the CHANNELMSG_MODIFYCHANNEL message type") Signed-off-by: Andrea Parri (Microsoft) Reviewed-by: Michael Kelley Link: https://lore.kernel.org/r/20200522171901.204127-3-parri.andrea@gmail.com Signed-off-by: Wei Liu --- drivers/hv/channel_mgmt.c | 22 ++++++++++++++++------ drivers/hv/hyperv_vmbus.h | 48 +++++++++++++++++++++++++++++++++++++++++++++++ drivers/hv/vmbus_drv.c | 19 +++++++++++++------ include/linux/hyperv.h | 7 +++++++ 4 files changed, 84 insertions(+), 12 deletions(-) (limited to 'include/linux') diff --git a/drivers/hv/channel_mgmt.c b/drivers/hv/channel_mgmt.c index 89eaacf069a8..417a95e5094d 100644 --- a/drivers/hv/channel_mgmt.c +++ b/drivers/hv/channel_mgmt.c @@ -24,9 +24,9 @@ #include "hyperv_vmbus.h" -static void init_vp_index(struct vmbus_channel *channel, u16 dev_type); +static void init_vp_index(struct vmbus_channel *channel); -static const struct vmbus_device vmbus_devs[] = { +const struct vmbus_device vmbus_devs[] = { /* IDE */ { .dev_type = HV_IDE, HV_IDE_GUID, @@ -431,6 +431,13 @@ void hv_process_channel_removal(struct vmbus_channel *channel) spin_unlock_irqrestore(&primary_channel->lock, flags); } + /* + * If this is a "perf" channel, updates the hv_numa_map[] masks so that + * init_vp_index() can (re-)use the CPU. 
+ */ + if (hv_is_perf_channel(channel)) + hv_clear_alloced_cpu(channel->target_cpu); + /* * Upon suspend, an in-use hv_sock channel is marked as "rescinded" and * the relid is invalidated; after hibernation, when the user-space app @@ -497,7 +504,7 @@ static void vmbus_add_channel_work(struct work_struct *work) if (!newchannel->device_obj) goto err_deq_chan; - newchannel->device_obj->device_id = hv_get_dev_type(newchannel); + newchannel->device_obj->device_id = newchannel->device_id; /* * Add the new device to the bus. This will kick off device-driver * binding which eventually invokes the device driver's AddDevice() @@ -580,7 +587,7 @@ static void vmbus_process_offer(struct vmbus_channel *newchannel) */ mutex_lock(&vmbus_connection.channel_mutex); - init_vp_index(newchannel, hv_get_dev_type(newchannel)); + init_vp_index(newchannel); /* Remember the channels that should be cleaned up upon suspend. */ if (is_hvsock_channel(newchannel) || is_sub_channel(newchannel)) @@ -676,9 +683,9 @@ static int next_numa_node_id; * evenly among all the available NUMA nodes. Once the node is assigned, * we will assign the CPU based on a simple round robin scheme. */ -static void init_vp_index(struct vmbus_channel *channel, u16 dev_type) +static void init_vp_index(struct vmbus_channel *channel) { - bool perf_chn = vmbus_devs[dev_type].perf_device; + bool perf_chn = hv_is_perf_channel(channel); cpumask_var_t available_mask; struct cpumask *alloced_mask; u32 target_cpu; @@ -699,6 +706,8 @@ static void init_vp_index(struct vmbus_channel *channel, u16 dev_type) channel->target_cpu = VMBUS_CONNECT_CPU; channel->target_vp = hv_cpu_number_to_vp_number(VMBUS_CONNECT_CPU); + if (perf_chn) + hv_set_alloced_cpu(VMBUS_CONNECT_CPU); return; } @@ -862,6 +871,7 @@ static void vmbus_setup_channel_state(struct vmbus_channel *channel, sizeof(struct vmbus_channel_offer_channel)); channel->monitor_grp = (u8)offer->monitorid / 32; channel->monitor_bit = (u8)offer->monitorid % 32; + channel->device_id = hv_get_dev_type(channel); } /* diff --git a/drivers/hv/hyperv_vmbus.h b/drivers/hv/hyperv_vmbus.h index 5e5cebe5d048..40e2b9f91163 100644 --- a/drivers/hv/hyperv_vmbus.h +++ b/drivers/hv/hyperv_vmbus.h @@ -395,6 +395,54 @@ enum delay { MESSAGE_DELAY = 1, }; +extern const struct vmbus_device vmbus_devs[]; + +static inline bool hv_is_perf_channel(struct vmbus_channel *channel) +{ + return vmbus_devs[channel->device_id].perf_device; +} + +static inline bool hv_is_alloced_cpu(unsigned int cpu) +{ + struct vmbus_channel *channel, *sc; + + lockdep_assert_held(&vmbus_connection.channel_mutex); + /* + * List additions/deletions as well as updates of the target CPUs are + * protected by channel_mutex. 
+ */ + list_for_each_entry(channel, &vmbus_connection.chn_list, listentry) { + if (!hv_is_perf_channel(channel)) + continue; + if (channel->target_cpu == cpu) + return true; + list_for_each_entry(sc, &channel->sc_list, sc_list) { + if (sc->target_cpu == cpu) + return true; + } + } + return false; +} + +static inline void hv_set_alloced_cpu(unsigned int cpu) +{ + cpumask_set_cpu(cpu, &hv_context.hv_numa_map[cpu_to_node(cpu)]); +} + +static inline void hv_clear_alloced_cpu(unsigned int cpu) +{ + if (hv_is_alloced_cpu(cpu)) + return; + cpumask_clear_cpu(cpu, &hv_context.hv_numa_map[cpu_to_node(cpu)]); +} + +static inline void hv_update_alloced_cpus(unsigned int old_cpu, + unsigned int new_cpu) +{ + hv_set_alloced_cpu(new_cpu); + hv_clear_alloced_cpu(old_cpu); +} + #ifdef CONFIG_HYPERV_TESTING int hv_debug_add_dev_dir(struct hv_device *dev); diff --git a/drivers/hv/vmbus_drv.c b/drivers/hv/vmbus_drv.c index c2a4a7c0b99a..47747755d2e1 100644 --- a/drivers/hv/vmbus_drv.c +++ b/drivers/hv/vmbus_drv.c @@ -1687,8 +1687,8 @@ static ssize_t target_cpu_show(struct vmbus_channel *channel, char *buf) static ssize_t target_cpu_store(struct vmbus_channel *channel, const char *buf, size_t count) { + u32 target_cpu, origin_cpu; ssize_t ret = count; - u32 target_cpu; if (vmbus_proto_version < VERSION_WIN10_V4_1) return -EIO; @@ -1741,7 +1741,8 @@ static ssize_t target_cpu_store(struct vmbus_channel *channel, goto cpu_store_unlock; } - if (channel->target_cpu == target_cpu) + origin_cpu = channel->target_cpu; + if (target_cpu == origin_cpu) goto cpu_store_unlock; if (vmbus_send_modifychannel(channel->offermsg.child_relid, @@ -1763,14 +1764,20 @@ static ssize_t target_cpu_store(struct vmbus_channel *channel, * in on a CPU that is different from the channel target_cpu value. */ - if (channel->change_target_cpu_callback) - (*channel->change_target_cpu_callback)(channel, - channel->target_cpu, target_cpu); - channel->target_cpu = target_cpu; channel->target_vp = hv_cpu_number_to_vp_number(target_cpu); channel->numa_node = cpu_to_node(target_cpu); + /* See init_vp_index(). */ + if (hv_is_perf_channel(channel)) + hv_update_alloced_cpus(origin_cpu, target_cpu); + + /* Currently set only for storvsc channels. */ + if (channel->change_target_cpu_callback) { + (*channel->change_target_cpu_callback)(channel, + origin_cpu, target_cpu); + } + cpu_store_unlock: mutex_unlock(&vmbus_connection.channel_mutex); cpus_read_unlock(); diff --git a/include/linux/hyperv.h b/include/linux/hyperv.h index d783847d8cb4..40df3103e890 100644 --- a/include/linux/hyperv.h +++ b/include/linux/hyperv.h @@ -901,6 +901,13 @@ struct vmbus_channel { bool probe_done; + /* + * Cache the device ID here for easy access; this is useful, in + * particular, in situations where the channel's device_obj has + * not been allocated/initialized yet. + */ + u16 device_id; + /* * We must offload the handling of the primary/sub channels * from the single-threaded vmbus_connection.work_queue to -- cgit v1.2.3-71-gd317