cachepc-linux

Fork of AMDESE/linux with modifications for the CachePC side-channel attack
git clone https://git.sinitax.com/sinitax/cachepc-linux

eventfd.c (23406B)


      1// SPDX-License-Identifier: GPL-2.0-only
      2/*
      3 * kvm eventfd support - use eventfd objects to signal various KVM events
      4 *
      5 * Copyright 2009 Novell.  All Rights Reserved.
      6 * Copyright 2010 Red Hat, Inc. and/or its affiliates.
      7 *
      8 * Author:
      9 *	Gregory Haskins <ghaskins@novell.com>
     10 */
     11
     12#include <linux/kvm_host.h>
     13#include <linux/kvm.h>
     14#include <linux/kvm_irqfd.h>
     15#include <linux/workqueue.h>
     16#include <linux/syscalls.h>
     17#include <linux/wait.h>
     18#include <linux/poll.h>
     19#include <linux/file.h>
     20#include <linux/list.h>
     21#include <linux/eventfd.h>
     22#include <linux/kernel.h>
     23#include <linux/srcu.h>
     24#include <linux/slab.h>
     25#include <linux/seqlock.h>
     26#include <linux/irqbypass.h>
     27#include <trace/events/kvm.h>
     28
     29#include <kvm/iodev.h>
     30
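/*
 * A minimal userspace sketch (not part of this file) of the eventfd(2)
 * counter semantics that the irqfd and ioeventfd code below builds on:
 * each write adds to a 64-bit counter, and a read returns the accumulated
 * count and resets it.
 */
#include <stdint.h>
#include <stdio.h>
#include <unistd.h>
#include <sys/eventfd.h>

int main(void)
{
	uint64_t val = 1, out;
	int efd = eventfd(0, EFD_CLOEXEC);	/* counter starts at 0 */

	if (efd < 0)
		return 1;
	write(efd, &val, sizeof(val));		/* signal: counter += 1 */
	read(efd, &out, sizeof(out));		/* returns 1 and resets the counter */
	printf("eventfd counter was %llu\n", (unsigned long long)out);
	close(efd);
	return 0;
}
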
     31#ifdef CONFIG_HAVE_KVM_IRQFD
     32
     33static struct workqueue_struct *irqfd_cleanup_wq;
     34
     35bool __attribute__((weak))
     36kvm_arch_irqfd_allowed(struct kvm *kvm, struct kvm_irqfd *args)
     37{
     38	return true;
     39}
     40
     41static void
     42irqfd_inject(struct work_struct *work)
     43{
     44	struct kvm_kernel_irqfd *irqfd =
     45		container_of(work, struct kvm_kernel_irqfd, inject);
     46	struct kvm *kvm = irqfd->kvm;
     47
     48	if (!irqfd->resampler) {
     49		kvm_set_irq(kvm, KVM_USERSPACE_IRQ_SOURCE_ID, irqfd->gsi, 1,
     50				false);
     51		kvm_set_irq(kvm, KVM_USERSPACE_IRQ_SOURCE_ID, irqfd->gsi, 0,
     52				false);
     53	} else
     54		kvm_set_irq(kvm, KVM_IRQFD_RESAMPLE_IRQ_SOURCE_ID,
     55			    irqfd->gsi, 1, false);
     56}
     57
     58/*
     59 * Since resampler irqfds share an IRQ source ID, we de-assert once
     60 * then notify all of the resampler irqfds using this GSI.  We can't
     61 * do multiple de-asserts or we risk racing with incoming re-asserts.
     62 */
     63static void
     64irqfd_resampler_ack(struct kvm_irq_ack_notifier *kian)
     65{
     66	struct kvm_kernel_irqfd_resampler *resampler;
     67	struct kvm *kvm;
     68	struct kvm_kernel_irqfd *irqfd;
     69	int idx;
     70
     71	resampler = container_of(kian,
     72			struct kvm_kernel_irqfd_resampler, notifier);
     73	kvm = resampler->kvm;
     74
     75	kvm_set_irq(kvm, KVM_IRQFD_RESAMPLE_IRQ_SOURCE_ID,
     76		    resampler->notifier.gsi, 0, false);
     77
     78	idx = srcu_read_lock(&kvm->irq_srcu);
     79
     80	list_for_each_entry_srcu(irqfd, &resampler->list, resampler_link,
     81	    srcu_read_lock_held(&kvm->irq_srcu))
     82		eventfd_signal(irqfd->resamplefd, 1);
     83
     84	srcu_read_unlock(&kvm->irq_srcu, idx);
     85}
     86
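/*
 * A hedged userspace sketch of the resample handshake described above,
 * assuming irqfd and resamplefd were already bound to a level-triggered GSI
 * via the KVM_IRQFD ioctl with KVM_IRQFD_FLAG_RESAMPLE; device_still_pending()
 * is a placeholder for the VMM's own device model. Error handling omitted.
 */
#include <stdint.h>
#include <unistd.h>

static void level_irq_loop(int irqfd, int resamplefd,
			   int (*device_still_pending)(void))
{
	uint64_t one = 1, acks;

	write(irqfd, &one, sizeof(one));	/* assert the line */
	for (;;) {
		/*
		 * irqfd_resampler_ack() above signals resamplefd once the
		 * guest EOIs and KVM has de-asserted the line on our behalf.
		 */
		read(resamplefd, &acks, sizeof(acks));
		if (!device_still_pending())
			break;
		write(irqfd, &one, sizeof(one));	/* still pending: re-assert */
	}
}
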
     87static void
     88irqfd_resampler_shutdown(struct kvm_kernel_irqfd *irqfd)
     89{
     90	struct kvm_kernel_irqfd_resampler *resampler = irqfd->resampler;
     91	struct kvm *kvm = resampler->kvm;
     92
     93	mutex_lock(&kvm->irqfds.resampler_lock);
     94
     95	list_del_rcu(&irqfd->resampler_link);
     96	synchronize_srcu(&kvm->irq_srcu);
     97
     98	if (list_empty(&resampler->list)) {
     99		list_del(&resampler->link);
    100		kvm_unregister_irq_ack_notifier(kvm, &resampler->notifier);
    101		kvm_set_irq(kvm, KVM_IRQFD_RESAMPLE_IRQ_SOURCE_ID,
    102			    resampler->notifier.gsi, 0, false);
    103		kfree(resampler);
    104	}
    105
    106	mutex_unlock(&kvm->irqfds.resampler_lock);
    107}
    108
    109/*
    110 * Race-free decouple logic (ordering is critical)
    111 */
    112static void
    113irqfd_shutdown(struct work_struct *work)
    114{
    115	struct kvm_kernel_irqfd *irqfd =
    116		container_of(work, struct kvm_kernel_irqfd, shutdown);
    117	struct kvm *kvm = irqfd->kvm;
    118	u64 cnt;
    119
    120	/* Make sure irqfd has been initialized in assign path. */
    121	synchronize_srcu(&kvm->irq_srcu);
    122
    123	/*
    124	 * Synchronize with the wait-queue and unhook ourselves to prevent
    125	 * further events.
    126	 */
    127	eventfd_ctx_remove_wait_queue(irqfd->eventfd, &irqfd->wait, &cnt);
    128
    129	/*
    130	 * We know no new events will be scheduled at this point, so block
    131	 * until all previously outstanding events have completed
    132	 */
    133	flush_work(&irqfd->inject);
    134
    135	if (irqfd->resampler) {
    136		irqfd_resampler_shutdown(irqfd);
    137		eventfd_ctx_put(irqfd->resamplefd);
    138	}
    139
    140	/*
    141	 * It is now safe to release the object's resources
    142	 */
    143#ifdef CONFIG_HAVE_KVM_IRQ_BYPASS
    144	irq_bypass_unregister_consumer(&irqfd->consumer);
    145#endif
    146	eventfd_ctx_put(irqfd->eventfd);
    147	kfree(irqfd);
    148}
    149
    150
    151/* assumes kvm->irqfds.lock is held */
    152static bool
    153irqfd_is_active(struct kvm_kernel_irqfd *irqfd)
    154{
    155	return !list_empty(&irqfd->list);
    156}
    157
    158/*
    159 * Mark the irqfd as inactive and schedule it for removal
    160 *
    161 * assumes kvm->irqfds.lock is held
    162 */
    163static void
    164irqfd_deactivate(struct kvm_kernel_irqfd *irqfd)
    165{
    166	BUG_ON(!irqfd_is_active(irqfd));
    167
    168	list_del_init(&irqfd->list);
    169
    170	queue_work(irqfd_cleanup_wq, &irqfd->shutdown);
    171}
    172
    173int __attribute__((weak)) kvm_arch_set_irq_inatomic(
    174				struct kvm_kernel_irq_routing_entry *irq,
    175				struct kvm *kvm, int irq_source_id,
    176				int level,
    177				bool line_status)
    178{
    179	return -EWOULDBLOCK;
    180}
    181
    182/*
    183 * Called with wqh->lock held and interrupts disabled
    184 */
    185static int
    186irqfd_wakeup(wait_queue_entry_t *wait, unsigned mode, int sync, void *key)
    187{
    188	struct kvm_kernel_irqfd *irqfd =
    189		container_of(wait, struct kvm_kernel_irqfd, wait);
    190	__poll_t flags = key_to_poll(key);
    191	struct kvm_kernel_irq_routing_entry irq;
    192	struct kvm *kvm = irqfd->kvm;
    193	unsigned seq;
    194	int idx;
    195	int ret = 0;
    196
    197	if (flags & EPOLLIN) {
    198		u64 cnt;
    199		eventfd_ctx_do_read(irqfd->eventfd, &cnt);
    200
    201		idx = srcu_read_lock(&kvm->irq_srcu);
    202		do {
    203			seq = read_seqcount_begin(&irqfd->irq_entry_sc);
    204			irq = irqfd->irq_entry;
    205		} while (read_seqcount_retry(&irqfd->irq_entry_sc, seq));
    206		/* An event has been signaled, inject an interrupt */
    207		if (kvm_arch_set_irq_inatomic(&irq, kvm,
    208					      KVM_USERSPACE_IRQ_SOURCE_ID, 1,
    209					      false) == -EWOULDBLOCK)
    210			schedule_work(&irqfd->inject);
    211		srcu_read_unlock(&kvm->irq_srcu, idx);
    212		ret = 1;
    213	}
    214
    215	if (flags & EPOLLHUP) {
    216		/* The eventfd is closing, detach from KVM */
    217		unsigned long iflags;
    218
    219		spin_lock_irqsave(&kvm->irqfds.lock, iflags);
    220
    221		/*
    222		 * We must check if someone deactivated the irqfd before
    223		 * we could acquire the irqfds.lock since the item is
    224		 * deactivated from the KVM side before it is unhooked from
    225		 * the wait-queue.  If it is already deactivated, we can
    226		 * simply return knowing the other side will cleanup for us.
    227		 * We cannot race against the irqfd going away since the
    228		 * other side is required to acquire wqh->lock, which we hold
    229		 */
    230		if (irqfd_is_active(irqfd))
    231			irqfd_deactivate(irqfd);
    232
    233		spin_unlock_irqrestore(&kvm->irqfds.lock, iflags);
    234	}
    235
    236	return ret;
    237}
    238
    239static void
    240irqfd_ptable_queue_proc(struct file *file, wait_queue_head_t *wqh,
    241			poll_table *pt)
    242{
    243	struct kvm_kernel_irqfd *irqfd =
    244		container_of(pt, struct kvm_kernel_irqfd, pt);
    245	add_wait_queue_priority(wqh, &irqfd->wait);
    246}
    247
    248/* Must be called under irqfds.lock */
    249static void irqfd_update(struct kvm *kvm, struct kvm_kernel_irqfd *irqfd)
    250{
    251	struct kvm_kernel_irq_routing_entry *e;
    252	struct kvm_kernel_irq_routing_entry entries[KVM_NR_IRQCHIPS];
    253	int n_entries;
    254
    255	n_entries = kvm_irq_map_gsi(kvm, entries, irqfd->gsi);
    256
    257	write_seqcount_begin(&irqfd->irq_entry_sc);
    258
    259	e = entries;
    260	if (n_entries == 1)
    261		irqfd->irq_entry = *e;
    262	else
    263		irqfd->irq_entry.type = 0;
    264
    265	write_seqcount_end(&irqfd->irq_entry_sc);
    266}
    267
    268#ifdef CONFIG_HAVE_KVM_IRQ_BYPASS
    269void __attribute__((weak)) kvm_arch_irq_bypass_stop(
    270				struct irq_bypass_consumer *cons)
    271{
    272}
    273
    274void __attribute__((weak)) kvm_arch_irq_bypass_start(
    275				struct irq_bypass_consumer *cons)
    276{
    277}
    278
    279int  __attribute__((weak)) kvm_arch_update_irqfd_routing(
    280				struct kvm *kvm, unsigned int host_irq,
    281				uint32_t guest_irq, bool set)
    282{
    283	return 0;
    284}
    285
    286bool __attribute__((weak)) kvm_arch_irqfd_route_changed(
    287				struct kvm_kernel_irq_routing_entry *old,
    288				struct kvm_kernel_irq_routing_entry *new)
    289{
    290	return true;
    291}
    292#endif
    293
    294static int
    295kvm_irqfd_assign(struct kvm *kvm, struct kvm_irqfd *args)
    296{
    297	struct kvm_kernel_irqfd *irqfd, *tmp;
    298	struct fd f;
    299	struct eventfd_ctx *eventfd = NULL, *resamplefd = NULL;
    300	int ret;
    301	__poll_t events;
    302	int idx;
    303
    304	if (!kvm_arch_intc_initialized(kvm))
    305		return -EAGAIN;
    306
    307	if (!kvm_arch_irqfd_allowed(kvm, args))
    308		return -EINVAL;
    309
    310	irqfd = kzalloc(sizeof(*irqfd), GFP_KERNEL_ACCOUNT);
    311	if (!irqfd)
    312		return -ENOMEM;
    313
    314	irqfd->kvm = kvm;
    315	irqfd->gsi = args->gsi;
    316	INIT_LIST_HEAD(&irqfd->list);
    317	INIT_WORK(&irqfd->inject, irqfd_inject);
    318	INIT_WORK(&irqfd->shutdown, irqfd_shutdown);
    319	seqcount_spinlock_init(&irqfd->irq_entry_sc, &kvm->irqfds.lock);
    320
    321	f = fdget(args->fd);
    322	if (!f.file) {
    323		ret = -EBADF;
    324		goto out;
    325	}
    326
    327	eventfd = eventfd_ctx_fileget(f.file);
    328	if (IS_ERR(eventfd)) {
    329		ret = PTR_ERR(eventfd);
    330		goto fail;
    331	}
    332
    333	irqfd->eventfd = eventfd;
    334
    335	if (args->flags & KVM_IRQFD_FLAG_RESAMPLE) {
    336		struct kvm_kernel_irqfd_resampler *resampler;
    337
    338		resamplefd = eventfd_ctx_fdget(args->resamplefd);
    339		if (IS_ERR(resamplefd)) {
    340			ret = PTR_ERR(resamplefd);
    341			goto fail;
    342		}
    343
    344		irqfd->resamplefd = resamplefd;
    345		INIT_LIST_HEAD(&irqfd->resampler_link);
    346
    347		mutex_lock(&kvm->irqfds.resampler_lock);
    348
    349		list_for_each_entry(resampler,
    350				    &kvm->irqfds.resampler_list, link) {
    351			if (resampler->notifier.gsi == irqfd->gsi) {
    352				irqfd->resampler = resampler;
    353				break;
    354			}
    355		}
    356
    357		if (!irqfd->resampler) {
    358			resampler = kzalloc(sizeof(*resampler),
    359					    GFP_KERNEL_ACCOUNT);
    360			if (!resampler) {
    361				ret = -ENOMEM;
    362				mutex_unlock(&kvm->irqfds.resampler_lock);
    363				goto fail;
    364			}
    365
    366			resampler->kvm = kvm;
    367			INIT_LIST_HEAD(&resampler->list);
    368			resampler->notifier.gsi = irqfd->gsi;
    369			resampler->notifier.irq_acked = irqfd_resampler_ack;
    370			INIT_LIST_HEAD(&resampler->link);
    371
    372			list_add(&resampler->link, &kvm->irqfds.resampler_list);
    373			kvm_register_irq_ack_notifier(kvm,
    374						      &resampler->notifier);
    375			irqfd->resampler = resampler;
    376		}
    377
    378		list_add_rcu(&irqfd->resampler_link, &irqfd->resampler->list);
    379		synchronize_srcu(&kvm->irq_srcu);
    380
    381		mutex_unlock(&kvm->irqfds.resampler_lock);
    382	}
    383
    384	/*
    385	 * Install our own custom wake-up handling so we are notified via
    386	 * a callback whenever someone signals the underlying eventfd
    387	 */
    388	init_waitqueue_func_entry(&irqfd->wait, irqfd_wakeup);
    389	init_poll_funcptr(&irqfd->pt, irqfd_ptable_queue_proc);
    390
    391	spin_lock_irq(&kvm->irqfds.lock);
    392
    393	ret = 0;
    394	list_for_each_entry(tmp, &kvm->irqfds.items, list) {
    395		if (irqfd->eventfd != tmp->eventfd)
    396			continue;
    397		/* This fd is used for another irq already. */
    398		ret = -EBUSY;
    399		spin_unlock_irq(&kvm->irqfds.lock);
    400		goto fail;
    401	}
    402
    403	idx = srcu_read_lock(&kvm->irq_srcu);
    404	irqfd_update(kvm, irqfd);
    405
    406	list_add_tail(&irqfd->list, &kvm->irqfds.items);
    407
    408	spin_unlock_irq(&kvm->irqfds.lock);
    409
    410	/*
    411	 * Check if there was an event already pending on the eventfd
    412	 * before we registered, and trigger it as if we didn't miss it.
    413	 */
    414	events = vfs_poll(f.file, &irqfd->pt);
    415
    416	if (events & EPOLLIN)
    417		schedule_work(&irqfd->inject);
    418
    419#ifdef CONFIG_HAVE_KVM_IRQ_BYPASS
    420	if (kvm_arch_has_irq_bypass()) {
    421		irqfd->consumer.token = (void *)irqfd->eventfd;
    422		irqfd->consumer.add_producer = kvm_arch_irq_bypass_add_producer;
    423		irqfd->consumer.del_producer = kvm_arch_irq_bypass_del_producer;
    424		irqfd->consumer.stop = kvm_arch_irq_bypass_stop;
    425		irqfd->consumer.start = kvm_arch_irq_bypass_start;
    426		ret = irq_bypass_register_consumer(&irqfd->consumer);
    427		if (ret)
    428			pr_info("irq bypass consumer (token %p) registration fails: %d\n",
    429				irqfd->consumer.token, ret);
    430	}
    431#endif
    432
    433	srcu_read_unlock(&kvm->irq_srcu, idx);
    434
    435	/*
    436	 * do not drop the file until the irqfd is fully initialized; otherwise
    437	 * we might race against the EPOLLHUP
    438	 */
    439	fdput(f);
    440	return 0;
    441
    442fail:
    443	if (irqfd->resampler)
    444		irqfd_resampler_shutdown(irqfd);
    445
    446	if (resamplefd && !IS_ERR(resamplefd))
    447		eventfd_ctx_put(resamplefd);
    448
    449	if (eventfd && !IS_ERR(eventfd))
    450		eventfd_ctx_put(eventfd);
    451
    452	fdput(f);
    453
    454out:
    455	kfree(irqfd);
    456	return ret;
    457}
    458
    459bool kvm_irq_has_notifier(struct kvm *kvm, unsigned irqchip, unsigned pin)
    460{
    461	struct kvm_irq_ack_notifier *kian;
    462	int gsi, idx;
    463
    464	idx = srcu_read_lock(&kvm->irq_srcu);
    465	gsi = kvm_irq_map_chip_pin(kvm, irqchip, pin);
    466	if (gsi != -1)
    467		hlist_for_each_entry_srcu(kian, &kvm->irq_ack_notifier_list,
    468					  link, srcu_read_lock_held(&kvm->irq_srcu))
    469			if (kian->gsi == gsi) {
    470				srcu_read_unlock(&kvm->irq_srcu, idx);
    471				return true;
    472			}
    473
    474	srcu_read_unlock(&kvm->irq_srcu, idx);
    475
    476	return false;
    477}
    478EXPORT_SYMBOL_GPL(kvm_irq_has_notifier);
    479
    480void kvm_notify_acked_gsi(struct kvm *kvm, int gsi)
    481{
    482	struct kvm_irq_ack_notifier *kian;
    483
    484	hlist_for_each_entry_srcu(kian, &kvm->irq_ack_notifier_list,
    485				  link, srcu_read_lock_held(&kvm->irq_srcu))
    486		if (kian->gsi == gsi)
    487			kian->irq_acked(kian);
    488}
    489
    490void kvm_notify_acked_irq(struct kvm *kvm, unsigned irqchip, unsigned pin)
    491{
    492	int gsi, idx;
    493
    494	trace_kvm_ack_irq(irqchip, pin);
    495
    496	idx = srcu_read_lock(&kvm->irq_srcu);
    497	gsi = kvm_irq_map_chip_pin(kvm, irqchip, pin);
    498	if (gsi != -1)
    499		kvm_notify_acked_gsi(kvm, gsi);
    500	srcu_read_unlock(&kvm->irq_srcu, idx);
    501}
    502
    503void kvm_register_irq_ack_notifier(struct kvm *kvm,
    504				   struct kvm_irq_ack_notifier *kian)
    505{
    506	mutex_lock(&kvm->irq_lock);
    507	hlist_add_head_rcu(&kian->link, &kvm->irq_ack_notifier_list);
    508	mutex_unlock(&kvm->irq_lock);
    509	kvm_arch_post_irq_ack_notifier_list_update(kvm);
    510}
    511
    512void kvm_unregister_irq_ack_notifier(struct kvm *kvm,
    513				    struct kvm_irq_ack_notifier *kian)
    514{
    515	mutex_lock(&kvm->irq_lock);
    516	hlist_del_init_rcu(&kian->link);
    517	mutex_unlock(&kvm->irq_lock);
    518	synchronize_srcu(&kvm->irq_srcu);
    519	kvm_arch_post_irq_ack_notifier_list_update(kvm);
    520}
    521#endif
    522
    523void
    524kvm_eventfd_init(struct kvm *kvm)
    525{
    526#ifdef CONFIG_HAVE_KVM_IRQFD
    527	spin_lock_init(&kvm->irqfds.lock);
    528	INIT_LIST_HEAD(&kvm->irqfds.items);
    529	INIT_LIST_HEAD(&kvm->irqfds.resampler_list);
    530	mutex_init(&kvm->irqfds.resampler_lock);
    531#endif
    532	INIT_LIST_HEAD(&kvm->ioeventfds);
    533}
    534
    535#ifdef CONFIG_HAVE_KVM_IRQFD
    536/*
    537 * Shut down any irqfds that match fd+gsi
    538 */
    539static int
    540kvm_irqfd_deassign(struct kvm *kvm, struct kvm_irqfd *args)
    541{
    542	struct kvm_kernel_irqfd *irqfd, *tmp;
    543	struct eventfd_ctx *eventfd;
    544
    545	eventfd = eventfd_ctx_fdget(args->fd);
    546	if (IS_ERR(eventfd))
    547		return PTR_ERR(eventfd);
    548
    549	spin_lock_irq(&kvm->irqfds.lock);
    550
    551	list_for_each_entry_safe(irqfd, tmp, &kvm->irqfds.items, list) {
    552		if (irqfd->eventfd == eventfd && irqfd->gsi == args->gsi) {
    553			/*
    554			 * This clearing of irq_entry.type is needed for when
    555			 * another thread calls kvm_irq_routing_update before
    556			 * we flush workqueue below (we synchronize with
    557			 * kvm_irq_routing_update using irqfds.lock).
    558			 */
    559			write_seqcount_begin(&irqfd->irq_entry_sc);
    560			irqfd->irq_entry.type = 0;
    561			write_seqcount_end(&irqfd->irq_entry_sc);
    562			irqfd_deactivate(irqfd);
    563		}
    564	}
    565
    566	spin_unlock_irq(&kvm->irqfds.lock);
    567	eventfd_ctx_put(eventfd);
    568
    569	/*
    570	 * Block until we know all outstanding shutdown jobs have completed
    571	 * so that we guarantee there will not be any more interrupts on this
    572	 * gsi once this deassign function returns.
    573	 */
    574	flush_workqueue(irqfd_cleanup_wq);
    575
    576	return 0;
    577}
    578
    579int
    580kvm_irqfd(struct kvm *kvm, struct kvm_irqfd *args)
    581{
    582	if (args->flags & ~(KVM_IRQFD_FLAG_DEASSIGN | KVM_IRQFD_FLAG_RESAMPLE))
    583		return -EINVAL;
    584
    585	if (args->flags & KVM_IRQFD_FLAG_DEASSIGN)
    586		return kvm_irqfd_deassign(kvm, args);
    587
    588	return kvm_irqfd_assign(kvm, args);
    589}
    590
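/*
 * kvm_irqfd() above is reached from the KVM_IRQFD vm ioctl. A minimal
 * userspace sketch of the assign/deassign round trip, assuming vmfd came
 * from KVM_CREATE_VM with an in-kernel irqchip already created (which is
 * what kvm_arch_intc_initialized() checks for); GSI 5 is an arbitrary
 * example value. The resample variant additionally sets
 * KVM_IRQFD_FLAG_RESAMPLE and .resamplefd.
 */
#include <stdint.h>
#include <unistd.h>
#include <sys/eventfd.h>
#include <sys/ioctl.h>
#include <linux/kvm.h>

/* Route signals on an eventfd to guest GSI 5 as edge-triggered pulses
 * (irqfd_inject() raises and immediately lowers the line). */
static int example_irqfd(int vmfd)
{
	uint64_t one = 1;
	int efd = eventfd(0, EFD_CLOEXEC);
	struct kvm_irqfd req = { .fd = efd, .gsi = 5 };

	if (efd < 0 || ioctl(vmfd, KVM_IRQFD, &req) < 0)
		return -1;

	write(efd, &one, sizeof(one));		/* each write injects one interrupt */

	req.flags = KVM_IRQFD_FLAG_DEASSIGN;	/* undo: matched on fd + gsi */
	ioctl(vmfd, KVM_IRQFD, &req);
	close(efd);
	return 0;
}
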
    591/*
    592 * This function is called as the kvm VM fd is being released. Shut down all
    593 * irqfds that still remain open.
    594 */
    595void
    596kvm_irqfd_release(struct kvm *kvm)
    597{
    598	struct kvm_kernel_irqfd *irqfd, *tmp;
    599
    600	spin_lock_irq(&kvm->irqfds.lock);
    601
    602	list_for_each_entry_safe(irqfd, tmp, &kvm->irqfds.items, list)
    603		irqfd_deactivate(irqfd);
    604
    605	spin_unlock_irq(&kvm->irqfds.lock);
    606
    607	/*
    608	 * Block until we know all outstanding shutdown jobs have completed
    609	 * since we do not take a kvm* reference.
    610	 */
    611	flush_workqueue(irqfd_cleanup_wq);
    612
    613}
    614
    615/*
    616 * Take note of a change in irq routing.
    617 * Caller must invoke synchronize_srcu(&kvm->irq_srcu) afterwards.
    618 */
    619void kvm_irq_routing_update(struct kvm *kvm)
    620{
    621	struct kvm_kernel_irqfd *irqfd;
    622
    623	spin_lock_irq(&kvm->irqfds.lock);
    624
    625	list_for_each_entry(irqfd, &kvm->irqfds.items, list) {
    626#ifdef CONFIG_HAVE_KVM_IRQ_BYPASS
    627		/* Under irqfds.lock, so can read irq_entry safely */
    628		struct kvm_kernel_irq_routing_entry old = irqfd->irq_entry;
    629#endif
    630
    631		irqfd_update(kvm, irqfd);
    632
    633#ifdef CONFIG_HAVE_KVM_IRQ_BYPASS
    634		if (irqfd->producer &&
    635		    kvm_arch_irqfd_route_changed(&old, &irqfd->irq_entry)) {
    636			int ret = kvm_arch_update_irqfd_routing(
    637					irqfd->kvm, irqfd->producer->irq,
    638					irqfd->gsi, 1);
    639			WARN_ON(ret);
    640		}
    641#endif
    642	}
    643
    644	spin_unlock_irq(&kvm->irqfds.lock);
    645}
    646
    647/*
    648 * create a host-wide workqueue for issuing deferred shutdown requests
    649 * aggregated from all vm* instances. We need our own isolated
    650 * queue to ease flushing work items when a VM exits.
    651 */
    652int kvm_irqfd_init(void)
    653{
    654	irqfd_cleanup_wq = alloc_workqueue("kvm-irqfd-cleanup", 0, 0);
    655	if (!irqfd_cleanup_wq)
    656		return -ENOMEM;
    657
    658	return 0;
    659}
    660
    661void kvm_irqfd_exit(void)
    662{
    663	destroy_workqueue(irqfd_cleanup_wq);
    664}
    665#endif
    666
    667/*
    668 * --------------------------------------------------------------------
    669 * ioeventfd: translate a PIO/MMIO memory write to an eventfd signal.
    670 *
    671 * userspace can register a PIO/MMIO address with an eventfd for receiving
    672 * notification when the memory has been touched.
    673 * --------------------------------------------------------------------
    674 */
    675
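/*
 * A hedged userspace sketch of the registration described above, assuming
 * vmfd is a KVM VM file descriptor; the port and address below are arbitrary
 * example values, and error handling is omitted. The second, length-0
 * wildcard ioeventfd is the case that kvm_assign_ioeventfd() additionally
 * places on KVM_FAST_MMIO_BUS.
 */
#include <sys/eventfd.h>
#include <sys/ioctl.h>
#include <linux/kvm.h>

/* Get an eventfd signal when the guest writes the value 1 (2-byte access)
 * to PIO port 0x510, and on any write to an MMIO doorbell page. */
static int example_ioeventfds(int vmfd)
{
	int pio_fd = eventfd(0, EFD_CLOEXEC);
	int mmio_fd = eventfd(0, EFD_CLOEXEC);

	struct kvm_ioeventfd pio = {
		.addr      = 0x510,
		.len       = 2,
		.fd        = pio_fd,
		.flags     = KVM_IOEVENTFD_FLAG_PIO | KVM_IOEVENTFD_FLAG_DATAMATCH,
		.datamatch = 1,		/* only the value 1 triggers the eventfd */
	};
	struct kvm_ioeventfd mmio = {
		.addr = 0xfe003000,
		.len  = 0,		/* length ignored: any write is a hit */
		.fd   = mmio_fd,
	};

	if (ioctl(vmfd, KVM_IOEVENTFD, &pio) < 0 ||
	    ioctl(vmfd, KVM_IOEVENTFD, &mmio) < 0)
		return -1;

	/* poll()/read() on pio_fd and mmio_fd now reports guest writes
	 * without an exit to userspace on the hot path. */
	return 0;
}
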
    676struct _ioeventfd {
    677	struct list_head     list;
    678	u64                  addr;
    679	int                  length;
    680	struct eventfd_ctx  *eventfd;
    681	u64                  datamatch;
    682	struct kvm_io_device dev;
    683	u8                   bus_idx;
    684	bool                 wildcard;
    685};
    686
    687static inline struct _ioeventfd *
    688to_ioeventfd(struct kvm_io_device *dev)
    689{
    690	return container_of(dev, struct _ioeventfd, dev);
    691}
    692
    693static void
    694ioeventfd_release(struct _ioeventfd *p)
    695{
    696	eventfd_ctx_put(p->eventfd);
    697	list_del(&p->list);
    698	kfree(p);
    699}
    700
    701static bool
    702ioeventfd_in_range(struct _ioeventfd *p, gpa_t addr, int len, const void *val)
    703{
    704	u64 _val;
    705
    706	if (addr != p->addr)
    707		/* address must be precise for a hit */
    708		return false;
    709
    710	if (!p->length)
    711		/* length = 0 means only look at the address, so always a hit */
    712		return true;
    713
    714	if (len != p->length)
    715		/* address-range must be precise for a hit */
    716		return false;
    717
    718	if (p->wildcard)
    719		/* all else equal, wildcard is always a hit */
    720		return true;
    721
    722	/* otherwise, we have to actually compare the data */
    723
    724	BUG_ON(!IS_ALIGNED((unsigned long)val, len));
    725
    726	switch (len) {
    727	case 1:
    728		_val = *(u8 *)val;
    729		break;
    730	case 2:
    731		_val = *(u16 *)val;
    732		break;
    733	case 4:
    734		_val = *(u32 *)val;
    735		break;
    736	case 8:
    737		_val = *(u64 *)val;
    738		break;
    739	default:
    740		return false;
    741	}
    742
    743	return _val == p->datamatch;
    744}
    745
    746/* MMIO/PIO writes trigger an event if the addr/val match */
    747static int
    748ioeventfd_write(struct kvm_vcpu *vcpu, struct kvm_io_device *this, gpa_t addr,
    749		int len, const void *val)
    750{
    751	struct _ioeventfd *p = to_ioeventfd(this);
    752
    753	if (!ioeventfd_in_range(p, addr, len, val))
    754		return -EOPNOTSUPP;
    755
    756	eventfd_signal(p->eventfd, 1);
    757	return 0;
    758}
    759
    760/*
    761 * This function is called as KVM is completely shutting down.  We do not
    762 * need to worry about locking; just nuke anything we have as quickly as possible.
    763 */
    764static void
    765ioeventfd_destructor(struct kvm_io_device *this)
    766{
    767	struct _ioeventfd *p = to_ioeventfd(this);
    768
    769	ioeventfd_release(p);
    770}
    771
    772static const struct kvm_io_device_ops ioeventfd_ops = {
    773	.write      = ioeventfd_write,
    774	.destructor = ioeventfd_destructor,
    775};
    776
    777/* assumes kvm->slots_lock held */
    778static bool
    779ioeventfd_check_collision(struct kvm *kvm, struct _ioeventfd *p)
    780{
    781	struct _ioeventfd *_p;
    782
    783	list_for_each_entry(_p, &kvm->ioeventfds, list)
    784		if (_p->bus_idx == p->bus_idx &&
    785		    _p->addr == p->addr &&
    786		    (!_p->length || !p->length ||
    787		     (_p->length == p->length &&
    788		      (_p->wildcard || p->wildcard ||
    789		       _p->datamatch == p->datamatch))))
    790			return true;
    791
    792	return false;
    793}
    794
    795static enum kvm_bus ioeventfd_bus_from_flags(__u32 flags)
    796{
    797	if (flags & KVM_IOEVENTFD_FLAG_PIO)
    798		return KVM_PIO_BUS;
    799	if (flags & KVM_IOEVENTFD_FLAG_VIRTIO_CCW_NOTIFY)
    800		return KVM_VIRTIO_CCW_NOTIFY_BUS;
    801	return KVM_MMIO_BUS;
    802}
    803
    804static int kvm_assign_ioeventfd_idx(struct kvm *kvm,
    805				enum kvm_bus bus_idx,
    806				struct kvm_ioeventfd *args)
    807{
    808
    809	struct eventfd_ctx *eventfd;
    810	struct _ioeventfd *p;
    811	int ret;
    812
    813	eventfd = eventfd_ctx_fdget(args->fd);
    814	if (IS_ERR(eventfd))
    815		return PTR_ERR(eventfd);
    816
    817	p = kzalloc(sizeof(*p), GFP_KERNEL_ACCOUNT);
    818	if (!p) {
    819		ret = -ENOMEM;
    820		goto fail;
    821	}
    822
    823	INIT_LIST_HEAD(&p->list);
    824	p->addr    = args->addr;
    825	p->bus_idx = bus_idx;
    826	p->length  = args->len;
    827	p->eventfd = eventfd;
    828
    829	/* The datamatch feature is optional, otherwise this is a wildcard */
    830	if (args->flags & KVM_IOEVENTFD_FLAG_DATAMATCH)
    831		p->datamatch = args->datamatch;
    832	else
    833		p->wildcard = true;
    834
    835	mutex_lock(&kvm->slots_lock);
    836
    837	/* Verify that there isn't a match already */
    838	if (ioeventfd_check_collision(kvm, p)) {
    839		ret = -EEXIST;
    840		goto unlock_fail;
    841	}
    842
    843	kvm_iodevice_init(&p->dev, &ioeventfd_ops);
    844
    845	ret = kvm_io_bus_register_dev(kvm, bus_idx, p->addr, p->length,
    846				      &p->dev);
    847	if (ret < 0)
    848		goto unlock_fail;
    849
    850	kvm_get_bus(kvm, bus_idx)->ioeventfd_count++;
    851	list_add_tail(&p->list, &kvm->ioeventfds);
    852
    853	mutex_unlock(&kvm->slots_lock);
    854
    855	return 0;
    856
    857unlock_fail:
    858	mutex_unlock(&kvm->slots_lock);
    859
    860fail:
    861	kfree(p);
    862	eventfd_ctx_put(eventfd);
    863
    864	return ret;
    865}
    866
    867static int
    868kvm_deassign_ioeventfd_idx(struct kvm *kvm, enum kvm_bus bus_idx,
    869			   struct kvm_ioeventfd *args)
    870{
    871	struct _ioeventfd        *p, *tmp;
    872	struct eventfd_ctx       *eventfd;
    873	struct kvm_io_bus	 *bus;
    874	int                       ret = -ENOENT;
    875	bool                      wildcard;
    876
    877	eventfd = eventfd_ctx_fdget(args->fd);
    878	if (IS_ERR(eventfd))
    879		return PTR_ERR(eventfd);
    880
    881	wildcard = !(args->flags & KVM_IOEVENTFD_FLAG_DATAMATCH);
    882
    883	mutex_lock(&kvm->slots_lock);
    884
    885	list_for_each_entry_safe(p, tmp, &kvm->ioeventfds, list) {
    886
    887		if (p->bus_idx != bus_idx ||
    888		    p->eventfd != eventfd  ||
    889		    p->addr != args->addr  ||
    890		    p->length != args->len ||
    891		    p->wildcard != wildcard)
    892			continue;
    893
    894		if (!p->wildcard && p->datamatch != args->datamatch)
    895			continue;
    896
    897		kvm_io_bus_unregister_dev(kvm, bus_idx, &p->dev);
    898		bus = kvm_get_bus(kvm, bus_idx);
    899		if (bus)
    900			bus->ioeventfd_count--;
    901		ioeventfd_release(p);
    902		ret = 0;
    903		break;
    904	}
    905
    906	mutex_unlock(&kvm->slots_lock);
    907
    908	eventfd_ctx_put(eventfd);
    909
    910	return ret;
    911}
    912
    913static int kvm_deassign_ioeventfd(struct kvm *kvm, struct kvm_ioeventfd *args)
    914{
    915	enum kvm_bus bus_idx = ioeventfd_bus_from_flags(args->flags);
    916	int ret = kvm_deassign_ioeventfd_idx(kvm, bus_idx, args);
    917
    918	if (!args->len && bus_idx == KVM_MMIO_BUS)
    919		kvm_deassign_ioeventfd_idx(kvm, KVM_FAST_MMIO_BUS, args);
    920
    921	return ret;
    922}
    923
    924static int
    925kvm_assign_ioeventfd(struct kvm *kvm, struct kvm_ioeventfd *args)
    926{
    927	enum kvm_bus              bus_idx;
    928	int ret;
    929
    930	bus_idx = ioeventfd_bus_from_flags(args->flags);
    931	/* must be natural-word sized, or 0 to ignore length */
    932	switch (args->len) {
    933	case 0:
    934	case 1:
    935	case 2:
    936	case 4:
    937	case 8:
    938		break;
    939	default:
    940		return -EINVAL;
    941	}
    942
    943	/* check for range overflow */
    944	if (args->addr + args->len < args->addr)
    945		return -EINVAL;
    946
    947	/* check for extra flags that we don't understand */
    948	if (args->flags & ~KVM_IOEVENTFD_VALID_FLAG_MASK)
    949		return -EINVAL;
    950
    951	/* ioeventfd with no length can't be combined with DATAMATCH */
    952	if (!args->len && (args->flags & KVM_IOEVENTFD_FLAG_DATAMATCH))
    953		return -EINVAL;
    954
    955	ret = kvm_assign_ioeventfd_idx(kvm, bus_idx, args);
    956	if (ret)
    957		goto fail;
    958
    959	/* When length is ignored, MMIO is also put on a separate bus, for
    960	 * faster lookups.
    961	 */
    962	if (!args->len && bus_idx == KVM_MMIO_BUS) {
    963		ret = kvm_assign_ioeventfd_idx(kvm, KVM_FAST_MMIO_BUS, args);
    964		if (ret < 0)
    965			goto fast_fail;
    966	}
    967
    968	return 0;
    969
    970fast_fail:
    971	kvm_deassign_ioeventfd_idx(kvm, bus_idx, args);
    972fail:
    973	return ret;
    974}
    975
    976int
    977kvm_ioeventfd(struct kvm *kvm, struct kvm_ioeventfd *args)
    978{
    979	if (args->flags & KVM_IOEVENTFD_FLAG_DEASSIGN)
    980		return kvm_deassign_ioeventfd(kvm, args);
    981
    982	return kvm_assign_ioeventfd(kvm, args);
    983}