1 /*
2  * Kernel-based Virtual Machine - device assignment support
3  *
4  * Copyright (C) 2010 Red Hat, Inc. and/or its affiliates.
5  *
6  * This work is licensed under the terms of the GNU GPL, version 2.  See
7  * the COPYING file in the top-level directory.
8  *
9  */
10 
11 #include <linux/kvm_host.h>
12 #include <linux/kvm.h>
13 #include <linux/uaccess.h>
14 #include <linux/vmalloc.h>
15 #include <linux/errno.h>
16 #include <linux/spinlock.h>
17 #include <linux/pci.h>
18 #include <linux/interrupt.h>
19 #include <linux/slab.h>
20 #include <linux/namei.h>
21 #include <linux/fs.h>
22 #include "irq.h"
23 #include "assigned-dev.h"
24 #include "trace/events/kvm.h"
25 
/*
 * Kernel-side bookkeeping for one host PCI device assigned to a guest.
 * An instance is allocated by kvm_vm_ioctl_assign_device(), linked into
 * kvm->arch.assigned_dev_head and released by kvm_free_assigned_device().
 */
struct kvm_assigned_dev_kernel {
	/*
	 * Guest ack hook; irq_acked is kvm_assigned_dev_ack_irq().  Its gsi
	 * is the guest IRQ for guest INTx and -1 for guest MSI/MSI-X, in
	 * which case the notifier is never registered.
	 */
	struct kvm_irq_ack_notifier ack_notifier;
	/* Link in kvm->arch.assigned_dev_head. */
	struct list_head list;
	/* Userspace-chosen identifier; lookup key for kvm_find_assigned_dev(). */
	int assigned_dev_id;
	/* Host PCI address as supplied by userspace at assignment time. */
	int host_segnr;
	int host_busnr;
	int host_devfn;
	/* Number of MSI-X entries (0 until KVM_ASSIGN_SET_MSIX_NR succeeds). */
	unsigned int entries_nr;
	/* Host IRQ number used for INTx/MSI (copied from dev->irq). */
	int host_irq;
	/* Set while the host INTx source is disabled/masked; see intx_lock. */
	bool host_irq_disabled;
	/* NOTE(review): not referenced anywhere in this file; flags carries
	 * KVM_DEV_ASSIGN_PCI_2_3 instead. */
	bool pci_2_3;
	/* Host MSI-X table, entries_nr elements. */
	struct msix_entry *host_msix_entries;
	/* Guest IRQ (GSI) injected for INTx and MSI. */
	int guest_irq;
	/* Guest MSI-X table, entries_nr elements; .vector holds the guest GSI. */
	struct msix_entry *guest_msix_entries;
	/* KVM_DEV_IRQ_HOST_* / KVM_DEV_IRQ_GUEST_* bits currently assigned. */
	unsigned long irq_requested_type;
	/* Source id from kvm_request_irq_source_id(), or -1 when none is held. */
	int irq_source_id;
	/* KVM_DEV_ASSIGN_* flags (PCI_2_3, MASK_INTX, ...). */
	int flags;
	/* Host device; a reference is held until kvm_free_assigned_device(). */
	struct pci_dev *dev;
	/* VM this device is assigned to. */
	struct kvm *kvm;
	/* Taken around host_irq_disabled updates and host INTx (un)masking. */
	spinlock_t intx_lock;
	/* Taken around KVM_DEV_ASSIGN_MASK_INTX updates and guest INTx injection. */
	spinlock_t intx_mask_lock;
	/* "kvm:<pci name>", passed as the name to request_threaded_irq(). */
	char irq_name[32];
	/* PCI state captured at assignment, reloaded when the device is freed. */
	struct pci_saved_state *pci_saved_state;
};
50 
kvm_find_assigned_dev(struct list_head * head,int assigned_dev_id)51 static struct kvm_assigned_dev_kernel *kvm_find_assigned_dev(struct list_head *head,
52 						      int assigned_dev_id)
53 {
54 	struct list_head *ptr;
55 	struct kvm_assigned_dev_kernel *match;
56 
57 	list_for_each(ptr, head) {
58 		match = list_entry(ptr, struct kvm_assigned_dev_kernel, list);
59 		if (match->assigned_dev_id == assigned_dev_id)
60 			return match;
61 	}
62 	return NULL;
63 }
64 
find_index_from_host_irq(struct kvm_assigned_dev_kernel * assigned_dev,int irq)65 static int find_index_from_host_irq(struct kvm_assigned_dev_kernel
66 				    *assigned_dev, int irq)
67 {
68 	int i, index;
69 	struct msix_entry *host_msix_entries;
70 
71 	host_msix_entries = assigned_dev->host_msix_entries;
72 
73 	index = -1;
74 	for (i = 0; i < assigned_dev->entries_nr; i++)
75 		if (irq == host_msix_entries[i].vector) {
76 			index = i;
77 			break;
78 		}
79 	if (index < 0)
80 		printk(KERN_WARNING "Fail to find correlated MSI-X entry!\n");
81 
82 	return index;
83 }
84 
kvm_assigned_dev_intx(int irq,void * dev_id)85 static irqreturn_t kvm_assigned_dev_intx(int irq, void *dev_id)
86 {
87 	struct kvm_assigned_dev_kernel *assigned_dev = dev_id;
88 	int ret;
89 
90 	spin_lock(&assigned_dev->intx_lock);
91 	if (pci_check_and_mask_intx(assigned_dev->dev)) {
92 		assigned_dev->host_irq_disabled = true;
93 		ret = IRQ_WAKE_THREAD;
94 	} else
95 		ret = IRQ_NONE;
96 	spin_unlock(&assigned_dev->intx_lock);
97 
98 	return ret;
99 }
100 
101 static void
kvm_assigned_dev_raise_guest_irq(struct kvm_assigned_dev_kernel * assigned_dev,int vector)102 kvm_assigned_dev_raise_guest_irq(struct kvm_assigned_dev_kernel *assigned_dev,
103 				 int vector)
104 {
105 	if (unlikely(assigned_dev->irq_requested_type &
106 		     KVM_DEV_IRQ_GUEST_INTX)) {
107 		spin_lock(&assigned_dev->intx_mask_lock);
108 		if (!(assigned_dev->flags & KVM_DEV_ASSIGN_MASK_INTX))
109 			kvm_set_irq(assigned_dev->kvm,
110 				    assigned_dev->irq_source_id, vector, 1,
111 				    false);
112 		spin_unlock(&assigned_dev->intx_mask_lock);
113 	} else
114 		kvm_set_irq(assigned_dev->kvm, assigned_dev->irq_source_id,
115 			    vector, 1, false);
116 }
117 
kvm_assigned_dev_thread_intx(int irq,void * dev_id)118 static irqreturn_t kvm_assigned_dev_thread_intx(int irq, void *dev_id)
119 {
120 	struct kvm_assigned_dev_kernel *assigned_dev = dev_id;
121 
122 	if (!(assigned_dev->flags & KVM_DEV_ASSIGN_PCI_2_3)) {
123 		spin_lock_irq(&assigned_dev->intx_lock);
124 		disable_irq_nosync(irq);
125 		assigned_dev->host_irq_disabled = true;
126 		spin_unlock_irq(&assigned_dev->intx_lock);
127 	}
128 
129 	kvm_assigned_dev_raise_guest_irq(assigned_dev,
130 					 assigned_dev->guest_irq);
131 
132 	return IRQ_HANDLED;
133 }
134 
135 /*
136  * Deliver an IRQ in an atomic context if we can, or return a failure,
137  * user can retry in a process context.
138  * Return value:
139  *  -EWOULDBLOCK - Can't deliver in atomic context: retry in a process context.
140  *  Other values - No need to retry.
141  */
kvm_set_irq_inatomic(struct kvm * kvm,int irq_source_id,u32 irq,int level)142 static int kvm_set_irq_inatomic(struct kvm *kvm, int irq_source_id, u32 irq,
143 				int level)
144 {
145 	struct kvm_kernel_irq_routing_entry entries[KVM_NR_IRQCHIPS];
146 	struct kvm_kernel_irq_routing_entry *e;
147 	int ret = -EINVAL;
148 	int idx;
149 
150 	trace_kvm_set_irq(irq, level, irq_source_id);
151 
152 	/*
153 	 * Injection into either PIC or IOAPIC might need to scan all CPUs,
154 	 * which would need to be retried from thread context;  when same GSI
155 	 * is connected to both PIC and IOAPIC, we'd have to report a
156 	 * partial failure here.
157 	 * Since there's no easy way to do this, we only support injecting MSI
158 	 * which is limited to 1:1 GSI mapping.
159 	 */
160 	idx = srcu_read_lock(&kvm->irq_srcu);
161 	if (kvm_irq_map_gsi(kvm, entries, irq) > 0) {
162 		e = &entries[0];
163 		ret = kvm_arch_set_irq_inatomic(e, kvm, irq_source_id,
164 						irq, level);
165 	}
166 	srcu_read_unlock(&kvm->irq_srcu, idx);
167 	return ret;
168 }
169 
170 
kvm_assigned_dev_msi(int irq,void * dev_id)171 static irqreturn_t kvm_assigned_dev_msi(int irq, void *dev_id)
172 {
173 	struct kvm_assigned_dev_kernel *assigned_dev = dev_id;
174 	int ret = kvm_set_irq_inatomic(assigned_dev->kvm,
175 				       assigned_dev->irq_source_id,
176 				       assigned_dev->guest_irq, 1);
177 	return unlikely(ret == -EWOULDBLOCK) ? IRQ_WAKE_THREAD : IRQ_HANDLED;
178 }
179 
kvm_assigned_dev_thread_msi(int irq,void * dev_id)180 static irqreturn_t kvm_assigned_dev_thread_msi(int irq, void *dev_id)
181 {
182 	struct kvm_assigned_dev_kernel *assigned_dev = dev_id;
183 
184 	kvm_assigned_dev_raise_guest_irq(assigned_dev,
185 					 assigned_dev->guest_irq);
186 
187 	return IRQ_HANDLED;
188 }
189 
kvm_assigned_dev_msix(int irq,void * dev_id)190 static irqreturn_t kvm_assigned_dev_msix(int irq, void *dev_id)
191 {
192 	struct kvm_assigned_dev_kernel *assigned_dev = dev_id;
193 	int index = find_index_from_host_irq(assigned_dev, irq);
194 	u32 vector;
195 	int ret = 0;
196 
197 	if (index >= 0) {
198 		vector = assigned_dev->guest_msix_entries[index].vector;
199 		ret = kvm_set_irq_inatomic(assigned_dev->kvm,
200 					   assigned_dev->irq_source_id,
201 					   vector, 1);
202 	}
203 
204 	return unlikely(ret == -EWOULDBLOCK) ? IRQ_WAKE_THREAD : IRQ_HANDLED;
205 }
206 
kvm_assigned_dev_thread_msix(int irq,void * dev_id)207 static irqreturn_t kvm_assigned_dev_thread_msix(int irq, void *dev_id)
208 {
209 	struct kvm_assigned_dev_kernel *assigned_dev = dev_id;
210 	int index = find_index_from_host_irq(assigned_dev, irq);
211 	u32 vector;
212 
213 	if (index >= 0) {
214 		vector = assigned_dev->guest_msix_entries[index].vector;
215 		kvm_assigned_dev_raise_guest_irq(assigned_dev, vector);
216 	}
217 
218 	return IRQ_HANDLED;
219 }
220 
/*
 * Ack the irq line for an assigned device.
 *
 * Installed as ack_notifier.irq_acked when the device is assigned.  It
 * lowers the guest line and, unless guest INTx delivery is currently
 * masked (KVM_DEV_ASSIGN_MASK_INTX), re-enables the host interrupt source
 * that was disabled when the interrupt was forwarded.
 */
static void kvm_assigned_dev_ack_irq(struct kvm_irq_ack_notifier *kian)
{
	struct kvm_assigned_dev_kernel *dev =
		container_of(kian, struct kvm_assigned_dev_kernel,
			     ack_notifier);

	/* De-assert (level 0) this device's contribution to the guest IRQ. */
	kvm_set_irq(dev->kvm, dev->irq_source_id, dev->guest_irq, 0, false);

	/* Held across the mask check and any re-injection below. */
	spin_lock(&dev->intx_mask_lock);

	if (!(dev->flags & KVM_DEV_ASSIGN_MASK_INTX)) {
		bool reassert = false;

		spin_lock_irq(&dev->intx_lock);
		/*
		 * The guest IRQ may be shared so this ack can come from an
		 * IRQ for another guest device.
		 */
		if (dev->host_irq_disabled) {
			/*
			 * Without PCI 2.3 masking the host line itself was
			 * disabled, so re-enable it.  With PCI 2.3, unmask at
			 * device level; if pci_check_and_unmask_intx() returns
			 * false the source stays marked disabled and the guest
			 * IRQ is raised again below (presumably the device is
			 * still asserting INTx - confirm against the PCI core).
			 */
			if (!(dev->flags & KVM_DEV_ASSIGN_PCI_2_3))
				enable_irq(dev->host_irq);
			else if (!pci_check_and_unmask_intx(dev->dev))
				reassert = true;
			dev->host_irq_disabled = reassert;
		}
		spin_unlock_irq(&dev->intx_lock);

		/* Re-inject outside intx_lock but still under intx_mask_lock. */
		if (reassert)
			kvm_set_irq(dev->kvm, dev->irq_source_id,
				    dev->guest_irq, 1, false);
	}

	spin_unlock(&dev->intx_mask_lock);
}
256 
deassign_guest_irq(struct kvm * kvm,struct kvm_assigned_dev_kernel * assigned_dev)257 static void deassign_guest_irq(struct kvm *kvm,
258 			       struct kvm_assigned_dev_kernel *assigned_dev)
259 {
260 	if (assigned_dev->ack_notifier.gsi != -1)
261 		kvm_unregister_irq_ack_notifier(kvm,
262 						&assigned_dev->ack_notifier);
263 
264 	kvm_set_irq(assigned_dev->kvm, assigned_dev->irq_source_id,
265 		    assigned_dev->guest_irq, 0, false);
266 
267 	if (assigned_dev->irq_source_id != -1)
268 		kvm_free_irq_source_id(kvm, assigned_dev->irq_source_id);
269 	assigned_dev->irq_source_id = -1;
270 	assigned_dev->irq_requested_type &= ~(KVM_DEV_IRQ_GUEST_MASK);
271 }
272 
/*
 * Tear down the host side of an IRQ assignment: quiesce and free the host
 * interrupt(s), disable MSI/MSI-X on the device where applicable and clear
 * the KVM_DEV_IRQ_HOST_* bits.
 *
 * The ioctl paths in this file call this with kvm->lock held.
 * NOTE(review): the original comment here attributed that to
 * cancel_work_sync(), which this function does not call; confirm the
 * locking expectations for the VM-destroy path.
 */
static void deassign_host_irq(struct kvm *kvm,
			      struct kvm_assigned_dev_kernel *assigned_dev)
{
	/*
	 * We disable the IRQ here to prevent further events.
	 *
	 * Note that this may result in a nested disable if the interrupt
	 * type is INTx, but that is fine because we are about to free it.
	 *
	 * If this function runs as part of VM destruction, the kvm state
	 * must still be valid at this point, because we may have to wait
	 * for a currently running IRQ handler.
	 */
	if (assigned_dev->irq_requested_type & KVM_DEV_IRQ_HOST_MSIX) {
		int i;
		/* Quiesce every vector first, then release them. */
		for (i = 0; i < assigned_dev->entries_nr; i++)
			disable_irq(assigned_dev->host_msix_entries[i].vector);

		for (i = 0; i < assigned_dev->entries_nr; i++)
			free_irq(assigned_dev->host_msix_entries[i].vector,
				 assigned_dev);

		/*
		 * entries_nr == 0 marks the tables as gone; the two pointers
		 * are left stale and must not be used until they are set up
		 * again by KVM_ASSIGN_SET_MSIX_NR.
		 */
		assigned_dev->entries_nr = 0;
		kfree(assigned_dev->host_msix_entries);
		kfree(assigned_dev->guest_msix_entries);
		pci_disable_msix(assigned_dev->dev);
	} else {
		/* Deal with MSI and INTx */
		if ((assigned_dev->irq_requested_type &
		     KVM_DEV_IRQ_HOST_INTX) &&
		    (assigned_dev->flags & KVM_DEV_ASSIGN_PCI_2_3)) {
			/*
			 * Possibly shared INTx line: turn INTx off at device
			 * level and wait for running handlers instead of
			 * disabling the whole line.
			 */
			spin_lock_irq(&assigned_dev->intx_lock);
			pci_intx(assigned_dev->dev, false);
			spin_unlock_irq(&assigned_dev->intx_lock);
			synchronize_irq(assigned_dev->host_irq);
		} else
			disable_irq(assigned_dev->host_irq);

		free_irq(assigned_dev->host_irq, assigned_dev);

		if (assigned_dev->irq_requested_type & KVM_DEV_IRQ_HOST_MSI)
			pci_disable_msi(assigned_dev->dev);
	}

	assigned_dev->irq_requested_type &= ~(KVM_DEV_IRQ_HOST_MASK);
}
320 
kvm_deassign_irq(struct kvm * kvm,struct kvm_assigned_dev_kernel * assigned_dev,unsigned long irq_requested_type)321 static int kvm_deassign_irq(struct kvm *kvm,
322 			    struct kvm_assigned_dev_kernel *assigned_dev,
323 			    unsigned long irq_requested_type)
324 {
325 	unsigned long guest_irq_type, host_irq_type;
326 
327 	if (!irqchip_in_kernel(kvm))
328 		return -EINVAL;
329 	/* no irq assignment to deassign */
330 	if (!assigned_dev->irq_requested_type)
331 		return -ENXIO;
332 
333 	host_irq_type = irq_requested_type & KVM_DEV_IRQ_HOST_MASK;
334 	guest_irq_type = irq_requested_type & KVM_DEV_IRQ_GUEST_MASK;
335 
336 	if (host_irq_type)
337 		deassign_host_irq(kvm, assigned_dev);
338 	if (guest_irq_type)
339 		deassign_guest_irq(kvm, assigned_dev);
340 
341 	return 0;
342 }
343 
kvm_free_assigned_irq(struct kvm * kvm,struct kvm_assigned_dev_kernel * assigned_dev)344 static void kvm_free_assigned_irq(struct kvm *kvm,
345 				  struct kvm_assigned_dev_kernel *assigned_dev)
346 {
347 	kvm_deassign_irq(kvm, assigned_dev, assigned_dev->irq_requested_type);
348 }
349 
kvm_free_assigned_device(struct kvm * kvm,struct kvm_assigned_dev_kernel * assigned_dev)350 static void kvm_free_assigned_device(struct kvm *kvm,
351 				     struct kvm_assigned_dev_kernel
352 				     *assigned_dev)
353 {
354 	kvm_free_assigned_irq(kvm, assigned_dev);
355 
356 	pci_reset_function(assigned_dev->dev);
357 	if (pci_load_and_free_saved_state(assigned_dev->dev,
358 					  &assigned_dev->pci_saved_state))
359 		printk(KERN_INFO "%s: Couldn't reload %s saved state\n",
360 		       __func__, dev_name(&assigned_dev->dev->dev));
361 	else
362 		pci_restore_state(assigned_dev->dev);
363 
364 	pci_clear_dev_assigned(assigned_dev->dev);
365 
366 	pci_release_regions(assigned_dev->dev);
367 	pci_disable_device(assigned_dev->dev);
368 	pci_dev_put(assigned_dev->dev);
369 
370 	list_del(&assigned_dev->list);
371 	kfree(assigned_dev);
372 }
373 
kvm_free_all_assigned_devices(struct kvm * kvm)374 void kvm_free_all_assigned_devices(struct kvm *kvm)
375 {
376 	struct list_head *ptr, *ptr2;
377 	struct kvm_assigned_dev_kernel *assigned_dev;
378 
379 	list_for_each_safe(ptr, ptr2, &kvm->arch.assigned_dev_head) {
380 		assigned_dev = list_entry(ptr,
381 					  struct kvm_assigned_dev_kernel,
382 					  list);
383 
384 		kvm_free_assigned_device(kvm, assigned_dev);
385 	}
386 }
387 
assigned_device_enable_host_intx(struct kvm * kvm,struct kvm_assigned_dev_kernel * dev)388 static int assigned_device_enable_host_intx(struct kvm *kvm,
389 					    struct kvm_assigned_dev_kernel *dev)
390 {
391 	irq_handler_t irq_handler;
392 	unsigned long flags;
393 
394 	dev->host_irq = dev->dev->irq;
395 
396 	/*
397 	 * We can only share the IRQ line with other host devices if we are
398 	 * able to disable the IRQ source at device-level - independently of
399 	 * the guest driver. Otherwise host devices may suffer from unbounded
400 	 * IRQ latencies when the guest keeps the line asserted.
401 	 */
402 	if (dev->flags & KVM_DEV_ASSIGN_PCI_2_3) {
403 		irq_handler = kvm_assigned_dev_intx;
404 		flags = IRQF_SHARED;
405 	} else {
406 		irq_handler = NULL;
407 		flags = IRQF_ONESHOT;
408 	}
409 	if (request_threaded_irq(dev->host_irq, irq_handler,
410 				 kvm_assigned_dev_thread_intx, flags,
411 				 dev->irq_name, dev))
412 		return -EIO;
413 
414 	if (dev->flags & KVM_DEV_ASSIGN_PCI_2_3) {
415 		spin_lock_irq(&dev->intx_lock);
416 		pci_intx(dev->dev, true);
417 		spin_unlock_irq(&dev->intx_lock);
418 	}
419 	return 0;
420 }
421 
assigned_device_enable_host_msi(struct kvm * kvm,struct kvm_assigned_dev_kernel * dev)422 static int assigned_device_enable_host_msi(struct kvm *kvm,
423 					   struct kvm_assigned_dev_kernel *dev)
424 {
425 	int r;
426 
427 	if (!dev->dev->msi_enabled) {
428 		r = pci_enable_msi(dev->dev);
429 		if (r)
430 			return r;
431 	}
432 
433 	dev->host_irq = dev->dev->irq;
434 	if (request_threaded_irq(dev->host_irq, kvm_assigned_dev_msi,
435 				 kvm_assigned_dev_thread_msi, 0,
436 				 dev->irq_name, dev)) {
437 		pci_disable_msi(dev->dev);
438 		return -EIO;
439 	}
440 
441 	return 0;
442 }
443 
assigned_device_enable_host_msix(struct kvm * kvm,struct kvm_assigned_dev_kernel * dev)444 static int assigned_device_enable_host_msix(struct kvm *kvm,
445 					    struct kvm_assigned_dev_kernel *dev)
446 {
447 	int i, r = -EINVAL;
448 
449 	/* host_msix_entries and guest_msix_entries should have been
450 	 * initialized */
451 	if (dev->entries_nr == 0)
452 		return r;
453 
454 	r = pci_enable_msix_exact(dev->dev,
455 				  dev->host_msix_entries, dev->entries_nr);
456 	if (r)
457 		return r;
458 
459 	for (i = 0; i < dev->entries_nr; i++) {
460 		r = request_threaded_irq(dev->host_msix_entries[i].vector,
461 					 kvm_assigned_dev_msix,
462 					 kvm_assigned_dev_thread_msix,
463 					 0, dev->irq_name, dev);
464 		if (r)
465 			goto err;
466 	}
467 
468 	return 0;
469 err:
470 	for (i -= 1; i >= 0; i--)
471 		free_irq(dev->host_msix_entries[i].vector, dev);
472 	pci_disable_msix(dev->dev);
473 	return r;
474 }
475 
assigned_device_enable_guest_intx(struct kvm * kvm,struct kvm_assigned_dev_kernel * dev,struct kvm_assigned_irq * irq)476 static int assigned_device_enable_guest_intx(struct kvm *kvm,
477 				struct kvm_assigned_dev_kernel *dev,
478 				struct kvm_assigned_irq *irq)
479 {
480 	dev->guest_irq = irq->guest_irq;
481 	dev->ack_notifier.gsi = irq->guest_irq;
482 	return 0;
483 }
484 
assigned_device_enable_guest_msi(struct kvm * kvm,struct kvm_assigned_dev_kernel * dev,struct kvm_assigned_irq * irq)485 static int assigned_device_enable_guest_msi(struct kvm *kvm,
486 			struct kvm_assigned_dev_kernel *dev,
487 			struct kvm_assigned_irq *irq)
488 {
489 	dev->guest_irq = irq->guest_irq;
490 	dev->ack_notifier.gsi = -1;
491 	return 0;
492 }
493 
assigned_device_enable_guest_msix(struct kvm * kvm,struct kvm_assigned_dev_kernel * dev,struct kvm_assigned_irq * irq)494 static int assigned_device_enable_guest_msix(struct kvm *kvm,
495 			struct kvm_assigned_dev_kernel *dev,
496 			struct kvm_assigned_irq *irq)
497 {
498 	dev->guest_irq = irq->guest_irq;
499 	dev->ack_notifier.gsi = -1;
500 	return 0;
501 }
502 
/*
 * Set up the host side of an IRQ assignment of type @host_irq_type
 * (exactly one of KVM_DEV_IRQ_HOST_INTX/MSI/MSIX).
 *
 * Returns -EEXIST if a host IRQ type is already assigned, -EINVAL for an
 * unknown type, the enable helper's error, or 0 on success, in which case
 * the type is recorded in irq_requested_type.
 */
static int assign_host_irq(struct kvm *kvm,
			   struct kvm_assigned_dev_kernel *dev,
			   __u32 host_irq_type)
{
	int r;

	if (dev->irq_requested_type & KVM_DEV_IRQ_HOST_MASK)
		return -EEXIST;

	/* Name under which the host IRQ(s) are requested. */
	snprintf(dev->irq_name, sizeof(dev->irq_name), "kvm:%s",
		 pci_name(dev->dev));

	if (host_irq_type == KVM_DEV_IRQ_HOST_INTX)
		r = assigned_device_enable_host_intx(kvm, dev);
	else if (host_irq_type == KVM_DEV_IRQ_HOST_MSI)
		r = assigned_device_enable_host_msi(kvm, dev);
	else if (host_irq_type == KVM_DEV_IRQ_HOST_MSIX)
		r = assigned_device_enable_host_msix(kvm, dev);
	else
		r = -EINVAL;

	/* Reset regardless of the outcome, as before. */
	dev->host_irq_disabled = false;

	if (r)
		return r;

	dev->irq_requested_type |= host_irq_type;
	return 0;
}
535 
assign_guest_irq(struct kvm * kvm,struct kvm_assigned_dev_kernel * dev,struct kvm_assigned_irq * irq,unsigned long guest_irq_type)536 static int assign_guest_irq(struct kvm *kvm,
537 			    struct kvm_assigned_dev_kernel *dev,
538 			    struct kvm_assigned_irq *irq,
539 			    unsigned long guest_irq_type)
540 {
541 	int id;
542 	int r = -EEXIST;
543 
544 	if (dev->irq_requested_type & KVM_DEV_IRQ_GUEST_MASK)
545 		return r;
546 
547 	id = kvm_request_irq_source_id(kvm);
548 	if (id < 0)
549 		return id;
550 
551 	dev->irq_source_id = id;
552 
553 	switch (guest_irq_type) {
554 	case KVM_DEV_IRQ_GUEST_INTX:
555 		r = assigned_device_enable_guest_intx(kvm, dev, irq);
556 		break;
557 	case KVM_DEV_IRQ_GUEST_MSI:
558 		r = assigned_device_enable_guest_msi(kvm, dev, irq);
559 		break;
560 	case KVM_DEV_IRQ_GUEST_MSIX:
561 		r = assigned_device_enable_guest_msix(kvm, dev, irq);
562 		break;
563 	default:
564 		r = -EINVAL;
565 	}
566 
567 	if (!r) {
568 		dev->irq_requested_type |= guest_irq_type;
569 		if (dev->ack_notifier.gsi != -1)
570 			kvm_register_irq_ack_notifier(kvm, &dev->ack_notifier);
571 	} else {
572 		kvm_free_irq_source_id(kvm, dev->irq_source_id);
573 		dev->irq_source_id = -1;
574 	}
575 
576 	return r;
577 }
578 
579 /* TODO Deal with KVM_DEV_IRQ_ASSIGNED_MASK_MSIX */
kvm_vm_ioctl_assign_irq(struct kvm * kvm,struct kvm_assigned_irq * assigned_irq)580 static int kvm_vm_ioctl_assign_irq(struct kvm *kvm,
581 				   struct kvm_assigned_irq *assigned_irq)
582 {
583 	int r = -EINVAL;
584 	struct kvm_assigned_dev_kernel *match;
585 	unsigned long host_irq_type, guest_irq_type;
586 
587 	if (!irqchip_in_kernel(kvm))
588 		return r;
589 
590 	mutex_lock(&kvm->lock);
591 	r = -ENODEV;
592 	match = kvm_find_assigned_dev(&kvm->arch.assigned_dev_head,
593 				      assigned_irq->assigned_dev_id);
594 	if (!match)
595 		goto out;
596 
597 	host_irq_type = (assigned_irq->flags & KVM_DEV_IRQ_HOST_MASK);
598 	guest_irq_type = (assigned_irq->flags & KVM_DEV_IRQ_GUEST_MASK);
599 
600 	r = -EINVAL;
601 	/* can only assign one type at a time */
602 	if (hweight_long(host_irq_type) > 1)
603 		goto out;
604 	if (hweight_long(guest_irq_type) > 1)
605 		goto out;
606 	if (host_irq_type == 0 && guest_irq_type == 0)
607 		goto out;
608 
609 	r = 0;
610 	if (host_irq_type)
611 		r = assign_host_irq(kvm, match, host_irq_type);
612 	if (r)
613 		goto out;
614 
615 	if (guest_irq_type)
616 		r = assign_guest_irq(kvm, match, assigned_irq, guest_irq_type);
617 out:
618 	mutex_unlock(&kvm->lock);
619 	return r;
620 }
621 
kvm_vm_ioctl_deassign_dev_irq(struct kvm * kvm,struct kvm_assigned_irq * assigned_irq)622 static int kvm_vm_ioctl_deassign_dev_irq(struct kvm *kvm,
623 					 struct kvm_assigned_irq
624 					 *assigned_irq)
625 {
626 	int r = -ENODEV;
627 	struct kvm_assigned_dev_kernel *match;
628 	unsigned long irq_type;
629 
630 	mutex_lock(&kvm->lock);
631 
632 	match = kvm_find_assigned_dev(&kvm->arch.assigned_dev_head,
633 				      assigned_irq->assigned_dev_id);
634 	if (!match)
635 		goto out;
636 
637 	irq_type = assigned_irq->flags & (KVM_DEV_IRQ_HOST_MASK |
638 					  KVM_DEV_IRQ_GUEST_MASK);
639 	r = kvm_deassign_irq(kvm, match, irq_type);
640 out:
641 	mutex_unlock(&kvm->lock);
642 	return r;
643 }
644 
645 /*
646  * We want to test whether the caller has been granted permissions to
647  * use this device.  To be able to configure and control the device,
648  * the user needs access to PCI configuration space and BAR resources.
649  * These are accessed through PCI sysfs.  PCI config space is often
650  * passed to the process calling this ioctl via file descriptor, so we
651  * can't rely on access to that file.  We can check for permissions
652  * on each of the BAR resource files, which is a pretty clear
653  * indicator that the user has been granted access to the device.
654  */
probe_sysfs_permissions(struct pci_dev * dev)655 static int probe_sysfs_permissions(struct pci_dev *dev)
656 {
657 #ifdef CONFIG_SYSFS
658 	int i;
659 	bool bar_found = false;
660 
661 	for (i = PCI_STD_RESOURCES; i <= PCI_STD_RESOURCE_END; i++) {
662 		char *kpath, *syspath;
663 		struct path path;
664 		struct inode *inode;
665 		int r;
666 
667 		if (!pci_resource_len(dev, i))
668 			continue;
669 
670 		kpath = kobject_get_path(&dev->dev.kobj, GFP_KERNEL);
671 		if (!kpath)
672 			return -ENOMEM;
673 
674 		/* Per sysfs-rules, sysfs is always at /sys */
675 		syspath = kasprintf(GFP_KERNEL, "/sys%s/resource%d", kpath, i);
676 		kfree(kpath);
677 		if (!syspath)
678 			return -ENOMEM;
679 
680 		r = kern_path(syspath, LOOKUP_FOLLOW, &path);
681 		kfree(syspath);
682 		if (r)
683 			return r;
684 
685 		inode = d_backing_inode(path.dentry);
686 
687 		r = inode_permission(inode, MAY_READ | MAY_WRITE | MAY_ACCESS);
688 		path_put(&path);
689 		if (r)
690 			return r;
691 
692 		bar_found = true;
693 	}
694 
695 	/* If no resources, probably something special */
696 	if (!bar_found)
697 		return -EPERM;
698 
699 	return 0;
700 #else
701 	return -EINVAL; /* No way to control the device without sysfs */
702 #endif
703 }
704 
/*
 * KVM_ASSIGN_PCI_DEVICE: assign a host PCI device to the VM.
 *
 * Requires KVM_DEV_ASSIGN_ENABLE_IOMMU in assigned_dev->flags.  The device
 * is looked up by segment/bus/devfn, must be a normal (non-bridge) header
 * type and must pass probe_sysfs_permissions().  It is then enabled, its
 * regions are claimed, it is reset and its PCI state saved, and finally it
 * is linked into kvm->arch.assigned_dev_head and attached via
 * kvm_assign_device().
 *
 * Returns 0 on success; -EINVAL (IOMMU flag missing or device not found),
 * -EEXIST (id already in use), -ENOMEM, -EPERM (bridge), -EBUSY (enable
 * failed) or the error of the failing helper otherwise.  On failure every
 * step taken so far is undone through the out_* labels.
 *
 * Note: KVM_DEV_ASSIGN_PCI_2_3 is cleared in the caller's @assigned_dev
 * when the device does not support INTx masking.
 */
static int kvm_vm_ioctl_assign_device(struct kvm *kvm,
				      struct kvm_assigned_pci_dev *assigned_dev)
{
	int r = 0, idx;
	struct kvm_assigned_dev_kernel *match;
	struct pci_dev *dev;

	if (!(assigned_dev->flags & KVM_DEV_ASSIGN_ENABLE_IOMMU))
		return -EINVAL;

	mutex_lock(&kvm->lock);
	idx = srcu_read_lock(&kvm->srcu);

	match = kvm_find_assigned_dev(&kvm->arch.assigned_dev_head,
				      assigned_dev->assigned_dev_id);
	if (match) {
		/* device already assigned */
		r = -EEXIST;
		goto out;
	}

	/* Zeroed allocation: all fields start at 0/NULL/false. */
	match = kzalloc(sizeof(struct kvm_assigned_dev_kernel), GFP_KERNEL);
	if (match == NULL) {
		printk(KERN_INFO "%s: Couldn't allocate memory\n",
		       __func__);
		r = -ENOMEM;
		goto out;
	}
	/* The reference obtained here is dropped with pci_dev_put(). */
	dev = pci_get_domain_bus_and_slot(assigned_dev->segnr,
				   assigned_dev->busnr,
				   assigned_dev->devfn);
	if (!dev) {
		printk(KERN_INFO "%s: host device not found\n", __func__);
		r = -EINVAL;
		goto out_free;
	}

	/* Don't allow bridges to be assigned */
	if (dev->hdr_type != PCI_HEADER_TYPE_NORMAL) {
		r = -EPERM;
		goto out_put;
	}

	/* The caller must have access to the device's sysfs BAR files. */
	r = probe_sysfs_permissions(dev);
	if (r)
		goto out_put;

	if (pci_enable_device(dev)) {
		printk(KERN_INFO "%s: Could not enable PCI device\n", __func__);
		r = -EBUSY;
		goto out_put;
	}
	r = pci_request_regions(dev, "kvm_assigned_device");
	if (r) {
		printk(KERN_INFO "%s: Could not get access to device regions\n",
		       __func__);
		goto out_disable;
	}

	/*
	 * Reset the function and capture its state so it can be reloaded
	 * when the device is released.  Failing to store the state is only
	 * logged, not treated as an error.
	 */
	pci_reset_function(dev);
	pci_save_state(dev);
	match->pci_saved_state = pci_store_saved_state(dev);
	if (!match->pci_saved_state)
		printk(KERN_DEBUG "%s: Couldn't store %s saved state\n",
		       __func__, dev_name(&dev->dev));

	/* Drop the PCI 2.3 request if the device cannot mask INTx. */
	if (!pci_intx_mask_supported(dev))
		assigned_dev->flags &= ~KVM_DEV_ASSIGN_PCI_2_3;

	match->assigned_dev_id = assigned_dev->assigned_dev_id;
	match->host_segnr = assigned_dev->segnr;
	match->host_busnr = assigned_dev->busnr;
	match->host_devfn = assigned_dev->devfn;
	match->flags = assigned_dev->flags;
	match->dev = dev;
	spin_lock_init(&match->intx_lock);
	spin_lock_init(&match->intx_mask_lock);
	/* -1 means "no IRQ source id allocated yet". */
	match->irq_source_id = -1;
	match->kvm = kvm;
	match->ack_notifier.irq_acked = kvm_assigned_dev_ack_irq;

	list_add(&match->list, &kvm->arch.assigned_dev_head);

	/* Set up the IOMMU domain on first use, then attach the device. */
	if (!kvm->arch.iommu_domain) {
		r = kvm_iommu_map_guest(kvm);
		if (r)
			goto out_list_del;
	}
	r = kvm_assign_device(kvm, match->dev);
	if (r)
		goto out_list_del;

out:
	/* Success path, and early failures that hold no resources yet. */
	srcu_read_unlock(&kvm->srcu, idx);
	mutex_unlock(&kvm->lock);
	return r;
	/* Error unwinding: each label undoes one more acquisition step. */
out_list_del:
	if (pci_load_and_free_saved_state(dev, &match->pci_saved_state))
		printk(KERN_INFO "%s: Couldn't reload %s saved state\n",
		       __func__, dev_name(&dev->dev));
	list_del(&match->list);
	pci_release_regions(dev);
out_disable:
	pci_disable_device(dev);
out_put:
	pci_dev_put(dev);
out_free:
	kfree(match);
	srcu_read_unlock(&kvm->srcu, idx);
	mutex_unlock(&kvm->lock);
	return r;
}
817 
kvm_vm_ioctl_deassign_device(struct kvm * kvm,struct kvm_assigned_pci_dev * assigned_dev)818 static int kvm_vm_ioctl_deassign_device(struct kvm *kvm,
819 		struct kvm_assigned_pci_dev *assigned_dev)
820 {
821 	int r = 0;
822 	struct kvm_assigned_dev_kernel *match;
823 
824 	mutex_lock(&kvm->lock);
825 
826 	match = kvm_find_assigned_dev(&kvm->arch.assigned_dev_head,
827 				      assigned_dev->assigned_dev_id);
828 	if (!match) {
829 		printk(KERN_INFO "%s: device hasn't been assigned before, "
830 		  "so cannot be deassigned\n", __func__);
831 		r = -EINVAL;
832 		goto out;
833 	}
834 
835 	kvm_deassign_device(kvm, match->dev);
836 
837 	kvm_free_assigned_device(kvm, match);
838 
839 out:
840 	mutex_unlock(&kvm->lock);
841 	return r;
842 }
843 
844 
kvm_vm_ioctl_set_msix_nr(struct kvm * kvm,struct kvm_assigned_msix_nr * entry_nr)845 static int kvm_vm_ioctl_set_msix_nr(struct kvm *kvm,
846 				    struct kvm_assigned_msix_nr *entry_nr)
847 {
848 	int r = 0;
849 	struct kvm_assigned_dev_kernel *adev;
850 
851 	mutex_lock(&kvm->lock);
852 
853 	adev = kvm_find_assigned_dev(&kvm->arch.assigned_dev_head,
854 				      entry_nr->assigned_dev_id);
855 	if (!adev) {
856 		r = -EINVAL;
857 		goto msix_nr_out;
858 	}
859 
860 	if (adev->entries_nr == 0) {
861 		adev->entries_nr = entry_nr->entry_nr;
862 		if (adev->entries_nr == 0 ||
863 		    adev->entries_nr > KVM_MAX_MSIX_PER_DEV) {
864 			r = -EINVAL;
865 			goto msix_nr_out;
866 		}
867 
868 		adev->host_msix_entries = kzalloc(sizeof(struct msix_entry) *
869 						entry_nr->entry_nr,
870 						GFP_KERNEL);
871 		if (!adev->host_msix_entries) {
872 			r = -ENOMEM;
873 			goto msix_nr_out;
874 		}
875 		adev->guest_msix_entries =
876 			kzalloc(sizeof(struct msix_entry) * entry_nr->entry_nr,
877 				GFP_KERNEL);
878 		if (!adev->guest_msix_entries) {
879 			kfree(adev->host_msix_entries);
880 			r = -ENOMEM;
881 			goto msix_nr_out;
882 		}
883 	} else /* Not allowed set MSI-X number twice */
884 		r = -EINVAL;
885 msix_nr_out:
886 	mutex_unlock(&kvm->lock);
887 	return r;
888 }
889 
kvm_vm_ioctl_set_msix_entry(struct kvm * kvm,struct kvm_assigned_msix_entry * entry)890 static int kvm_vm_ioctl_set_msix_entry(struct kvm *kvm,
891 				       struct kvm_assigned_msix_entry *entry)
892 {
893 	int r = 0, i;
894 	struct kvm_assigned_dev_kernel *adev;
895 
896 	mutex_lock(&kvm->lock);
897 
898 	adev = kvm_find_assigned_dev(&kvm->arch.assigned_dev_head,
899 				      entry->assigned_dev_id);
900 
901 	if (!adev) {
902 		r = -EINVAL;
903 		goto msix_entry_out;
904 	}
905 
906 	for (i = 0; i < adev->entries_nr; i++)
907 		if (adev->guest_msix_entries[i].vector == 0 ||
908 		    adev->guest_msix_entries[i].entry == entry->entry) {
909 			adev->guest_msix_entries[i].entry = entry->entry;
910 			adev->guest_msix_entries[i].vector = entry->gsi;
911 			adev->host_msix_entries[i].entry = entry->entry;
912 			break;
913 		}
914 	if (i == adev->entries_nr) {
915 		r = -ENOSPC;
916 		goto msix_entry_out;
917 	}
918 
919 msix_entry_out:
920 	mutex_unlock(&kvm->lock);
921 
922 	return r;
923 }
924 
kvm_vm_ioctl_set_pci_irq_mask(struct kvm * kvm,struct kvm_assigned_pci_dev * assigned_dev)925 static int kvm_vm_ioctl_set_pci_irq_mask(struct kvm *kvm,
926 		struct kvm_assigned_pci_dev *assigned_dev)
927 {
928 	int r = 0;
929 	struct kvm_assigned_dev_kernel *match;
930 
931 	mutex_lock(&kvm->lock);
932 
933 	match = kvm_find_assigned_dev(&kvm->arch.assigned_dev_head,
934 				      assigned_dev->assigned_dev_id);
935 	if (!match) {
936 		r = -ENODEV;
937 		goto out;
938 	}
939 
940 	spin_lock(&match->intx_mask_lock);
941 
942 	match->flags &= ~KVM_DEV_ASSIGN_MASK_INTX;
943 	match->flags |= assigned_dev->flags & KVM_DEV_ASSIGN_MASK_INTX;
944 
945 	if (match->irq_requested_type & KVM_DEV_IRQ_GUEST_INTX) {
946 		if (assigned_dev->flags & KVM_DEV_ASSIGN_MASK_INTX) {
947 			kvm_set_irq(match->kvm, match->irq_source_id,
948 				    match->guest_irq, 0, false);
949 			/*
950 			 * Masking at hardware-level is performed on demand,
951 			 * i.e. when an IRQ actually arrives at the host.
952 			 */
953 		} else if (!(assigned_dev->flags & KVM_DEV_ASSIGN_PCI_2_3)) {
954 			/*
955 			 * Unmask the IRQ line if required. Unmasking at
956 			 * device level will be performed by user space.
957 			 */
958 			spin_lock_irq(&match->intx_lock);
959 			if (match->host_irq_disabled) {
960 				enable_irq(match->host_irq);
961 				match->host_irq_disabled = false;
962 			}
963 			spin_unlock_irq(&match->intx_lock);
964 		}
965 	}
966 
967 	spin_unlock(&match->intx_mask_lock);
968 
969 out:
970 	mutex_unlock(&kvm->lock);
971 	return r;
972 }
973 
kvm_vm_ioctl_assigned_device(struct kvm * kvm,unsigned ioctl,unsigned long arg)974 long kvm_vm_ioctl_assigned_device(struct kvm *kvm, unsigned ioctl,
975 				  unsigned long arg)
976 {
977 	void __user *argp = (void __user *)arg;
978 	int r;
979 
980 	switch (ioctl) {
981 	case KVM_ASSIGN_PCI_DEVICE: {
982 		struct kvm_assigned_pci_dev assigned_dev;
983 
984 		r = -EFAULT;
985 		if (copy_from_user(&assigned_dev, argp, sizeof assigned_dev))
986 			goto out;
987 		r = kvm_vm_ioctl_assign_device(kvm, &assigned_dev);
988 		if (r)
989 			goto out;
990 		break;
991 	}
992 	case KVM_ASSIGN_IRQ: {
993 		r = -EOPNOTSUPP;
994 		break;
995 	}
996 	case KVM_ASSIGN_DEV_IRQ: {
997 		struct kvm_assigned_irq assigned_irq;
998 
999 		r = -EFAULT;
1000 		if (copy_from_user(&assigned_irq, argp, sizeof assigned_irq))
1001 			goto out;
1002 		r = kvm_vm_ioctl_assign_irq(kvm, &assigned_irq);
1003 		if (r)
1004 			goto out;
1005 		break;
1006 	}
1007 	case KVM_DEASSIGN_DEV_IRQ: {
1008 		struct kvm_assigned_irq assigned_irq;
1009 
1010 		r = -EFAULT;
1011 		if (copy_from_user(&assigned_irq, argp, sizeof assigned_irq))
1012 			goto out;
1013 		r = kvm_vm_ioctl_deassign_dev_irq(kvm, &assigned_irq);
1014 		if (r)
1015 			goto out;
1016 		break;
1017 	}
1018 	case KVM_DEASSIGN_PCI_DEVICE: {
1019 		struct kvm_assigned_pci_dev assigned_dev;
1020 
1021 		r = -EFAULT;
1022 		if (copy_from_user(&assigned_dev, argp, sizeof assigned_dev))
1023 			goto out;
1024 		r = kvm_vm_ioctl_deassign_device(kvm, &assigned_dev);
1025 		if (r)
1026 			goto out;
1027 		break;
1028 	}
1029 	case KVM_ASSIGN_SET_MSIX_NR: {
1030 		struct kvm_assigned_msix_nr entry_nr;
1031 		r = -EFAULT;
1032 		if (copy_from_user(&entry_nr, argp, sizeof entry_nr))
1033 			goto out;
1034 		r = kvm_vm_ioctl_set_msix_nr(kvm, &entry_nr);
1035 		if (r)
1036 			goto out;
1037 		break;
1038 	}
1039 	case KVM_ASSIGN_SET_MSIX_ENTRY: {
1040 		struct kvm_assigned_msix_entry entry;
1041 		r = -EFAULT;
1042 		if (copy_from_user(&entry, argp, sizeof entry))
1043 			goto out;
1044 		r = kvm_vm_ioctl_set_msix_entry(kvm, &entry);
1045 		if (r)
1046 			goto out;
1047 		break;
1048 	}
1049 	case KVM_ASSIGN_SET_INTX_MASK: {
1050 		struct kvm_assigned_pci_dev assigned_dev;
1051 
1052 		r = -EFAULT;
1053 		if (copy_from_user(&assigned_dev, argp, sizeof assigned_dev))
1054 			goto out;
1055 		r = kvm_vm_ioctl_set_pci_irq_mask(kvm, &assigned_dev);
1056 		break;
1057 	}
1058 	default:
1059 		r = -ENOTTY;
1060 		break;
1061 	}
1062 out:
1063 	return r;
1064 }
1065