/*
 * Copyright (C) 2012 Red Hat, Inc.  All rights reserved.
 *     Author: Alex Williamson <alex.williamson@redhat.com>
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License version 2 as
 * published by the Free Software Foundation.
 *
 * Derived from original vfio:
 * Copyright 2010 Cisco Systems, Inc.  All rights reserved.
 * Author: Tom Lyon, pugs@cisco.com
 */

#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt

#include <linux/device.h>
#include <linux/eventfd.h>
#include <linux/file.h>
#include <linux/interrupt.h>
#include <linux/iommu.h>
#include <linux/module.h>
#include <linux/mutex.h>
#include <linux/notifier.h>
#include <linux/pci.h>
#include <linux/pm_runtime.h>
#include <linux/slab.h>
#include <linux/types.h>
#include <linux/uaccess.h>
#include <linux/vfio.h>
#include <linux/vgaarb.h>

#include "vfio_pci_private.h"

#define DRIVER_VERSION  "0.2"
#define DRIVER_AUTHOR   "Alex Williamson <alex.williamson@redhat.com>"
#define DRIVER_DESC     "VFIO PCI - User Level meta-driver"

static char ids[1024] __initdata;
module_param_string(ids, ids, sizeof(ids), 0);
MODULE_PARM_DESC(ids, "Initial PCI IDs to add to the vfio driver, format is \"vendor:device[:subvendor[:subdevice[:class[:class_mask]]]]\" and multiple comma separated entries can be specified");

static bool nointxmask;
module_param_named(nointxmask, nointxmask, bool, S_IRUGO | S_IWUSR);
MODULE_PARM_DESC(nointxmask,
		 "Disable support for PCI 2.3 style INTx masking.  If this resolves problems for specific devices, report lspci -vvvxxx to linux-pci@vger.kernel.org so the device can be fixed automatically via the broken_intx_masking flag.");

#ifdef CONFIG_VFIO_PCI_VGA
static bool disable_vga;
module_param(disable_vga, bool, S_IRUGO);
MODULE_PARM_DESC(disable_vga, "Disable VGA resource access through vfio-pci");
#endif

static bool disable_idle_d3;
module_param(disable_idle_d3, bool, S_IRUGO | S_IWUSR);
MODULE_PARM_DESC(disable_idle_d3,
		 "Disable using the PCI D3 low power state for idle, unused devices");
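
/*
 * Typical usage (illustrative, not enforced by this file): devices can be
 * handed to vfio-pci either at load time via the "ids" parameter above,
 * e.g.:
 *
 *	modprobe vfio-pci ids=1234:5678
 *
 * or at runtime through the PCI driver core's dynamic ID facility (this
 * driver registers no static ID table; see the pci_add_dynid() usage
 * below), e.g.:
 *
 *	echo 1234 5678 > /sys/bus/pci/drivers/vfio-pci/new_id
 *
 * The 1234:5678 vendor:device pair is a placeholder, not a real device.
 */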
static DEFINE_MUTEX(driver_lock);

static inline bool vfio_vga_disabled(void)
{
#ifdef CONFIG_VFIO_PCI_VGA
	return disable_vga;
#else
	return true;
#endif
}

/*
 * Our VGA arbiter participation is limited since we don't know anything
 * about the device itself.  However, if the device is the only VGA device
 * downstream of a bridge and VFIO VGA support is disabled, then we can
 * safely return legacy VGA IO and memory as not decoded since the user
 * has no way to get to it and routing can be disabled externally at the
 * bridge.
 */
static unsigned int vfio_pci_set_vga_decode(void *opaque, bool single_vga)
{
	struct vfio_pci_device *vdev = opaque;
	struct pci_dev *tmp = NULL, *pdev = vdev->pdev;
	unsigned char max_busnr;
	unsigned int decodes;

	if (single_vga || !vfio_vga_disabled() || pci_is_root_bus(pdev->bus))
		return VGA_RSRC_NORMAL_IO | VGA_RSRC_NORMAL_MEM |
		       VGA_RSRC_LEGACY_IO | VGA_RSRC_LEGACY_MEM;

	max_busnr = pci_bus_max_busnr(pdev->bus);
	decodes = VGA_RSRC_NORMAL_IO | VGA_RSRC_NORMAL_MEM;

	while ((tmp = pci_get_class(PCI_CLASS_DISPLAY_VGA << 8, tmp)) != NULL) {
		if (tmp == pdev ||
		    pci_domain_nr(tmp->bus) != pci_domain_nr(pdev->bus) ||
		    pci_is_root_bus(tmp->bus))
			continue;

		if (tmp->bus->number >= pdev->bus->number &&
		    tmp->bus->number <= max_busnr) {
			pci_dev_put(tmp);
			decodes |= VGA_RSRC_LEGACY_IO | VGA_RSRC_LEGACY_MEM;
			break;
		}
	}

	return decodes;
}

static inline bool vfio_pci_is_vga(struct pci_dev *pdev)
{
	return (pdev->class >> 8) == PCI_CLASS_DISPLAY_VGA;
}

static void vfio_pci_try_bus_reset(struct vfio_pci_device *vdev);

static int vfio_pci_enable(struct vfio_pci_device *vdev)
{
	struct pci_dev *pdev = vdev->pdev;
	int ret;
	u16 cmd;
	u8 msix_pos;

	pci_set_power_state(pdev, PCI_D0);

	/* Don't allow our initial saved state to include busmaster */
	pci_clear_master(pdev);

	ret = pci_enable_device(pdev);
	if (ret)
		return ret;

	vdev->reset_works = (pci_reset_function(pdev) == 0);
	pci_save_state(pdev);
	vdev->pci_saved_state = pci_store_saved_state(pdev);
	if (!vdev->pci_saved_state)
		pr_debug("%s: Couldn't store %s saved state\n",
			 __func__, dev_name(&pdev->dev));

	ret = vfio_config_init(vdev);
	if (ret) {
		kfree(vdev->pci_saved_state);
		vdev->pci_saved_state = NULL;
		pci_disable_device(pdev);
		return ret;
	}

	if (likely(!nointxmask))
		vdev->pci_2_3 = pci_intx_mask_supported(pdev);

	pci_read_config_word(pdev, PCI_COMMAND, &cmd);
	if (vdev->pci_2_3 && (cmd & PCI_COMMAND_INTX_DISABLE)) {
		cmd &= ~PCI_COMMAND_INTX_DISABLE;
		pci_write_config_word(pdev, PCI_COMMAND, cmd);
	}
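
	/*
	 * Illustrative decode of the MSI-X capability read below (values
	 * hypothetical): if PCI_MSIX_FLAGS reads back with a QSIZE field
	 * of 7, the table holds 7 + 1 = 8 entries of 16 bytes each, so
	 * msix_size = 128.  The low bits of the PCI_MSIX_TABLE dword (the
	 * BIR) select which BAR holds the table; the remaining bits give
	 * the table's offset within that BAR.
	 */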
	msix_pos = pdev->msix_cap;
	if (msix_pos) {
		u16 flags;
		u32 table;

		pci_read_config_word(pdev, msix_pos + PCI_MSIX_FLAGS, &flags);
		pci_read_config_dword(pdev, msix_pos + PCI_MSIX_TABLE, &table);

		vdev->msix_bar = table & PCI_MSIX_TABLE_BIR;
		vdev->msix_offset = table & PCI_MSIX_TABLE_OFFSET;
		vdev->msix_size = ((flags & PCI_MSIX_FLAGS_QSIZE) + 1) * 16;
	} else
		vdev->msix_bar = 0xFF;

	if (!vfio_vga_disabled() && vfio_pci_is_vga(pdev))
		vdev->has_vga = true;

	return 0;
}

static void vfio_pci_disable(struct vfio_pci_device *vdev)
{
	struct pci_dev *pdev = vdev->pdev;
	int bar;

	/* Stop the device from further DMA */
	pci_clear_master(pdev);

	vfio_pci_set_irqs_ioctl(vdev, VFIO_IRQ_SET_DATA_NONE |
				VFIO_IRQ_SET_ACTION_TRIGGER,
				vdev->irq_type, 0, 0, NULL);

	vdev->virq_disabled = false;

	vfio_config_free(vdev);

	for (bar = PCI_STD_RESOURCES; bar <= PCI_STD_RESOURCE_END; bar++) {
		if (!vdev->barmap[bar])
			continue;
		pci_iounmap(pdev, vdev->barmap[bar]);
		pci_release_selected_regions(pdev, 1 << bar);
		vdev->barmap[bar] = NULL;
	}

	vdev->needs_reset = true;

	/*
	 * If we have saved state, restore it.  If we can reset the device,
	 * even better.  Resetting with current state seems better than
	 * nothing, but saving and restoring current state without reset
	 * is just busy work.
	 */
	if (pci_load_and_free_saved_state(pdev, &vdev->pci_saved_state)) {
		pr_info("%s: Couldn't reload %s saved state\n",
			__func__, dev_name(&pdev->dev));

		if (!vdev->reset_works)
			goto out;

		pci_save_state(pdev);
	}

	/*
	 * Disable INTx and MSI, presumably to avoid spurious interrupts
	 * during reset.  Stolen from pci_reset_function()
	 */
	pci_write_config_word(pdev, PCI_COMMAND, PCI_COMMAND_INTX_DISABLE);

	/*
	 * Try to reset the device.  The success of this is dependent on
	 * being able to lock the device, which is not always possible.
	 */
	if (vdev->reset_works && !pci_try_reset_function(pdev))
		vdev->needs_reset = false;

	pci_restore_state(pdev);
out:
	pci_disable_device(pdev);

	vfio_pci_try_bus_reset(vdev);

	if (!disable_idle_d3)
		pci_set_power_state(pdev, PCI_D3hot);
}

static void vfio_pci_release(void *device_data)
{
	struct vfio_pci_device *vdev = device_data;

	mutex_lock(&driver_lock);

	if (!(--vdev->refcnt)) {
		vfio_spapr_pci_eeh_release(vdev->pdev);
		vfio_pci_disable(vdev);
	}

	mutex_unlock(&driver_lock);

	module_put(THIS_MODULE);
}

static int vfio_pci_open(void *device_data)
{
	struct vfio_pci_device *vdev = device_data;
	int ret = 0;

	if (!try_module_get(THIS_MODULE))
		return -ENODEV;

	mutex_lock(&driver_lock);

	if (!vdev->refcnt) {
		ret = vfio_pci_enable(vdev);
		if (ret)
			goto error;

		vfio_spapr_pci_eeh_open(vdev->pdev);
	}
	vdev->refcnt++;
error:
	mutex_unlock(&driver_lock);
	if (ret)
		module_put(THIS_MODULE);
	return ret;
}

static int vfio_pci_get_irq_count(struct vfio_pci_device *vdev, int irq_type)
{
	if (irq_type == VFIO_PCI_INTX_IRQ_INDEX) {
		u8 pin;

		pci_read_config_byte(vdev->pdev, PCI_INTERRUPT_PIN, &pin);
		if (IS_ENABLED(CONFIG_VFIO_PCI_INTX) && pin)
			return 1;

	} else if (irq_type == VFIO_PCI_MSI_IRQ_INDEX) {
		u8 pos;
		u16 flags;

		pos = vdev->pdev->msi_cap;
		if (pos) {
			pci_read_config_word(vdev->pdev,
					     pos + PCI_MSI_FLAGS, &flags);
			return 1 << ((flags & PCI_MSI_FLAGS_QMASK) >> 1);
		}
	} else if (irq_type == VFIO_PCI_MSIX_IRQ_INDEX) {
		u8 pos;
		u16 flags;

		pos = vdev->pdev->msix_cap;
		if (pos) {
			pci_read_config_word(vdev->pdev,
					     pos + PCI_MSIX_FLAGS, &flags);

			return (flags & PCI_MSIX_FLAGS_QSIZE) + 1;
		}
	} else if (irq_type == VFIO_PCI_ERR_IRQ_INDEX) {
		if (pci_is_pcie(vdev->pdev))
			return 1;
	} else if (irq_type == VFIO_PCI_REQ_IRQ_INDEX) {
		return 1;
	}

	return 0;
}
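
/*
 * Worked example for the MSI branch above (hypothetical value): the
 * Multiple Message Capable field sits in bits 3:1 of PCI_MSI_FLAGS and
 * encodes a power of two, so a field value of 3 means the device is
 * capable of 1 << 3 = 8 vectors.  MSI-X is simpler: QSIZE encodes the
 * table size minus one.
 */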
static int vfio_pci_count_devs(struct pci_dev *pdev, void *data)
{
	(*(int *)data)++;
	return 0;
}

struct vfio_pci_fill_info {
	int max;
	int cur;
	struct vfio_pci_dependent_device *devices;
};

static int vfio_pci_fill_devs(struct pci_dev *pdev, void *data)
{
	struct vfio_pci_fill_info *fill = data;
	struct iommu_group *iommu_group;

	if (fill->cur == fill->max)
		return -EAGAIN; /* Something changed, try again */

	iommu_group = iommu_group_get(&pdev->dev);
	if (!iommu_group)
		return -EPERM; /* Cannot reset non-isolated devices */

	fill->devices[fill->cur].group_id = iommu_group_id(iommu_group);
	fill->devices[fill->cur].segment = pci_domain_nr(pdev->bus);
	fill->devices[fill->cur].bus = pdev->bus->number;
	fill->devices[fill->cur].devfn = pdev->devfn;
	fill->cur++;
	iommu_group_put(iommu_group);
	return 0;
}

struct vfio_pci_group_entry {
	struct vfio_group *group;
	int id;
};

struct vfio_pci_group_info {
	int count;
	struct vfio_pci_group_entry *groups;
};

static int vfio_pci_validate_devs(struct pci_dev *pdev, void *data)
{
	struct vfio_pci_group_info *info = data;
	struct iommu_group *group;
	int id, i;

	group = iommu_group_get(&pdev->dev);
	if (!group)
		return -EPERM;

	id = iommu_group_id(group);

	for (i = 0; i < info->count; i++)
		if (info->groups[i].id == id)
			break;

	iommu_group_put(group);

	return (i == info->count) ? -EINVAL : 0;
}

static bool vfio_pci_dev_below_slot(struct pci_dev *pdev, struct pci_slot *slot)
{
	for (; pdev; pdev = pdev->bus->self)
		if (pdev->bus == slot->bus)
			return (pdev->slot == slot);
	return false;
}

struct vfio_pci_walk_info {
	int (*fn)(struct pci_dev *, void *data);
	void *data;
	struct pci_dev *pdev;
	bool slot;
	int ret;
};

static int vfio_pci_walk_wrapper(struct pci_dev *pdev, void *data)
{
	struct vfio_pci_walk_info *walk = data;

	if (!walk->slot || vfio_pci_dev_below_slot(pdev, walk->pdev->slot))
		walk->ret = walk->fn(pdev, walk->data);

	return walk->ret;
}

static int vfio_pci_for_each_slot_or_bus(struct pci_dev *pdev,
					 int (*fn)(struct pci_dev *,
						   void *data), void *data,
					 bool slot)
{
	struct vfio_pci_walk_info walk = {
		.fn = fn, .data = data, .pdev = pdev, .slot = slot, .ret = 0,
	};

	pci_walk_bus(pdev->bus, vfio_pci_walk_wrapper, &walk);

	return walk.ret;
}
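
/*
 * Sketch of the userspace side of the ioctls handled below (illustrative
 * only; error handling omitted, and the device fd is assumed to have been
 * obtained via VFIO_GROUP_GET_DEVICE_FD on a vfio group fd):
 *
 *	struct vfio_device_info info = { .argsz = sizeof(info) };
 *	struct vfio_region_info reg = { .argsz = sizeof(reg) };
 *
 *	ioctl(device_fd, VFIO_DEVICE_GET_INFO, &info);
 *	reg.index = VFIO_PCI_CONFIG_REGION_INDEX;
 *	ioctl(device_fd, VFIO_DEVICE_GET_REGION_INFO, &reg);
 *	// reg.offset/reg.size now describe config space for read()/write()
 */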
static long vfio_pci_ioctl(void *device_data,
			   unsigned int cmd, unsigned long arg)
{
	struct vfio_pci_device *vdev = device_data;
	unsigned long minsz;

	if (cmd == VFIO_DEVICE_GET_INFO) {
		struct vfio_device_info info;

		minsz = offsetofend(struct vfio_device_info, num_irqs);

		if (copy_from_user(&info, (void __user *)arg, minsz))
			return -EFAULT;

		if (info.argsz < minsz)
			return -EINVAL;

		info.flags = VFIO_DEVICE_FLAGS_PCI;

		if (vdev->reset_works)
			info.flags |= VFIO_DEVICE_FLAGS_RESET;

		info.num_regions = VFIO_PCI_NUM_REGIONS;
		info.num_irqs = VFIO_PCI_NUM_IRQS;

		return copy_to_user((void __user *)arg, &info, minsz) ?
			-EFAULT : 0;

	} else if (cmd == VFIO_DEVICE_GET_REGION_INFO) {
		struct pci_dev *pdev = vdev->pdev;
		struct vfio_region_info info;

		minsz = offsetofend(struct vfio_region_info, offset);

		if (copy_from_user(&info, (void __user *)arg, minsz))
			return -EFAULT;

		if (info.argsz < minsz)
			return -EINVAL;

		switch (info.index) {
		case VFIO_PCI_CONFIG_REGION_INDEX:
			info.offset = VFIO_PCI_INDEX_TO_OFFSET(info.index);
			info.size = pdev->cfg_size;
			info.flags = VFIO_REGION_INFO_FLAG_READ |
				     VFIO_REGION_INFO_FLAG_WRITE;
			break;
		case VFIO_PCI_BAR0_REGION_INDEX ... VFIO_PCI_BAR5_REGION_INDEX:
			info.offset = VFIO_PCI_INDEX_TO_OFFSET(info.index);
			info.size = pci_resource_len(pdev, info.index);
			if (!info.size) {
				info.flags = 0;
				break;
			}

			info.flags = VFIO_REGION_INFO_FLAG_READ |
				     VFIO_REGION_INFO_FLAG_WRITE;
			if (IS_ENABLED(CONFIG_VFIO_PCI_MMAP) &&
			    pci_resource_flags(pdev, info.index) &
			    IORESOURCE_MEM && info.size >= PAGE_SIZE)
				info.flags |= VFIO_REGION_INFO_FLAG_MMAP;
			break;
		case VFIO_PCI_ROM_REGION_INDEX:
		{
			void __iomem *io;
			size_t size;

			info.offset = VFIO_PCI_INDEX_TO_OFFSET(info.index);
			info.flags = 0;

			/* Report the BAR size, not the ROM size */
			info.size = pci_resource_len(pdev, info.index);
			if (!info.size)
				break;

			/* Is it really there? */
			io = pci_map_rom(pdev, &size);
			if (!io || !size) {
				info.size = 0;
				break;
			}
			pci_unmap_rom(pdev, io);

			info.flags = VFIO_REGION_INFO_FLAG_READ;
			break;
		}
		case VFIO_PCI_VGA_REGION_INDEX:
			if (!vdev->has_vga)
				return -EINVAL;

			info.offset = VFIO_PCI_INDEX_TO_OFFSET(info.index);
			info.size = 0xc0000;
			info.flags = VFIO_REGION_INFO_FLAG_READ |
				     VFIO_REGION_INFO_FLAG_WRITE;

			break;
		default:
			return -EINVAL;
		}

		return copy_to_user((void __user *)arg, &info, minsz) ?
			-EFAULT : 0;

	} else if (cmd == VFIO_DEVICE_GET_IRQ_INFO) {
		struct vfio_irq_info info;

		minsz = offsetofend(struct vfio_irq_info, count);

		if (copy_from_user(&info, (void __user *)arg, minsz))
			return -EFAULT;

		if (info.argsz < minsz || info.index >= VFIO_PCI_NUM_IRQS)
			return -EINVAL;

		switch (info.index) {
		case VFIO_PCI_INTX_IRQ_INDEX ... VFIO_PCI_MSIX_IRQ_INDEX:
		case VFIO_PCI_REQ_IRQ_INDEX:
			break;
		case VFIO_PCI_ERR_IRQ_INDEX:
			if (pci_is_pcie(vdev->pdev))
				break;
		/* pass thru to return error */
		default:
			return -EINVAL;
		}

		info.flags = VFIO_IRQ_INFO_EVENTFD;

		info.count = vfio_pci_get_irq_count(vdev, info.index);

		if (info.index == VFIO_PCI_INTX_IRQ_INDEX)
			info.flags |= (VFIO_IRQ_INFO_MASKABLE |
				       VFIO_IRQ_INFO_AUTOMASKED);
		else
			info.flags |= VFIO_IRQ_INFO_NORESIZE;

		return copy_to_user((void __user *)arg, &info, minsz) ?
			-EFAULT : 0;

	} else if (cmd == VFIO_DEVICE_SET_IRQS) {
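		/*
		 * Illustrative userspace call into this handler (assumed
		 * setup, not taken from this file): trigger-on-eventfd for
		 * MSI vector 0 is requested with a vfio_irq_set carrying
		 * a single int32_t eventfd payload:
		 *
		 *	struct { struct vfio_irq_set set; int32_t fd; } s = {
		 *		.set = {
		 *			.argsz = sizeof(s),
		 *			.flags = VFIO_IRQ_SET_DATA_EVENTFD |
		 *				 VFIO_IRQ_SET_ACTION_TRIGGER,
		 *			.index = VFIO_PCI_MSI_IRQ_INDEX,
		 *			.start = 0,
		 *			.count = 1,
		 *		},
		 *		.fd = efd,	// from eventfd(0, 0)
		 *	};
		 *	ioctl(device_fd, VFIO_DEVICE_SET_IRQS, &s.set);
		 */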
		struct vfio_irq_set hdr;
		u8 *data = NULL;
		int ret = 0;

		minsz = offsetofend(struct vfio_irq_set, count);

		if (copy_from_user(&hdr, (void __user *)arg, minsz))
			return -EFAULT;

		if (hdr.argsz < minsz || hdr.index >= VFIO_PCI_NUM_IRQS ||
		    hdr.flags & ~(VFIO_IRQ_SET_DATA_TYPE_MASK |
				  VFIO_IRQ_SET_ACTION_TYPE_MASK))
			return -EINVAL;

		if (!(hdr.flags & VFIO_IRQ_SET_DATA_NONE)) {
			size_t size;
			int max = vfio_pci_get_irq_count(vdev, hdr.index);

			if (hdr.flags & VFIO_IRQ_SET_DATA_BOOL)
				size = sizeof(uint8_t);
			else if (hdr.flags & VFIO_IRQ_SET_DATA_EVENTFD)
				size = sizeof(int32_t);
			else
				return -EINVAL;

			if (hdr.argsz - minsz < hdr.count * size ||
			    hdr.start >= max || hdr.start + hdr.count > max)
				return -EINVAL;

			data = memdup_user((void __user *)(arg + minsz),
					   hdr.count * size);
			if (IS_ERR(data))
				return PTR_ERR(data);
		}

		mutex_lock(&vdev->igate);

		ret = vfio_pci_set_irqs_ioctl(vdev, hdr.flags, hdr.index,
					      hdr.start, hdr.count, data);

		mutex_unlock(&vdev->igate);
		kfree(data);

		return ret;

	} else if (cmd == VFIO_DEVICE_RESET) {
		return vdev->reset_works ?
			pci_try_reset_function(vdev->pdev) : -EINVAL;

	} else if (cmd == VFIO_DEVICE_GET_PCI_HOT_RESET_INFO) {
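		/*
		 * Userspace typically calls this twice (a sketch of the
		 * assumed pattern, not mandated here): once with a bare
		 * header to learn the device count from the -ENOSPC reply,
		 * then again with argsz sized for the header plus count
		 * vfio_pci_dependent_device entries.
		 */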
		struct vfio_pci_hot_reset_info hdr;
		struct vfio_pci_fill_info fill = { 0 };
		struct vfio_pci_dependent_device *devices = NULL;
		bool slot = false;
		int ret = 0;

		minsz = offsetofend(struct vfio_pci_hot_reset_info, count);

		if (copy_from_user(&hdr, (void __user *)arg, minsz))
			return -EFAULT;

		if (hdr.argsz < minsz)
			return -EINVAL;

		hdr.flags = 0;

		/* Can we do a slot or bus reset or neither? */
		if (!pci_probe_reset_slot(vdev->pdev->slot))
			slot = true;
		else if (pci_probe_reset_bus(vdev->pdev->bus))
			return -ENODEV;

		/* How many devices are affected? */
		ret = vfio_pci_for_each_slot_or_bus(vdev->pdev,
						    vfio_pci_count_devs,
						    &fill.max, slot);
		if (ret)
			return ret;

		WARN_ON(!fill.max); /* Should always be at least one */

		/*
		 * If there's enough space, fill it now, otherwise return
		 * -ENOSPC and the number of devices affected.
		 */
		if (hdr.argsz < sizeof(hdr) + (fill.max * sizeof(*devices))) {
			ret = -ENOSPC;
			hdr.count = fill.max;
			goto reset_info_exit;
		}

		devices = kcalloc(fill.max, sizeof(*devices), GFP_KERNEL);
		if (!devices)
			return -ENOMEM;

		fill.devices = devices;

		ret = vfio_pci_for_each_slot_or_bus(vdev->pdev,
						    vfio_pci_fill_devs,
						    &fill, slot);

		/*
		 * If a device was removed between counting and filling,
		 * we may come up short of fill.max.  If a device was
		 * added, we'll have a return of -EAGAIN above.
		 */
		if (!ret)
			hdr.count = fill.cur;

reset_info_exit:
		if (copy_to_user((void __user *)arg, &hdr, minsz))
			ret = -EFAULT;

		if (!ret) {
			if (copy_to_user((void __user *)(arg + minsz), devices,
					 hdr.count * sizeof(*devices)))
				ret = -EFAULT;
		}

		kfree(devices);
		return ret;

	} else if (cmd == VFIO_DEVICE_PCI_HOT_RESET) {
		struct vfio_pci_hot_reset hdr;
		int32_t *group_fds;
		struct vfio_pci_group_entry *groups;
		struct vfio_pci_group_info info;
		bool slot = false;
		int i, count = 0, ret = 0;

		minsz = offsetofend(struct vfio_pci_hot_reset, count);

		if (copy_from_user(&hdr, (void __user *)arg, minsz))
			return -EFAULT;

		if (hdr.argsz < minsz || hdr.flags)
			return -EINVAL;

		/* Can we do a slot or bus reset or neither? */
		if (!pci_probe_reset_slot(vdev->pdev->slot))
			slot = true;
		else if (pci_probe_reset_bus(vdev->pdev->bus))
			return -ENODEV;

		/*
		 * We can't let userspace give us an arbitrarily large
		 * buffer to copy, so verify how many we think there
		 * could be.  Note groups can have multiple devices so
		 * one group per device is the max.
		 */
		ret = vfio_pci_for_each_slot_or_bus(vdev->pdev,
						    vfio_pci_count_devs,
						    &count, slot);
		if (ret)
			return ret;

		/* Somewhere between 1 and count is OK */
		if (!hdr.count || hdr.count > count)
			return -EINVAL;

		group_fds = kcalloc(hdr.count, sizeof(*group_fds), GFP_KERNEL);
		groups = kcalloc(hdr.count, sizeof(*groups), GFP_KERNEL);
		if (!group_fds || !groups) {
			kfree(group_fds);
			kfree(groups);
			return -ENOMEM;
		}

		if (copy_from_user(group_fds, (void __user *)(arg + minsz),
				   hdr.count * sizeof(*group_fds))) {
			kfree(group_fds);
			kfree(groups);
			return -EFAULT;
		}

		/*
		 * For each group_fd, get the group through the vfio external
		 * user interface and store the group and iommu ID.  This
		 * ensures the group is held across the reset.
		 */
		for (i = 0; i < hdr.count; i++) {
			struct vfio_group *group;
			struct fd f = fdget(group_fds[i]);
			if (!f.file) {
				ret = -EBADF;
				break;
			}

			group = vfio_group_get_external_user(f.file);
			fdput(f);
			if (IS_ERR(group)) {
				ret = PTR_ERR(group);
				break;
			}

			groups[i].group = group;
			groups[i].id = vfio_external_user_iommu_id(group);
		}

		kfree(group_fds);

		/* release reference to groups on error */
		if (ret)
			goto hot_reset_release;

		info.count = hdr.count;
		info.groups = groups;

		/*
		 * Test whether all the affected devices are contained
		 * by the set of groups provided by the user.
		 */
		ret = vfio_pci_for_each_slot_or_bus(vdev->pdev,
						    vfio_pci_validate_devs,
						    &info, slot);
		if (!ret)
			/* User has access, do the reset */
			ret = slot ? pci_try_reset_slot(vdev->pdev->slot) :
				     pci_try_reset_bus(vdev->pdev->bus);

hot_reset_release:
		for (i--; i >= 0; i--)
			vfio_group_put_external_user(groups[i].group);

		kfree(groups);
		return ret;
	}

	return -ENOTTY;
}
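
/*
 * Region offsets within the device fd encode the region index in the
 * high bits of the file offset (VFIO_PCI_OFFSET_SHIFT, defined in
 * vfio_pci_private.h); userspace doesn't need to know the encoding,
 * since VFIO_DEVICE_GET_REGION_INFO reports each region's offset.  A
 * config space read from userspace might look like (illustrative only):
 *
 *	pread(device_fd, buf, len, reg.offset);	// reg from GET_REGION_INFO
 */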
static ssize_t vfio_pci_rw(void *device_data, char __user *buf,
			   size_t count, loff_t *ppos, bool iswrite)
{
	unsigned int index = VFIO_PCI_OFFSET_TO_INDEX(*ppos);
	struct vfio_pci_device *vdev = device_data;

	if (index >= VFIO_PCI_NUM_REGIONS)
		return -EINVAL;

	switch (index) {
	case VFIO_PCI_CONFIG_REGION_INDEX:
		return vfio_pci_config_rw(vdev, buf, count, ppos, iswrite);

	case VFIO_PCI_ROM_REGION_INDEX:
		if (iswrite)
			return -EINVAL;
		return vfio_pci_bar_rw(vdev, buf, count, ppos, false);

	case VFIO_PCI_BAR0_REGION_INDEX ... VFIO_PCI_BAR5_REGION_INDEX:
		return vfio_pci_bar_rw(vdev, buf, count, ppos, iswrite);

	case VFIO_PCI_VGA_REGION_INDEX:
		return vfio_pci_vga_rw(vdev, buf, count, ppos, iswrite);
	}

	return -EINVAL;
}

static ssize_t vfio_pci_read(void *device_data, char __user *buf,
			     size_t count, loff_t *ppos)
{
	if (!count)
		return 0;

	return vfio_pci_rw(device_data, buf, count, ppos, false);
}

static ssize_t vfio_pci_write(void *device_data, const char __user *buf,
			      size_t count, loff_t *ppos)
{
	if (!count)
		return 0;

	return vfio_pci_rw(device_data, (char __user *)buf, count, ppos, true);
}
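
/*
 * A BAR that reports VFIO_REGION_INFO_FLAG_MMAP can be mapped directly
 * (illustrative userspace sketch; reg is the result of
 * VFIO_DEVICE_GET_REGION_INFO for a BAR region):
 *
 *	void *bar = mmap(NULL, reg.size, PROT_READ | PROT_WRITE,
 *			 MAP_SHARED, device_fd, reg.offset);
 *
 * Mappings overlapping the MSI-X table are rejected below.
 */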
static int vfio_pci_mmap(void *device_data, struct vm_area_struct *vma)
{
	struct vfio_pci_device *vdev = device_data;
	struct pci_dev *pdev = vdev->pdev;
	unsigned int index;
	u64 phys_len, req_len, pgoff, req_start;
	int ret;

	index = vma->vm_pgoff >> (VFIO_PCI_OFFSET_SHIFT - PAGE_SHIFT);

	if (vma->vm_end < vma->vm_start)
		return -EINVAL;
	if ((vma->vm_flags & VM_SHARED) == 0)
		return -EINVAL;
	if (index >= VFIO_PCI_ROM_REGION_INDEX)
		return -EINVAL;
	if (!(pci_resource_flags(pdev, index) & IORESOURCE_MEM))
		return -EINVAL;

	phys_len = pci_resource_len(pdev, index);
	req_len = vma->vm_end - vma->vm_start;
	pgoff = vma->vm_pgoff &
		((1U << (VFIO_PCI_OFFSET_SHIFT - PAGE_SHIFT)) - 1);
	req_start = pgoff << PAGE_SHIFT;

	if (phys_len < PAGE_SIZE || req_start + req_len > phys_len)
		return -EINVAL;

	if (index == vdev->msix_bar) {
		/*
		 * Disallow mmaps overlapping the MSI-X table; users don't
		 * get to touch this directly.  We could find somewhere
		 * else to map the overlap, but page granularity is only
		 * a recommendation, not a requirement, so the user needs
		 * to know which bits are real.  Requiring them to mmap
		 * around the table makes that clear.
		 */

		/* If neither entirely above nor below, then it overlaps */
		if (!(req_start >= vdev->msix_offset + vdev->msix_size ||
		      req_start + req_len <= vdev->msix_offset))
			return -EINVAL;
	}

	/*
	 * Even though we don't make use of the barmap for the mmap,
	 * we need to request the region and the barmap tracks that.
	 */
	if (!vdev->barmap[index]) {
		ret = pci_request_selected_regions(pdev,
						   1 << index, "vfio-pci");
		if (ret)
			return ret;

		vdev->barmap[index] = pci_iomap(pdev, index, 0);
	}

	vma->vm_private_data = vdev;
	vma->vm_page_prot = pgprot_noncached(vma->vm_page_prot);
	vma->vm_pgoff = (pci_resource_start(pdev, index) >> PAGE_SHIFT) + pgoff;

	return remap_pfn_range(vma, vma->vm_start, vma->vm_pgoff,
			       req_len, vma->vm_page_prot);
}

static void vfio_pci_request(void *device_data, unsigned int count)
{
	struct vfio_pci_device *vdev = device_data;

	mutex_lock(&vdev->igate);

	if (vdev->req_trigger) {
		if (!(count % 10))
			dev_notice_ratelimited(&vdev->pdev->dev,
				"Relaying device request to user (#%u)\n",
				count);
		eventfd_signal(vdev->req_trigger, 1);
	} else if (count == 0) {
		dev_warn(&vdev->pdev->dev,
			"No device request channel registered, blocked until released by user\n");
	}

	mutex_unlock(&vdev->igate);
}

static const struct vfio_device_ops vfio_pci_ops = {
	.name		= "vfio-pci",
	.open		= vfio_pci_open,
	.release	= vfio_pci_release,
	.ioctl		= vfio_pci_ioctl,
	.read		= vfio_pci_read,
	.write		= vfio_pci_write,
	.mmap		= vfio_pci_mmap,
	.request	= vfio_pci_request,
};
static int vfio_pci_probe(struct pci_dev *pdev, const struct pci_device_id *id)
{
	struct vfio_pci_device *vdev;
	struct iommu_group *group;
	int ret;

	if (pdev->hdr_type != PCI_HEADER_TYPE_NORMAL)
		return -EINVAL;

	group = iommu_group_get(&pdev->dev);
	if (!group)
		return -EINVAL;

	vdev = kzalloc(sizeof(*vdev), GFP_KERNEL);
	if (!vdev) {
		iommu_group_put(group);
		return -ENOMEM;
	}

	vdev->pdev = pdev;
	vdev->irq_type = VFIO_PCI_NUM_IRQS;
	mutex_init(&vdev->igate);
	spin_lock_init(&vdev->irqlock);

	ret = vfio_add_group_dev(&pdev->dev, &vfio_pci_ops, vdev);
	if (ret) {
		iommu_group_put(group);
		kfree(vdev);
		return ret;
	}

	if (vfio_pci_is_vga(pdev)) {
		vga_client_register(pdev, vdev, NULL, vfio_pci_set_vga_decode);
		vga_set_legacy_decoding(pdev,
					vfio_pci_set_vga_decode(vdev, false));
	}

	if (!disable_idle_d3) {
		/*
		 * pci-core sets the device power state to an unknown value
		 * at bootup and after being removed from a driver.  The
		 * only transition it allows from this unknown state is to
		 * D0, which typically happens when a driver calls
		 * pci_enable_device().  We're not ready to enable the
		 * device yet, but we do want to be able to get to D3.
		 * Therefore first do a D0 transition before going to D3.
		 */
		pci_set_power_state(pdev, PCI_D0);
		pci_set_power_state(pdev, PCI_D3hot);
	}

	return ret;
}

static void vfio_pci_remove(struct pci_dev *pdev)
{
	struct vfio_pci_device *vdev;

	vdev = vfio_del_group_dev(&pdev->dev);
	if (!vdev)
		return;

	iommu_group_put(pdev->dev.iommu_group);
	kfree(vdev);

	if (vfio_pci_is_vga(pdev)) {
		vga_client_register(pdev, NULL, NULL, NULL);
		vga_set_legacy_decoding(pdev,
				VGA_RSRC_NORMAL_IO | VGA_RSRC_NORMAL_MEM |
				VGA_RSRC_LEGACY_IO | VGA_RSRC_LEGACY_MEM);
	}

	if (!disable_idle_d3)
		pci_set_power_state(pdev, PCI_D0);
}

static pci_ers_result_t vfio_pci_aer_err_detected(struct pci_dev *pdev,
						  pci_channel_state_t state)
{
	struct vfio_pci_device *vdev;
	struct vfio_device *device;

	device = vfio_device_get_from_dev(&pdev->dev);
	if (device == NULL)
		return PCI_ERS_RESULT_DISCONNECT;

	vdev = vfio_device_data(device);
	if (vdev == NULL) {
		vfio_device_put(device);
		return PCI_ERS_RESULT_DISCONNECT;
	}

	mutex_lock(&vdev->igate);

	if (vdev->err_trigger)
		eventfd_signal(vdev->err_trigger, 1);

	mutex_unlock(&vdev->igate);

	vfio_device_put(device);

	return PCI_ERS_RESULT_CAN_RECOVER;
}

static struct pci_error_handlers vfio_err_handlers = {
	.error_detected = vfio_pci_aer_err_detected,
};

static struct pci_driver vfio_pci_driver = {
	.name		= "vfio-pci",
	.id_table	= NULL, /* only dynamic ids */
	.probe		= vfio_pci_probe,
	.remove		= vfio_pci_remove,
	.err_handler	= &vfio_err_handlers,
};

struct vfio_devices {
	struct vfio_device **devices;
	int cur_index;
	int max_index;
};

static int vfio_pci_get_devs(struct pci_dev *pdev, void *data)
{
	struct vfio_devices *devs = data;
	struct pci_driver *pci_drv = ACCESS_ONCE(pdev->driver);

	if (pci_drv != &vfio_pci_driver)
		return -EBUSY;

	if (devs->cur_index == devs->max_index)
		return -ENOSPC;

	devs->devices[devs->cur_index] = vfio_device_get_from_dev(&pdev->dev);
	if (!devs->devices[devs->cur_index])
		return -EINVAL;

	devs->cur_index++;
	return 0;
}

/*
 * Attempt to do a bus/slot reset if there are devices affected by a reset for
 * this device that are needs_reset and all of the affected devices are unused
 * (!refcnt).  Callers are required to hold driver_lock when calling this to
 * prevent device opens and concurrent bus reset attempts.  We prevent device
 * unbinds by acquiring and holding a reference to the vfio_device.
 *
 * NB: vfio-core considers a group to be viable even if some devices are
 * bound to drivers like pci-stub or pcieport.  Here we require all devices
 * to be bound to vfio-pci since that's the only way we can be sure they
 * stay put.
 */
static void vfio_pci_try_bus_reset(struct vfio_pci_device *vdev)
{
	struct vfio_devices devs = { .cur_index = 0 };
	int i = 0, ret = -EINVAL;
	bool needs_reset = false, slot = false;
	struct vfio_pci_device *tmp;

	if (!pci_probe_reset_slot(vdev->pdev->slot))
		slot = true;
	else if (pci_probe_reset_bus(vdev->pdev->bus))
		return;

	if (vfio_pci_for_each_slot_or_bus(vdev->pdev, vfio_pci_count_devs,
					  &i, slot) || !i)
		return;

	devs.max_index = i;
	devs.devices = kcalloc(i, sizeof(struct vfio_device *), GFP_KERNEL);
	if (!devs.devices)
		return;

	if (vfio_pci_for_each_slot_or_bus(vdev->pdev,
					  vfio_pci_get_devs, &devs, slot))
		goto put_devs;

	for (i = 0; i < devs.cur_index; i++) {
		tmp = vfio_device_data(devs.devices[i]);
		if (tmp->needs_reset)
			needs_reset = true;
		if (tmp->refcnt)
			goto put_devs;
	}

	if (needs_reset)
		ret = slot ? pci_try_reset_slot(vdev->pdev->slot) :
			     pci_try_reset_bus(vdev->pdev->bus);

put_devs:
	for (i = 0; i < devs.cur_index; i++) {
		tmp = vfio_device_data(devs.devices[i]);
		if (!ret)
			tmp->needs_reset = false;

		if (!tmp->refcnt && !disable_idle_d3)
			pci_set_power_state(tmp->pdev, PCI_D3hot);

		vfio_device_put(devs.devices[i]);
	}

	kfree(devs.devices);
}

static void __exit vfio_pci_cleanup(void)
{
	pci_unregister_driver(&vfio_pci_driver);
	vfio_pci_uninit_perm_bits();
}
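
/*
 * Worked example of the "ids" parsing below (hypothetical device IDs):
 * "1234:5678,abcd:ef01:abcd:0001" adds two dynamic IDs, the first
 * matching any subsystem, the second only subvendor abcd / subdevice
 * 0001.  When given, class and class_mask compare against the full
 * 24-bit PCI class word of candidate devices.
 */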
static void __init vfio_pci_fill_ids(void)
{
	char *p, *id;
	int rc;

	/* nothing to do if no ids were passed */
	if (ids[0] == '\0')
		return;

	/* add ids specified in the module parameter */
	p = ids;
	while ((id = strsep(&p, ","))) {
		unsigned int vendor, device, subvendor = PCI_ANY_ID,
			subdevice = PCI_ANY_ID, class = 0, class_mask = 0;
		int fields;

		if (!strlen(id))
			continue;

		fields = sscanf(id, "%x:%x:%x:%x:%x:%x",
				&vendor, &device, &subvendor, &subdevice,
				&class, &class_mask);

		if (fields < 2) {
			pr_warn("invalid id string \"%s\"\n", id);
			continue;
		}

		rc = pci_add_dynid(&vfio_pci_driver, vendor, device,
				   subvendor, subdevice, class, class_mask, 0);
		if (rc)
			pr_warn("failed to add dynamic id [%04hx:%04hx[%04hx:%04hx]] class %#08x/%08x (%d)\n",
				vendor, device, subvendor, subdevice,
				class, class_mask, rc);
		else
			pr_info("add [%04hx:%04hx[%04hx:%04hx]] class %#08x/%08x\n",
				vendor, device, subvendor, subdevice,
				class, class_mask);
	}
}

static int __init vfio_pci_init(void)
{
	int ret;

	/* Allocate shared config space permission data used by all devices */
	ret = vfio_pci_init_perm_bits();
	if (ret)
		return ret;

	/* Register and scan for devices */
	ret = pci_register_driver(&vfio_pci_driver);
	if (ret)
		goto out_driver;

	vfio_pci_fill_ids();

	return 0;

out_driver:
	vfio_pci_uninit_perm_bits();
	return ret;
}

module_init(vfio_pci_init);
module_exit(vfio_pci_cleanup);

MODULE_VERSION(DRIVER_VERSION);
MODULE_LICENSE("GPL v2");
MODULE_AUTHOR(DRIVER_AUTHOR);
MODULE_DESCRIPTION(DRIVER_DESC);