/*
 * Copyright (C) 2007-2010 Advanced Micro Devices, Inc.
 * Author: Joerg Roedel <jroedel@suse.de>
 *         Leo Duran <leo.duran@amd.com>
 *
 * This program is free software; you can redistribute it and/or modify it
 * under the terms of the GNU General Public License version 2 as published
 * by the Free Software Foundation.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program; if not, write to the Free Software
 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
 */

#include <linux/ratelimit.h>
#include <linux/pci.h>
#include <linux/pci-ats.h>
#include <linux/bitmap.h>
#include <linux/slab.h>
#include <linux/debugfs.h>
#include <linux/scatterlist.h>
#include <linux/dma-mapping.h>
#include <linux/iommu-helper.h>
#include <linux/iommu.h>
#include <linux/delay.h>
#include <linux/amd-iommu.h>
#include <linux/notifier.h>
#include <linux/export.h>
#include <linux/irq.h>
#include <linux/msi.h>
#include <linux/dma-contiguous.h>
#include <asm/irq_remapping.h>
#include <asm/io_apic.h>
#include <asm/apic.h>
#include <asm/hw_irq.h>
#include <asm/msidef.h>
#include <asm/proto.h>
#include <asm/iommu.h>
#include <asm/gart.h>
#include <asm/dma.h>

#include "amd_iommu_proto.h"
#include "amd_iommu_types.h"
#include "irq_remapping.h"

#define CMD_SET_TYPE(cmd, t) ((cmd)->data[1] |= ((t) << 28))

#define LOOP_TIMEOUT    100000

/*
 * This bitmap is used to advertise the page sizes our hardware supports
 * to the IOMMU core, which will then use this information to split
 * physically contiguous memory regions it is mapping into page sizes
 * that we support.
 *
 * 512GB pages are not supported due to a hardware bug.
 */
#define AMD_IOMMU_PGSIZES       ((~0xFFFUL) & ~(2ULL << 38))

static DEFINE_RWLOCK(amd_iommu_devtable_lock);

/* A list of preallocated protection domains */
static LIST_HEAD(iommu_pd_list);
static DEFINE_SPINLOCK(iommu_pd_list_lock);

/* List of all available dev_data structures */
static LIST_HEAD(dev_data_list);
static DEFINE_SPINLOCK(dev_data_list_lock);

LIST_HEAD(ioapic_map);
LIST_HEAD(hpet_map);

/*
 * Domain for untranslated devices - only allocated
 * if iommu=pt passed on kernel cmd line.
81 */ 82static struct protection_domain *pt_domain; 83 84static const struct iommu_ops amd_iommu_ops; 85 86static ATOMIC_NOTIFIER_HEAD(ppr_notifier); 87int amd_iommu_max_glx_val = -1; 88 89static struct dma_map_ops amd_iommu_dma_ops; 90 91/* 92 * This struct contains device specific data for the IOMMU 93 */ 94struct iommu_dev_data { 95 struct list_head list; /* For domain->dev_list */ 96 struct list_head dev_data_list; /* For global dev_data_list */ 97 struct list_head alias_list; /* Link alias-groups together */ 98 struct iommu_dev_data *alias_data;/* The alias dev_data */ 99 struct protection_domain *domain; /* Domain the device is bound to */ 100 u16 devid; /* PCI Device ID */ 101 bool iommu_v2; /* Device can make use of IOMMUv2 */ 102 bool passthrough; /* Default for device is pt_domain */ 103 struct { 104 bool enabled; 105 int qdep; 106 } ats; /* ATS state */ 107 bool pri_tlp; /* PASID TLB required for 108 PPR completions */ 109 u32 errata; /* Bitmap for errata to apply */ 110}; 111 112/* 113 * general struct to manage commands send to an IOMMU 114 */ 115struct iommu_cmd { 116 u32 data[4]; 117}; 118 119struct kmem_cache *amd_iommu_irq_cache; 120 121static void update_domain(struct protection_domain *domain); 122static int __init alloc_passthrough_domain(void); 123 124/**************************************************************************** 125 * 126 * Helper functions 127 * 128 ****************************************************************************/ 129 130static struct protection_domain *to_pdomain(struct iommu_domain *dom) 131{ 132 return container_of(dom, struct protection_domain, domain); 133} 134 135static struct iommu_dev_data *alloc_dev_data(u16 devid) 136{ 137 struct iommu_dev_data *dev_data; 138 unsigned long flags; 139 140 dev_data = kzalloc(sizeof(*dev_data), GFP_KERNEL); 141 if (!dev_data) 142 return NULL; 143 144 INIT_LIST_HEAD(&dev_data->alias_list); 145 146 dev_data->devid = devid; 147 148 spin_lock_irqsave(&dev_data_list_lock, flags); 149 list_add_tail(&dev_data->dev_data_list, &dev_data_list); 150 spin_unlock_irqrestore(&dev_data_list_lock, flags); 151 152 return dev_data; 153} 154 155static void free_dev_data(struct iommu_dev_data *dev_data) 156{ 157 unsigned long flags; 158 159 spin_lock_irqsave(&dev_data_list_lock, flags); 160 list_del(&dev_data->dev_data_list); 161 spin_unlock_irqrestore(&dev_data_list_lock, flags); 162 163 kfree(dev_data); 164} 165 166static struct iommu_dev_data *search_dev_data(u16 devid) 167{ 168 struct iommu_dev_data *dev_data; 169 unsigned long flags; 170 171 spin_lock_irqsave(&dev_data_list_lock, flags); 172 list_for_each_entry(dev_data, &dev_data_list, dev_data_list) { 173 if (dev_data->devid == devid) 174 goto out_unlock; 175 } 176 177 dev_data = NULL; 178 179out_unlock: 180 spin_unlock_irqrestore(&dev_data_list_lock, flags); 181 182 return dev_data; 183} 184 185static struct iommu_dev_data *find_dev_data(u16 devid) 186{ 187 struct iommu_dev_data *dev_data; 188 189 dev_data = search_dev_data(devid); 190 191 if (dev_data == NULL) 192 dev_data = alloc_dev_data(devid); 193 194 return dev_data; 195} 196 197static inline u16 get_device_id(struct device *dev) 198{ 199 struct pci_dev *pdev = to_pci_dev(dev); 200 201 return PCI_DEVID(pdev->bus->number, pdev->devfn); 202} 203 204static struct iommu_dev_data *get_dev_data(struct device *dev) 205{ 206 return dev->archdata.iommu; 207} 208 209static bool pci_iommuv2_capable(struct pci_dev *pdev) 210{ 211 static const int caps[] = { 212 PCI_EXT_CAP_ID_ATS, 213 PCI_EXT_CAP_ID_PRI, 214 
PCI_EXT_CAP_ID_PASID, 215 }; 216 int i, pos; 217 218 for (i = 0; i < 3; ++i) { 219 pos = pci_find_ext_capability(pdev, caps[i]); 220 if (pos == 0) 221 return false; 222 } 223 224 return true; 225} 226 227static bool pdev_pri_erratum(struct pci_dev *pdev, u32 erratum) 228{ 229 struct iommu_dev_data *dev_data; 230 231 dev_data = get_dev_data(&pdev->dev); 232 233 return dev_data->errata & (1 << erratum) ? true : false; 234} 235 236/* 237 * In this function the list of preallocated protection domains is traversed to 238 * find the domain for a specific device 239 */ 240static struct dma_ops_domain *find_protection_domain(u16 devid) 241{ 242 struct dma_ops_domain *entry, *ret = NULL; 243 unsigned long flags; 244 u16 alias = amd_iommu_alias_table[devid]; 245 246 if (list_empty(&iommu_pd_list)) 247 return NULL; 248 249 spin_lock_irqsave(&iommu_pd_list_lock, flags); 250 251 list_for_each_entry(entry, &iommu_pd_list, list) { 252 if (entry->target_dev == devid || 253 entry->target_dev == alias) { 254 ret = entry; 255 break; 256 } 257 } 258 259 spin_unlock_irqrestore(&iommu_pd_list_lock, flags); 260 261 return ret; 262} 263 264/* 265 * This function checks if the driver got a valid device from the caller to 266 * avoid dereferencing invalid pointers. 267 */ 268static bool check_device(struct device *dev) 269{ 270 u16 devid; 271 272 if (!dev || !dev->dma_mask) 273 return false; 274 275 /* No PCI device */ 276 if (!dev_is_pci(dev)) 277 return false; 278 279 devid = get_device_id(dev); 280 281 /* Out of our scope? */ 282 if (devid > amd_iommu_last_bdf) 283 return false; 284 285 if (amd_iommu_rlookup_table[devid] == NULL) 286 return false; 287 288 return true; 289} 290 291static void init_iommu_group(struct device *dev) 292{ 293 struct iommu_group *group; 294 295 group = iommu_group_get_for_dev(dev); 296 if (!IS_ERR(group)) 297 iommu_group_put(group); 298} 299 300static int __last_alias(struct pci_dev *pdev, u16 alias, void *data) 301{ 302 *(u16 *)data = alias; 303 return 0; 304} 305 306static u16 get_alias(struct device *dev) 307{ 308 struct pci_dev *pdev = to_pci_dev(dev); 309 u16 devid, ivrs_alias, pci_alias; 310 311 devid = get_device_id(dev); 312 ivrs_alias = amd_iommu_alias_table[devid]; 313 pci_for_each_dma_alias(pdev, __last_alias, &pci_alias); 314 315 if (ivrs_alias == pci_alias) 316 return ivrs_alias; 317 318 /* 319 * DMA alias showdown 320 * 321 * The IVRS is fairly reliable in telling us about aliases, but it 322 * can't know about every screwy device. If we don't have an IVRS 323 * reported alias, use the PCI reported alias. In that case we may 324 * still need to initialize the rlookup and dev_table entries if the 325 * alias is to a non-existent device. 326 */ 327 if (ivrs_alias == devid) { 328 if (!amd_iommu_rlookup_table[pci_alias]) { 329 amd_iommu_rlookup_table[pci_alias] = 330 amd_iommu_rlookup_table[devid]; 331 memcpy(amd_iommu_dev_table[pci_alias].data, 332 amd_iommu_dev_table[devid].data, 333 sizeof(amd_iommu_dev_table[pci_alias].data)); 334 } 335 336 return pci_alias; 337 } 338 339 pr_info("AMD-Vi: Using IVRS reported alias %02x:%02x.%d " 340 "for device %s[%04x:%04x], kernel reported alias " 341 "%02x:%02x.%d\n", PCI_BUS_NUM(ivrs_alias), PCI_SLOT(ivrs_alias), 342 PCI_FUNC(ivrs_alias), dev_name(dev), pdev->vendor, pdev->device, 343 PCI_BUS_NUM(pci_alias), PCI_SLOT(pci_alias), 344 PCI_FUNC(pci_alias)); 345 346 /* 347 * If we don't have a PCI DMA alias and the IVRS alias is on the same 348 * bus, then the IVRS table may know about a quirk that we don't. 
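         *
         * In that case the code below also hands the IVRS alias to the PCI
         * core: PCI_DEV_FLAGS_DMA_ALIAS_DEVFN plus dma_alias_devfn (the low
         * byte of the IVRS alias) should make subsequent DMA-alias walks,
         * e.g. for IOMMU group assignment, report the quirked devfn as well.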
349 */ 350 if (pci_alias == devid && 351 PCI_BUS_NUM(ivrs_alias) == pdev->bus->number) { 352 pdev->dev_flags |= PCI_DEV_FLAGS_DMA_ALIAS_DEVFN; 353 pdev->dma_alias_devfn = ivrs_alias & 0xff; 354 pr_info("AMD-Vi: Added PCI DMA alias %02x.%d for %s\n", 355 PCI_SLOT(ivrs_alias), PCI_FUNC(ivrs_alias), 356 dev_name(dev)); 357 } 358 359 return ivrs_alias; 360} 361 362static int iommu_init_device(struct device *dev) 363{ 364 struct pci_dev *pdev = to_pci_dev(dev); 365 struct iommu_dev_data *dev_data; 366 u16 alias; 367 368 if (dev->archdata.iommu) 369 return 0; 370 371 dev_data = find_dev_data(get_device_id(dev)); 372 if (!dev_data) 373 return -ENOMEM; 374 375 alias = get_alias(dev); 376 377 if (alias != dev_data->devid) { 378 struct iommu_dev_data *alias_data; 379 380 alias_data = find_dev_data(alias); 381 if (alias_data == NULL) { 382 pr_err("AMD-Vi: Warning: Unhandled device %s\n", 383 dev_name(dev)); 384 free_dev_data(dev_data); 385 return -ENOTSUPP; 386 } 387 dev_data->alias_data = alias_data; 388 389 /* Add device to the alias_list */ 390 list_add(&dev_data->alias_list, &alias_data->alias_list); 391 } 392 393 if (pci_iommuv2_capable(pdev)) { 394 struct amd_iommu *iommu; 395 396 iommu = amd_iommu_rlookup_table[dev_data->devid]; 397 dev_data->iommu_v2 = iommu->is_iommu_v2; 398 } 399 400 dev->archdata.iommu = dev_data; 401 402 iommu_device_link(amd_iommu_rlookup_table[dev_data->devid]->iommu_dev, 403 dev); 404 405 return 0; 406} 407 408static void iommu_ignore_device(struct device *dev) 409{ 410 u16 devid, alias; 411 412 devid = get_device_id(dev); 413 alias = amd_iommu_alias_table[devid]; 414 415 memset(&amd_iommu_dev_table[devid], 0, sizeof(struct dev_table_entry)); 416 memset(&amd_iommu_dev_table[alias], 0, sizeof(struct dev_table_entry)); 417 418 amd_iommu_rlookup_table[devid] = NULL; 419 amd_iommu_rlookup_table[alias] = NULL; 420} 421 422static void iommu_uninit_device(struct device *dev) 423{ 424 struct iommu_dev_data *dev_data = search_dev_data(get_device_id(dev)); 425 426 if (!dev_data) 427 return; 428 429 iommu_device_unlink(amd_iommu_rlookup_table[dev_data->devid]->iommu_dev, 430 dev); 431 432 iommu_group_remove_device(dev); 433 434 /* Unlink from alias, it may change if another device is re-plugged */ 435 dev_data->alias_data = NULL; 436 437 /* 438 * We keep dev_data around for unplugged devices and reuse it when the 439 * device is re-plugged - not doing so would introduce a ton of races. 440 */ 441} 442 443void __init amd_iommu_uninit_devices(void) 444{ 445 struct iommu_dev_data *dev_data, *n; 446 struct pci_dev *pdev = NULL; 447 448 for_each_pci_dev(pdev) { 449 450 if (!check_device(&pdev->dev)) 451 continue; 452 453 iommu_uninit_device(&pdev->dev); 454 } 455 456 /* Free all of our dev_data structures */ 457 list_for_each_entry_safe(dev_data, n, &dev_data_list, dev_data_list) 458 free_dev_data(dev_data); 459} 460 461int __init amd_iommu_init_devices(void) 462{ 463 struct pci_dev *pdev = NULL; 464 int ret = 0; 465 466 for_each_pci_dev(pdev) { 467 468 if (!check_device(&pdev->dev)) 469 continue; 470 471 ret = iommu_init_device(&pdev->dev); 472 if (ret == -ENOTSUPP) 473 iommu_ignore_device(&pdev->dev); 474 else if (ret) 475 goto out_free; 476 } 477 478 /* 479 * Initialize IOMMU groups only after iommu_init_device() has 480 * had a chance to populate any IVRS defined aliases. 
481 */ 482 for_each_pci_dev(pdev) { 483 if (check_device(&pdev->dev)) 484 init_iommu_group(&pdev->dev); 485 } 486 487 return 0; 488 489out_free: 490 491 amd_iommu_uninit_devices(); 492 493 return ret; 494} 495#ifdef CONFIG_AMD_IOMMU_STATS 496 497/* 498 * Initialization code for statistics collection 499 */ 500 501DECLARE_STATS_COUNTER(compl_wait); 502DECLARE_STATS_COUNTER(cnt_map_single); 503DECLARE_STATS_COUNTER(cnt_unmap_single); 504DECLARE_STATS_COUNTER(cnt_map_sg); 505DECLARE_STATS_COUNTER(cnt_unmap_sg); 506DECLARE_STATS_COUNTER(cnt_alloc_coherent); 507DECLARE_STATS_COUNTER(cnt_free_coherent); 508DECLARE_STATS_COUNTER(cross_page); 509DECLARE_STATS_COUNTER(domain_flush_single); 510DECLARE_STATS_COUNTER(domain_flush_all); 511DECLARE_STATS_COUNTER(alloced_io_mem); 512DECLARE_STATS_COUNTER(total_map_requests); 513DECLARE_STATS_COUNTER(complete_ppr); 514DECLARE_STATS_COUNTER(invalidate_iotlb); 515DECLARE_STATS_COUNTER(invalidate_iotlb_all); 516DECLARE_STATS_COUNTER(pri_requests); 517 518static struct dentry *stats_dir; 519static struct dentry *de_fflush; 520 521static void amd_iommu_stats_add(struct __iommu_counter *cnt) 522{ 523 if (stats_dir == NULL) 524 return; 525 526 cnt->dent = debugfs_create_u64(cnt->name, 0444, stats_dir, 527 &cnt->value); 528} 529 530static void amd_iommu_stats_init(void) 531{ 532 stats_dir = debugfs_create_dir("amd-iommu", NULL); 533 if (stats_dir == NULL) 534 return; 535 536 de_fflush = debugfs_create_bool("fullflush", 0444, stats_dir, 537 &amd_iommu_unmap_flush); 538 539 amd_iommu_stats_add(&compl_wait); 540 amd_iommu_stats_add(&cnt_map_single); 541 amd_iommu_stats_add(&cnt_unmap_single); 542 amd_iommu_stats_add(&cnt_map_sg); 543 amd_iommu_stats_add(&cnt_unmap_sg); 544 amd_iommu_stats_add(&cnt_alloc_coherent); 545 amd_iommu_stats_add(&cnt_free_coherent); 546 amd_iommu_stats_add(&cross_page); 547 amd_iommu_stats_add(&domain_flush_single); 548 amd_iommu_stats_add(&domain_flush_all); 549 amd_iommu_stats_add(&alloced_io_mem); 550 amd_iommu_stats_add(&total_map_requests); 551 amd_iommu_stats_add(&complete_ppr); 552 amd_iommu_stats_add(&invalidate_iotlb); 553 amd_iommu_stats_add(&invalidate_iotlb_all); 554 amd_iommu_stats_add(&pri_requests); 555} 556 557#endif 558 559/**************************************************************************** 560 * 561 * Interrupt handling functions 562 * 563 ****************************************************************************/ 564 565static void dump_dte_entry(u16 devid) 566{ 567 int i; 568 569 for (i = 0; i < 4; ++i) 570 pr_err("AMD-Vi: DTE[%d]: %016llx\n", i, 571 amd_iommu_dev_table[devid].data[i]); 572} 573 574static void dump_command(unsigned long phys_addr) 575{ 576 struct iommu_cmd *cmd = phys_to_virt(phys_addr); 577 int i; 578 579 for (i = 0; i < 4; ++i) 580 pr_err("AMD-Vi: CMD[%d]: %08x\n", i, cmd->data[i]); 581} 582 583static void iommu_print_event(struct amd_iommu *iommu, void *__evt) 584{ 585 int type, devid, domid, flags; 586 volatile u32 *event = __evt; 587 int count = 0; 588 u64 address; 589 590retry: 591 type = (event[1] >> EVENT_TYPE_SHIFT) & EVENT_TYPE_MASK; 592 devid = (event[0] >> EVENT_DEVID_SHIFT) & EVENT_DEVID_MASK; 593 domid = (event[1] >> EVENT_DOMID_SHIFT) & EVENT_DOMID_MASK; 594 flags = (event[1] >> EVENT_FLAGS_SHIFT) & EVENT_FLAGS_MASK; 595 address = (u64)(((u64)event[3]) << 32) | event[2]; 596 597 if (type == 0) { 598 /* Did we hit the erratum? 
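                 * (A type of 0 here means the IOMMU raised the event
                 * interrupt before it finished writing the entry.  The
                 * code below polls the entry for up to LOOP_TIMEOUT
                 * microseconds, one udelay(1) per retry, and gives up
                 * with an error message if it never becomes valid.)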
*/ 599 if (++count == LOOP_TIMEOUT) { 600 pr_err("AMD-Vi: No event written to event log\n"); 601 return; 602 } 603 udelay(1); 604 goto retry; 605 } 606 607 printk(KERN_ERR "AMD-Vi: Event logged ["); 608 609 switch (type) { 610 case EVENT_TYPE_ILL_DEV: 611 printk("ILLEGAL_DEV_TABLE_ENTRY device=%02x:%02x.%x " 612 "address=0x%016llx flags=0x%04x]\n", 613 PCI_BUS_NUM(devid), PCI_SLOT(devid), PCI_FUNC(devid), 614 address, flags); 615 dump_dte_entry(devid); 616 break; 617 case EVENT_TYPE_IO_FAULT: 618 printk("IO_PAGE_FAULT device=%02x:%02x.%x " 619 "domain=0x%04x address=0x%016llx flags=0x%04x]\n", 620 PCI_BUS_NUM(devid), PCI_SLOT(devid), PCI_FUNC(devid), 621 domid, address, flags); 622 break; 623 case EVENT_TYPE_DEV_TAB_ERR: 624 printk("DEV_TAB_HARDWARE_ERROR device=%02x:%02x.%x " 625 "address=0x%016llx flags=0x%04x]\n", 626 PCI_BUS_NUM(devid), PCI_SLOT(devid), PCI_FUNC(devid), 627 address, flags); 628 break; 629 case EVENT_TYPE_PAGE_TAB_ERR: 630 printk("PAGE_TAB_HARDWARE_ERROR device=%02x:%02x.%x " 631 "domain=0x%04x address=0x%016llx flags=0x%04x]\n", 632 PCI_BUS_NUM(devid), PCI_SLOT(devid), PCI_FUNC(devid), 633 domid, address, flags); 634 break; 635 case EVENT_TYPE_ILL_CMD: 636 printk("ILLEGAL_COMMAND_ERROR address=0x%016llx]\n", address); 637 dump_command(address); 638 break; 639 case EVENT_TYPE_CMD_HARD_ERR: 640 printk("COMMAND_HARDWARE_ERROR address=0x%016llx " 641 "flags=0x%04x]\n", address, flags); 642 break; 643 case EVENT_TYPE_IOTLB_INV_TO: 644 printk("IOTLB_INV_TIMEOUT device=%02x:%02x.%x " 645 "address=0x%016llx]\n", 646 PCI_BUS_NUM(devid), PCI_SLOT(devid), PCI_FUNC(devid), 647 address); 648 break; 649 case EVENT_TYPE_INV_DEV_REQ: 650 printk("INVALID_DEVICE_REQUEST device=%02x:%02x.%x " 651 "address=0x%016llx flags=0x%04x]\n", 652 PCI_BUS_NUM(devid), PCI_SLOT(devid), PCI_FUNC(devid), 653 address, flags); 654 break; 655 default: 656 printk(KERN_ERR "UNKNOWN type=0x%02x]\n", type); 657 } 658 659 memset(__evt, 0, 4 * sizeof(u32)); 660} 661 662static void iommu_poll_events(struct amd_iommu *iommu) 663{ 664 u32 head, tail; 665 666 head = readl(iommu->mmio_base + MMIO_EVT_HEAD_OFFSET); 667 tail = readl(iommu->mmio_base + MMIO_EVT_TAIL_OFFSET); 668 669 while (head != tail) { 670 iommu_print_event(iommu, iommu->evt_buf + head); 671 head = (head + EVENT_ENTRY_SIZE) % iommu->evt_buf_size; 672 } 673 674 writel(head, iommu->mmio_base + MMIO_EVT_HEAD_OFFSET); 675} 676 677static void iommu_handle_ppr_entry(struct amd_iommu *iommu, u64 *raw) 678{ 679 struct amd_iommu_fault fault; 680 681 INC_STATS_COUNTER(pri_requests); 682 683 if (PPR_REQ_TYPE(raw[0]) != PPR_REQ_FAULT) { 684 pr_err_ratelimited("AMD-Vi: Unknown PPR request received\n"); 685 return; 686 } 687 688 fault.address = raw[1]; 689 fault.pasid = PPR_PASID(raw[0]); 690 fault.device_id = PPR_DEVID(raw[0]); 691 fault.tag = PPR_TAG(raw[0]); 692 fault.flags = PPR_FLAGS(raw[0]); 693 694 atomic_notifier_call_chain(&ppr_notifier, 0, &fault); 695} 696 697static void iommu_poll_ppr_log(struct amd_iommu *iommu) 698{ 699 u32 head, tail; 700 701 if (iommu->ppr_log == NULL) 702 return; 703 704 head = readl(iommu->mmio_base + MMIO_PPR_HEAD_OFFSET); 705 tail = readl(iommu->mmio_base + MMIO_PPR_TAIL_OFFSET); 706 707 while (head != tail) { 708 volatile u64 *raw; 709 u64 entry[2]; 710 int i; 711 712 raw = (u64 *)(iommu->ppr_log + head); 713 714 /* 715 * Hardware bug: Interrupt may arrive before the entry is 716 * written to memory. If this happens we need to wait for the 717 * entry to arrive. 
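                 * The loop below therefore polls PPR_REQ_TYPE(raw[0]) for
                 * up to LOOP_TIMEOUT microseconds.  The entry is then
                 * copied out and the slot cleared to zero again, so that a
                 * zero type keeps marking not-yet-written entries.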
718 */ 719 for (i = 0; i < LOOP_TIMEOUT; ++i) { 720 if (PPR_REQ_TYPE(raw[0]) != 0) 721 break; 722 udelay(1); 723 } 724 725 /* Avoid memcpy function-call overhead */ 726 entry[0] = raw[0]; 727 entry[1] = raw[1]; 728 729 /* 730 * To detect the hardware bug we need to clear the entry 731 * back to zero. 732 */ 733 raw[0] = raw[1] = 0UL; 734 735 /* Update head pointer of hardware ring-buffer */ 736 head = (head + PPR_ENTRY_SIZE) % PPR_LOG_SIZE; 737 writel(head, iommu->mmio_base + MMIO_PPR_HEAD_OFFSET); 738 739 /* Handle PPR entry */ 740 iommu_handle_ppr_entry(iommu, entry); 741 742 /* Refresh ring-buffer information */ 743 head = readl(iommu->mmio_base + MMIO_PPR_HEAD_OFFSET); 744 tail = readl(iommu->mmio_base + MMIO_PPR_TAIL_OFFSET); 745 } 746} 747 748irqreturn_t amd_iommu_int_thread(int irq, void *data) 749{ 750 struct amd_iommu *iommu = (struct amd_iommu *) data; 751 u32 status = readl(iommu->mmio_base + MMIO_STATUS_OFFSET); 752 753 while (status & (MMIO_STATUS_EVT_INT_MASK | MMIO_STATUS_PPR_INT_MASK)) { 754 /* Enable EVT and PPR interrupts again */ 755 writel((MMIO_STATUS_EVT_INT_MASK | MMIO_STATUS_PPR_INT_MASK), 756 iommu->mmio_base + MMIO_STATUS_OFFSET); 757 758 if (status & MMIO_STATUS_EVT_INT_MASK) { 759 pr_devel("AMD-Vi: Processing IOMMU Event Log\n"); 760 iommu_poll_events(iommu); 761 } 762 763 if (status & MMIO_STATUS_PPR_INT_MASK) { 764 pr_devel("AMD-Vi: Processing IOMMU PPR Log\n"); 765 iommu_poll_ppr_log(iommu); 766 } 767 768 /* 769 * Hardware bug: ERBT1312 770 * When re-enabling interrupt (by writing 1 771 * to clear the bit), the hardware might also try to set 772 * the interrupt bit in the event status register. 773 * In this scenario, the bit will be set, and disable 774 * subsequent interrupts. 775 * 776 * Workaround: The IOMMU driver should read back the 777 * status register and check if the interrupt bits are cleared. 
778 * If not, driver will need to go through the interrupt handler 779 * again and re-clear the bits 780 */ 781 status = readl(iommu->mmio_base + MMIO_STATUS_OFFSET); 782 } 783 return IRQ_HANDLED; 784} 785 786irqreturn_t amd_iommu_int_handler(int irq, void *data) 787{ 788 return IRQ_WAKE_THREAD; 789} 790 791/**************************************************************************** 792 * 793 * IOMMU command queuing functions 794 * 795 ****************************************************************************/ 796 797static int wait_on_sem(volatile u64 *sem) 798{ 799 int i = 0; 800 801 while (*sem == 0 && i < LOOP_TIMEOUT) { 802 udelay(1); 803 i += 1; 804 } 805 806 if (i == LOOP_TIMEOUT) { 807 pr_alert("AMD-Vi: Completion-Wait loop timed out\n"); 808 return -EIO; 809 } 810 811 return 0; 812} 813 814static void copy_cmd_to_buffer(struct amd_iommu *iommu, 815 struct iommu_cmd *cmd, 816 u32 tail) 817{ 818 u8 *target; 819 820 target = iommu->cmd_buf + tail; 821 tail = (tail + sizeof(*cmd)) % iommu->cmd_buf_size; 822 823 /* Copy command to buffer */ 824 memcpy(target, cmd, sizeof(*cmd)); 825 826 /* Tell the IOMMU about it */ 827 writel(tail, iommu->mmio_base + MMIO_CMD_TAIL_OFFSET); 828} 829 830static void build_completion_wait(struct iommu_cmd *cmd, u64 address) 831{ 832 WARN_ON(address & 0x7ULL); 833 834 memset(cmd, 0, sizeof(*cmd)); 835 cmd->data[0] = lower_32_bits(__pa(address)) | CMD_COMPL_WAIT_STORE_MASK; 836 cmd->data[1] = upper_32_bits(__pa(address)); 837 cmd->data[2] = 1; 838 CMD_SET_TYPE(cmd, CMD_COMPL_WAIT); 839} 840 841static void build_inv_dte(struct iommu_cmd *cmd, u16 devid) 842{ 843 memset(cmd, 0, sizeof(*cmd)); 844 cmd->data[0] = devid; 845 CMD_SET_TYPE(cmd, CMD_INV_DEV_ENTRY); 846} 847 848static void build_inv_iommu_pages(struct iommu_cmd *cmd, u64 address, 849 size_t size, u16 domid, int pde) 850{ 851 u64 pages; 852 bool s; 853 854 pages = iommu_num_pages(address, size, PAGE_SIZE); 855 s = false; 856 857 if (pages > 1) { 858 /* 859 * If we have to flush more than one page, flush all 860 * TLB entries for this domain 861 */ 862 address = CMD_INV_IOMMU_ALL_PAGES_ADDRESS; 863 s = true; 864 } 865 866 address &= PAGE_MASK; 867 868 memset(cmd, 0, sizeof(*cmd)); 869 cmd->data[1] |= domid; 870 cmd->data[2] = lower_32_bits(address); 871 cmd->data[3] = upper_32_bits(address); 872 CMD_SET_TYPE(cmd, CMD_INV_IOMMU_PAGES); 873 if (s) /* size bit - we flush more than one 4kb page */ 874 cmd->data[2] |= CMD_INV_IOMMU_PAGES_SIZE_MASK; 875 if (pde) /* PDE bit - we want to flush everything, not only the PTEs */ 876 cmd->data[2] |= CMD_INV_IOMMU_PAGES_PDE_MASK; 877} 878 879static void build_inv_iotlb_pages(struct iommu_cmd *cmd, u16 devid, int qdep, 880 u64 address, size_t size) 881{ 882 u64 pages; 883 bool s; 884 885 pages = iommu_num_pages(address, size, PAGE_SIZE); 886 s = false; 887 888 if (pages > 1) { 889 /* 890 * If we have to flush more than one page, flush all 891 * TLB entries for this domain 892 */ 893 address = CMD_INV_IOMMU_ALL_PAGES_ADDRESS; 894 s = true; 895 } 896 897 address &= PAGE_MASK; 898 899 memset(cmd, 0, sizeof(*cmd)); 900 cmd->data[0] = devid; 901 cmd->data[0] |= (qdep & 0xff) << 24; 902 cmd->data[1] = devid; 903 cmd->data[2] = lower_32_bits(address); 904 cmd->data[3] = upper_32_bits(address); 905 CMD_SET_TYPE(cmd, CMD_INV_IOTLB_PAGES); 906 if (s) 907 cmd->data[2] |= CMD_INV_IOMMU_PAGES_SIZE_MASK; 908} 909 910static void build_inv_iommu_pasid(struct iommu_cmd *cmd, u16 domid, int pasid, 911 u64 address, bool size) 912{ 913 memset(cmd, 0, sizeof(*cmd)); 914 915 address 
&= ~(0xfffULL); 916 917 cmd->data[0] = pasid; 918 cmd->data[1] = domid; 919 cmd->data[2] = lower_32_bits(address); 920 cmd->data[3] = upper_32_bits(address); 921 cmd->data[2] |= CMD_INV_IOMMU_PAGES_PDE_MASK; 922 cmd->data[2] |= CMD_INV_IOMMU_PAGES_GN_MASK; 923 if (size) 924 cmd->data[2] |= CMD_INV_IOMMU_PAGES_SIZE_MASK; 925 CMD_SET_TYPE(cmd, CMD_INV_IOMMU_PAGES); 926} 927 928static void build_inv_iotlb_pasid(struct iommu_cmd *cmd, u16 devid, int pasid, 929 int qdep, u64 address, bool size) 930{ 931 memset(cmd, 0, sizeof(*cmd)); 932 933 address &= ~(0xfffULL); 934 935 cmd->data[0] = devid; 936 cmd->data[0] |= ((pasid >> 8) & 0xff) << 16; 937 cmd->data[0] |= (qdep & 0xff) << 24; 938 cmd->data[1] = devid; 939 cmd->data[1] |= (pasid & 0xff) << 16; 940 cmd->data[2] = lower_32_bits(address); 941 cmd->data[2] |= CMD_INV_IOMMU_PAGES_GN_MASK; 942 cmd->data[3] = upper_32_bits(address); 943 if (size) 944 cmd->data[2] |= CMD_INV_IOMMU_PAGES_SIZE_MASK; 945 CMD_SET_TYPE(cmd, CMD_INV_IOTLB_PAGES); 946} 947 948static void build_complete_ppr(struct iommu_cmd *cmd, u16 devid, int pasid, 949 int status, int tag, bool gn) 950{ 951 memset(cmd, 0, sizeof(*cmd)); 952 953 cmd->data[0] = devid; 954 if (gn) { 955 cmd->data[1] = pasid; 956 cmd->data[2] = CMD_INV_IOMMU_PAGES_GN_MASK; 957 } 958 cmd->data[3] = tag & 0x1ff; 959 cmd->data[3] |= (status & PPR_STATUS_MASK) << PPR_STATUS_SHIFT; 960 961 CMD_SET_TYPE(cmd, CMD_COMPLETE_PPR); 962} 963 964static void build_inv_all(struct iommu_cmd *cmd) 965{ 966 memset(cmd, 0, sizeof(*cmd)); 967 CMD_SET_TYPE(cmd, CMD_INV_ALL); 968} 969 970static void build_inv_irt(struct iommu_cmd *cmd, u16 devid) 971{ 972 memset(cmd, 0, sizeof(*cmd)); 973 cmd->data[0] = devid; 974 CMD_SET_TYPE(cmd, CMD_INV_IRT); 975} 976 977/* 978 * Writes the command to the IOMMUs command buffer and informs the 979 * hardware about the new command. 
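 *
 * The command buffer is a ring managed through the MMIO head and tail
 * registers: the command is copied to cmd_buf + tail and the tail
 * register is advanced modulo cmd_buf_size.  Free space is computed as
 *
 *      next_tail = (tail + sizeof(*cmd)) % iommu->cmd_buf_size;
 *      left      = (head - next_tail)    % iommu->cmd_buf_size;
 *
 * and if almost no space is left a COMPLETION_WAIT is queued first and
 * the caller spins until the IOMMU has drained the buffer.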
980 */ 981static int iommu_queue_command_sync(struct amd_iommu *iommu, 982 struct iommu_cmd *cmd, 983 bool sync) 984{ 985 u32 left, tail, head, next_tail; 986 unsigned long flags; 987 988 WARN_ON(iommu->cmd_buf_size & CMD_BUFFER_UNINITIALIZED); 989 990again: 991 spin_lock_irqsave(&iommu->lock, flags); 992 993 head = readl(iommu->mmio_base + MMIO_CMD_HEAD_OFFSET); 994 tail = readl(iommu->mmio_base + MMIO_CMD_TAIL_OFFSET); 995 next_tail = (tail + sizeof(*cmd)) % iommu->cmd_buf_size; 996 left = (head - next_tail) % iommu->cmd_buf_size; 997 998 if (left <= 2) { 999 struct iommu_cmd sync_cmd; 1000 volatile u64 sem = 0; 1001 int ret; 1002 1003 build_completion_wait(&sync_cmd, (u64)&sem); 1004 copy_cmd_to_buffer(iommu, &sync_cmd, tail); 1005 1006 spin_unlock_irqrestore(&iommu->lock, flags); 1007 1008 if ((ret = wait_on_sem(&sem)) != 0) 1009 return ret; 1010 1011 goto again; 1012 } 1013 1014 copy_cmd_to_buffer(iommu, cmd, tail); 1015 1016 /* We need to sync now to make sure all commands are processed */ 1017 iommu->need_sync = sync; 1018 1019 spin_unlock_irqrestore(&iommu->lock, flags); 1020 1021 return 0; 1022} 1023 1024static int iommu_queue_command(struct amd_iommu *iommu, struct iommu_cmd *cmd) 1025{ 1026 return iommu_queue_command_sync(iommu, cmd, true); 1027} 1028 1029/* 1030 * This function queues a completion wait command into the command 1031 * buffer of an IOMMU 1032 */ 1033static int iommu_completion_wait(struct amd_iommu *iommu) 1034{ 1035 struct iommu_cmd cmd; 1036 volatile u64 sem = 0; 1037 int ret; 1038 1039 if (!iommu->need_sync) 1040 return 0; 1041 1042 build_completion_wait(&cmd, (u64)&sem); 1043 1044 ret = iommu_queue_command_sync(iommu, &cmd, false); 1045 if (ret) 1046 return ret; 1047 1048 return wait_on_sem(&sem); 1049} 1050 1051static int iommu_flush_dte(struct amd_iommu *iommu, u16 devid) 1052{ 1053 struct iommu_cmd cmd; 1054 1055 build_inv_dte(&cmd, devid); 1056 1057 return iommu_queue_command(iommu, &cmd); 1058} 1059 1060static void iommu_flush_dte_all(struct amd_iommu *iommu) 1061{ 1062 u32 devid; 1063 1064 for (devid = 0; devid <= 0xffff; ++devid) 1065 iommu_flush_dte(iommu, devid); 1066 1067 iommu_completion_wait(iommu); 1068} 1069 1070/* 1071 * This function uses heavy locking and may disable irqs for some time. But 1072 * this is no issue because it is only called during resume. 
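 *
 * Note that an INVALIDATE_IOMMU_PAGES command is queued for every
 * possible domain id (0 - 0xffff): after a resume the driver cannot
 * know which ids were in use, so it flushes them all and issues a
 * single completion-wait at the end.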
1073 */ 1074static void iommu_flush_tlb_all(struct amd_iommu *iommu) 1075{ 1076 u32 dom_id; 1077 1078 for (dom_id = 0; dom_id <= 0xffff; ++dom_id) { 1079 struct iommu_cmd cmd; 1080 build_inv_iommu_pages(&cmd, 0, CMD_INV_IOMMU_ALL_PAGES_ADDRESS, 1081 dom_id, 1); 1082 iommu_queue_command(iommu, &cmd); 1083 } 1084 1085 iommu_completion_wait(iommu); 1086} 1087 1088static void iommu_flush_all(struct amd_iommu *iommu) 1089{ 1090 struct iommu_cmd cmd; 1091 1092 build_inv_all(&cmd); 1093 1094 iommu_queue_command(iommu, &cmd); 1095 iommu_completion_wait(iommu); 1096} 1097 1098static void iommu_flush_irt(struct amd_iommu *iommu, u16 devid) 1099{ 1100 struct iommu_cmd cmd; 1101 1102 build_inv_irt(&cmd, devid); 1103 1104 iommu_queue_command(iommu, &cmd); 1105} 1106 1107static void iommu_flush_irt_all(struct amd_iommu *iommu) 1108{ 1109 u32 devid; 1110 1111 for (devid = 0; devid <= MAX_DEV_TABLE_ENTRIES; devid++) 1112 iommu_flush_irt(iommu, devid); 1113 1114 iommu_completion_wait(iommu); 1115} 1116 1117void iommu_flush_all_caches(struct amd_iommu *iommu) 1118{ 1119 if (iommu_feature(iommu, FEATURE_IA)) { 1120 iommu_flush_all(iommu); 1121 } else { 1122 iommu_flush_dte_all(iommu); 1123 iommu_flush_irt_all(iommu); 1124 iommu_flush_tlb_all(iommu); 1125 } 1126} 1127 1128/* 1129 * Command send function for flushing on-device TLB 1130 */ 1131static int device_flush_iotlb(struct iommu_dev_data *dev_data, 1132 u64 address, size_t size) 1133{ 1134 struct amd_iommu *iommu; 1135 struct iommu_cmd cmd; 1136 int qdep; 1137 1138 qdep = dev_data->ats.qdep; 1139 iommu = amd_iommu_rlookup_table[dev_data->devid]; 1140 1141 build_inv_iotlb_pages(&cmd, dev_data->devid, qdep, address, size); 1142 1143 return iommu_queue_command(iommu, &cmd); 1144} 1145 1146/* 1147 * Command send function for invalidating a device table entry 1148 */ 1149static int device_flush_dte(struct iommu_dev_data *dev_data) 1150{ 1151 struct amd_iommu *iommu; 1152 int ret; 1153 1154 iommu = amd_iommu_rlookup_table[dev_data->devid]; 1155 1156 ret = iommu_flush_dte(iommu, dev_data->devid); 1157 if (ret) 1158 return ret; 1159 1160 if (dev_data->ats.enabled) 1161 ret = device_flush_iotlb(dev_data, 0, ~0UL); 1162 1163 return ret; 1164} 1165 1166/* 1167 * TLB invalidation function which is called from the mapping functions. 1168 * It invalidates a single PTE if the range to flush is within a single 1169 * page. Otherwise it flushes the whole TLB of the IOMMU. 
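 *
 * The resulting INVALIDATE_IOMMU_PAGES command is queued on every IOMMU
 * that has devices of this domain behind it (tracked by the dev_iommu[]
 * reference counts), and ATS-enabled devices additionally get their
 * on-device IOTLB flushed.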
 */
static void __domain_flush_pages(struct protection_domain *domain,
                                 u64 address, size_t size, int pde)
{
        struct iommu_dev_data *dev_data;
        struct iommu_cmd cmd;
        int ret = 0, i;

        build_inv_iommu_pages(&cmd, address, size, domain->id, pde);

        for (i = 0; i < amd_iommus_present; ++i) {
                if (!domain->dev_iommu[i])
                        continue;

                /*
                 * Devices of this domain are behind this IOMMU
                 * We need a TLB flush
                 */
                ret |= iommu_queue_command(amd_iommus[i], &cmd);
        }

        list_for_each_entry(dev_data, &domain->dev_list, list) {

                if (!dev_data->ats.enabled)
                        continue;

                ret |= device_flush_iotlb(dev_data, address, size);
        }

        WARN_ON(ret);
}

static void domain_flush_pages(struct protection_domain *domain,
                               u64 address, size_t size)
{
        __domain_flush_pages(domain, address, size, 0);
}

/* Flush the whole IO/TLB for a given protection domain */
static void domain_flush_tlb(struct protection_domain *domain)
{
        __domain_flush_pages(domain, 0, CMD_INV_IOMMU_ALL_PAGES_ADDRESS, 0);
}

/* Flush the whole IO/TLB for a given protection domain - including PDE */
static void domain_flush_tlb_pde(struct protection_domain *domain)
{
        __domain_flush_pages(domain, 0, CMD_INV_IOMMU_ALL_PAGES_ADDRESS, 1);
}

static void domain_flush_complete(struct protection_domain *domain)
{
        int i;

        for (i = 0; i < amd_iommus_present; ++i) {
                if (!domain->dev_iommu[i])
                        continue;

                /*
                 * Devices of this domain are behind this IOMMU
                 * We need to wait for completion of all commands.
                 */
                iommu_completion_wait(amd_iommus[i]);
        }
}


/*
 * This function flushes the DTEs for all devices in domain
 */
static void domain_flush_devices(struct protection_domain *domain)
{
        struct iommu_dev_data *dev_data;

        list_for_each_entry(dev_data, &domain->dev_list, list)
                device_flush_dte(dev_data);
}

/****************************************************************************
 *
 * The functions below are used to create the page table mappings for
 * unity mapped regions.
 *
 ****************************************************************************/

/*
 * This function is used to add another level to an IO page table. Adding
 * another level increases the size of the address space by 9 bits to a size up
 * to 64 bits.
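 *
 * Each level decodes 9 address bits (512 entries per table) on top of
 * the 12-bit page offset, so as a rough sketch:
 *
 *      1 level  ->  9 + 12 = 21 bits of IOVA space (2MB)
 *      3 levels -> 27 + 12 = 39 bits               (512GB)
 *      6 levels -> 54 + 12 = 66 bits, i.e. the full 64-bit space,
 *
 * which is why the function below refuses to grow a PAGE_MODE_6_LEVEL
 * page table any further.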
1259 */ 1260static bool increase_address_space(struct protection_domain *domain, 1261 gfp_t gfp) 1262{ 1263 u64 *pte; 1264 1265 if (domain->mode == PAGE_MODE_6_LEVEL) 1266 /* address space already 64 bit large */ 1267 return false; 1268 1269 pte = (void *)get_zeroed_page(gfp); 1270 if (!pte) 1271 return false; 1272 1273 *pte = PM_LEVEL_PDE(domain->mode, 1274 virt_to_phys(domain->pt_root)); 1275 domain->pt_root = pte; 1276 domain->mode += 1; 1277 domain->updated = true; 1278 1279 return true; 1280} 1281 1282static u64 *alloc_pte(struct protection_domain *domain, 1283 unsigned long address, 1284 unsigned long page_size, 1285 u64 **pte_page, 1286 gfp_t gfp) 1287{ 1288 int level, end_lvl; 1289 u64 *pte, *page; 1290 1291 BUG_ON(!is_power_of_2(page_size)); 1292 1293 while (address > PM_LEVEL_SIZE(domain->mode)) 1294 increase_address_space(domain, gfp); 1295 1296 level = domain->mode - 1; 1297 pte = &domain->pt_root[PM_LEVEL_INDEX(level, address)]; 1298 address = PAGE_SIZE_ALIGN(address, page_size); 1299 end_lvl = PAGE_SIZE_LEVEL(page_size); 1300 1301 while (level > end_lvl) { 1302 if (!IOMMU_PTE_PRESENT(*pte)) { 1303 page = (u64 *)get_zeroed_page(gfp); 1304 if (!page) 1305 return NULL; 1306 *pte = PM_LEVEL_PDE(level, virt_to_phys(page)); 1307 } 1308 1309 /* No level skipping support yet */ 1310 if (PM_PTE_LEVEL(*pte) != level) 1311 return NULL; 1312 1313 level -= 1; 1314 1315 pte = IOMMU_PTE_PAGE(*pte); 1316 1317 if (pte_page && level == end_lvl) 1318 *pte_page = pte; 1319 1320 pte = &pte[PM_LEVEL_INDEX(level, address)]; 1321 } 1322 1323 return pte; 1324} 1325 1326/* 1327 * This function checks if there is a PTE for a given dma address. If 1328 * there is one, it returns the pointer to it. 1329 */ 1330static u64 *fetch_pte(struct protection_domain *domain, 1331 unsigned long address, 1332 unsigned long *page_size) 1333{ 1334 int level; 1335 u64 *pte; 1336 1337 if (address > PM_LEVEL_SIZE(domain->mode)) 1338 return NULL; 1339 1340 level = domain->mode - 1; 1341 pte = &domain->pt_root[PM_LEVEL_INDEX(level, address)]; 1342 *page_size = PTE_LEVEL_PAGE_SIZE(level); 1343 1344 while (level > 0) { 1345 1346 /* Not Present */ 1347 if (!IOMMU_PTE_PRESENT(*pte)) 1348 return NULL; 1349 1350 /* Large PTE */ 1351 if (PM_PTE_LEVEL(*pte) == 7 || 1352 PM_PTE_LEVEL(*pte) == 0) 1353 break; 1354 1355 /* No level skipping support yet */ 1356 if (PM_PTE_LEVEL(*pte) != level) 1357 return NULL; 1358 1359 level -= 1; 1360 1361 /* Walk to the next level */ 1362 pte = IOMMU_PTE_PAGE(*pte); 1363 pte = &pte[PM_LEVEL_INDEX(level, address)]; 1364 *page_size = PTE_LEVEL_PAGE_SIZE(level); 1365 } 1366 1367 if (PM_PTE_LEVEL(*pte) == 0x07) { 1368 unsigned long pte_mask; 1369 1370 /* 1371 * If we have a series of large PTEs, make 1372 * sure to return a pointer to the first one. 1373 */ 1374 *page_size = pte_mask = PTE_PAGE_SIZE(*pte); 1375 pte_mask = ~((PAGE_SIZE_PTE_COUNT(pte_mask) << 3) - 1); 1376 pte = (u64 *)(((unsigned long)pte) & pte_mask); 1377 } 1378 1379 return pte; 1380} 1381 1382/* 1383 * Generic mapping functions. It maps a physical address into a DMA 1384 * address space. It allocates the page table pages if necessary. 1385 * In the future it can be extended to a generic mapping function 1386 * supporting all features of AMD IOMMU page tables like level skipping 1387 * and full 64 bit address spaces. 
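 *
 * Large mappings are currently expressed as a run of identical PTEs:
 * PAGE_SIZE_PTE_COUNT() gives the number of entries the mapping is
 * replicated over, each entry carries the PM_LEVEL_ENC(7) "large page"
 * marker, and fetch_pte() recognises such a run by that level-7 value
 * and returns a pointer to its first entry.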
1388 */ 1389static int iommu_map_page(struct protection_domain *dom, 1390 unsigned long bus_addr, 1391 unsigned long phys_addr, 1392 int prot, 1393 unsigned long page_size) 1394{ 1395 u64 __pte, *pte; 1396 int i, count; 1397 1398 BUG_ON(!IS_ALIGNED(bus_addr, page_size)); 1399 BUG_ON(!IS_ALIGNED(phys_addr, page_size)); 1400 1401 if (!(prot & IOMMU_PROT_MASK)) 1402 return -EINVAL; 1403 1404 count = PAGE_SIZE_PTE_COUNT(page_size); 1405 pte = alloc_pte(dom, bus_addr, page_size, NULL, GFP_KERNEL); 1406 1407 if (!pte) 1408 return -ENOMEM; 1409 1410 for (i = 0; i < count; ++i) 1411 if (IOMMU_PTE_PRESENT(pte[i])) 1412 return -EBUSY; 1413 1414 if (count > 1) { 1415 __pte = PAGE_SIZE_PTE(phys_addr, page_size); 1416 __pte |= PM_LEVEL_ENC(7) | IOMMU_PTE_P | IOMMU_PTE_FC; 1417 } else 1418 __pte = phys_addr | IOMMU_PTE_P | IOMMU_PTE_FC; 1419 1420 if (prot & IOMMU_PROT_IR) 1421 __pte |= IOMMU_PTE_IR; 1422 if (prot & IOMMU_PROT_IW) 1423 __pte |= IOMMU_PTE_IW; 1424 1425 for (i = 0; i < count; ++i) 1426 pte[i] = __pte; 1427 1428 update_domain(dom); 1429 1430 return 0; 1431} 1432 1433static unsigned long iommu_unmap_page(struct protection_domain *dom, 1434 unsigned long bus_addr, 1435 unsigned long page_size) 1436{ 1437 unsigned long long unmapped; 1438 unsigned long unmap_size; 1439 u64 *pte; 1440 1441 BUG_ON(!is_power_of_2(page_size)); 1442 1443 unmapped = 0; 1444 1445 while (unmapped < page_size) { 1446 1447 pte = fetch_pte(dom, bus_addr, &unmap_size); 1448 1449 if (pte) { 1450 int i, count; 1451 1452 count = PAGE_SIZE_PTE_COUNT(unmap_size); 1453 for (i = 0; i < count; i++) 1454 pte[i] = 0ULL; 1455 } 1456 1457 bus_addr = (bus_addr & ~(unmap_size - 1)) + unmap_size; 1458 unmapped += unmap_size; 1459 } 1460 1461 BUG_ON(unmapped && !is_power_of_2(unmapped)); 1462 1463 return unmapped; 1464} 1465 1466/* 1467 * This function checks if a specific unity mapping entry is needed for 1468 * this specific IOMMU. 1469 */ 1470static int iommu_for_unity_map(struct amd_iommu *iommu, 1471 struct unity_map_entry *entry) 1472{ 1473 u16 bdf, i; 1474 1475 for (i = entry->devid_start; i <= entry->devid_end; ++i) { 1476 bdf = amd_iommu_alias_table[i]; 1477 if (amd_iommu_rlookup_table[bdf] == iommu) 1478 return 1; 1479 } 1480 1481 return 0; 1482} 1483 1484/* 1485 * This function actually applies the mapping to the page table of the 1486 * dma_ops domain. 1487 */ 1488static int dma_ops_unity_map(struct dma_ops_domain *dma_dom, 1489 struct unity_map_entry *e) 1490{ 1491 u64 addr; 1492 int ret; 1493 1494 for (addr = e->address_start; addr < e->address_end; 1495 addr += PAGE_SIZE) { 1496 ret = iommu_map_page(&dma_dom->domain, addr, addr, e->prot, 1497 PAGE_SIZE); 1498 if (ret) 1499 return ret; 1500 /* 1501 * if unity mapping is in aperture range mark the page 1502 * as allocated in the aperture 1503 */ 1504 if (addr < dma_dom->aperture_size) 1505 __set_bit(addr >> PAGE_SHIFT, 1506 dma_dom->aperture[0]->bitmap); 1507 } 1508 1509 return 0; 1510} 1511 1512/* 1513 * Init the unity mappings for a specific IOMMU in the system 1514 * 1515 * Basically iterates over all unity mapping entries and applies them to 1516 * the default domain DMA of that IOMMU if necessary. 
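 *
 * "If necessary" means the entry's device range contains at least one
 * device that actually sits behind this IOMMU (see iommu_for_unity_map()
 * above); only then is the range identity-mapped, page by page, into the
 * IOMMU's default dma_ops domain.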
 */
static int iommu_init_unity_mappings(struct amd_iommu *iommu)
{
        struct unity_map_entry *entry;
        int ret;

        list_for_each_entry(entry, &amd_iommu_unity_map, list) {
                if (!iommu_for_unity_map(iommu, entry))
                        continue;
                ret = dma_ops_unity_map(iommu->default_dom, entry);
                if (ret)
                        return ret;
        }

        return 0;
}

/*
 * Inits the unity mappings required for a specific device
 */
static int init_unity_mappings_for_device(struct dma_ops_domain *dma_dom,
                                          u16 devid)
{
        struct unity_map_entry *e;
        int ret;

        list_for_each_entry(e, &amd_iommu_unity_map, list) {
                if (!(devid >= e->devid_start && devid <= e->devid_end))
                        continue;
                ret = dma_ops_unity_map(dma_dom, e);
                if (ret)
                        return ret;
        }

        return 0;
}

/****************************************************************************
 *
 * The next functions belong to the address allocator for the dma_ops
 * interface functions. They work like the allocators in the other IOMMU
 * drivers. It's basically a bitmap which marks the allocated pages in
 * the aperture. Maybe it could be enhanced in the future to a more
 * efficient allocator.
 *
 ****************************************************************************/

/*
 * The address allocator core functions.
 *
 * called with domain->lock held
 */

/*
 * Used to reserve address ranges in the aperture (e.g. for exclusion
 * ranges).
 */
static void dma_ops_reserve_addresses(struct dma_ops_domain *dom,
                                      unsigned long start_page,
                                      unsigned int pages)
{
        unsigned int i, last_page = dom->aperture_size >> PAGE_SHIFT;

        if (start_page + pages > last_page)
                pages = last_page - start_page;

        for (i = start_page; i < start_page + pages; ++i) {
                int index = i / APERTURE_RANGE_PAGES;
                int page  = i % APERTURE_RANGE_PAGES;
                __set_bit(page, dom->aperture[index]->bitmap);
        }
}

/*
 * This function is used to add a new aperture range to an existing
 * aperture in case of dma_ops domain allocation or address allocation
 * failure.
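 *
 * Each new range adds APERTURE_RANGE_SIZE bytes of IOVA space backed by
 * a one-page allocation bitmap.  Addresses that must never be handed out
 * for DMA - the MSI address window and any IOMMU exclusion range falling
 * into the new aperture - are marked reserved in that bitmap right away,
 * as are pages that are already mapped (e.g. by unity mappings).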
1594 */ 1595static int alloc_new_range(struct dma_ops_domain *dma_dom, 1596 bool populate, gfp_t gfp) 1597{ 1598 int index = dma_dom->aperture_size >> APERTURE_RANGE_SHIFT; 1599 struct amd_iommu *iommu; 1600 unsigned long i, old_size, pte_pgsize; 1601 1602#ifdef CONFIG_IOMMU_STRESS 1603 populate = false; 1604#endif 1605 1606 if (index >= APERTURE_MAX_RANGES) 1607 return -ENOMEM; 1608 1609 dma_dom->aperture[index] = kzalloc(sizeof(struct aperture_range), gfp); 1610 if (!dma_dom->aperture[index]) 1611 return -ENOMEM; 1612 1613 dma_dom->aperture[index]->bitmap = (void *)get_zeroed_page(gfp); 1614 if (!dma_dom->aperture[index]->bitmap) 1615 goto out_free; 1616 1617 dma_dom->aperture[index]->offset = dma_dom->aperture_size; 1618 1619 if (populate) { 1620 unsigned long address = dma_dom->aperture_size; 1621 int i, num_ptes = APERTURE_RANGE_PAGES / 512; 1622 u64 *pte, *pte_page; 1623 1624 for (i = 0; i < num_ptes; ++i) { 1625 pte = alloc_pte(&dma_dom->domain, address, PAGE_SIZE, 1626 &pte_page, gfp); 1627 if (!pte) 1628 goto out_free; 1629 1630 dma_dom->aperture[index]->pte_pages[i] = pte_page; 1631 1632 address += APERTURE_RANGE_SIZE / 64; 1633 } 1634 } 1635 1636 old_size = dma_dom->aperture_size; 1637 dma_dom->aperture_size += APERTURE_RANGE_SIZE; 1638 1639 /* Reserve address range used for MSI messages */ 1640 if (old_size < MSI_ADDR_BASE_LO && 1641 dma_dom->aperture_size > MSI_ADDR_BASE_LO) { 1642 unsigned long spage; 1643 int pages; 1644 1645 pages = iommu_num_pages(MSI_ADDR_BASE_LO, 0x10000, PAGE_SIZE); 1646 spage = MSI_ADDR_BASE_LO >> PAGE_SHIFT; 1647 1648 dma_ops_reserve_addresses(dma_dom, spage, pages); 1649 } 1650 1651 /* Initialize the exclusion range if necessary */ 1652 for_each_iommu(iommu) { 1653 if (iommu->exclusion_start && 1654 iommu->exclusion_start >= dma_dom->aperture[index]->offset 1655 && iommu->exclusion_start < dma_dom->aperture_size) { 1656 unsigned long startpage; 1657 int pages = iommu_num_pages(iommu->exclusion_start, 1658 iommu->exclusion_length, 1659 PAGE_SIZE); 1660 startpage = iommu->exclusion_start >> PAGE_SHIFT; 1661 dma_ops_reserve_addresses(dma_dom, startpage, pages); 1662 } 1663 } 1664 1665 /* 1666 * Check for areas already mapped as present in the new aperture 1667 * range and mark those pages as reserved in the allocator. Such 1668 * mappings may already exist as a result of requested unity 1669 * mappings for devices. 
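         *
         * The loop below walks the new range with fetch_pte(), stepping by
         * the page size of each PTE it finds, and reserves the matching
         * bits in the aperture bitmap so the address allocator skips them.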
1670 */ 1671 for (i = dma_dom->aperture[index]->offset; 1672 i < dma_dom->aperture_size; 1673 i += pte_pgsize) { 1674 u64 *pte = fetch_pte(&dma_dom->domain, i, &pte_pgsize); 1675 if (!pte || !IOMMU_PTE_PRESENT(*pte)) 1676 continue; 1677 1678 dma_ops_reserve_addresses(dma_dom, i >> PAGE_SHIFT, 1679 pte_pgsize >> 12); 1680 } 1681 1682 update_domain(&dma_dom->domain); 1683 1684 return 0; 1685 1686out_free: 1687 update_domain(&dma_dom->domain); 1688 1689 free_page((unsigned long)dma_dom->aperture[index]->bitmap); 1690 1691 kfree(dma_dom->aperture[index]); 1692 dma_dom->aperture[index] = NULL; 1693 1694 return -ENOMEM; 1695} 1696 1697static unsigned long dma_ops_area_alloc(struct device *dev, 1698 struct dma_ops_domain *dom, 1699 unsigned int pages, 1700 unsigned long align_mask, 1701 u64 dma_mask, 1702 unsigned long start) 1703{ 1704 unsigned long next_bit = dom->next_address % APERTURE_RANGE_SIZE; 1705 int max_index = dom->aperture_size >> APERTURE_RANGE_SHIFT; 1706 int i = start >> APERTURE_RANGE_SHIFT; 1707 unsigned long boundary_size; 1708 unsigned long address = -1; 1709 unsigned long limit; 1710 1711 next_bit >>= PAGE_SHIFT; 1712 1713 boundary_size = ALIGN(dma_get_seg_boundary(dev) + 1, 1714 PAGE_SIZE) >> PAGE_SHIFT; 1715 1716 for (;i < max_index; ++i) { 1717 unsigned long offset = dom->aperture[i]->offset >> PAGE_SHIFT; 1718 1719 if (dom->aperture[i]->offset >= dma_mask) 1720 break; 1721 1722 limit = iommu_device_max_index(APERTURE_RANGE_PAGES, offset, 1723 dma_mask >> PAGE_SHIFT); 1724 1725 address = iommu_area_alloc(dom->aperture[i]->bitmap, 1726 limit, next_bit, pages, 0, 1727 boundary_size, align_mask); 1728 if (address != -1) { 1729 address = dom->aperture[i]->offset + 1730 (address << PAGE_SHIFT); 1731 dom->next_address = address + (pages << PAGE_SHIFT); 1732 break; 1733 } 1734 1735 next_bit = 0; 1736 } 1737 1738 return address; 1739} 1740 1741static unsigned long dma_ops_alloc_addresses(struct device *dev, 1742 struct dma_ops_domain *dom, 1743 unsigned int pages, 1744 unsigned long align_mask, 1745 u64 dma_mask) 1746{ 1747 unsigned long address; 1748 1749#ifdef CONFIG_IOMMU_STRESS 1750 dom->next_address = 0; 1751 dom->need_flush = true; 1752#endif 1753 1754 address = dma_ops_area_alloc(dev, dom, pages, align_mask, 1755 dma_mask, dom->next_address); 1756 1757 if (address == -1) { 1758 dom->next_address = 0; 1759 address = dma_ops_area_alloc(dev, dom, pages, align_mask, 1760 dma_mask, 0); 1761 dom->need_flush = true; 1762 } 1763 1764 if (unlikely(address == -1)) 1765 address = DMA_ERROR_CODE; 1766 1767 WARN_ON((address + (PAGE_SIZE*pages)) > dom->aperture_size); 1768 1769 return address; 1770} 1771 1772/* 1773 * The address free function. 1774 * 1775 * called with domain->lock held 1776 */ 1777static void dma_ops_free_addresses(struct dma_ops_domain *dom, 1778 unsigned long address, 1779 unsigned int pages) 1780{ 1781 unsigned i = address >> APERTURE_RANGE_SHIFT; 1782 struct aperture_range *range = dom->aperture[i]; 1783 1784 BUG_ON(i >= APERTURE_MAX_RANGES || range == NULL); 1785 1786#ifdef CONFIG_IOMMU_STRESS 1787 if (i < 4) 1788 return; 1789#endif 1790 1791 if (address >= dom->next_address) 1792 dom->need_flush = true; 1793 1794 address = (address % APERTURE_RANGE_SIZE) >> PAGE_SHIFT; 1795 1796 bitmap_clear(range->bitmap, address, pages); 1797 1798} 1799 1800/**************************************************************************** 1801 * 1802 * The next functions belong to the domain allocation. A domain is 1803 * allocated for every IOMMU as the default domain. 
If device isolation
 * is enabled, every device gets its own domain. The most important thing
 * about domains is the page table mapping the DMA address space they
 * contain.
 *
 ****************************************************************************/

/*
 * This function adds a protection domain to the global protection domain list
 */
static void add_domain_to_list(struct protection_domain *domain)
{
        unsigned long flags;

        spin_lock_irqsave(&amd_iommu_pd_lock, flags);
        list_add(&domain->list, &amd_iommu_pd_list);
        spin_unlock_irqrestore(&amd_iommu_pd_lock, flags);
}

/*
 * This function removes a protection domain from the global
 * protection domain list
 */
static void del_domain_from_list(struct protection_domain *domain)
{
        unsigned long flags;

        spin_lock_irqsave(&amd_iommu_pd_lock, flags);
        list_del(&domain->list);
        spin_unlock_irqrestore(&amd_iommu_pd_lock, flags);
}

static u16 domain_id_alloc(void)
{
        unsigned long flags;
        int id;

        write_lock_irqsave(&amd_iommu_devtable_lock, flags);
        id = find_first_zero_bit(amd_iommu_pd_alloc_bitmap, MAX_DOMAIN_ID);
        BUG_ON(id == 0);
        if (id > 0 && id < MAX_DOMAIN_ID)
                __set_bit(id, amd_iommu_pd_alloc_bitmap);
        else
                id = 0;
        write_unlock_irqrestore(&amd_iommu_devtable_lock, flags);

        return id;
}

static void domain_id_free(int id)
{
        unsigned long flags;

        write_lock_irqsave(&amd_iommu_devtable_lock, flags);
        if (id > 0 && id < MAX_DOMAIN_ID)
                __clear_bit(id, amd_iommu_pd_alloc_bitmap);
        write_unlock_irqrestore(&amd_iommu_devtable_lock, flags);
}

#define DEFINE_FREE_PT_FN(LVL, FN)                              \
static void free_pt_##LVL (unsigned long __pt)                  \
{                                                               \
        unsigned long p;                                        \
        u64 *pt;                                                \
        int i;                                                  \
                                                                \
        pt = (u64 *)__pt;                                       \
                                                                \
        for (i = 0; i < 512; ++i) {                             \
                /* PTE present? */                              \
                if (!IOMMU_PTE_PRESENT(pt[i]))                  \
                        continue;                               \
                                                                \
                /* Large PTE?
*/ \ 1877 if (PM_PTE_LEVEL(pt[i]) == 0 || \ 1878 PM_PTE_LEVEL(pt[i]) == 7) \ 1879 continue; \ 1880 \ 1881 p = (unsigned long)IOMMU_PTE_PAGE(pt[i]); \ 1882 FN(p); \ 1883 } \ 1884 free_page((unsigned long)pt); \ 1885} 1886 1887DEFINE_FREE_PT_FN(l2, free_page) 1888DEFINE_FREE_PT_FN(l3, free_pt_l2) 1889DEFINE_FREE_PT_FN(l4, free_pt_l3) 1890DEFINE_FREE_PT_FN(l5, free_pt_l4) 1891DEFINE_FREE_PT_FN(l6, free_pt_l5) 1892 1893static void free_pagetable(struct protection_domain *domain) 1894{ 1895 unsigned long root = (unsigned long)domain->pt_root; 1896 1897 switch (domain->mode) { 1898 case PAGE_MODE_NONE: 1899 break; 1900 case PAGE_MODE_1_LEVEL: 1901 free_page(root); 1902 break; 1903 case PAGE_MODE_2_LEVEL: 1904 free_pt_l2(root); 1905 break; 1906 case PAGE_MODE_3_LEVEL: 1907 free_pt_l3(root); 1908 break; 1909 case PAGE_MODE_4_LEVEL: 1910 free_pt_l4(root); 1911 break; 1912 case PAGE_MODE_5_LEVEL: 1913 free_pt_l5(root); 1914 break; 1915 case PAGE_MODE_6_LEVEL: 1916 free_pt_l6(root); 1917 break; 1918 default: 1919 BUG(); 1920 } 1921} 1922 1923static void free_gcr3_tbl_level1(u64 *tbl) 1924{ 1925 u64 *ptr; 1926 int i; 1927 1928 for (i = 0; i < 512; ++i) { 1929 if (!(tbl[i] & GCR3_VALID)) 1930 continue; 1931 1932 ptr = __va(tbl[i] & PAGE_MASK); 1933 1934 free_page((unsigned long)ptr); 1935 } 1936} 1937 1938static void free_gcr3_tbl_level2(u64 *tbl) 1939{ 1940 u64 *ptr; 1941 int i; 1942 1943 for (i = 0; i < 512; ++i) { 1944 if (!(tbl[i] & GCR3_VALID)) 1945 continue; 1946 1947 ptr = __va(tbl[i] & PAGE_MASK); 1948 1949 free_gcr3_tbl_level1(ptr); 1950 } 1951} 1952 1953static void free_gcr3_table(struct protection_domain *domain) 1954{ 1955 if (domain->glx == 2) 1956 free_gcr3_tbl_level2(domain->gcr3_tbl); 1957 else if (domain->glx == 1) 1958 free_gcr3_tbl_level1(domain->gcr3_tbl); 1959 else if (domain->glx != 0) 1960 BUG(); 1961 1962 free_page((unsigned long)domain->gcr3_tbl); 1963} 1964 1965/* 1966 * Free a domain, only used if something went wrong in the 1967 * allocation path and we need to free an already allocated page table 1968 */ 1969static void dma_ops_domain_free(struct dma_ops_domain *dom) 1970{ 1971 int i; 1972 1973 if (!dom) 1974 return; 1975 1976 del_domain_from_list(&dom->domain); 1977 1978 free_pagetable(&dom->domain); 1979 1980 for (i = 0; i < APERTURE_MAX_RANGES; ++i) { 1981 if (!dom->aperture[i]) 1982 continue; 1983 free_page((unsigned long)dom->aperture[i]->bitmap); 1984 kfree(dom->aperture[i]); 1985 } 1986 1987 kfree(dom); 1988} 1989 1990/* 1991 * Allocates a new protection domain usable for the dma_ops functions. 
1992 * It also initializes the page table and the address allocator data 1993 * structures required for the dma_ops interface 1994 */ 1995static struct dma_ops_domain *dma_ops_domain_alloc(void) 1996{ 1997 struct dma_ops_domain *dma_dom; 1998 1999 dma_dom = kzalloc(sizeof(struct dma_ops_domain), GFP_KERNEL); 2000 if (!dma_dom) 2001 return NULL; 2002 2003 spin_lock_init(&dma_dom->domain.lock); 2004 2005 dma_dom->domain.id = domain_id_alloc(); 2006 if (dma_dom->domain.id == 0) 2007 goto free_dma_dom; 2008 INIT_LIST_HEAD(&dma_dom->domain.dev_list); 2009 dma_dom->domain.mode = PAGE_MODE_2_LEVEL; 2010 dma_dom->domain.pt_root = (void *)get_zeroed_page(GFP_KERNEL); 2011 dma_dom->domain.flags = PD_DMA_OPS_MASK; 2012 dma_dom->domain.priv = dma_dom; 2013 if (!dma_dom->domain.pt_root) 2014 goto free_dma_dom; 2015 2016 dma_dom->need_flush = false; 2017 dma_dom->target_dev = 0xffff; 2018 2019 add_domain_to_list(&dma_dom->domain); 2020 2021 if (alloc_new_range(dma_dom, true, GFP_KERNEL)) 2022 goto free_dma_dom; 2023 2024 /* 2025 * mark the first page as allocated so we never return 0 as 2026 * a valid dma-address. So we can use 0 as error value 2027 */ 2028 dma_dom->aperture[0]->bitmap[0] = 1; 2029 dma_dom->next_address = 0; 2030 2031 2032 return dma_dom; 2033 2034free_dma_dom: 2035 dma_ops_domain_free(dma_dom); 2036 2037 return NULL; 2038} 2039 2040/* 2041 * little helper function to check whether a given protection domain is a 2042 * dma_ops domain 2043 */ 2044static bool dma_ops_domain(struct protection_domain *domain) 2045{ 2046 return domain->flags & PD_DMA_OPS_MASK; 2047} 2048 2049static void set_dte_entry(u16 devid, struct protection_domain *domain, bool ats) 2050{ 2051 u64 pte_root = 0; 2052 u64 flags = 0; 2053 2054 if (domain->mode != PAGE_MODE_NONE) 2055 pte_root = virt_to_phys(domain->pt_root); 2056 2057 pte_root |= (domain->mode & DEV_ENTRY_MODE_MASK) 2058 << DEV_ENTRY_MODE_SHIFT; 2059 pte_root |= IOMMU_PTE_IR | IOMMU_PTE_IW | IOMMU_PTE_P | IOMMU_PTE_TV; 2060 2061 flags = amd_iommu_dev_table[devid].data[1]; 2062 2063 if (ats) 2064 flags |= DTE_FLAG_IOTLB; 2065 2066 if (domain->flags & PD_IOMMUV2_MASK) { 2067 u64 gcr3 = __pa(domain->gcr3_tbl); 2068 u64 glx = domain->glx; 2069 u64 tmp; 2070 2071 pte_root |= DTE_FLAG_GV; 2072 pte_root |= (glx & DTE_GLX_MASK) << DTE_GLX_SHIFT; 2073 2074 /* First mask out possible old values for GCR3 table */ 2075 tmp = DTE_GCR3_VAL_B(~0ULL) << DTE_GCR3_SHIFT_B; 2076 flags &= ~tmp; 2077 2078 tmp = DTE_GCR3_VAL_C(~0ULL) << DTE_GCR3_SHIFT_C; 2079 flags &= ~tmp; 2080 2081 /* Encode GCR3 table into DTE */ 2082 tmp = DTE_GCR3_VAL_A(gcr3) << DTE_GCR3_SHIFT_A; 2083 pte_root |= tmp; 2084 2085 tmp = DTE_GCR3_VAL_B(gcr3) << DTE_GCR3_SHIFT_B; 2086 flags |= tmp; 2087 2088 tmp = DTE_GCR3_VAL_C(gcr3) << DTE_GCR3_SHIFT_C; 2089 flags |= tmp; 2090 } 2091 2092 flags &= ~(0xffffUL); 2093 flags |= domain->id; 2094 2095 amd_iommu_dev_table[devid].data[1] = flags; 2096 amd_iommu_dev_table[devid].data[0] = pte_root; 2097} 2098 2099static void clear_dte_entry(u16 devid) 2100{ 2101 /* remove entry from the device table seen by the hardware */ 2102 amd_iommu_dev_table[devid].data[0] = IOMMU_PTE_P | IOMMU_PTE_TV; 2103 amd_iommu_dev_table[devid].data[1] &= DTE_FLAG_MASK; 2104 2105 amd_iommu_apply_erratum_63(devid); 2106} 2107 2108static void do_attach(struct iommu_dev_data *dev_data, 2109 struct protection_domain *domain) 2110{ 2111 struct amd_iommu *iommu; 2112 bool ats; 2113 2114 iommu = amd_iommu_rlookup_table[dev_data->devid]; 2115 ats = dev_data->ats.enabled; 2116 2117 /* Update data 
structures */
	dev_data->domain = domain;
	list_add(&dev_data->list, &domain->dev_list);
	set_dte_entry(dev_data->devid, domain, ats);

	/* Do reference counting */
	domain->dev_iommu[iommu->index] += 1;
	domain->dev_cnt                 += 1;

	/* Flush the DTE entry */
	device_flush_dte(dev_data);
}

static void do_detach(struct iommu_dev_data *dev_data)
{
	struct amd_iommu *iommu;

	iommu = amd_iommu_rlookup_table[dev_data->devid];

	/* decrease reference counters */
	dev_data->domain->dev_iommu[iommu->index] -= 1;
	dev_data->domain->dev_cnt                 -= 1;

	/* Update data structures */
	dev_data->domain = NULL;
	list_del(&dev_data->list);
	clear_dte_entry(dev_data->devid);

	/* Flush the DTE entry */
	device_flush_dte(dev_data);
}

/*
 * If a device is not yet associated with a domain, this function attaches
 * it to the given domain and makes the attachment visible to the hardware.
 */
static int __attach_device(struct iommu_dev_data *dev_data,
			   struct protection_domain *domain)
{
	struct iommu_dev_data *head, *entry;
	int ret;

	/* lock domain */
	spin_lock(&domain->lock);

	head = dev_data;

	if (head->alias_data != NULL)
		head = head->alias_data;

	/* Now we have the root of the alias group, if any */

	ret = -EBUSY;
	if (head->domain != NULL)
		goto out_unlock;

	/* Attach alias group root */
	do_attach(head, domain);

	/* Attach other devices in the alias group */
	list_for_each_entry(entry, &head->alias_list, alias_list)
		do_attach(entry, domain);

	ret = 0;

out_unlock:

	/* ready */
	spin_unlock(&domain->lock);

	return ret;
}

static void pdev_iommuv2_disable(struct pci_dev *pdev)
{
	pci_disable_ats(pdev);
	pci_disable_pri(pdev);
	pci_disable_pasid(pdev);
}

/* FIXME: Change generic reset-function to do the same */
static int pri_reset_while_enabled(struct pci_dev *pdev)
{
	u16 control;
	int pos;

	pos = pci_find_ext_capability(pdev, PCI_EXT_CAP_ID_PRI);
	if (!pos)
		return -EINVAL;

	pci_read_config_word(pdev, pos + PCI_PRI_CTRL, &control);
	control |= PCI_PRI_CTRL_RESET;
	pci_write_config_word(pdev, pos + PCI_PRI_CTRL, control);

	return 0;
}

static int pdev_iommuv2_enable(struct pci_dev *pdev)
{
	bool reset_enable;
	int reqs, ret;

	/* FIXME: Hardcode number of outstanding requests for now */
	reqs = 32;
	if (pdev_pri_erratum(pdev, AMD_PRI_DEV_ERRATUM_LIMIT_REQ_ONE))
		reqs = 1;
	reset_enable = pdev_pri_erratum(pdev, AMD_PRI_DEV_ERRATUM_ENABLE_RESET);

	/* Only allow access to user-accessible pages */
	ret = pci_enable_pasid(pdev, 0);
	if (ret)
		goto out_err;

	/* First reset the PRI state of the device */
	ret = pci_reset_pri(pdev);
	if (ret)
		goto out_err;

	/* Enable PRI */
	ret = pci_enable_pri(pdev, reqs);
	if (ret)
		goto out_err;

	if (reset_enable) {
		ret = pri_reset_while_enabled(pdev);
		if (ret)
			goto out_err;
	}

	ret = pci_enable_ats(pdev, PAGE_SHIFT);
	if (ret)
		goto out_err;

	return 0;

out_err:
	pci_disable_pri(pdev);
	pci_disable_pasid(pdev);

	return ret;
}

/* FIXME: Move this to PCI code */
#define PCI_PRI_TLP_OFF		(1 << 15)

static bool pci_pri_tlp_required(struct pci_dev *pdev)
{
	u16 status;
	int pos;

	pos = pci_find_ext_capability(pdev, PCI_EXT_CAP_ID_PRI);
	if (!pos)
		return false;

	pci_read_config_word(pdev, pos + PCI_PRI_STATUS, &status);

	return (status & PCI_PRI_TLP_OFF) ? true : false;
}

/*
 * If a device is not yet associated with a domain, this function attaches
 * it to the given domain and makes it visible to the hardware.
 */
static int attach_device(struct device *dev,
			 struct protection_domain *domain)
{
	struct pci_dev *pdev = to_pci_dev(dev);
	struct iommu_dev_data *dev_data;
	unsigned long flags;
	int ret;

	dev_data = get_dev_data(dev);

	if (domain->flags & PD_IOMMUV2_MASK) {
		if (!dev_data->iommu_v2 || !dev_data->passthrough)
			return -EINVAL;

		if (pdev_iommuv2_enable(pdev) != 0)
			return -EINVAL;

		dev_data->ats.enabled = true;
		dev_data->ats.qdep    = pci_ats_queue_depth(pdev);
		dev_data->pri_tlp     = pci_pri_tlp_required(pdev);
	} else if (amd_iommu_iotlb_sup &&
		   pci_enable_ats(pdev, PAGE_SHIFT) == 0) {
		dev_data->ats.enabled = true;
		dev_data->ats.qdep    = pci_ats_queue_depth(pdev);
	}

	write_lock_irqsave(&amd_iommu_devtable_lock, flags);
	ret = __attach_device(dev_data, domain);
	write_unlock_irqrestore(&amd_iommu_devtable_lock, flags);

	/*
	 * We might boot into a crash-kernel here. The crashed kernel
	 * left the caches in the IOMMU dirty. So we have to flush
	 * here to evict all dirty stuff.
	 */
	domain_flush_tlb_pde(domain);

	return ret;
}

/*
 * Removes a device from a protection domain (unlocked)
 */
static void __detach_device(struct iommu_dev_data *dev_data)
{
	struct iommu_dev_data *head, *entry;
	struct protection_domain *domain;
	unsigned long flags;

	BUG_ON(!dev_data->domain);

	domain = dev_data->domain;

	spin_lock_irqsave(&domain->lock, flags);

	head = dev_data;
	if (head->alias_data != NULL)
		head = head->alias_data;

	list_for_each_entry(entry, &head->alias_list, alias_list)
		do_detach(entry);

	do_detach(head);

	spin_unlock_irqrestore(&domain->lock, flags);

	/*
	 * If we run in passthrough mode the device must be assigned to the
	 * passthrough domain if it is detached from any other domain.
	 * Make sure we can deassign from the pt_domain itself.
	 */
	if (dev_data->passthrough &&
	    (dev_data->domain == NULL && domain != pt_domain))
		__attach_device(dev_data, pt_domain);
}

/*
 * Removes a device from a protection domain (with devtable_lock held)
 */
static void detach_device(struct device *dev)
{
	struct protection_domain *domain;
	struct iommu_dev_data *dev_data;
	unsigned long flags;

	dev_data = get_dev_data(dev);
	domain   = dev_data->domain;

	/* lock device table */
	write_lock_irqsave(&amd_iommu_devtable_lock, flags);
	__detach_device(dev_data);
	write_unlock_irqrestore(&amd_iommu_devtable_lock, flags);

	if (domain->flags & PD_IOMMUV2_MASK)
		pdev_iommuv2_disable(to_pci_dev(dev));
	else if (dev_data->ats.enabled)
		pci_disable_ats(to_pci_dev(dev));

	dev_data->ats.enabled = false;
}

/*
 * Find out the protection domain structure for a given PCI device. This
 * will give us the pointer to the page table root for example.
2385 */ 2386static struct protection_domain *domain_for_device(struct device *dev) 2387{ 2388 struct iommu_dev_data *dev_data; 2389 struct protection_domain *dom = NULL; 2390 unsigned long flags; 2391 2392 dev_data = get_dev_data(dev); 2393 2394 if (dev_data->domain) 2395 return dev_data->domain; 2396 2397 if (dev_data->alias_data != NULL) { 2398 struct iommu_dev_data *alias_data = dev_data->alias_data; 2399 2400 read_lock_irqsave(&amd_iommu_devtable_lock, flags); 2401 if (alias_data->domain != NULL) { 2402 __attach_device(dev_data, alias_data->domain); 2403 dom = alias_data->domain; 2404 } 2405 read_unlock_irqrestore(&amd_iommu_devtable_lock, flags); 2406 } 2407 2408 return dom; 2409} 2410 2411static int device_change_notifier(struct notifier_block *nb, 2412 unsigned long action, void *data) 2413{ 2414 struct dma_ops_domain *dma_domain; 2415 struct protection_domain *domain; 2416 struct iommu_dev_data *dev_data; 2417 struct device *dev = data; 2418 struct amd_iommu *iommu; 2419 unsigned long flags; 2420 u16 devid; 2421 2422 if (!check_device(dev)) 2423 return 0; 2424 2425 devid = get_device_id(dev); 2426 iommu = amd_iommu_rlookup_table[devid]; 2427 dev_data = get_dev_data(dev); 2428 2429 switch (action) { 2430 case BUS_NOTIFY_ADD_DEVICE: 2431 2432 iommu_init_device(dev); 2433 init_iommu_group(dev); 2434 2435 /* 2436 * dev_data is still NULL and 2437 * got initialized in iommu_init_device 2438 */ 2439 dev_data = get_dev_data(dev); 2440 2441 if (iommu_pass_through || dev_data->iommu_v2) { 2442 dev_data->passthrough = true; 2443 attach_device(dev, pt_domain); 2444 break; 2445 } 2446 2447 domain = domain_for_device(dev); 2448 2449 /* allocate a protection domain if a device is added */ 2450 dma_domain = find_protection_domain(devid); 2451 if (!dma_domain) { 2452 dma_domain = dma_ops_domain_alloc(); 2453 if (!dma_domain) 2454 goto out; 2455 dma_domain->target_dev = devid; 2456 2457 spin_lock_irqsave(&iommu_pd_list_lock, flags); 2458 list_add_tail(&dma_domain->list, &iommu_pd_list); 2459 spin_unlock_irqrestore(&iommu_pd_list_lock, flags); 2460 } 2461 2462 dev->archdata.dma_ops = &amd_iommu_dma_ops; 2463 2464 break; 2465 case BUS_NOTIFY_REMOVED_DEVICE: 2466 2467 iommu_uninit_device(dev); 2468 2469 default: 2470 goto out; 2471 } 2472 2473 iommu_completion_wait(iommu); 2474 2475out: 2476 return 0; 2477} 2478 2479static struct notifier_block device_nb = { 2480 .notifier_call = device_change_notifier, 2481}; 2482 2483void amd_iommu_init_notifier(void) 2484{ 2485 bus_register_notifier(&pci_bus_type, &device_nb); 2486} 2487 2488/***************************************************************************** 2489 * 2490 * The next functions belong to the dma_ops mapping/unmapping code. 2491 * 2492 *****************************************************************************/ 2493 2494/* 2495 * In the dma_ops path we only have the struct device. This function 2496 * finds the corresponding IOMMU, the protection domain and the 2497 * requestor id for a given device. 2498 * If the device is not yet associated with a domain this is also done 2499 * in this function. 
 */
static struct protection_domain *get_domain(struct device *dev)
{
	struct protection_domain *domain;
	struct dma_ops_domain *dma_dom;
	u16 devid = get_device_id(dev);

	if (!check_device(dev))
		return ERR_PTR(-EINVAL);

	domain = domain_for_device(dev);
	if (domain != NULL && !dma_ops_domain(domain))
		return ERR_PTR(-EBUSY);

	if (domain != NULL)
		return domain;

	/* Device not bound yet - bind it */
	dma_dom = find_protection_domain(devid);
	if (!dma_dom)
		dma_dom = amd_iommu_rlookup_table[devid]->default_dom;
	attach_device(dev, &dma_dom->domain);
	DUMP_printk("Using protection domain %d for device %s\n",
		    dma_dom->domain.id, dev_name(dev));

	return &dma_dom->domain;
}

static void update_device_table(struct protection_domain *domain)
{
	struct iommu_dev_data *dev_data;

	list_for_each_entry(dev_data, &domain->dev_list, list)
		set_dte_entry(dev_data->devid, domain, dev_data->ats.enabled);
}

static void update_domain(struct protection_domain *domain)
{
	if (!domain->updated)
		return;

	update_device_table(domain);

	domain_flush_devices(domain);
	domain_flush_tlb_pde(domain);

	domain->updated = false;
}

/*
 * This function fetches the PTE for a given address in the aperture
 */
static u64 *dma_ops_get_pte(struct dma_ops_domain *dom,
			    unsigned long address)
{
	struct aperture_range *aperture;
	u64 *pte, *pte_page;

	aperture = dom->aperture[APERTURE_RANGE_INDEX(address)];
	if (!aperture)
		return NULL;

	pte = aperture->pte_pages[APERTURE_PAGE_INDEX(address)];
	if (!pte) {
		pte = alloc_pte(&dom->domain, address, PAGE_SIZE, &pte_page,
				GFP_ATOMIC);
		aperture->pte_pages[APERTURE_PAGE_INDEX(address)] = pte_page;
	} else
		pte += PM_LEVEL_INDEX(0, address);

	update_domain(&dom->domain);

	return pte;
}

/*
 * This is the generic map function. It maps one 4kb page at paddr to
 * the given address in the DMA address space for the domain.
 */
static dma_addr_t dma_ops_domain_map(struct dma_ops_domain *dom,
				     unsigned long address,
				     phys_addr_t paddr,
				     int direction)
{
	u64 *pte, __pte;

	WARN_ON(address > dom->aperture_size);

	paddr &= PAGE_MASK;

	pte = dma_ops_get_pte(dom, address);
	if (!pte)
		return DMA_ERROR_CODE;

	__pte = paddr | IOMMU_PTE_P | IOMMU_PTE_FC;

	if (direction == DMA_TO_DEVICE)
		__pte |= IOMMU_PTE_IR;
	else if (direction == DMA_FROM_DEVICE)
		__pte |= IOMMU_PTE_IW;
	else if (direction == DMA_BIDIRECTIONAL)
		__pte |= IOMMU_PTE_IR | IOMMU_PTE_IW;

	WARN_ON(*pte);

	*pte = __pte;

	return (dma_addr_t)address;
}

/*
 * The generic unmapping function for one page in the DMA address space.
2612 */ 2613static void dma_ops_domain_unmap(struct dma_ops_domain *dom, 2614 unsigned long address) 2615{ 2616 struct aperture_range *aperture; 2617 u64 *pte; 2618 2619 if (address >= dom->aperture_size) 2620 return; 2621 2622 aperture = dom->aperture[APERTURE_RANGE_INDEX(address)]; 2623 if (!aperture) 2624 return; 2625 2626 pte = aperture->pte_pages[APERTURE_PAGE_INDEX(address)]; 2627 if (!pte) 2628 return; 2629 2630 pte += PM_LEVEL_INDEX(0, address); 2631 2632 WARN_ON(!*pte); 2633 2634 *pte = 0ULL; 2635} 2636 2637/* 2638 * This function contains common code for mapping of a physically 2639 * contiguous memory region into DMA address space. It is used by all 2640 * mapping functions provided with this IOMMU driver. 2641 * Must be called with the domain lock held. 2642 */ 2643static dma_addr_t __map_single(struct device *dev, 2644 struct dma_ops_domain *dma_dom, 2645 phys_addr_t paddr, 2646 size_t size, 2647 int dir, 2648 bool align, 2649 u64 dma_mask) 2650{ 2651 dma_addr_t offset = paddr & ~PAGE_MASK; 2652 dma_addr_t address, start, ret; 2653 unsigned int pages; 2654 unsigned long align_mask = 0; 2655 int i; 2656 2657 pages = iommu_num_pages(paddr, size, PAGE_SIZE); 2658 paddr &= PAGE_MASK; 2659 2660 INC_STATS_COUNTER(total_map_requests); 2661 2662 if (pages > 1) 2663 INC_STATS_COUNTER(cross_page); 2664 2665 if (align) 2666 align_mask = (1UL << get_order(size)) - 1; 2667 2668retry: 2669 address = dma_ops_alloc_addresses(dev, dma_dom, pages, align_mask, 2670 dma_mask); 2671 if (unlikely(address == DMA_ERROR_CODE)) { 2672 /* 2673 * setting next_address here will let the address 2674 * allocator only scan the new allocated range in the 2675 * first run. This is a small optimization. 2676 */ 2677 dma_dom->next_address = dma_dom->aperture_size; 2678 2679 if (alloc_new_range(dma_dom, false, GFP_ATOMIC)) 2680 goto out; 2681 2682 /* 2683 * aperture was successfully enlarged by 128 MB, try 2684 * allocation again 2685 */ 2686 goto retry; 2687 } 2688 2689 start = address; 2690 for (i = 0; i < pages; ++i) { 2691 ret = dma_ops_domain_map(dma_dom, start, paddr, dir); 2692 if (ret == DMA_ERROR_CODE) 2693 goto out_unmap; 2694 2695 paddr += PAGE_SIZE; 2696 start += PAGE_SIZE; 2697 } 2698 address += offset; 2699 2700 ADD_STATS_COUNTER(alloced_io_mem, size); 2701 2702 if (unlikely(dma_dom->need_flush && !amd_iommu_unmap_flush)) { 2703 domain_flush_tlb(&dma_dom->domain); 2704 dma_dom->need_flush = false; 2705 } else if (unlikely(amd_iommu_np_cache)) 2706 domain_flush_pages(&dma_dom->domain, address, size); 2707 2708out: 2709 return address; 2710 2711out_unmap: 2712 2713 for (--i; i >= 0; --i) { 2714 start -= PAGE_SIZE; 2715 dma_ops_domain_unmap(dma_dom, start); 2716 } 2717 2718 dma_ops_free_addresses(dma_dom, address, pages); 2719 2720 return DMA_ERROR_CODE; 2721} 2722 2723/* 2724 * Does the reverse of the __map_single function. 
Must be called with 2725 * the domain lock held too 2726 */ 2727static void __unmap_single(struct dma_ops_domain *dma_dom, 2728 dma_addr_t dma_addr, 2729 size_t size, 2730 int dir) 2731{ 2732 dma_addr_t flush_addr; 2733 dma_addr_t i, start; 2734 unsigned int pages; 2735 2736 if ((dma_addr == DMA_ERROR_CODE) || 2737 (dma_addr + size > dma_dom->aperture_size)) 2738 return; 2739 2740 flush_addr = dma_addr; 2741 pages = iommu_num_pages(dma_addr, size, PAGE_SIZE); 2742 dma_addr &= PAGE_MASK; 2743 start = dma_addr; 2744 2745 for (i = 0; i < pages; ++i) { 2746 dma_ops_domain_unmap(dma_dom, start); 2747 start += PAGE_SIZE; 2748 } 2749 2750 SUB_STATS_COUNTER(alloced_io_mem, size); 2751 2752 dma_ops_free_addresses(dma_dom, dma_addr, pages); 2753 2754 if (amd_iommu_unmap_flush || dma_dom->need_flush) { 2755 domain_flush_pages(&dma_dom->domain, flush_addr, size); 2756 dma_dom->need_flush = false; 2757 } 2758} 2759 2760/* 2761 * The exported map_single function for dma_ops. 2762 */ 2763static dma_addr_t map_page(struct device *dev, struct page *page, 2764 unsigned long offset, size_t size, 2765 enum dma_data_direction dir, 2766 struct dma_attrs *attrs) 2767{ 2768 unsigned long flags; 2769 struct protection_domain *domain; 2770 dma_addr_t addr; 2771 u64 dma_mask; 2772 phys_addr_t paddr = page_to_phys(page) + offset; 2773 2774 INC_STATS_COUNTER(cnt_map_single); 2775 2776 domain = get_domain(dev); 2777 if (PTR_ERR(domain) == -EINVAL) 2778 return (dma_addr_t)paddr; 2779 else if (IS_ERR(domain)) 2780 return DMA_ERROR_CODE; 2781 2782 dma_mask = *dev->dma_mask; 2783 2784 spin_lock_irqsave(&domain->lock, flags); 2785 2786 addr = __map_single(dev, domain->priv, paddr, size, dir, false, 2787 dma_mask); 2788 if (addr == DMA_ERROR_CODE) 2789 goto out; 2790 2791 domain_flush_complete(domain); 2792 2793out: 2794 spin_unlock_irqrestore(&domain->lock, flags); 2795 2796 return addr; 2797} 2798 2799/* 2800 * The exported unmap_single function for dma_ops. 2801 */ 2802static void unmap_page(struct device *dev, dma_addr_t dma_addr, size_t size, 2803 enum dma_data_direction dir, struct dma_attrs *attrs) 2804{ 2805 unsigned long flags; 2806 struct protection_domain *domain; 2807 2808 INC_STATS_COUNTER(cnt_unmap_single); 2809 2810 domain = get_domain(dev); 2811 if (IS_ERR(domain)) 2812 return; 2813 2814 spin_lock_irqsave(&domain->lock, flags); 2815 2816 __unmap_single(domain->priv, dma_addr, size, dir); 2817 2818 domain_flush_complete(domain); 2819 2820 spin_unlock_irqrestore(&domain->lock, flags); 2821} 2822 2823/* 2824 * The exported map_sg function for dma_ops (handles scatter-gather 2825 * lists). 
2826 */ 2827static int map_sg(struct device *dev, struct scatterlist *sglist, 2828 int nelems, enum dma_data_direction dir, 2829 struct dma_attrs *attrs) 2830{ 2831 unsigned long flags; 2832 struct protection_domain *domain; 2833 int i; 2834 struct scatterlist *s; 2835 phys_addr_t paddr; 2836 int mapped_elems = 0; 2837 u64 dma_mask; 2838 2839 INC_STATS_COUNTER(cnt_map_sg); 2840 2841 domain = get_domain(dev); 2842 if (IS_ERR(domain)) 2843 return 0; 2844 2845 dma_mask = *dev->dma_mask; 2846 2847 spin_lock_irqsave(&domain->lock, flags); 2848 2849 for_each_sg(sglist, s, nelems, i) { 2850 paddr = sg_phys(s); 2851 2852 s->dma_address = __map_single(dev, domain->priv, 2853 paddr, s->length, dir, false, 2854 dma_mask); 2855 2856 if (s->dma_address) { 2857 s->dma_length = s->length; 2858 mapped_elems++; 2859 } else 2860 goto unmap; 2861 } 2862 2863 domain_flush_complete(domain); 2864 2865out: 2866 spin_unlock_irqrestore(&domain->lock, flags); 2867 2868 return mapped_elems; 2869unmap: 2870 for_each_sg(sglist, s, mapped_elems, i) { 2871 if (s->dma_address) 2872 __unmap_single(domain->priv, s->dma_address, 2873 s->dma_length, dir); 2874 s->dma_address = s->dma_length = 0; 2875 } 2876 2877 mapped_elems = 0; 2878 2879 goto out; 2880} 2881 2882/* 2883 * The exported map_sg function for dma_ops (handles scatter-gather 2884 * lists). 2885 */ 2886static void unmap_sg(struct device *dev, struct scatterlist *sglist, 2887 int nelems, enum dma_data_direction dir, 2888 struct dma_attrs *attrs) 2889{ 2890 unsigned long flags; 2891 struct protection_domain *domain; 2892 struct scatterlist *s; 2893 int i; 2894 2895 INC_STATS_COUNTER(cnt_unmap_sg); 2896 2897 domain = get_domain(dev); 2898 if (IS_ERR(domain)) 2899 return; 2900 2901 spin_lock_irqsave(&domain->lock, flags); 2902 2903 for_each_sg(sglist, s, nelems, i) { 2904 __unmap_single(domain->priv, s->dma_address, 2905 s->dma_length, dir); 2906 s->dma_address = s->dma_length = 0; 2907 } 2908 2909 domain_flush_complete(domain); 2910 2911 spin_unlock_irqrestore(&domain->lock, flags); 2912} 2913 2914/* 2915 * The exported alloc_coherent function for dma_ops. 
2916 */ 2917static void *alloc_coherent(struct device *dev, size_t size, 2918 dma_addr_t *dma_addr, gfp_t flag, 2919 struct dma_attrs *attrs) 2920{ 2921 u64 dma_mask = dev->coherent_dma_mask; 2922 struct protection_domain *domain; 2923 unsigned long flags; 2924 struct page *page; 2925 2926 INC_STATS_COUNTER(cnt_alloc_coherent); 2927 2928 domain = get_domain(dev); 2929 if (PTR_ERR(domain) == -EINVAL) { 2930 page = alloc_pages(flag, get_order(size)); 2931 *dma_addr = page_to_phys(page); 2932 return page_address(page); 2933 } else if (IS_ERR(domain)) 2934 return NULL; 2935 2936 size = PAGE_ALIGN(size); 2937 dma_mask = dev->coherent_dma_mask; 2938 flag &= ~(__GFP_DMA | __GFP_HIGHMEM | __GFP_DMA32); 2939 flag |= __GFP_ZERO; 2940 2941 page = alloc_pages(flag | __GFP_NOWARN, get_order(size)); 2942 if (!page) { 2943 if (!(flag & __GFP_WAIT)) 2944 return NULL; 2945 2946 page = dma_alloc_from_contiguous(dev, size >> PAGE_SHIFT, 2947 get_order(size)); 2948 if (!page) 2949 return NULL; 2950 } 2951 2952 if (!dma_mask) 2953 dma_mask = *dev->dma_mask; 2954 2955 spin_lock_irqsave(&domain->lock, flags); 2956 2957 *dma_addr = __map_single(dev, domain->priv, page_to_phys(page), 2958 size, DMA_BIDIRECTIONAL, true, dma_mask); 2959 2960 if (*dma_addr == DMA_ERROR_CODE) { 2961 spin_unlock_irqrestore(&domain->lock, flags); 2962 goto out_free; 2963 } 2964 2965 domain_flush_complete(domain); 2966 2967 spin_unlock_irqrestore(&domain->lock, flags); 2968 2969 return page_address(page); 2970 2971out_free: 2972 2973 if (!dma_release_from_contiguous(dev, page, size >> PAGE_SHIFT)) 2974 __free_pages(page, get_order(size)); 2975 2976 return NULL; 2977} 2978 2979/* 2980 * The exported free_coherent function for dma_ops. 2981 */ 2982static void free_coherent(struct device *dev, size_t size, 2983 void *virt_addr, dma_addr_t dma_addr, 2984 struct dma_attrs *attrs) 2985{ 2986 struct protection_domain *domain; 2987 unsigned long flags; 2988 struct page *page; 2989 2990 INC_STATS_COUNTER(cnt_free_coherent); 2991 2992 page = virt_to_page(virt_addr); 2993 size = PAGE_ALIGN(size); 2994 2995 domain = get_domain(dev); 2996 if (IS_ERR(domain)) 2997 goto free_mem; 2998 2999 spin_lock_irqsave(&domain->lock, flags); 3000 3001 __unmap_single(domain->priv, dma_addr, size, DMA_BIDIRECTIONAL); 3002 3003 domain_flush_complete(domain); 3004 3005 spin_unlock_irqrestore(&domain->lock, flags); 3006 3007free_mem: 3008 if (!dma_release_from_contiguous(dev, page, size >> PAGE_SHIFT)) 3009 __free_pages(page, get_order(size)); 3010} 3011 3012/* 3013 * This function is called by the DMA layer to find out if we can handle a 3014 * particular device. It is part of the dma_ops. 3015 */ 3016static int amd_iommu_dma_supported(struct device *dev, u64 mask) 3017{ 3018 return check_device(dev); 3019} 3020 3021/* 3022 * The function for pre-allocating protection domains. 3023 * 3024 * If the driver core informs the DMA layer if a driver grabs a device 3025 * we don't need to preallocate the protection domains anymore. 3026 * For now we have to. 3027 */ 3028static void __init prealloc_protection_domains(void) 3029{ 3030 struct iommu_dev_data *dev_data; 3031 struct dma_ops_domain *dma_dom; 3032 struct pci_dev *dev = NULL; 3033 u16 devid; 3034 3035 for_each_pci_dev(dev) { 3036 3037 /* Do we handle this device? 
*/ 3038 if (!check_device(&dev->dev)) 3039 continue; 3040 3041 dev_data = get_dev_data(&dev->dev); 3042 if (!amd_iommu_force_isolation && dev_data->iommu_v2) { 3043 /* Make sure passthrough domain is allocated */ 3044 alloc_passthrough_domain(); 3045 dev_data->passthrough = true; 3046 attach_device(&dev->dev, pt_domain); 3047 pr_info("AMD-Vi: Using passthrough domain for device %s\n", 3048 dev_name(&dev->dev)); 3049 } 3050 3051 /* Is there already any domain for it? */ 3052 if (domain_for_device(&dev->dev)) 3053 continue; 3054 3055 devid = get_device_id(&dev->dev); 3056 3057 dma_dom = dma_ops_domain_alloc(); 3058 if (!dma_dom) 3059 continue; 3060 init_unity_mappings_for_device(dma_dom, devid); 3061 dma_dom->target_dev = devid; 3062 3063 attach_device(&dev->dev, &dma_dom->domain); 3064 3065 list_add_tail(&dma_dom->list, &iommu_pd_list); 3066 } 3067} 3068 3069static struct dma_map_ops amd_iommu_dma_ops = { 3070 .alloc = alloc_coherent, 3071 .free = free_coherent, 3072 .map_page = map_page, 3073 .unmap_page = unmap_page, 3074 .map_sg = map_sg, 3075 .unmap_sg = unmap_sg, 3076 .dma_supported = amd_iommu_dma_supported, 3077}; 3078 3079static unsigned device_dma_ops_init(void) 3080{ 3081 struct iommu_dev_data *dev_data; 3082 struct pci_dev *pdev = NULL; 3083 unsigned unhandled = 0; 3084 3085 for_each_pci_dev(pdev) { 3086 if (!check_device(&pdev->dev)) { 3087 3088 iommu_ignore_device(&pdev->dev); 3089 3090 unhandled += 1; 3091 continue; 3092 } 3093 3094 dev_data = get_dev_data(&pdev->dev); 3095 3096 if (!dev_data->passthrough) 3097 pdev->dev.archdata.dma_ops = &amd_iommu_dma_ops; 3098 else 3099 pdev->dev.archdata.dma_ops = &nommu_dma_ops; 3100 } 3101 3102 return unhandled; 3103} 3104 3105/* 3106 * The function which clues the AMD IOMMU driver into dma_ops. 3107 */ 3108 3109void __init amd_iommu_init_api(void) 3110{ 3111 bus_set_iommu(&pci_bus_type, &amd_iommu_ops); 3112} 3113 3114int __init amd_iommu_init_dma_ops(void) 3115{ 3116 struct amd_iommu *iommu; 3117 int ret, unhandled; 3118 3119 /* 3120 * first allocate a default protection domain for every IOMMU we 3121 * found in the system. Devices not assigned to any other 3122 * protection domain will be assigned to the default one. 3123 */ 3124 for_each_iommu(iommu) { 3125 iommu->default_dom = dma_ops_domain_alloc(); 3126 if (iommu->default_dom == NULL) 3127 return -ENOMEM; 3128 iommu->default_dom->domain.flags |= PD_DEFAULT_MASK; 3129 ret = iommu_init_unity_mappings(iommu); 3130 if (ret) 3131 goto free_domains; 3132 } 3133 3134 /* 3135 * Pre-allocate the protection domains for each device. 
3136 */ 3137 prealloc_protection_domains(); 3138 3139 iommu_detected = 1; 3140 swiotlb = 0; 3141 3142 /* Make the driver finally visible to the drivers */ 3143 unhandled = device_dma_ops_init(); 3144 if (unhandled && max_pfn > MAX_DMA32_PFN) { 3145 /* There are unhandled devices - initialize swiotlb for them */ 3146 swiotlb = 1; 3147 } 3148 3149 amd_iommu_stats_init(); 3150 3151 if (amd_iommu_unmap_flush) 3152 pr_info("AMD-Vi: IO/TLB flush on unmap enabled\n"); 3153 else 3154 pr_info("AMD-Vi: Lazy IO/TLB flushing enabled\n"); 3155 3156 return 0; 3157 3158free_domains: 3159 3160 for_each_iommu(iommu) { 3161 dma_ops_domain_free(iommu->default_dom); 3162 } 3163 3164 return ret; 3165} 3166 3167/***************************************************************************** 3168 * 3169 * The following functions belong to the exported interface of AMD IOMMU 3170 * 3171 * This interface allows access to lower level functions of the IOMMU 3172 * like protection domain handling and assignement of devices to domains 3173 * which is not possible with the dma_ops interface. 3174 * 3175 *****************************************************************************/ 3176 3177static void cleanup_domain(struct protection_domain *domain) 3178{ 3179 struct iommu_dev_data *entry; 3180 unsigned long flags; 3181 3182 write_lock_irqsave(&amd_iommu_devtable_lock, flags); 3183 3184 while (!list_empty(&domain->dev_list)) { 3185 entry = list_first_entry(&domain->dev_list, 3186 struct iommu_dev_data, list); 3187 __detach_device(entry); 3188 } 3189 3190 write_unlock_irqrestore(&amd_iommu_devtable_lock, flags); 3191} 3192 3193static void protection_domain_free(struct protection_domain *domain) 3194{ 3195 if (!domain) 3196 return; 3197 3198 del_domain_from_list(domain); 3199 3200 if (domain->id) 3201 domain_id_free(domain->id); 3202 3203 kfree(domain); 3204} 3205 3206static struct protection_domain *protection_domain_alloc(void) 3207{ 3208 struct protection_domain *domain; 3209 3210 domain = kzalloc(sizeof(*domain), GFP_KERNEL); 3211 if (!domain) 3212 return NULL; 3213 3214 spin_lock_init(&domain->lock); 3215 mutex_init(&domain->api_lock); 3216 domain->id = domain_id_alloc(); 3217 if (!domain->id) 3218 goto out_err; 3219 INIT_LIST_HEAD(&domain->dev_list); 3220 3221 add_domain_to_list(domain); 3222 3223 return domain; 3224 3225out_err: 3226 kfree(domain); 3227 3228 return NULL; 3229} 3230 3231static int __init alloc_passthrough_domain(void) 3232{ 3233 if (pt_domain != NULL) 3234 return 0; 3235 3236 /* allocate passthrough domain */ 3237 pt_domain = protection_domain_alloc(); 3238 if (!pt_domain) 3239 return -ENOMEM; 3240 3241 pt_domain->mode = PAGE_MODE_NONE; 3242 3243 return 0; 3244} 3245 3246static struct iommu_domain *amd_iommu_domain_alloc(unsigned type) 3247{ 3248 struct protection_domain *pdomain; 3249 3250 /* We only support unmanaged domains for now */ 3251 if (type != IOMMU_DOMAIN_UNMANAGED) 3252 return NULL; 3253 3254 pdomain = protection_domain_alloc(); 3255 if (!pdomain) 3256 goto out_free; 3257 3258 pdomain->mode = PAGE_MODE_3_LEVEL; 3259 pdomain->pt_root = (void *)get_zeroed_page(GFP_KERNEL); 3260 if (!pdomain->pt_root) 3261 goto out_free; 3262 3263 pdomain->domain.geometry.aperture_start = 0; 3264 pdomain->domain.geometry.aperture_end = ~0ULL; 3265 pdomain->domain.geometry.force_aperture = true; 3266 3267 return &pdomain->domain; 3268 3269out_free: 3270 protection_domain_free(pdomain); 3271 3272 return NULL; 3273} 3274 3275static void amd_iommu_domain_free(struct iommu_domain *dom) 3276{ 3277 struct 
protection_domain *domain; 3278 3279 if (!dom) 3280 return; 3281 3282 domain = to_pdomain(dom); 3283 3284 if (domain->dev_cnt > 0) 3285 cleanup_domain(domain); 3286 3287 BUG_ON(domain->dev_cnt != 0); 3288 3289 if (domain->mode != PAGE_MODE_NONE) 3290 free_pagetable(domain); 3291 3292 if (domain->flags & PD_IOMMUV2_MASK) 3293 free_gcr3_table(domain); 3294 3295 protection_domain_free(domain); 3296} 3297 3298static void amd_iommu_detach_device(struct iommu_domain *dom, 3299 struct device *dev) 3300{ 3301 struct iommu_dev_data *dev_data = dev->archdata.iommu; 3302 struct amd_iommu *iommu; 3303 u16 devid; 3304 3305 if (!check_device(dev)) 3306 return; 3307 3308 devid = get_device_id(dev); 3309 3310 if (dev_data->domain != NULL) 3311 detach_device(dev); 3312 3313 iommu = amd_iommu_rlookup_table[devid]; 3314 if (!iommu) 3315 return; 3316 3317 iommu_completion_wait(iommu); 3318} 3319 3320static int amd_iommu_attach_device(struct iommu_domain *dom, 3321 struct device *dev) 3322{ 3323 struct protection_domain *domain = to_pdomain(dom); 3324 struct iommu_dev_data *dev_data; 3325 struct amd_iommu *iommu; 3326 int ret; 3327 3328 if (!check_device(dev)) 3329 return -EINVAL; 3330 3331 dev_data = dev->archdata.iommu; 3332 3333 iommu = amd_iommu_rlookup_table[dev_data->devid]; 3334 if (!iommu) 3335 return -EINVAL; 3336 3337 if (dev_data->domain) 3338 detach_device(dev); 3339 3340 ret = attach_device(dev, domain); 3341 3342 iommu_completion_wait(iommu); 3343 3344 return ret; 3345} 3346 3347static int amd_iommu_map(struct iommu_domain *dom, unsigned long iova, 3348 phys_addr_t paddr, size_t page_size, int iommu_prot) 3349{ 3350 struct protection_domain *domain = to_pdomain(dom); 3351 int prot = 0; 3352 int ret; 3353 3354 if (domain->mode == PAGE_MODE_NONE) 3355 return -EINVAL; 3356 3357 if (iommu_prot & IOMMU_READ) 3358 prot |= IOMMU_PROT_IR; 3359 if (iommu_prot & IOMMU_WRITE) 3360 prot |= IOMMU_PROT_IW; 3361 3362 mutex_lock(&domain->api_lock); 3363 ret = iommu_map_page(domain, iova, paddr, prot, page_size); 3364 mutex_unlock(&domain->api_lock); 3365 3366 return ret; 3367} 3368 3369static size_t amd_iommu_unmap(struct iommu_domain *dom, unsigned long iova, 3370 size_t page_size) 3371{ 3372 struct protection_domain *domain = to_pdomain(dom); 3373 size_t unmap_size; 3374 3375 if (domain->mode == PAGE_MODE_NONE) 3376 return -EINVAL; 3377 3378 mutex_lock(&domain->api_lock); 3379 unmap_size = iommu_unmap_page(domain, iova, page_size); 3380 mutex_unlock(&domain->api_lock); 3381 3382 domain_flush_tlb_pde(domain); 3383 3384 return unmap_size; 3385} 3386 3387static phys_addr_t amd_iommu_iova_to_phys(struct iommu_domain *dom, 3388 dma_addr_t iova) 3389{ 3390 struct protection_domain *domain = to_pdomain(dom); 3391 unsigned long offset_mask, pte_pgsize; 3392 u64 *pte, __pte; 3393 3394 if (domain->mode == PAGE_MODE_NONE) 3395 return iova; 3396 3397 pte = fetch_pte(domain, iova, &pte_pgsize); 3398 3399 if (!pte || !IOMMU_PTE_PRESENT(*pte)) 3400 return 0; 3401 3402 offset_mask = pte_pgsize - 1; 3403 __pte = *pte & PM_ADDR_MASK; 3404 3405 return (__pte & ~offset_mask) | (iova & offset_mask); 3406} 3407 3408static bool amd_iommu_capable(enum iommu_cap cap) 3409{ 3410 switch (cap) { 3411 case IOMMU_CAP_CACHE_COHERENCY: 3412 return true; 3413 case IOMMU_CAP_INTR_REMAP: 3414 return (irq_remapping_enabled == 1); 3415 case IOMMU_CAP_NOEXEC: 3416 return false; 3417 } 3418 3419 return false; 3420} 3421 3422static const struct iommu_ops amd_iommu_ops = { 3423 .capable = amd_iommu_capable, 3424 .domain_alloc = 
amd_iommu_domain_alloc, 3425 .domain_free = amd_iommu_domain_free, 3426 .attach_dev = amd_iommu_attach_device, 3427 .detach_dev = amd_iommu_detach_device, 3428 .map = amd_iommu_map, 3429 .unmap = amd_iommu_unmap, 3430 .map_sg = default_iommu_map_sg, 3431 .iova_to_phys = amd_iommu_iova_to_phys, 3432 .pgsize_bitmap = AMD_IOMMU_PGSIZES, 3433}; 3434 3435/***************************************************************************** 3436 * 3437 * The next functions do a basic initialization of IOMMU for pass through 3438 * mode 3439 * 3440 * In passthrough mode the IOMMU is initialized and enabled but not used for 3441 * DMA-API translation. 3442 * 3443 *****************************************************************************/ 3444 3445int __init amd_iommu_init_passthrough(void) 3446{ 3447 struct iommu_dev_data *dev_data; 3448 struct pci_dev *dev = NULL; 3449 int ret; 3450 3451 ret = alloc_passthrough_domain(); 3452 if (ret) 3453 return ret; 3454 3455 for_each_pci_dev(dev) { 3456 if (!check_device(&dev->dev)) 3457 continue; 3458 3459 dev_data = get_dev_data(&dev->dev); 3460 dev_data->passthrough = true; 3461 3462 attach_device(&dev->dev, pt_domain); 3463 } 3464 3465 amd_iommu_stats_init(); 3466 3467 pr_info("AMD-Vi: Initialized for Passthrough Mode\n"); 3468 3469 return 0; 3470} 3471 3472/* IOMMUv2 specific functions */ 3473int amd_iommu_register_ppr_notifier(struct notifier_block *nb) 3474{ 3475 return atomic_notifier_chain_register(&ppr_notifier, nb); 3476} 3477EXPORT_SYMBOL(amd_iommu_register_ppr_notifier); 3478 3479int amd_iommu_unregister_ppr_notifier(struct notifier_block *nb) 3480{ 3481 return atomic_notifier_chain_unregister(&ppr_notifier, nb); 3482} 3483EXPORT_SYMBOL(amd_iommu_unregister_ppr_notifier); 3484 3485void amd_iommu_domain_direct_map(struct iommu_domain *dom) 3486{ 3487 struct protection_domain *domain = to_pdomain(dom); 3488 unsigned long flags; 3489 3490 spin_lock_irqsave(&domain->lock, flags); 3491 3492 /* Update data structure */ 3493 domain->mode = PAGE_MODE_NONE; 3494 domain->updated = true; 3495 3496 /* Make changes visible to IOMMUs */ 3497 update_domain(domain); 3498 3499 /* Page-table is not visible to IOMMU anymore, so free it */ 3500 free_pagetable(domain); 3501 3502 spin_unlock_irqrestore(&domain->lock, flags); 3503} 3504EXPORT_SYMBOL(amd_iommu_domain_direct_map); 3505 3506int amd_iommu_domain_enable_v2(struct iommu_domain *dom, int pasids) 3507{ 3508 struct protection_domain *domain = to_pdomain(dom); 3509 unsigned long flags; 3510 int levels, ret; 3511 3512 if (pasids <= 0 || pasids > (PASID_MASK + 1)) 3513 return -EINVAL; 3514 3515 /* Number of GCR3 table levels required */ 3516 for (levels = 0; (pasids - 1) & ~0x1ff; pasids >>= 9) 3517 levels += 1; 3518 3519 if (levels > amd_iommu_max_glx_val) 3520 return -EINVAL; 3521 3522 spin_lock_irqsave(&domain->lock, flags); 3523 3524 /* 3525 * Save us all sanity checks whether devices already in the 3526 * domain support IOMMUv2. Just force that the domain has no 3527 * devices attached when it is switched into IOMMUv2 mode. 
3528 */ 3529 ret = -EBUSY; 3530 if (domain->dev_cnt > 0 || domain->flags & PD_IOMMUV2_MASK) 3531 goto out; 3532 3533 ret = -ENOMEM; 3534 domain->gcr3_tbl = (void *)get_zeroed_page(GFP_ATOMIC); 3535 if (domain->gcr3_tbl == NULL) 3536 goto out; 3537 3538 domain->glx = levels; 3539 domain->flags |= PD_IOMMUV2_MASK; 3540 domain->updated = true; 3541 3542 update_domain(domain); 3543 3544 ret = 0; 3545 3546out: 3547 spin_unlock_irqrestore(&domain->lock, flags); 3548 3549 return ret; 3550} 3551EXPORT_SYMBOL(amd_iommu_domain_enable_v2); 3552 3553static int __flush_pasid(struct protection_domain *domain, int pasid, 3554 u64 address, bool size) 3555{ 3556 struct iommu_dev_data *dev_data; 3557 struct iommu_cmd cmd; 3558 int i, ret; 3559 3560 if (!(domain->flags & PD_IOMMUV2_MASK)) 3561 return -EINVAL; 3562 3563 build_inv_iommu_pasid(&cmd, domain->id, pasid, address, size); 3564 3565 /* 3566 * IOMMU TLB needs to be flushed before Device TLB to 3567 * prevent device TLB refill from IOMMU TLB 3568 */ 3569 for (i = 0; i < amd_iommus_present; ++i) { 3570 if (domain->dev_iommu[i] == 0) 3571 continue; 3572 3573 ret = iommu_queue_command(amd_iommus[i], &cmd); 3574 if (ret != 0) 3575 goto out; 3576 } 3577 3578 /* Wait until IOMMU TLB flushes are complete */ 3579 domain_flush_complete(domain); 3580 3581 /* Now flush device TLBs */ 3582 list_for_each_entry(dev_data, &domain->dev_list, list) { 3583 struct amd_iommu *iommu; 3584 int qdep; 3585 3586 BUG_ON(!dev_data->ats.enabled); 3587 3588 qdep = dev_data->ats.qdep; 3589 iommu = amd_iommu_rlookup_table[dev_data->devid]; 3590 3591 build_inv_iotlb_pasid(&cmd, dev_data->devid, pasid, 3592 qdep, address, size); 3593 3594 ret = iommu_queue_command(iommu, &cmd); 3595 if (ret != 0) 3596 goto out; 3597 } 3598 3599 /* Wait until all device TLBs are flushed */ 3600 domain_flush_complete(domain); 3601 3602 ret = 0; 3603 3604out: 3605 3606 return ret; 3607} 3608 3609static int __amd_iommu_flush_page(struct protection_domain *domain, int pasid, 3610 u64 address) 3611{ 3612 INC_STATS_COUNTER(invalidate_iotlb); 3613 3614 return __flush_pasid(domain, pasid, address, false); 3615} 3616 3617int amd_iommu_flush_page(struct iommu_domain *dom, int pasid, 3618 u64 address) 3619{ 3620 struct protection_domain *domain = to_pdomain(dom); 3621 unsigned long flags; 3622 int ret; 3623 3624 spin_lock_irqsave(&domain->lock, flags); 3625 ret = __amd_iommu_flush_page(domain, pasid, address); 3626 spin_unlock_irqrestore(&domain->lock, flags); 3627 3628 return ret; 3629} 3630EXPORT_SYMBOL(amd_iommu_flush_page); 3631 3632static int __amd_iommu_flush_tlb(struct protection_domain *domain, int pasid) 3633{ 3634 INC_STATS_COUNTER(invalidate_iotlb_all); 3635 3636 return __flush_pasid(domain, pasid, CMD_INV_IOMMU_ALL_PAGES_ADDRESS, 3637 true); 3638} 3639 3640int amd_iommu_flush_tlb(struct iommu_domain *dom, int pasid) 3641{ 3642 struct protection_domain *domain = to_pdomain(dom); 3643 unsigned long flags; 3644 int ret; 3645 3646 spin_lock_irqsave(&domain->lock, flags); 3647 ret = __amd_iommu_flush_tlb(domain, pasid); 3648 spin_unlock_irqrestore(&domain->lock, flags); 3649 3650 return ret; 3651} 3652EXPORT_SYMBOL(amd_iommu_flush_tlb); 3653 3654static u64 *__get_gcr3_pte(u64 *root, int level, int pasid, bool alloc) 3655{ 3656 int index; 3657 u64 *pte; 3658 3659 while (true) { 3660 3661 index = (pasid >> (9 * level)) & 0x1ff; 3662 pte = &root[index]; 3663 3664 if (level == 0) 3665 break; 3666 3667 if (!(*pte & GCR3_VALID)) { 3668 if (!alloc) 3669 return NULL; 3670 3671 root = (void 
*)get_zeroed_page(GFP_ATOMIC); 3672 if (root == NULL) 3673 return NULL; 3674 3675 *pte = __pa(root) | GCR3_VALID; 3676 } 3677 3678 root = __va(*pte & PAGE_MASK); 3679 3680 level -= 1; 3681 } 3682 3683 return pte; 3684} 3685 3686static int __set_gcr3(struct protection_domain *domain, int pasid, 3687 unsigned long cr3) 3688{ 3689 u64 *pte; 3690 3691 if (domain->mode != PAGE_MODE_NONE) 3692 return -EINVAL; 3693 3694 pte = __get_gcr3_pte(domain->gcr3_tbl, domain->glx, pasid, true); 3695 if (pte == NULL) 3696 return -ENOMEM; 3697 3698 *pte = (cr3 & PAGE_MASK) | GCR3_VALID; 3699 3700 return __amd_iommu_flush_tlb(domain, pasid); 3701} 3702 3703static int __clear_gcr3(struct protection_domain *domain, int pasid) 3704{ 3705 u64 *pte; 3706 3707 if (domain->mode != PAGE_MODE_NONE) 3708 return -EINVAL; 3709 3710 pte = __get_gcr3_pte(domain->gcr3_tbl, domain->glx, pasid, false); 3711 if (pte == NULL) 3712 return 0; 3713 3714 *pte = 0; 3715 3716 return __amd_iommu_flush_tlb(domain, pasid); 3717} 3718 3719int amd_iommu_domain_set_gcr3(struct iommu_domain *dom, int pasid, 3720 unsigned long cr3) 3721{ 3722 struct protection_domain *domain = to_pdomain(dom); 3723 unsigned long flags; 3724 int ret; 3725 3726 spin_lock_irqsave(&domain->lock, flags); 3727 ret = __set_gcr3(domain, pasid, cr3); 3728 spin_unlock_irqrestore(&domain->lock, flags); 3729 3730 return ret; 3731} 3732EXPORT_SYMBOL(amd_iommu_domain_set_gcr3); 3733 3734int amd_iommu_domain_clear_gcr3(struct iommu_domain *dom, int pasid) 3735{ 3736 struct protection_domain *domain = to_pdomain(dom); 3737 unsigned long flags; 3738 int ret; 3739 3740 spin_lock_irqsave(&domain->lock, flags); 3741 ret = __clear_gcr3(domain, pasid); 3742 spin_unlock_irqrestore(&domain->lock, flags); 3743 3744 return ret; 3745} 3746EXPORT_SYMBOL(amd_iommu_domain_clear_gcr3); 3747 3748int amd_iommu_complete_ppr(struct pci_dev *pdev, int pasid, 3749 int status, int tag) 3750{ 3751 struct iommu_dev_data *dev_data; 3752 struct amd_iommu *iommu; 3753 struct iommu_cmd cmd; 3754 3755 INC_STATS_COUNTER(complete_ppr); 3756 3757 dev_data = get_dev_data(&pdev->dev); 3758 iommu = amd_iommu_rlookup_table[dev_data->devid]; 3759 3760 build_complete_ppr(&cmd, dev_data->devid, pasid, status, 3761 tag, dev_data->pri_tlp); 3762 3763 return iommu_queue_command(iommu, &cmd); 3764} 3765EXPORT_SYMBOL(amd_iommu_complete_ppr); 3766 3767struct iommu_domain *amd_iommu_get_v2_domain(struct pci_dev *pdev) 3768{ 3769 struct protection_domain *pdomain; 3770 3771 pdomain = get_domain(&pdev->dev); 3772 if (IS_ERR(pdomain)) 3773 return NULL; 3774 3775 /* Only return IOMMUv2 domains */ 3776 if (!(pdomain->flags & PD_IOMMUV2_MASK)) 3777 return NULL; 3778 3779 return &pdomain->domain; 3780} 3781EXPORT_SYMBOL(amd_iommu_get_v2_domain); 3782 3783void amd_iommu_enable_device_erratum(struct pci_dev *pdev, u32 erratum) 3784{ 3785 struct iommu_dev_data *dev_data; 3786 3787 if (!amd_iommu_v2_supported()) 3788 return; 3789 3790 dev_data = get_dev_data(&pdev->dev); 3791 dev_data->errata |= (1 << erratum); 3792} 3793EXPORT_SYMBOL(amd_iommu_enable_device_erratum); 3794 3795int amd_iommu_device_info(struct pci_dev *pdev, 3796 struct amd_iommu_device_info *info) 3797{ 3798 int max_pasids; 3799 int pos; 3800 3801 if (pdev == NULL || info == NULL) 3802 return -EINVAL; 3803 3804 if (!amd_iommu_v2_supported()) 3805 return -EINVAL; 3806 3807 memset(info, 0, sizeof(*info)); 3808 3809 pos = pci_find_ext_capability(pdev, PCI_EXT_CAP_ID_ATS); 3810 if (pos) 3811 info->flags |= AMD_IOMMU_DEVICE_FLAG_ATS_SUP; 3812 3813 pos = 
pci_find_ext_capability(pdev, PCI_EXT_CAP_ID_PRI); 3814 if (pos) 3815 info->flags |= AMD_IOMMU_DEVICE_FLAG_PRI_SUP; 3816 3817 pos = pci_find_ext_capability(pdev, PCI_EXT_CAP_ID_PASID); 3818 if (pos) { 3819 int features; 3820 3821 max_pasids = 1 << (9 * (amd_iommu_max_glx_val + 1)); 3822 max_pasids = min(max_pasids, (1 << 20)); 3823 3824 info->flags |= AMD_IOMMU_DEVICE_FLAG_PASID_SUP; 3825 info->max_pasids = min(pci_max_pasids(pdev), max_pasids); 3826 3827 features = pci_pasid_features(pdev); 3828 if (features & PCI_PASID_CAP_EXEC) 3829 info->flags |= AMD_IOMMU_DEVICE_FLAG_EXEC_SUP; 3830 if (features & PCI_PASID_CAP_PRIV) 3831 info->flags |= AMD_IOMMU_DEVICE_FLAG_PRIV_SUP; 3832 } 3833 3834 return 0; 3835} 3836EXPORT_SYMBOL(amd_iommu_device_info); 3837 3838#ifdef CONFIG_IRQ_REMAP 3839 3840/***************************************************************************** 3841 * 3842 * Interrupt Remapping Implementation 3843 * 3844 *****************************************************************************/ 3845 3846union irte { 3847 u32 val; 3848 struct { 3849 u32 valid : 1, 3850 no_fault : 1, 3851 int_type : 3, 3852 rq_eoi : 1, 3853 dm : 1, 3854 rsvd_1 : 1, 3855 destination : 8, 3856 vector : 8, 3857 rsvd_2 : 8; 3858 } fields; 3859}; 3860 3861#define DTE_IRQ_PHYS_ADDR_MASK (((1ULL << 45)-1) << 6) 3862#define DTE_IRQ_REMAP_INTCTL (2ULL << 60) 3863#define DTE_IRQ_TABLE_LEN (8ULL << 1) 3864#define DTE_IRQ_REMAP_ENABLE 1ULL 3865 3866static void set_dte_irq_entry(u16 devid, struct irq_remap_table *table) 3867{ 3868 u64 dte; 3869 3870 dte = amd_iommu_dev_table[devid].data[2]; 3871 dte &= ~DTE_IRQ_PHYS_ADDR_MASK; 3872 dte |= virt_to_phys(table->table); 3873 dte |= DTE_IRQ_REMAP_INTCTL; 3874 dte |= DTE_IRQ_TABLE_LEN; 3875 dte |= DTE_IRQ_REMAP_ENABLE; 3876 3877 amd_iommu_dev_table[devid].data[2] = dte; 3878} 3879 3880#define IRTE_ALLOCATED (~1U) 3881 3882static struct irq_remap_table *get_irq_table(u16 devid, bool ioapic) 3883{ 3884 struct irq_remap_table *table = NULL; 3885 struct amd_iommu *iommu; 3886 unsigned long flags; 3887 u16 alias; 3888 3889 write_lock_irqsave(&amd_iommu_devtable_lock, flags); 3890 3891 iommu = amd_iommu_rlookup_table[devid]; 3892 if (!iommu) 3893 goto out_unlock; 3894 3895 table = irq_lookup_table[devid]; 3896 if (table) 3897 goto out; 3898 3899 alias = amd_iommu_alias_table[devid]; 3900 table = irq_lookup_table[alias]; 3901 if (table) { 3902 irq_lookup_table[devid] = table; 3903 set_dte_irq_entry(devid, table); 3904 iommu_flush_dte(iommu, devid); 3905 goto out; 3906 } 3907 3908 /* Nothing there yet, allocate new irq remapping table */ 3909 table = kzalloc(sizeof(*table), GFP_ATOMIC); 3910 if (!table) 3911 goto out; 3912 3913 /* Initialize table spin-lock */ 3914 spin_lock_init(&table->lock); 3915 3916 if (ioapic) 3917 /* Keep the first 32 indexes free for IOAPIC interrupts */ 3918 table->min_index = 32; 3919 3920 table->table = kmem_cache_alloc(amd_iommu_irq_cache, GFP_ATOMIC); 3921 if (!table->table) { 3922 kfree(table); 3923 table = NULL; 3924 goto out; 3925 } 3926 3927 memset(table->table, 0, MAX_IRQS_PER_TABLE * sizeof(u32)); 3928 3929 if (ioapic) { 3930 int i; 3931 3932 for (i = 0; i < 32; ++i) 3933 table->table[i] = IRTE_ALLOCATED; 3934 } 3935 3936 irq_lookup_table[devid] = table; 3937 set_dte_irq_entry(devid, table); 3938 iommu_flush_dte(iommu, devid); 3939 if (devid != alias) { 3940 irq_lookup_table[alias] = table; 3941 set_dte_irq_entry(alias, table); 3942 iommu_flush_dte(iommu, alias); 3943 } 3944 3945out: 3946 iommu_completion_wait(iommu); 3947 3948out_unlock: 
3949 write_unlock_irqrestore(&amd_iommu_devtable_lock, flags); 3950 3951 return table; 3952} 3953 3954static int alloc_irq_index(struct irq_cfg *cfg, u16 devid, int count) 3955{ 3956 struct irq_remap_table *table; 3957 unsigned long flags; 3958 int index, c; 3959 3960 table = get_irq_table(devid, false); 3961 if (!table) 3962 return -ENODEV; 3963 3964 spin_lock_irqsave(&table->lock, flags); 3965 3966 /* Scan table for free entries */ 3967 for (c = 0, index = table->min_index; 3968 index < MAX_IRQS_PER_TABLE; 3969 ++index) { 3970 if (table->table[index] == 0) 3971 c += 1; 3972 else 3973 c = 0; 3974 3975 if (c == count) { 3976 struct irq_2_irte *irte_info; 3977 3978 for (; c != 0; --c) 3979 table->table[index - c + 1] = IRTE_ALLOCATED; 3980 3981 index -= count - 1; 3982 3983 cfg->remapped = 1; 3984 irte_info = &cfg->irq_2_irte; 3985 irte_info->devid = devid; 3986 irte_info->index = index; 3987 3988 goto out; 3989 } 3990 } 3991 3992 index = -ENOSPC; 3993 3994out: 3995 spin_unlock_irqrestore(&table->lock, flags); 3996 3997 return index; 3998} 3999 4000static int get_irte(u16 devid, int index, union irte *irte) 4001{ 4002 struct irq_remap_table *table; 4003 unsigned long flags; 4004 4005 table = get_irq_table(devid, false); 4006 if (!table) 4007 return -ENOMEM; 4008 4009 spin_lock_irqsave(&table->lock, flags); 4010 irte->val = table->table[index]; 4011 spin_unlock_irqrestore(&table->lock, flags); 4012 4013 return 0; 4014} 4015 4016static int modify_irte(u16 devid, int index, union irte irte) 4017{ 4018 struct irq_remap_table *table; 4019 struct amd_iommu *iommu; 4020 unsigned long flags; 4021 4022 iommu = amd_iommu_rlookup_table[devid]; 4023 if (iommu == NULL) 4024 return -EINVAL; 4025 4026 table = get_irq_table(devid, false); 4027 if (!table) 4028 return -ENOMEM; 4029 4030 spin_lock_irqsave(&table->lock, flags); 4031 table->table[index] = irte.val; 4032 spin_unlock_irqrestore(&table->lock, flags); 4033 4034 iommu_flush_irt(iommu, devid); 4035 iommu_completion_wait(iommu); 4036 4037 return 0; 4038} 4039 4040static void free_irte(u16 devid, int index) 4041{ 4042 struct irq_remap_table *table; 4043 struct amd_iommu *iommu; 4044 unsigned long flags; 4045 4046 iommu = amd_iommu_rlookup_table[devid]; 4047 if (iommu == NULL) 4048 return; 4049 4050 table = get_irq_table(devid, false); 4051 if (!table) 4052 return; 4053 4054 spin_lock_irqsave(&table->lock, flags); 4055 table->table[index] = 0; 4056 spin_unlock_irqrestore(&table->lock, flags); 4057 4058 iommu_flush_irt(iommu, devid); 4059 iommu_completion_wait(iommu); 4060} 4061 4062static int setup_ioapic_entry(int irq, struct IO_APIC_route_entry *entry, 4063 unsigned int destination, int vector, 4064 struct io_apic_irq_attr *attr) 4065{ 4066 struct irq_remap_table *table; 4067 struct irq_2_irte *irte_info; 4068 struct irq_cfg *cfg; 4069 union irte irte; 4070 int ioapic_id; 4071 int index; 4072 int devid; 4073 int ret; 4074 4075 cfg = irq_cfg(irq); 4076 if (!cfg) 4077 return -EINVAL; 4078 4079 irte_info = &cfg->irq_2_irte; 4080 ioapic_id = mpc_ioapic_id(attr->ioapic); 4081 devid = get_ioapic_devid(ioapic_id); 4082 4083 if (devid < 0) 4084 return devid; 4085 4086 table = get_irq_table(devid, true); 4087 if (table == NULL) 4088 return -ENOMEM; 4089 4090 index = attr->ioapic_pin; 4091 4092 /* Setup IRQ remapping info */ 4093 cfg->remapped = 1; 4094 irte_info->devid = devid; 4095 irte_info->index = index; 4096 4097 /* Setup IRTE for IOMMU */ 4098 irte.val = 0; 4099 irte.fields.vector = vector; 4100 irte.fields.int_type = apic->irq_delivery_mode; 4101 
irte.fields.destination = destination; 4102 irte.fields.dm = apic->irq_dest_mode; 4103 irte.fields.valid = 1; 4104 4105 ret = modify_irte(devid, index, irte); 4106 if (ret) 4107 return ret; 4108 4109 /* Setup IOAPIC entry */ 4110 memset(entry, 0, sizeof(*entry)); 4111 4112 entry->vector = index; 4113 entry->mask = 0; 4114 entry->trigger = attr->trigger; 4115 entry->polarity = attr->polarity; 4116 4117 /* 4118 * Mask level triggered irqs. 4119 */ 4120 if (attr->trigger) 4121 entry->mask = 1; 4122 4123 return 0; 4124} 4125 4126static int set_affinity(struct irq_data *data, const struct cpumask *mask, 4127 bool force) 4128{ 4129 struct irq_2_irte *irte_info; 4130 unsigned int dest, irq; 4131 struct irq_cfg *cfg; 4132 union irte irte; 4133 int err; 4134 4135 if (!config_enabled(CONFIG_SMP)) 4136 return -1; 4137 4138 cfg = irqd_cfg(data); 4139 irq = data->irq; 4140 irte_info = &cfg->irq_2_irte; 4141 4142 if (!cpumask_intersects(mask, cpu_online_mask)) 4143 return -EINVAL; 4144 4145 if (get_irte(irte_info->devid, irte_info->index, &irte)) 4146 return -EBUSY; 4147 4148 if (assign_irq_vector(irq, cfg, mask)) 4149 return -EBUSY; 4150 4151 err = apic->cpu_mask_to_apicid_and(cfg->domain, mask, &dest); 4152 if (err) { 4153 if (assign_irq_vector(irq, cfg, data->affinity)) 4154 pr_err("AMD-Vi: Failed to recover vector for irq %d\n", irq); 4155 return err; 4156 } 4157 4158 irte.fields.vector = cfg->vector; 4159 irte.fields.destination = dest; 4160 4161 modify_irte(irte_info->devid, irte_info->index, irte); 4162 4163 if (cfg->move_in_progress) 4164 send_cleanup_vector(cfg); 4165 4166 cpumask_copy(data->affinity, mask); 4167 4168 return 0; 4169} 4170 4171static int free_irq(int irq) 4172{ 4173 struct irq_2_irte *irte_info; 4174 struct irq_cfg *cfg; 4175 4176 cfg = irq_cfg(irq); 4177 if (!cfg) 4178 return -EINVAL; 4179 4180 irte_info = &cfg->irq_2_irte; 4181 4182 free_irte(irte_info->devid, irte_info->index); 4183 4184 return 0; 4185} 4186 4187static void compose_msi_msg(struct pci_dev *pdev, 4188 unsigned int irq, unsigned int dest, 4189 struct msi_msg *msg, u8 hpet_id) 4190{ 4191 struct irq_2_irte *irte_info; 4192 struct irq_cfg *cfg; 4193 union irte irte; 4194 4195 cfg = irq_cfg(irq); 4196 if (!cfg) 4197 return; 4198 4199 irte_info = &cfg->irq_2_irte; 4200 4201 irte.val = 0; 4202 irte.fields.vector = cfg->vector; 4203 irte.fields.int_type = apic->irq_delivery_mode; 4204 irte.fields.destination = dest; 4205 irte.fields.dm = apic->irq_dest_mode; 4206 irte.fields.valid = 1; 4207 4208 modify_irte(irte_info->devid, irte_info->index, irte); 4209 4210 msg->address_hi = MSI_ADDR_BASE_HI; 4211 msg->address_lo = MSI_ADDR_BASE_LO; 4212 msg->data = irte_info->index; 4213} 4214 4215static int msi_alloc_irq(struct pci_dev *pdev, int irq, int nvec) 4216{ 4217 struct irq_cfg *cfg; 4218 int index; 4219 u16 devid; 4220 4221 if (!pdev) 4222 return -EINVAL; 4223 4224 cfg = irq_cfg(irq); 4225 if (!cfg) 4226 return -EINVAL; 4227 4228 devid = get_device_id(&pdev->dev); 4229 index = alloc_irq_index(cfg, devid, nvec); 4230 4231 return index < 0 ? 
MAX_IRQS_PER_TABLE : index; 4232} 4233 4234static int msi_setup_irq(struct pci_dev *pdev, unsigned int irq, 4235 int index, int offset) 4236{ 4237 struct irq_2_irte *irte_info; 4238 struct irq_cfg *cfg; 4239 u16 devid; 4240 4241 if (!pdev) 4242 return -EINVAL; 4243 4244 cfg = irq_cfg(irq); 4245 if (!cfg) 4246 return -EINVAL; 4247 4248 if (index >= MAX_IRQS_PER_TABLE) 4249 return 0; 4250 4251 devid = get_device_id(&pdev->dev); 4252 irte_info = &cfg->irq_2_irte; 4253 4254 cfg->remapped = 1; 4255 irte_info->devid = devid; 4256 irte_info->index = index + offset; 4257 4258 return 0; 4259} 4260 4261static int alloc_hpet_msi(unsigned int irq, unsigned int id) 4262{ 4263 struct irq_2_irte *irte_info; 4264 struct irq_cfg *cfg; 4265 int index, devid; 4266 4267 cfg = irq_cfg(irq); 4268 if (!cfg) 4269 return -EINVAL; 4270 4271 irte_info = &cfg->irq_2_irte; 4272 devid = get_hpet_devid(id); 4273 if (devid < 0) 4274 return devid; 4275 4276 index = alloc_irq_index(cfg, devid, 1); 4277 if (index < 0) 4278 return index; 4279 4280 cfg->remapped = 1; 4281 irte_info->devid = devid; 4282 irte_info->index = index; 4283 4284 return 0; 4285} 4286 4287struct irq_remap_ops amd_iommu_irq_ops = { 4288 .prepare = amd_iommu_prepare, 4289 .enable = amd_iommu_enable, 4290 .disable = amd_iommu_disable, 4291 .reenable = amd_iommu_reenable, 4292 .enable_faulting = amd_iommu_enable_faulting, 4293 .setup_ioapic_entry = setup_ioapic_entry, 4294 .set_affinity = set_affinity, 4295 .free_irq = free_irq, 4296 .compose_msi_msg = compose_msi_msg, 4297 .msi_alloc_irq = msi_alloc_irq, 4298 .msi_setup_irq = msi_setup_irq, 4299 .alloc_hpet_msi = alloc_hpet_msi, 4300}; 4301#endif 4302
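
/*
 * A minimal usage sketch of the exported IOMMUv2/PASID interface above,
 * kept under "#if 0" because it is illustrative only and not part of the
 * driver.  The wrapper name example_bind_pasid(), the PASID-count value
 * and the omitted cleanup (amd_iommu_domain_clear_gcr3) are assumptions;
 * the amd_iommu_* calls themselves are the ones exported in this file.
 */
#if 0
static int example_bind_pasid(struct iommu_domain *dom, int pasid,
			      unsigned long cr3)
{
	int ret;

	/* Drop the host page table; translation happens per PASID via GCR3 */
	amd_iommu_domain_direct_map(dom);

	/* Switch the (still device-less) domain to IOMMUv2 mode */
	ret = amd_iommu_domain_enable_v2(dom, 1 << 16);
	if (ret)
		return ret;

	/*
	 * Load the process page table into the GCR3 entry for this PASID;
	 * the call flushes the IOMMU and device TLBs for the PASID itself.
	 */
	return amd_iommu_domain_set_gcr3(dom, pasid, cr3);
}
#endif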