1/*
2 * Copyright (C) 2007-2010 Advanced Micro Devices, Inc.
3 * Author: Joerg Roedel <jroedel@suse.de>
4 *         Leo Duran <leo.duran@amd.com>
5 *
6 * This program is free software; you can redistribute it and/or modify it
7 * under the terms of the GNU General Public License version 2 as published
8 * by the Free Software Foundation.
9 *
10 * This program is distributed in the hope that it will be useful,
11 * but WITHOUT ANY WARRANTY; without even the implied warranty of
12 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
13 * GNU General Public License for more details.
14 *
15 * You should have received a copy of the GNU General Public License
16 * along with this program; if not, write to the Free Software
17 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307 USA
18 */
19
20#include <linux/ratelimit.h>
21#include <linux/pci.h>
22#include <linux/pci-ats.h>
23#include <linux/bitmap.h>
24#include <linux/slab.h>
25#include <linux/debugfs.h>
26#include <linux/scatterlist.h>
27#include <linux/dma-mapping.h>
28#include <linux/iommu-helper.h>
29#include <linux/iommu.h>
30#include <linux/delay.h>
31#include <linux/amd-iommu.h>
32#include <linux/notifier.h>
33#include <linux/export.h>
34#include <linux/irq.h>
35#include <linux/msi.h>
36#include <linux/dma-contiguous.h>
37#include <asm/irq_remapping.h>
38#include <asm/io_apic.h>
39#include <asm/apic.h>
40#include <asm/hw_irq.h>
41#include <asm/msidef.h>
42#include <asm/proto.h>
43#include <asm/iommu.h>
44#include <asm/gart.h>
45#include <asm/dma.h>
46
47#include "amd_iommu_proto.h"
48#include "amd_iommu_types.h"
49#include "irq_remapping.h"
50
51#define CMD_SET_TYPE(cmd, t) ((cmd)->data[1] |= ((t) << 28))
52
53#define LOOP_TIMEOUT	100000
54
55/*
56 * This bitmap is used to advertise the page sizes our hardware supports
57 * to the IOMMU core, which will then use this information to split
58 * physically contiguous memory regions it is mapping into page sizes
59 * that we support.
60 *
61 * 512GB Pages are not supported due to a hardware bug
62 */
63#define AMD_IOMMU_PGSIZES	((~0xFFFUL) & ~(2ULL << 38))
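/*
 * For illustration only: with the mask above every power-of-two size from
 * 4KB (bit 12) upwards is advertised, except that bit 39 is cleared because
 * (2ULL << 38) == (1ULL << 39), i.e. the 512GB page size mentioned above.
 */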
64
65static DEFINE_RWLOCK(amd_iommu_devtable_lock);
66
67/* A list of preallocated protection domains */
68static LIST_HEAD(iommu_pd_list);
69static DEFINE_SPINLOCK(iommu_pd_list_lock);
70
71/* List of all available dev_data structures */
72static LIST_HEAD(dev_data_list);
73static DEFINE_SPINLOCK(dev_data_list_lock);
74
75LIST_HEAD(ioapic_map);
76LIST_HEAD(hpet_map);
77
78/*
79 * Domain for untranslated devices - only allocated
80 * if iommu=pt is passed on the kernel command line.
81 */
82static struct protection_domain *pt_domain;
83
84static const struct iommu_ops amd_iommu_ops;
85
86static ATOMIC_NOTIFIER_HEAD(ppr_notifier);
87int amd_iommu_max_glx_val = -1;
88
89static struct dma_map_ops amd_iommu_dma_ops;
90
91/*
92 * This struct contains device specific data for the IOMMU
93 */
94struct iommu_dev_data {
95	struct list_head list;		  /* For domain->dev_list */
96	struct list_head dev_data_list;	  /* For global dev_data_list */
97	struct list_head alias_list;      /* Link alias-groups together */
98	struct iommu_dev_data *alias_data;/* The alias dev_data */
99	struct protection_domain *domain; /* Domain the device is bound to */
100	u16 devid;			  /* PCI Device ID */
101	bool iommu_v2;			  /* Device can make use of IOMMUv2 */
102	bool passthrough;		  /* Default for device is pt_domain */
103	struct {
104		bool enabled;
105		int qdep;
106	} ats;				  /* ATS state */
107	bool pri_tlp;			  /* PASID TLB required for
108					     PPR completions */
109	u32 errata;			  /* Bitmap for errata to apply */
110};
111
112/*
113 * General struct to manage commands sent to an IOMMU
114 */
115struct iommu_cmd {
116	u32 data[4];
117};
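/*
 * Illustrative note: a command is four 32-bit words and CMD_SET_TYPE()
 * places the opcode into bits 31:28 of data[1]. A minimal sketch of an
 * INVALIDATE_DEV_ENTRY command therefore looks like build_inv_dte() below:
 *
 *	memset(&cmd, 0, sizeof(cmd));
 *	cmd.data[0] = devid;
 *	CMD_SET_TYPE(&cmd, CMD_INV_DEV_ENTRY);
 */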
118
119struct kmem_cache *amd_iommu_irq_cache;
120
121static void update_domain(struct protection_domain *domain);
122static int __init alloc_passthrough_domain(void);
123
124/****************************************************************************
125 *
126 * Helper functions
127 *
128 ****************************************************************************/
129
130static struct protection_domain *to_pdomain(struct iommu_domain *dom)
131{
132	return container_of(dom, struct protection_domain, domain);
133}
134
135static struct iommu_dev_data *alloc_dev_data(u16 devid)
136{
137	struct iommu_dev_data *dev_data;
138	unsigned long flags;
139
140	dev_data = kzalloc(sizeof(*dev_data), GFP_KERNEL);
141	if (!dev_data)
142		return NULL;
143
144	INIT_LIST_HEAD(&dev_data->alias_list);
145
146	dev_data->devid = devid;
147
148	spin_lock_irqsave(&dev_data_list_lock, flags);
149	list_add_tail(&dev_data->dev_data_list, &dev_data_list);
150	spin_unlock_irqrestore(&dev_data_list_lock, flags);
151
152	return dev_data;
153}
154
155static void free_dev_data(struct iommu_dev_data *dev_data)
156{
157	unsigned long flags;
158
159	spin_lock_irqsave(&dev_data_list_lock, flags);
160	list_del(&dev_data->dev_data_list);
161	spin_unlock_irqrestore(&dev_data_list_lock, flags);
162
163	kfree(dev_data);
164}
165
166static struct iommu_dev_data *search_dev_data(u16 devid)
167{
168	struct iommu_dev_data *dev_data;
169	unsigned long flags;
170
171	spin_lock_irqsave(&dev_data_list_lock, flags);
172	list_for_each_entry(dev_data, &dev_data_list, dev_data_list) {
173		if (dev_data->devid == devid)
174			goto out_unlock;
175	}
176
177	dev_data = NULL;
178
179out_unlock:
180	spin_unlock_irqrestore(&dev_data_list_lock, flags);
181
182	return dev_data;
183}
184
185static struct iommu_dev_data *find_dev_data(u16 devid)
186{
187	struct iommu_dev_data *dev_data;
188
189	dev_data = search_dev_data(devid);
190
191	if (dev_data == NULL)
192		dev_data = alloc_dev_data(devid);
193
194	return dev_data;
195}
196
197static inline u16 get_device_id(struct device *dev)
198{
199	struct pci_dev *pdev = to_pci_dev(dev);
200
201	return PCI_DEVID(pdev->bus->number, pdev->devfn);
202}
203
204static struct iommu_dev_data *get_dev_data(struct device *dev)
205{
206	return dev->archdata.iommu;
207}
208
209static bool pci_iommuv2_capable(struct pci_dev *pdev)
210{
211	static const int caps[] = {
212		PCI_EXT_CAP_ID_ATS,
213		PCI_EXT_CAP_ID_PRI,
214		PCI_EXT_CAP_ID_PASID,
215	};
216	int i, pos;
217
218	for (i = 0; i < 3; ++i) {
219		pos = pci_find_ext_capability(pdev, caps[i]);
220		if (pos == 0)
221			return false;
222	}
223
224	return true;
225}
226
227static bool pdev_pri_erratum(struct pci_dev *pdev, u32 erratum)
228{
229	struct iommu_dev_data *dev_data;
230
231	dev_data = get_dev_data(&pdev->dev);
232
233	return dev_data->errata & (1 << erratum) ? true : false;
234}
235
236/*
237 * In this function the list of preallocated protection domains is traversed to
238 * find the domain for a specific device
239 */
240static struct dma_ops_domain *find_protection_domain(u16 devid)
241{
242	struct dma_ops_domain *entry, *ret = NULL;
243	unsigned long flags;
244	u16 alias = amd_iommu_alias_table[devid];
245
246	if (list_empty(&iommu_pd_list))
247		return NULL;
248
249	spin_lock_irqsave(&iommu_pd_list_lock, flags);
250
251	list_for_each_entry(entry, &iommu_pd_list, list) {
252		if (entry->target_dev == devid ||
253		    entry->target_dev == alias) {
254			ret = entry;
255			break;
256		}
257	}
258
259	spin_unlock_irqrestore(&iommu_pd_list_lock, flags);
260
261	return ret;
262}
263
264/*
265 * This function checks if the driver got a valid device from the caller to
266 * avoid dereferencing invalid pointers.
267 */
268static bool check_device(struct device *dev)
269{
270	u16 devid;
271
272	if (!dev || !dev->dma_mask)
273		return false;
274
275	/* No PCI device */
276	if (!dev_is_pci(dev))
277		return false;
278
279	devid = get_device_id(dev);
280
281	/* Out of our scope? */
282	if (devid > amd_iommu_last_bdf)
283		return false;
284
285	if (amd_iommu_rlookup_table[devid] == NULL)
286		return false;
287
288	return true;
289}
290
291static void init_iommu_group(struct device *dev)
292{
293	struct iommu_group *group;
294
295	group = iommu_group_get_for_dev(dev);
296	if (!IS_ERR(group))
297		iommu_group_put(group);
298}
299
300static int __last_alias(struct pci_dev *pdev, u16 alias, void *data)
301{
302	*(u16 *)data = alias;
303	return 0;
304}
305
306static u16 get_alias(struct device *dev)
307{
308	struct pci_dev *pdev = to_pci_dev(dev);
309	u16 devid, ivrs_alias, pci_alias;
310
311	devid = get_device_id(dev);
312	ivrs_alias = amd_iommu_alias_table[devid];
313	pci_for_each_dma_alias(pdev, __last_alias, &pci_alias);
314
315	if (ivrs_alias == pci_alias)
316		return ivrs_alias;
317
318	/*
319	 * DMA alias showdown
320	 *
321	 * The IVRS is fairly reliable in telling us about aliases, but it
322	 * can't know about every screwy device.  If we don't have an IVRS
323	 * reported alias, use the PCI reported alias.  In that case we may
324	 * still need to initialize the rlookup and dev_table entries if the
325	 * alias is to a non-existent device.
326	 */
327	if (ivrs_alias == devid) {
328		if (!amd_iommu_rlookup_table[pci_alias]) {
329			amd_iommu_rlookup_table[pci_alias] =
330				amd_iommu_rlookup_table[devid];
331			memcpy(amd_iommu_dev_table[pci_alias].data,
332			       amd_iommu_dev_table[devid].data,
333			       sizeof(amd_iommu_dev_table[pci_alias].data));
334		}
335
336		return pci_alias;
337	}
338
339	pr_info("AMD-Vi: Using IVRS reported alias %02x:%02x.%d "
340		"for device %s[%04x:%04x], kernel reported alias "
341		"%02x:%02x.%d\n", PCI_BUS_NUM(ivrs_alias), PCI_SLOT(ivrs_alias),
342		PCI_FUNC(ivrs_alias), dev_name(dev), pdev->vendor, pdev->device,
343		PCI_BUS_NUM(pci_alias), PCI_SLOT(pci_alias),
344		PCI_FUNC(pci_alias));
345
346	/*
347	 * If we don't have a PCI DMA alias and the IVRS alias is on the same
348	 * bus, then the IVRS table may know about a quirk that we don't.
349	 */
350	if (pci_alias == devid &&
351	    PCI_BUS_NUM(ivrs_alias) == pdev->bus->number) {
352		pdev->dev_flags |= PCI_DEV_FLAGS_DMA_ALIAS_DEVFN;
353		pdev->dma_alias_devfn = ivrs_alias & 0xff;
354		pr_info("AMD-Vi: Added PCI DMA alias %02x.%d for %s\n",
355			PCI_SLOT(ivrs_alias), PCI_FUNC(ivrs_alias),
356			dev_name(dev));
357	}
358
359	return ivrs_alias;
360}
361
362static int iommu_init_device(struct device *dev)
363{
364	struct pci_dev *pdev = to_pci_dev(dev);
365	struct iommu_dev_data *dev_data;
366	u16 alias;
367
368	if (dev->archdata.iommu)
369		return 0;
370
371	dev_data = find_dev_data(get_device_id(dev));
372	if (!dev_data)
373		return -ENOMEM;
374
375	alias = get_alias(dev);
376
377	if (alias != dev_data->devid) {
378		struct iommu_dev_data *alias_data;
379
380		alias_data = find_dev_data(alias);
381		if (alias_data == NULL) {
382			pr_err("AMD-Vi: Warning: Unhandled device %s\n",
383					dev_name(dev));
384			free_dev_data(dev_data);
385			return -ENOTSUPP;
386		}
387		dev_data->alias_data = alias_data;
388
389		/* Add device to the alias_list */
390		list_add(&dev_data->alias_list, &alias_data->alias_list);
391	}
392
393	if (pci_iommuv2_capable(pdev)) {
394		struct amd_iommu *iommu;
395
396		iommu              = amd_iommu_rlookup_table[dev_data->devid];
397		dev_data->iommu_v2 = iommu->is_iommu_v2;
398	}
399
400	dev->archdata.iommu = dev_data;
401
402	iommu_device_link(amd_iommu_rlookup_table[dev_data->devid]->iommu_dev,
403			  dev);
404
405	return 0;
406}
407
408static void iommu_ignore_device(struct device *dev)
409{
410	u16 devid, alias;
411
412	devid = get_device_id(dev);
413	alias = amd_iommu_alias_table[devid];
414
415	memset(&amd_iommu_dev_table[devid], 0, sizeof(struct dev_table_entry));
416	memset(&amd_iommu_dev_table[alias], 0, sizeof(struct dev_table_entry));
417
418	amd_iommu_rlookup_table[devid] = NULL;
419	amd_iommu_rlookup_table[alias] = NULL;
420}
421
422static void iommu_uninit_device(struct device *dev)
423{
424	struct iommu_dev_data *dev_data = search_dev_data(get_device_id(dev));
425
426	if (!dev_data)
427		return;
428
429	iommu_device_unlink(amd_iommu_rlookup_table[dev_data->devid]->iommu_dev,
430			    dev);
431
432	iommu_group_remove_device(dev);
433
434	/* Unlink from alias, it may change if another device is re-plugged */
435	dev_data->alias_data = NULL;
436
437	/*
438	 * We keep dev_data around for unplugged devices and reuse it when the
439	 * device is re-plugged - not doing so would introduce a ton of races.
440	 */
441}
442
443void __init amd_iommu_uninit_devices(void)
444{
445	struct iommu_dev_data *dev_data, *n;
446	struct pci_dev *pdev = NULL;
447
448	for_each_pci_dev(pdev) {
449
450		if (!check_device(&pdev->dev))
451			continue;
452
453		iommu_uninit_device(&pdev->dev);
454	}
455
456	/* Free all of our dev_data structures */
457	list_for_each_entry_safe(dev_data, n, &dev_data_list, dev_data_list)
458		free_dev_data(dev_data);
459}
460
461int __init amd_iommu_init_devices(void)
462{
463	struct pci_dev *pdev = NULL;
464	int ret = 0;
465
466	for_each_pci_dev(pdev) {
467
468		if (!check_device(&pdev->dev))
469			continue;
470
471		ret = iommu_init_device(&pdev->dev);
472		if (ret == -ENOTSUPP)
473			iommu_ignore_device(&pdev->dev);
474		else if (ret)
475			goto out_free;
476	}
477
478	/*
479	 * Initialize IOMMU groups only after iommu_init_device() has
480	 * had a chance to populate any IVRS defined aliases.
481	 */
482	for_each_pci_dev(pdev) {
483		if (check_device(&pdev->dev))
484			init_iommu_group(&pdev->dev);
485	}
486
487	return 0;
488
489out_free:
490
491	amd_iommu_uninit_devices();
492
493	return ret;
494}
495#ifdef CONFIG_AMD_IOMMU_STATS
496
497/*
498 * Initialization code for statistics collection
499 */
500
501DECLARE_STATS_COUNTER(compl_wait);
502DECLARE_STATS_COUNTER(cnt_map_single);
503DECLARE_STATS_COUNTER(cnt_unmap_single);
504DECLARE_STATS_COUNTER(cnt_map_sg);
505DECLARE_STATS_COUNTER(cnt_unmap_sg);
506DECLARE_STATS_COUNTER(cnt_alloc_coherent);
507DECLARE_STATS_COUNTER(cnt_free_coherent);
508DECLARE_STATS_COUNTER(cross_page);
509DECLARE_STATS_COUNTER(domain_flush_single);
510DECLARE_STATS_COUNTER(domain_flush_all);
511DECLARE_STATS_COUNTER(alloced_io_mem);
512DECLARE_STATS_COUNTER(total_map_requests);
513DECLARE_STATS_COUNTER(complete_ppr);
514DECLARE_STATS_COUNTER(invalidate_iotlb);
515DECLARE_STATS_COUNTER(invalidate_iotlb_all);
516DECLARE_STATS_COUNTER(pri_requests);
517
518static struct dentry *stats_dir;
519static struct dentry *de_fflush;
520
521static void amd_iommu_stats_add(struct __iommu_counter *cnt)
522{
523	if (stats_dir == NULL)
524		return;
525
526	cnt->dent = debugfs_create_u64(cnt->name, 0444, stats_dir,
527				       &cnt->value);
528}
529
530static void amd_iommu_stats_init(void)
531{
532	stats_dir = debugfs_create_dir("amd-iommu", NULL);
533	if (stats_dir == NULL)
534		return;
535
536	de_fflush  = debugfs_create_bool("fullflush", 0444, stats_dir,
537					 &amd_iommu_unmap_flush);
538
539	amd_iommu_stats_add(&compl_wait);
540	amd_iommu_stats_add(&cnt_map_single);
541	amd_iommu_stats_add(&cnt_unmap_single);
542	amd_iommu_stats_add(&cnt_map_sg);
543	amd_iommu_stats_add(&cnt_unmap_sg);
544	amd_iommu_stats_add(&cnt_alloc_coherent);
545	amd_iommu_stats_add(&cnt_free_coherent);
546	amd_iommu_stats_add(&cross_page);
547	amd_iommu_stats_add(&domain_flush_single);
548	amd_iommu_stats_add(&domain_flush_all);
549	amd_iommu_stats_add(&alloced_io_mem);
550	amd_iommu_stats_add(&total_map_requests);
551	amd_iommu_stats_add(&complete_ppr);
552	amd_iommu_stats_add(&invalidate_iotlb);
553	amd_iommu_stats_add(&invalidate_iotlb_all);
554	amd_iommu_stats_add(&pri_requests);
555}
556
557#endif
558
559/****************************************************************************
560 *
561 * Interrupt handling functions
562 *
563 ****************************************************************************/
564
565static void dump_dte_entry(u16 devid)
566{
567	int i;
568
569	for (i = 0; i < 4; ++i)
570		pr_err("AMD-Vi: DTE[%d]: %016llx\n", i,
571			amd_iommu_dev_table[devid].data[i]);
572}
573
574static void dump_command(unsigned long phys_addr)
575{
576	struct iommu_cmd *cmd = phys_to_virt(phys_addr);
577	int i;
578
579	for (i = 0; i < 4; ++i)
580		pr_err("AMD-Vi: CMD[%d]: %08x\n", i, cmd->data[i]);
581}
582
583static void iommu_print_event(struct amd_iommu *iommu, void *__evt)
584{
585	int type, devid, domid, flags;
586	volatile u32 *event = __evt;
587	int count = 0;
588	u64 address;
589
590retry:
591	type    = (event[1] >> EVENT_TYPE_SHIFT)  & EVENT_TYPE_MASK;
592	devid   = (event[0] >> EVENT_DEVID_SHIFT) & EVENT_DEVID_MASK;
593	domid   = (event[1] >> EVENT_DOMID_SHIFT) & EVENT_DOMID_MASK;
594	flags   = (event[1] >> EVENT_FLAGS_SHIFT) & EVENT_FLAGS_MASK;
595	address = (u64)(((u64)event[3]) << 32) | event[2];
596
597	if (type == 0) {
598		/* Did we hit the erratum? */
599		if (++count == LOOP_TIMEOUT) {
600			pr_err("AMD-Vi: No event written to event log\n");
601			return;
602		}
603		udelay(1);
604		goto retry;
605	}
606
607	printk(KERN_ERR "AMD-Vi: Event logged [");
608
609	switch (type) {
610	case EVENT_TYPE_ILL_DEV:
611		printk("ILLEGAL_DEV_TABLE_ENTRY device=%02x:%02x.%x "
612		       "address=0x%016llx flags=0x%04x]\n",
613		       PCI_BUS_NUM(devid), PCI_SLOT(devid), PCI_FUNC(devid),
614		       address, flags);
615		dump_dte_entry(devid);
616		break;
617	case EVENT_TYPE_IO_FAULT:
618		printk("IO_PAGE_FAULT device=%02x:%02x.%x "
619		       "domain=0x%04x address=0x%016llx flags=0x%04x]\n",
620		       PCI_BUS_NUM(devid), PCI_SLOT(devid), PCI_FUNC(devid),
621		       domid, address, flags);
622		break;
623	case EVENT_TYPE_DEV_TAB_ERR:
624		printk("DEV_TAB_HARDWARE_ERROR device=%02x:%02x.%x "
625		       "address=0x%016llx flags=0x%04x]\n",
626		       PCI_BUS_NUM(devid), PCI_SLOT(devid), PCI_FUNC(devid),
627		       address, flags);
628		break;
629	case EVENT_TYPE_PAGE_TAB_ERR:
630		printk("PAGE_TAB_HARDWARE_ERROR device=%02x:%02x.%x "
631		       "domain=0x%04x address=0x%016llx flags=0x%04x]\n",
632		       PCI_BUS_NUM(devid), PCI_SLOT(devid), PCI_FUNC(devid),
633		       domid, address, flags);
634		break;
635	case EVENT_TYPE_ILL_CMD:
636		printk("ILLEGAL_COMMAND_ERROR address=0x%016llx]\n", address);
637		dump_command(address);
638		break;
639	case EVENT_TYPE_CMD_HARD_ERR:
640		printk("COMMAND_HARDWARE_ERROR address=0x%016llx "
641		       "flags=0x%04x]\n", address, flags);
642		break;
643	case EVENT_TYPE_IOTLB_INV_TO:
644		printk("IOTLB_INV_TIMEOUT device=%02x:%02x.%x "
645		       "address=0x%016llx]\n",
646		       PCI_BUS_NUM(devid), PCI_SLOT(devid), PCI_FUNC(devid),
647		       address);
648		break;
649	case EVENT_TYPE_INV_DEV_REQ:
650		printk("INVALID_DEVICE_REQUEST device=%02x:%02x.%x "
651		       "address=0x%016llx flags=0x%04x]\n",
652		       PCI_BUS_NUM(devid), PCI_SLOT(devid), PCI_FUNC(devid),
653		       address, flags);
654		break;
655	default:
656		printk(KERN_ERR "UNKNOWN type=0x%02x]\n", type);
657	}
658
659	memset(__evt, 0, 4 * sizeof(u32));
660}
661
662static void iommu_poll_events(struct amd_iommu *iommu)
663{
664	u32 head, tail;
665
666	head = readl(iommu->mmio_base + MMIO_EVT_HEAD_OFFSET);
667	tail = readl(iommu->mmio_base + MMIO_EVT_TAIL_OFFSET);
668
669	while (head != tail) {
670		iommu_print_event(iommu, iommu->evt_buf + head);
671		head = (head + EVENT_ENTRY_SIZE) % iommu->evt_buf_size;
672	}
673
674	writel(head, iommu->mmio_base + MMIO_EVT_HEAD_OFFSET);
675}
676
677static void iommu_handle_ppr_entry(struct amd_iommu *iommu, u64 *raw)
678{
679	struct amd_iommu_fault fault;
680
681	INC_STATS_COUNTER(pri_requests);
682
683	if (PPR_REQ_TYPE(raw[0]) != PPR_REQ_FAULT) {
684		pr_err_ratelimited("AMD-Vi: Unknown PPR request received\n");
685		return;
686	}
687
688	fault.address   = raw[1];
689	fault.pasid     = PPR_PASID(raw[0]);
690	fault.device_id = PPR_DEVID(raw[0]);
691	fault.tag       = PPR_TAG(raw[0]);
692	fault.flags     = PPR_FLAGS(raw[0]);
693
694	atomic_notifier_call_chain(&ppr_notifier, 0, &fault);
695}
696
697static void iommu_poll_ppr_log(struct amd_iommu *iommu)
698{
699	u32 head, tail;
700
701	if (iommu->ppr_log == NULL)
702		return;
703
704	head = readl(iommu->mmio_base + MMIO_PPR_HEAD_OFFSET);
705	tail = readl(iommu->mmio_base + MMIO_PPR_TAIL_OFFSET);
706
707	while (head != tail) {
708		volatile u64 *raw;
709		u64 entry[2];
710		int i;
711
712		raw = (u64 *)(iommu->ppr_log + head);
713
714		/*
715		 * Hardware bug: Interrupt may arrive before the entry is
716		 * written to memory. If this happens we need to wait for the
717		 * entry to arrive.
718		 */
719		for (i = 0; i < LOOP_TIMEOUT; ++i) {
720			if (PPR_REQ_TYPE(raw[0]) != 0)
721				break;
722			udelay(1);
723		}
724
725		/* Avoid memcpy function-call overhead */
726		entry[0] = raw[0];
727		entry[1] = raw[1];
728
729		/*
730		 * To detect the hardware bug we need to clear the entry
731		 * back to zero.
732		 */
733		raw[0] = raw[1] = 0UL;
734
735		/* Update head pointer of hardware ring-buffer */
736		head = (head + PPR_ENTRY_SIZE) % PPR_LOG_SIZE;
737		writel(head, iommu->mmio_base + MMIO_PPR_HEAD_OFFSET);
738
739		/* Handle PPR entry */
740		iommu_handle_ppr_entry(iommu, entry);
741
742		/* Refresh ring-buffer information */
743		head = readl(iommu->mmio_base + MMIO_PPR_HEAD_OFFSET);
744		tail = readl(iommu->mmio_base + MMIO_PPR_TAIL_OFFSET);
745	}
746}
747
748irqreturn_t amd_iommu_int_thread(int irq, void *data)
749{
750	struct amd_iommu *iommu = (struct amd_iommu *) data;
751	u32 status = readl(iommu->mmio_base + MMIO_STATUS_OFFSET);
752
753	while (status & (MMIO_STATUS_EVT_INT_MASK | MMIO_STATUS_PPR_INT_MASK)) {
754		/* Enable EVT and PPR interrupts again */
755		writel((MMIO_STATUS_EVT_INT_MASK | MMIO_STATUS_PPR_INT_MASK),
756			iommu->mmio_base + MMIO_STATUS_OFFSET);
757
758		if (status & MMIO_STATUS_EVT_INT_MASK) {
759			pr_devel("AMD-Vi: Processing IOMMU Event Log\n");
760			iommu_poll_events(iommu);
761		}
762
763		if (status & MMIO_STATUS_PPR_INT_MASK) {
764			pr_devel("AMD-Vi: Processing IOMMU PPR Log\n");
765			iommu_poll_ppr_log(iommu);
766		}
767
768		/*
769		 * Hardware bug: ERBT1312
770		 * When re-enabling interrupts (by writing 1
771		 * to clear the bit), the hardware might also try to set
772		 * the interrupt bit in the event status register.
773		 * In this scenario, the bit will be set and will disable
774		 * subsequent interrupts.
775		 *
776		 * Workaround: The IOMMU driver should read back the
777		 * status register and check if the interrupt bits are cleared.
778		 * If not, the driver will need to go through the interrupt
779		 * handler again and re-clear the bits.
780		 */
781		status = readl(iommu->mmio_base + MMIO_STATUS_OFFSET);
782	}
783	return IRQ_HANDLED;
784}
785
786irqreturn_t amd_iommu_int_handler(int irq, void *data)
787{
788	return IRQ_WAKE_THREAD;
789}
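/*
 * Note: the hard irq handler above only wakes the irq thread; all event
 * and PPR log processing happens in amd_iommu_int_thread() in process
 * context, where the ratelimited printing and notifier calls are safe.
 */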
790
791/****************************************************************************
792 *
793 * IOMMU command queuing functions
794 *
795 ****************************************************************************/
796
797static int wait_on_sem(volatile u64 *sem)
798{
799	int i = 0;
800
801	while (*sem == 0 && i < LOOP_TIMEOUT) {
802		udelay(1);
803		i += 1;
804	}
805
806	if (i == LOOP_TIMEOUT) {
807		pr_alert("AMD-Vi: Completion-Wait loop timed out\n");
808		return -EIO;
809	}
810
811	return 0;
812}
813
814static void copy_cmd_to_buffer(struct amd_iommu *iommu,
815			       struct iommu_cmd *cmd,
816			       u32 tail)
817{
818	u8 *target;
819
820	target = iommu->cmd_buf + tail;
821	tail   = (tail + sizeof(*cmd)) % iommu->cmd_buf_size;
822
823	/* Copy command to buffer */
824	memcpy(target, cmd, sizeof(*cmd));
825
826	/* Tell the IOMMU about it */
827	writel(tail, iommu->mmio_base + MMIO_CMD_TAIL_OFFSET);
828}
829
830static void build_completion_wait(struct iommu_cmd *cmd, u64 address)
831{
832	WARN_ON(address & 0x7ULL);
833
834	memset(cmd, 0, sizeof(*cmd));
835	cmd->data[0] = lower_32_bits(__pa(address)) | CMD_COMPL_WAIT_STORE_MASK;
836	cmd->data[1] = upper_32_bits(__pa(address));
837	cmd->data[2] = 1;
838	CMD_SET_TYPE(cmd, CMD_COMPL_WAIT);
839}
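/*
 * A minimal usage sketch (mirroring iommu_completion_wait() below): build a
 * COMPL_WAIT command that stores to a semaphore variable, queue it and poll
 * the semaphore until the IOMMU has processed everything queued before it:
 *
 *	volatile u64 sem = 0;
 *	struct iommu_cmd cmd;
 *
 *	build_completion_wait(&cmd, (u64)&sem);
 *	iommu_queue_command_sync(iommu, &cmd, false);
 *	wait_on_sem(&sem);
 */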
840
841static void build_inv_dte(struct iommu_cmd *cmd, u16 devid)
842{
843	memset(cmd, 0, sizeof(*cmd));
844	cmd->data[0] = devid;
845	CMD_SET_TYPE(cmd, CMD_INV_DEV_ENTRY);
846}
847
848static void build_inv_iommu_pages(struct iommu_cmd *cmd, u64 address,
849				  size_t size, u16 domid, int pde)
850{
851	u64 pages;
852	bool s;
853
854	pages = iommu_num_pages(address, size, PAGE_SIZE);
855	s     = false;
856
857	if (pages > 1) {
858		/*
859		 * If we have to flush more than one page, flush all
860		 * TLB entries for this domain
861		 */
862		address = CMD_INV_IOMMU_ALL_PAGES_ADDRESS;
863		s = true;
864	}
865
866	address &= PAGE_MASK;
867
868	memset(cmd, 0, sizeof(*cmd));
869	cmd->data[1] |= domid;
870	cmd->data[2]  = lower_32_bits(address);
871	cmd->data[3]  = upper_32_bits(address);
872	CMD_SET_TYPE(cmd, CMD_INV_IOMMU_PAGES);
873	if (s) /* size bit - we flush more than one 4kb page */
874		cmd->data[2] |= CMD_INV_IOMMU_PAGES_SIZE_MASK;
875	if (pde) /* PDE bit - we want to flush everything, not only the PTEs */
876		cmd->data[2] |= CMD_INV_IOMMU_PAGES_PDE_MASK;
877}
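/*
 * Worked example for the function above: flushing 16KB at address 0x10000
 * yields pages == 4, so the address is replaced by
 * CMD_INV_IOMMU_ALL_PAGES_ADDRESS and the size bit is set - anything larger
 * than a single 4KB page currently results in a full domain TLB flush.
 */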
878
879static void build_inv_iotlb_pages(struct iommu_cmd *cmd, u16 devid, int qdep,
880				  u64 address, size_t size)
881{
882	u64 pages;
883	bool s;
884
885	pages = iommu_num_pages(address, size, PAGE_SIZE);
886	s     = false;
887
888	if (pages > 1) {
889		/*
890		 * If we have to flush more than one page, flush all
891		 * TLB entries for this domain
892		 */
893		address = CMD_INV_IOMMU_ALL_PAGES_ADDRESS;
894		s = true;
895	}
896
897	address &= PAGE_MASK;
898
899	memset(cmd, 0, sizeof(*cmd));
900	cmd->data[0]  = devid;
901	cmd->data[0] |= (qdep & 0xff) << 24;
902	cmd->data[1]  = devid;
903	cmd->data[2]  = lower_32_bits(address);
904	cmd->data[3]  = upper_32_bits(address);
905	CMD_SET_TYPE(cmd, CMD_INV_IOTLB_PAGES);
906	if (s)
907		cmd->data[2] |= CMD_INV_IOMMU_PAGES_SIZE_MASK;
908}
909
910static void build_inv_iommu_pasid(struct iommu_cmd *cmd, u16 domid, int pasid,
911				  u64 address, bool size)
912{
913	memset(cmd, 0, sizeof(*cmd));
914
915	address &= ~(0xfffULL);
916
917	cmd->data[0]  = pasid;
918	cmd->data[1]  = domid;
919	cmd->data[2]  = lower_32_bits(address);
920	cmd->data[3]  = upper_32_bits(address);
921	cmd->data[2] |= CMD_INV_IOMMU_PAGES_PDE_MASK;
922	cmd->data[2] |= CMD_INV_IOMMU_PAGES_GN_MASK;
923	if (size)
924		cmd->data[2] |= CMD_INV_IOMMU_PAGES_SIZE_MASK;
925	CMD_SET_TYPE(cmd, CMD_INV_IOMMU_PAGES);
926}
927
928static void build_inv_iotlb_pasid(struct iommu_cmd *cmd, u16 devid, int pasid,
929				  int qdep, u64 address, bool size)
930{
931	memset(cmd, 0, sizeof(*cmd));
932
933	address &= ~(0xfffULL);
934
935	cmd->data[0]  = devid;
936	cmd->data[0] |= ((pasid >> 8) & 0xff) << 16;
937	cmd->data[0] |= (qdep  & 0xff) << 24;
938	cmd->data[1]  = devid;
939	cmd->data[1] |= (pasid & 0xff) << 16;
940	cmd->data[2]  = lower_32_bits(address);
941	cmd->data[2] |= CMD_INV_IOMMU_PAGES_GN_MASK;
942	cmd->data[3]  = upper_32_bits(address);
943	if (size)
944		cmd->data[2] |= CMD_INV_IOMMU_PAGES_SIZE_MASK;
945	CMD_SET_TYPE(cmd, CMD_INV_IOTLB_PAGES);
946}
947
948static void build_complete_ppr(struct iommu_cmd *cmd, u16 devid, int pasid,
949			       int status, int tag, bool gn)
950{
951	memset(cmd, 0, sizeof(*cmd));
952
953	cmd->data[0]  = devid;
954	if (gn) {
955		cmd->data[1]  = pasid;
956		cmd->data[2]  = CMD_INV_IOMMU_PAGES_GN_MASK;
957	}
958	cmd->data[3]  = tag & 0x1ff;
959	cmd->data[3] |= (status & PPR_STATUS_MASK) << PPR_STATUS_SHIFT;
960
961	CMD_SET_TYPE(cmd, CMD_COMPLETE_PPR);
962}
963
964static void build_inv_all(struct iommu_cmd *cmd)
965{
966	memset(cmd, 0, sizeof(*cmd));
967	CMD_SET_TYPE(cmd, CMD_INV_ALL);
968}
969
970static void build_inv_irt(struct iommu_cmd *cmd, u16 devid)
971{
972	memset(cmd, 0, sizeof(*cmd));
973	cmd->data[0] = devid;
974	CMD_SET_TYPE(cmd, CMD_INV_IRT);
975}
976
977/*
978 * Writes the command to the IOMMU's command buffer and informs the
979 * hardware about the new command.
980 */
981static int iommu_queue_command_sync(struct amd_iommu *iommu,
982				    struct iommu_cmd *cmd,
983				    bool sync)
984{
985	u32 left, tail, head, next_tail;
986	unsigned long flags;
987
988	WARN_ON(iommu->cmd_buf_size & CMD_BUFFER_UNINITIALIZED);
989
990again:
991	spin_lock_irqsave(&iommu->lock, flags);
992
993	head      = readl(iommu->mmio_base + MMIO_CMD_HEAD_OFFSET);
994	tail      = readl(iommu->mmio_base + MMIO_CMD_TAIL_OFFSET);
995	next_tail = (tail + sizeof(*cmd)) % iommu->cmd_buf_size;
996	left      = (head - next_tail) % iommu->cmd_buf_size;
997
998	if (left <= 2) {
999		struct iommu_cmd sync_cmd;
1000		volatile u64 sem = 0;
1001		int ret;
1002
1003		build_completion_wait(&sync_cmd, (u64)&sem);
1004		copy_cmd_to_buffer(iommu, &sync_cmd, tail);
1005
1006		spin_unlock_irqrestore(&iommu->lock, flags);
1007
1008		if ((ret = wait_on_sem(&sem)) != 0)
1009			return ret;
1010
1011		goto again;
1012	}
1013
1014	copy_cmd_to_buffer(iommu, cmd, tail);
1015
1016	/* We need to sync now to make sure all commands are processed */
1017	iommu->need_sync = sync;
1018
1019	spin_unlock_irqrestore(&iommu->lock, flags);
1020
1021	return 0;
1022}
1023
1024static int iommu_queue_command(struct amd_iommu *iommu, struct iommu_cmd *cmd)
1025{
1026	return iommu_queue_command_sync(iommu, cmd, true);
1027}
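/*
 * Note on the sync flag: iommu_queue_command() marks the IOMMU as
 * need_sync, so a later iommu_completion_wait() actually emits a
 * COMPL_WAIT. The typical pattern is therefore:
 *
 *	iommu_queue_command(iommu, &cmd);	(possibly many times)
 *	iommu_completion_wait(iommu);		(wait once for all of them)
 */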
1028
1029/*
1030 * This function queues a completion wait command into the command
1031 * buffer of an IOMMU
1032 */
1033static int iommu_completion_wait(struct amd_iommu *iommu)
1034{
1035	struct iommu_cmd cmd;
1036	volatile u64 sem = 0;
1037	int ret;
1038
1039	if (!iommu->need_sync)
1040		return 0;
1041
1042	build_completion_wait(&cmd, (u64)&sem);
1043
1044	ret = iommu_queue_command_sync(iommu, &cmd, false);
1045	if (ret)
1046		return ret;
1047
1048	return wait_on_sem(&sem);
1049}
1050
1051static int iommu_flush_dte(struct amd_iommu *iommu, u16 devid)
1052{
1053	struct iommu_cmd cmd;
1054
1055	build_inv_dte(&cmd, devid);
1056
1057	return iommu_queue_command(iommu, &cmd);
1058}
1059
1060static void iommu_flush_dte_all(struct amd_iommu *iommu)
1061{
1062	u32 devid;
1063
1064	for (devid = 0; devid <= 0xffff; ++devid)
1065		iommu_flush_dte(iommu, devid);
1066
1067	iommu_completion_wait(iommu);
1068}
1069
1070/*
1071 * This function uses heavy locking and may disable irqs for some time. But
1072 * this is no issue because it is only called during resume.
1073 */
1074static void iommu_flush_tlb_all(struct amd_iommu *iommu)
1075{
1076	u32 dom_id;
1077
1078	for (dom_id = 0; dom_id <= 0xffff; ++dom_id) {
1079		struct iommu_cmd cmd;
1080		build_inv_iommu_pages(&cmd, 0, CMD_INV_IOMMU_ALL_PAGES_ADDRESS,
1081				      dom_id, 1);
1082		iommu_queue_command(iommu, &cmd);
1083	}
1084
1085	iommu_completion_wait(iommu);
1086}
1087
1088static void iommu_flush_all(struct amd_iommu *iommu)
1089{
1090	struct iommu_cmd cmd;
1091
1092	build_inv_all(&cmd);
1093
1094	iommu_queue_command(iommu, &cmd);
1095	iommu_completion_wait(iommu);
1096}
1097
1098static void iommu_flush_irt(struct amd_iommu *iommu, u16 devid)
1099{
1100	struct iommu_cmd cmd;
1101
1102	build_inv_irt(&cmd, devid);
1103
1104	iommu_queue_command(iommu, &cmd);
1105}
1106
1107static void iommu_flush_irt_all(struct amd_iommu *iommu)
1108{
1109	u32 devid;
1110
1111	for (devid = 0; devid < MAX_DEV_TABLE_ENTRIES; devid++)
1112		iommu_flush_irt(iommu, devid);
1113
1114	iommu_completion_wait(iommu);
1115}
1116
1117void iommu_flush_all_caches(struct amd_iommu *iommu)
1118{
1119	if (iommu_feature(iommu, FEATURE_IA)) {
1120		iommu_flush_all(iommu);
1121	} else {
1122		iommu_flush_dte_all(iommu);
1123		iommu_flush_irt_all(iommu);
1124		iommu_flush_tlb_all(iommu);
1125	}
1126}
1127
1128/*
1129 * Command send function for flushing the on-device TLB
1130 */
1131static int device_flush_iotlb(struct iommu_dev_data *dev_data,
1132			      u64 address, size_t size)
1133{
1134	struct amd_iommu *iommu;
1135	struct iommu_cmd cmd;
1136	int qdep;
1137
1138	qdep     = dev_data->ats.qdep;
1139	iommu    = amd_iommu_rlookup_table[dev_data->devid];
1140
1141	build_inv_iotlb_pages(&cmd, dev_data->devid, qdep, address, size);
1142
1143	return iommu_queue_command(iommu, &cmd);
1144}
1145
1146/*
1147 * Command send function for invalidating a device table entry
1148 */
1149static int device_flush_dte(struct iommu_dev_data *dev_data)
1150{
1151	struct amd_iommu *iommu;
1152	int ret;
1153
1154	iommu = amd_iommu_rlookup_table[dev_data->devid];
1155
1156	ret = iommu_flush_dte(iommu, dev_data->devid);
1157	if (ret)
1158		return ret;
1159
1160	if (dev_data->ats.enabled)
1161		ret = device_flush_iotlb(dev_data, 0, ~0UL);
1162
1163	return ret;
1164}
1165
1166/*
1167 * TLB invalidation function which is called from the mapping functions.
1168 * It invalidates a single PTE if the range to flush is within a single
1169 * page. Otherwise it flushes the whole TLB of the IOMMU.
1170 */
1171static void __domain_flush_pages(struct protection_domain *domain,
1172				 u64 address, size_t size, int pde)
1173{
1174	struct iommu_dev_data *dev_data;
1175	struct iommu_cmd cmd;
1176	int ret = 0, i;
1177
1178	build_inv_iommu_pages(&cmd, address, size, domain->id, pde);
1179
1180	for (i = 0; i < amd_iommus_present; ++i) {
1181		if (!domain->dev_iommu[i])
1182			continue;
1183
1184		/*
1185		 * Devices of this domain are behind this IOMMU
1186		 * We need a TLB flush
1187		 */
1188		ret |= iommu_queue_command(amd_iommus[i], &cmd);
1189	}
1190
1191	list_for_each_entry(dev_data, &domain->dev_list, list) {
1192
1193		if (!dev_data->ats.enabled)
1194			continue;
1195
1196		ret |= device_flush_iotlb(dev_data, address, size);
1197	}
1198
1199	WARN_ON(ret);
1200}
1201
1202static void domain_flush_pages(struct protection_domain *domain,
1203			       u64 address, size_t size)
1204{
1205	__domain_flush_pages(domain, address, size, 0);
1206}
1207
1208/* Flush the whole IO/TLB for a given protection domain */
1209static void domain_flush_tlb(struct protection_domain *domain)
1210{
1211	__domain_flush_pages(domain, 0, CMD_INV_IOMMU_ALL_PAGES_ADDRESS, 0);
1212}
1213
1214/* Flush the whole IO/TLB for a given protection domain - including PDE */
1215static void domain_flush_tlb_pde(struct protection_domain *domain)
1216{
1217	__domain_flush_pages(domain, 0, CMD_INV_IOMMU_ALL_PAGES_ADDRESS, 1);
1218}
1219
1220static void domain_flush_complete(struct protection_domain *domain)
1221{
1222	int i;
1223
1224	for (i = 0; i < amd_iommus_present; ++i) {
1225		if (!domain->dev_iommu[i])
1226			continue;
1227
1228		/*
1229		 * Devices of this domain are behind this IOMMU
1230		 * We need to wait for completion of all commands.
1231		 */
1232		iommu_completion_wait(amd_iommus[i]);
1233	}
1234}
1235
1236
1237/*
1238 * This function flushes the DTEs for all devices in the domain.
1239 */
1240static void domain_flush_devices(struct protection_domain *domain)
1241{
1242	struct iommu_dev_data *dev_data;
1243
1244	list_for_each_entry(dev_data, &domain->dev_list, list)
1245		device_flush_dte(dev_data);
1246}
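/*
 * Illustrative flush sequence after a page-table change (roughly what the
 * mapping code does): invalidate the IO/TLBs including cached
 * page-directory entries, then wait for all affected IOMMUs to finish:
 *
 *	domain_flush_tlb_pde(domain);
 *	domain_flush_complete(domain);
 */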
1247
1248/****************************************************************************
1249 *
1250 * The functions below are used to create the page table mappings for
1251 * unity mapped regions.
1252 *
1253 ****************************************************************************/
1254
1255/*
1256 * This function is used to add another level to an IO page table. Adding
1257 * another level increases the size of the address space by 9 bits to a size up
1258 * to 64 bits.
1259 */
1260static bool increase_address_space(struct protection_domain *domain,
1261				   gfp_t gfp)
1262{
1263	u64 *pte;
1264
1265	if (domain->mode == PAGE_MODE_6_LEVEL)
1266		/* address space already 64 bit large */
1267		return false;
1268
1269	pte = (void *)get_zeroed_page(gfp);
1270	if (!pte)
1271		return false;
1272
1273	*pte             = PM_LEVEL_PDE(domain->mode,
1274					virt_to_phys(domain->pt_root));
1275	domain->pt_root  = pte;
1276	domain->mode    += 1;
1277	domain->updated  = true;
1278
1279	return true;
1280}
1281
1282static u64 *alloc_pte(struct protection_domain *domain,
1283		      unsigned long address,
1284		      unsigned long page_size,
1285		      u64 **pte_page,
1286		      gfp_t gfp)
1287{
1288	int level, end_lvl;
1289	u64 *pte, *page;
1290
1291	BUG_ON(!is_power_of_2(page_size));
1292
1293	while (address > PM_LEVEL_SIZE(domain->mode))
1294		increase_address_space(domain, gfp);
1295
1296	level   = domain->mode - 1;
1297	pte     = &domain->pt_root[PM_LEVEL_INDEX(level, address)];
1298	address = PAGE_SIZE_ALIGN(address, page_size);
1299	end_lvl = PAGE_SIZE_LEVEL(page_size);
1300
1301	while (level > end_lvl) {
1302		if (!IOMMU_PTE_PRESENT(*pte)) {
1303			page = (u64 *)get_zeroed_page(gfp);
1304			if (!page)
1305				return NULL;
1306			*pte = PM_LEVEL_PDE(level, virt_to_phys(page));
1307		}
1308
1309		/* No level skipping support yet */
1310		if (PM_PTE_LEVEL(*pte) != level)
1311			return NULL;
1312
1313		level -= 1;
1314
1315		pte = IOMMU_PTE_PAGE(*pte);
1316
1317		if (pte_page && level == end_lvl)
1318			*pte_page = pte;
1319
1320		pte = &pte[PM_LEVEL_INDEX(level, address)];
1321	}
1322
1323	return pte;
1324}
1325
1326/*
1327 * This function checks if there is a PTE for a given dma address. If
1328 * there is one, it returns the pointer to it.
1329 */
1330static u64 *fetch_pte(struct protection_domain *domain,
1331		      unsigned long address,
1332		      unsigned long *page_size)
1333{
1334	int level;
1335	u64 *pte;
1336
1337	if (address > PM_LEVEL_SIZE(domain->mode))
1338		return NULL;
1339
1340	level	   =  domain->mode - 1;
1341	pte	   = &domain->pt_root[PM_LEVEL_INDEX(level, address)];
1342	*page_size =  PTE_LEVEL_PAGE_SIZE(level);
1343
1344	while (level > 0) {
1345
1346		/* Not Present */
1347		if (!IOMMU_PTE_PRESENT(*pte))
1348			return NULL;
1349
1350		/* Large PTE */
1351		if (PM_PTE_LEVEL(*pte) == 7 ||
1352		    PM_PTE_LEVEL(*pte) == 0)
1353			break;
1354
1355		/* No level skipping support yet */
1356		if (PM_PTE_LEVEL(*pte) != level)
1357			return NULL;
1358
1359		level -= 1;
1360
1361		/* Walk to the next level */
1362		pte	   = IOMMU_PTE_PAGE(*pte);
1363		pte	   = &pte[PM_LEVEL_INDEX(level, address)];
1364		*page_size = PTE_LEVEL_PAGE_SIZE(level);
1365	}
1366
1367	if (PM_PTE_LEVEL(*pte) == 0x07) {
1368		unsigned long pte_mask;
1369
1370		/*
1371		 * If we have a series of large PTEs, make
1372		 * sure to return a pointer to the first one.
1373		 */
1374		*page_size = pte_mask = PTE_PAGE_SIZE(*pte);
1375		pte_mask   = ~((PAGE_SIZE_PTE_COUNT(pte_mask) << 3) - 1);
1376		pte        = (u64 *)(((unsigned long)pte) & pte_mask);
1377	}
1378
1379	return pte;
1380}
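/*
 * Example of the large-PTE fixup above (assuming the usual 4KB base page):
 * a 32KB mapping is stored as eight identical level-0 entries with
 * PM_PTE_LEVEL() == 7. Whichever of the eight entries the walk lands on,
 * the mask derived from PAGE_SIZE_PTE_COUNT() aligns the returned pointer
 * back down to the first entry of the series.
 */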
1381
1382/*
1383 * Generic mapping function. It maps a physical address into a DMA
1384 * address space. It allocates the page table pages if necessary.
1385 * In the future it can be extended to a generic mapping function
1386 * supporting all features of AMD IOMMU page tables like level skipping
1387 * and full 64 bit address spaces.
1388 */
1389static int iommu_map_page(struct protection_domain *dom,
1390			  unsigned long bus_addr,
1391			  unsigned long phys_addr,
1392			  int prot,
1393			  unsigned long page_size)
1394{
1395	u64 __pte, *pte;
1396	int i, count;
1397
1398	BUG_ON(!IS_ALIGNED(bus_addr, page_size));
1399	BUG_ON(!IS_ALIGNED(phys_addr, page_size));
1400
1401	if (!(prot & IOMMU_PROT_MASK))
1402		return -EINVAL;
1403
1404	count = PAGE_SIZE_PTE_COUNT(page_size);
1405	pte   = alloc_pte(dom, bus_addr, page_size, NULL, GFP_KERNEL);
1406
1407	if (!pte)
1408		return -ENOMEM;
1409
1410	for (i = 0; i < count; ++i)
1411		if (IOMMU_PTE_PRESENT(pte[i]))
1412			return -EBUSY;
1413
1414	if (count > 1) {
1415		__pte = PAGE_SIZE_PTE(phys_addr, page_size);
1416		__pte |= PM_LEVEL_ENC(7) | IOMMU_PTE_P | IOMMU_PTE_FC;
1417	} else
1418		__pte = phys_addr | IOMMU_PTE_P | IOMMU_PTE_FC;
1419
1420	if (prot & IOMMU_PROT_IR)
1421		__pte |= IOMMU_PTE_IR;
1422	if (prot & IOMMU_PROT_IW)
1423		__pte |= IOMMU_PTE_IW;
1424
1425	for (i = 0; i < count; ++i)
1426		pte[i] = __pte;
1427
1428	update_domain(dom);
1429
1430	return 0;
1431}
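/*
 * Example (same 4KB-base assumption as above): mapping a 32KB region
 * writes eight identical PTEs carrying PM_LEVEL_ENC(7), while a plain 4KB
 * mapping takes the else-branch and writes a single PTE without the
 * level-7 encoding.
 */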
1432
1433static unsigned long iommu_unmap_page(struct protection_domain *dom,
1434				      unsigned long bus_addr,
1435				      unsigned long page_size)
1436{
1437	unsigned long long unmapped;
1438	unsigned long unmap_size;
1439	u64 *pte;
1440
1441	BUG_ON(!is_power_of_2(page_size));
1442
1443	unmapped = 0;
1444
1445	while (unmapped < page_size) {
1446
1447		pte = fetch_pte(dom, bus_addr, &unmap_size);
1448
1449		if (pte) {
1450			int i, count;
1451
1452			count = PAGE_SIZE_PTE_COUNT(unmap_size);
1453			for (i = 0; i < count; i++)
1454				pte[i] = 0ULL;
1455		}
1456
1457		bus_addr  = (bus_addr & ~(unmap_size - 1)) + unmap_size;
1458		unmapped += unmap_size;
1459	}
1460
1461	BUG_ON(unmapped && !is_power_of_2(unmapped));
1462
1463	return unmapped;
1464}
1465
1466/*
1467 * This function checks if a specific unity mapping entry is needed for
1468 * this specific IOMMU.
1469 */
1470static int iommu_for_unity_map(struct amd_iommu *iommu,
1471			       struct unity_map_entry *entry)
1472{
1473	u16 bdf, i;
1474
1475	for (i = entry->devid_start; i <= entry->devid_end; ++i) {
1476		bdf = amd_iommu_alias_table[i];
1477		if (amd_iommu_rlookup_table[bdf] == iommu)
1478			return 1;
1479	}
1480
1481	return 0;
1482}
1483
1484/*
1485 * This function actually applies the mapping to the page table of the
1486 * dma_ops domain.
1487 */
1488static int dma_ops_unity_map(struct dma_ops_domain *dma_dom,
1489			     struct unity_map_entry *e)
1490{
1491	u64 addr;
1492	int ret;
1493
1494	for (addr = e->address_start; addr < e->address_end;
1495	     addr += PAGE_SIZE) {
1496		ret = iommu_map_page(&dma_dom->domain, addr, addr, e->prot,
1497				     PAGE_SIZE);
1498		if (ret)
1499			return ret;
1500		/*
1501		 * If the unity mapping is in the aperture range, mark the
1502		 * page as allocated in the aperture.
1503		 */
1504		if (addr < dma_dom->aperture_size)
1505			__set_bit(addr >> PAGE_SHIFT,
1506				  dma_dom->aperture[0]->bitmap);
1507	}
1508
1509	return 0;
1510}
1511
1512/*
1513 * Init the unity mappings for a specific IOMMU in the system
1514 *
1515 * Basically iterates over all unity mapping entries and applies them to
1516 * the default DMA domain of that IOMMU if necessary.
1517 */
1518static int iommu_init_unity_mappings(struct amd_iommu *iommu)
1519{
1520	struct unity_map_entry *entry;
1521	int ret;
1522
1523	list_for_each_entry(entry, &amd_iommu_unity_map, list) {
1524		if (!iommu_for_unity_map(iommu, entry))
1525			continue;
1526		ret = dma_ops_unity_map(iommu->default_dom, entry);
1527		if (ret)
1528			return ret;
1529	}
1530
1531	return 0;
1532}
1533
1534/*
1535 * Inits the unity mappings required for a specific device
1536 */
1537static int init_unity_mappings_for_device(struct dma_ops_domain *dma_dom,
1538					  u16 devid)
1539{
1540	struct unity_map_entry *e;
1541	int ret;
1542
1543	list_for_each_entry(e, &amd_iommu_unity_map, list) {
1544		if (!(devid >= e->devid_start && devid <= e->devid_end))
1545			continue;
1546		ret = dma_ops_unity_map(dma_dom, e);
1547		if (ret)
1548			return ret;
1549	}
1550
1551	return 0;
1552}
1553
1554/****************************************************************************
1555 *
1556 * The next functions belong to the address allocator for the dma_ops
1557 * interface functions. They work like the allocators in the other IOMMU
1558 * drivers. It is basically a bitmap which marks the allocated pages in
1559 * the aperture. Maybe it could be enhanced in the future to a more
1560 * efficient allocator.
1561 *
1562 ****************************************************************************/
1563
1564/*
1565 * The address allocator core functions.
1566 *
1567 * called with domain->lock held
1568 */
1569
1570/*
1571 * Used to reserve address ranges in the aperture (e.g. for exclusion
1572 * ranges).
1573 */
1574static void dma_ops_reserve_addresses(struct dma_ops_domain *dom,
1575				      unsigned long start_page,
1576				      unsigned int pages)
1577{
1578	unsigned int i, last_page = dom->aperture_size >> PAGE_SHIFT;
1579
1580	if (start_page + pages > last_page)
1581		pages = last_page - start_page;
1582
1583	for (i = start_page; i < start_page + pages; ++i) {
1584		int index = i / APERTURE_RANGE_PAGES;
1585		int page  = i % APERTURE_RANGE_PAGES;
1586		__set_bit(page, dom->aperture[index]->bitmap);
1587	}
1588}
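/*
 * Example: page index i is split into an aperture-range index
 * (i / APERTURE_RANGE_PAGES) and a bit offset within that range's bitmap
 * (i % APERTURE_RANGE_PAGES), so a single reservation may span several of
 * the fixed-size aperture ranges set up by alloc_new_range() below.
 */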
1589
1590/*
1591 * This function is used to add a new aperture range to an existing
1592 * aperture in case of dma_ops domain allocation or address allocation
1593 * failure.
1594 */
1595static int alloc_new_range(struct dma_ops_domain *dma_dom,
1596			   bool populate, gfp_t gfp)
1597{
1598	int index = dma_dom->aperture_size >> APERTURE_RANGE_SHIFT;
1599	struct amd_iommu *iommu;
1600	unsigned long i, old_size, pte_pgsize;
1601
1602#ifdef CONFIG_IOMMU_STRESS
1603	populate = false;
1604#endif
1605
1606	if (index >= APERTURE_MAX_RANGES)
1607		return -ENOMEM;
1608
1609	dma_dom->aperture[index] = kzalloc(sizeof(struct aperture_range), gfp);
1610	if (!dma_dom->aperture[index])
1611		return -ENOMEM;
1612
1613	dma_dom->aperture[index]->bitmap = (void *)get_zeroed_page(gfp);
1614	if (!dma_dom->aperture[index]->bitmap)
1615		goto out_free;
1616
1617	dma_dom->aperture[index]->offset = dma_dom->aperture_size;
1618
1619	if (populate) {
1620		unsigned long address = dma_dom->aperture_size;
1621		int i, num_ptes = APERTURE_RANGE_PAGES / 512;
1622		u64 *pte, *pte_page;
1623
1624		for (i = 0; i < num_ptes; ++i) {
1625			pte = alloc_pte(&dma_dom->domain, address, PAGE_SIZE,
1626					&pte_page, gfp);
1627			if (!pte)
1628				goto out_free;
1629
1630			dma_dom->aperture[index]->pte_pages[i] = pte_page;
1631
1632			address += APERTURE_RANGE_SIZE / 64;
1633		}
1634	}
1635
1636	old_size                = dma_dom->aperture_size;
1637	dma_dom->aperture_size += APERTURE_RANGE_SIZE;
1638
1639	/* Reserve address range used for MSI messages */
1640	if (old_size < MSI_ADDR_BASE_LO &&
1641	    dma_dom->aperture_size > MSI_ADDR_BASE_LO) {
1642		unsigned long spage;
1643		int pages;
1644
1645		pages = iommu_num_pages(MSI_ADDR_BASE_LO, 0x10000, PAGE_SIZE);
1646		spage = MSI_ADDR_BASE_LO >> PAGE_SHIFT;
1647
1648		dma_ops_reserve_addresses(dma_dom, spage, pages);
1649	}
1650
1651	/* Initialize the exclusion range if necessary */
1652	for_each_iommu(iommu) {
1653		if (iommu->exclusion_start &&
1654		    iommu->exclusion_start >= dma_dom->aperture[index]->offset
1655		    && iommu->exclusion_start < dma_dom->aperture_size) {
1656			unsigned long startpage;
1657			int pages = iommu_num_pages(iommu->exclusion_start,
1658						    iommu->exclusion_length,
1659						    PAGE_SIZE);
1660			startpage = iommu->exclusion_start >> PAGE_SHIFT;
1661			dma_ops_reserve_addresses(dma_dom, startpage, pages);
1662		}
1663	}
1664
1665	/*
1666	 * Check for areas already mapped as present in the new aperture
1667	 * range and mark those pages as reserved in the allocator. Such
1668	 * mappings may already exist as a result of requested unity
1669	 * mappings for devices.
1670	 */
1671	for (i = dma_dom->aperture[index]->offset;
1672	     i < dma_dom->aperture_size;
1673	     i += pte_pgsize) {
1674		u64 *pte = fetch_pte(&dma_dom->domain, i, &pte_pgsize);
1675		if (!pte || !IOMMU_PTE_PRESENT(*pte))
1676			continue;
1677
1678		dma_ops_reserve_addresses(dma_dom, i >> PAGE_SHIFT,
1679					  pte_pgsize >> 12);
1680	}
1681
1682	update_domain(&dma_dom->domain);
1683
1684	return 0;
1685
1686out_free:
1687	update_domain(&dma_dom->domain);
1688
1689	free_page((unsigned long)dma_dom->aperture[index]->bitmap);
1690
1691	kfree(dma_dom->aperture[index]);
1692	dma_dom->aperture[index] = NULL;
1693
1694	return -ENOMEM;
1695}
1696
1697static unsigned long dma_ops_area_alloc(struct device *dev,
1698					struct dma_ops_domain *dom,
1699					unsigned int pages,
1700					unsigned long align_mask,
1701					u64 dma_mask,
1702					unsigned long start)
1703{
1704	unsigned long next_bit = dom->next_address % APERTURE_RANGE_SIZE;
1705	int max_index = dom->aperture_size >> APERTURE_RANGE_SHIFT;
1706	int i = start >> APERTURE_RANGE_SHIFT;
1707	unsigned long boundary_size;
1708	unsigned long address = -1;
1709	unsigned long limit;
1710
1711	next_bit >>= PAGE_SHIFT;
1712
1713	boundary_size = ALIGN(dma_get_seg_boundary(dev) + 1,
1714			PAGE_SIZE) >> PAGE_SHIFT;
1715
1716	for (;i < max_index; ++i) {
1717		unsigned long offset = dom->aperture[i]->offset >> PAGE_SHIFT;
1718
1719		if (dom->aperture[i]->offset >= dma_mask)
1720			break;
1721
1722		limit = iommu_device_max_index(APERTURE_RANGE_PAGES, offset,
1723					       dma_mask >> PAGE_SHIFT);
1724
1725		address = iommu_area_alloc(dom->aperture[i]->bitmap,
1726					   limit, next_bit, pages, 0,
1727					    boundary_size, align_mask);
1728		if (address != -1) {
1729			address = dom->aperture[i]->offset +
1730				  (address << PAGE_SHIFT);
1731			dom->next_address = address + (pages << PAGE_SHIFT);
1732			break;
1733		}
1734
1735		next_bit = 0;
1736	}
1737
1738	return address;
1739}
1740
1741static unsigned long dma_ops_alloc_addresses(struct device *dev,
1742					     struct dma_ops_domain *dom,
1743					     unsigned int pages,
1744					     unsigned long align_mask,
1745					     u64 dma_mask)
1746{
1747	unsigned long address;
1748
1749#ifdef CONFIG_IOMMU_STRESS
1750	dom->next_address = 0;
1751	dom->need_flush = true;
1752#endif
1753
1754	address = dma_ops_area_alloc(dev, dom, pages, align_mask,
1755				     dma_mask, dom->next_address);
1756
1757	if (address == -1) {
1758		dom->next_address = 0;
1759		address = dma_ops_area_alloc(dev, dom, pages, align_mask,
1760					     dma_mask, 0);
1761		dom->need_flush = true;
1762	}
1763
1764	if (unlikely(address == -1))
1765		address = DMA_ERROR_CODE;
1766
1767	WARN_ON((address + (PAGE_SIZE*pages)) > dom->aperture_size);
1768
1769	return address;
1770}
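/*
 * Allocation strategy in short: the search starts at dom->next_address
 * (next-fit). If that fails, the allocator wraps around to offset 0 and
 * sets need_flush so that the domain TLB gets flushed before the wrapped
 * addresses are handed out to a device again.
 */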
1771
1772/*
1773 * The address free function.
1774 *
1775 * called with domain->lock held
1776 */
1777static void dma_ops_free_addresses(struct dma_ops_domain *dom,
1778				   unsigned long address,
1779				   unsigned int pages)
1780{
1781	unsigned i = address >> APERTURE_RANGE_SHIFT;
1782	struct aperture_range *range = dom->aperture[i];
1783
1784	BUG_ON(i >= APERTURE_MAX_RANGES || range == NULL);
1785
1786#ifdef CONFIG_IOMMU_STRESS
1787	if (i < 4)
1788		return;
1789#endif
1790
1791	if (address >= dom->next_address)
1792		dom->need_flush = true;
1793
1794	address = (address % APERTURE_RANGE_SIZE) >> PAGE_SHIFT;
1795
1796	bitmap_clear(range->bitmap, address, pages);
1797
1798}
1799
1800/****************************************************************************
1801 *
1802 * The next functions belong to the domain allocation. A domain is
1803 * allocated for every IOMMU as the default domain. If device isolation
1804 * is enabled, every device gets its own domain. The most important thing
1805 * about domains is the page table mapping the DMA address space they
1806 * contain.
1807 *
1808 ****************************************************************************/
1809
1810/*
1811 * This function adds a protection domain to the global protection domain list
1812 */
1813static void add_domain_to_list(struct protection_domain *domain)
1814{
1815	unsigned long flags;
1816
1817	spin_lock_irqsave(&amd_iommu_pd_lock, flags);
1818	list_add(&domain->list, &amd_iommu_pd_list);
1819	spin_unlock_irqrestore(&amd_iommu_pd_lock, flags);
1820}
1821
1822/*
1823 * This function removes a protection domain from the global
1824 * protection domain list
1825 */
1826static void del_domain_from_list(struct protection_domain *domain)
1827{
1828	unsigned long flags;
1829
1830	spin_lock_irqsave(&amd_iommu_pd_lock, flags);
1831	list_del(&domain->list);
1832	spin_unlock_irqrestore(&amd_iommu_pd_lock, flags);
1833}
1834
1835static u16 domain_id_alloc(void)
1836{
1837	unsigned long flags;
1838	int id;
1839
1840	write_lock_irqsave(&amd_iommu_devtable_lock, flags);
1841	id = find_first_zero_bit(amd_iommu_pd_alloc_bitmap, MAX_DOMAIN_ID);
1842	BUG_ON(id == 0);
1843	if (id > 0 && id < MAX_DOMAIN_ID)
1844		__set_bit(id, amd_iommu_pd_alloc_bitmap);
1845	else
1846		id = 0;
1847	write_unlock_irqrestore(&amd_iommu_devtable_lock, flags);
1848
1849	return id;
1850}
1851
1852static void domain_id_free(int id)
1853{
1854	unsigned long flags;
1855
1856	write_lock_irqsave(&amd_iommu_devtable_lock, flags);
1857	if (id > 0 && id < MAX_DOMAIN_ID)
1858		__clear_bit(id, amd_iommu_pd_alloc_bitmap);
1859	write_unlock_irqrestore(&amd_iommu_devtable_lock, flags);
1860}
1861
1862#define DEFINE_FREE_PT_FN(LVL, FN)				\
1863static void free_pt_##LVL (unsigned long __pt)			\
1864{								\
1865	unsigned long p;					\
1866	u64 *pt;						\
1867	int i;							\
1868								\
1869	pt = (u64 *)__pt;					\
1870								\
1871	for (i = 0; i < 512; ++i) {				\
1872		/* PTE present? */				\
1873		if (!IOMMU_PTE_PRESENT(pt[i]))			\
1874			continue;				\
1875								\
1876		/* Large PTE? */				\
1877		if (PM_PTE_LEVEL(pt[i]) == 0 ||			\
1878		    PM_PTE_LEVEL(pt[i]) == 7)			\
1879			continue;				\
1880								\
1881		p = (unsigned long)IOMMU_PTE_PAGE(pt[i]);	\
1882		FN(p);						\
1883	}							\
1884	free_page((unsigned long)pt);				\
1885}
1886
1887DEFINE_FREE_PT_FN(l2, free_page)
1888DEFINE_FREE_PT_FN(l3, free_pt_l2)
1889DEFINE_FREE_PT_FN(l4, free_pt_l3)
1890DEFINE_FREE_PT_FN(l5, free_pt_l4)
1891DEFINE_FREE_PT_FN(l6, free_pt_l5)
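/*
 * For reference, DEFINE_FREE_PT_FN(l2, free_page) expands to a free_pt_l2()
 * that walks one level-2 table, skips non-present and large (level 0/7)
 * entries, frees each referenced level-1 table page via free_page() and
 * finally frees the level-2 table itself; the higher levels just chain
 * these helpers together.
 */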
1892
1893static void free_pagetable(struct protection_domain *domain)
1894{
1895	unsigned long root = (unsigned long)domain->pt_root;
1896
1897	switch (domain->mode) {
1898	case PAGE_MODE_NONE:
1899		break;
1900	case PAGE_MODE_1_LEVEL:
1901		free_page(root);
1902		break;
1903	case PAGE_MODE_2_LEVEL:
1904		free_pt_l2(root);
1905		break;
1906	case PAGE_MODE_3_LEVEL:
1907		free_pt_l3(root);
1908		break;
1909	case PAGE_MODE_4_LEVEL:
1910		free_pt_l4(root);
1911		break;
1912	case PAGE_MODE_5_LEVEL:
1913		free_pt_l5(root);
1914		break;
1915	case PAGE_MODE_6_LEVEL:
1916		free_pt_l6(root);
1917		break;
1918	default:
1919		BUG();
1920	}
1921}
1922
1923static void free_gcr3_tbl_level1(u64 *tbl)
1924{
1925	u64 *ptr;
1926	int i;
1927
1928	for (i = 0; i < 512; ++i) {
1929		if (!(tbl[i] & GCR3_VALID))
1930			continue;
1931
1932		ptr = __va(tbl[i] & PAGE_MASK);
1933
1934		free_page((unsigned long)ptr);
1935	}
1936}
1937
1938static void free_gcr3_tbl_level2(u64 *tbl)
1939{
1940	u64 *ptr;
1941	int i;
1942
1943	for (i = 0; i < 512; ++i) {
1944		if (!(tbl[i] & GCR3_VALID))
1945			continue;
1946
1947		ptr = __va(tbl[i] & PAGE_MASK);
1948
1949		free_gcr3_tbl_level1(ptr);
1950	}
1951}
1952
1953static void free_gcr3_table(struct protection_domain *domain)
1954{
1955	if (domain->glx == 2)
1956		free_gcr3_tbl_level2(domain->gcr3_tbl);
1957	else if (domain->glx == 1)
1958		free_gcr3_tbl_level1(domain->gcr3_tbl);
1959	else if (domain->glx != 0)
1960		BUG();
1961
1962	free_page((unsigned long)domain->gcr3_tbl);
1963}
1964
1965/*
1966 * Free a domain, only used if something went wrong in the
1967 * allocation path and we need to free an already allocated page table
1968 */
1969static void dma_ops_domain_free(struct dma_ops_domain *dom)
1970{
1971	int i;
1972
1973	if (!dom)
1974		return;
1975
1976	del_domain_from_list(&dom->domain);
1977
1978	free_pagetable(&dom->domain);
1979
1980	for (i = 0; i < APERTURE_MAX_RANGES; ++i) {
1981		if (!dom->aperture[i])
1982			continue;
1983		free_page((unsigned long)dom->aperture[i]->bitmap);
1984		kfree(dom->aperture[i]);
1985	}
1986
1987	kfree(dom);
1988}
1989
1990/*
1991 * Allocates a new protection domain usable for the dma_ops functions.
1992 * It also initializes the page table and the address allocator data
1993 * structures required for the dma_ops interface
1994 */
1995static struct dma_ops_domain *dma_ops_domain_alloc(void)
1996{
1997	struct dma_ops_domain *dma_dom;
1998
1999	dma_dom = kzalloc(sizeof(struct dma_ops_domain), GFP_KERNEL);
2000	if (!dma_dom)
2001		return NULL;
2002
2003	spin_lock_init(&dma_dom->domain.lock);
2004
2005	dma_dom->domain.id = domain_id_alloc();
2006	if (dma_dom->domain.id == 0)
2007		goto free_dma_dom;
2008	INIT_LIST_HEAD(&dma_dom->domain.dev_list);
2009	dma_dom->domain.mode = PAGE_MODE_2_LEVEL;
2010	dma_dom->domain.pt_root = (void *)get_zeroed_page(GFP_KERNEL);
2011	dma_dom->domain.flags = PD_DMA_OPS_MASK;
2012	dma_dom->domain.priv = dma_dom;
2013	if (!dma_dom->domain.pt_root)
2014		goto free_dma_dom;
2015
2016	dma_dom->need_flush = false;
2017	dma_dom->target_dev = 0xffff;
2018
2019	add_domain_to_list(&dma_dom->domain);
2020
2021	if (alloc_new_range(dma_dom, true, GFP_KERNEL))
2022		goto free_dma_dom;
2023
2024	/*
2025	 * Mark the first page as allocated so we never return 0 as
2026	 * a valid dma-address and can therefore use 0 as an error value.
2027	 */
2028	dma_dom->aperture[0]->bitmap[0] = 1;
2029	dma_dom->next_address = 0;
2030
2031
2032	return dma_dom;
2033
2034free_dma_dom:
2035	dma_ops_domain_free(dma_dom);
2036
2037	return NULL;
2038}
2039
2040/*
2041 * little helper function to check whether a given protection domain is a
2042 * dma_ops domain
2043 */
2044static bool dma_ops_domain(struct protection_domain *domain)
2045{
2046	return domain->flags & PD_DMA_OPS_MASK;
2047}
2048
2049static void set_dte_entry(u16 devid, struct protection_domain *domain, bool ats)
2050{
2051	u64 pte_root = 0;
2052	u64 flags = 0;
2053
2054	if (domain->mode != PAGE_MODE_NONE)
2055		pte_root = virt_to_phys(domain->pt_root);
2056
2057	pte_root |= (domain->mode & DEV_ENTRY_MODE_MASK)
2058		    << DEV_ENTRY_MODE_SHIFT;
2059	pte_root |= IOMMU_PTE_IR | IOMMU_PTE_IW | IOMMU_PTE_P | IOMMU_PTE_TV;
2060
2061	flags = amd_iommu_dev_table[devid].data[1];
2062
2063	if (ats)
2064		flags |= DTE_FLAG_IOTLB;
2065
2066	if (domain->flags & PD_IOMMUV2_MASK) {
2067		u64 gcr3 = __pa(domain->gcr3_tbl);
2068		u64 glx  = domain->glx;
2069		u64 tmp;
2070
2071		pte_root |= DTE_FLAG_GV;
2072		pte_root |= (glx & DTE_GLX_MASK) << DTE_GLX_SHIFT;
2073
2074		/* First mask out possible old values for GCR3 table */
2075		tmp = DTE_GCR3_VAL_B(~0ULL) << DTE_GCR3_SHIFT_B;
2076		flags    &= ~tmp;
2077
2078		tmp = DTE_GCR3_VAL_C(~0ULL) << DTE_GCR3_SHIFT_C;
2079		flags    &= ~tmp;
2080
2081		/* Encode GCR3 table into DTE */
2082		tmp = DTE_GCR3_VAL_A(gcr3) << DTE_GCR3_SHIFT_A;
2083		pte_root |= tmp;
2084
2085		tmp = DTE_GCR3_VAL_B(gcr3) << DTE_GCR3_SHIFT_B;
2086		flags    |= tmp;
2087
2088		tmp = DTE_GCR3_VAL_C(gcr3) << DTE_GCR3_SHIFT_C;
2089		flags    |= tmp;
2090	}
2091
2092	flags &= ~(0xffffUL);
2093	flags |= domain->id;
2094
2095	amd_iommu_dev_table[devid].data[1]  = flags;
2096	amd_iommu_dev_table[devid].data[0]  = pte_root;
2097}
2098
2099static void clear_dte_entry(u16 devid)
2100{
2101	/* remove entry from the device table seen by the hardware */
2102	amd_iommu_dev_table[devid].data[0]  = IOMMU_PTE_P | IOMMU_PTE_TV;
2103	amd_iommu_dev_table[devid].data[1] &= DTE_FLAG_MASK;
2104
2105	amd_iommu_apply_erratum_63(devid);
2106}
2107
2108static void do_attach(struct iommu_dev_data *dev_data,
2109		      struct protection_domain *domain)
2110{
2111	struct amd_iommu *iommu;
2112	bool ats;
2113
2114	iommu = amd_iommu_rlookup_table[dev_data->devid];
2115	ats   = dev_data->ats.enabled;
2116
2117	/* Update data structures */
2118	dev_data->domain = domain;
2119	list_add(&dev_data->list, &domain->dev_list);
2120	set_dte_entry(dev_data->devid, domain, ats);
2121
2122	/* Do reference counting */
2123	domain->dev_iommu[iommu->index] += 1;
2124	domain->dev_cnt                 += 1;
2125
2126	/* Flush the DTE entry */
2127	device_flush_dte(dev_data);
2128}
2129
2130static void do_detach(struct iommu_dev_data *dev_data)
2131{
2132	struct amd_iommu *iommu;
2133
2134	iommu = amd_iommu_rlookup_table[dev_data->devid];
2135
2136	/* decrease reference counters */
2137	dev_data->domain->dev_iommu[iommu->index] -= 1;
2138	dev_data->domain->dev_cnt                 -= 1;
2139
2140	/* Update data structures */
2141	dev_data->domain = NULL;
2142	list_del(&dev_data->list);
2143	clear_dte_entry(dev_data->devid);
2144
2145	/* Flush the DTE entry */
2146	device_flush_dte(dev_data);
2147}
2148
2149/*
2150 * If a device is not yet associated with a domain, this function attaches
2151 * it to the domain and makes the attachment visible to the hardware
2152 */
2153static int __attach_device(struct iommu_dev_data *dev_data,
2154			   struct protection_domain *domain)
2155{
2156	struct iommu_dev_data *head, *entry;
2157	int ret;
2158
2159	/* lock domain */
2160	spin_lock(&domain->lock);
2161
2162	head = dev_data;
2163
2164	if (head->alias_data != NULL)
2165		head = head->alias_data;
2166
2167	/* Now we have the root of the alias group, if any */
2168
2169	ret = -EBUSY;
2170	if (head->domain != NULL)
2171		goto out_unlock;
2172
2173	/* Attach alias group root */
2174	do_attach(head, domain);
2175
2176	/* Attach other devices in the alias group */
2177	list_for_each_entry(entry, &head->alias_list, alias_list)
2178		do_attach(entry, domain);
2179
2180	ret = 0;
2181
2182out_unlock:
2183
2184	/* ready */
2185	spin_unlock(&domain->lock);
2186
2187	return ret;
2188}
2189
2190
2191static void pdev_iommuv2_disable(struct pci_dev *pdev)
2192{
2193	pci_disable_ats(pdev);
2194	pci_disable_pri(pdev);
2195	pci_disable_pasid(pdev);
2196}
2197
2198/* FIXME: Change generic reset-function to do the same */
2199static int pri_reset_while_enabled(struct pci_dev *pdev)
2200{
2201	u16 control;
2202	int pos;
2203
2204	pos = pci_find_ext_capability(pdev, PCI_EXT_CAP_ID_PRI);
2205	if (!pos)
2206		return -EINVAL;
2207
2208	pci_read_config_word(pdev, pos + PCI_PRI_CTRL, &control);
2209	control |= PCI_PRI_CTRL_RESET;
2210	pci_write_config_word(pdev, pos + PCI_PRI_CTRL, control);
2211
2212	return 0;
2213}
2214
2215static int pdev_iommuv2_enable(struct pci_dev *pdev)
2216{
2217	bool reset_enable;
2218	int reqs, ret;
2219
2220	/* FIXME: Hardcode number of outstanding requests for now */
2221	reqs = 32;
2222	if (pdev_pri_erratum(pdev, AMD_PRI_DEV_ERRATUM_LIMIT_REQ_ONE))
2223		reqs = 1;
2224	reset_enable = pdev_pri_erratum(pdev, AMD_PRI_DEV_ERRATUM_ENABLE_RESET);
2225
2226	/* Only allow access to user-accessible pages */
2227	ret = pci_enable_pasid(pdev, 0);
2228	if (ret)
2229		goto out_err;
2230
2231	/* First reset the PRI state of the device */
2232	ret = pci_reset_pri(pdev);
2233	if (ret)
2234		goto out_err;
2235
2236	/* Enable PRI */
2237	ret = pci_enable_pri(pdev, reqs);
2238	if (ret)
2239		goto out_err;
2240
2241	if (reset_enable) {
2242		ret = pri_reset_while_enabled(pdev);
2243		if (ret)
2244			goto out_err;
2245	}
2246
2247	ret = pci_enable_ats(pdev, PAGE_SHIFT);
2248	if (ret)
2249		goto out_err;
2250
2251	return 0;
2252
2253out_err:
2254	pci_disable_pri(pdev);
2255	pci_disable_pasid(pdev);
2256
2257	return ret;
2258}
2259
2260/* FIXME: Move this to PCI code */
2261#define PCI_PRI_TLP_OFF		(1 << 15)
2262
2263static bool pci_pri_tlp_required(struct pci_dev *pdev)
2264{
2265	u16 status;
2266	int pos;
2267
2268	pos = pci_find_ext_capability(pdev, PCI_EXT_CAP_ID_PRI);
2269	if (!pos)
2270		return false;
2271
2272	pci_read_config_word(pdev, pos + PCI_PRI_STATUS, &status);
2273
2274	return (status & PCI_PRI_TLP_OFF) ? true : false;
2275}
2276
2277/*
2278 * If a device is not yet associated with a domain, this function attaches
2279 * it to the domain and makes the attachment visible to the hardware
2280 */
2281static int attach_device(struct device *dev,
2282			 struct protection_domain *domain)
2283{
2284	struct pci_dev *pdev = to_pci_dev(dev);
2285	struct iommu_dev_data *dev_data;
2286	unsigned long flags;
2287	int ret;
2288
2289	dev_data = get_dev_data(dev);
2290
2291	if (domain->flags & PD_IOMMUV2_MASK) {
2292		if (!dev_data->iommu_v2 || !dev_data->passthrough)
2293			return -EINVAL;
2294
2295		if (pdev_iommuv2_enable(pdev) != 0)
2296			return -EINVAL;
2297
2298		dev_data->ats.enabled = true;
2299		dev_data->ats.qdep    = pci_ats_queue_depth(pdev);
2300		dev_data->pri_tlp     = pci_pri_tlp_required(pdev);
2301	} else if (amd_iommu_iotlb_sup &&
2302		   pci_enable_ats(pdev, PAGE_SHIFT) == 0) {
2303		dev_data->ats.enabled = true;
2304		dev_data->ats.qdep    = pci_ats_queue_depth(pdev);
2305	}
2306
2307	write_lock_irqsave(&amd_iommu_devtable_lock, flags);
2308	ret = __attach_device(dev_data, domain);
2309	write_unlock_irqrestore(&amd_iommu_devtable_lock, flags);
2310
2311	/*
2312	 * We might be running in a crash (kdump) kernel here. The crashed
2313	 * kernel may have left stale entries in the IOMMU caches, so flush
2314	 * here to evict them.
2315	 */
2316	domain_flush_tlb_pde(domain);
2317
2318	return ret;
2319}
2320
2321/*
2322 * Removes a device from a protection domain (unlocked)
2323 */
2324static void __detach_device(struct iommu_dev_data *dev_data)
2325{
2326	struct iommu_dev_data *head, *entry;
2327	struct protection_domain *domain;
2328	unsigned long flags;
2329
2330	BUG_ON(!dev_data->domain);
2331
2332	domain = dev_data->domain;
2333
2334	spin_lock_irqsave(&domain->lock, flags);
2335
2336	head = dev_data;
2337	if (head->alias_data != NULL)
2338		head = head->alias_data;
2339
2340	list_for_each_entry(entry, &head->alias_list, alias_list)
2341		do_detach(entry);
2342
2343	do_detach(head);
2344
2345	spin_unlock_irqrestore(&domain->lock, flags);
2346
2347	/*
2348	 * If we run in passthrough mode the device must be assigned to the
2349	 * passthrough domain if it is detached from any other domain.
2350	 * Make sure we can deassign from the pt_domain itself.
2351	 */
2352	if (dev_data->passthrough &&
2353	    (dev_data->domain == NULL && domain != pt_domain))
2354		__attach_device(dev_data, pt_domain);
2355}
2356
2357/*
2358 * Removes a device from a protection domain (with devtable_lock held)
2359 */
2360static void detach_device(struct device *dev)
2361{
2362	struct protection_domain *domain;
2363	struct iommu_dev_data *dev_data;
2364	unsigned long flags;
2365
2366	dev_data = get_dev_data(dev);
2367	domain   = dev_data->domain;
2368
2369	/* lock device table */
2370	write_lock_irqsave(&amd_iommu_devtable_lock, flags);
2371	__detach_device(dev_data);
2372	write_unlock_irqrestore(&amd_iommu_devtable_lock, flags);
2373
2374	if (domain->flags & PD_IOMMUV2_MASK)
2375		pdev_iommuv2_disable(to_pci_dev(dev));
2376	else if (dev_data->ats.enabled)
2377		pci_disable_ats(to_pci_dev(dev));
2378
2379	dev_data->ats.enabled = false;
2380}
2381
2382/*
2383 * Find out the protection domain structure for a given PCI device. This
2384 * will give us the pointer to the page table root for example.
2385 */
2386static struct protection_domain *domain_for_device(struct device *dev)
2387{
2388	struct iommu_dev_data *dev_data;
2389	struct protection_domain *dom = NULL;
2390	unsigned long flags;
2391
2392	dev_data   = get_dev_data(dev);
2393
2394	if (dev_data->domain)
2395		return dev_data->domain;
2396
2397	if (dev_data->alias_data != NULL) {
2398		struct iommu_dev_data *alias_data = dev_data->alias_data;
2399
2400		read_lock_irqsave(&amd_iommu_devtable_lock, flags);
2401		if (alias_data->domain != NULL) {
2402			__attach_device(dev_data, alias_data->domain);
2403			dom = alias_data->domain;
2404		}
2405		read_unlock_irqrestore(&amd_iommu_devtable_lock, flags);
2406	}
2407
2408	return dom;
2409}
2410
2411static int device_change_notifier(struct notifier_block *nb,
2412				  unsigned long action, void *data)
2413{
2414	struct dma_ops_domain *dma_domain;
2415	struct protection_domain *domain;
2416	struct iommu_dev_data *dev_data;
2417	struct device *dev = data;
2418	struct amd_iommu *iommu;
2419	unsigned long flags;
2420	u16 devid;
2421
2422	if (!check_device(dev))
2423		return 0;
2424
2425	devid    = get_device_id(dev);
2426	iommu    = amd_iommu_rlookup_table[devid];
2427	dev_data = get_dev_data(dev);
2428
2429	switch (action) {
2430	case BUS_NOTIFY_ADD_DEVICE:
2431
2432		iommu_init_device(dev);
2433		init_iommu_group(dev);
2434
2435		/*
2436		 * dev_data was NULL before and has just been
2437		 * initialized by iommu_init_device()
2438		 */
2439		dev_data = get_dev_data(dev);
2440
2441		if (iommu_pass_through || dev_data->iommu_v2) {
2442			dev_data->passthrough = true;
2443			attach_device(dev, pt_domain);
2444			break;
2445		}
2446
2447		domain = domain_for_device(dev);
2448
2449		/* allocate a protection domain if a device is added */
2450		dma_domain = find_protection_domain(devid);
2451		if (!dma_domain) {
2452			dma_domain = dma_ops_domain_alloc();
2453			if (!dma_domain)
2454				goto out;
2455			dma_domain->target_dev = devid;
2456
2457			spin_lock_irqsave(&iommu_pd_list_lock, flags);
2458			list_add_tail(&dma_domain->list, &iommu_pd_list);
2459			spin_unlock_irqrestore(&iommu_pd_list_lock, flags);
2460		}
2461
2462		dev->archdata.dma_ops = &amd_iommu_dma_ops;
2463
2464		break;
2465	case BUS_NOTIFY_REMOVED_DEVICE:
2466
2467		iommu_uninit_device(dev);
2468
2469	default:
2470		goto out;
2471	}
2472
2473	iommu_completion_wait(iommu);
2474
2475out:
2476	return 0;
2477}
2478
2479static struct notifier_block device_nb = {
2480	.notifier_call = device_change_notifier,
2481};
2482
2483void amd_iommu_init_notifier(void)
2484{
2485	bus_register_notifier(&pci_bus_type, &device_nb);
2486}
2487
2488/*****************************************************************************
2489 *
2490 * The next functions belong to the dma_ops mapping/unmapping code.
2491 *
2492 *****************************************************************************/
2493
2494/*
2495 * In the dma_ops path we only have the struct device. This function
2496 * finds the corresponding IOMMU, the protection domain and the
2497 * requestor id for a given device.
2498 * If the device is not yet associated with a domain, this function also
2499 * performs that association.
2500 */
2501static struct protection_domain *get_domain(struct device *dev)
2502{
2503	struct protection_domain *domain;
2504	struct dma_ops_domain *dma_dom;
2505	u16 devid = get_device_id(dev);
2506
2507	if (!check_device(dev))
2508		return ERR_PTR(-EINVAL);
2509
2510	domain = domain_for_device(dev);
2511	if (domain != NULL && !dma_ops_domain(domain))
2512		return ERR_PTR(-EBUSY);
2513
2514	if (domain != NULL)
2515		return domain;
2516
2517	/* Device not bound yet - bind it */
2518	dma_dom = find_protection_domain(devid);
2519	if (!dma_dom)
2520		dma_dom = amd_iommu_rlookup_table[devid]->default_dom;
2521	attach_device(dev, &dma_dom->domain);
2522	DUMP_printk("Using protection domain %d for device %s\n",
2523		    dma_dom->domain.id, dev_name(dev));
2524
2525	return &dma_dom->domain;
2526}
2527
2528static void update_device_table(struct protection_domain *domain)
2529{
2530	struct iommu_dev_data *dev_data;
2531
2532	list_for_each_entry(dev_data, &domain->dev_list, list)
2533		set_dte_entry(dev_data->devid, domain, dev_data->ats.enabled);
2534}
2535
2536static void update_domain(struct protection_domain *domain)
2537{
2538	if (!domain->updated)
2539		return;
2540
2541	update_device_table(domain);
2542
2543	domain_flush_devices(domain);
2544	domain_flush_tlb_pde(domain);
2545
2546	domain->updated = false;
2547}
2548
2549/*
2550 * This function fetches the PTE for a given address in the aperture
2551 */
2552static u64* dma_ops_get_pte(struct dma_ops_domain *dom,
2553			    unsigned long address)
2554{
2555	struct aperture_range *aperture;
2556	u64 *pte, *pte_page;
2557
2558	aperture = dom->aperture[APERTURE_RANGE_INDEX(address)];
2559	if (!aperture)
2560		return NULL;
2561
2562	pte = aperture->pte_pages[APERTURE_PAGE_INDEX(address)];
2563	if (!pte) {
2564		pte = alloc_pte(&dom->domain, address, PAGE_SIZE, &pte_page,
2565				GFP_ATOMIC);
2566		aperture->pte_pages[APERTURE_PAGE_INDEX(address)] = pte_page;
2567	} else
2568		pte += PM_LEVEL_INDEX(0, address);
2569
2570	update_domain(&dom->domain);
2571
2572	return pte;
2573}
2574
2575/*
2576 * This is the generic map function. It maps one 4KiB page at paddr to
2577 * the given address in the DMA address space for the domain.
2578 */
2579static dma_addr_t dma_ops_domain_map(struct dma_ops_domain *dom,
2580				     unsigned long address,
2581				     phys_addr_t paddr,
2582				     int direction)
2583{
2584	u64 *pte, __pte;
2585
2586	WARN_ON(address > dom->aperture_size);
2587
2588	paddr &= PAGE_MASK;
2589
2590	pte  = dma_ops_get_pte(dom, address);
2591	if (!pte)
2592		return DMA_ERROR_CODE;
2593
2594	__pte = paddr | IOMMU_PTE_P | IOMMU_PTE_FC;
2595
2596	if (direction == DMA_TO_DEVICE)
2597		__pte |= IOMMU_PTE_IR;
2598	else if (direction == DMA_FROM_DEVICE)
2599		__pte |= IOMMU_PTE_IW;
2600	else if (direction == DMA_BIDIRECTIONAL)
2601		__pte |= IOMMU_PTE_IR | IOMMU_PTE_IW;
2602
2603	WARN_ON(*pte);
2604
2605	*pte = __pte;
2606
2607	return (dma_addr_t)address;
2608}
2609
2610/*
2611 * The generic unmapping function for one page in the DMA address space.
2612 */
2613static void dma_ops_domain_unmap(struct dma_ops_domain *dom,
2614				 unsigned long address)
2615{
2616	struct aperture_range *aperture;
2617	u64 *pte;
2618
2619	if (address >= dom->aperture_size)
2620		return;
2621
2622	aperture = dom->aperture[APERTURE_RANGE_INDEX(address)];
2623	if (!aperture)
2624		return;
2625
2626	pte  = aperture->pte_pages[APERTURE_PAGE_INDEX(address)];
2627	if (!pte)
2628		return;
2629
2630	pte += PM_LEVEL_INDEX(0, address);
2631
2632	WARN_ON(!*pte);
2633
2634	*pte = 0ULL;
2635}
2636
2637/*
2638 * This function contains common code for mapping of a physically
2639 * contiguous memory region into DMA address space. It is used by all
2640 * mapping functions provided with this IOMMU driver.
2641 * Must be called with the domain lock held.
2642 */
2643static dma_addr_t __map_single(struct device *dev,
2644			       struct dma_ops_domain *dma_dom,
2645			       phys_addr_t paddr,
2646			       size_t size,
2647			       int dir,
2648			       bool align,
2649			       u64 dma_mask)
2650{
2651	dma_addr_t offset = paddr & ~PAGE_MASK;
2652	dma_addr_t address, start, ret;
2653	unsigned int pages;
2654	unsigned long align_mask = 0;
2655	int i;
2656
2657	pages = iommu_num_pages(paddr, size, PAGE_SIZE);
2658	paddr &= PAGE_MASK;
2659
2660	INC_STATS_COUNTER(total_map_requests);
2661
2662	if (pages > 1)
2663		INC_STATS_COUNTER(cross_page);
2664
2665	if (align)
2666		align_mask = (1UL << get_order(size)) - 1;
2667
2668retry:
2669	address = dma_ops_alloc_addresses(dev, dma_dom, pages, align_mask,
2670					  dma_mask);
2671	if (unlikely(address == DMA_ERROR_CODE)) {
2672		/*
2673		 * setting next_address here will let the address
2674		 * allocator only scan the newly allocated range in the
2675		 * first run. This is a small optimization.
2676		 */
2677		dma_dom->next_address = dma_dom->aperture_size;
2678
2679		if (alloc_new_range(dma_dom, false, GFP_ATOMIC))
2680			goto out;
2681
2682		/*
2683		 * aperture was successfully enlarged by 128 MB, try
2684		 * allocation again
2685		 */
2686		goto retry;
2687	}
2688
2689	start = address;
2690	for (i = 0; i < pages; ++i) {
2691		ret = dma_ops_domain_map(dma_dom, start, paddr, dir);
2692		if (ret == DMA_ERROR_CODE)
2693			goto out_unmap;
2694
2695		paddr += PAGE_SIZE;
2696		start += PAGE_SIZE;
2697	}
2698	address += offset;
2699
2700	ADD_STATS_COUNTER(alloced_io_mem, size);
2701
2702	if (unlikely(dma_dom->need_flush && !amd_iommu_unmap_flush)) {
2703		domain_flush_tlb(&dma_dom->domain);
2704		dma_dom->need_flush = false;
2705	} else if (unlikely(amd_iommu_np_cache))
2706		domain_flush_pages(&dma_dom->domain, address, size);
2707
2708out:
2709	return address;
2710
2711out_unmap:
2712
2713	for (--i; i >= 0; --i) {
2714		start -= PAGE_SIZE;
2715		dma_ops_domain_unmap(dma_dom, start);
2716	}
2717
2718	dma_ops_free_addresses(dma_dom, address, pages);
2719
2720	return DMA_ERROR_CODE;
2721}
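/*
 * Worked example of the arithmetic above (illustrative values): for
 * paddr = 0x12345678 and size = 0x2000, offset is 0x678 and three pages
 * are needed (0x678 + 0x2000 rounded up to page granularity).  The loop
 * maps them to 0x12345000, 0x12346000 and 0x12347000 starting at some
 * allocated aperture address A, and A + 0x678 is returned as the DMA
 * address handed back to the caller.
 */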
2722
2723/*
2724 * Does the reverse of the __map_single function. Must also be called
2725 * with the domain lock held.
2726 */
2727static void __unmap_single(struct dma_ops_domain *dma_dom,
2728			   dma_addr_t dma_addr,
2729			   size_t size,
2730			   int dir)
2731{
2732	dma_addr_t flush_addr;
2733	dma_addr_t i, start;
2734	unsigned int pages;
2735
2736	if ((dma_addr == DMA_ERROR_CODE) ||
2737	    (dma_addr + size > dma_dom->aperture_size))
2738		return;
2739
2740	flush_addr = dma_addr;
2741	pages = iommu_num_pages(dma_addr, size, PAGE_SIZE);
2742	dma_addr &= PAGE_MASK;
2743	start = dma_addr;
2744
2745	for (i = 0; i < pages; ++i) {
2746		dma_ops_domain_unmap(dma_dom, start);
2747		start += PAGE_SIZE;
2748	}
2749
2750	SUB_STATS_COUNTER(alloced_io_mem, size);
2751
2752	dma_ops_free_addresses(dma_dom, dma_addr, pages);
2753
2754	if (amd_iommu_unmap_flush || dma_dom->need_flush) {
2755		domain_flush_pages(&dma_dom->domain, flush_addr, size);
2756		dma_dom->need_flush = false;
2757	}
2758}
2759
2760/*
2761 * The exported map_page function for dma_ops.
2762 */
2763static dma_addr_t map_page(struct device *dev, struct page *page,
2764			   unsigned long offset, size_t size,
2765			   enum dma_data_direction dir,
2766			   struct dma_attrs *attrs)
2767{
2768	unsigned long flags;
2769	struct protection_domain *domain;
2770	dma_addr_t addr;
2771	u64 dma_mask;
2772	phys_addr_t paddr = page_to_phys(page) + offset;
2773
2774	INC_STATS_COUNTER(cnt_map_single);
2775
2776	domain = get_domain(dev);
2777	if (PTR_ERR(domain) == -EINVAL)
2778		return (dma_addr_t)paddr;
2779	else if (IS_ERR(domain))
2780		return DMA_ERROR_CODE;
2781
2782	dma_mask = *dev->dma_mask;
2783
2784	spin_lock_irqsave(&domain->lock, flags);
2785
2786	addr = __map_single(dev, domain->priv, paddr, size, dir, false,
2787			    dma_mask);
2788	if (addr == DMA_ERROR_CODE)
2789		goto out;
2790
2791	domain_flush_complete(domain);
2792
2793out:
2794	spin_unlock_irqrestore(&domain->lock, flags);
2795
2796	return addr;
2797}
2798
2799/*
2800 * The exported unmap_single function for dma_ops.
2801 */
2802static void unmap_page(struct device *dev, dma_addr_t dma_addr, size_t size,
2803		       enum dma_data_direction dir, struct dma_attrs *attrs)
2804{
2805	unsigned long flags;
2806	struct protection_domain *domain;
2807
2808	INC_STATS_COUNTER(cnt_unmap_single);
2809
2810	domain = get_domain(dev);
2811	if (IS_ERR(domain))
2812		return;
2813
2814	spin_lock_irqsave(&domain->lock, flags);
2815
2816	__unmap_single(domain->priv, dma_addr, size, dir);
2817
2818	domain_flush_complete(domain);
2819
2820	spin_unlock_irqrestore(&domain->lock, flags);
2821}
2822
2823/*
2824 * The exported map_sg function for dma_ops (handles scatter-gather
2825 * lists).
2826 */
2827static int map_sg(struct device *dev, struct scatterlist *sglist,
2828		  int nelems, enum dma_data_direction dir,
2829		  struct dma_attrs *attrs)
2830{
2831	unsigned long flags;
2832	struct protection_domain *domain;
2833	int i;
2834	struct scatterlist *s;
2835	phys_addr_t paddr;
2836	int mapped_elems = 0;
2837	u64 dma_mask;
2838
2839	INC_STATS_COUNTER(cnt_map_sg);
2840
2841	domain = get_domain(dev);
2842	if (IS_ERR(domain))
2843		return 0;
2844
2845	dma_mask = *dev->dma_mask;
2846
2847	spin_lock_irqsave(&domain->lock, flags);
2848
2849	for_each_sg(sglist, s, nelems, i) {
2850		paddr = sg_phys(s);
2851
2852		s->dma_address = __map_single(dev, domain->priv,
2853					      paddr, s->length, dir, false,
2854					      dma_mask);
2855
2856		if (s->dma_address) {
2857			s->dma_length = s->length;
2858			mapped_elems++;
2859		} else
2860			goto unmap;
2861	}
2862
2863	domain_flush_complete(domain);
2864
2865out:
2866	spin_unlock_irqrestore(&domain->lock, flags);
2867
2868	return mapped_elems;
2869unmap:
2870	for_each_sg(sglist, s, mapped_elems, i) {
2871		if (s->dma_address)
2872			__unmap_single(domain->priv, s->dma_address,
2873				       s->dma_length, dir);
2874		s->dma_address = s->dma_length = 0;
2875	}
2876
2877	mapped_elems = 0;
2878
2879	goto out;
2880}
2881
2882/*
2883 * The exported unmap_sg function for dma_ops (handles scatter-gather
2884 * lists).
2885 */
2886static void unmap_sg(struct device *dev, struct scatterlist *sglist,
2887		     int nelems, enum dma_data_direction dir,
2888		     struct dma_attrs *attrs)
2889{
2890	unsigned long flags;
2891	struct protection_domain *domain;
2892	struct scatterlist *s;
2893	int i;
2894
2895	INC_STATS_COUNTER(cnt_unmap_sg);
2896
2897	domain = get_domain(dev);
2898	if (IS_ERR(domain))
2899		return;
2900
2901	spin_lock_irqsave(&domain->lock, flags);
2902
2903	for_each_sg(sglist, s, nelems, i) {
2904		__unmap_single(domain->priv, s->dma_address,
2905			       s->dma_length, dir);
2906		s->dma_address = s->dma_length = 0;
2907	}
2908
2909	domain_flush_complete(domain);
2910
2911	spin_unlock_irqrestore(&domain->lock, flags);
2912}
2913
2914/*
2915 * The exported alloc_coherent function for dma_ops.
2916 */
2917static void *alloc_coherent(struct device *dev, size_t size,
2918			    dma_addr_t *dma_addr, gfp_t flag,
2919			    struct dma_attrs *attrs)
2920{
2921	u64 dma_mask = dev->coherent_dma_mask;
2922	struct protection_domain *domain;
2923	unsigned long flags;
2924	struct page *page;
2925
2926	INC_STATS_COUNTER(cnt_alloc_coherent);
2927
2928	domain = get_domain(dev);
2929	if (PTR_ERR(domain) == -EINVAL) {
2930		page = alloc_pages(flag, get_order(size));
2931		*dma_addr = page_to_phys(page);
2932		return page_address(page);
2933	} else if (IS_ERR(domain))
2934		return NULL;
2935
2936	size	  = PAGE_ALIGN(size);
2937	dma_mask  = dev->coherent_dma_mask;
2938	flag     &= ~(__GFP_DMA | __GFP_HIGHMEM | __GFP_DMA32);
2939	flag     |= __GFP_ZERO;
2940
2941	page = alloc_pages(flag | __GFP_NOWARN,  get_order(size));
2942	if (!page) {
2943		if (!(flag & __GFP_WAIT))
2944			return NULL;
2945
2946		page = dma_alloc_from_contiguous(dev, size >> PAGE_SHIFT,
2947						 get_order(size));
2948		if (!page)
2949			return NULL;
2950	}
2951
2952	if (!dma_mask)
2953		dma_mask = *dev->dma_mask;
2954
2955	spin_lock_irqsave(&domain->lock, flags);
2956
2957	*dma_addr = __map_single(dev, domain->priv, page_to_phys(page),
2958				 size, DMA_BIDIRECTIONAL, true, dma_mask);
2959
2960	if (*dma_addr == DMA_ERROR_CODE) {
2961		spin_unlock_irqrestore(&domain->lock, flags);
2962		goto out_free;
2963	}
2964
2965	domain_flush_complete(domain);
2966
2967	spin_unlock_irqrestore(&domain->lock, flags);
2968
2969	return page_address(page);
2970
2971out_free:
2972
2973	if (!dma_release_from_contiguous(dev, page, size >> PAGE_SHIFT))
2974		__free_pages(page, get_order(size));
2975
2976	return NULL;
2977}
2978
2979/*
2980 * The exported free_coherent function for dma_ops.
2981 */
2982static void free_coherent(struct device *dev, size_t size,
2983			  void *virt_addr, dma_addr_t dma_addr,
2984			  struct dma_attrs *attrs)
2985{
2986	struct protection_domain *domain;
2987	unsigned long flags;
2988	struct page *page;
2989
2990	INC_STATS_COUNTER(cnt_free_coherent);
2991
2992	page = virt_to_page(virt_addr);
2993	size = PAGE_ALIGN(size);
2994
2995	domain = get_domain(dev);
2996	if (IS_ERR(domain))
2997		goto free_mem;
2998
2999	spin_lock_irqsave(&domain->lock, flags);
3000
3001	__unmap_single(domain->priv, dma_addr, size, DMA_BIDIRECTIONAL);
3002
3003	domain_flush_complete(domain);
3004
3005	spin_unlock_irqrestore(&domain->lock, flags);
3006
3007free_mem:
3008	if (!dma_release_from_contiguous(dev, page, size >> PAGE_SHIFT))
3009		__free_pages(page, get_order(size));
3010}
3011
3012/*
3013 * This function is called by the DMA layer to find out if we can handle a
3014 * particular device. It is part of the dma_ops.
3015 */
3016static int amd_iommu_dma_supported(struct device *dev, u64 mask)
3017{
3018	return check_device(dev);
3019}
3020
3021/*
3022 * The function for pre-allocating protection domains.
3023 *
3024 * Once the driver core informs the DMA layer when a driver grabs a
3025 * device, we won't need to preallocate the protection domains anymore.
3026 * For now we have to.
3027 */
3028static void __init prealloc_protection_domains(void)
3029{
3030	struct iommu_dev_data *dev_data;
3031	struct dma_ops_domain *dma_dom;
3032	struct pci_dev *dev = NULL;
3033	u16 devid;
3034
3035	for_each_pci_dev(dev) {
3036
3037		/* Do we handle this device? */
3038		if (!check_device(&dev->dev))
3039			continue;
3040
3041		dev_data = get_dev_data(&dev->dev);
3042		if (!amd_iommu_force_isolation && dev_data->iommu_v2) {
3043			/* Make sure passthrough domain is allocated */
3044			alloc_passthrough_domain();
3045			dev_data->passthrough = true;
3046			attach_device(&dev->dev, pt_domain);
3047			pr_info("AMD-Vi: Using passthrough domain for device %s\n",
3048				dev_name(&dev->dev));
3049		}
3050
3051		/* Is there already any domain for it? */
3052		if (domain_for_device(&dev->dev))
3053			continue;
3054
3055		devid = get_device_id(&dev->dev);
3056
3057		dma_dom = dma_ops_domain_alloc();
3058		if (!dma_dom)
3059			continue;
3060		init_unity_mappings_for_device(dma_dom, devid);
3061		dma_dom->target_dev = devid;
3062
3063		attach_device(&dev->dev, &dma_dom->domain);
3064
3065		list_add_tail(&dma_dom->list, &iommu_pd_list);
3066	}
3067}
3068
3069static struct dma_map_ops amd_iommu_dma_ops = {
3070	.alloc = alloc_coherent,
3071	.free = free_coherent,
3072	.map_page = map_page,
3073	.unmap_page = unmap_page,
3074	.map_sg = map_sg,
3075	.unmap_sg = unmap_sg,
3076	.dma_supported = amd_iommu_dma_supported,
3077};
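/*
 * Illustrative sketch of how these ops are reached (not part of the
 * driver): once dev->archdata.dma_ops points at amd_iommu_dma_ops, an
 * ordinary streaming-DMA call from a device driver such as the one below
 * is dispatched to map_page()/unmap_page() above.  "my_dev", "buf" and
 * "len" are made-up placeholders.
 *
 *	dma_addr_t handle = dma_map_single(my_dev, buf, len, DMA_TO_DEVICE);
 *
 *	if (dma_mapping_error(my_dev, handle))
 *		return -ENOMEM;
 *	... device performs DMA ...
 *	dma_unmap_single(my_dev, handle, len, DMA_TO_DEVICE);
 */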
3078
3079static unsigned device_dma_ops_init(void)
3080{
3081	struct iommu_dev_data *dev_data;
3082	struct pci_dev *pdev = NULL;
3083	unsigned unhandled = 0;
3084
3085	for_each_pci_dev(pdev) {
3086		if (!check_device(&pdev->dev)) {
3087
3088			iommu_ignore_device(&pdev->dev);
3089
3090			unhandled += 1;
3091			continue;
3092		}
3093
3094		dev_data = get_dev_data(&pdev->dev);
3095
3096		if (!dev_data->passthrough)
3097			pdev->dev.archdata.dma_ops = &amd_iommu_dma_ops;
3098		else
3099			pdev->dev.archdata.dma_ops = &nommu_dma_ops;
3100	}
3101
3102	return unhandled;
3103}
3104
3105/*
3106 * The function which hooks the AMD IOMMU driver into dma_ops.
3107 */
3108
3109void __init amd_iommu_init_api(void)
3110{
3111	bus_set_iommu(&pci_bus_type, &amd_iommu_ops);
3112}
3113
3114int __init amd_iommu_init_dma_ops(void)
3115{
3116	struct amd_iommu *iommu;
3117	int ret, unhandled;
3118
3119	/*
3120	 * First allocate a default protection domain for every IOMMU we
3121	 * found in the system. Devices not assigned to any other
3122	 * protection domain will be assigned to the default one.
3123	 */
3124	for_each_iommu(iommu) {
3125		iommu->default_dom = dma_ops_domain_alloc();
3126		if (iommu->default_dom == NULL)
3127			return -ENOMEM;
3128		iommu->default_dom->domain.flags |= PD_DEFAULT_MASK;
3129		ret = iommu_init_unity_mappings(iommu);
3130		if (ret)
3131			goto free_domains;
3132	}
3133
3134	/*
3135	 * Pre-allocate the protection domains for each device.
3136	 */
3137	prealloc_protection_domains();
3138
3139	iommu_detected = 1;
3140	swiotlb = 0;
3141
3142	/* Finally make our dma_ops visible to the device drivers */
3143	unhandled = device_dma_ops_init();
3144	if (unhandled && max_pfn > MAX_DMA32_PFN) {
3145		/* There are unhandled devices - initialize swiotlb for them */
3146		swiotlb = 1;
3147	}
3148
3149	amd_iommu_stats_init();
3150
3151	if (amd_iommu_unmap_flush)
3152		pr_info("AMD-Vi: IO/TLB flush on unmap enabled\n");
3153	else
3154		pr_info("AMD-Vi: Lazy IO/TLB flushing enabled\n");
3155
3156	return 0;
3157
3158free_domains:
3159
3160	for_each_iommu(iommu) {
3161		dma_ops_domain_free(iommu->default_dom);
3162	}
3163
3164	return ret;
3165}
3166
3167/*****************************************************************************
3168 *
3169 * The following functions belong to the exported interface of AMD IOMMU
3170 *
3171 * This interface allows access to lower level functions of the IOMMU
3172 * like protection domain handling and assignment of devices to domains,
3173 * which is not possible with the dma_ops interface.
3174 *
3175 *****************************************************************************/
3176
3177static void cleanup_domain(struct protection_domain *domain)
3178{
3179	struct iommu_dev_data *entry;
3180	unsigned long flags;
3181
3182	write_lock_irqsave(&amd_iommu_devtable_lock, flags);
3183
3184	while (!list_empty(&domain->dev_list)) {
3185		entry = list_first_entry(&domain->dev_list,
3186					 struct iommu_dev_data, list);
3187		__detach_device(entry);
3188	}
3189
3190	write_unlock_irqrestore(&amd_iommu_devtable_lock, flags);
3191}
3192
3193static void protection_domain_free(struct protection_domain *domain)
3194{
3195	if (!domain)
3196		return;
3197
3198	del_domain_from_list(domain);
3199
3200	if (domain->id)
3201		domain_id_free(domain->id);
3202
3203	kfree(domain);
3204}
3205
3206static struct protection_domain *protection_domain_alloc(void)
3207{
3208	struct protection_domain *domain;
3209
3210	domain = kzalloc(sizeof(*domain), GFP_KERNEL);
3211	if (!domain)
3212		return NULL;
3213
3214	spin_lock_init(&domain->lock);
3215	mutex_init(&domain->api_lock);
3216	domain->id = domain_id_alloc();
3217	if (!domain->id)
3218		goto out_err;
3219	INIT_LIST_HEAD(&domain->dev_list);
3220
3221	add_domain_to_list(domain);
3222
3223	return domain;
3224
3225out_err:
3226	kfree(domain);
3227
3228	return NULL;
3229}
3230
3231static int __init alloc_passthrough_domain(void)
3232{
3233	if (pt_domain != NULL)
3234		return 0;
3235
3236	/* allocate passthrough domain */
3237	pt_domain = protection_domain_alloc();
3238	if (!pt_domain)
3239		return -ENOMEM;
3240
3241	pt_domain->mode = PAGE_MODE_NONE;
3242
3243	return 0;
3244}
3245
3246static struct iommu_domain *amd_iommu_domain_alloc(unsigned type)
3247{
3248	struct protection_domain *pdomain;
3249
3250	/* We only support unmanaged domains for now */
3251	if (type != IOMMU_DOMAIN_UNMANAGED)
3252		return NULL;
3253
3254	pdomain = protection_domain_alloc();
3255	if (!pdomain)
3256		goto out_free;
3257
3258	pdomain->mode    = PAGE_MODE_3_LEVEL;
3259	pdomain->pt_root = (void *)get_zeroed_page(GFP_KERNEL);
3260	if (!pdomain->pt_root)
3261		goto out_free;
3262
3263	pdomain->domain.geometry.aperture_start = 0;
3264	pdomain->domain.geometry.aperture_end   = ~0ULL;
3265	pdomain->domain.geometry.force_aperture = true;
3266
3267	return &pdomain->domain;
3268
3269out_free:
3270	protection_domain_free(pdomain);
3271
3272	return NULL;
3273}
3274
3275static void amd_iommu_domain_free(struct iommu_domain *dom)
3276{
3277	struct protection_domain *domain;
3278
3279	if (!dom)
3280		return;
3281
3282	domain = to_pdomain(dom);
3283
3284	if (domain->dev_cnt > 0)
3285		cleanup_domain(domain);
3286
3287	BUG_ON(domain->dev_cnt != 0);
3288
3289	if (domain->mode != PAGE_MODE_NONE)
3290		free_pagetable(domain);
3291
3292	if (domain->flags & PD_IOMMUV2_MASK)
3293		free_gcr3_table(domain);
3294
3295	protection_domain_free(domain);
3296}
3297
3298static void amd_iommu_detach_device(struct iommu_domain *dom,
3299				    struct device *dev)
3300{
3301	struct iommu_dev_data *dev_data = dev->archdata.iommu;
3302	struct amd_iommu *iommu;
3303	u16 devid;
3304
3305	if (!check_device(dev))
3306		return;
3307
3308	devid = get_device_id(dev);
3309
3310	if (dev_data->domain != NULL)
3311		detach_device(dev);
3312
3313	iommu = amd_iommu_rlookup_table[devid];
3314	if (!iommu)
3315		return;
3316
3317	iommu_completion_wait(iommu);
3318}
3319
3320static int amd_iommu_attach_device(struct iommu_domain *dom,
3321				   struct device *dev)
3322{
3323	struct protection_domain *domain = to_pdomain(dom);
3324	struct iommu_dev_data *dev_data;
3325	struct amd_iommu *iommu;
3326	int ret;
3327
3328	if (!check_device(dev))
3329		return -EINVAL;
3330
3331	dev_data = dev->archdata.iommu;
3332
3333	iommu = amd_iommu_rlookup_table[dev_data->devid];
3334	if (!iommu)
3335		return -EINVAL;
3336
3337	if (dev_data->domain)
3338		detach_device(dev);
3339
3340	ret = attach_device(dev, domain);
3341
3342	iommu_completion_wait(iommu);
3343
3344	return ret;
3345}
3346
3347static int amd_iommu_map(struct iommu_domain *dom, unsigned long iova,
3348			 phys_addr_t paddr, size_t page_size, int iommu_prot)
3349{
3350	struct protection_domain *domain = to_pdomain(dom);
3351	int prot = 0;
3352	int ret;
3353
3354	if (domain->mode == PAGE_MODE_NONE)
3355		return -EINVAL;
3356
3357	if (iommu_prot & IOMMU_READ)
3358		prot |= IOMMU_PROT_IR;
3359	if (iommu_prot & IOMMU_WRITE)
3360		prot |= IOMMU_PROT_IW;
3361
3362	mutex_lock(&domain->api_lock);
3363	ret = iommu_map_page(domain, iova, paddr, prot, page_size);
3364	mutex_unlock(&domain->api_lock);
3365
3366	return ret;
3367}
3368
3369static size_t amd_iommu_unmap(struct iommu_domain *dom, unsigned long iova,
3370			   size_t page_size)
3371{
3372	struct protection_domain *domain = to_pdomain(dom);
3373	size_t unmap_size;
3374
3375	if (domain->mode == PAGE_MODE_NONE)
3376		return -EINVAL;
3377
3378	mutex_lock(&domain->api_lock);
3379	unmap_size = iommu_unmap_page(domain, iova, page_size);
3380	mutex_unlock(&domain->api_lock);
3381
3382	domain_flush_tlb_pde(domain);
3383
3384	return unmap_size;
3385}
3386
3387static phys_addr_t amd_iommu_iova_to_phys(struct iommu_domain *dom,
3388					  dma_addr_t iova)
3389{
3390	struct protection_domain *domain = to_pdomain(dom);
3391	unsigned long offset_mask, pte_pgsize;
3392	u64 *pte, __pte;
3393
3394	if (domain->mode == PAGE_MODE_NONE)
3395		return iova;
3396
3397	pte = fetch_pte(domain, iova, &pte_pgsize);
3398
3399	if (!pte || !IOMMU_PTE_PRESENT(*pte))
3400		return 0;
3401
3402	offset_mask = pte_pgsize - 1;
3403	__pte	    = *pte & PM_ADDR_MASK;
3404
3405	return (__pte & ~offset_mask) | (iova & offset_mask);
3406}
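/*
 * Example of the calculation above (illustrative numbers): if the walk
 * ends on a 2 MiB PTE, pte_pgsize is 0x200000 and offset_mask is
 * 0x1fffff, so for iova 0x4012f000 the low 21 bits (0x12f000) of the
 * IOVA are merged back into the 2 MiB-aligned address from the PTE.
 */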
3407
3408static bool amd_iommu_capable(enum iommu_cap cap)
3409{
3410	switch (cap) {
3411	case IOMMU_CAP_CACHE_COHERENCY:
3412		return true;
3413	case IOMMU_CAP_INTR_REMAP:
3414		return (irq_remapping_enabled == 1);
3415	case IOMMU_CAP_NOEXEC:
3416		return false;
3417	}
3418
3419	return false;
3420}
3421
3422static const struct iommu_ops amd_iommu_ops = {
3423	.capable = amd_iommu_capable,
3424	.domain_alloc = amd_iommu_domain_alloc,
3425	.domain_free  = amd_iommu_domain_free,
3426	.attach_dev = amd_iommu_attach_device,
3427	.detach_dev = amd_iommu_detach_device,
3428	.map = amd_iommu_map,
3429	.unmap = amd_iommu_unmap,
3430	.map_sg = default_iommu_map_sg,
3431	.iova_to_phys = amd_iommu_iova_to_phys,
3432	.pgsize_bitmap	= AMD_IOMMU_PGSIZES,
3433};
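/*
 * Illustrative sketch (placeholder names, error handling trimmed) of how
 * the callbacks above are reached through the generic IOMMU API:
 *
 *	struct iommu_domain *dom = iommu_domain_alloc(&pci_bus_type);
 *
 *	iommu_attach_device(dom, &pdev->dev);	(-> amd_iommu_attach_device)
 *	iommu_map(dom, iova, paddr, SZ_2M,
 *		  IOMMU_READ | IOMMU_WRITE);	(-> amd_iommu_map)
 *	phys = iommu_iova_to_phys(dom, iova);	(-> amd_iommu_iova_to_phys)
 */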
3434
3435/*****************************************************************************
3436 *
3437 * The next functions do a basic initialization of the IOMMU for pass-through
3438 * mode.
3439 *
3440 * In passthrough mode the IOMMU is initialized and enabled but not used for
3441 * DMA-API translation.
3442 *
3443 *****************************************************************************/
3444
3445int __init amd_iommu_init_passthrough(void)
3446{
3447	struct iommu_dev_data *dev_data;
3448	struct pci_dev *dev = NULL;
3449	int ret;
3450
3451	ret = alloc_passthrough_domain();
3452	if (ret)
3453		return ret;
3454
3455	for_each_pci_dev(dev) {
3456		if (!check_device(&dev->dev))
3457			continue;
3458
3459		dev_data = get_dev_data(&dev->dev);
3460		dev_data->passthrough = true;
3461
3462		attach_device(&dev->dev, pt_domain);
3463	}
3464
3465	amd_iommu_stats_init();
3466
3467	pr_info("AMD-Vi: Initialized for Passthrough Mode\n");
3468
3469	return 0;
3470}
3471
3472/* IOMMUv2 specific functions */
3473int amd_iommu_register_ppr_notifier(struct notifier_block *nb)
3474{
3475	return atomic_notifier_chain_register(&ppr_notifier, nb);
3476}
3477EXPORT_SYMBOL(amd_iommu_register_ppr_notifier);
3478
3479int amd_iommu_unregister_ppr_notifier(struct notifier_block *nb)
3480{
3481	return atomic_notifier_chain_unregister(&ppr_notifier, nb);
3482}
3483EXPORT_SYMBOL(amd_iommu_unregister_ppr_notifier);
3484
3485void amd_iommu_domain_direct_map(struct iommu_domain *dom)
3486{
3487	struct protection_domain *domain = to_pdomain(dom);
3488	unsigned long flags;
3489
3490	spin_lock_irqsave(&domain->lock, flags);
3491
3492	/* Update data structure */
3493	domain->mode    = PAGE_MODE_NONE;
3494	domain->updated = true;
3495
3496	/* Make changes visible to IOMMUs */
3497	update_domain(domain);
3498
3499	/* Page-table is not visible to IOMMU anymore, so free it */
3500	free_pagetable(domain);
3501
3502	spin_unlock_irqrestore(&domain->lock, flags);
3503}
3504EXPORT_SYMBOL(amd_iommu_domain_direct_map);
3505
3506int amd_iommu_domain_enable_v2(struct iommu_domain *dom, int pasids)
3507{
3508	struct protection_domain *domain = to_pdomain(dom);
3509	unsigned long flags;
3510	int levels, ret;
3511
3512	if (pasids <= 0 || pasids > (PASID_MASK + 1))
3513		return -EINVAL;
3514
3515	/* Number of GCR3 table levels required */
3516	for (levels = 0; (pasids - 1) & ~0x1ff; pasids >>= 9)
3517		levels += 1;
3518
3519	if (levels > amd_iommu_max_glx_val)
3520		return -EINVAL;
3521
3522	spin_lock_irqsave(&domain->lock, flags);
3523
3524	/*
3525	 * Spare ourselves the sanity checks of whether devices already in
3526	 * the domain support IOMMUv2 by requiring that the domain has no
3527	 * devices attached when it is switched into IOMMUv2 mode.
3528	 */
3529	ret = -EBUSY;
3530	if (domain->dev_cnt > 0 || domain->flags & PD_IOMMUV2_MASK)
3531		goto out;
3532
3533	ret = -ENOMEM;
3534	domain->gcr3_tbl = (void *)get_zeroed_page(GFP_ATOMIC);
3535	if (domain->gcr3_tbl == NULL)
3536		goto out;
3537
3538	domain->glx      = levels;
3539	domain->flags   |= PD_IOMMUV2_MASK;
3540	domain->updated  = true;
3541
3542	update_domain(domain);
3543
3544	ret = 0;
3545
3546out:
3547	spin_unlock_irqrestore(&domain->lock, flags);
3548
3549	return ret;
3550}
3551EXPORT_SYMBOL(amd_iommu_domain_enable_v2);
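/*
 * Rough usage sketch for an IOMMUv2 consumer (placeholder domain, pasid
 * and cr3; error handling trimmed).  The level computation above yields
 * glx = 1 for 65536 PASIDs, since a single shift by 9 brings
 * (pasids - 1) below 512:
 *
 *	amd_iommu_domain_direct_map(dom);	   (host page table dropped)
 *	amd_iommu_domain_enable_v2(dom, 1 << 16);  (65536 PASIDs -> glx = 1)
 *	amd_iommu_domain_set_gcr3(dom, pasid, cr3);
 */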
3552
3553static int __flush_pasid(struct protection_domain *domain, int pasid,
3554			 u64 address, bool size)
3555{
3556	struct iommu_dev_data *dev_data;
3557	struct iommu_cmd cmd;
3558	int i, ret;
3559
3560	if (!(domain->flags & PD_IOMMUV2_MASK))
3561		return -EINVAL;
3562
3563	build_inv_iommu_pasid(&cmd, domain->id, pasid, address, size);
3564
3565	/*
3566	 * The IOMMU TLB needs to be flushed before the device TLB to
3567	 * prevent the device TLB from being refilled with stale IOMMU TLB entries
3568	 */
3569	for (i = 0; i < amd_iommus_present; ++i) {
3570		if (domain->dev_iommu[i] == 0)
3571			continue;
3572
3573		ret = iommu_queue_command(amd_iommus[i], &cmd);
3574		if (ret != 0)
3575			goto out;
3576	}
3577
3578	/* Wait until IOMMU TLB flushes are complete */
3579	domain_flush_complete(domain);
3580
3581	/* Now flush device TLBs */
3582	list_for_each_entry(dev_data, &domain->dev_list, list) {
3583		struct amd_iommu *iommu;
3584		int qdep;
3585
3586		BUG_ON(!dev_data->ats.enabled);
3587
3588		qdep  = dev_data->ats.qdep;
3589		iommu = amd_iommu_rlookup_table[dev_data->devid];
3590
3591		build_inv_iotlb_pasid(&cmd, dev_data->devid, pasid,
3592				      qdep, address, size);
3593
3594		ret = iommu_queue_command(iommu, &cmd);
3595		if (ret != 0)
3596			goto out;
3597	}
3598
3599	/* Wait until all device TLBs are flushed */
3600	domain_flush_complete(domain);
3601
3602	ret = 0;
3603
3604out:
3605
3606	return ret;
3607}
3608
3609static int __amd_iommu_flush_page(struct protection_domain *domain, int pasid,
3610				  u64 address)
3611{
3612	INC_STATS_COUNTER(invalidate_iotlb);
3613
3614	return __flush_pasid(domain, pasid, address, false);
3615}
3616
3617int amd_iommu_flush_page(struct iommu_domain *dom, int pasid,
3618			 u64 address)
3619{
3620	struct protection_domain *domain = to_pdomain(dom);
3621	unsigned long flags;
3622	int ret;
3623
3624	spin_lock_irqsave(&domain->lock, flags);
3625	ret = __amd_iommu_flush_page(domain, pasid, address);
3626	spin_unlock_irqrestore(&domain->lock, flags);
3627
3628	return ret;
3629}
3630EXPORT_SYMBOL(amd_iommu_flush_page);
3631
3632static int __amd_iommu_flush_tlb(struct protection_domain *domain, int pasid)
3633{
3634	INC_STATS_COUNTER(invalidate_iotlb_all);
3635
3636	return __flush_pasid(domain, pasid, CMD_INV_IOMMU_ALL_PAGES_ADDRESS,
3637			     true);
3638}
3639
3640int amd_iommu_flush_tlb(struct iommu_domain *dom, int pasid)
3641{
3642	struct protection_domain *domain = to_pdomain(dom);
3643	unsigned long flags;
3644	int ret;
3645
3646	spin_lock_irqsave(&domain->lock, flags);
3647	ret = __amd_iommu_flush_tlb(domain, pasid);
3648	spin_unlock_irqrestore(&domain->lock, flags);
3649
3650	return ret;
3651}
3652EXPORT_SYMBOL(amd_iommu_flush_tlb);
3653
3654static u64 *__get_gcr3_pte(u64 *root, int level, int pasid, bool alloc)
3655{
3656	int index;
3657	u64 *pte;
3658
3659	while (true) {
3660
3661		index = (pasid >> (9 * level)) & 0x1ff;
3662		pte   = &root[index];
3663
3664		if (level == 0)
3665			break;
3666
3667		if (!(*pte & GCR3_VALID)) {
3668			if (!alloc)
3669				return NULL;
3670
3671			root = (void *)get_zeroed_page(GFP_ATOMIC);
3672			if (root == NULL)
3673				return NULL;
3674
3675			*pte = __pa(root) | GCR3_VALID;
3676		}
3677
3678		root = __va(*pte & PAGE_MASK);
3679
3680		level -= 1;
3681	}
3682
3683	return pte;
3684}
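/*
 * Index arithmetic example for the walk above, assuming glx = 1 (two
 * levels): pasid 0x12345 uses level-1 index (0x12345 >> 9) & 0x1ff = 0x91
 * and level-0 index 0x12345 & 0x1ff = 0x145, so the GCR3 value ends up in
 * entry 0x145 of the second-level table referenced by entry 0x91.
 */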
3685
3686static int __set_gcr3(struct protection_domain *domain, int pasid,
3687		      unsigned long cr3)
3688{
3689	u64 *pte;
3690
3691	if (domain->mode != PAGE_MODE_NONE)
3692		return -EINVAL;
3693
3694	pte = __get_gcr3_pte(domain->gcr3_tbl, domain->glx, pasid, true);
3695	if (pte == NULL)
3696		return -ENOMEM;
3697
3698	*pte = (cr3 & PAGE_MASK) | GCR3_VALID;
3699
3700	return __amd_iommu_flush_tlb(domain, pasid);
3701}
3702
3703static int __clear_gcr3(struct protection_domain *domain, int pasid)
3704{
3705	u64 *pte;
3706
3707	if (domain->mode != PAGE_MODE_NONE)
3708		return -EINVAL;
3709
3710	pte = __get_gcr3_pte(domain->gcr3_tbl, domain->glx, pasid, false);
3711	if (pte == NULL)
3712		return 0;
3713
3714	*pte = 0;
3715
3716	return __amd_iommu_flush_tlb(domain, pasid);
3717}
3718
3719int amd_iommu_domain_set_gcr3(struct iommu_domain *dom, int pasid,
3720			      unsigned long cr3)
3721{
3722	struct protection_domain *domain = to_pdomain(dom);
3723	unsigned long flags;
3724	int ret;
3725
3726	spin_lock_irqsave(&domain->lock, flags);
3727	ret = __set_gcr3(domain, pasid, cr3);
3728	spin_unlock_irqrestore(&domain->lock, flags);
3729
3730	return ret;
3731}
3732EXPORT_SYMBOL(amd_iommu_domain_set_gcr3);
3733
3734int amd_iommu_domain_clear_gcr3(struct iommu_domain *dom, int pasid)
3735{
3736	struct protection_domain *domain = to_pdomain(dom);
3737	unsigned long flags;
3738	int ret;
3739
3740	spin_lock_irqsave(&domain->lock, flags);
3741	ret = __clear_gcr3(domain, pasid);
3742	spin_unlock_irqrestore(&domain->lock, flags);
3743
3744	return ret;
3745}
3746EXPORT_SYMBOL(amd_iommu_domain_clear_gcr3);
3747
3748int amd_iommu_complete_ppr(struct pci_dev *pdev, int pasid,
3749			   int status, int tag)
3750{
3751	struct iommu_dev_data *dev_data;
3752	struct amd_iommu *iommu;
3753	struct iommu_cmd cmd;
3754
3755	INC_STATS_COUNTER(complete_ppr);
3756
3757	dev_data = get_dev_data(&pdev->dev);
3758	iommu    = amd_iommu_rlookup_table[dev_data->devid];
3759
3760	build_complete_ppr(&cmd, dev_data->devid, pasid, status,
3761			   tag, dev_data->pri_tlp);
3762
3763	return iommu_queue_command(iommu, &cmd);
3764}
3765EXPORT_SYMBOL(amd_iommu_complete_ppr);
3766
3767struct iommu_domain *amd_iommu_get_v2_domain(struct pci_dev *pdev)
3768{
3769	struct protection_domain *pdomain;
3770
3771	pdomain = get_domain(&pdev->dev);
3772	if (IS_ERR(pdomain))
3773		return NULL;
3774
3775	/* Only return IOMMUv2 domains */
3776	if (!(pdomain->flags & PD_IOMMUV2_MASK))
3777		return NULL;
3778
3779	return &pdomain->domain;
3780}
3781EXPORT_SYMBOL(amd_iommu_get_v2_domain);
3782
3783void amd_iommu_enable_device_erratum(struct pci_dev *pdev, u32 erratum)
3784{
3785	struct iommu_dev_data *dev_data;
3786
3787	if (!amd_iommu_v2_supported())
3788		return;
3789
3790	dev_data = get_dev_data(&pdev->dev);
3791	dev_data->errata |= (1 << erratum);
3792}
3793EXPORT_SYMBOL(amd_iommu_enable_device_erratum);
3794
3795int amd_iommu_device_info(struct pci_dev *pdev,
3796                          struct amd_iommu_device_info *info)
3797{
3798	int max_pasids;
3799	int pos;
3800
3801	if (pdev == NULL || info == NULL)
3802		return -EINVAL;
3803
3804	if (!amd_iommu_v2_supported())
3805		return -EINVAL;
3806
3807	memset(info, 0, sizeof(*info));
3808
3809	pos = pci_find_ext_capability(pdev, PCI_EXT_CAP_ID_ATS);
3810	if (pos)
3811		info->flags |= AMD_IOMMU_DEVICE_FLAG_ATS_SUP;
3812
3813	pos = pci_find_ext_capability(pdev, PCI_EXT_CAP_ID_PRI);
3814	if (pos)
3815		info->flags |= AMD_IOMMU_DEVICE_FLAG_PRI_SUP;
3816
3817	pos = pci_find_ext_capability(pdev, PCI_EXT_CAP_ID_PASID);
3818	if (pos) {
3819		int features;
3820
3821		max_pasids = 1 << (9 * (amd_iommu_max_glx_val + 1));
3822		max_pasids = min(max_pasids, (1 << 20));
3823
3824		info->flags |= AMD_IOMMU_DEVICE_FLAG_PASID_SUP;
3825		info->max_pasids = min(pci_max_pasids(pdev), max_pasids);
3826
3827		features = pci_pasid_features(pdev);
3828		if (features & PCI_PASID_CAP_EXEC)
3829			info->flags |= AMD_IOMMU_DEVICE_FLAG_EXEC_SUP;
3830		if (features & PCI_PASID_CAP_PRIV)
3831			info->flags |= AMD_IOMMU_DEVICE_FLAG_PRIV_SUP;
3832	}
3833
3834	return 0;
3835}
3836EXPORT_SYMBOL(amd_iommu_device_info);
3837
3838#ifdef CONFIG_IRQ_REMAP
3839
3840/*****************************************************************************
3841 *
3842 * Interrupt Remapping Implementation
3843 *
3844 *****************************************************************************/
3845
3846union irte {
3847	u32 val;
3848	struct {
3849		u32 valid	: 1,
3850		    no_fault	: 1,
3851		    int_type	: 3,
3852		    rq_eoi	: 1,
3853		    dm		: 1,
3854		    rsvd_1	: 1,
3855		    destination	: 8,
3856		    vector	: 8,
3857		    rsvd_2	: 8;
3858	} fields;
3859};
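/*
 * Bit-layout example (illustrative values): an IRTE with valid = 1, fixed
 * delivery (int_type = 0), physical destination APIC ID 0x02 and vector
 * 0x41 encodes as val = 0x00410201 - bit 0 is the valid bit, bits 8-15
 * carry the destination and bits 16-23 the vector.
 */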
3860
3861#define DTE_IRQ_PHYS_ADDR_MASK	(((1ULL << 45)-1) << 6)
3862#define DTE_IRQ_REMAP_INTCTL    (2ULL << 60)
3863#define DTE_IRQ_TABLE_LEN       (8ULL << 1)
3864#define DTE_IRQ_REMAP_ENABLE    1ULL
3865
3866static void set_dte_irq_entry(u16 devid, struct irq_remap_table *table)
3867{
3868	u64 dte;
3869
3870	dte	= amd_iommu_dev_table[devid].data[2];
3871	dte	&= ~DTE_IRQ_PHYS_ADDR_MASK;
3872	dte	|= virt_to_phys(table->table);
3873	dte	|= DTE_IRQ_REMAP_INTCTL;
3874	dte	|= DTE_IRQ_TABLE_LEN;
3875	dte	|= DTE_IRQ_REMAP_ENABLE;
3876
3877	amd_iommu_dev_table[devid].data[2] = dte;
3878}
3879
3880#define IRTE_ALLOCATED (~1U)
3881
3882static struct irq_remap_table *get_irq_table(u16 devid, bool ioapic)
3883{
3884	struct irq_remap_table *table = NULL;
3885	struct amd_iommu *iommu;
3886	unsigned long flags;
3887	u16 alias;
3888
3889	write_lock_irqsave(&amd_iommu_devtable_lock, flags);
3890
3891	iommu = amd_iommu_rlookup_table[devid];
3892	if (!iommu)
3893		goto out_unlock;
3894
3895	table = irq_lookup_table[devid];
3896	if (table)
3897		goto out;
3898
3899	alias = amd_iommu_alias_table[devid];
3900	table = irq_lookup_table[alias];
3901	if (table) {
3902		irq_lookup_table[devid] = table;
3903		set_dte_irq_entry(devid, table);
3904		iommu_flush_dte(iommu, devid);
3905		goto out;
3906	}
3907
3908	/* Nothing there yet, allocate new irq remapping table */
3909	table = kzalloc(sizeof(*table), GFP_ATOMIC);
3910	if (!table)
3911		goto out;
3912
3913	/* Initialize table spin-lock */
3914	spin_lock_init(&table->lock);
3915
3916	if (ioapic)
3917		/* Keep the first 32 indexes free for IOAPIC interrupts */
3918		table->min_index = 32;
3919
3920	table->table = kmem_cache_alloc(amd_iommu_irq_cache, GFP_ATOMIC);
3921	if (!table->table) {
3922		kfree(table);
3923		table = NULL;
3924		goto out;
3925	}
3926
3927	memset(table->table, 0, MAX_IRQS_PER_TABLE * sizeof(u32));
3928
3929	if (ioapic) {
3930		int i;
3931
3932		for (i = 0; i < 32; ++i)
3933			table->table[i] = IRTE_ALLOCATED;
3934	}
3935
3936	irq_lookup_table[devid] = table;
3937	set_dte_irq_entry(devid, table);
3938	iommu_flush_dte(iommu, devid);
3939	if (devid != alias) {
3940		irq_lookup_table[alias] = table;
3941		set_dte_irq_entry(alias, table);
3942		iommu_flush_dte(iommu, alias);
3943	}
3944
3945out:
3946	iommu_completion_wait(iommu);
3947
3948out_unlock:
3949	write_unlock_irqrestore(&amd_iommu_devtable_lock, flags);
3950
3951	return table;
3952}
3953
3954static int alloc_irq_index(struct irq_cfg *cfg, u16 devid, int count)
3955{
3956	struct irq_remap_table *table;
3957	unsigned long flags;
3958	int index, c;
3959
3960	table = get_irq_table(devid, false);
3961	if (!table)
3962		return -ENODEV;
3963
3964	spin_lock_irqsave(&table->lock, flags);
3965
3966	/* Scan table for free entries */
3967	for (c = 0, index = table->min_index;
3968	     index < MAX_IRQS_PER_TABLE;
3969	     ++index) {
3970		if (table->table[index] == 0)
3971			c += 1;
3972		else
3973			c = 0;
3974
3975		if (c == count)	{
3976			struct irq_2_irte *irte_info;
3977
3978			for (; c != 0; --c)
3979				table->table[index - c + 1] = IRTE_ALLOCATED;
3980
3981			index -= count - 1;
3982
3983			cfg->remapped	      = 1;
3984			irte_info             = &cfg->irq_2_irte;
3985			irte_info->devid      = devid;
3986			irte_info->index      = index;
3987
3988			goto out;
3989		}
3990	}
3991
3992	index = -ENOSPC;
3993
3994out:
3995	spin_unlock_irqrestore(&table->lock, flags);
3996
3997	return index;
3998}
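/*
 * Scan example (illustrative): a request for count = 2 when entries 34
 * and 35 are the first two adjacent free slots reaches c == 2 at index
 * 35, marks both slots IRTE_ALLOCATED and returns 34.
 */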
3999
4000static int get_irte(u16 devid, int index, union irte *irte)
4001{
4002	struct irq_remap_table *table;
4003	unsigned long flags;
4004
4005	table = get_irq_table(devid, false);
4006	if (!table)
4007		return -ENOMEM;
4008
4009	spin_lock_irqsave(&table->lock, flags);
4010	irte->val = table->table[index];
4011	spin_unlock_irqrestore(&table->lock, flags);
4012
4013	return 0;
4014}
4015
4016static int modify_irte(u16 devid, int index, union irte irte)
4017{
4018	struct irq_remap_table *table;
4019	struct amd_iommu *iommu;
4020	unsigned long flags;
4021
4022	iommu = amd_iommu_rlookup_table[devid];
4023	if (iommu == NULL)
4024		return -EINVAL;
4025
4026	table = get_irq_table(devid, false);
4027	if (!table)
4028		return -ENOMEM;
4029
4030	spin_lock_irqsave(&table->lock, flags);
4031	table->table[index] = irte.val;
4032	spin_unlock_irqrestore(&table->lock, flags);
4033
4034	iommu_flush_irt(iommu, devid);
4035	iommu_completion_wait(iommu);
4036
4037	return 0;
4038}
4039
4040static void free_irte(u16 devid, int index)
4041{
4042	struct irq_remap_table *table;
4043	struct amd_iommu *iommu;
4044	unsigned long flags;
4045
4046	iommu = amd_iommu_rlookup_table[devid];
4047	if (iommu == NULL)
4048		return;
4049
4050	table = get_irq_table(devid, false);
4051	if (!table)
4052		return;
4053
4054	spin_lock_irqsave(&table->lock, flags);
4055	table->table[index] = 0;
4056	spin_unlock_irqrestore(&table->lock, flags);
4057
4058	iommu_flush_irt(iommu, devid);
4059	iommu_completion_wait(iommu);
4060}
4061
4062static int setup_ioapic_entry(int irq, struct IO_APIC_route_entry *entry,
4063			      unsigned int destination, int vector,
4064			      struct io_apic_irq_attr *attr)
4065{
4066	struct irq_remap_table *table;
4067	struct irq_2_irte *irte_info;
4068	struct irq_cfg *cfg;
4069	union irte irte;
4070	int ioapic_id;
4071	int index;
4072	int devid;
4073	int ret;
4074
4075	cfg = irq_cfg(irq);
4076	if (!cfg)
4077		return -EINVAL;
4078
4079	irte_info = &cfg->irq_2_irte;
4080	ioapic_id = mpc_ioapic_id(attr->ioapic);
4081	devid     = get_ioapic_devid(ioapic_id);
4082
4083	if (devid < 0)
4084		return devid;
4085
4086	table = get_irq_table(devid, true);
4087	if (table == NULL)
4088		return -ENOMEM;
4089
4090	index = attr->ioapic_pin;
4091
4092	/* Setup IRQ remapping info */
4093	cfg->remapped	      = 1;
4094	irte_info->devid      = devid;
4095	irte_info->index      = index;
4096
4097	/* Setup IRTE for IOMMU */
4098	irte.val		= 0;
4099	irte.fields.vector      = vector;
4100	irte.fields.int_type    = apic->irq_delivery_mode;
4101	irte.fields.destination = destination;
4102	irte.fields.dm          = apic->irq_dest_mode;
4103	irte.fields.valid       = 1;
4104
4105	ret = modify_irte(devid, index, irte);
4106	if (ret)
4107		return ret;
4108
4109	/* Setup IOAPIC entry */
4110	memset(entry, 0, sizeof(*entry));
4111
4112	entry->vector        = index;
4113	entry->mask          = 0;
4114	entry->trigger       = attr->trigger;
4115	entry->polarity      = attr->polarity;
4116
4117	/*
4118	 * Mask level triggered irqs.
4119	 */
4120	if (attr->trigger)
4121		entry->mask = 1;
4122
4123	return 0;
4124}
4125
4126static int set_affinity(struct irq_data *data, const struct cpumask *mask,
4127			bool force)
4128{
4129	struct irq_2_irte *irte_info;
4130	unsigned int dest, irq;
4131	struct irq_cfg *cfg;
4132	union irte irte;
4133	int err;
4134
4135	if (!config_enabled(CONFIG_SMP))
4136		return -1;
4137
4138	cfg       = irqd_cfg(data);
4139	irq       = data->irq;
4140	irte_info = &cfg->irq_2_irte;
4141
4142	if (!cpumask_intersects(mask, cpu_online_mask))
4143		return -EINVAL;
4144
4145	if (get_irte(irte_info->devid, irte_info->index, &irte))
4146		return -EBUSY;
4147
4148	if (assign_irq_vector(irq, cfg, mask))
4149		return -EBUSY;
4150
4151	err = apic->cpu_mask_to_apicid_and(cfg->domain, mask, &dest);
4152	if (err) {
4153		if (assign_irq_vector(irq, cfg, data->affinity))
4154			pr_err("AMD-Vi: Failed to recover vector for irq %d\n", irq);
4155		return err;
4156	}
4157
4158	irte.fields.vector      = cfg->vector;
4159	irte.fields.destination = dest;
4160
4161	modify_irte(irte_info->devid, irte_info->index, irte);
4162
4163	if (cfg->move_in_progress)
4164		send_cleanup_vector(cfg);
4165
4166	cpumask_copy(data->affinity, mask);
4167
4168	return 0;
4169}
4170
4171static int free_irq(int irq)
4172{
4173	struct irq_2_irte *irte_info;
4174	struct irq_cfg *cfg;
4175
4176	cfg = irq_cfg(irq);
4177	if (!cfg)
4178		return -EINVAL;
4179
4180	irte_info = &cfg->irq_2_irte;
4181
4182	free_irte(irte_info->devid, irte_info->index);
4183
4184	return 0;
4185}
4186
4187static void compose_msi_msg(struct pci_dev *pdev,
4188			    unsigned int irq, unsigned int dest,
4189			    struct msi_msg *msg, u8 hpet_id)
4190{
4191	struct irq_2_irte *irte_info;
4192	struct irq_cfg *cfg;
4193	union irte irte;
4194
4195	cfg = irq_cfg(irq);
4196	if (!cfg)
4197		return;
4198
4199	irte_info = &cfg->irq_2_irte;
4200
4201	irte.val		= 0;
4202	irte.fields.vector	= cfg->vector;
4203	irte.fields.int_type    = apic->irq_delivery_mode;
4204	irte.fields.destination	= dest;
4205	irte.fields.dm		= apic->irq_dest_mode;
4206	irte.fields.valid	= 1;
4207
4208	modify_irte(irte_info->devid, irte_info->index, irte);
4209
4210	msg->address_hi = MSI_ADDR_BASE_HI;
4211	msg->address_lo = MSI_ADDR_BASE_LO;
4212	msg->data       = irte_info->index;
4213}
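/*
 * Note that in remapped mode the MSI data programmed above carries the
 * IRTE index rather than a CPU vector; the IOMMU resolves the actual
 * vector and destination from the IRTE written via modify_irte().
 */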
4214
4215static int msi_alloc_irq(struct pci_dev *pdev, int irq, int nvec)
4216{
4217	struct irq_cfg *cfg;
4218	int index;
4219	u16 devid;
4220
4221	if (!pdev)
4222		return -EINVAL;
4223
4224	cfg = irq_cfg(irq);
4225	if (!cfg)
4226		return -EINVAL;
4227
4228	devid = get_device_id(&pdev->dev);
4229	index = alloc_irq_index(cfg, devid, nvec);
4230
4231	return index < 0 ? MAX_IRQS_PER_TABLE : index;
4232}
4233
4234static int msi_setup_irq(struct pci_dev *pdev, unsigned int irq,
4235			 int index, int offset)
4236{
4237	struct irq_2_irte *irte_info;
4238	struct irq_cfg *cfg;
4239	u16 devid;
4240
4241	if (!pdev)
4242		return -EINVAL;
4243
4244	cfg = irq_cfg(irq);
4245	if (!cfg)
4246		return -EINVAL;
4247
4248	if (index >= MAX_IRQS_PER_TABLE)
4249		return 0;
4250
4251	devid		= get_device_id(&pdev->dev);
4252	irte_info	= &cfg->irq_2_irte;
4253
4254	cfg->remapped	      = 1;
4255	irte_info->devid      = devid;
4256	irte_info->index      = index + offset;
4257
4258	return 0;
4259}
4260
4261static int alloc_hpet_msi(unsigned int irq, unsigned int id)
4262{
4263	struct irq_2_irte *irte_info;
4264	struct irq_cfg *cfg;
4265	int index, devid;
4266
4267	cfg = irq_cfg(irq);
4268	if (!cfg)
4269		return -EINVAL;
4270
4271	irte_info = &cfg->irq_2_irte;
4272	devid     = get_hpet_devid(id);
4273	if (devid < 0)
4274		return devid;
4275
4276	index = alloc_irq_index(cfg, devid, 1);
4277	if (index < 0)
4278		return index;
4279
4280	cfg->remapped	      = 1;
4281	irte_info->devid      = devid;
4282	irte_info->index      = index;
4283
4284	return 0;
4285}
4286
4287struct irq_remap_ops amd_iommu_irq_ops = {
4288	.prepare		= amd_iommu_prepare,
4289	.enable			= amd_iommu_enable,
4290	.disable		= amd_iommu_disable,
4291	.reenable		= amd_iommu_reenable,
4292	.enable_faulting	= amd_iommu_enable_faulting,
4293	.setup_ioapic_entry	= setup_ioapic_entry,
4294	.set_affinity		= set_affinity,
4295	.free_irq		= free_irq,
4296	.compose_msi_msg	= compose_msi_msg,
4297	.msi_alloc_irq		= msi_alloc_irq,
4298	.msi_setup_irq		= msi_setup_irq,
4299	.alloc_hpet_msi		= alloc_hpet_msi,
4300};
4301#endif
4302