1/*
2 * Copyright © 2006-2014 Intel Corporation.
3 *
4 * This program is free software; you can redistribute it and/or modify it
5 * under the terms and conditions of the GNU General Public License,
6 * version 2, as published by the Free Software Foundation.
7 *
8 * This program is distributed in the hope it will be useful, but WITHOUT
9 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
10 * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
11 * more details.
12 *
13 * Authors: David Woodhouse <dwmw2@infradead.org>,
14 *          Ashok Raj <ashok.raj@intel.com>,
15 *          Shaohua Li <shaohua.li@intel.com>,
16 *          Anil S Keshavamurthy <anil.s.keshavamurthy@intel.com>,
17 *          Fenghua Yu <fenghua.yu@intel.com>
18 */
19
20#include <linux/init.h>
21#include <linux/bitmap.h>
22#include <linux/debugfs.h>
23#include <linux/export.h>
24#include <linux/slab.h>
25#include <linux/irq.h>
26#include <linux/interrupt.h>
27#include <linux/spinlock.h>
28#include <linux/pci.h>
29#include <linux/dmar.h>
30#include <linux/dma-mapping.h>
31#include <linux/mempool.h>
32#include <linux/memory.h>
33#include <linux/timer.h>
34#include <linux/iova.h>
35#include <linux/iommu.h>
36#include <linux/intel-iommu.h>
37#include <linux/syscore_ops.h>
38#include <linux/tboot.h>
39#include <linux/dmi.h>
40#include <linux/pci-ats.h>
41#include <linux/memblock.h>
42#include <linux/dma-contiguous.h>
43#include <asm/irq_remapping.h>
44#include <asm/cacheflush.h>
45#include <asm/iommu.h>
46
47#include "irq_remapping.h"
48
49#define ROOT_SIZE		VTD_PAGE_SIZE
50#define CONTEXT_SIZE		VTD_PAGE_SIZE
51
52#define IS_GFX_DEVICE(pdev) ((pdev->class >> 16) == PCI_BASE_CLASS_DISPLAY)
53#define IS_USB_DEVICE(pdev) ((pdev->class >> 8) == PCI_CLASS_SERIAL_USB)
54#define IS_ISA_DEVICE(pdev) ((pdev->class >> 8) == PCI_CLASS_BRIDGE_ISA)
55#define IS_AZALIA(pdev) ((pdev)->vendor == 0x8086 && (pdev)->device == 0x3a3e)
56
57#define IOAPIC_RANGE_START	(0xfee00000)
58#define IOAPIC_RANGE_END	(0xfeefffff)
59#define IOVA_START_ADDR		(0x1000)
60
61#define DEFAULT_DOMAIN_ADDRESS_WIDTH 48
62
63#define MAX_AGAW_WIDTH 64
64#define MAX_AGAW_PFN_WIDTH	(MAX_AGAW_WIDTH - VTD_PAGE_SHIFT)
65
66#define __DOMAIN_MAX_PFN(gaw)  ((((uint64_t)1) << (gaw-VTD_PAGE_SHIFT)) - 1)
67#define __DOMAIN_MAX_ADDR(gaw) ((((uint64_t)1) << gaw) - 1)
68
69/* We limit DOMAIN_MAX_PFN to fit in an unsigned long, and DOMAIN_MAX_ADDR
70   to match. That way, we can use 'unsigned long' for PFNs with impunity. */
71#define DOMAIN_MAX_PFN(gaw)	((unsigned long) min_t(uint64_t, \
72				__DOMAIN_MAX_PFN(gaw), (unsigned long)-1))
73#define DOMAIN_MAX_ADDR(gaw)	(((uint64_t)__DOMAIN_MAX_PFN(gaw)) << VTD_PAGE_SHIFT)
74
75/* IO virtual address start page frame number */
76#define IOVA_START_PFN		(1)
77
78#define IOVA_PFN(addr)		((addr) >> PAGE_SHIFT)
79#define DMA_32BIT_PFN		IOVA_PFN(DMA_BIT_MASK(32))
80#define DMA_64BIT_PFN		IOVA_PFN(DMA_BIT_MASK(64))
81
82/* page table handling */
83#define LEVEL_STRIDE		(9)
84#define LEVEL_MASK		(((u64)1 << LEVEL_STRIDE) - 1)
85
86/*
87 * This bitmap is used to advertise the page sizes our hardware supports
88 * to the IOMMU core, which will then use this information to split
89 * physically contiguous memory regions it is mapping into page sizes
90 * that we support.
91 *
92 * Traditionally the IOMMU core just handed us the mappings directly,
93 * after making sure the size is an order of a 4KiB page and that the
94 * mapping has natural alignment.
95 *
96 * To retain this behavior, we currently advertise that we support
97 * all page sizes that are an order of 4KiB.
98 *
99 * If at some point we'd like to utilize the IOMMU core's new behavior,
100 * we could change this to advertise the real page sizes we support.
101 */
102#define INTEL_IOMMU_PGSIZES	(~0xFFFUL)
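/*
 * Note: ~0xFFFUL leaves every bit from 12 upwards set, so this advertises
 * each power-of-two size that is a multiple of 4KiB (4KiB, 8KiB, 16KiB, ...)
 * as a supported page size.
 */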
103
104static inline int agaw_to_level(int agaw)
105{
106	return agaw + 2;
107}
108
109static inline int agaw_to_width(int agaw)
110{
111	return min_t(int, 30 + agaw * LEVEL_STRIDE, MAX_AGAW_WIDTH);
112}
113
114static inline int width_to_agaw(int width)
115{
116	return DIV_ROUND_UP(width - 30, LEVEL_STRIDE);
117}
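/*
 * Worked example with the default 48-bit address width:
 *   width_to_agaw(48) == 2	(48 == 30 + 2 * LEVEL_STRIDE)
 *   agaw_to_level(2)  == 4	(a 4-level page table)
 *   agaw_to_width(2)  == 48
 */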
118
119static inline unsigned int level_to_offset_bits(int level)
120{
121	return (level - 1) * LEVEL_STRIDE;
122}
123
124static inline int pfn_level_offset(unsigned long pfn, int level)
125{
126	return (pfn >> level_to_offset_bits(level)) & LEVEL_MASK;
127}
128
129static inline unsigned long level_mask(int level)
130{
131	return -1UL << level_to_offset_bits(level);
132}
133
134static inline unsigned long level_size(int level)
135{
136	return 1UL << level_to_offset_bits(level);
137}
138
139static inline unsigned long align_to_level(unsigned long pfn, int level)
140{
141	return (pfn + level_size(level) - 1) & level_mask(level);
142}
143
144static inline unsigned long lvl_to_nr_pages(unsigned int lvl)
145{
146	return  1 << min_t(int, (lvl - 1) * LEVEL_STRIDE, MAX_AGAW_PFN_WIDTH);
147}
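/*
 * Example: a level 1 PTE covers 1 page (4KiB), a level 2 PTE covers
 * 512 pages (2MiB) and a level 3 PTE covers 512 * 512 pages (1GiB).
 */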
148
149/* VT-d pages must never be larger than MM pages. Otherwise things
150   are never going to work. */
151static inline unsigned long dma_to_mm_pfn(unsigned long dma_pfn)
152{
153	return dma_pfn >> (PAGE_SHIFT - VTD_PAGE_SHIFT);
154}
155
156static inline unsigned long mm_to_dma_pfn(unsigned long mm_pfn)
157{
158	return mm_pfn << (PAGE_SHIFT - VTD_PAGE_SHIFT);
159}
160static inline unsigned long page_to_dma_pfn(struct page *pg)
161{
162	return mm_to_dma_pfn(page_to_pfn(pg));
163}
164static inline unsigned long virt_to_dma_pfn(void *p)
165{
166	return page_to_dma_pfn(virt_to_page(p));
167}
168
169/* global iommu list, set NULL for ignored DMAR units */
170static struct intel_iommu **g_iommus;
171
172static void __init check_tylersburg_isoch(void);
173static int rwbf_quirk;
174
175/*
176 * Set to 1 to panic the kernel if VT-d can't be enabled successfully
177 * (used when the kernel is launched with TXT)
178 */
179static int force_on = 0;
180
181/*
182 * 0: Present
183 * 1-11: Reserved
184 * 12-63: Context Ptr (12 - (haw-1))
185 * 64-127: Reserved
186 */
187struct root_entry {
188	u64	lo;
189	u64	hi;
190};
191#define ROOT_ENTRY_NR (VTD_PAGE_SIZE/sizeof(struct root_entry))
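/* With 4KiB pages and 16-byte entries this yields 256 root entries, one per bus. */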
192
193
194/*
195 * low 64 bits:
196 * 0: present
197 * 1: fault processing disable
198 * 2-3: translation type
199 * 12-63: address space root
200 * high 64 bits:
201 * 0-2: address width
202 * 3-6: avail
203 * 8-23: domain id
204 */
205struct context_entry {
206	u64 lo;
207	u64 hi;
208};
209
210static inline bool context_present(struct context_entry *context)
211{
212	return (context->lo & 1);
213}
214static inline void context_set_present(struct context_entry *context)
215{
216	context->lo |= 1;
217}
218
219static inline void context_set_fault_enable(struct context_entry *context)
220{
221	context->lo &= (((u64)-1) << 2) | 1;
222}
223
224static inline void context_set_translation_type(struct context_entry *context,
225						unsigned long value)
226{
227	context->lo &= (((u64)-1) << 4) | 3;
228	context->lo |= (value & 3) << 2;
229}
230
231static inline void context_set_address_root(struct context_entry *context,
232					    unsigned long value)
233{
234	context->lo &= ~VTD_PAGE_MASK;
235	context->lo |= value & VTD_PAGE_MASK;
236}
237
238static inline void context_set_address_width(struct context_entry *context,
239					     unsigned long value)
240{
241	context->hi |= value & 7;
242}
243
244static inline void context_set_domain_id(struct context_entry *context,
245					 unsigned long value)
246{
247	context->hi |= (value & ((1 << 16) - 1)) << 8;
248}
249
250static inline void context_clear_entry(struct context_entry *context)
251{
252	context->lo = 0;
253	context->hi = 0;
254}
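/*
 * A context entry is normally built up with the helpers above. A sketch of
 * the usual sequence (see domain_context_mapping_one() below):
 *
 *	context_set_domain_id(context, id);
 *	context_set_address_root(context, virt_to_phys(pgd));
 *	context_set_address_width(context, iommu->agaw);
 *	context_set_translation_type(context, CONTEXT_TT_MULTI_LEVEL);
 *	context_set_fault_enable(context);
 *	context_set_present(context);
 */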
255
256/*
257 * 0: readable
258 * 1: writable
259 * 2-6: reserved
260 * 7: super page
261 * 8-10: available
262 * 11: snoop behavior
263 * 12-63: Host physical address
264 */
265struct dma_pte {
266	u64 val;
267};
268
269static inline void dma_clear_pte(struct dma_pte *pte)
270{
271	pte->val = 0;
272}
273
274static inline u64 dma_pte_addr(struct dma_pte *pte)
275{
276#ifdef CONFIG_64BIT
277	return pte->val & VTD_PAGE_MASK;
278#else
279	/* Must have a full atomic 64-bit read */
280	return  __cmpxchg64(&pte->val, 0ULL, 0ULL) & VTD_PAGE_MASK;
281#endif
282}
283
284static inline bool dma_pte_present(struct dma_pte *pte)
285{
286	return (pte->val & 3) != 0;
287}
288
289static inline bool dma_pte_superpage(struct dma_pte *pte)
290{
291	return (pte->val & DMA_PTE_LARGE_PAGE);
292}
293
294static inline int first_pte_in_page(struct dma_pte *pte)
295{
296	return !((unsigned long)pte & ~VTD_PAGE_MASK);
297}
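/*
 * A page-table page holds VTD_PAGE_SIZE / sizeof(struct dma_pte) == 512
 * entries, so a PTE pointer whose low 12 bits are clear is the first PTE
 * of its table; the walkers below use this to detect table boundaries.
 */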
298
299/*
300 * This domain is a static identity mapping domain.
301 *	1. This domain creates a static 1:1 mapping to all usable memory.
302 * 	2. It maps to each iommu if successful.
303 *	3. Each iommu maps to this domain if successful.
304 */
305static struct dmar_domain *si_domain;
306static int hw_pass_through = 1;
307
308/* Domain represents a virtual machine: more than one device
309 * across iommus may be owned by one domain, e.g. a kvm guest.
310 */
311#define DOMAIN_FLAG_VIRTUAL_MACHINE	(1 << 0)
312
313/* si_domain contains multiple devices */
314#define DOMAIN_FLAG_STATIC_IDENTITY	(1 << 1)
315
316struct dmar_domain {
317	int	id;			/* domain id */
318	int	nid;			/* node id */
319	DECLARE_BITMAP(iommu_bmp, DMAR_UNITS_SUPPORTED);
320					/* bitmap of iommus this domain uses */
321
322	struct list_head devices;	/* all devices' list */
323	struct iova_domain iovad;	/* iova's that belong to this domain */
324
325	struct dma_pte	*pgd;		/* virtual address */
326	int		gaw;		/* max guest address width */
327
328	/* adjusted guest address width, 0 is level 2 30-bit */
329	int		agaw;
330
331	int		flags;		/* flags to find out type of domain */
332
333	int		iommu_coherency;/* indicate coherency of iommu access */
334	int		iommu_snooping; /* indicate snooping control feature */
335	int		iommu_count;	/* reference count of iommu */
336	int		iommu_superpage;/* Level of superpages supported:
337					   0 == 4KiB (no superpages), 1 == 2MiB,
338					   2 == 1GiB, 3 == 512GiB, 4 == 1TiB */
339	spinlock_t	iommu_lock;	/* protect iommu set in domain */
340	u64		max_addr;	/* maximum mapped address */
341
342	struct iommu_domain domain;	/* generic domain data structure for
343					   iommu core */
344};
345
346/* PCI domain-device relationship */
347struct device_domain_info {
348	struct list_head link;	/* link to domain siblings */
349	struct list_head global; /* link to global list */
350	u8 bus;			/* PCI bus number */
351	u8 devfn;		/* PCI devfn number */
352	struct device *dev; /* it's NULL for PCIe-to-PCI bridge */
353	struct intel_iommu *iommu; /* IOMMU used by this device */
354	struct dmar_domain *domain; /* pointer to domain */
355};
356
357struct dmar_rmrr_unit {
358	struct list_head list;		/* list of rmrr units	*/
359	struct acpi_dmar_header *hdr;	/* ACPI header		*/
360	u64	base_address;		/* reserved base address*/
361	u64	end_address;		/* reserved end address */
362	struct dmar_dev_scope *devices;	/* target devices */
363	int	devices_cnt;		/* target device count */
364};
365
366struct dmar_atsr_unit {
367	struct list_head list;		/* list of ATSR units */
368	struct acpi_dmar_header *hdr;	/* ACPI header */
369	struct dmar_dev_scope *devices;	/* target devices */
370	int devices_cnt;		/* target device count */
371	u8 include_all:1;		/* include all ports */
372};
373
374static LIST_HEAD(dmar_atsr_units);
375static LIST_HEAD(dmar_rmrr_units);
376
377#define for_each_rmrr_units(rmrr) \
378	list_for_each_entry(rmrr, &dmar_rmrr_units, list)
379
380static void flush_unmaps_timeout(unsigned long data);
381
382static DEFINE_TIMER(unmap_timer,  flush_unmaps_timeout, 0, 0);
383
384#define HIGH_WATER_MARK 250
385struct deferred_flush_tables {
386	int next;
387	struct iova *iova[HIGH_WATER_MARK];
388	struct dmar_domain *domain[HIGH_WATER_MARK];
389	struct page *freelist[HIGH_WATER_MARK];
390};
391
392static struct deferred_flush_tables *deferred_flush;
393
394/* number of IOMMUs in the system, used to size and index g_iommus[] */
395static int g_num_of_iommus;
396
397static DEFINE_SPINLOCK(async_umap_flush_lock);
398static LIST_HEAD(unmaps_to_do);
399
400static int timer_on;
401static long list_size;
402
403static void domain_exit(struct dmar_domain *domain);
404static void domain_remove_dev_info(struct dmar_domain *domain);
405static void domain_remove_one_dev_info(struct dmar_domain *domain,
406				       struct device *dev);
407static void iommu_detach_dependent_devices(struct intel_iommu *iommu,
408					   struct device *dev);
409static int domain_detach_iommu(struct dmar_domain *domain,
410			       struct intel_iommu *iommu);
411
412#ifdef CONFIG_INTEL_IOMMU_DEFAULT_ON
413int dmar_disabled = 0;
414#else
415int dmar_disabled = 1;
416#endif /*CONFIG_INTEL_IOMMU_DEFAULT_ON*/
417
418int intel_iommu_enabled = 0;
419EXPORT_SYMBOL_GPL(intel_iommu_enabled);
420
421static int dmar_map_gfx = 1;
422static int dmar_forcedac;
423static int intel_iommu_strict;
424static int intel_iommu_superpage = 1;
425static int intel_iommu_ecs = 1;
426
427/* We only actually use ECS when PASID support (on the new bit 40)
428 * is also advertised. Some early implementations — the ones with
429 * PASID support on bit 28 — have issues even when we *only* use
430 * extended root/context tables. */
431#define ecs_enabled(iommu) (intel_iommu_ecs && ecap_ecs(iommu->ecap) && \
432			    ecap_pasid(iommu->ecap))
433
434int intel_iommu_gfx_mapped;
435EXPORT_SYMBOL_GPL(intel_iommu_gfx_mapped);
436
437#define DUMMY_DEVICE_DOMAIN_INFO ((struct device_domain_info *)(-1))
438static DEFINE_SPINLOCK(device_domain_lock);
439static LIST_HEAD(device_domain_list);
440
441static const struct iommu_ops intel_iommu_ops;
442
443/* Convert a generic 'struct iommu_domain' to our private 'struct dmar_domain' */
444static struct dmar_domain *to_dmar_domain(struct iommu_domain *dom)
445{
446	return container_of(dom, struct dmar_domain, domain);
447}
448
449static int __init intel_iommu_setup(char *str)
450{
451	if (!str)
452		return -EINVAL;
453	while (*str) {
454		if (!strncmp(str, "on", 2)) {
455			dmar_disabled = 0;
456			printk(KERN_INFO "Intel-IOMMU: enabled\n");
457		} else if (!strncmp(str, "off", 3)) {
458			dmar_disabled = 1;
459			printk(KERN_INFO "Intel-IOMMU: disabled\n");
460		} else if (!strncmp(str, "igfx_off", 8)) {
461			dmar_map_gfx = 0;
462			printk(KERN_INFO
463				"Intel-IOMMU: disable GFX device mapping\n");
464		} else if (!strncmp(str, "forcedac", 8)) {
465			printk(KERN_INFO
466				"Intel-IOMMU: Forcing DAC for PCI devices\n");
467			dmar_forcedac = 1;
468		} else if (!strncmp(str, "strict", 6)) {
469			printk(KERN_INFO
470				"Intel-IOMMU: disable batched IOTLB flush\n");
471			intel_iommu_strict = 1;
472		} else if (!strncmp(str, "sp_off", 6)) {
473			printk(KERN_INFO
474				"Intel-IOMMU: disable supported super page\n");
475			intel_iommu_superpage = 0;
476		} else if (!strncmp(str, "ecs_off", 7)) {
477			printk(KERN_INFO
478				"Intel-IOMMU: disable extended context table support\n");
479			intel_iommu_ecs = 0;
480		}
481
482		str += strcspn(str, ",");
483		while (*str == ',')
484			str++;
485	}
486	return 0;
487}
488__setup("intel_iommu=", intel_iommu_setup);
489
490static struct kmem_cache *iommu_domain_cache;
491static struct kmem_cache *iommu_devinfo_cache;
492
493static inline void *alloc_pgtable_page(int node)
494{
495	struct page *page;
496	void *vaddr = NULL;
497
498	page = alloc_pages_node(node, GFP_ATOMIC | __GFP_ZERO, 0);
499	if (page)
500		vaddr = page_address(page);
501	return vaddr;
502}
503
504static inline void free_pgtable_page(void *vaddr)
505{
506	free_page((unsigned long)vaddr);
507}
508
509static inline void *alloc_domain_mem(void)
510{
511	return kmem_cache_alloc(iommu_domain_cache, GFP_ATOMIC);
512}
513
514static void free_domain_mem(void *vaddr)
515{
516	kmem_cache_free(iommu_domain_cache, vaddr);
517}
518
519static inline void *alloc_devinfo_mem(void)
520{
521	return kmem_cache_alloc(iommu_devinfo_cache, GFP_ATOMIC);
522}
523
524static inline void free_devinfo_mem(void *vaddr)
525{
526	kmem_cache_free(iommu_devinfo_cache, vaddr);
527}
528
529static inline int domain_type_is_vm(struct dmar_domain *domain)
530{
531	return domain->flags & DOMAIN_FLAG_VIRTUAL_MACHINE;
532}
533
534static inline int domain_type_is_vm_or_si(struct dmar_domain *domain)
535{
536	return domain->flags & (DOMAIN_FLAG_VIRTUAL_MACHINE |
537				DOMAIN_FLAG_STATIC_IDENTITY);
538}
539
540static inline int domain_pfn_supported(struct dmar_domain *domain,
541				       unsigned long pfn)
542{
543	int addr_width = agaw_to_width(domain->agaw) - VTD_PAGE_SHIFT;
544
545	return !(addr_width < BITS_PER_LONG && pfn >> addr_width);
546}
547
548static int __iommu_calculate_agaw(struct intel_iommu *iommu, int max_gaw)
549{
550	unsigned long sagaw;
551	int agaw = -1;
552
553	sagaw = cap_sagaw(iommu->cap);
554	for (agaw = width_to_agaw(max_gaw);
555	     agaw >= 0; agaw--) {
556		if (test_bit(agaw, &sagaw))
557			break;
558	}
559
560	return agaw;
561}
562
563/*
564 * Calculate max SAGAW for each iommu.
565 */
566int iommu_calculate_max_sagaw(struct intel_iommu *iommu)
567{
568	return __iommu_calculate_agaw(iommu, MAX_AGAW_WIDTH);
569}
570
571/*
572 * Calculate the agaw for each iommu.
573 * "SAGAW" may be different across iommus, so use a default agaw and
574 * fall back to a smaller supported agaw for iommus that don't support it.
575 */
576int iommu_calculate_agaw(struct intel_iommu *iommu)
577{
578	return __iommu_calculate_agaw(iommu, DEFAULT_DOMAIN_ADDRESS_WIDTH);
579}
580
581/* This function only returns a single iommu in a domain */
582static struct intel_iommu *domain_get_iommu(struct dmar_domain *domain)
583{
584	int iommu_id;
585
586	/* si_domain and vm domain should not get here. */
587	BUG_ON(domain_type_is_vm_or_si(domain));
588	iommu_id = find_first_bit(domain->iommu_bmp, g_num_of_iommus);
589	if (iommu_id < 0 || iommu_id >= g_num_of_iommus)
590		return NULL;
591
592	return g_iommus[iommu_id];
593}
594
595static void domain_update_iommu_coherency(struct dmar_domain *domain)
596{
597	struct dmar_drhd_unit *drhd;
598	struct intel_iommu *iommu;
599	bool found = false;
600	int i;
601
602	domain->iommu_coherency = 1;
603
604	for_each_set_bit(i, domain->iommu_bmp, g_num_of_iommus) {
605		found = true;
606		if (!ecap_coherent(g_iommus[i]->ecap)) {
607			domain->iommu_coherency = 0;
608			break;
609		}
610	}
611	if (found)
612		return;
613
614	/* No hardware attached; use lowest common denominator */
615	rcu_read_lock();
616	for_each_active_iommu(iommu, drhd) {
617		if (!ecap_coherent(iommu->ecap)) {
618			domain->iommu_coherency = 0;
619			break;
620		}
621	}
622	rcu_read_unlock();
623}
624
625static int domain_update_iommu_snooping(struct intel_iommu *skip)
626{
627	struct dmar_drhd_unit *drhd;
628	struct intel_iommu *iommu;
629	int ret = 1;
630
631	rcu_read_lock();
632	for_each_active_iommu(iommu, drhd) {
633		if (iommu != skip) {
634			if (!ecap_sc_support(iommu->ecap)) {
635				ret = 0;
636				break;
637			}
638		}
639	}
640	rcu_read_unlock();
641
642	return ret;
643}
644
645static int domain_update_iommu_superpage(struct intel_iommu *skip)
646{
647	struct dmar_drhd_unit *drhd;
648	struct intel_iommu *iommu;
649	int mask = 0xf;
650
651	if (!intel_iommu_superpage) {
652		return 0;
653	}
654
655	/* set iommu_superpage to the smallest common denominator */
656	rcu_read_lock();
657	for_each_active_iommu(iommu, drhd) {
658		if (iommu != skip) {
659			mask &= cap_super_page_val(iommu->cap);
660			if (!mask)
661				break;
662		}
663	}
664	rcu_read_unlock();
665
666	return fls(mask);
667}
668
669/* Some capabilities may be different across iommus */
670static void domain_update_iommu_cap(struct dmar_domain *domain)
671{
672	domain_update_iommu_coherency(domain);
673	domain->iommu_snooping = domain_update_iommu_snooping(NULL);
674	domain->iommu_superpage = domain_update_iommu_superpage(NULL);
675}
676
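/*
 * Return the context entry for (bus, devfn), allocating the context table
 * on demand if 'alloc' is set. With extended context support (ECS) the
 * root entry is split in two: the low 64 bits point to the context table
 * for devfn 0x00-0x7f and the high 64 bits to the one for devfn 0x80-0xff.
 * The devfn is doubled because extended context entries are twice the size
 * of legacy ones; only the low half of an extended entry is touched here.
 */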
677static inline struct context_entry *iommu_context_addr(struct intel_iommu *iommu,
678						       u8 bus, u8 devfn, int alloc)
679{
680	struct root_entry *root = &iommu->root_entry[bus];
681	struct context_entry *context;
682	u64 *entry;
683
684	entry = &root->lo;
685	if (ecs_enabled(iommu)) {
686		if (devfn >= 0x80) {
687			devfn -= 0x80;
688			entry = &root->hi;
689		}
690		devfn *= 2;
691	}
692	if (*entry & 1)
693		context = phys_to_virt(*entry & VTD_PAGE_MASK);
694	else {
695		unsigned long phy_addr;
696		if (!alloc)
697			return NULL;
698
699		context = alloc_pgtable_page(iommu->node);
700		if (!context)
701			return NULL;
702
703		__iommu_flush_cache(iommu, (void *)context, CONTEXT_SIZE);
704		phy_addr = virt_to_phys((void *)context);
705		*entry = phy_addr | 1;
706		__iommu_flush_cache(iommu, entry, sizeof(*entry));
707	}
708	return &context[devfn];
709}
710
711static int iommu_dummy(struct device *dev)
712{
713	return dev->archdata.iommu == DUMMY_DEVICE_DOMAIN_INFO;
714}
715
716static struct intel_iommu *device_to_iommu(struct device *dev, u8 *bus, u8 *devfn)
717{
718	struct dmar_drhd_unit *drhd = NULL;
719	struct intel_iommu *iommu;
720	struct device *tmp;
721	struct pci_dev *ptmp, *pdev = NULL;
722	u16 segment = 0;
723	int i;
724
725	if (iommu_dummy(dev))
726		return NULL;
727
728	if (dev_is_pci(dev)) {
729		pdev = to_pci_dev(dev);
730		segment = pci_domain_nr(pdev->bus);
731	} else if (has_acpi_companion(dev))
732		dev = &ACPI_COMPANION(dev)->dev;
733
734	rcu_read_lock();
735	for_each_active_iommu(iommu, drhd) {
736		if (pdev && segment != drhd->segment)
737			continue;
738
739		for_each_active_dev_scope(drhd->devices,
740					  drhd->devices_cnt, i, tmp) {
741			if (tmp == dev) {
742				*bus = drhd->devices[i].bus;
743				*devfn = drhd->devices[i].devfn;
744				goto out;
745			}
746
747			if (!pdev || !dev_is_pci(tmp))
748				continue;
749
750			ptmp = to_pci_dev(tmp);
751			if (ptmp->subordinate &&
752			    ptmp->subordinate->number <= pdev->bus->number &&
753			    ptmp->subordinate->busn_res.end >= pdev->bus->number)
754				goto got_pdev;
755		}
756
757		if (pdev && drhd->include_all) {
758		got_pdev:
759			*bus = pdev->bus->number;
760			*devfn = pdev->devfn;
761			goto out;
762		}
763	}
764	iommu = NULL;
765 out:
766	rcu_read_unlock();
767
768	return iommu;
769}
770
771static void domain_flush_cache(struct dmar_domain *domain,
772			       void *addr, int size)
773{
774	if (!domain->iommu_coherency)
775		clflush_cache_range(addr, size);
776}
777
778static int device_context_mapped(struct intel_iommu *iommu, u8 bus, u8 devfn)
779{
780	struct context_entry *context;
781	int ret = 0;
782	unsigned long flags;
783
784	spin_lock_irqsave(&iommu->lock, flags);
785	context = iommu_context_addr(iommu, bus, devfn, 0);
786	if (context)
787		ret = context_present(context);
788	spin_unlock_irqrestore(&iommu->lock, flags);
789	return ret;
790}
791
792static void clear_context_table(struct intel_iommu *iommu, u8 bus, u8 devfn)
793{
794	struct context_entry *context;
795	unsigned long flags;
796
797	spin_lock_irqsave(&iommu->lock, flags);
798	context = iommu_context_addr(iommu, bus, devfn, 0);
799	if (context) {
800		context_clear_entry(context);
801		__iommu_flush_cache(iommu, context, sizeof(*context));
802	}
803	spin_unlock_irqrestore(&iommu->lock, flags);
804}
805
806static void free_context_table(struct intel_iommu *iommu)
807{
808	int i;
809	unsigned long flags;
810	struct context_entry *context;
811
812	spin_lock_irqsave(&iommu->lock, flags);
813	if (!iommu->root_entry) {
814		goto out;
815	}
816	for (i = 0; i < ROOT_ENTRY_NR; i++) {
817		context = iommu_context_addr(iommu, i, 0, 0);
818		if (context)
819			free_pgtable_page(context);
820
821		if (!ecs_enabled(iommu))
822			continue;
823
824		context = iommu_context_addr(iommu, i, 0x80, 0);
825		if (context)
826			free_pgtable_page(context);
827
828	}
829	free_pgtable_page(iommu->root_entry);
830	iommu->root_entry = NULL;
831out:
832	spin_unlock_irqrestore(&iommu->lock, flags);
833}
834
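/*
 * Walk the page table to the PTE covering 'pfn'. When looking for a
 * specific level (*target_level > 0), intermediate page-table pages are
 * allocated as needed and the walk descends to exactly that level. When
 * *target_level is 0, nothing is allocated: the walk stops at the first
 * superpage or non-present entry (or at level 1) and the level reached
 * is reported back through *target_level.
 */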
835static struct dma_pte *pfn_to_dma_pte(struct dmar_domain *domain,
836				      unsigned long pfn, int *target_level)
837{
838	struct dma_pte *parent, *pte = NULL;
839	int level = agaw_to_level(domain->agaw);
840	int offset;
841
842	BUG_ON(!domain->pgd);
843
844	if (!domain_pfn_supported(domain, pfn))
845		/* Address beyond IOMMU's addressing capabilities. */
846		return NULL;
847
848	parent = domain->pgd;
849
850	while (1) {
851		void *tmp_page;
852
853		offset = pfn_level_offset(pfn, level);
854		pte = &parent[offset];
855		if (!*target_level && (dma_pte_superpage(pte) || !dma_pte_present(pte)))
856			break;
857		if (level == *target_level)
858			break;
859
860		if (!dma_pte_present(pte)) {
861			uint64_t pteval;
862
863			tmp_page = alloc_pgtable_page(domain->nid);
864
865			if (!tmp_page)
866				return NULL;
867
868			domain_flush_cache(domain, tmp_page, VTD_PAGE_SIZE);
869			pteval = ((uint64_t)virt_to_dma_pfn(tmp_page) << VTD_PAGE_SHIFT) | DMA_PTE_READ | DMA_PTE_WRITE;
870			if (cmpxchg64(&pte->val, 0ULL, pteval))
871				/* Someone else set it while we were thinking; use theirs. */
872				free_pgtable_page(tmp_page);
873			else
874				domain_flush_cache(domain, pte, sizeof(*pte));
875		}
876		if (level == 1)
877			break;
878
879		parent = phys_to_virt(dma_pte_addr(pte));
880		level--;
881	}
882
883	if (!*target_level)
884		*target_level = level;
885
886	return pte;
887}
888
889
890/* return address's pte at specific level */
891static struct dma_pte *dma_pfn_level_pte(struct dmar_domain *domain,
892					 unsigned long pfn,
893					 int level, int *large_page)
894{
895	struct dma_pte *parent, *pte = NULL;
896	int total = agaw_to_level(domain->agaw);
897	int offset;
898
899	parent = domain->pgd;
900	while (level <= total) {
901		offset = pfn_level_offset(pfn, total);
902		pte = &parent[offset];
903		if (level == total)
904			return pte;
905
906		if (!dma_pte_present(pte)) {
907			*large_page = total;
908			break;
909		}
910
911		if (dma_pte_superpage(pte)) {
912			*large_page = total;
913			return pte;
914		}
915
916		parent = phys_to_virt(dma_pte_addr(pte));
917		total--;
918	}
919	return NULL;
920}
921
922/* clear last level ptes; a tlb flush should follow */
923static void dma_pte_clear_range(struct dmar_domain *domain,
924				unsigned long start_pfn,
925				unsigned long last_pfn)
926{
927	unsigned int large_page = 1;
928	struct dma_pte *first_pte, *pte;
929
930	BUG_ON(!domain_pfn_supported(domain, start_pfn));
931	BUG_ON(!domain_pfn_supported(domain, last_pfn));
932	BUG_ON(start_pfn > last_pfn);
933
934	/* we don't need lock here; nobody else touches the iova range */
935	do {
936		large_page = 1;
937		first_pte = pte = dma_pfn_level_pte(domain, start_pfn, 1, &large_page);
938		if (!pte) {
939			start_pfn = align_to_level(start_pfn + 1, large_page + 1);
940			continue;
941		}
942		do {
943			dma_clear_pte(pte);
944			start_pfn += lvl_to_nr_pages(large_page);
945			pte++;
946		} while (start_pfn <= last_pfn && !first_pte_in_page(pte));
947
948		domain_flush_cache(domain, first_pte,
949				   (void *)pte - (void *)first_pte);
950
951	} while (start_pfn && start_pfn <= last_pfn);
952}
953
954static void dma_pte_free_level(struct dmar_domain *domain, int level,
955			       struct dma_pte *pte, unsigned long pfn,
956			       unsigned long start_pfn, unsigned long last_pfn)
957{
958	pfn = max(start_pfn, pfn);
959	pte = &pte[pfn_level_offset(pfn, level)];
960
961	do {
962		unsigned long level_pfn;
963		struct dma_pte *level_pte;
964
965		if (!dma_pte_present(pte) || dma_pte_superpage(pte))
966			goto next;
967
968		level_pfn = pfn & level_mask(level - 1);
969		level_pte = phys_to_virt(dma_pte_addr(pte));
970
971		if (level > 2)
972			dma_pte_free_level(domain, level - 1, level_pte,
973					   level_pfn, start_pfn, last_pfn);
974
975		/* If range covers entire pagetable, free it */
976		if (!(start_pfn > level_pfn ||
977		      last_pfn < level_pfn + level_size(level) - 1)) {
978			dma_clear_pte(pte);
979			domain_flush_cache(domain, pte, sizeof(*pte));
980			free_pgtable_page(level_pte);
981		}
982next:
983		pfn += level_size(level);
984	} while (!first_pte_in_page(++pte) && pfn <= last_pfn);
985}
986
987/* free page table pages. last level pte should already be cleared */
988static void dma_pte_free_pagetable(struct dmar_domain *domain,
989				   unsigned long start_pfn,
990				   unsigned long last_pfn)
991{
992	BUG_ON(!domain_pfn_supported(domain, start_pfn));
993	BUG_ON(!domain_pfn_supported(domain, last_pfn));
994	BUG_ON(start_pfn > last_pfn);
995
996	dma_pte_clear_range(domain, start_pfn, last_pfn);
997
998	/* We don't need lock here; nobody else touches the iova range */
999	dma_pte_free_level(domain, agaw_to_level(domain->agaw),
1000			   domain->pgd, 0, start_pfn, last_pfn);
1001
1002	/* free pgd */
1003	if (start_pfn == 0 && last_pfn == DOMAIN_MAX_PFN(domain->gaw)) {
1004		free_pgtable_page(domain->pgd);
1005		domain->pgd = NULL;
1006	}
1007}
1008
1009/* When a page at a given level is being unlinked from its parent, we don't
1010   need to *modify* it at all. All we need to do is make a list of all the
1011   pages which can be freed just as soon as we've flushed the IOTLB and we
1012   know the hardware page-walk will no longer touch them.
1013   The 'pte' argument is the *parent* PTE, pointing to the page that is to
1014   be freed. */
1015static struct page *dma_pte_list_pagetables(struct dmar_domain *domain,
1016					    int level, struct dma_pte *pte,
1017					    struct page *freelist)
1018{
1019	struct page *pg;
1020
1021	pg = pfn_to_page(dma_pte_addr(pte) >> PAGE_SHIFT);
1022	pg->freelist = freelist;
1023	freelist = pg;
1024
1025	if (level == 1)
1026		return freelist;
1027
1028	pte = page_address(pg);
1029	do {
1030		if (dma_pte_present(pte) && !dma_pte_superpage(pte))
1031			freelist = dma_pte_list_pagetables(domain, level - 1,
1032							   pte, freelist);
1033		pte++;
1034	} while (!first_pte_in_page(pte));
1035
1036	return freelist;
1037}
1038
1039static struct page *dma_pte_clear_level(struct dmar_domain *domain, int level,
1040					struct dma_pte *pte, unsigned long pfn,
1041					unsigned long start_pfn,
1042					unsigned long last_pfn,
1043					struct page *freelist)
1044{
1045	struct dma_pte *first_pte = NULL, *last_pte = NULL;
1046
1047	pfn = max(start_pfn, pfn);
1048	pte = &pte[pfn_level_offset(pfn, level)];
1049
1050	do {
1051		unsigned long level_pfn;
1052
1053		if (!dma_pte_present(pte))
1054			goto next;
1055
1056		level_pfn = pfn & level_mask(level);
1057
1058		/* If range covers entire pagetable, free it */
1059		if (start_pfn <= level_pfn &&
1060		    last_pfn >= level_pfn + level_size(level) - 1) {
1061			/* These subordinate page tables are going away entirely. Don't
1062			   bother to clear them; we're just going to *free* them. */
1063			if (level > 1 && !dma_pte_superpage(pte))
1064				freelist = dma_pte_list_pagetables(domain, level - 1, pte, freelist);
1065
1066			dma_clear_pte(pte);
1067			if (!first_pte)
1068				first_pte = pte;
1069			last_pte = pte;
1070		} else if (level > 1) {
1071			/* Recurse down into a level that isn't *entirely* obsolete */
1072			freelist = dma_pte_clear_level(domain, level - 1,
1073						       phys_to_virt(dma_pte_addr(pte)),
1074						       level_pfn, start_pfn, last_pfn,
1075						       freelist);
1076		}
1077next:
1078		pfn += level_size(level);
1079	} while (!first_pte_in_page(++pte) && pfn <= last_pfn);
1080
1081	if (first_pte)
1082		domain_flush_cache(domain, first_pte,
1083				   (void *)++last_pte - (void *)first_pte);
1084
1085	return freelist;
1086}
1087
1088/* We can't just free the pages because the IOMMU may still be walking
1089   the page tables, and may have cached the intermediate levels. The
1090   pages can only be freed after the IOTLB flush has been done. */
1091struct page *domain_unmap(struct dmar_domain *domain,
1092			  unsigned long start_pfn,
1093			  unsigned long last_pfn)
1094{
1095	struct page *freelist = NULL;
1096
1097	BUG_ON(!domain_pfn_supported(domain, start_pfn));
1098	BUG_ON(!domain_pfn_supported(domain, last_pfn));
1099	BUG_ON(start_pfn > last_pfn);
1100
1101	/* we don't need lock here; nobody else touches the iova range */
1102	freelist = dma_pte_clear_level(domain, agaw_to_level(domain->agaw),
1103				       domain->pgd, 0, start_pfn, last_pfn, NULL);
1104
1105	/* free pgd */
1106	if (start_pfn == 0 && last_pfn == DOMAIN_MAX_PFN(domain->gaw)) {
1107		struct page *pgd_page = virt_to_page(domain->pgd);
1108		pgd_page->freelist = freelist;
1109		freelist = pgd_page;
1110
1111		domain->pgd = NULL;
1112	}
1113
1114	return freelist;
1115}
1116
1117void dma_free_pagelist(struct page *freelist)
1118{
1119	struct page *pg;
1120
1121	while ((pg = freelist)) {
1122		freelist = pg->freelist;
1123		free_pgtable_page(page_address(pg));
1124	}
1125}
1126
1127/* iommu handling */
1128static int iommu_alloc_root_entry(struct intel_iommu *iommu)
1129{
1130	struct root_entry *root;
1131	unsigned long flags;
1132
1133	root = (struct root_entry *)alloc_pgtable_page(iommu->node);
1134	if (!root) {
1135		pr_err("IOMMU: allocating root entry for %s failed\n",
1136			iommu->name);
1137		return -ENOMEM;
1138	}
1139
1140	__iommu_flush_cache(iommu, root, ROOT_SIZE);
1141
1142	spin_lock_irqsave(&iommu->lock, flags);
1143	iommu->root_entry = root;
1144	spin_unlock_irqrestore(&iommu->lock, flags);
1145
1146	return 0;
1147}
1148
1149static void iommu_set_root_entry(struct intel_iommu *iommu)
1150{
1151	u64 addr;
1152	u32 sts;
1153	unsigned long flag;
1154
1155	addr = virt_to_phys(iommu->root_entry);
1156	if (ecs_enabled(iommu))
1157		addr |= DMA_RTADDR_RTT;
1158
1159	raw_spin_lock_irqsave(&iommu->register_lock, flag);
1160	dmar_writeq(iommu->reg + DMAR_RTADDR_REG, addr);
1161
1162	writel(iommu->gcmd | DMA_GCMD_SRTP, iommu->reg + DMAR_GCMD_REG);
1163
1164	/* Make sure hardware completes it */
1165	IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG,
1166		      readl, (sts & DMA_GSTS_RTPS), sts);
1167
1168	raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
1169}
1170
1171static void iommu_flush_write_buffer(struct intel_iommu *iommu)
1172{
1173	u32 val;
1174	unsigned long flag;
1175
1176	if (!rwbf_quirk && !cap_rwbf(iommu->cap))
1177		return;
1178
1179	raw_spin_lock_irqsave(&iommu->register_lock, flag);
1180	writel(iommu->gcmd | DMA_GCMD_WBF, iommu->reg + DMAR_GCMD_REG);
1181
1182	/* Make sure hardware completes it */
1183	IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG,
1184		      readl, (!(val & DMA_GSTS_WBFS)), val);
1185
1186	raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
1187}
1188
1189/* The return value determines whether we need a write buffer flush */
1190static void __iommu_flush_context(struct intel_iommu *iommu,
1191				  u16 did, u16 source_id, u8 function_mask,
1192				  u64 type)
1193{
1194	u64 val = 0;
1195	unsigned long flag;
1196
1197	switch (type) {
1198	case DMA_CCMD_GLOBAL_INVL:
1199		val = DMA_CCMD_GLOBAL_INVL;
1200		break;
1201	case DMA_CCMD_DOMAIN_INVL:
1202		val = DMA_CCMD_DOMAIN_INVL|DMA_CCMD_DID(did);
1203		break;
1204	case DMA_CCMD_DEVICE_INVL:
1205		val = DMA_CCMD_DEVICE_INVL|DMA_CCMD_DID(did)
1206			| DMA_CCMD_SID(source_id) | DMA_CCMD_FM(function_mask);
1207		break;
1208	default:
1209		BUG();
1210	}
1211	val |= DMA_CCMD_ICC;
1212
1213	raw_spin_lock_irqsave(&iommu->register_lock, flag);
1214	dmar_writeq(iommu->reg + DMAR_CCMD_REG, val);
1215
1216	/* Make sure hardware completes it */
1217	IOMMU_WAIT_OP(iommu, DMAR_CCMD_REG,
1218		dmar_readq, (!(val & DMA_CCMD_ICC)), val);
1219
1220	raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
1221}
1222
1223/* The return value determines whether we need a write buffer flush */
1224static void __iommu_flush_iotlb(struct intel_iommu *iommu, u16 did,
1225				u64 addr, unsigned int size_order, u64 type)
1226{
1227	int tlb_offset = ecap_iotlb_offset(iommu->ecap);
1228	u64 val = 0, val_iva = 0;
1229	unsigned long flag;
1230
1231	switch (type) {
1232	case DMA_TLB_GLOBAL_FLUSH:
1233		/* global flush doesn't need set IVA_REG */
1234		val = DMA_TLB_GLOBAL_FLUSH|DMA_TLB_IVT;
1235		break;
1236	case DMA_TLB_DSI_FLUSH:
1237		val = DMA_TLB_DSI_FLUSH|DMA_TLB_IVT|DMA_TLB_DID(did);
1238		break;
1239	case DMA_TLB_PSI_FLUSH:
1240		val = DMA_TLB_PSI_FLUSH|DMA_TLB_IVT|DMA_TLB_DID(did);
1241		/* IH bit is passed in as part of address */
1242		val_iva = size_order | addr;
1243		break;
1244	default:
1245		BUG();
1246	}
1247	/* Note: set drain read/write */
1248#if 0
1249	/*
1250	 * This is probably just to be extra safe. Looks like we can
1251	 * ignore it without any impact.
1252	 */
1253	if (cap_read_drain(iommu->cap))
1254		val |= DMA_TLB_READ_DRAIN;
1255#endif
1256	if (cap_write_drain(iommu->cap))
1257		val |= DMA_TLB_WRITE_DRAIN;
1258
1259	raw_spin_lock_irqsave(&iommu->register_lock, flag);
1260	/* Note: Only uses first TLB reg currently */
1261	if (val_iva)
1262		dmar_writeq(iommu->reg + tlb_offset, val_iva);
1263	dmar_writeq(iommu->reg + tlb_offset + 8, val);
1264
1265	/* Make sure hardware completes it */
1266	IOMMU_WAIT_OP(iommu, tlb_offset + 8,
1267		dmar_readq, (!(val & DMA_TLB_IVT)), val);
1268
1269	raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
1270
1271	/* check IOTLB invalidation granularity */
1272	if (DMA_TLB_IAIG(val) == 0)
1273		printk(KERN_ERR"IOMMU: flush IOTLB failed\n");
1274	if (DMA_TLB_IAIG(val) != DMA_TLB_IIRG(type))
1275		pr_debug("IOMMU: tlb flush request %Lx, actual %Lx\n",
1276			(unsigned long long)DMA_TLB_IIRG(type),
1277			(unsigned long long)DMA_TLB_IAIG(val));
1278}
1279
1280static struct device_domain_info *
1281iommu_support_dev_iotlb(struct dmar_domain *domain, struct intel_iommu *iommu,
1282			 u8 bus, u8 devfn)
1283{
1284	bool found = false;
1285	unsigned long flags;
1286	struct device_domain_info *info;
1287	struct pci_dev *pdev;
1288
1289	if (!ecap_dev_iotlb_support(iommu->ecap))
1290		return NULL;
1291
1292	if (!iommu->qi)
1293		return NULL;
1294
1295	spin_lock_irqsave(&device_domain_lock, flags);
1296	list_for_each_entry(info, &domain->devices, link)
1297		if (info->iommu == iommu && info->bus == bus &&
1298		    info->devfn == devfn) {
1299			found = true;
1300			break;
1301		}
1302	spin_unlock_irqrestore(&device_domain_lock, flags);
1303
1304	if (!found || !info->dev || !dev_is_pci(info->dev))
1305		return NULL;
1306
1307	pdev = to_pci_dev(info->dev);
1308
1309	if (!pci_find_ext_capability(pdev, PCI_EXT_CAP_ID_ATS))
1310		return NULL;
1311
1312	if (!dmar_find_matched_atsr_unit(pdev))
1313		return NULL;
1314
1315	return info;
1316}
1317
1318static void iommu_enable_dev_iotlb(struct device_domain_info *info)
1319{
1320	if (!info || !dev_is_pci(info->dev))
1321		return;
1322
1323	pci_enable_ats(to_pci_dev(info->dev), VTD_PAGE_SHIFT);
1324}
1325
1326static void iommu_disable_dev_iotlb(struct device_domain_info *info)
1327{
1328	if (!info->dev || !dev_is_pci(info->dev) ||
1329	    !pci_ats_enabled(to_pci_dev(info->dev)))
1330		return;
1331
1332	pci_disable_ats(to_pci_dev(info->dev));
1333}
1334
1335static void iommu_flush_dev_iotlb(struct dmar_domain *domain,
1336				  u64 addr, unsigned mask)
1337{
1338	u16 sid, qdep;
1339	unsigned long flags;
1340	struct device_domain_info *info;
1341
1342	spin_lock_irqsave(&device_domain_lock, flags);
1343	list_for_each_entry(info, &domain->devices, link) {
1344		struct pci_dev *pdev;
1345		if (!info->dev || !dev_is_pci(info->dev))
1346			continue;
1347
1348		pdev = to_pci_dev(info->dev);
1349		if (!pci_ats_enabled(pdev))
1350			continue;
1351
1352		sid = info->bus << 8 | info->devfn;
1353		qdep = pci_ats_queue_depth(pdev);
1354		qi_flush_dev_iotlb(info->iommu, sid, qdep, addr, mask);
1355	}
1356	spin_unlock_irqrestore(&device_domain_lock, flags);
1357}
1358
1359static void iommu_flush_iotlb_psi(struct intel_iommu *iommu, u16 did,
1360				  unsigned long pfn, unsigned int pages, int ih, int map)
1361{
1362	unsigned int mask = ilog2(__roundup_pow_of_two(pages));
1363	uint64_t addr = (uint64_t)pfn << VTD_PAGE_SHIFT;
1364
1365	BUG_ON(pages == 0);
1366
1367	if (ih)
1368		ih = 1 << 6;
1369	/*
1370	 * Fallback to domain selective flush if no PSI support or the size is
1371	 * too big.
1372	 * PSI requires page size to be 2 ^ x, and the base address is naturally
1373	 * aligned to the size
1374	 */
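	/*
	 * Example: pages == 10 rounds up to 16, giving mask == 4, i.e. a
	 * page-selective invalidation of a naturally aligned 16-page
	 * (64KiB) region.
	 */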
1375	if (!cap_pgsel_inv(iommu->cap) || mask > cap_max_amask_val(iommu->cap))
1376		iommu->flush.flush_iotlb(iommu, did, 0, 0,
1377						DMA_TLB_DSI_FLUSH);
1378	else
1379		iommu->flush.flush_iotlb(iommu, did, addr | ih, mask,
1380						DMA_TLB_PSI_FLUSH);
1381
1382	/*
1383	 * In caching mode, changes of pages from non-present to present require
1384	 * flush. However, device IOTLB doesn't need to be flushed in this case.
1385	 */
1386	if (!cap_caching_mode(iommu->cap) || !map)
1387		iommu_flush_dev_iotlb(iommu->domains[did], addr, mask);
1388}
1389
1390static void iommu_disable_protect_mem_regions(struct intel_iommu *iommu)
1391{
1392	u32 pmen;
1393	unsigned long flags;
1394
1395	raw_spin_lock_irqsave(&iommu->register_lock, flags);
1396	pmen = readl(iommu->reg + DMAR_PMEN_REG);
1397	pmen &= ~DMA_PMEN_EPM;
1398	writel(pmen, iommu->reg + DMAR_PMEN_REG);
1399
1400	/* wait for the protected region status bit to clear */
1401	IOMMU_WAIT_OP(iommu, DMAR_PMEN_REG,
1402		readl, !(pmen & DMA_PMEN_PRS), pmen);
1403
1404	raw_spin_unlock_irqrestore(&iommu->register_lock, flags);
1405}
1406
1407static void iommu_enable_translation(struct intel_iommu *iommu)
1408{
1409	u32 sts;
1410	unsigned long flags;
1411
1412	raw_spin_lock_irqsave(&iommu->register_lock, flags);
1413	iommu->gcmd |= DMA_GCMD_TE;
1414	writel(iommu->gcmd, iommu->reg + DMAR_GCMD_REG);
1415
1416	/* Make sure hardware completes it */
1417	IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG,
1418		      readl, (sts & DMA_GSTS_TES), sts);
1419
1420	raw_spin_unlock_irqrestore(&iommu->register_lock, flags);
1421}
1422
1423static void iommu_disable_translation(struct intel_iommu *iommu)
1424{
1425	u32 sts;
1426	unsigned long flag;
1427
1428	raw_spin_lock_irqsave(&iommu->register_lock, flag);
1429	iommu->gcmd &= ~DMA_GCMD_TE;
1430	writel(iommu->gcmd, iommu->reg + DMAR_GCMD_REG);
1431
1432	/* Make sure hardware completes it */
1433	IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG,
1434		      readl, (!(sts & DMA_GSTS_TES)), sts);
1435
1436	raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
1437}
1438
1439
1440static int iommu_init_domains(struct intel_iommu *iommu)
1441{
1442	unsigned long ndomains;
1443	unsigned long nlongs;
1444
1445	ndomains = cap_ndoms(iommu->cap);
1446	pr_debug("IOMMU%d: Number of Domains supported <%ld>\n",
1447		 iommu->seq_id, ndomains);
1448	nlongs = BITS_TO_LONGS(ndomains);
1449
1450	spin_lock_init(&iommu->lock);
1451
1452	/* TBD: there might be 64K domains,
1453	 * consider other allocation for future chip
1454	 */
1455	iommu->domain_ids = kcalloc(nlongs, sizeof(unsigned long), GFP_KERNEL);
1456	if (!iommu->domain_ids) {
1457		pr_err("IOMMU%d: allocating domain id array failed\n",
1458		       iommu->seq_id);
1459		return -ENOMEM;
1460	}
1461	iommu->domains = kcalloc(ndomains, sizeof(struct dmar_domain *),
1462			GFP_KERNEL);
1463	if (!iommu->domains) {
1464		pr_err("IOMMU%d: allocating domain array failed\n",
1465		       iommu->seq_id);
1466		kfree(iommu->domain_ids);
1467		iommu->domain_ids = NULL;
1468		return -ENOMEM;
1469	}
1470
1471	/*
1472	 * If caching mode is set, then invalid translations are tagged
1473	 * with domain id 0, hence we need to pre-allocate it.
1474	 */
1475	if (cap_caching_mode(iommu->cap))
1476		set_bit(0, iommu->domain_ids);
1477	return 0;
1478}
1479
1480static void disable_dmar_iommu(struct intel_iommu *iommu)
1481{
1482	struct dmar_domain *domain;
1483	int i;
1484
1485	if ((iommu->domains) && (iommu->domain_ids)) {
1486		for_each_set_bit(i, iommu->domain_ids, cap_ndoms(iommu->cap)) {
1487			/*
1488			 * Domain id 0 is reserved for invalid translation
1489			 * if hardware supports caching mode.
1490			 */
1491			if (cap_caching_mode(iommu->cap) && i == 0)
1492				continue;
1493
1494			domain = iommu->domains[i];
1495			clear_bit(i, iommu->domain_ids);
1496			if (domain_detach_iommu(domain, iommu) == 0 &&
1497			    !domain_type_is_vm(domain))
1498				domain_exit(domain);
1499		}
1500	}
1501
1502	if (iommu->gcmd & DMA_GCMD_TE)
1503		iommu_disable_translation(iommu);
1504}
1505
1506static void free_dmar_iommu(struct intel_iommu *iommu)
1507{
1508	if ((iommu->domains) && (iommu->domain_ids)) {
1509		kfree(iommu->domains);
1510		kfree(iommu->domain_ids);
1511		iommu->domains = NULL;
1512		iommu->domain_ids = NULL;
1513	}
1514
1515	g_iommus[iommu->seq_id] = NULL;
1516
1517	/* free context mapping */
1518	free_context_table(iommu);
1519}
1520
1521static struct dmar_domain *alloc_domain(int flags)
1522{
1523	/* domain id for virtual machine, it won't be set in context */
1524	static atomic_t vm_domid = ATOMIC_INIT(0);
1525	struct dmar_domain *domain;
1526
1527	domain = alloc_domain_mem();
1528	if (!domain)
1529		return NULL;
1530
1531	memset(domain, 0, sizeof(*domain));
1532	domain->nid = -1;
1533	domain->flags = flags;
1534	spin_lock_init(&domain->iommu_lock);
1535	INIT_LIST_HEAD(&domain->devices);
1536	if (flags & DOMAIN_FLAG_VIRTUAL_MACHINE)
1537		domain->id = atomic_inc_return(&vm_domid);
1538
1539	return domain;
1540}
1541
1542static int __iommu_attach_domain(struct dmar_domain *domain,
1543				 struct intel_iommu *iommu)
1544{
1545	int num;
1546	unsigned long ndomains;
1547
1548	ndomains = cap_ndoms(iommu->cap);
1549	num = find_first_zero_bit(iommu->domain_ids, ndomains);
1550	if (num < ndomains) {
1551		set_bit(num, iommu->domain_ids);
1552		iommu->domains[num] = domain;
1553	} else {
1554		num = -ENOSPC;
1555	}
1556
1557	return num;
1558}
1559
1560static int iommu_attach_domain(struct dmar_domain *domain,
1561			       struct intel_iommu *iommu)
1562{
1563	int num;
1564	unsigned long flags;
1565
1566	spin_lock_irqsave(&iommu->lock, flags);
1567	num = __iommu_attach_domain(domain, iommu);
1568	spin_unlock_irqrestore(&iommu->lock, flags);
1569	if (num < 0)
1570		pr_err("IOMMU: no free domain ids\n");
1571
1572	return num;
1573}
1574
1575static int iommu_attach_vm_domain(struct dmar_domain *domain,
1576				  struct intel_iommu *iommu)
1577{
1578	int num;
1579	unsigned long ndomains;
1580
1581	ndomains = cap_ndoms(iommu->cap);
1582	for_each_set_bit(num, iommu->domain_ids, ndomains)
1583		if (iommu->domains[num] == domain)
1584			return num;
1585
1586	return __iommu_attach_domain(domain, iommu);
1587}
1588
1589static void iommu_detach_domain(struct dmar_domain *domain,
1590				struct intel_iommu *iommu)
1591{
1592	unsigned long flags;
1593	int num, ndomains;
1594
1595	spin_lock_irqsave(&iommu->lock, flags);
1596	if (domain_type_is_vm_or_si(domain)) {
1597		ndomains = cap_ndoms(iommu->cap);
1598		for_each_set_bit(num, iommu->domain_ids, ndomains) {
1599			if (iommu->domains[num] == domain) {
1600				clear_bit(num, iommu->domain_ids);
1601				iommu->domains[num] = NULL;
1602				break;
1603			}
1604		}
1605	} else {
1606		clear_bit(domain->id, iommu->domain_ids);
1607		iommu->domains[domain->id] = NULL;
1608	}
1609	spin_unlock_irqrestore(&iommu->lock, flags);
1610}
1611
1612static void domain_attach_iommu(struct dmar_domain *domain,
1613			       struct intel_iommu *iommu)
1614{
1615	unsigned long flags;
1616
1617	spin_lock_irqsave(&domain->iommu_lock, flags);
1618	if (!test_and_set_bit(iommu->seq_id, domain->iommu_bmp)) {
1619		domain->iommu_count++;
1620		if (domain->iommu_count == 1)
1621			domain->nid = iommu->node;
1622		domain_update_iommu_cap(domain);
1623	}
1624	spin_unlock_irqrestore(&domain->iommu_lock, flags);
1625}
1626
1627static int domain_detach_iommu(struct dmar_domain *domain,
1628			       struct intel_iommu *iommu)
1629{
1630	unsigned long flags;
1631	int count = INT_MAX;
1632
1633	spin_lock_irqsave(&domain->iommu_lock, flags);
1634	if (test_and_clear_bit(iommu->seq_id, domain->iommu_bmp)) {
1635		count = --domain->iommu_count;
1636		domain_update_iommu_cap(domain);
1637	}
1638	spin_unlock_irqrestore(&domain->iommu_lock, flags);
1639
1640	return count;
1641}
1642
1643static struct iova_domain reserved_iova_list;
1644static struct lock_class_key reserved_rbtree_key;
1645
1646static int dmar_init_reserved_ranges(void)
1647{
1648	struct pci_dev *pdev = NULL;
1649	struct iova *iova;
1650	int i;
1651
1652	init_iova_domain(&reserved_iova_list, VTD_PAGE_SIZE, IOVA_START_PFN,
1653			DMA_32BIT_PFN);
1654
1655	lockdep_set_class(&reserved_iova_list.iova_rbtree_lock,
1656		&reserved_rbtree_key);
1657
1658	/* IOAPIC ranges shouldn't be accessed by DMA */
1659	iova = reserve_iova(&reserved_iova_list, IOVA_PFN(IOAPIC_RANGE_START),
1660		IOVA_PFN(IOAPIC_RANGE_END));
1661	if (!iova) {
1662		printk(KERN_ERR "Reserve IOAPIC range failed\n");
1663		return -ENODEV;
1664	}
1665
1666	/* Reserve all PCI MMIO to avoid peer-to-peer access */
1667	for_each_pci_dev(pdev) {
1668		struct resource *r;
1669
1670		for (i = 0; i < PCI_NUM_RESOURCES; i++) {
1671			r = &pdev->resource[i];
1672			if (!r->flags || !(r->flags & IORESOURCE_MEM))
1673				continue;
1674			iova = reserve_iova(&reserved_iova_list,
1675					    IOVA_PFN(r->start),
1676					    IOVA_PFN(r->end));
1677			if (!iova) {
1678				printk(KERN_ERR "Reserve iova failed\n");
1679				return -ENODEV;
1680			}
1681		}
1682	}
1683	return 0;
1684}
1685
1686static void domain_reserve_special_ranges(struct dmar_domain *domain)
1687{
1688	copy_reserved_iova(&reserved_iova_list, &domain->iovad);
1689}
1690
1691static inline int guestwidth_to_adjustwidth(int gaw)
1692{
1693	int agaw;
1694	int r = (gaw - 12) % 9;
1695
1696	if (r == 0)
1697		agaw = gaw;
1698	else
1699		agaw = gaw + 9 - r;
1700	if (agaw > 64)
1701		agaw = 64;
1702	return agaw;
1703}
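/*
 * Example: a guest width of 48 (== 12 + 4 * 9) is returned unchanged,
 * while a guest width of 40 ((40 - 12) % 9 == 1) is rounded up to the
 * next width of the form 12 + 9 * n, i.e. 48.
 */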
1704
1705static int domain_init(struct dmar_domain *domain, int guest_width)
1706{
1707	struct intel_iommu *iommu;
1708	int adjust_width, agaw;
1709	unsigned long sagaw;
1710
1711	init_iova_domain(&domain->iovad, VTD_PAGE_SIZE, IOVA_START_PFN,
1712			DMA_32BIT_PFN);
1713	domain_reserve_special_ranges(domain);
1714
1715	/* calculate AGAW */
1716	iommu = domain_get_iommu(domain);
1717	if (guest_width > cap_mgaw(iommu->cap))
1718		guest_width = cap_mgaw(iommu->cap);
1719	domain->gaw = guest_width;
1720	adjust_width = guestwidth_to_adjustwidth(guest_width);
1721	agaw = width_to_agaw(adjust_width);
1722	sagaw = cap_sagaw(iommu->cap);
1723	if (!test_bit(agaw, &sagaw)) {
1724		/* hardware doesn't support it, choose a bigger one */
1725		pr_debug("IOMMU: hardware doesn't support agaw %d\n", agaw);
1726		agaw = find_next_bit(&sagaw, 5, agaw);
1727		if (agaw >= 5)
1728			return -ENODEV;
1729	}
1730	domain->agaw = agaw;
1731
1732	if (ecap_coherent(iommu->ecap))
1733		domain->iommu_coherency = 1;
1734	else
1735		domain->iommu_coherency = 0;
1736
1737	if (ecap_sc_support(iommu->ecap))
1738		domain->iommu_snooping = 1;
1739	else
1740		domain->iommu_snooping = 0;
1741
1742	if (intel_iommu_superpage)
1743		domain->iommu_superpage = fls(cap_super_page_val(iommu->cap));
1744	else
1745		domain->iommu_superpage = 0;
1746
1747	domain->nid = iommu->node;
1748
1749	/* always allocate the top pgd */
1750	domain->pgd = (struct dma_pte *)alloc_pgtable_page(domain->nid);
1751	if (!domain->pgd)
1752		return -ENOMEM;
1753	__iommu_flush_cache(iommu, domain->pgd, PAGE_SIZE);
1754	return 0;
1755}
1756
1757static void domain_exit(struct dmar_domain *domain)
1758{
1759	struct dmar_drhd_unit *drhd;
1760	struct intel_iommu *iommu;
1761	struct page *freelist = NULL;
1762
1763	/* Domain 0 is reserved, so don't process it */
1764	if (!domain)
1765		return;
1766
1767	/* Flush any lazy unmaps that may reference this domain */
1768	if (!intel_iommu_strict)
1769		flush_unmaps_timeout(0);
1770
1771	/* remove associated devices */
1772	domain_remove_dev_info(domain);
1773
1774	/* destroy iovas */
1775	put_iova_domain(&domain->iovad);
1776
1777	freelist = domain_unmap(domain, 0, DOMAIN_MAX_PFN(domain->gaw));
1778
1779	/* clear attached or cached domains */
1780	rcu_read_lock();
1781	for_each_active_iommu(iommu, drhd)
1782		if (domain_type_is_vm(domain) ||
1783		    test_bit(iommu->seq_id, domain->iommu_bmp))
1784			iommu_detach_domain(domain, iommu);
1785	rcu_read_unlock();
1786
1787	dma_free_pagelist(freelist);
1788
1789	free_domain_mem(domain);
1790}
1791
1792static int domain_context_mapping_one(struct dmar_domain *domain,
1793				      struct intel_iommu *iommu,
1794				      u8 bus, u8 devfn, int translation)
1795{
1796	struct context_entry *context;
1797	unsigned long flags;
1798	struct dma_pte *pgd;
1799	int id;
1800	int agaw;
1801	struct device_domain_info *info = NULL;
1802
1803	pr_debug("Set context mapping for %02x:%02x.%d\n",
1804		bus, PCI_SLOT(devfn), PCI_FUNC(devfn));
1805
1806	BUG_ON(!domain->pgd);
1807	BUG_ON(translation != CONTEXT_TT_PASS_THROUGH &&
1808	       translation != CONTEXT_TT_MULTI_LEVEL);
1809
1810	spin_lock_irqsave(&iommu->lock, flags);
1811	context = iommu_context_addr(iommu, bus, devfn, 1);
1812	spin_unlock_irqrestore(&iommu->lock, flags);
1813	if (!context)
1814		return -ENOMEM;
1815	spin_lock_irqsave(&iommu->lock, flags);
1816	if (context_present(context)) {
1817		spin_unlock_irqrestore(&iommu->lock, flags);
1818		return 0;
1819	}
1820
1821	id = domain->id;
1822	pgd = domain->pgd;
1823
1824	if (domain_type_is_vm_or_si(domain)) {
1825		if (domain_type_is_vm(domain)) {
1826			id = iommu_attach_vm_domain(domain, iommu);
1827			if (id < 0) {
1828				spin_unlock_irqrestore(&iommu->lock, flags);
1829				pr_err("IOMMU: no free domain ids\n");
1830				return -EFAULT;
1831			}
1832		}
1833
1834		/* Skip top levels of page tables for
1835		 * iommus which have a smaller agaw than the default.
1836		 * Unnecessary for PT mode.
1837		 */
1838		if (translation != CONTEXT_TT_PASS_THROUGH) {
1839			for (agaw = domain->agaw; agaw != iommu->agaw; agaw--) {
1840				pgd = phys_to_virt(dma_pte_addr(pgd));
1841				if (!dma_pte_present(pgd)) {
1842					spin_unlock_irqrestore(&iommu->lock, flags);
1843					return -ENOMEM;
1844				}
1845			}
1846		}
1847	}
1848
1849	context_set_domain_id(context, id);
1850
1851	if (translation != CONTEXT_TT_PASS_THROUGH) {
1852		info = iommu_support_dev_iotlb(domain, iommu, bus, devfn);
1853		translation = info ? CONTEXT_TT_DEV_IOTLB :
1854				     CONTEXT_TT_MULTI_LEVEL;
1855	}
1856	/*
1857	 * In pass through mode, AW must be programmed to indicate the largest
1858	 * AGAW value supported by hardware. And ASR is ignored by hardware.
1859	 */
1860	if (unlikely(translation == CONTEXT_TT_PASS_THROUGH))
1861		context_set_address_width(context, iommu->msagaw);
1862	else {
1863		context_set_address_root(context, virt_to_phys(pgd));
1864		context_set_address_width(context, iommu->agaw);
1865	}
1866
1867	context_set_translation_type(context, translation);
1868	context_set_fault_enable(context);
1869	context_set_present(context);
1870	domain_flush_cache(domain, context, sizeof(*context));
1871
1872	/*
1873	 * It's a non-present to present mapping. If hardware doesn't cache
1874	 * non-present entries we only need to flush the write buffer. If the
1875	 * hardware _does_ cache non-present entries, then it does so in the special
1876	 * domain #0, which we have to flush:
1877	 */
1878	if (cap_caching_mode(iommu->cap)) {
1879		iommu->flush.flush_context(iommu, 0,
1880					   (((u16)bus) << 8) | devfn,
1881					   DMA_CCMD_MASK_NOBIT,
1882					   DMA_CCMD_DEVICE_INVL);
1883		iommu->flush.flush_iotlb(iommu, id, 0, 0, DMA_TLB_DSI_FLUSH);
1884	} else {
1885		iommu_flush_write_buffer(iommu);
1886	}
1887	iommu_enable_dev_iotlb(info);
1888	spin_unlock_irqrestore(&iommu->lock, flags);
1889
1890	domain_attach_iommu(domain, iommu);
1891
1892	return 0;
1893}
1894
1895struct domain_context_mapping_data {
1896	struct dmar_domain *domain;
1897	struct intel_iommu *iommu;
1898	int translation;
1899};
1900
1901static int domain_context_mapping_cb(struct pci_dev *pdev,
1902				     u16 alias, void *opaque)
1903{
1904	struct domain_context_mapping_data *data = opaque;
1905
1906	return domain_context_mapping_one(data->domain, data->iommu,
1907					  PCI_BUS_NUM(alias), alias & 0xff,
1908					  data->translation);
1909}
1910
1911static int
1912domain_context_mapping(struct dmar_domain *domain, struct device *dev,
1913		       int translation)
1914{
1915	struct intel_iommu *iommu;
1916	u8 bus, devfn;
1917	struct domain_context_mapping_data data;
1918
1919	iommu = device_to_iommu(dev, &bus, &devfn);
1920	if (!iommu)
1921		return -ENODEV;
1922
1923	if (!dev_is_pci(dev))
1924		return domain_context_mapping_one(domain, iommu, bus, devfn,
1925						  translation);
1926
1927	data.domain = domain;
1928	data.iommu = iommu;
1929	data.translation = translation;
1930
1931	return pci_for_each_dma_alias(to_pci_dev(dev),
1932				      &domain_context_mapping_cb, &data);
1933}
1934
1935static int domain_context_mapped_cb(struct pci_dev *pdev,
1936				    u16 alias, void *opaque)
1937{
1938	struct intel_iommu *iommu = opaque;
1939
1940	return !device_context_mapped(iommu, PCI_BUS_NUM(alias), alias & 0xff);
1941}
1942
1943static int domain_context_mapped(struct device *dev)
1944{
1945	struct intel_iommu *iommu;
1946	u8 bus, devfn;
1947
1948	iommu = device_to_iommu(dev, &bus, &devfn);
1949	if (!iommu)
1950		return -ENODEV;
1951
1952	if (!dev_is_pci(dev))
1953		return device_context_mapped(iommu, bus, devfn);
1954
1955	return !pci_for_each_dma_alias(to_pci_dev(dev),
1956				       domain_context_mapped_cb, iommu);
1957}
1958
1959/* Returns a number of VTD pages, but aligned to MM page size */
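/*
 * Worked example (assuming 4KiB pages on both the MM and VT-d side):
 * host_addr = 0x1234, size = 0x2000 leaves an in-page offset of 0x234;
 * PAGE_ALIGN(0x234 + 0x2000) = 0x3000, so three VT-d pages are needed.
 */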
1960static inline unsigned long aligned_nrpages(unsigned long host_addr,
1961					    size_t size)
1962{
1963	host_addr &= ~PAGE_MASK;
1964	return PAGE_ALIGN(host_addr + size) >> VTD_PAGE_SHIFT;
1965}
1966
1967/* Return largest possible superpage level for a given mapping */
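/*
 * Rough illustration, assuming the usual 9-bit stride: level 2 (a 2MiB
 * superpage with 4KiB base pages) is only reported when both iov_pfn and
 * phy_pfn have their low 9 bits clear and at least 512 pages remain to be
 * mapped; each further level raises both requirements by another factor of
 * 512, and domain->iommu_superpage bounds how far the level can be raised.
 */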
1968static inline int hardware_largepage_caps(struct dmar_domain *domain,
1969					  unsigned long iov_pfn,
1970					  unsigned long phy_pfn,
1971					  unsigned long pages)
1972{
1973	int support, level = 1;
1974	unsigned long pfnmerge;
1975
1976	support = domain->iommu_superpage;
1977
1978	/* To use a large page, the virtual *and* physical addresses
1979	   must be aligned to 2MiB/1GiB/etc. Lower bits set in either
1980	   of them will mean we have to use smaller pages. So just
1981	   merge them and check both at once. */
1982	pfnmerge = iov_pfn | phy_pfn;
1983
1984	while (support && !(pfnmerge & ~VTD_STRIDE_MASK)) {
1985		pages >>= VTD_STRIDE_SHIFT;
1986		if (!pages)
1987			break;
1988		pfnmerge >>= VTD_STRIDE_SHIFT;
1989		level++;
1990		support--;
1991	}
1992	return level;
1993}
1994
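/*
 * Note: __domain_mapping() fills nr_pages PTEs starting at iov_pfn either
 * from a scatterlist (sg != NULL) or from a physically contiguous range
 * starting at phys_pfn (sg == NULL).  It opportunistically uses superpages
 * when hardware_largepage_caps() allows, and flushes the CPU cache for the
 * PTEs it has written in batches.
 */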
1995static int __domain_mapping(struct dmar_domain *domain, unsigned long iov_pfn,
1996			    struct scatterlist *sg, unsigned long phys_pfn,
1997			    unsigned long nr_pages, int prot)
1998{
1999	struct dma_pte *first_pte = NULL, *pte = NULL;
2000	phys_addr_t uninitialized_var(pteval);
2001	unsigned long sg_res = 0;
2002	unsigned int largepage_lvl = 0;
2003	unsigned long lvl_pages = 0;
2004
2005	BUG_ON(!domain_pfn_supported(domain, iov_pfn + nr_pages - 1));
2006
2007	if ((prot & (DMA_PTE_READ|DMA_PTE_WRITE)) == 0)
2008		return -EINVAL;
2009
2010	prot &= DMA_PTE_READ | DMA_PTE_WRITE | DMA_PTE_SNP;
2011
2012	if (!sg) {
2013		sg_res = nr_pages;
2014		pteval = ((phys_addr_t)phys_pfn << VTD_PAGE_SHIFT) | prot;
2015	}
2016
2017	while (nr_pages > 0) {
2018		uint64_t tmp;
2019
2020		if (!sg_res) {
2021			sg_res = aligned_nrpages(sg->offset, sg->length);
2022			sg->dma_address = ((dma_addr_t)iov_pfn << VTD_PAGE_SHIFT) + sg->offset;
2023			sg->dma_length = sg->length;
2024			pteval = page_to_phys(sg_page(sg)) | prot;
2025			phys_pfn = pteval >> VTD_PAGE_SHIFT;
2026		}
2027
2028		if (!pte) {
2029			largepage_lvl = hardware_largepage_caps(domain, iov_pfn, phys_pfn, sg_res);
2030
2031			first_pte = pte = pfn_to_dma_pte(domain, iov_pfn, &largepage_lvl);
2032			if (!pte)
2033				return -ENOMEM;
2034			/* It is a large page */
2035			if (largepage_lvl > 1) {
2036				unsigned long nr_superpages, end_pfn;
2037
2038				pteval |= DMA_PTE_LARGE_PAGE;
2039				lvl_pages = lvl_to_nr_pages(largepage_lvl);
2040
2041				nr_superpages = sg_res / lvl_pages;
2042				end_pfn = iov_pfn + nr_superpages * lvl_pages - 1;
2043
2044				/*
2045				 * Ensure that old small page tables are
2046				 * removed to make room for superpage(s).
2047				 */
2048				dma_pte_free_pagetable(domain, iov_pfn, end_pfn);
2049			} else {
2050				pteval &= ~(uint64_t)DMA_PTE_LARGE_PAGE;
2051			}
2052
2053		}
2054		/* We don't need a lock here; nobody else
2055		 * touches the iova range
2056		 */
2057		tmp = cmpxchg64_local(&pte->val, 0ULL, pteval);
2058		if (tmp) {
2059			static int dumps = 5;
2060			printk(KERN_CRIT "ERROR: DMA PTE for vPFN 0x%lx already set (to %llx not %llx)\n",
2061			       iov_pfn, tmp, (unsigned long long)pteval);
2062			if (dumps) {
2063				dumps--;
2064				debug_dma_dump_mappings(NULL);
2065			}
2066			WARN_ON(1);
2067		}
2068
2069		lvl_pages = lvl_to_nr_pages(largepage_lvl);
2070
2071		BUG_ON(nr_pages < lvl_pages);
2072		BUG_ON(sg_res < lvl_pages);
2073
2074		nr_pages -= lvl_pages;
2075		iov_pfn += lvl_pages;
2076		phys_pfn += lvl_pages;
2077		pteval += lvl_pages * VTD_PAGE_SIZE;
2078		sg_res -= lvl_pages;
2079
2080		/* If the next PTE would be the first in a new page, then we
2081		   need to flush the cache on the entries we've just written.
2082		   And then we'll need to recalculate 'pte', so clear it and
2083		   let it get set again in the if (!pte) block above.
2084
2085		   If we're done (!nr_pages) we need to flush the cache too.
2086
2087		   Also if we've been setting superpages, we may need to
2088		   recalculate 'pte' and switch back to smaller pages for the
2089		   end of the mapping, if the trailing size is not enough to
2090		   use another superpage (i.e. sg_res < lvl_pages). */
2091		pte++;
2092		if (!nr_pages || first_pte_in_page(pte) ||
2093		    (largepage_lvl > 1 && sg_res < lvl_pages)) {
2094			domain_flush_cache(domain, first_pte,
2095					   (void *)pte - (void *)first_pte);
2096			pte = NULL;
2097		}
2098
2099		if (!sg_res && nr_pages)
2100			sg = sg_next(sg);
2101	}
2102	return 0;
2103}
2104
2105static inline int domain_sg_mapping(struct dmar_domain *domain, unsigned long iov_pfn,
2106				    struct scatterlist *sg, unsigned long nr_pages,
2107				    int prot)
2108{
2109	return __domain_mapping(domain, iov_pfn, sg, 0, nr_pages, prot);
2110}
2111
2112static inline int domain_pfn_mapping(struct dmar_domain *domain, unsigned long iov_pfn,
2113				     unsigned long phys_pfn, unsigned long nr_pages,
2114				     int prot)
2115{
2116	return __domain_mapping(domain, iov_pfn, NULL, phys_pfn, nr_pages, prot);
2117}
2118
2119static void iommu_detach_dev(struct intel_iommu *iommu, u8 bus, u8 devfn)
2120{
2121	if (!iommu)
2122		return;
2123
2124	clear_context_table(iommu, bus, devfn);
2125	iommu->flush.flush_context(iommu, 0, 0, 0,
2126					   DMA_CCMD_GLOBAL_INVL);
2127	iommu->flush.flush_iotlb(iommu, 0, 0, 0, DMA_TLB_GLOBAL_FLUSH);
2128}
2129
2130static inline void unlink_domain_info(struct device_domain_info *info)
2131{
2132	assert_spin_locked(&device_domain_lock);
2133	list_del(&info->link);
2134	list_del(&info->global);
2135	if (info->dev)
2136		info->dev->archdata.iommu = NULL;
2137}
2138
2139static void domain_remove_dev_info(struct dmar_domain *domain)
2140{
2141	struct device_domain_info *info, *tmp;
2142	unsigned long flags;
2143
2144	spin_lock_irqsave(&device_domain_lock, flags);
2145	list_for_each_entry_safe(info, tmp, &domain->devices, link) {
2146		unlink_domain_info(info);
2147		spin_unlock_irqrestore(&device_domain_lock, flags);
2148
2149		iommu_disable_dev_iotlb(info);
2150		iommu_detach_dev(info->iommu, info->bus, info->devfn);
2151
2152		if (domain_type_is_vm(domain)) {
2153			iommu_detach_dependent_devices(info->iommu, info->dev);
2154			domain_detach_iommu(domain, info->iommu);
2155		}
2156
2157		free_devinfo_mem(info);
2158		spin_lock_irqsave(&device_domain_lock, flags);
2159	}
2160	spin_unlock_irqrestore(&device_domain_lock, flags);
2161}
2162
2163/*
2164 * find_domain
2165 * Note: struct device->archdata.iommu stores the info
2166 */
2167static struct dmar_domain *find_domain(struct device *dev)
2168{
2169	struct device_domain_info *info;
2170
2171	/* No lock here, assumes no domain exit in normal case */
2172	info = dev->archdata.iommu;
2173	if (info)
2174		return info->domain;
2175	return NULL;
2176}
2177
2178static inline struct device_domain_info *
2179dmar_search_domain_by_dev_info(int segment, int bus, int devfn)
2180{
2181	struct device_domain_info *info;
2182
2183	list_for_each_entry(info, &device_domain_list, global)
2184		if (info->iommu->segment == segment && info->bus == bus &&
2185		    info->devfn == devfn)
2186			return info;
2187
2188	return NULL;
2189}
2190
2191static struct dmar_domain *dmar_insert_dev_info(struct intel_iommu *iommu,
2192						int bus, int devfn,
2193						struct device *dev,
2194						struct dmar_domain *domain)
2195{
2196	struct dmar_domain *found = NULL;
2197	struct device_domain_info *info;
2198	unsigned long flags;
2199
2200	info = alloc_devinfo_mem();
2201	if (!info)
2202		return NULL;
2203
2204	info->bus = bus;
2205	info->devfn = devfn;
2206	info->dev = dev;
2207	info->domain = domain;
2208	info->iommu = iommu;
2209
2210	spin_lock_irqsave(&device_domain_lock, flags);
2211	if (dev)
2212		found = find_domain(dev);
2213	else {
2214		struct device_domain_info *info2;
2215		info2 = dmar_search_domain_by_dev_info(iommu->segment, bus, devfn);
2216		if (info2)
2217			found = info2->domain;
2218	}
2219	if (found) {
2220		spin_unlock_irqrestore(&device_domain_lock, flags);
2221		free_devinfo_mem(info);
2222		/* Caller must free the original domain */
2223		return found;
2224	}
2225
2226	list_add(&info->link, &domain->devices);
2227	list_add(&info->global, &device_domain_list);
2228	if (dev)
2229		dev->archdata.iommu = info;
2230	spin_unlock_irqrestore(&device_domain_lock, flags);
2231
2232	return domain;
2233}
2234
2235static int get_last_alias(struct pci_dev *pdev, u16 alias, void *opaque)
2236{
2237	*(u16 *)opaque = alias;
2238	return 0;
2239}
2240
2241/* domain is initialized */
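/*
 * Lookup order: a domain already attached to the device, then a domain
 * already in use by the device's upstream PCI DMA alias, and only then a
 * newly allocated domain.  A new domain is also registered under the DMA
 * alias so that functions sharing a requester ID land in the same domain.
 */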
2242static struct dmar_domain *get_domain_for_dev(struct device *dev, int gaw)
2243{
2244	struct dmar_domain *domain, *tmp;
2245	struct intel_iommu *iommu;
2246	struct device_domain_info *info;
2247	u16 dma_alias;
2248	unsigned long flags;
2249	u8 bus, devfn;
2250
2251	domain = find_domain(dev);
2252	if (domain)
2253		return domain;
2254
2255	iommu = device_to_iommu(dev, &bus, &devfn);
2256	if (!iommu)
2257		return NULL;
2258
2259	if (dev_is_pci(dev)) {
2260		struct pci_dev *pdev = to_pci_dev(dev);
2261
2262		pci_for_each_dma_alias(pdev, get_last_alias, &dma_alias);
2263
2264		spin_lock_irqsave(&device_domain_lock, flags);
2265		info = dmar_search_domain_by_dev_info(pci_domain_nr(pdev->bus),
2266						      PCI_BUS_NUM(dma_alias),
2267						      dma_alias & 0xff);
2268		if (info) {
2269			iommu = info->iommu;
2270			domain = info->domain;
2271		}
2272		spin_unlock_irqrestore(&device_domain_lock, flags);
2273
2274		/* DMA alias already has a domain, use it */
2275		if (info)
2276			goto found_domain;
2277	}
2278
2279	/* Allocate and initialize new domain for the device */
2280	domain = alloc_domain(0);
2281	if (!domain)
2282		return NULL;
2283	domain->id = iommu_attach_domain(domain, iommu);
2284	if (domain->id < 0) {
2285		free_domain_mem(domain);
2286		return NULL;
2287	}
2288	domain_attach_iommu(domain, iommu);
2289	if (domain_init(domain, gaw)) {
2290		domain_exit(domain);
2291		return NULL;
2292	}
2293
2294	/* register PCI DMA alias device */
2295	if (dev_is_pci(dev)) {
2296		tmp = dmar_insert_dev_info(iommu, PCI_BUS_NUM(dma_alias),
2297					   dma_alias & 0xff, NULL, domain);
2298
2299		if (!tmp || tmp != domain) {
2300			domain_exit(domain);
2301			domain = tmp;
2302		}
2303
2304		if (!domain)
2305			return NULL;
2306	}
2307
2308found_domain:
2309	tmp = dmar_insert_dev_info(iommu, bus, devfn, dev, domain);
2310
2311	if (!tmp || tmp != domain) {
2312		domain_exit(domain);
2313		domain = tmp;
2314	}
2315
2316	return domain;
2317}
2318
2319static int iommu_identity_mapping;
2320#define IDENTMAP_ALL		1
2321#define IDENTMAP_GFX		2
2322#define IDENTMAP_AZALIA		4
2323
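/*
 * Identity-map [start, end] into @domain: the IOVA range is reserved so the
 * allocator never hands it out, any old PTEs are cleared, and the range is
 * mapped 1:1 (IOVA pfn == physical pfn) with read/write permission.
 */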
2324static int iommu_domain_identity_map(struct dmar_domain *domain,
2325				     unsigned long long start,
2326				     unsigned long long end)
2327{
2328	unsigned long first_vpfn = start >> VTD_PAGE_SHIFT;
2329	unsigned long last_vpfn = end >> VTD_PAGE_SHIFT;
2330
2331	if (!reserve_iova(&domain->iovad, dma_to_mm_pfn(first_vpfn),
2332			  dma_to_mm_pfn(last_vpfn))) {
2333		printk(KERN_ERR "IOMMU: reserve iova failed\n");
2334		return -ENOMEM;
2335	}
2336
2337	pr_debug("Mapping reserved region %llx-%llx for domain %d\n",
2338		 start, end, domain->id);
2339	/*
2340	 * RMRR range might have overlap with physical memory range,
2341	 * clear it first
2342	 */
2343	dma_pte_clear_range(domain, first_vpfn, last_vpfn);
2344
2345	return domain_pfn_mapping(domain, first_vpfn, first_vpfn,
2346				  last_vpfn - first_vpfn + 1,
2347				  DMA_PTE_READ|DMA_PTE_WRITE);
2348}
2349
2350static int iommu_prepare_identity_map(struct device *dev,
2351				      unsigned long long start,
2352				      unsigned long long end)
2353{
2354	struct dmar_domain *domain;
2355	int ret;
2356
2357	domain = get_domain_for_dev(dev, DEFAULT_DOMAIN_ADDRESS_WIDTH);
2358	if (!domain)
2359		return -ENOMEM;
2360
2361	/* For _hardware_ passthrough, don't bother. But for software
2362	   passthrough, we do it anyway -- it may indicate a memory
2363	   range which is reserved in E820, so it didn't get set
2364	   up to start with in the si_domain */
2365	if (domain == si_domain && hw_pass_through) {
2366		printk(KERN_INFO "Ignoring identity map for HW passthrough device %s [0x%Lx - 0x%Lx]\n",
2367		       dev_name(dev), start, end);
2368		return 0;
2369	}
2370
2371	printk(KERN_INFO
2372	       "IOMMU: Setting identity map for device %s [0x%Lx - 0x%Lx]\n",
2373	       dev_name(dev), start, end);
2374
2375	if (end < start) {
2376		WARN(1, "Your BIOS is broken; RMRR ends before it starts!\n"
2377			"BIOS vendor: %s; Ver: %s; Product Version: %s\n",
2378			dmi_get_system_info(DMI_BIOS_VENDOR),
2379			dmi_get_system_info(DMI_BIOS_VERSION),
2380		     dmi_get_system_info(DMI_PRODUCT_VERSION));
2381		ret = -EIO;
2382		goto error;
2383	}
2384
2385	if (end >> agaw_to_width(domain->agaw)) {
2386		WARN(1, "Your BIOS is broken; RMRR exceeds permitted address width (%d bits)\n"
2387		     "BIOS vendor: %s; Ver: %s; Product Version: %s\n",
2388		     agaw_to_width(domain->agaw),
2389		     dmi_get_system_info(DMI_BIOS_VENDOR),
2390		     dmi_get_system_info(DMI_BIOS_VERSION),
2391		     dmi_get_system_info(DMI_PRODUCT_VERSION));
2392		ret = -EIO;
2393		goto error;
2394	}
2395
2396	ret = iommu_domain_identity_map(domain, start, end);
2397	if (ret)
2398		goto error;
2399
2400	/* context entry init */
2401	ret = domain_context_mapping(domain, dev, CONTEXT_TT_MULTI_LEVEL);
2402	if (ret)
2403		goto error;
2404
2405	return 0;
2406
2407 error:
2408	domain_exit(domain);
2409	return ret;
2410}
2411
2412static inline int iommu_prepare_rmrr_dev(struct dmar_rmrr_unit *rmrr,
2413					 struct device *dev)
2414{
2415	if (dev->archdata.iommu == DUMMY_DEVICE_DOMAIN_INFO)
2416		return 0;
2417	return iommu_prepare_identity_map(dev, rmrr->base_address,
2418					  rmrr->end_address);
2419}
2420
2421#ifdef CONFIG_INTEL_IOMMU_FLOPPY_WA
2422static inline void iommu_prepare_isa(void)
2423{
2424	struct pci_dev *pdev;
2425	int ret;
2426
2427	pdev = pci_get_class(PCI_CLASS_BRIDGE_ISA << 8, NULL);
2428	if (!pdev)
2429		return;
2430
2431	printk(KERN_INFO "IOMMU: Prepare 0-16MiB unity mapping for LPC\n");
2432	ret = iommu_prepare_identity_map(&pdev->dev, 0, 16*1024*1024 - 1);
2433
2434	if (ret)
2435		printk(KERN_ERR "IOMMU: Failed to create 0-16MiB identity map; "
2436		       "floppy might not work\n");
2437
2438	pci_dev_put(pdev);
2439}
2440#else
2441static inline void iommu_prepare_isa(void)
2442{
2443	return;
2444}
2445#endif /* !CONFIG_INTEL_IOMMU_FLOPPY_WA */
2446
2447static int md_domain_init(struct dmar_domain *domain, int guest_width);
2448
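/*
 * si_domain is the single static identity domain shared by all IOMMUs.  In
 * hardware pass-through mode no page tables are needed; otherwise every
 * usable RAM range known to memblock is identity-mapped into it.
 */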
2449static int __init si_domain_init(int hw)
2450{
2451	struct dmar_drhd_unit *drhd;
2452	struct intel_iommu *iommu;
2453	int nid, ret = 0;
2454	bool first = true;
2455
2456	si_domain = alloc_domain(DOMAIN_FLAG_STATIC_IDENTITY);
2457	if (!si_domain)
2458		return -EFAULT;
2459
2460	for_each_active_iommu(iommu, drhd) {
2461		ret = iommu_attach_domain(si_domain, iommu);
2462		if (ret < 0) {
2463			domain_exit(si_domain);
2464			return -EFAULT;
2465		} else if (first) {
2466			si_domain->id = ret;
2467			first = false;
2468		} else if (si_domain->id != ret) {
2469			domain_exit(si_domain);
2470			return -EFAULT;
2471		}
2472		domain_attach_iommu(si_domain, iommu);
2473	}
2474
2475	if (md_domain_init(si_domain, DEFAULT_DOMAIN_ADDRESS_WIDTH)) {
2476		domain_exit(si_domain);
2477		return -EFAULT;
2478	}
2479
2480	pr_debug("IOMMU: identity mapping domain is domain %d\n",
2481		 si_domain->id);
2482
2483	if (hw)
2484		return 0;
2485
2486	for_each_online_node(nid) {
2487		unsigned long start_pfn, end_pfn;
2488		int i;
2489
2490		for_each_mem_pfn_range(i, nid, &start_pfn, &end_pfn, NULL) {
2491			ret = iommu_domain_identity_map(si_domain,
2492					PFN_PHYS(start_pfn), PFN_PHYS(end_pfn));
2493			if (ret)
2494				return ret;
2495		}
2496	}
2497
2498	return 0;
2499}
2500
2501static int identity_mapping(struct device *dev)
2502{
2503	struct device_domain_info *info;
2504
2505	if (likely(!iommu_identity_mapping))
2506		return 0;
2507
2508	info = dev->archdata.iommu;
2509	if (info && info != DUMMY_DEVICE_DOMAIN_INFO)
2510		return (info->domain == si_domain);
2511
2512	return 0;
2513}
2514
2515static int domain_add_dev_info(struct dmar_domain *domain,
2516			       struct device *dev, int translation)
2517{
2518	struct dmar_domain *ndomain;
2519	struct intel_iommu *iommu;
2520	u8 bus, devfn;
2521	int ret;
2522
2523	iommu = device_to_iommu(dev, &bus, &devfn);
2524	if (!iommu)
2525		return -ENODEV;
2526
2527	ndomain = dmar_insert_dev_info(iommu, bus, devfn, dev, domain);
2528	if (ndomain != domain)
2529		return -EBUSY;
2530
2531	ret = domain_context_mapping(domain, dev, translation);
2532	if (ret) {
2533		domain_remove_one_dev_info(domain, dev);
2534		return ret;
2535	}
2536
2537	return 0;
2538}
2539
2540static bool device_has_rmrr(struct device *dev)
2541{
2542	struct dmar_rmrr_unit *rmrr;
2543	struct device *tmp;
2544	int i;
2545
2546	rcu_read_lock();
2547	for_each_rmrr_units(rmrr) {
2548		/*
2549		 * Return TRUE if this RMRR contains the device that
2550		 * is passed in.
2551		 */
2552		for_each_active_dev_scope(rmrr->devices,
2553					  rmrr->devices_cnt, i, tmp)
2554			if (tmp == dev) {
2555				rcu_read_unlock();
2556				return true;
2557			}
2558	}
2559	rcu_read_unlock();
2560	return false;
2561}
2562
2563/*
2564 * There are a couple cases where we need to restrict the functionality of
2565 * devices associated with RMRRs.  The first is when evaluating a device for
2566 * identity mapping because problems exist when devices are moved in and out
2567 * of domains and their respective RMRR information is lost.  This means that
2568 * a device with associated RMRRs will never be in a "passthrough" domain.
2569 * The second is use of the device through the IOMMU API.  This interface
2570 * expects to have full control of the IOVA space for the device.  We cannot
2571 * satisfy both the requirement that RMRR access is maintained and have an
2572 * unencumbered IOVA space.  We also have no ability to quiesce the device's
2573 * use of the RMRR space or even inform the IOMMU API user of the restriction.
2574 * We therefore prevent devices associated with an RMRR from participating in
2575 * the IOMMU API, which eliminates them from device assignment.
2576 *
2577 * In both cases we assume that PCI USB devices with RMRRs have them largely
2578 * for historical reasons and that the RMRR space is not actively used post
2579 * boot.  This exclusion may change if vendors begin to abuse it.
2580 *
2581 * The same exception is made for graphics devices, with the requirement that
2582 * any use of the RMRR regions will be torn down before assigning the device
2583 * to a guest.
2584 */
2585static bool device_is_rmrr_locked(struct device *dev)
2586{
2587	if (!device_has_rmrr(dev))
2588		return false;
2589
2590	if (dev_is_pci(dev)) {
2591		struct pci_dev *pdev = to_pci_dev(dev);
2592
2593		if (IS_USB_DEVICE(pdev) || IS_GFX_DEVICE(pdev))
2594			return false;
2595	}
2596
2597	return true;
2598}
2599
2600static int iommu_should_identity_map(struct device *dev, int startup)
2601{
2602
2603	if (dev_is_pci(dev)) {
2604		struct pci_dev *pdev = to_pci_dev(dev);
2605
2606		if (device_is_rmrr_locked(dev))
2607			return 0;
2608
2609		if ((iommu_identity_mapping & IDENTMAP_AZALIA) && IS_AZALIA(pdev))
2610			return 1;
2611
2612		if ((iommu_identity_mapping & IDENTMAP_GFX) && IS_GFX_DEVICE(pdev))
2613			return 1;
2614
2615		if (!(iommu_identity_mapping & IDENTMAP_ALL))
2616			return 0;
2617
2618		/*
2619		 * We want to start off with all devices in the 1:1 domain, and
2620		 * take them out later if we find they can't access all of memory.
2621		 *
2622		 * However, we can't do this for PCI devices behind bridges,
2623		 * because all PCI devices behind the same bridge will end up
2624		 * with the same source-id on their transactions.
2625		 *
2626		 * Practically speaking, we can't change things around for these
2627		 * devices at run-time, because we can't be sure there'll be no
2628		 * DMA transactions in flight for any of their siblings.
2629		 *
2630		 * So PCI devices (unless they're on the root bus) as well as
2631		 * their parent PCI-PCI or PCIe-PCI bridges must be left _out_ of
2632		 * the 1:1 domain, just in _case_ one of their siblings turns out
2633		 * not to be able to map all of memory.
2634		 */
2635		if (!pci_is_pcie(pdev)) {
2636			if (!pci_is_root_bus(pdev->bus))
2637				return 0;
2638			if (pdev->class >> 8 == PCI_CLASS_BRIDGE_PCI)
2639				return 0;
2640		} else if (pci_pcie_type(pdev) == PCI_EXP_TYPE_PCI_BRIDGE)
2641			return 0;
2642	} else {
2643		if (device_has_rmrr(dev))
2644			return 0;
2645	}
2646
2647	/*
2648	 * At boot time, we don't yet know if devices will be 64-bit capable.
2649	 * Assume that they will -- if they turn out not to be, then we can
2650	 * take them out of the 1:1 domain later.
2651	 */
2652	if (!startup) {
2653		/*
2654		 * If the device's dma_mask is less than the system's memory
2655		 * size then this is not a candidate for identity mapping.
2656		 */
2657		u64 dma_mask = *dev->dma_mask;
2658
2659		if (dev->coherent_dma_mask &&
2660		    dev->coherent_dma_mask < dma_mask)
2661			dma_mask = dev->coherent_dma_mask;
2662
2663		return dma_mask >= dma_get_required_mask(dev);
2664	}
2665
2666	return 1;
2667}
2668
2669static int __init dev_prepare_static_identity_mapping(struct device *dev, int hw)
2670{
2671	int ret;
2672
2673	if (!iommu_should_identity_map(dev, 1))
2674		return 0;
2675
2676	ret = domain_add_dev_info(si_domain, dev,
2677				  hw ? CONTEXT_TT_PASS_THROUGH :
2678				       CONTEXT_TT_MULTI_LEVEL);
2679	if (!ret)
2680		pr_info("IOMMU: %s identity mapping for device %s\n",
2681			hw ? "hardware" : "software", dev_name(dev));
2682	else if (ret == -ENODEV)
2683		/* device not associated with an iommu */
2684		ret = 0;
2685
2686	return ret;
2687}
2688
2689
2690static int __init iommu_prepare_static_identity_mapping(int hw)
2691{
2692	struct pci_dev *pdev = NULL;
2693	struct dmar_drhd_unit *drhd;
2694	struct intel_iommu *iommu;
2695	struct device *dev;
2696	int i;
2697	int ret = 0;
2698
2699	ret = si_domain_init(hw);
2700	if (ret)
2701		return -EFAULT;
2702
2703	for_each_pci_dev(pdev) {
2704		ret = dev_prepare_static_identity_mapping(&pdev->dev, hw);
2705		if (ret)
2706			return ret;
2707	}
2708
2709	for_each_active_iommu(iommu, drhd)
2710		for_each_active_dev_scope(drhd->devices, drhd->devices_cnt, i, dev) {
2711			struct acpi_device_physical_node *pn;
2712			struct acpi_device *adev;
2713
2714			if (dev->bus != &acpi_bus_type)
2715				continue;
2716
2717			adev = to_acpi_device(dev);
2718			mutex_lock(&adev->physical_node_lock);
2719			list_for_each_entry(pn, &adev->physical_node_list, node) {
2720				ret = dev_prepare_static_identity_mapping(pn->dev, hw);
2721				if (ret)
2722					break;
2723			}
2724			mutex_unlock(&adev->physical_node_lock);
2725			if (ret)
2726				return ret;
2727		}
2728
2729	return 0;
2730}
2731
2732static void intel_iommu_init_qi(struct intel_iommu *iommu)
2733{
2734	/*
2735	 * Start from a sane iommu hardware state.
2736	 * If queued invalidation was already initialized by us
2737	 * (for example, while enabling interrupt remapping), then
2738	 * things are already rolling from a sane state.
2739	 */
2740	if (!iommu->qi) {
2741		/*
2742		 * Clear any previous faults.
2743		 */
2744		dmar_fault(-1, iommu);
2745		/*
2746		 * Disable queued invalidation if supported and already enabled
2747		 * before OS handover.
2748		 */
2749		dmar_disable_qi(iommu);
2750	}
2751
2752	if (dmar_enable_qi(iommu)) {
2753		/*
2754		 * Queued Invalidate not enabled, use Register Based Invalidate
2755		 */
2756		iommu->flush.flush_context = __iommu_flush_context;
2757		iommu->flush.flush_iotlb = __iommu_flush_iotlb;
2758		pr_info("IOMMU: %s using Register based invalidation\n",
2759			iommu->name);
2760	} else {
2761		iommu->flush.flush_context = qi_flush_context;
2762		iommu->flush.flush_iotlb = qi_flush_iotlb;
2763		pr_info("IOMMU: %s using Queued invalidation\n", iommu->name);
2764	}
2765}
2766
2767static int __init init_dmars(void)
2768{
2769	struct dmar_drhd_unit *drhd;
2770	struct dmar_rmrr_unit *rmrr;
2771	struct device *dev;
2772	struct intel_iommu *iommu;
2773	int i, ret;
2774
2775	/*
2776	 * for each drhd
2777	 *    allocate root
2778	 *    initialize and program root entry to not present
2779	 * endfor
2780	 */
2781	for_each_drhd_unit(drhd) {
2782		/*
2783		 * Lock not needed as this is only incremented in the
2784		 * single-threaded kernel __init code path; all other
2785		 * accesses are read-only.
2786		 */
2787		if (g_num_of_iommus < DMAR_UNITS_SUPPORTED) {
2788			g_num_of_iommus++;
2789			continue;
2790		}
2791		printk_once(KERN_ERR "intel-iommu: exceeded %d IOMMUs\n",
2792			  DMAR_UNITS_SUPPORTED);
2793	}
2794
2795	/* Preallocate enough resources for IOMMU hot-addition */
2796	if (g_num_of_iommus < DMAR_UNITS_SUPPORTED)
2797		g_num_of_iommus = DMAR_UNITS_SUPPORTED;
2798
2799	g_iommus = kcalloc(g_num_of_iommus, sizeof(struct intel_iommu *),
2800			GFP_KERNEL);
2801	if (!g_iommus) {
2802		printk(KERN_ERR "Allocating global iommu array failed\n");
2803		ret = -ENOMEM;
2804		goto error;
2805	}
2806
2807	deferred_flush = kzalloc(g_num_of_iommus *
2808		sizeof(struct deferred_flush_tables), GFP_KERNEL);
2809	if (!deferred_flush) {
2810		ret = -ENOMEM;
2811		goto free_g_iommus;
2812	}
2813
2814	for_each_active_iommu(iommu, drhd) {
2815		g_iommus[iommu->seq_id] = iommu;
2816
2817		ret = iommu_init_domains(iommu);
2818		if (ret)
2819			goto free_iommu;
2820
2821		/*
2822		 * TBD:
2823		 * we could share the same root & context tables
2824		 * among all IOMMU's. Need to Split it later.
2825		 */
2826		ret = iommu_alloc_root_entry(iommu);
2827		if (ret)
2828			goto free_iommu;
2829		if (!ecap_pass_through(iommu->ecap))
2830			hw_pass_through = 0;
2831	}
2832
2833	for_each_active_iommu(iommu, drhd)
2834		intel_iommu_init_qi(iommu);
2835
2836	if (iommu_pass_through)
2837		iommu_identity_mapping |= IDENTMAP_ALL;
2838
2839#ifdef CONFIG_INTEL_IOMMU_BROKEN_GFX_WA
2840	iommu_identity_mapping |= IDENTMAP_GFX;
2841#endif
2842
2843	check_tylersburg_isoch();
2844
2845	/*
2846	 * If pass through is not set or not enabled, set up context entries
2847	 * for identity mappings for rmrr, gfx and isa, and fall back to the
2848	 * static identity mapping if iommu_identity_mapping is set.
2849	 */
2850	if (iommu_identity_mapping) {
2851		ret = iommu_prepare_static_identity_mapping(hw_pass_through);
2852		if (ret) {
2853			printk(KERN_CRIT "Failed to setup IOMMU pass-through\n");
2854			goto free_iommu;
2855		}
2856	}
2857	/*
2858	 * For each rmrr
2859	 *   for each dev attached to rmrr
2860	 *   do
2861	 *     locate drhd for dev, alloc domain for dev
2862	 *     allocate free domain
2863	 *     allocate page table entries for rmrr
2864	 *     if context not allocated for bus
2865	 *           allocate and init context
2866	 *           set present in root table for this bus
2867	 *     init context with domain, translation etc
2868	 *    endfor
2869	 * endfor
2870	 */
2871	printk(KERN_INFO "IOMMU: Setting RMRR:\n");
2872	for_each_rmrr_units(rmrr) {
2873		/* Some BIOSes list non-existent devices in the DMAR table. */
2874		for_each_active_dev_scope(rmrr->devices, rmrr->devices_cnt,
2875					  i, dev) {
2876			ret = iommu_prepare_rmrr_dev(rmrr, dev);
2877			if (ret)
2878				printk(KERN_ERR
2879				       "IOMMU: mapping reserved region failed\n");
2880		}
2881	}
2882
2883	iommu_prepare_isa();
2884
2885	/*
2886	 * for each drhd
2887	 *   enable fault log
2888	 *   global invalidate context cache
2889	 *   global invalidate iotlb
2890	 *   enable translation
2891	 */
2892	for_each_iommu(iommu, drhd) {
2893		if (drhd->ignored) {
2894			/*
2895			 * we always have to disable PMRs or DMA may fail on
2896			 * this device
2897			 */
2898			if (force_on)
2899				iommu_disable_protect_mem_regions(iommu);
2900			continue;
2901		}
2902
2903		iommu_flush_write_buffer(iommu);
2904
2905		ret = dmar_set_interrupt(iommu);
2906		if (ret)
2907			goto free_iommu;
2908
2909		iommu_set_root_entry(iommu);
2910
2911		iommu->flush.flush_context(iommu, 0, 0, 0, DMA_CCMD_GLOBAL_INVL);
2912		iommu->flush.flush_iotlb(iommu, 0, 0, 0, DMA_TLB_GLOBAL_FLUSH);
2913		iommu_enable_translation(iommu);
2914		iommu_disable_protect_mem_regions(iommu);
2915	}
2916
2917	return 0;
2918
2919free_iommu:
2920	for_each_active_iommu(iommu, drhd) {
2921		disable_dmar_iommu(iommu);
2922		free_dmar_iommu(iommu);
2923	}
2924	kfree(deferred_flush);
2925free_g_iommus:
2926	kfree(g_iommus);
2927error:
2928	return ret;
2929}
2930
2931/* This takes a number of _MM_ pages, not VTD pages */
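/*
 * The requested dma_mask is first clamped to DOMAIN_MAX_ADDR(domain->gaw),
 * so a device is never handed an IOVA that the domain's page-table width
 * cannot express.
 */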
2932static struct iova *intel_alloc_iova(struct device *dev,
2933				     struct dmar_domain *domain,
2934				     unsigned long nrpages, uint64_t dma_mask)
2935{
2936	struct iova *iova = NULL;
2937
2938	/* Restrict dma_mask to the width that the iommu can handle */
2939	dma_mask = min_t(uint64_t, DOMAIN_MAX_ADDR(domain->gaw), dma_mask);
2940
2941	if (!dmar_forcedac && dma_mask > DMA_BIT_MASK(32)) {
2942		/*
2943		 * First try to allocate an io virtual address in
2944		 * DMA_BIT_MASK(32) and if that fails then try allocating
2945		 * from higher range
2946		 */
2947		iova = alloc_iova(&domain->iovad, nrpages,
2948				  IOVA_PFN(DMA_BIT_MASK(32)), 1);
2949		if (iova)
2950			return iova;
2951	}
2952	iova = alloc_iova(&domain->iovad, nrpages, IOVA_PFN(dma_mask), 1);
2953	if (unlikely(!iova)) {
2954		printk(KERN_ERR "Allocating %ld-page iova for %s failed\n",
2955		       nrpages, dev_name(dev));
2956		return NULL;
2957	}
2958
2959	return iova;
2960}
2961
2962static struct dmar_domain *__get_valid_domain_for_dev(struct device *dev)
2963{
2964	struct dmar_domain *domain;
2965	int ret;
2966
2967	domain = get_domain_for_dev(dev, DEFAULT_DOMAIN_ADDRESS_WIDTH);
2968	if (!domain) {
2969		printk(KERN_ERR "Allocating domain for %s failed\n",
2970		       dev_name(dev));
2971		return NULL;
2972	}
2973
2974	/* make sure context mapping is ok */
2975	if (unlikely(!domain_context_mapped(dev))) {
2976		ret = domain_context_mapping(domain, dev, CONTEXT_TT_MULTI_LEVEL);
2977		if (ret) {
2978			printk(KERN_ERR "Domain context map for %s failed\n",
2979			       dev_name(dev));
2980			return NULL;
2981		}
2982	}
2983
2984	return domain;
2985}
2986
2987static inline struct dmar_domain *get_valid_domain_for_dev(struct device *dev)
2988{
2989	struct device_domain_info *info;
2990
2991	/* No lock here, assumes no domain exit in normal case */
2992	info = dev->archdata.iommu;
2993	if (likely(info))
2994		return info->domain;
2995
2996	return __get_valid_domain_for_dev(dev);
2997}
2998
2999/* Check if the dev needs to go through the non-identity map and unmap process. */
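/*
 * Returns 1 when DMA for @dev bypasses translation (dummy or identity
 * mapped), 0 when it must use the regular remapping path.  As a side
 * effect, devices are moved into or out of si_domain when their addressing
 * capability no longer matches their current mapping.
 */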
3000static int iommu_no_mapping(struct device *dev)
3001{
3002	int found;
3003
3004	if (iommu_dummy(dev))
3005		return 1;
3006
3007	if (!iommu_identity_mapping)
3008		return 0;
3009
3010	found = identity_mapping(dev);
3011	if (found) {
3012		if (iommu_should_identity_map(dev, 0))
3013			return 1;
3014		else {
3015			/*
3016			 * The 32 bit DMA device is removed from si_domain and
3017			 * falls back to non-identity mapping.
3018			 */
3019			domain_remove_one_dev_info(si_domain, dev);
3020			printk(KERN_INFO "32bit %s uses non-identity mapping\n",
3021			       dev_name(dev));
3022			return 0;
3023		}
3024	} else {
3025		/*
3026		 * When a 64 bit DMA device is detached from a VM, the device
3027		 * is put into si_domain for identity mapping.
3028		 */
3029		if (iommu_should_identity_map(dev, 0)) {
3030			int ret;
3031			ret = domain_add_dev_info(si_domain, dev,
3032						  hw_pass_through ?
3033						  CONTEXT_TT_PASS_THROUGH :
3034						  CONTEXT_TT_MULTI_LEVEL);
3035			if (!ret) {
3036				printk(KERN_INFO "64bit %s uses identity mapping\n",
3037				       dev_name(dev));
3038				return 1;
3039			}
3040		}
3041	}
3042
3043	return 0;
3044}
3045
3046static dma_addr_t __intel_map_single(struct device *dev, phys_addr_t paddr,
3047				     size_t size, int dir, u64 dma_mask)
3048{
3049	struct dmar_domain *domain;
3050	phys_addr_t start_paddr;
3051	struct iova *iova;
3052	int prot = 0;
3053	int ret;
3054	struct intel_iommu *iommu;
3055	unsigned long paddr_pfn = paddr >> PAGE_SHIFT;
3056
3057	BUG_ON(dir == DMA_NONE);
3058
3059	if (iommu_no_mapping(dev))
3060		return paddr;
3061
3062	domain = get_valid_domain_for_dev(dev);
3063	if (!domain)
3064		return 0;
3065
3066	iommu = domain_get_iommu(domain);
3067	size = aligned_nrpages(paddr, size);
3068
3069	iova = intel_alloc_iova(dev, domain, dma_to_mm_pfn(size), dma_mask);
3070	if (!iova)
3071		goto error;
3072
3073	/*
3074	 * Check if DMAR supports zero-length reads on write only
3075	 * mappings..
3076	 */
3077	if (dir == DMA_TO_DEVICE || dir == DMA_BIDIRECTIONAL || \
3078			!cap_zlr(iommu->cap))
3079		prot |= DMA_PTE_READ;
3080	if (dir == DMA_FROM_DEVICE || dir == DMA_BIDIRECTIONAL)
3081		prot |= DMA_PTE_WRITE;
3082	/*
3083	 * paddr - (paddr + size) might be a partial page, so we should map
3084	 * the whole page.  Note: if two parts of one page are separately
3085	 * mapped, we might have two guest_addrs mapping to the same host
3086	 * paddr, but this is not a big problem.
3087	 */
3088	ret = domain_pfn_mapping(domain, mm_to_dma_pfn(iova->pfn_lo),
3089				 mm_to_dma_pfn(paddr_pfn), size, prot);
3090	if (ret)
3091		goto error;
3092
3093	/* it's a non-present to present mapping. Only flush if in caching mode */
3094	if (cap_caching_mode(iommu->cap))
3095		iommu_flush_iotlb_psi(iommu, domain->id, mm_to_dma_pfn(iova->pfn_lo), size, 0, 1);
3096	else
3097		iommu_flush_write_buffer(iommu);
3098
3099	start_paddr = (phys_addr_t)iova->pfn_lo << PAGE_SHIFT;
3100	start_paddr += paddr & ~PAGE_MASK;
3101	return start_paddr;
3102
3103error:
3104	if (iova)
3105		__free_iova(&domain->iovad, iova);
3106	printk(KERN_ERR "Device %s request: %zx@%llx dir %d --- failed\n",
3107		dev_name(dev), size, (unsigned long long)paddr, dir);
3108	return 0;
3109}
3110
3111static dma_addr_t intel_map_page(struct device *dev, struct page *page,
3112				 unsigned long offset, size_t size,
3113				 enum dma_data_direction dir,
3114				 struct dma_attrs *attrs)
3115{
3116	return __intel_map_single(dev, page_to_phys(page) + offset, size,
3117				  dir, *dev->dma_mask);
3118}
3119
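/*
 * Deferred unmap path: intel_unmap() queues freed IOVAs per IOMMU through
 * add_unmap(); flush_unmaps() later issues the IOTLB invalidations (one
 * global flush per IOMMU, or per-entry PSI flushes in caching mode) and
 * only then releases the IOVAs and page freelists.  It runs either from a
 * timer roughly 10ms after the first deferred entry or, synchronously,
 * once HIGH_WATER_MARK entries have accumulated.
 */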
3120static void flush_unmaps(void)
3121{
3122	int i, j;
3123
3124	timer_on = 0;
3125
3126	/* just flush them all */
3127	for (i = 0; i < g_num_of_iommus; i++) {
3128		struct intel_iommu *iommu = g_iommus[i];
3129		if (!iommu)
3130			continue;
3131
3132		if (!deferred_flush[i].next)
3133			continue;
3134
3135		/* In caching mode, global flushes turn emulation expensive */
3136		if (!cap_caching_mode(iommu->cap))
3137			iommu->flush.flush_iotlb(iommu, 0, 0, 0,
3138					 DMA_TLB_GLOBAL_FLUSH);
3139		for (j = 0; j < deferred_flush[i].next; j++) {
3140			unsigned long mask;
3141			struct iova *iova = deferred_flush[i].iova[j];
3142			struct dmar_domain *domain = deferred_flush[i].domain[j];
3143
3144			/* On real hardware multiple invalidations are expensive */
3145			if (cap_caching_mode(iommu->cap))
3146				iommu_flush_iotlb_psi(iommu, domain->id,
3147					iova->pfn_lo, iova_size(iova),
3148					!deferred_flush[i].freelist[j], 0);
3149			else {
3150				mask = ilog2(mm_to_dma_pfn(iova_size(iova)));
3151				iommu_flush_dev_iotlb(deferred_flush[i].domain[j],
3152						(uint64_t)iova->pfn_lo << PAGE_SHIFT, mask);
3153			}
3154			__free_iova(&deferred_flush[i].domain[j]->iovad, iova);
3155			if (deferred_flush[i].freelist[j])
3156				dma_free_pagelist(deferred_flush[i].freelist[j]);
3157		}
3158		deferred_flush[i].next = 0;
3159	}
3160
3161	list_size = 0;
3162}
3163
3164static void flush_unmaps_timeout(unsigned long data)
3165{
3166	unsigned long flags;
3167
3168	spin_lock_irqsave(&async_umap_flush_lock, flags);
3169	flush_unmaps();
3170	spin_unlock_irqrestore(&async_umap_flush_lock, flags);
3171}
3172
3173static void add_unmap(struct dmar_domain *dom, struct iova *iova, struct page *freelist)
3174{
3175	unsigned long flags;
3176	int next, iommu_id;
3177	struct intel_iommu *iommu;
3178
3179	spin_lock_irqsave(&async_umap_flush_lock, flags);
3180	if (list_size == HIGH_WATER_MARK)
3181		flush_unmaps();
3182
3183	iommu = domain_get_iommu(dom);
3184	iommu_id = iommu->seq_id;
3185
3186	next = deferred_flush[iommu_id].next;
3187	deferred_flush[iommu_id].domain[next] = dom;
3188	deferred_flush[iommu_id].iova[next] = iova;
3189	deferred_flush[iommu_id].freelist[next] = freelist;
3190	deferred_flush[iommu_id].next++;
3191
3192	if (!timer_on) {
3193		mod_timer(&unmap_timer, jiffies + msecs_to_jiffies(10));
3194		timer_on = 1;
3195	}
3196	list_size++;
3197	spin_unlock_irqrestore(&async_umap_flush_lock, flags);
3198}
3199
3200static void intel_unmap(struct device *dev, dma_addr_t dev_addr)
3201{
3202	struct dmar_domain *domain;
3203	unsigned long start_pfn, last_pfn;
3204	struct iova *iova;
3205	struct intel_iommu *iommu;
3206	struct page *freelist;
3207
3208	if (iommu_no_mapping(dev))
3209		return;
3210
3211	domain = find_domain(dev);
3212	BUG_ON(!domain);
3213
3214	iommu = domain_get_iommu(domain);
3215
3216	iova = find_iova(&domain->iovad, IOVA_PFN(dev_addr));
3217	if (WARN_ONCE(!iova, "Driver unmaps unmatched page at PFN %llx\n",
3218		      (unsigned long long)dev_addr))
3219		return;
3220
3221	start_pfn = mm_to_dma_pfn(iova->pfn_lo);
3222	last_pfn = mm_to_dma_pfn(iova->pfn_hi + 1) - 1;
3223
3224	pr_debug("Device %s unmapping: pfn %lx-%lx\n",
3225		 dev_name(dev), start_pfn, last_pfn);
3226
3227	freelist = domain_unmap(domain, start_pfn, last_pfn);
3228
3229	if (intel_iommu_strict) {
3230		iommu_flush_iotlb_psi(iommu, domain->id, start_pfn,
3231				      last_pfn - start_pfn + 1, !freelist, 0);
3232		/* free iova */
3233		__free_iova(&domain->iovad, iova);
3234		dma_free_pagelist(freelist);
3235	} else {
3236		add_unmap(domain, iova, freelist);
3237		/*
3238		 * queue up the release of the unmap to save the 1/6th of the
3239		 * cpu used up by the iotlb flush operation...
3240		 */
3241	}
3242}
3243
3244static void intel_unmap_page(struct device *dev, dma_addr_t dev_addr,
3245			     size_t size, enum dma_data_direction dir,
3246			     struct dma_attrs *attrs)
3247{
3248	intel_unmap(dev, dev_addr);
3249}
3250
3251static void *intel_alloc_coherent(struct device *dev, size_t size,
3252				  dma_addr_t *dma_handle, gfp_t flags,
3253				  struct dma_attrs *attrs)
3254{
3255	struct page *page = NULL;
3256	int order;
3257
3258	size = PAGE_ALIGN(size);
3259	order = get_order(size);
3260
3261	if (!iommu_no_mapping(dev))
3262		flags &= ~(GFP_DMA | GFP_DMA32);
3263	else if (dev->coherent_dma_mask < dma_get_required_mask(dev)) {
3264		if (dev->coherent_dma_mask < DMA_BIT_MASK(32))
3265			flags |= GFP_DMA;
3266		else
3267			flags |= GFP_DMA32;
3268	}
3269
3270	if (flags & __GFP_WAIT) {
3271		unsigned int count = size >> PAGE_SHIFT;
3272
3273		page = dma_alloc_from_contiguous(dev, count, order);
3274		if (page && iommu_no_mapping(dev) &&
3275		    page_to_phys(page) + size > dev->coherent_dma_mask) {
3276			dma_release_from_contiguous(dev, page, count);
3277			page = NULL;
3278		}
3279	}
3280
3281	if (!page)
3282		page = alloc_pages(flags, order);
3283	if (!page)
3284		return NULL;
3285	memset(page_address(page), 0, size);
3286
3287	*dma_handle = __intel_map_single(dev, page_to_phys(page), size,
3288					 DMA_BIDIRECTIONAL,
3289					 dev->coherent_dma_mask);
3290	if (*dma_handle)
3291		return page_address(page);
3292	if (!dma_release_from_contiguous(dev, page, size >> PAGE_SHIFT))
3293		__free_pages(page, order);
3294
3295	return NULL;
3296}
3297
3298static void intel_free_coherent(struct device *dev, size_t size, void *vaddr,
3299				dma_addr_t dma_handle, struct dma_attrs *attrs)
3300{
3301	int order;
3302	struct page *page = virt_to_page(vaddr);
3303
3304	size = PAGE_ALIGN(size);
3305	order = get_order(size);
3306
3307	intel_unmap(dev, dma_handle);
3308	if (!dma_release_from_contiguous(dev, page, size >> PAGE_SHIFT))
3309		__free_pages(page, order);
3310}
3311
3312static void intel_unmap_sg(struct device *dev, struct scatterlist *sglist,
3313			   int nelems, enum dma_data_direction dir,
3314			   struct dma_attrs *attrs)
3315{
3316	intel_unmap(dev, sglist[0].dma_address);
3317}
3318
3319static int intel_nontranslate_map_sg(struct device *hddev,
3320	struct scatterlist *sglist, int nelems, int dir)
3321{
3322	int i;
3323	struct scatterlist *sg;
3324
3325	for_each_sg(sglist, sg, nelems, i) {
3326		BUG_ON(!sg_page(sg));
3327		sg->dma_address = page_to_phys(sg_page(sg)) + sg->offset;
3328		sg->dma_length = sg->length;
3329	}
3330	return nelems;
3331}
3332
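/*
 * A single IOVA range large enough for all segments is allocated up front,
 * then the whole scatterlist is mapped with one __domain_mapping() call;
 * on failure the partially built page tables and the IOVA are released.
 */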
3333static int intel_map_sg(struct device *dev, struct scatterlist *sglist, int nelems,
3334			enum dma_data_direction dir, struct dma_attrs *attrs)
3335{
3336	int i;
3337	struct dmar_domain *domain;
3338	size_t size = 0;
3339	int prot = 0;
3340	struct iova *iova = NULL;
3341	int ret;
3342	struct scatterlist *sg;
3343	unsigned long start_vpfn;
3344	struct intel_iommu *iommu;
3345
3346	BUG_ON(dir == DMA_NONE);
3347	if (iommu_no_mapping(dev))
3348		return intel_nontranslate_map_sg(dev, sglist, nelems, dir);
3349
3350	domain = get_valid_domain_for_dev(dev);
3351	if (!domain)
3352		return 0;
3353
3354	iommu = domain_get_iommu(domain);
3355
3356	for_each_sg(sglist, sg, nelems, i)
3357		size += aligned_nrpages(sg->offset, sg->length);
3358
3359	iova = intel_alloc_iova(dev, domain, dma_to_mm_pfn(size),
3360				*dev->dma_mask);
3361	if (!iova) {
3362		sglist->dma_length = 0;
3363		return 0;
3364	}
3365
3366	/*
3367	 * Check if DMAR supports zero-length reads on write only
3368	 * mappings..
3369	 */
3370	if (dir == DMA_TO_DEVICE || dir == DMA_BIDIRECTIONAL || \
3371			!cap_zlr(iommu->cap))
3372		prot |= DMA_PTE_READ;
3373	if (dir == DMA_FROM_DEVICE || dir == DMA_BIDIRECTIONAL)
3374		prot |= DMA_PTE_WRITE;
3375
3376	start_vpfn = mm_to_dma_pfn(iova->pfn_lo);
3377
3378	ret = domain_sg_mapping(domain, start_vpfn, sglist, size, prot);
3379	if (unlikely(ret)) {
3380		dma_pte_free_pagetable(domain, start_vpfn,
3381				       start_vpfn + size - 1);
3382		__free_iova(&domain->iovad, iova);
3383		return 0;
3384	}
3385
3386	/* it's a non-present to present mapping. Only flush if in caching mode */
3387	if (cap_caching_mode(iommu->cap))
3388		iommu_flush_iotlb_psi(iommu, domain->id, start_vpfn, size, 0, 1);
3389	else
3390		iommu_flush_write_buffer(iommu);
3391
3392	return nelems;
3393}
3394
3395static int intel_mapping_error(struct device *dev, dma_addr_t dma_addr)
3396{
3397	return !dma_addr;
3398}
3399
3400struct dma_map_ops intel_dma_ops = {
3401	.alloc = intel_alloc_coherent,
3402	.free = intel_free_coherent,
3403	.map_sg = intel_map_sg,
3404	.unmap_sg = intel_unmap_sg,
3405	.map_page = intel_map_page,
3406	.unmap_page = intel_unmap_page,
3407	.mapping_error = intel_mapping_error,
3408};
3409
3410static inline int iommu_domain_cache_init(void)
3411{
3412	int ret = 0;
3413
3414	iommu_domain_cache = kmem_cache_create("iommu_domain",
3415					 sizeof(struct dmar_domain),
3416					 0,
3417					 SLAB_HWCACHE_ALIGN,
3419					 NULL);
3420	if (!iommu_domain_cache) {
3421		printk(KERN_ERR "Couldn't create iommu_domain cache\n");
3422		ret = -ENOMEM;
3423	}
3424
3425	return ret;
3426}
3427
3428static inline int iommu_devinfo_cache_init(void)
3429{
3430	int ret = 0;
3431
3432	iommu_devinfo_cache = kmem_cache_create("iommu_devinfo",
3433					 sizeof(struct device_domain_info),
3434					 0,
3435					 SLAB_HWCACHE_ALIGN,
3436					 NULL);
3437	if (!iommu_devinfo_cache) {
3438		printk(KERN_ERR "Couldn't create devinfo cache\n");
3439		ret = -ENOMEM;
3440	}
3441
3442	return ret;
3443}
3444
3445static int __init iommu_init_mempool(void)
3446{
3447	int ret;
3448	ret = iommu_iova_cache_init();
3449	if (ret)
3450		return ret;
3451
3452	ret = iommu_domain_cache_init();
3453	if (ret)
3454		goto domain_error;
3455
3456	ret = iommu_devinfo_cache_init();
3457	if (!ret)
3458		return ret;
3459
3460	kmem_cache_destroy(iommu_domain_cache);
3461domain_error:
3462	iommu_iova_cache_destroy();
3463
3464	return -ENOMEM;
3465}
3466
3467static void __init iommu_exit_mempool(void)
3468{
3469	kmem_cache_destroy(iommu_devinfo_cache);
3470	kmem_cache_destroy(iommu_domain_cache);
3471	iommu_iova_cache_destroy();
3472}
3473
3474static void quirk_ioat_snb_local_iommu(struct pci_dev *pdev)
3475{
3476	struct dmar_drhd_unit *drhd;
3477	u32 vtbar;
3478	int rc;
3479
3480	/* We know that this device on this chipset has its own IOMMU.
3481	 * If we find it under a different IOMMU, then the BIOS is lying
3482	 * to us. Hope that the IOMMU for this device is actually
3483	 * disabled, and it needs no translation...
3484	 */
3485	rc = pci_bus_read_config_dword(pdev->bus, PCI_DEVFN(0, 0), 0xb0, &vtbar);
3486	if (rc) {
3487		/* "can't" happen */
3488		dev_info(&pdev->dev, "failed to run vt-d quirk\n");
3489		return;
3490	}
3491	vtbar &= 0xffff0000;
3492
3493	/* we know that this iommu should be at offset 0xa000 from vtbar */
3494	drhd = dmar_find_matched_drhd_unit(pdev);
3495	if (WARN_TAINT_ONCE(!drhd || drhd->reg_base_addr - vtbar != 0xa000,
3496			    TAINT_FIRMWARE_WORKAROUND,
3497			    "BIOS assigned incorrect VT-d unit for Intel(R) QuickData Technology device\n"))
3498		pdev->dev.archdata.iommu = DUMMY_DEVICE_DOMAIN_INFO;
3499}
3500DECLARE_PCI_FIXUP_ENABLE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_IOAT_SNB, quirk_ioat_snb_local_iommu);
3501
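/*
 * Mark DRHD units that can be skipped: units whose device scope is empty,
 * and (when dmar_map_gfx is clear) units that cover only graphics devices,
 * whose devices are then flagged with DUMMY_DEVICE_DOMAIN_INFO so they are
 * never translated.
 */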
3502static void __init init_no_remapping_devices(void)
3503{
3504	struct dmar_drhd_unit *drhd;
3505	struct device *dev;
3506	int i;
3507
3508	for_each_drhd_unit(drhd) {
3509		if (!drhd->include_all) {
3510			for_each_active_dev_scope(drhd->devices,
3511						  drhd->devices_cnt, i, dev)
3512				break;
3513			/* ignore DMAR unit if no devices exist */
3514			if (i == drhd->devices_cnt)
3515				drhd->ignored = 1;
3516		}
3517	}
3518
3519	for_each_active_drhd_unit(drhd) {
3520		if (drhd->include_all)
3521			continue;
3522
3523		for_each_active_dev_scope(drhd->devices,
3524					  drhd->devices_cnt, i, dev)
3525			if (!dev_is_pci(dev) || !IS_GFX_DEVICE(to_pci_dev(dev)))
3526				break;
3527		if (i < drhd->devices_cnt)
3528			continue;
3529
3530		/* This IOMMU has *only* gfx devices. Either bypass it or
3531		   set the gfx_mapped flag, as appropriate */
3532		if (dmar_map_gfx) {
3533			intel_iommu_gfx_mapped = 1;
3534		} else {
3535			drhd->ignored = 1;
3536			for_each_active_dev_scope(drhd->devices,
3537						  drhd->devices_cnt, i, dev)
3538				dev->archdata.iommu = DUMMY_DEVICE_DOMAIN_INFO;
3539		}
3540	}
3541}
3542
3543#ifdef CONFIG_SUSPEND
3544static int init_iommu_hw(void)
3545{
3546	struct dmar_drhd_unit *drhd;
3547	struct intel_iommu *iommu = NULL;
3548
3549	for_each_active_iommu(iommu, drhd)
3550		if (iommu->qi)
3551			dmar_reenable_qi(iommu);
3552
3553	for_each_iommu(iommu, drhd) {
3554		if (drhd->ignored) {
3555			/*
3556			 * we always have to disable PMRs or DMA may fail on
3557			 * this device
3558			 */
3559			if (force_on)
3560				iommu_disable_protect_mem_regions(iommu);
3561			continue;
3562		}
3563
3564		iommu_flush_write_buffer(iommu);
3565
3566		iommu_set_root_entry(iommu);
3567
3568		iommu->flush.flush_context(iommu, 0, 0, 0,
3569					   DMA_CCMD_GLOBAL_INVL);
3570		iommu->flush.flush_iotlb(iommu, 0, 0, 0, DMA_TLB_GLOBAL_FLUSH);
3571		iommu_enable_translation(iommu);
3572		iommu_disable_protect_mem_regions(iommu);
3573	}
3574
3575	return 0;
3576}
3577
3578static void iommu_flush_all(void)
3579{
3580	struct dmar_drhd_unit *drhd;
3581	struct intel_iommu *iommu;
3582
3583	for_each_active_iommu(iommu, drhd) {
3584		iommu->flush.flush_context(iommu, 0, 0, 0,
3585					   DMA_CCMD_GLOBAL_INVL);
3586		iommu->flush.flush_iotlb(iommu, 0, 0, 0,
3587					 DMA_TLB_GLOBAL_FLUSH);
3588	}
3589}
3590
3591static int iommu_suspend(void)
3592{
3593	struct dmar_drhd_unit *drhd;
3594	struct intel_iommu *iommu = NULL;
3595	unsigned long flag;
3596
3597	for_each_active_iommu(iommu, drhd) {
3598		iommu->iommu_state = kzalloc(sizeof(u32) * MAX_SR_DMAR_REGS,
3599						 GFP_ATOMIC);
3600		if (!iommu->iommu_state)
3601			goto nomem;
3602	}
3603
3604	iommu_flush_all();
3605
3606	for_each_active_iommu(iommu, drhd) {
3607		iommu_disable_translation(iommu);
3608
3609		raw_spin_lock_irqsave(&iommu->register_lock, flag);
3610
3611		iommu->iommu_state[SR_DMAR_FECTL_REG] =
3612			readl(iommu->reg + DMAR_FECTL_REG);
3613		iommu->iommu_state[SR_DMAR_FEDATA_REG] =
3614			readl(iommu->reg + DMAR_FEDATA_REG);
3615		iommu->iommu_state[SR_DMAR_FEADDR_REG] =
3616			readl(iommu->reg + DMAR_FEADDR_REG);
3617		iommu->iommu_state[SR_DMAR_FEUADDR_REG] =
3618			readl(iommu->reg + DMAR_FEUADDR_REG);
3619
3620		raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
3621	}
3622	return 0;
3623
3624nomem:
3625	for_each_active_iommu(iommu, drhd)
3626		kfree(iommu->iommu_state);
3627
3628	return -ENOMEM;
3629}
3630
3631static void iommu_resume(void)
3632{
3633	struct dmar_drhd_unit *drhd;
3634	struct intel_iommu *iommu = NULL;
3635	unsigned long flag;
3636
3637	if (init_iommu_hw()) {
3638		if (force_on)
3639			panic("tboot: IOMMU setup failed, DMAR can not resume!\n");
3640		else
3641			WARN(1, "IOMMU setup failed, DMAR can not resume!\n");
3642		return;
3643	}
3644
3645	for_each_active_iommu(iommu, drhd) {
3646
3647		raw_spin_lock_irqsave(&iommu->register_lock, flag);
3648
3649		writel(iommu->iommu_state[SR_DMAR_FECTL_REG],
3650			iommu->reg + DMAR_FECTL_REG);
3651		writel(iommu->iommu_state[SR_DMAR_FEDATA_REG],
3652			iommu->reg + DMAR_FEDATA_REG);
3653		writel(iommu->iommu_state[SR_DMAR_FEADDR_REG],
3654			iommu->reg + DMAR_FEADDR_REG);
3655		writel(iommu->iommu_state[SR_DMAR_FEUADDR_REG],
3656			iommu->reg + DMAR_FEUADDR_REG);
3657
3658		raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
3659	}
3660
3661	for_each_active_iommu(iommu, drhd)
3662		kfree(iommu->iommu_state);
3663}
3664
3665static struct syscore_ops iommu_syscore_ops = {
3666	.resume		= iommu_resume,
3667	.suspend	= iommu_suspend,
3668};
3669
3670static void __init init_iommu_pm_ops(void)
3671{
3672	register_syscore_ops(&iommu_syscore_ops);
3673}
3674
3675#else
3676static inline void init_iommu_pm_ops(void) {}
3677#endif	/* CONFIG_SUSPEND */
3678
3679
3680int __init dmar_parse_one_rmrr(struct acpi_dmar_header *header, void *arg)
3681{
3682	struct acpi_dmar_reserved_memory *rmrr;
3683	struct dmar_rmrr_unit *rmrru;
3684
3685	rmrru = kzalloc(sizeof(*rmrru), GFP_KERNEL);
3686	if (!rmrru)
3687		return -ENOMEM;
3688
3689	rmrru->hdr = header;
3690	rmrr = (struct acpi_dmar_reserved_memory *)header;
3691	rmrru->base_address = rmrr->base_address;
3692	rmrru->end_address = rmrr->end_address;
3693	rmrru->devices = dmar_alloc_dev_scope((void *)(rmrr + 1),
3694				((void *)rmrr) + rmrr->header.length,
3695				&rmrru->devices_cnt);
3696	if (rmrru->devices_cnt && rmrru->devices == NULL) {
3697		kfree(rmrru);
3698		return -ENOMEM;
3699	}
3700
3701	list_add(&rmrru->list, &dmar_rmrr_units);
3702
3703	return 0;
3704}
3705
3706static struct dmar_atsr_unit *dmar_find_atsr(struct acpi_dmar_atsr *atsr)
3707{
3708	struct dmar_atsr_unit *atsru;
3709	struct acpi_dmar_atsr *tmp;
3710
3711	list_for_each_entry_rcu(atsru, &dmar_atsr_units, list) {
3712		tmp = (struct acpi_dmar_atsr *)atsru->hdr;
3713		if (atsr->segment != tmp->segment)
3714			continue;
3715		if (atsr->header.length != tmp->header.length)
3716			continue;
3717		if (memcmp(atsr, tmp, atsr->header.length) == 0)
3718			return atsru;
3719	}
3720
3721	return NULL;
3722}
3723
3724int dmar_parse_one_atsr(struct acpi_dmar_header *hdr, void *arg)
3725{
3726	struct acpi_dmar_atsr *atsr;
3727	struct dmar_atsr_unit *atsru;
3728
3729	if (system_state != SYSTEM_BOOTING && !intel_iommu_enabled)
3730		return 0;
3731
3732	atsr = container_of(hdr, struct acpi_dmar_atsr, header);
3733	atsru = dmar_find_atsr(atsr);
3734	if (atsru)
3735		return 0;
3736
3737	atsru = kzalloc(sizeof(*atsru) + hdr->length, GFP_KERNEL);
3738	if (!atsru)
3739		return -ENOMEM;
3740
3741	/*
3742	 * If memory is allocated from slab by ACPI _DSM method, we need to
3743	 * copy the memory content because the memory buffer will be freed
3744	 * on return.
3745	 */
3746	atsru->hdr = (void *)(atsru + 1);
3747	memcpy(atsru->hdr, hdr, hdr->length);
3748	atsru->include_all = atsr->flags & 0x1;
3749	if (!atsru->include_all) {
3750		atsru->devices = dmar_alloc_dev_scope((void *)(atsr + 1),
3751				(void *)atsr + atsr->header.length,
3752				&atsru->devices_cnt);
3753		if (atsru->devices_cnt && atsru->devices == NULL) {
3754			kfree(atsru);
3755			return -ENOMEM;
3756		}
3757	}
3758
3759	list_add_rcu(&atsru->list, &dmar_atsr_units);
3760
3761	return 0;
3762}
3763
3764static void intel_iommu_free_atsr(struct dmar_atsr_unit *atsru)
3765{
3766	dmar_free_dev_scope(&atsru->devices, &atsru->devices_cnt);
3767	kfree(atsru);
3768}
3769
3770int dmar_release_one_atsr(struct acpi_dmar_header *hdr, void *arg)
3771{
3772	struct acpi_dmar_atsr *atsr;
3773	struct dmar_atsr_unit *atsru;
3774
3775	atsr = container_of(hdr, struct acpi_dmar_atsr, header);
3776	atsru = dmar_find_atsr(atsr);
3777	if (atsru) {
3778		list_del_rcu(&atsru->list);
3779		synchronize_rcu();
3780		intel_iommu_free_atsr(atsru);
3781	}
3782
3783	return 0;
3784}
3785
3786int dmar_check_one_atsr(struct acpi_dmar_header *hdr, void *arg)
3787{
3788	int i;
3789	struct device *dev;
3790	struct acpi_dmar_atsr *atsr;
3791	struct dmar_atsr_unit *atsru;
3792
3793	atsr = container_of(hdr, struct acpi_dmar_atsr, header);
3794	atsru = dmar_find_atsr(atsr);
3795	if (!atsru)
3796		return 0;
3797
3798	if (!atsru->include_all && atsru->devices && atsru->devices_cnt)
3799		for_each_active_dev_scope(atsru->devices, atsru->devices_cnt,
3800					  i, dev)
3801			return -EBUSY;
3802
3803	return 0;
3804}
3805
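/*
 * Bring up a hot-added DMAR unit: verify it is compatible with the features
 * already relied upon (pass-through, snoop control, superpage sizes), then
 * allocate its domain-ID space and root entry, enable invalidation and the
 * fault interrupt, turn on translation, and attach it to si_domain if the
 * static identity map is in use.
 */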
3806static int intel_iommu_add(struct dmar_drhd_unit *dmaru)
3807{
3808	int sp, ret = 0;
3809	struct intel_iommu *iommu = dmaru->iommu;
3810
3811	if (g_iommus[iommu->seq_id])
3812		return 0;
3813
3814	if (hw_pass_through && !ecap_pass_through(iommu->ecap)) {
3815		pr_warn("IOMMU: %s doesn't support hardware pass through.\n",
3816			iommu->name);
3817		return -ENXIO;
3818	}
3819	if (!ecap_sc_support(iommu->ecap) &&
3820	    domain_update_iommu_snooping(iommu)) {
3821		pr_warn("IOMMU: %s doesn't support snooping.\n",
3822			iommu->name);
3823		return -ENXIO;
3824	}
3825	sp = domain_update_iommu_superpage(iommu) - 1;
3826	if (sp >= 0 && !(cap_super_page_val(iommu->cap) & (1 << sp))) {
3827		pr_warn("IOMMU: %s doesn't support large page.\n",
3828			iommu->name);
3829		return -ENXIO;
3830	}
3831
3832	/*
3833	 * Disable translation if already enabled prior to OS handover.
3834	 */
3835	if (iommu->gcmd & DMA_GCMD_TE)
3836		iommu_disable_translation(iommu);
3837
3838	g_iommus[iommu->seq_id] = iommu;
3839	ret = iommu_init_domains(iommu);
3840	if (ret == 0)
3841		ret = iommu_alloc_root_entry(iommu);
3842	if (ret)
3843		goto out;
3844
3845	if (dmaru->ignored) {
3846		/*
3847		 * we always have to disable PMRs or DMA may fail on this device
3848		 */
3849		if (force_on)
3850			iommu_disable_protect_mem_regions(iommu);
3851		return 0;
3852	}
3853
3854	intel_iommu_init_qi(iommu);
3855	iommu_flush_write_buffer(iommu);
3856	ret = dmar_set_interrupt(iommu);
3857	if (ret)
3858		goto disable_iommu;
3859
3860	iommu_set_root_entry(iommu);
3861	iommu->flush.flush_context(iommu, 0, 0, 0, DMA_CCMD_GLOBAL_INVL);
3862	iommu->flush.flush_iotlb(iommu, 0, 0, 0, DMA_TLB_GLOBAL_FLUSH);
3863	iommu_enable_translation(iommu);
3864
3865	if (si_domain) {
3866		ret = iommu_attach_domain(si_domain, iommu);
3867		if (ret < 0 || si_domain->id != ret)
3868			goto disable_iommu;
3869		domain_attach_iommu(si_domain, iommu);
3870	}
3871
3872	iommu_disable_protect_mem_regions(iommu);
3873	return 0;
3874
3875disable_iommu:
3876	disable_dmar_iommu(iommu);
3877out:
3878	free_dmar_iommu(iommu);
3879	return ret;
3880}
3881
3882int dmar_iommu_hotplug(struct dmar_drhd_unit *dmaru, bool insert)
3883{
3884	int ret = 0;
3885	struct intel_iommu *iommu = dmaru->iommu;
3886
3887	if (!intel_iommu_enabled)
3888		return 0;
3889	if (iommu == NULL)
3890		return -EINVAL;
3891
3892	if (insert) {
3893		ret = intel_iommu_add(dmaru);
3894	} else {
3895		disable_dmar_iommu(iommu);
3896		free_dmar_iommu(iommu);
3897	}
3898
3899	return ret;
3900}
3901
3902static void intel_iommu_free_dmars(void)
3903{
3904	struct dmar_rmrr_unit *rmrru, *rmrr_n;
3905	struct dmar_atsr_unit *atsru, *atsr_n;
3906
3907	list_for_each_entry_safe(rmrru, rmrr_n, &dmar_rmrr_units, list) {
3908		list_del(&rmrru->list);
3909		dmar_free_dev_scope(&rmrru->devices, &rmrru->devices_cnt);
3910		kfree(rmrru);
3911	}
3912
3913	list_for_each_entry_safe(atsru, atsr_n, &dmar_atsr_units, list) {
3914		list_del(&atsru->list);
3915		intel_iommu_free_atsr(atsru);
3916	}
3917}
3918
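/*
 * Returns 1 if ATS is allowed for @dev: either the device is integrated
 * (no parent bridge) or its root port is covered by a matching ATSR,
 * otherwise 0.
 */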
3919int dmar_find_matched_atsr_unit(struct pci_dev *dev)
3920{
3921	int i, ret = 1;
3922	struct pci_bus *bus;
3923	struct pci_dev *bridge = NULL;
3924	struct device *tmp;
3925	struct acpi_dmar_atsr *atsr;
3926	struct dmar_atsr_unit *atsru;
3927
3928	dev = pci_physfn(dev);
3929	for (bus = dev->bus; bus; bus = bus->parent) {
3930		bridge = bus->self;
3931		/* If it's an integrated device, allow ATS */
3932		if (!bridge)
3933			return 1;
3934		/* Connected via non-PCIe: no ATS */
3935		if (!pci_is_pcie(bridge) ||
3936		    pci_pcie_type(bridge) == PCI_EXP_TYPE_PCI_BRIDGE)
3937			return 0;
3938		/* If we found the root port, look it up in the ATSR */
3939		if (pci_pcie_type(bridge) == PCI_EXP_TYPE_ROOT_PORT)
3940			break;
3941	}
3942
3943	rcu_read_lock();
3944	list_for_each_entry_rcu(atsru, &dmar_atsr_units, list) {
3945		atsr = container_of(atsru->hdr, struct acpi_dmar_atsr, header);
3946		if (atsr->segment != pci_domain_nr(dev->bus))
3947			continue;
3948
3949		for_each_dev_scope(atsru->devices, atsru->devices_cnt, i, tmp)
3950			if (tmp == &bridge->dev)
3951				goto out;
3952
3953		if (atsru->include_all)
3954			goto out;
3955	}
3956	ret = 0;
3957out:
3958	rcu_read_unlock();
3959
3960	return ret;
3961}
3962
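/*
 * Keep the cached RMRR and ATSR device scopes in sync when PCI devices
 * are hot-added or removed.
 */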
3963int dmar_iommu_notify_scope_dev(struct dmar_pci_notify_info *info)
3964{
3965	int ret = 0;
3966	struct dmar_rmrr_unit *rmrru;
3967	struct dmar_atsr_unit *atsru;
3968	struct acpi_dmar_atsr *atsr;
3969	struct acpi_dmar_reserved_memory *rmrr;
3970
3971	if (!intel_iommu_enabled && system_state != SYSTEM_BOOTING)
3972		return 0;
3973
3974	list_for_each_entry(rmrru, &dmar_rmrr_units, list) {
3975		rmrr = container_of(rmrru->hdr,
3976				    struct acpi_dmar_reserved_memory, header);
3977		if (info->event == BUS_NOTIFY_ADD_DEVICE) {
3978			ret = dmar_insert_dev_scope(info, (void *)(rmrr + 1),
3979				((void *)rmrr) + rmrr->header.length,
3980				rmrr->segment, rmrru->devices,
3981				rmrru->devices_cnt);
3982			if (ret < 0)
3983				return ret;
3984		} else if (info->event == BUS_NOTIFY_DEL_DEVICE) {
3985			dmar_remove_dev_scope(info, rmrr->segment,
3986				rmrru->devices, rmrru->devices_cnt);
3987		}
3988	}
3989
3990	list_for_each_entry(atsru, &dmar_atsr_units, list) {
3991		if (atsru->include_all)
3992			continue;
3993
3994		atsr = container_of(atsru->hdr, struct acpi_dmar_atsr, header);
3995		if (info->event == BUS_NOTIFY_ADD_DEVICE) {
3996			ret = dmar_insert_dev_scope(info, (void *)(atsr + 1),
3997					(void *)atsr + atsr->header.length,
3998					atsr->segment, atsru->devices,
3999					atsru->devices_cnt);
4000			if (ret > 0)
4001				break;
4002			else if (ret < 0)
4003				return ret;
4004		} else if (info->event == BUS_NOTIFY_DEL_DEVICE) {
4005			if (dmar_remove_dev_scope(info, atsr->segment,
4006					atsru->devices, atsru->devices_cnt))
4007				break;
4008		}
4009	}
4010
4011	return 0;
4012}
4013
4014/*
4015 * We only respond here to the removal of a device that has been
4016 * unbound from its driver.  A newly added device is not attached to
4017 * its DMAR domain yet; that happens when the device is first mapped
4018 * to an IOVA.
4019 */
4020static int device_notifier(struct notifier_block *nb,
4021				  unsigned long action, void *data)
4022{
4023	struct device *dev = data;
4024	struct dmar_domain *domain;
4025
4026	if (iommu_dummy(dev))
4027		return 0;
4028
4029	if (action != BUS_NOTIFY_REMOVED_DEVICE)
4030		return 0;
4031
4032	domain = find_domain(dev);
4033	if (!domain)
4034		return 0;
4035
4036	down_read(&dmar_global_lock);
4037	domain_remove_one_dev_info(domain, dev);
4038	if (!domain_type_is_vm_or_si(domain) && list_empty(&domain->devices))
4039		domain_exit(domain);
4040	up_read(&dmar_global_lock);
4041
4042	return 0;
4043}
4044
4045static struct notifier_block device_nb = {
4046	.notifier_call = device_notifier,
4047};
4048
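/*
 * Keep the static identity map (si_domain) in sync with memory hot-plug:
 * map ranges that are going online and tear down the mappings and IOVAs
 * of ranges that have gone offline.
 */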
4049static int intel_iommu_memory_notifier(struct notifier_block *nb,
4050				       unsigned long val, void *v)
4051{
4052	struct memory_notify *mhp = v;
4053	unsigned long long start, end;
4054	unsigned long start_vpfn, last_vpfn;
4055
4056	switch (val) {
4057	case MEM_GOING_ONLINE:
4058		start = mhp->start_pfn << PAGE_SHIFT;
4059		end = ((mhp->start_pfn + mhp->nr_pages) << PAGE_SHIFT) - 1;
4060		if (iommu_domain_identity_map(si_domain, start, end)) {
4061			pr_warn("dmar: failed to build identity map for [%llx-%llx]\n",
4062				start, end);
4063			return NOTIFY_BAD;
4064		}
4065		break;
4066
4067	case MEM_OFFLINE:
4068	case MEM_CANCEL_ONLINE:
4069		start_vpfn = mm_to_dma_pfn(mhp->start_pfn);
4070		last_vpfn = mm_to_dma_pfn(mhp->start_pfn + mhp->nr_pages - 1);
4071		while (start_vpfn <= last_vpfn) {
4072			struct iova *iova;
4073			struct dmar_drhd_unit *drhd;
4074			struct intel_iommu *iommu;
4075			struct page *freelist;
4076
4077			iova = find_iova(&si_domain->iovad, start_vpfn);
4078			if (iova == NULL) {
4079				pr_debug("dmar: failed to get IOVA for PFN %lx\n",
4080					 start_vpfn);
4081				break;
4082			}
4083
4084			iova = split_and_remove_iova(&si_domain->iovad, iova,
4085						     start_vpfn, last_vpfn);
4086			if (iova == NULL) {
4087				pr_warn("dmar: failed to split IOVA PFN [%lx-%lx]\n",
4088					start_vpfn, last_vpfn);
4089				return NOTIFY_BAD;
4090			}
4091
4092			freelist = domain_unmap(si_domain, iova->pfn_lo,
4093					       iova->pfn_hi);
4094
4095			rcu_read_lock();
4096			for_each_active_iommu(iommu, drhd)
4097				iommu_flush_iotlb_psi(iommu, si_domain->id,
4098					iova->pfn_lo, iova_size(iova),
4099					!freelist, 0);
4100			rcu_read_unlock();
4101			dma_free_pagelist(freelist);
4102
4103			start_vpfn = iova->pfn_hi + 1;
4104			free_iova_mem(iova);
4105		}
4106		break;
4107	}
4108
4109	return NOTIFY_OK;
4110}
4111
4112static struct notifier_block intel_iommu_memory_nb = {
4113	.notifier_call = intel_iommu_memory_notifier,
4114	.priority = 0
4115};
4116
4117
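/*
 * sysfs attributes exported for each IOMMU, typically under
 * /sys/class/iommu/dmar<N>/intel-iommu/: version, register base address,
 * capability and extended capability registers.
 */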
4118static ssize_t intel_iommu_show_version(struct device *dev,
4119					struct device_attribute *attr,
4120					char *buf)
4121{
4122	struct intel_iommu *iommu = dev_get_drvdata(dev);
4123	u32 ver = readl(iommu->reg + DMAR_VER_REG);
4124	return sprintf(buf, "%d:%d\n",
4125		       DMAR_VER_MAJOR(ver), DMAR_VER_MINOR(ver));
4126}
4127static DEVICE_ATTR(version, S_IRUGO, intel_iommu_show_version, NULL);
4128
4129static ssize_t intel_iommu_show_address(struct device *dev,
4130					struct device_attribute *attr,
4131					char *buf)
4132{
4133	struct intel_iommu *iommu = dev_get_drvdata(dev);
4134	return sprintf(buf, "%llx\n", iommu->reg_phys);
4135}
4136static DEVICE_ATTR(address, S_IRUGO, intel_iommu_show_address, NULL);
4137
4138static ssize_t intel_iommu_show_cap(struct device *dev,
4139				    struct device_attribute *attr,
4140				    char *buf)
4141{
4142	struct intel_iommu *iommu = dev_get_drvdata(dev);
4143	return sprintf(buf, "%llx\n", iommu->cap);
4144}
4145static DEVICE_ATTR(cap, S_IRUGO, intel_iommu_show_cap, NULL);
4146
4147static ssize_t intel_iommu_show_ecap(struct device *dev,
4148				    struct device_attribute *attr,
4149				    char *buf)
4150{
4151	struct intel_iommu *iommu = dev_get_drvdata(dev);
4152	return sprintf(buf, "%llx\n", iommu->ecap);
4153}
4154static DEVICE_ATTR(ecap, S_IRUGO, intel_iommu_show_ecap, NULL);
4155
4156static struct attribute *intel_iommu_attrs[] = {
4157	&dev_attr_version.attr,
4158	&dev_attr_address.attr,
4159	&dev_attr_cap.attr,
4160	&dev_attr_ecap.attr,
4161	NULL,
4162};
4163
4164static struct attribute_group intel_iommu_group = {
4165	.name = "intel-iommu",
4166	.attrs = intel_iommu_attrs,
4167};
4168
4169const struct attribute_group *intel_iommu_groups[] = {
4170	&intel_iommu_group,
4171	NULL,
4172};
4173
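/*
 * Main VT-d initialization entry point: parse the DMAR table, set up DMA
 * remapping on all active IOMMUs and install intel_dma_ops as the DMA API
 * backend.
 */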
4174int __init intel_iommu_init(void)
4175{
4176	int ret = -ENODEV;
4177	struct dmar_drhd_unit *drhd;
4178	struct intel_iommu *iommu;
4179
4180	/* VT-d is required for a TXT/tboot launch, so enforce that */
4181	force_on = tboot_force_iommu();
4182
4183	if (iommu_init_mempool()) {
4184		if (force_on)
4185			panic("tboot: Failed to initialize iommu memory\n");
4186		return -ENOMEM;
4187	}
4188
4189	down_write(&dmar_global_lock);
4190	if (dmar_table_init()) {
4191		if (force_on)
4192			panic("tboot: Failed to initialize DMAR table\n");
4193		goto out_free_dmar;
4194	}
4195
4196	/*
4197	 * Disable translation if already enabled prior to OS handover.
4198	 */
4199	for_each_active_iommu(iommu, drhd)
4200		if (iommu->gcmd & DMA_GCMD_TE)
4201			iommu_disable_translation(iommu);
4202
4203	if (dmar_dev_scope_init() < 0) {
4204		if (force_on)
4205			panic("tboot: Failed to initialize DMAR device scope\n");
4206		goto out_free_dmar;
4207	}
4208
4209	if (no_iommu || dmar_disabled)
4210		goto out_free_dmar;
4211
4212	if (list_empty(&dmar_rmrr_units))
4213		printk(KERN_INFO "DMAR: No RMRR found\n");
4214
4215	if (list_empty(&dmar_atsr_units))
4216		printk(KERN_INFO "DMAR: No ATSR found\n");
4217
4218	if (dmar_init_reserved_ranges()) {
4219		if (force_on)
4220			panic("tboot: Failed to reserve iommu ranges\n");
4221		goto out_free_reserved_range;
4222	}
4223
4224	init_no_remapping_devices();
4225
4226	ret = init_dmars();
4227	if (ret) {
4228		if (force_on)
4229			panic("tboot: Failed to initialize DMARs\n");
4230		printk(KERN_ERR "IOMMU: dmar init failed\n");
4231		goto out_free_reserved_range;
4232	}
4233	up_write(&dmar_global_lock);
4234	printk(KERN_INFO
4235	"PCI-DMA: Intel(R) Virtualization Technology for Directed I/O\n");
4236
4237	init_timer(&unmap_timer);
4238#ifdef CONFIG_SWIOTLB
4239	swiotlb = 0;
4240#endif
4241	dma_ops = &intel_dma_ops;
4242
4243	init_iommu_pm_ops();
4244
4245	for_each_active_iommu(iommu, drhd)
4246		iommu->iommu_dev = iommu_device_create(NULL, iommu,
4247						       intel_iommu_groups,
4248						       iommu->name);
4249
4250	bus_set_iommu(&pci_bus_type, &intel_iommu_ops);
4251	bus_register_notifier(&pci_bus_type, &device_nb);
4252	if (si_domain && !hw_pass_through)
4253		register_memory_notifier(&intel_iommu_memory_nb);
4254
4255	intel_iommu_enabled = 1;
4256
4257	return 0;
4258
4259out_free_reserved_range:
4260	put_iova_domain(&reserved_iova_list);
4261out_free_dmar:
4262	intel_iommu_free_dmars();
4263	up_write(&dmar_global_lock);
4264	iommu_exit_mempool();
4265	return ret;
4266}
4267
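/* pci_for_each_dma_alias() callback: detach one DMA alias from the IOMMU. */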
4268static int iommu_detach_dev_cb(struct pci_dev *pdev, u16 alias, void *opaque)
4269{
4270	struct intel_iommu *iommu = opaque;
4271
4272	iommu_detach_dev(iommu, PCI_BUS_NUM(alias), alias & 0xff);
4273	return 0;
4274}
4275
4276/*
4277 * NB - intel-iommu lacks any sort of reference counting for the users of
4278 * dependent devices.  If multiple endpoints have intersecting dependent
4279 * devices, unbinding the driver from any one of them will possibly leave
4280 * the others unable to operate.
4281 */
4282static void iommu_detach_dependent_devices(struct intel_iommu *iommu,
4283					   struct device *dev)
4284{
4285	if (!iommu || !dev || !dev_is_pci(dev))
4286		return;
4287
4288	pci_for_each_dma_alias(to_pci_dev(dev), &iommu_detach_dev_cb, iommu);
4289}
4290
4291static void domain_remove_one_dev_info(struct dmar_domain *domain,
4292				       struct device *dev)
4293{
4294	struct device_domain_info *info, *tmp;
4295	struct intel_iommu *iommu;
4296	unsigned long flags;
4297	bool found = false;
4298	u8 bus, devfn;
4299
4300	iommu = device_to_iommu(dev, &bus, &devfn);
4301	if (!iommu)
4302		return;
4303
4304	spin_lock_irqsave(&device_domain_lock, flags);
4305	list_for_each_entry_safe(info, tmp, &domain->devices, link) {
4306		if (info->iommu == iommu && info->bus == bus &&
4307		    info->devfn == devfn) {
4308			unlink_domain_info(info);
4309			spin_unlock_irqrestore(&device_domain_lock, flags);
4310
4311			iommu_disable_dev_iotlb(info);
4312			iommu_detach_dev(iommu, info->bus, info->devfn);
4313			iommu_detach_dependent_devices(iommu, dev);
4314			free_devinfo_mem(info);
4315
4316			spin_lock_irqsave(&device_domain_lock, flags);
4317
4318			if (found)
4319				break;
4320			else
4321				continue;
4322		}
4323
4324		/* If there are no other devices under the same iommu
4325		 * owned by this domain, clear this iommu in iommu_bmp and
4326		 * update the iommu count and coherency.
4327		 */
4328		if (info->iommu == iommu)
4329			found = true;
4330	}
4331
4332	spin_unlock_irqrestore(&device_domain_lock, flags);
4333
4334	if (!found) {
4335		domain_detach_iommu(domain, iommu);
4336		if (!domain_type_is_vm_or_si(domain))
4337			iommu_detach_domain(domain, iommu);
4338	}
4339}
4340
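/*
 * Minimal setup for domains allocated through the IOMMU API: initialize
 * the IOVA allocator, derive the AGAW from the requested guest width and
 * allocate the top level page table.
 */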
4341static int md_domain_init(struct dmar_domain *domain, int guest_width)
4342{
4343	int adjust_width;
4344
4345	init_iova_domain(&domain->iovad, VTD_PAGE_SIZE, IOVA_START_PFN,
4346			DMA_32BIT_PFN);
4347	domain_reserve_special_ranges(domain);
4348
4349	/* calculate AGAW */
4350	domain->gaw = guest_width;
4351	adjust_width = guestwidth_to_adjustwidth(guest_width);
4352	domain->agaw = width_to_agaw(adjust_width);
4353
4354	domain->iommu_coherency = 0;
4355	domain->iommu_snooping = 0;
4356	domain->iommu_superpage = 0;
4357	domain->max_addr = 0;
4358
4359	/* always allocate the top pgd */
4360	domain->pgd = (struct dma_pte *)alloc_pgtable_page(domain->nid);
4361	if (!domain->pgd)
4362		return -ENOMEM;
4363	domain_flush_cache(domain, domain->pgd, PAGE_SIZE);
4364	return 0;
4365}
4366
4367static struct iommu_domain *intel_iommu_domain_alloc(unsigned type)
4368{
4369	struct dmar_domain *dmar_domain;
4370	struct iommu_domain *domain;
4371
4372	if (type != IOMMU_DOMAIN_UNMANAGED)
4373		return NULL;
4374
4375	dmar_domain = alloc_domain(DOMAIN_FLAG_VIRTUAL_MACHINE);
4376	if (!dmar_domain) {
4377		printk(KERN_ERR
4378			"intel_iommu_domain_alloc: failed to allocate dmar_domain\n");
4379		return NULL;
4380	}
4381	if (md_domain_init(dmar_domain, DEFAULT_DOMAIN_ADDRESS_WIDTH)) {
4382		printk(KERN_ERR
4383			"intel_iommu_domain_alloc: md_domain_init() failed\n");
4384		domain_exit(dmar_domain);
4385		return NULL;
4386	}
4387	domain_update_iommu_cap(dmar_domain);
4388
4389	domain = &dmar_domain->domain;
4390	domain->geometry.aperture_start = 0;
4391	domain->geometry.aperture_end   = __DOMAIN_MAX_ADDR(dmar_domain->gaw);
4392	domain->geometry.force_aperture = true;
4393
4394	return domain;
4395}
4396
4397static void intel_iommu_domain_free(struct iommu_domain *domain)
4398{
4399	domain_exit(to_dmar_domain(domain));
4400}
4401
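/*
 * Attach @dev to an IOMMU API domain: refuse RMRR-locked devices, tear
 * down any existing mapping of the device, check that the IOMMU's address
 * width covers the domain's mapped addresses and trim surplus page-table
 * levels before adding the device to the domain.
 */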
4402static int intel_iommu_attach_device(struct iommu_domain *domain,
4403				     struct device *dev)
4404{
4405	struct dmar_domain *dmar_domain = to_dmar_domain(domain);
4406	struct intel_iommu *iommu;
4407	int addr_width;
4408	u8 bus, devfn;
4409
4410	if (device_is_rmrr_locked(dev)) {
4411		dev_warn(dev, "Device is ineligible for IOMMU domain attach due to platform RMRR requirement.  Contact your platform vendor.\n");
4412		return -EPERM;
4413	}
4414
4415	/* normally dev is not mapped */
4416	if (unlikely(domain_context_mapped(dev))) {
4417		struct dmar_domain *old_domain;
4418
4419		old_domain = find_domain(dev);
4420		if (old_domain) {
4421			if (domain_type_is_vm_or_si(dmar_domain))
4422				domain_remove_one_dev_info(old_domain, dev);
4423			else
4424				domain_remove_dev_info(old_domain);
4425
4426			if (!domain_type_is_vm_or_si(old_domain) &&
4427			     list_empty(&old_domain->devices))
4428				domain_exit(old_domain);
4429		}
4430	}
4431
4432	iommu = device_to_iommu(dev, &bus, &devfn);
4433	if (!iommu)
4434		return -ENODEV;
4435
4436	/* check if this iommu agaw is sufficient for max mapped address */
4437	addr_width = agaw_to_width(iommu->agaw);
4438	if (addr_width > cap_mgaw(iommu->cap))
4439		addr_width = cap_mgaw(iommu->cap);
4440
4441	if (dmar_domain->max_addr > (1LL << addr_width)) {
4442		printk(KERN_ERR "%s: iommu width (%d) is not "
4443		       "sufficient for the mapped address (%llx)\n",
4444		       __func__, addr_width, dmar_domain->max_addr);
4445		return -EFAULT;
4446	}
4447	dmar_domain->gaw = addr_width;
4448
4449	/*
4450	 * Knock out extra levels of page tables if necessary
4451	 */
4452	while (iommu->agaw < dmar_domain->agaw) {
4453		struct dma_pte *pte;
4454
4455		pte = dmar_domain->pgd;
4456		if (dma_pte_present(pte)) {
4457			dmar_domain->pgd = (struct dma_pte *)
4458				phys_to_virt(dma_pte_addr(pte));
4459			free_pgtable_page(pte);
4460		}
4461		dmar_domain->agaw--;
4462	}
4463
4464	return domain_add_dev_info(dmar_domain, dev, CONTEXT_TT_MULTI_LEVEL);
4465}
4466
4467static void intel_iommu_detach_device(struct iommu_domain *domain,
4468				      struct device *dev)
4469{
4470	domain_remove_one_dev_info(to_dmar_domain(domain), dev);
4471}
4472
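/*
 * IOMMU API map callback: verify that the mapping fits within the
 * domain's address width, grow max_addr if necessary and install the
 * page table entries.
 */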
4473static int intel_iommu_map(struct iommu_domain *domain,
4474			   unsigned long iova, phys_addr_t hpa,
4475			   size_t size, int iommu_prot)
4476{
4477	struct dmar_domain *dmar_domain = to_dmar_domain(domain);
4478	u64 max_addr;
4479	int prot = 0;
4480	int ret;
4481
4482	if (iommu_prot & IOMMU_READ)
4483		prot |= DMA_PTE_READ;
4484	if (iommu_prot & IOMMU_WRITE)
4485		prot |= DMA_PTE_WRITE;
4486	if ((iommu_prot & IOMMU_CACHE) && dmar_domain->iommu_snooping)
4487		prot |= DMA_PTE_SNP;
4488
4489	max_addr = iova + size;
4490	if (dmar_domain->max_addr < max_addr) {
4491		u64 end;
4492
4493		/* check if minimum agaw is sufficient for mapped address */
4494		end = __DOMAIN_MAX_ADDR(dmar_domain->gaw) + 1;
4495		if (end < max_addr) {
4496			printk(KERN_ERR "%s: iommu width (%d) is not "
4497			       "sufficient for the mapped address (%llx)\n",
4498			       __func__, dmar_domain->gaw, max_addr);
4499			return -EFAULT;
4500		}
4501		dmar_domain->max_addr = max_addr;
4502	}
4503	/* Round up size to next multiple of PAGE_SIZE, if it and
4504	   the low bits of hpa would take us onto the next page */
4505	size = aligned_nrpages(hpa, size);
4506	ret = domain_pfn_mapping(dmar_domain, iova >> VTD_PAGE_SHIFT,
4507				 hpa >> VTD_PAGE_SHIFT, size, prot);
4508	return ret;
4509}
4510
4511static size_t intel_iommu_unmap(struct iommu_domain *domain,
4512				unsigned long iova, size_t size)
4513{
4514	struct dmar_domain *dmar_domain = to_dmar_domain(domain);
4515	struct page *freelist = NULL;
4516	struct intel_iommu *iommu;
4517	unsigned long start_pfn, last_pfn;
4518	unsigned int npages;
4519	int iommu_id, num, ndomains, level = 0;
4520
4521	/* Cope with horrid API which requires us to unmap more than the
4522	   size argument if it happens to be a large-page mapping. */
4523	if (!pfn_to_dma_pte(dmar_domain, iova >> VTD_PAGE_SHIFT, &level))
4524		BUG();
4525
4526	if (size < VTD_PAGE_SIZE << level_to_offset_bits(level))
4527		size = VTD_PAGE_SIZE << level_to_offset_bits(level);
4528
4529	start_pfn = iova >> VTD_PAGE_SHIFT;
4530	last_pfn = (iova + size - 1) >> VTD_PAGE_SHIFT;
4531
4532	freelist = domain_unmap(dmar_domain, start_pfn, last_pfn);
4533
4534	npages = last_pfn - start_pfn + 1;
4535
4536	for_each_set_bit(iommu_id, dmar_domain->iommu_bmp, g_num_of_iommus) {
4537		iommu = g_iommus[iommu_id];
4538
4539		/*
4540		 * find bit position of dmar_domain
4541		 */
4542		ndomains = cap_ndoms(iommu->cap);
4543		for_each_set_bit(num, iommu->domain_ids, ndomains) {
4544			if (iommu->domains[num] == dmar_domain)
4545				iommu_flush_iotlb_psi(iommu, num, start_pfn,
4546						      npages, !freelist, 0);
4547		}
4548
4549	}
4550
4551	dma_free_pagelist(freelist);
4552
4553	if (dmar_domain->max_addr == iova + size)
4554		dmar_domain->max_addr = iova;
4555
4556	return size;
4557}
4558
4559static phys_addr_t intel_iommu_iova_to_phys(struct iommu_domain *domain,
4560					    dma_addr_t iova)
4561{
4562	struct dmar_domain *dmar_domain = to_dmar_domain(domain);
4563	struct dma_pte *pte;
4564	int level = 0;
4565	u64 phys = 0;
4566
4567	pte = pfn_to_dma_pte(dmar_domain, iova >> VTD_PAGE_SHIFT, &level);
4568	if (pte)
4569		phys = dma_pte_addr(pte);
4570
4571	return phys;
4572}
4573
4574static bool intel_iommu_capable(enum iommu_cap cap)
4575{
4576	if (cap == IOMMU_CAP_CACHE_COHERENCY)
4577		return domain_update_iommu_snooping(NULL) == 1;
4578	if (cap == IOMMU_CAP_INTR_REMAP)
4579		return irq_remapping_enabled == 1;
4580
4581	return false;
4582}
4583
4584static int intel_iommu_add_device(struct device *dev)
4585{
4586	struct intel_iommu *iommu;
4587	struct iommu_group *group;
4588	u8 bus, devfn;
4589
4590	iommu = device_to_iommu(dev, &bus, &devfn);
4591	if (!iommu)
4592		return -ENODEV;
4593
4594	iommu_device_link(iommu->iommu_dev, dev);
4595
4596	group = iommu_group_get_for_dev(dev);
4597
4598	if (IS_ERR(group))
4599		return PTR_ERR(group);
4600
4601	iommu_group_put(group);
4602	return 0;
4603}
4604
4605static void intel_iommu_remove_device(struct device *dev)
4606{
4607	struct intel_iommu *iommu;
4608	u8 bus, devfn;
4609
4610	iommu = device_to_iommu(dev, &bus, &devfn);
4611	if (!iommu)
4612		return;
4613
4614	iommu_group_remove_device(dev);
4615
4616	iommu_device_unlink(iommu->iommu_dev, dev);
4617}
4618
4619static const struct iommu_ops intel_iommu_ops = {
4620	.capable	= intel_iommu_capable,
4621	.domain_alloc	= intel_iommu_domain_alloc,
4622	.domain_free	= intel_iommu_domain_free,
4623	.attach_dev	= intel_iommu_attach_device,
4624	.detach_dev	= intel_iommu_detach_device,
4625	.map		= intel_iommu_map,
4626	.unmap		= intel_iommu_unmap,
4627	.map_sg		= default_iommu_map_sg,
4628	.iova_to_phys	= intel_iommu_iova_to_phys,
4629	.add_device	= intel_iommu_add_device,
4630	.remove_device	= intel_iommu_remove_device,
4631	.pgsize_bitmap	= INTEL_IOMMU_PGSIZES,
4632};
4633
4634static void quirk_iommu_g4x_gfx(struct pci_dev *dev)
4635{
4636	/* G4x/GM45 integrated gfx dmar support is totally busted. */
4637	printk(KERN_INFO "DMAR: Disabling IOMMU for graphics on this chipset\n");
4638	dmar_map_gfx = 0;
4639}
4640
4641DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2a40, quirk_iommu_g4x_gfx);
4642DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e00, quirk_iommu_g4x_gfx);
4643DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e10, quirk_iommu_g4x_gfx);
4644DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e20, quirk_iommu_g4x_gfx);
4645DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e30, quirk_iommu_g4x_gfx);
4646DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e40, quirk_iommu_g4x_gfx);
4647DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e90, quirk_iommu_g4x_gfx);
4648
4649static void quirk_iommu_rwbf(struct pci_dev *dev)
4650{
4651	/*
4652	 * Mobile 4 Series Chipset neglects to set RWBF capability,
4653	 * but needs it. Same seems to hold for the desktop versions.
4654	 */
4655	printk(KERN_INFO "DMAR: Forcing write-buffer flush capability\n");
4656	rwbf_quirk = 1;
4657}
4658
4659DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2a40, quirk_iommu_rwbf);
4660DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e00, quirk_iommu_rwbf);
4661DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e10, quirk_iommu_rwbf);
4662DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e20, quirk_iommu_rwbf);
4663DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e30, quirk_iommu_rwbf);
4664DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e40, quirk_iommu_rwbf);
4665DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e90, quirk_iommu_rwbf);
4666
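/*
 * Graphics control (GGC) register in the Calpella/Ironlake host bridge
 * config space; quirk_calpella_no_shadow_gtt() reads it to see whether
 * the BIOS reserved GTT space for VT (a "shadow GTT") for the integrated
 * graphics device.
 */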
4667#define GGC 0x52
4668#define GGC_MEMORY_SIZE_MASK	(0xf << 8)
4669#define GGC_MEMORY_SIZE_NONE	(0x0 << 8)
4670#define GGC_MEMORY_SIZE_1M	(0x1 << 8)
4671#define GGC_MEMORY_SIZE_2M	(0x3 << 8)
4672#define GGC_MEMORY_VT_ENABLED	(0x8 << 8)
4673#define GGC_MEMORY_SIZE_2M_VT	(0x9 << 8)
4674#define GGC_MEMORY_SIZE_3M_VT	(0xa << 8)
4675#define GGC_MEMORY_SIZE_4M_VT	(0xb << 8)
4676
4677static void quirk_calpella_no_shadow_gtt(struct pci_dev *dev)
4678{
4679	unsigned short ggc;
4680
4681	if (pci_read_config_word(dev, GGC, &ggc))
4682		return;
4683
4684	if (!(ggc & GGC_MEMORY_VT_ENABLED)) {
4685		printk(KERN_INFO "DMAR: BIOS has allocated no shadow GTT; disabling IOMMU for graphics\n");
4686		dmar_map_gfx = 0;
4687	} else if (dmar_map_gfx) {
4688		/* we have to ensure the gfx device is idle before we flush */
4689		printk(KERN_INFO "DMAR: Disabling batched IOTLB flush on Ironlake\n");
4690		intel_iommu_strict = 1;
4691	}
4692}
4693DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x0040, quirk_calpella_no_shadow_gtt);
4694DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x0044, quirk_calpella_no_shadow_gtt);
4695DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x0062, quirk_calpella_no_shadow_gtt);
4696DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x006a, quirk_calpella_no_shadow_gtt);
4697
4698/* On Tylersburg chipsets, some BIOSes have been known to enable the
4699   ISOCH DMAR unit for the Azalia sound device, but not give it any
4700   TLB entries, which causes it to deadlock. Check for that.  We do
4701   this in a function called from init_dmars(), instead of in a PCI
4702   quirk, because we don't want to print the obnoxious "BIOS broken"
4703   message if VT-d is actually disabled.
4704*/
4705static void __init check_tylersburg_isoch(void)
4706{
4707	struct pci_dev *pdev;
4708	uint32_t vtisochctrl;
4709
4710	/* If there's no Azalia in the system anyway, forget it. */
4711	pdev = pci_get_device(PCI_VENDOR_ID_INTEL, 0x3a3e, NULL);
4712	if (!pdev)
4713		return;
4714	pci_dev_put(pdev);
4715
4716	/* System Management Registers. Might be hidden, in which case
4717	   we can't do the sanity check. But that's OK, because the
4718	   known-broken BIOSes _don't_ actually hide it, so far. */
4719	pdev = pci_get_device(PCI_VENDOR_ID_INTEL, 0x342e, NULL);
4720	if (!pdev)
4721		return;
4722
4723	if (pci_read_config_dword(pdev, 0x188, &vtisochctrl)) {
4724		pci_dev_put(pdev);
4725		return;
4726	}
4727
4728	pci_dev_put(pdev);
4729
4730	/* If Azalia DMA is routed to the non-isoch DMAR unit, fine. */
4731	if (vtisochctrl & 1)
4732		return;
4733
4734	/* Drop all bits other than the number of TLB entries */
4735	vtisochctrl &= 0x1c;
4736
4737	/* If we have the recommended number of TLB entries (16), fine. */
4738	if (vtisochctrl == 0x10)
4739		return;
4740
4741	/* Zero TLB entries? You get to ride the short bus to school. */
4742	if (!vtisochctrl) {
4743		WARN(1, "Your BIOS is broken; DMA routed to ISOCH DMAR unit but no TLB space.\n"
4744		     "BIOS vendor: %s; Ver: %s; Product Version: %s\n",
4745		     dmi_get_system_info(DMI_BIOS_VENDOR),
4746		     dmi_get_system_info(DMI_BIOS_VERSION),
4747		     dmi_get_system_info(DMI_PRODUCT_VERSION));
4748		iommu_identity_mapping |= IDENTMAP_AZALIA;
4749		return;
4750	}
4751
4752	printk(KERN_WARNING "DMAR: Recommended TLB entries for ISOCH unit is 16; your BIOS set %d\n",
4753	       vtisochctrl);
4754}
4755