1/*
2 * Copyright 2002 Andi Kleen, SuSE Labs.
3 * Thanks to Ben LaHaise for precious feedback.
4 */
5#include <linux/highmem.h>
6#include <linux/bootmem.h>
7#include <linux/module.h>
8#include <linux/sched.h>
9#include <linux/mm.h>
10#include <linux/interrupt.h>
11#include <linux/seq_file.h>
12#include <linux/debugfs.h>
13#include <linux/pfn.h>
14#include <linux/percpu.h>
15#include <linux/gfp.h>
16#include <linux/pci.h>
17
18#include <asm/e820.h>
19#include <asm/processor.h>
20#include <asm/tlbflush.h>
21#include <asm/sections.h>
22#include <asm/setup.h>
23#include <asm/uaccess.h>
24#include <asm/pgalloc.h>
25#include <asm/proto.h>
26#include <asm/pat.h>
27
28/*
29 * The current flushing context - we pass it instead of 5 arguments:
30 */
31struct cpa_data {
32	unsigned long	*vaddr;
33	pgd_t		*pgd;
34	pgprot_t	mask_set;
35	pgprot_t	mask_clr;
36	unsigned long	numpages;
37	int		flags;
38	unsigned long	pfn;
39	unsigned	force_split : 1;
40	int		curpage;
41	struct page	**pages;
42};
43
44/*
 * Serialize cpa() (for !DEBUG_PAGEALLOC, which uses large identity mappings)
 * using cpa_lock, so that we don't allow any other CPU, with stale large TLB
 * entries, to change the page attributes in parallel while some other CPU is
 * splitting a large page entry and changing the attributes.
49 */
50static DEFINE_SPINLOCK(cpa_lock);
51
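/*
 * Bits for cpa_data.flags: CPA_FLUSHTLB is set by the low-level code when an
 * attribute actually changed and a TLB flush is required, CPA_ARRAY means
 * cpa_data.vaddr points to an array of virtual addresses, and CPA_PAGES_ARRAY
 * means cpa_data.pages holds the target pages instead.
 */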
52#define CPA_FLUSHTLB 1
53#define CPA_ARRAY 2
54#define CPA_PAGES_ARRAY 4
55
56#ifdef CONFIG_PROC_FS
57static unsigned long direct_pages_count[PG_LEVEL_NUM];
58
59void update_page_count(int level, unsigned long pages)
60{
61	/* Protect against CPA */
62	spin_lock(&pgd_lock);
63	direct_pages_count[level] += pages;
64	spin_unlock(&pgd_lock);
65}
66
67static void split_page_count(int level)
68{
69	direct_pages_count[level]--;
70	direct_pages_count[level - 1] += PTRS_PER_PTE;
71}
72
73void arch_report_meminfo(struct seq_file *m)
74{
75	seq_printf(m, "DirectMap4k:    %8lu kB\n",
76			direct_pages_count[PG_LEVEL_4K] << 2);
77#if defined(CONFIG_X86_64) || defined(CONFIG_X86_PAE)
78	seq_printf(m, "DirectMap2M:    %8lu kB\n",
79			direct_pages_count[PG_LEVEL_2M] << 11);
80#else
81	seq_printf(m, "DirectMap4M:    %8lu kB\n",
82			direct_pages_count[PG_LEVEL_2M] << 12);
83#endif
84	if (direct_gbpages)
85		seq_printf(m, "DirectMap1G:    %8lu kB\n",
86			direct_pages_count[PG_LEVEL_1G] << 20);
87}
88#else
89static inline void split_page_count(int level) { }
90#endif
91
92#ifdef CONFIG_X86_64
93
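/*
 * PFN range of the kernel's high mapping (the __START_KERNEL_map alias of
 * the kernel image), used below to fix up attribute changes on that alias.
 */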
94static inline unsigned long highmap_start_pfn(void)
95{
96	return __pa_symbol(_text) >> PAGE_SHIFT;
97}
98
99static inline unsigned long highmap_end_pfn(void)
100{
101	return __pa_symbol(roundup(_brk_end, PMD_SIZE)) >> PAGE_SHIFT;
102}
103
104#endif
105
106#ifdef CONFIG_DEBUG_PAGEALLOC
107# define debug_pagealloc 1
108#else
109# define debug_pagealloc 0
110#endif
111
112static inline int
113within(unsigned long addr, unsigned long start, unsigned long end)
114{
115	return addr >= start && addr < end;
116}
117
118/*
119 * Flushing functions
120 */
121
122/**
123 * clflush_cache_range - flush a cache range with clflush
124 * @vaddr:	virtual start address
125 * @size:	number of bytes to flush
126 *
127 * clflushopt is an unordered instruction which needs fencing with mfence or
128 * sfence to avoid ordering issues.
129 */
130void clflush_cache_range(void *vaddr, unsigned int size)
131{
132	void *vend = vaddr + size - 1;
133
134	mb();
135
136	for (; vaddr < vend; vaddr += boot_cpu_data.x86_clflush_size)
137		clflushopt(vaddr);
138	/*
139	 * Flush any possible final partial cacheline:
140	 */
141	clflushopt(vend);
142
143	mb();
144}
145EXPORT_SYMBOL_GPL(clflush_cache_range);
146
147static void __cpa_flush_all(void *arg)
148{
149	unsigned long cache = (unsigned long)arg;
150
151	/*
	 * Flush all to work around errata in early Athlons regarding
153	 * large page flushing.
154	 */
155	__flush_tlb_all();
156
157	if (cache && boot_cpu_data.x86 >= 4)
158		wbinvd();
159}
160
161static void cpa_flush_all(unsigned long cache)
162{
163	BUG_ON(irqs_disabled());
164
165	on_each_cpu(__cpa_flush_all, (void *) cache, 1);
166}
167
168static void __cpa_flush_range(void *arg)
169{
170	/*
	 * We could optimize this further and do individual per-page
	 * TLB invalidates for a small number of pages. Caveat: we must
	 * flush the high aliases on 64-bit as well.
174	 */
175	__flush_tlb_all();
176}
177
178static void cpa_flush_range(unsigned long start, int numpages, int cache)
179{
180	unsigned int i, level;
181	unsigned long addr;
182
183	BUG_ON(irqs_disabled());
184	WARN_ON(PAGE_ALIGN(start) != start);
185
186	on_each_cpu(__cpa_flush_range, NULL, 1);
187
188	if (!cache)
189		return;
190
191	/*
	 * We only need to flush on one CPU:
193	 * clflush is a MESI-coherent instruction that
194	 * will cause all other CPUs to flush the same
195	 * cachelines:
196	 */
197	for (i = 0, addr = start; i < numpages; i++, addr += PAGE_SIZE) {
198		pte_t *pte = lookup_address(addr, &level);
199
200		/*
201		 * Only flush present addresses:
202		 */
203		if (pte && (pte_val(*pte) & _PAGE_PRESENT))
204			clflush_cache_range((void *) addr, PAGE_SIZE);
205	}
206}
207
208static void cpa_flush_array(unsigned long *start, int numpages, int cache,
209			    int in_flags, struct page **pages)
210{
211	unsigned int i, level;
212	unsigned long do_wbinvd = cache && numpages >= 1024; /* 4M threshold */
213
214	BUG_ON(irqs_disabled());
215
216	on_each_cpu(__cpa_flush_all, (void *) do_wbinvd, 1);
217
218	if (!cache || do_wbinvd)
219		return;
220
221	/*
	 * We only need to flush on one CPU:
223	 * clflush is a MESI-coherent instruction that
224	 * will cause all other CPUs to flush the same
225	 * cachelines:
226	 */
227	for (i = 0; i < numpages; i++) {
228		unsigned long addr;
229		pte_t *pte;
230
231		if (in_flags & CPA_PAGES_ARRAY)
232			addr = (unsigned long)page_address(pages[i]);
233		else
234			addr = start[i];
235
236		pte = lookup_address(addr, &level);
237
238		/*
239		 * Only flush present addresses:
240		 */
241		if (pte && (pte_val(*pte) & _PAGE_PRESENT))
242			clflush_cache_range((void *)addr, PAGE_SIZE);
243	}
244}
245
246/*
247 * Certain areas of memory on x86 require very specific protection flags,
248 * for example the BIOS area or kernel text. Callers don't always get this
249 * right (again, ioremap() on BIOS memory is not uncommon) so this function
250 * checks and fixes these known static required protection bits.
251 */
252static inline pgprot_t static_protections(pgprot_t prot, unsigned long address,
253				   unsigned long pfn)
254{
255	pgprot_t forbidden = __pgprot(0);
256
257	/*
	 * The BIOS area between 640K and 1MB needs to be executable for
	 * PCI BIOS-based config access (CONFIG_PCI_GOBIOS) support.
260	 */
261#ifdef CONFIG_PCI_BIOS
262	if (pcibios_enabled && within(pfn, BIOS_BEGIN >> PAGE_SHIFT, BIOS_END >> PAGE_SHIFT))
263		pgprot_val(forbidden) |= _PAGE_NX;
264#endif
265
266	/*
	 * The kernel text needs to be executable for obvious reasons.
	 * This does not cover __inittext since that is discarded later
	 * on. On 64-bit we do not enforce !NX on the low mapping.
270	 */
271	if (within(address, (unsigned long)_text, (unsigned long)_etext))
272		pgprot_val(forbidden) |= _PAGE_NX;
273
274	/*
275	 * The .rodata section needs to be read-only. Using the pfn
276	 * catches all aliases.
277	 */
278	if (within(pfn, __pa_symbol(__start_rodata) >> PAGE_SHIFT,
279		   __pa_symbol(__end_rodata) >> PAGE_SHIFT))
280		pgprot_val(forbidden) |= _PAGE_RW;
281
282#if defined(CONFIG_X86_64) && defined(CONFIG_DEBUG_RODATA)
283	/*
	 * Once the kernel maps the text as RO (kernel_set_to_readonly is set),
	 * the kernel text mappings for the large-page-aligned text and rodata
	 * sections will always be read-only. The kernel identity mappings
	 * covering the holes caused by this alignment can be anything the
	 * user asks for.
288	 *
289	 * This will preserve the large page mappings for kernel text/data
290	 * at no extra cost.
291	 */
292	if (kernel_set_to_readonly &&
293	    within(address, (unsigned long)_text,
294		   (unsigned long)__end_rodata_hpage_align)) {
295		unsigned int level;
296
297		/*
		 * Don't enforce the !RW mapping for the kernel text mapping
		 * if the current mapping is already using a small page
		 * mapping. There is no need to work hard to preserve large
		 * page mappings in this case.
		 *
		 * This also fixes the Linux Xen paravirt guest boot failure
		 * caused by unexpected read-only mappings for the kernel
		 * identity mappings. In this paravirt guest case, the kernel
		 * text mapping and the kernel identity mapping share the
		 * same page-table pages, so we can't really use different
		 * protections for the kernel text and identity mappings.
		 * Also, these shared mappings are made of small page
		 * mappings, so not enforcing the !RW mapping for small page
		 * kernel text mappings helps the Linux Xen paravirt guest
		 * boot as well.
313		 */
314		if (lookup_address(address, &level) && (level != PG_LEVEL_4K))
315			pgprot_val(forbidden) |= _PAGE_RW;
316	}
317#endif
318
319	prot = __pgprot(pgprot_val(prot) & ~pgprot_val(forbidden));
320
321	return prot;
322}
323
324/*
325 * Lookup the page table entry for a virtual address in a specific pgd.
326 * Return a pointer to the entry and the level of the mapping.
327 */
328pte_t *lookup_address_in_pgd(pgd_t *pgd, unsigned long address,
329			     unsigned int *level)
330{
331	pud_t *pud;
332	pmd_t *pmd;
333
334	*level = PG_LEVEL_NONE;
335
336	if (pgd_none(*pgd))
337		return NULL;
338
339	pud = pud_offset(pgd, address);
340	if (pud_none(*pud))
341		return NULL;
342
343	*level = PG_LEVEL_1G;
344	if (pud_large(*pud) || !pud_present(*pud))
345		return (pte_t *)pud;
346
347	pmd = pmd_offset(pud, address);
348	if (pmd_none(*pmd))
349		return NULL;
350
351	*level = PG_LEVEL_2M;
352	if (pmd_large(*pmd) || !pmd_present(*pmd))
353		return (pte_t *)pmd;
354
355	*level = PG_LEVEL_4K;
356
357	return pte_offset_kernel(pmd, address);
358}
359
360/*
361 * Lookup the page table entry for a virtual address. Return a pointer
362 * to the entry and the level of the mapping.
363 *
364 * Note: We return pud and pmd either when the entry is marked large
365 * or when the present bit is not set. Otherwise we would return a
 * pointer to a nonexistent mapping.
367 */
368pte_t *lookup_address(unsigned long address, unsigned int *level)
369{
	return lookup_address_in_pgd(pgd_offset_k(address), address, level);
371}
372EXPORT_SYMBOL_GPL(lookup_address);
373
374static pte_t *_lookup_address_cpa(struct cpa_data *cpa, unsigned long address,
375				  unsigned int *level)
376{
	if (cpa->pgd)
		return lookup_address_in_pgd(cpa->pgd + pgd_index(address),
					     address, level);

	return lookup_address(address, level);
382}
383
384/*
385 * Lookup the PMD entry for a virtual address. Return a pointer to the entry
386 * or NULL if not present.
387 */
388pmd_t *lookup_pmd_address(unsigned long address)
389{
390	pgd_t *pgd;
391	pud_t *pud;
392
393	pgd = pgd_offset_k(address);
394	if (pgd_none(*pgd))
395		return NULL;
396
397	pud = pud_offset(pgd, address);
398	if (pud_none(*pud) || pud_large(*pud) || !pud_present(*pud))
399		return NULL;
400
401	return pmd_offset(pud, address);
402}
403
404/*
405 * This is necessary because __pa() does not work on some
406 * kinds of memory, like vmalloc() or the alloc_remap()
407 * areas on 32-bit NUMA systems.  The percpu areas can
408 * end up in this kind of memory, for instance.
409 *
410 * This could be optimized, but it is only intended to be
 * used at initialization time, and keeping it
412 * unoptimized should increase the testing coverage for
413 * the more obscure platforms.
414 */
415phys_addr_t slow_virt_to_phys(void *__virt_addr)
416{
417	unsigned long virt_addr = (unsigned long)__virt_addr;
418	phys_addr_t phys_addr;
419	unsigned long offset;
420	enum pg_level level;
421	unsigned long psize;
422	unsigned long pmask;
423	pte_t *pte;
424
425	pte = lookup_address(virt_addr, &level);
426	BUG_ON(!pte);
427	psize = page_level_size(level);
428	pmask = page_level_mask(level);
429	offset = virt_addr & ~pmask;
430	phys_addr = (phys_addr_t)pte_pfn(*pte) << PAGE_SHIFT;
431	return (phys_addr | offset);
432}
433EXPORT_SYMBOL_GPL(slow_virt_to_phys);
434
435/*
436 * Set the new pmd in all the pgds we know about:
437 */
438static void __set_pmd_pte(pte_t *kpte, unsigned long address, pte_t pte)
439{
440	/* change init_mm */
441	set_pte_atomic(kpte, pte);
442#ifdef CONFIG_X86_32
443	if (!SHARED_KERNEL_PMD) {
444		struct page *page;
445
446		list_for_each_entry(page, &pgd_list, lru) {
447			pgd_t *pgd;
448			pud_t *pud;
449			pmd_t *pmd;
450
451			pgd = (pgd_t *)page_address(page) + pgd_index(address);
452			pud = pud_offset(pgd, address);
453			pmd = pmd_offset(pud, address);
454			set_pte_atomic((pte_t *)pmd, pte);
455		}
456	}
457#endif
458}
459
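/*
 * try_preserve_large_page - check whether a large (2M/1G) mapping can be
 * kept intact for the requested attribute change.
 *
 * Returns 0 if the large page could be preserved (or nothing changed),
 * 1 if the caller should go on and split it, and a negative error code
 * on failure.
 */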
460static int
461try_preserve_large_page(pte_t *kpte, unsigned long address,
462			struct cpa_data *cpa)
463{
464	unsigned long nextpage_addr, numpages, pmask, psize, addr, pfn;
465	pte_t new_pte, old_pte, *tmp;
466	pgprot_t old_prot, new_prot, req_prot;
467	int i, do_split = 1;
468	enum pg_level level;
469
470	if (cpa->force_split)
471		return 1;
472
473	spin_lock(&pgd_lock);
474	/*
475	 * Check for races, another CPU might have split this page
476	 * up already:
477	 */
478	tmp = _lookup_address_cpa(cpa, address, &level);
479	if (tmp != kpte)
480		goto out_unlock;
481
482	switch (level) {
483	case PG_LEVEL_2M:
484#ifdef CONFIG_X86_64
485	case PG_LEVEL_1G:
486#endif
487		psize = page_level_size(level);
488		pmask = page_level_mask(level);
489		break;
490	default:
491		do_split = -EINVAL;
492		goto out_unlock;
493	}
494
495	/*
	 * Calculate the number of pages which fit into this large
497	 * page starting at address:
498	 */
499	nextpage_addr = (address + psize) & pmask;
500	numpages = (nextpage_addr - address) >> PAGE_SHIFT;
501	if (numpages < cpa->numpages)
502		cpa->numpages = numpages;
503
504	/*
505	 * We are safe now. Check whether the new pgprot is the same:
506	 * Convert protection attributes to 4k-format, as cpa->mask* are set
507	 * up accordingly.
508	 */
509	old_pte = *kpte;
510	old_prot = req_prot = pgprot_large_2_4k(pte_pgprot(old_pte));
511
512	pgprot_val(req_prot) &= ~pgprot_val(cpa->mask_clr);
513	pgprot_val(req_prot) |= pgprot_val(cpa->mask_set);
514
515	/*
516	 * req_prot is in format of 4k pages. It must be converted to large
517	 * page format: the caching mode includes the PAT bit located at
518	 * different bit positions in the two formats.
519	 */
520	req_prot = pgprot_4k_2_large(req_prot);
521
522	/*
	 * Set the PSE and GLOBAL flags only if the PRESENT flag is
	 * set, otherwise pmd_present/pmd_huge will return true even on
	 * a non-present pmd. canon_pgprot() will clear _PAGE_GLOBAL
	 * for the ancient hardware that doesn't support it.
527	 */
528	if (pgprot_val(req_prot) & _PAGE_PRESENT)
529		pgprot_val(req_prot) |= _PAGE_PSE | _PAGE_GLOBAL;
530	else
531		pgprot_val(req_prot) &= ~(_PAGE_PSE | _PAGE_GLOBAL);
532
533	req_prot = canon_pgprot(req_prot);
534
535	/*
536	 * old_pte points to the large page base address. So we need
537	 * to add the offset of the virtual address:
538	 */
539	pfn = pte_pfn(old_pte) + ((address & (psize - 1)) >> PAGE_SHIFT);
540	cpa->pfn = pfn;
541
542	new_prot = static_protections(req_prot, address, pfn);
543
544	/*
545	 * We need to check the full range, whether
	 * static_protections() requires a different pgprot for one of
547	 * the pages in the range we try to preserve:
548	 */
549	addr = address & pmask;
550	pfn = pte_pfn(old_pte);
551	for (i = 0; i < (psize >> PAGE_SHIFT); i++, addr += PAGE_SIZE, pfn++) {
552		pgprot_t chk_prot = static_protections(req_prot, addr, pfn);
553
554		if (pgprot_val(chk_prot) != pgprot_val(new_prot))
555			goto out_unlock;
556	}
557
558	/*
	 * If there are no changes, return. cpa->numpages has been
	 * updated above:
561	 */
562	if (pgprot_val(new_prot) == pgprot_val(old_prot)) {
563		do_split = 0;
564		goto out_unlock;
565	}
566
567	/*
568	 * We need to change the attributes. Check, whether we can
	 * change the large page in one go. We request a split when
	 * the address is not aligned or the number of pages is
	 * smaller than the number of pages in the large page. Note
572	 * that we limited the number of possible pages already to
573	 * the number of pages in the large page.
574	 */
575	if (address == (address & pmask) && cpa->numpages == (psize >> PAGE_SHIFT)) {
576		/*
577		 * The address is aligned and the number of pages
578		 * covers the full page.
579		 */
580		new_pte = pfn_pte(pte_pfn(old_pte), new_prot);
581		__set_pmd_pte(kpte, address, new_pte);
582		cpa->flags |= CPA_FLUSHTLB;
583		do_split = 0;
584	}
585
586out_unlock:
587	spin_unlock(&pgd_lock);
588
589	return do_split;
590}
591
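/*
 * __split_large_page - replace one large mapping with a full page table of
 * 4k (or, for 1G pages, 2M) entries that reproduce the original protections.
 *
 * @base is the pre-allocated page that becomes the new page table level.
 * Returns 1 if another CPU split the page for us (the caller then frees
 * @base), 0 on success.
 */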
592static int
593__split_large_page(struct cpa_data *cpa, pte_t *kpte, unsigned long address,
594		   struct page *base)
595{
596	pte_t *pbase = (pte_t *)page_address(base);
597	unsigned long pfn, pfninc = 1;
598	unsigned int i, level;
599	pte_t *tmp;
600	pgprot_t ref_prot;
601
602	spin_lock(&pgd_lock);
603	/*
604	 * Check for races, another CPU might have split this page
605	 * up for us already:
606	 */
607	tmp = _lookup_address_cpa(cpa, address, &level);
608	if (tmp != kpte) {
609		spin_unlock(&pgd_lock);
610		return 1;
611	}
612
613	paravirt_alloc_pte(&init_mm, page_to_pfn(base));
614	ref_prot = pte_pgprot(pte_clrhuge(*kpte));
615
616	/* promote PAT bit to correct position */
617	if (level == PG_LEVEL_2M)
618		ref_prot = pgprot_large_2_4k(ref_prot);
619
620#ifdef CONFIG_X86_64
621	if (level == PG_LEVEL_1G) {
622		pfninc = PMD_PAGE_SIZE >> PAGE_SHIFT;
623		/*
		 * Set the PSE flag only if the PRESENT flag is set,
		 * otherwise pmd_present/pmd_huge will return true
		 * even on a non-present pmd.
627		 */
628		if (pgprot_val(ref_prot) & _PAGE_PRESENT)
629			pgprot_val(ref_prot) |= _PAGE_PSE;
630		else
631			pgprot_val(ref_prot) &= ~_PAGE_PSE;
632	}
633#endif
634
635	/*
	 * Set the GLOBAL flag only if the PRESENT flag is set,
	 * otherwise pmd/pte_present will return true even on a
	 * non-present pmd/pte. canon_pgprot() will clear _PAGE_GLOBAL
639	 * for the ancient hardware that doesn't support it.
640	 */
641	if (pgprot_val(ref_prot) & _PAGE_PRESENT)
642		pgprot_val(ref_prot) |= _PAGE_GLOBAL;
643	else
644		pgprot_val(ref_prot) &= ~_PAGE_GLOBAL;
645
646	/*
647	 * Get the target pfn from the original entry:
648	 */
649	pfn = pte_pfn(*kpte);
650	for (i = 0; i < PTRS_PER_PTE; i++, pfn += pfninc)
651		set_pte(&pbase[i], pfn_pte(pfn, canon_pgprot(ref_prot)));
652
653	if (pfn_range_is_mapped(PFN_DOWN(__pa(address)),
654				PFN_DOWN(__pa(address)) + 1))
655		split_page_count(level);
656
657	/*
	 * Install the new, split-up page table.
	 *
	 * We use the standard kernel page table protections for the new
	 * page table; the actual PTEs set above control the
	 * primary protection behavior:
663	 */
664	__set_pmd_pte(kpte, address, mk_pte(base, __pgprot(_KERNPG_TABLE)));
665
666	/*
667	 * Intel Atom errata AAH41 workaround.
668	 *
669	 * The real fix should be in hw or in a microcode update, but
670	 * we also probabilistically try to reduce the window of having
671	 * a large TLB mixed with 4K TLBs while instruction fetches are
672	 * going on.
673	 */
674	__flush_tlb_all();
675	spin_unlock(&pgd_lock);
676
677	return 0;
678}
679
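/*
 * Allocate the new page table page outside of cpa_lock (the GFP_KERNEL
 * allocation may sleep), then let __split_large_page() recheck for races
 * under pgd_lock.
 */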
680static int split_large_page(struct cpa_data *cpa, pte_t *kpte,
681			    unsigned long address)
682{
683	struct page *base;
684
685	if (!debug_pagealloc)
686		spin_unlock(&cpa_lock);
687	base = alloc_pages(GFP_KERNEL | __GFP_NOTRACK, 0);
688	if (!debug_pagealloc)
689		spin_lock(&cpa_lock);
690	if (!base)
691		return -ENOMEM;
692
693	if (__split_large_page(cpa, kpte, address, base))
694		__free_page(base);
695
696	return 0;
697}
698
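/*
 * The helpers below tear down page tables that were built for an alternate
 * PGD (see unmap_pgd_range()): a table page is freed only once every entry
 * in it is empty.
 */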
699static bool try_to_free_pte_page(pte_t *pte)
700{
701	int i;
702
703	for (i = 0; i < PTRS_PER_PTE; i++)
704		if (!pte_none(pte[i]))
705			return false;
706
707	free_page((unsigned long)pte);
708	return true;
709}
710
711static bool try_to_free_pmd_page(pmd_t *pmd)
712{
713	int i;
714
715	for (i = 0; i < PTRS_PER_PMD; i++)
716		if (!pmd_none(pmd[i]))
717			return false;
718
719	free_page((unsigned long)pmd);
720	return true;
721}
722
723static bool try_to_free_pud_page(pud_t *pud)
724{
725	int i;
726
727	for (i = 0; i < PTRS_PER_PUD; i++)
728		if (!pud_none(pud[i]))
729			return false;
730
731	free_page((unsigned long)pud);
732	return true;
733}
734
735static bool unmap_pte_range(pmd_t *pmd, unsigned long start, unsigned long end)
736{
737	pte_t *pte = pte_offset_kernel(pmd, start);
738
739	while (start < end) {
740		set_pte(pte, __pte(0));
741
742		start += PAGE_SIZE;
743		pte++;
744	}
745
746	if (try_to_free_pte_page((pte_t *)pmd_page_vaddr(*pmd))) {
747		pmd_clear(pmd);
748		return true;
749	}
750	return false;
751}
752
753static void __unmap_pmd_range(pud_t *pud, pmd_t *pmd,
754			      unsigned long start, unsigned long end)
755{
756	if (unmap_pte_range(pmd, start, end))
757		if (try_to_free_pmd_page((pmd_t *)pud_page_vaddr(*pud)))
758			pud_clear(pud);
759}
760
761static void unmap_pmd_range(pud_t *pud, unsigned long start, unsigned long end)
762{
763	pmd_t *pmd = pmd_offset(pud, start);
764
765	/*
766	 * Not on a 2MB page boundary?
767	 */
768	if (start & (PMD_SIZE - 1)) {
769		unsigned long next_page = (start + PMD_SIZE) & PMD_MASK;
770		unsigned long pre_end = min_t(unsigned long, end, next_page);
771
772		__unmap_pmd_range(pud, pmd, start, pre_end);
773
774		start = pre_end;
775		pmd++;
776	}
777
778	/*
779	 * Try to unmap in 2M chunks.
780	 */
781	while (end - start >= PMD_SIZE) {
782		if (pmd_large(*pmd))
783			pmd_clear(pmd);
784		else
785			__unmap_pmd_range(pud, pmd, start, start + PMD_SIZE);
786
787		start += PMD_SIZE;
788		pmd++;
789	}
790
791	/*
792	 * 4K leftovers?
793	 */
794	if (start < end)
795		return __unmap_pmd_range(pud, pmd, start, end);
796
797	/*
	 * Try again to free the PMD page if we haven't succeeded above.
799	 */
800	if (!pud_none(*pud))
801		if (try_to_free_pmd_page((pmd_t *)pud_page_vaddr(*pud)))
802			pud_clear(pud);
803}
804
805static void unmap_pud_range(pgd_t *pgd, unsigned long start, unsigned long end)
806{
807	pud_t *pud = pud_offset(pgd, start);
808
809	/*
810	 * Not on a GB page boundary?
811	 */
812	if (start & (PUD_SIZE - 1)) {
813		unsigned long next_page = (start + PUD_SIZE) & PUD_MASK;
814		unsigned long pre_end	= min_t(unsigned long, end, next_page);
815
816		unmap_pmd_range(pud, start, pre_end);
817
818		start = pre_end;
819		pud++;
820	}
821
822	/*
	 * Try to unmap in 1G chunks.
824	 */
825	while (end - start >= PUD_SIZE) {
826
827		if (pud_large(*pud))
828			pud_clear(pud);
829		else
830			unmap_pmd_range(pud, start, start + PUD_SIZE);
831
832		start += PUD_SIZE;
833		pud++;
834	}
835
836	/*
837	 * 2M leftovers?
838	 */
839	if (start < end)
840		unmap_pmd_range(pud, start, end);
841
842	/*
843	 * No need to try to free the PUD page because we'll free it in
	 * populate_pgd()'s error path.
845	 */
846}
847
848static void unmap_pgd_range(pgd_t *root, unsigned long addr, unsigned long end)
849{
850	pgd_t *pgd_entry = root + pgd_index(addr);
851
852	unmap_pud_range(pgd_entry, addr, end);
853
854	if (try_to_free_pud_page((pud_t *)pgd_page_vaddr(*pgd_entry)))
855		pgd_clear(pgd_entry);
856}
857
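/*
 * The allocation and populate helpers below build mappings in an alternate
 * PGD on behalf of populate_pgd(), allocating intermediate page-table pages
 * as needed.
 */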
858static int alloc_pte_page(pmd_t *pmd)
859{
860	pte_t *pte = (pte_t *)get_zeroed_page(GFP_KERNEL | __GFP_NOTRACK);
861	if (!pte)
862		return -1;
863
864	set_pmd(pmd, __pmd(__pa(pte) | _KERNPG_TABLE));
865	return 0;
866}
867
868static int alloc_pmd_page(pud_t *pud)
869{
870	pmd_t *pmd = (pmd_t *)get_zeroed_page(GFP_KERNEL | __GFP_NOTRACK);
871	if (!pmd)
872		return -1;
873
874	set_pud(pud, __pud(__pa(pmd) | _KERNPG_TABLE));
875	return 0;
876}
877
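/*
 * Note that throughout the populate_pte/pmd/pud() path cpa->pfn actually
 * carries a physical address rather than a page frame number: it is shifted
 * down by PAGE_SHIFT before being put into a PTE and is advanced by
 * PAGE_SIZE, PMD_SIZE or PUD_SIZE per entry.
 */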
878static void populate_pte(struct cpa_data *cpa,
879			 unsigned long start, unsigned long end,
880			 unsigned num_pages, pmd_t *pmd, pgprot_t pgprot)
881{
882	pte_t *pte;
883
884	pte = pte_offset_kernel(pmd, start);
885
886	while (num_pages-- && start < end) {
887
888		/* deal with the NX bit */
889		if (!(pgprot_val(pgprot) & _PAGE_NX))
890			cpa->pfn &= ~_PAGE_NX;
891
892		set_pte(pte, pfn_pte(cpa->pfn >> PAGE_SHIFT, pgprot));
893
894		start	 += PAGE_SIZE;
895		cpa->pfn += PAGE_SIZE;
896		pte++;
897	}
898}
899
900static int populate_pmd(struct cpa_data *cpa,
901			unsigned long start, unsigned long end,
902			unsigned num_pages, pud_t *pud, pgprot_t pgprot)
903{
904	unsigned int cur_pages = 0;
905	pmd_t *pmd;
906	pgprot_t pmd_pgprot;
907
908	/*
909	 * Not on a 2M boundary?
910	 */
911	if (start & (PMD_SIZE - 1)) {
912		unsigned long pre_end = start + (num_pages << PAGE_SHIFT);
913		unsigned long next_page = (start + PMD_SIZE) & PMD_MASK;
914
915		pre_end   = min_t(unsigned long, pre_end, next_page);
916		cur_pages = (pre_end - start) >> PAGE_SHIFT;
917		cur_pages = min_t(unsigned int, num_pages, cur_pages);
918
919		/*
920		 * Need a PTE page?
921		 */
922		pmd = pmd_offset(pud, start);
923		if (pmd_none(*pmd))
924			if (alloc_pte_page(pmd))
925				return -1;
926
927		populate_pte(cpa, start, pre_end, cur_pages, pmd, pgprot);
928
929		start = pre_end;
930	}
931
932	/*
933	 * We mapped them all?
934	 */
935	if (num_pages == cur_pages)
936		return cur_pages;
937
938	pmd_pgprot = pgprot_4k_2_large(pgprot);
939
940	while (end - start >= PMD_SIZE) {
941
942		/*
943		 * We cannot use a 1G page so allocate a PMD page if needed.
944		 */
945		if (pud_none(*pud))
946			if (alloc_pmd_page(pud))
947				return -1;
948
949		pmd = pmd_offset(pud, start);
950
951		set_pmd(pmd, __pmd(cpa->pfn | _PAGE_PSE |
952				   massage_pgprot(pmd_pgprot)));
953
954		start	  += PMD_SIZE;
955		cpa->pfn  += PMD_SIZE;
956		cur_pages += PMD_SIZE >> PAGE_SHIFT;
957	}
958
959	/*
960	 * Map trailing 4K pages.
961	 */
962	if (start < end) {
963		pmd = pmd_offset(pud, start);
964		if (pmd_none(*pmd))
965			if (alloc_pte_page(pmd))
966				return -1;
967
968		populate_pte(cpa, start, end, num_pages - cur_pages,
969			     pmd, pgprot);
970	}
971	return num_pages;
972}
973
974static int populate_pud(struct cpa_data *cpa, unsigned long start, pgd_t *pgd,
975			pgprot_t pgprot)
976{
977	pud_t *pud;
978	unsigned long end;
979	int cur_pages = 0;
980	pgprot_t pud_pgprot;
981
982	end = start + (cpa->numpages << PAGE_SHIFT);
983
984	/*
	 * Not on a 1G page boundary? => map everything up to it with
986	 * smaller pages.
987	 */
988	if (start & (PUD_SIZE - 1)) {
989		unsigned long pre_end;
990		unsigned long next_page = (start + PUD_SIZE) & PUD_MASK;
991
992		pre_end   = min_t(unsigned long, end, next_page);
993		cur_pages = (pre_end - start) >> PAGE_SHIFT;
994		cur_pages = min_t(int, (int)cpa->numpages, cur_pages);
995
996		pud = pud_offset(pgd, start);
997
998		/*
999		 * Need a PMD page?
1000		 */
1001		if (pud_none(*pud))
1002			if (alloc_pmd_page(pud))
1003				return -1;
1004
1005		cur_pages = populate_pmd(cpa, start, pre_end, cur_pages,
1006					 pud, pgprot);
1007		if (cur_pages < 0)
1008			return cur_pages;
1009
1010		start = pre_end;
1011	}
1012
1013	/* We mapped them all? */
1014	if (cpa->numpages == cur_pages)
1015		return cur_pages;
1016
1017	pud = pud_offset(pgd, start);
1018	pud_pgprot = pgprot_4k_2_large(pgprot);
1019
1020	/*
	 * Map everything starting from the 1G boundary, possibly with 1G pages
1022	 */
1023	while (end - start >= PUD_SIZE) {
1024		set_pud(pud, __pud(cpa->pfn | _PAGE_PSE |
1025				   massage_pgprot(pud_pgprot)));
1026
1027		start	  += PUD_SIZE;
1028		cpa->pfn  += PUD_SIZE;
1029		cur_pages += PUD_SIZE >> PAGE_SHIFT;
1030		pud++;
1031	}
1032
1033	/* Map trailing leftover */
1034	if (start < end) {
1035		int tmp;
1036
1037		pud = pud_offset(pgd, start);
1038		if (pud_none(*pud))
1039			if (alloc_pmd_page(pud))
1040				return -1;
1041
1042		tmp = populate_pmd(cpa, start, end, cpa->numpages - cur_pages,
1043				   pud, pgprot);
1044		if (tmp < 0)
1045			return cur_pages;
1046
1047		cur_pages += tmp;
1048	}
1049	return cur_pages;
1050}
1051
1052/*
 * Restrictions for the kernel page table do not necessarily apply when mapping
1054 * an alternate PGD.
1055 */
1056static int populate_pgd(struct cpa_data *cpa, unsigned long addr)
1057{
1058	pgprot_t pgprot = __pgprot(_KERNPG_TABLE);
1059	pud_t *pud = NULL;	/* shut up gcc */
1060	pgd_t *pgd_entry;
1061	int ret;
1062
1063	pgd_entry = cpa->pgd + pgd_index(addr);
1064
1065	/*
1066	 * Allocate a PUD page and hand it down for mapping.
1067	 */
1068	if (pgd_none(*pgd_entry)) {
1069		pud = (pud_t *)get_zeroed_page(GFP_KERNEL | __GFP_NOTRACK);
1070		if (!pud)
1071			return -1;
1072
1073		set_pgd(pgd_entry, __pgd(__pa(pud) | _KERNPG_TABLE));
1074	}
1075
1076	pgprot_val(pgprot) &= ~pgprot_val(cpa->mask_clr);
1077	pgprot_val(pgprot) |=  pgprot_val(cpa->mask_set);
1078
1079	ret = populate_pud(cpa, addr, pgd_entry, pgprot);
1080	if (ret < 0) {
1081		unmap_pgd_range(cpa->pgd, addr,
1082				addr + (cpa->numpages << PAGE_SHIFT));
1083		return ret;
1084	}
1085
1086	cpa->numpages = ret;
1087	return 0;
1088}
1089
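/*
 * Called when no (present) PTE was found for the address: populate the
 * alternate PGD if one was supplied, silently skip known holes in the
 * kernel identity mapping, and warn about anything else.
 */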
1090static int __cpa_process_fault(struct cpa_data *cpa, unsigned long vaddr,
1091			       int primary)
1092{
1093	if (cpa->pgd)
1094		return populate_pgd(cpa, vaddr);
1095
1096	/*
1097	 * Ignore all non primary paths.
1098	 */
1099	if (!primary)
1100		return 0;
1101
1102	/*
1103	 * Ignore the NULL PTE for kernel identity mapping, as it is expected
1104	 * to have holes.
	 * Also set numpages to '1', indicating that we processed the CPA
	 * request for one virtual address page and its pfn. TBD: numpages
	 * can be set based on the initial value and the level returned by
	 * lookup_address().
1108	 */
1109	if (within(vaddr, PAGE_OFFSET,
1110		   PAGE_OFFSET + (max_pfn_mapped << PAGE_SHIFT))) {
1111		cpa->numpages = 1;
1112		cpa->pfn = __pa(vaddr) >> PAGE_SHIFT;
1113		return 0;
1114	} else {
1115		WARN(1, KERN_WARNING "CPA: called for zero pte. "
1116			"vaddr = %lx cpa->vaddr = %lx\n", vaddr,
1117			*cpa->vaddr);
1118
1119		return -EFAULT;
1120	}
1121}
1122
1123static int __change_page_attr(struct cpa_data *cpa, int primary)
1124{
1125	unsigned long address;
1126	int do_split, err;
1127	unsigned int level;
1128	pte_t *kpte, old_pte;
1129
1130	if (cpa->flags & CPA_PAGES_ARRAY) {
1131		struct page *page = cpa->pages[cpa->curpage];
1132		if (unlikely(PageHighMem(page)))
1133			return 0;
1134		address = (unsigned long)page_address(page);
1135	} else if (cpa->flags & CPA_ARRAY)
1136		address = cpa->vaddr[cpa->curpage];
1137	else
1138		address = *cpa->vaddr;
1139repeat:
1140	kpte = _lookup_address_cpa(cpa, address, &level);
1141	if (!kpte)
1142		return __cpa_process_fault(cpa, address, primary);
1143
1144	old_pte = *kpte;
1145	if (!pte_val(old_pte))
1146		return __cpa_process_fault(cpa, address, primary);
1147
1148	if (level == PG_LEVEL_4K) {
1149		pte_t new_pte;
1150		pgprot_t new_prot = pte_pgprot(old_pte);
1151		unsigned long pfn = pte_pfn(old_pte);
1152
1153		pgprot_val(new_prot) &= ~pgprot_val(cpa->mask_clr);
1154		pgprot_val(new_prot) |= pgprot_val(cpa->mask_set);
1155
1156		new_prot = static_protections(new_prot, address, pfn);
1157
1158		/*
1159		 * Set the GLOBAL flags only if the PRESENT flag is
1160		 * set otherwise pte_present will return true even on
1161		 * a non present pte. The canon_pgprot will clear
1162		 * _PAGE_GLOBAL for the ancient hardware that doesn't
1163		 * support it.
1164		 */
1165		if (pgprot_val(new_prot) & _PAGE_PRESENT)
1166			pgprot_val(new_prot) |= _PAGE_GLOBAL;
1167		else
1168			pgprot_val(new_prot) &= ~_PAGE_GLOBAL;
1169
1170		/*
1171		 * We need to keep the pfn from the existing PTE,
		 * after all we're only going to change its attributes,
		 * not the memory it points to.
1174		 */
1175		new_pte = pfn_pte(pfn, canon_pgprot(new_prot));
1176		cpa->pfn = pfn;
1177		/*
		 * Do we really change anything?
1179		 */
1180		if (pte_val(old_pte) != pte_val(new_pte)) {
1181			set_pte_atomic(kpte, new_pte);
1182			cpa->flags |= CPA_FLUSHTLB;
1183		}
1184		cpa->numpages = 1;
1185		return 0;
1186	}
1187
1188	/*
1189	 * Check, whether we can keep the large page intact
1190	 * and just change the pte:
1191	 */
1192	do_split = try_preserve_large_page(kpte, address, cpa);
1193	/*
1194	 * When the range fits into the existing large page,
	 * return. cpa->numpages and the CPA_FLUSHTLB flag have been
	 * updated in try_preserve_large_page():
1197	 */
1198	if (do_split <= 0)
1199		return do_split;
1200
1201	/*
1202	 * We have to split the large page:
1203	 */
1204	err = split_large_page(cpa, kpte, address);
1205	if (!err) {
		/*
		 * Do a global flush tlb after splitting the large page
		 * and before we do the actual change page attribute in the PTE.
		 *
		 * Without this, we violate the TLB application note, which
		 * says "The TLBs may contain both ordinary and large-page
		 *  translations for a 4-KByte range of linear addresses. This
		 *  may occur if software modifies the paging structures so that
		 *  the page size used for the address range changes. If the two
		 *  translations differ with respect to page frame or attributes
		 *  (e.g., permissions), processor behavior is undefined and may
		 *  be implementation-specific."
		 *
		 * We do this global tlb flush inside the cpa_lock, so that we
		 * don't allow any other cpu, with stale tlb entries, to change
		 * the page attributes in parallel for pages that also fall
		 * into the just-split large page entry.
		 */
1224		flush_tlb_all();
1225		goto repeat;
1226	}
1227
1228	return err;
1229}
1230
1231static int __change_page_attr_set_clr(struct cpa_data *cpa, int checkalias);
1232
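/*
 * cpa_process_alias - also apply the change to the other mappings of the
 * same physical range: the low (direct) mapping and, on 64-bit, the high
 * kernel mapping.
 */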
1233static int cpa_process_alias(struct cpa_data *cpa)
1234{
1235	struct cpa_data alias_cpa;
1236	unsigned long laddr = (unsigned long)__va(cpa->pfn << PAGE_SHIFT);
1237	unsigned long vaddr;
1238	int ret;
1239
1240	if (!pfn_range_is_mapped(cpa->pfn, cpa->pfn + 1))
1241		return 0;
1242
1243	/*
1244	 * No need to redo, when the primary call touched the direct
1245	 * mapping already:
1246	 */
1247	if (cpa->flags & CPA_PAGES_ARRAY) {
1248		struct page *page = cpa->pages[cpa->curpage];
1249		if (unlikely(PageHighMem(page)))
1250			return 0;
1251		vaddr = (unsigned long)page_address(page);
1252	} else if (cpa->flags & CPA_ARRAY)
1253		vaddr = cpa->vaddr[cpa->curpage];
1254	else
1255		vaddr = *cpa->vaddr;
1256
1257	if (!(within(vaddr, PAGE_OFFSET,
1258		    PAGE_OFFSET + (max_pfn_mapped << PAGE_SHIFT)))) {
1259
1260		alias_cpa = *cpa;
1261		alias_cpa.vaddr = &laddr;
1262		alias_cpa.flags &= ~(CPA_PAGES_ARRAY | CPA_ARRAY);
1263
1264		ret = __change_page_attr_set_clr(&alias_cpa, 0);
1265		if (ret)
1266			return ret;
1267	}
1268
1269#ifdef CONFIG_X86_64
1270	/*
1271	 * If the primary call didn't touch the high mapping already
1272	 * and the physical address is inside the kernel map, we need
1273	 * to touch the high mapped kernel as well:
1274	 */
1275	if (!within(vaddr, (unsigned long)_text, _brk_end) &&
1276	    within(cpa->pfn, highmap_start_pfn(), highmap_end_pfn())) {
1277		unsigned long temp_cpa_vaddr = (cpa->pfn << PAGE_SHIFT) +
1278					       __START_KERNEL_map - phys_base;
1279		alias_cpa = *cpa;
1280		alias_cpa.vaddr = &temp_cpa_vaddr;
1281		alias_cpa.flags &= ~(CPA_PAGES_ARRAY | CPA_ARRAY);
1282
1283		/*
1284		 * The high mapping range is imprecise, so ignore the
1285		 * return value.
1286		 */
1287		__change_page_attr_set_clr(&alias_cpa, 0);
1288	}
1289#endif
1290
1291	return 0;
1292}
1293
1294static int __change_page_attr_set_clr(struct cpa_data *cpa, int checkalias)
1295{
1296	int ret, numpages = cpa->numpages;
1297
1298	while (numpages) {
1299		/*
1300		 * Store the remaining nr of pages for the large page
1301		 * preservation check.
1302		 */
1303		cpa->numpages = numpages;
1304		/* for array changes, we can't use large page */
1305		if (cpa->flags & (CPA_ARRAY | CPA_PAGES_ARRAY))
1306			cpa->numpages = 1;
1307
1308		if (!debug_pagealloc)
1309			spin_lock(&cpa_lock);
1310		ret = __change_page_attr(cpa, checkalias);
1311		if (!debug_pagealloc)
1312			spin_unlock(&cpa_lock);
1313		if (ret)
1314			return ret;
1315
1316		if (checkalias) {
1317			ret = cpa_process_alias(cpa);
1318			if (ret)
1319				return ret;
1320		}
1321
1322		/*
1323		 * Adjust the number of pages with the result of the
1324		 * CPA operation. Either a large page has been
1325		 * preserved or a single page update happened.
1326		 */
1327		BUG_ON(cpa->numpages > numpages || !cpa->numpages);
1328		numpages -= cpa->numpages;
1329		if (cpa->flags & (CPA_PAGES_ARRAY | CPA_ARRAY))
1330			cpa->curpage++;
1331		else
1332			*cpa->vaddr += cpa->numpages * PAGE_SIZE;
1333
1334	}
1335	return 0;
1336}
1337
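/*
 * change_page_attr_set_clr - central worker for the set_memory_*() and
 * set_pages_*() interfaces: set the bits in @mask_set and clear the bits in
 * @mask_clr for @numpages pages starting at *@addr (or for the addresses /
 * pages given via CPA_ARRAY / CPA_PAGES_ARRAY), then do the required TLB
 * and cache flushing.
 */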
1338static int change_page_attr_set_clr(unsigned long *addr, int numpages,
1339				    pgprot_t mask_set, pgprot_t mask_clr,
1340				    int force_split, int in_flag,
1341				    struct page **pages)
1342{
1343	struct cpa_data cpa;
1344	int ret, cache, checkalias;
1345	unsigned long baddr = 0;
1346
1347	memset(&cpa, 0, sizeof(cpa));
1348
1349	/*
	 * Check if we are requested to change a feature that is not
	 * supported:
1352	 */
1353	mask_set = canon_pgprot(mask_set);
1354	mask_clr = canon_pgprot(mask_clr);
1355	if (!pgprot_val(mask_set) && !pgprot_val(mask_clr) && !force_split)
1356		return 0;
1357
1358	/* Ensure we are PAGE_SIZE aligned */
1359	if (in_flag & CPA_ARRAY) {
1360		int i;
1361		for (i = 0; i < numpages; i++) {
1362			if (addr[i] & ~PAGE_MASK) {
1363				addr[i] &= PAGE_MASK;
1364				WARN_ON_ONCE(1);
1365			}
1366		}
1367	} else if (!(in_flag & CPA_PAGES_ARRAY)) {
1368		/*
		 * in_flag of CPA_PAGES_ARRAY implies it is aligned.
		 * No need to check in that case.
1371		 */
1372		if (*addr & ~PAGE_MASK) {
1373			*addr &= PAGE_MASK;
1374			/*
1375			 * People should not be passing in unaligned addresses:
1376			 */
1377			WARN_ON_ONCE(1);
1378		}
1379		/*
1380		 * Save address for cache flush. *addr is modified in the call
1381		 * to __change_page_attr_set_clr() below.
1382		 */
1383		baddr = *addr;
1384	}
1385
1386	/* Must avoid aliasing mappings in the highmem code */
1387	kmap_flush_unused();
1388
1389	vm_unmap_aliases();
1390
1391	cpa.vaddr = addr;
1392	cpa.pages = pages;
1393	cpa.numpages = numpages;
1394	cpa.mask_set = mask_set;
1395	cpa.mask_clr = mask_clr;
1396	cpa.flags = 0;
1397	cpa.curpage = 0;
1398	cpa.force_split = force_split;
1399
1400	if (in_flag & (CPA_ARRAY | CPA_PAGES_ARRAY))
1401		cpa.flags |= in_flag;
1402
1403	/* No alias checking for _NX bit modifications */
1404	checkalias = (pgprot_val(mask_set) | pgprot_val(mask_clr)) != _PAGE_NX;
1405
1406	ret = __change_page_attr_set_clr(&cpa, checkalias);
1407
1408	/*
1409	 * Check whether we really changed something:
1410	 */
1411	if (!(cpa.flags & CPA_FLUSHTLB))
1412		goto out;
1413
1414	/*
	 * No need to flush when we did not set any of the caching
1416	 * attributes:
1417	 */
1418	cache = !!pgprot2cachemode(mask_set);
1419
1420	/*
	 * On success we use CLFLUSH, when the CPU supports it, to
	 * avoid the WBINVD. If the CPU does not support it, or in the
	 * error case, we fall back to cpa_flush_all() (which uses
	 * WBINVD):
1425	 */
1426	if (!ret && cpu_has_clflush) {
1427		if (cpa.flags & (CPA_PAGES_ARRAY | CPA_ARRAY)) {
1428			cpa_flush_array(addr, numpages, cache,
1429					cpa.flags, pages);
1430		} else
1431			cpa_flush_range(baddr, numpages, cache);
1432	} else
1433		cpa_flush_all(cache);
1434
1435out:
1436	return ret;
1437}
1438
1439static inline int change_page_attr_set(unsigned long *addr, int numpages,
1440				       pgprot_t mask, int array)
1441{
1442	return change_page_attr_set_clr(addr, numpages, mask, __pgprot(0), 0,
1443		(array ? CPA_ARRAY : 0), NULL);
1444}
1445
1446static inline int change_page_attr_clear(unsigned long *addr, int numpages,
1447					 pgprot_t mask, int array)
1448{
1449	return change_page_attr_set_clr(addr, numpages, __pgprot(0), mask, 0,
1450		(array ? CPA_ARRAY : 0), NULL);
1451}
1452
1453static inline int cpa_set_pages_array(struct page **pages, int numpages,
1454				       pgprot_t mask)
1455{
1456	return change_page_attr_set_clr(NULL, numpages, mask, __pgprot(0), 0,
1457		CPA_PAGES_ARRAY, pages);
1458}
1459
1460static inline int cpa_clear_pages_array(struct page **pages, int numpages,
1461					 pgprot_t mask)
1462{
1463	return change_page_attr_set_clr(NULL, numpages, __pgprot(0), mask, 0,
1464		CPA_PAGES_ARRAY, pages);
1465}
1466
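/*
 * Typical (illustrative only) use of the set_memory_*() API from code that
 * wants a few pages of normal RAM to be uncached, where addr is a
 * page-aligned kernel virtual address in the direct mapping and nrpages is
 * a placeholder page count:
 *
 *	if (set_memory_uc(addr, nrpages))
 *		goto err;
 *	...
 *	set_memory_wb(addr, nrpages);
 */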
1467int _set_memory_uc(unsigned long addr, int numpages)
1468{
1469	/*
	 * For now use UC MINUS; see the comments in ioremap_nocache().
1471	 */
1472	return change_page_attr_set(&addr, numpages,
1473				    cachemode2pgprot(_PAGE_CACHE_MODE_UC_MINUS),
1474				    0);
1475}
1476
1477int set_memory_uc(unsigned long addr, int numpages)
1478{
1479	int ret;
1480
1481	/*
	 * For now use UC MINUS; see the comments in ioremap_nocache().
1483	 */
1484	ret = reserve_memtype(__pa(addr), __pa(addr) + numpages * PAGE_SIZE,
1485			      _PAGE_CACHE_MODE_UC_MINUS, NULL);
1486	if (ret)
1487		goto out_err;
1488
1489	ret = _set_memory_uc(addr, numpages);
1490	if (ret)
1491		goto out_free;
1492
1493	return 0;
1494
1495out_free:
1496	free_memtype(__pa(addr), __pa(addr) + numpages * PAGE_SIZE);
1497out_err:
1498	return ret;
1499}
1500EXPORT_SYMBOL(set_memory_uc);
1501
1502static int _set_memory_array(unsigned long *addr, int addrinarray,
1503		enum page_cache_mode new_type)
1504{
1505	int i, j;
1506	int ret;
1507
1508	/*
	 * For now use UC MINUS; see the comments in ioremap_nocache().
1510	 */
1511	for (i = 0; i < addrinarray; i++) {
1512		ret = reserve_memtype(__pa(addr[i]), __pa(addr[i]) + PAGE_SIZE,
1513					new_type, NULL);
1514		if (ret)
1515			goto out_free;
1516	}
1517
1518	ret = change_page_attr_set(addr, addrinarray,
1519				   cachemode2pgprot(_PAGE_CACHE_MODE_UC_MINUS),
1520				   1);
1521
1522	if (!ret && new_type == _PAGE_CACHE_MODE_WC)
1523		ret = change_page_attr_set_clr(addr, addrinarray,
1524					       cachemode2pgprot(
1525						_PAGE_CACHE_MODE_WC),
1526					       __pgprot(_PAGE_CACHE_MASK),
1527					       0, CPA_ARRAY, NULL);
1528	if (ret)
1529		goto out_free;
1530
1531	return 0;
1532
1533out_free:
1534	for (j = 0; j < i; j++)
1535		free_memtype(__pa(addr[j]), __pa(addr[j]) + PAGE_SIZE);
1536
1537	return ret;
1538}
1539
1540int set_memory_array_uc(unsigned long *addr, int addrinarray)
1541{
1542	return _set_memory_array(addr, addrinarray, _PAGE_CACHE_MODE_UC_MINUS);
1543}
1544EXPORT_SYMBOL(set_memory_array_uc);
1545
1546int set_memory_array_wc(unsigned long *addr, int addrinarray)
1547{
1548	return _set_memory_array(addr, addrinarray, _PAGE_CACHE_MODE_WC);
1549}
1550EXPORT_SYMBOL(set_memory_array_wc);
1551
1552int _set_memory_wc(unsigned long addr, int numpages)
1553{
1554	int ret;
1555	unsigned long addr_copy = addr;
1556
1557	ret = change_page_attr_set(&addr, numpages,
1558				   cachemode2pgprot(_PAGE_CACHE_MODE_UC_MINUS),
1559				   0);
1560	if (!ret) {
1561		ret = change_page_attr_set_clr(&addr_copy, numpages,
1562					       cachemode2pgprot(
1563						_PAGE_CACHE_MODE_WC),
1564					       __pgprot(_PAGE_CACHE_MASK),
1565					       0, 0, NULL);
1566	}
1567	return ret;
1568}
1569
1570int set_memory_wc(unsigned long addr, int numpages)
1571{
1572	int ret;
1573
1574	if (!pat_enabled)
1575		return set_memory_uc(addr, numpages);
1576
1577	ret = reserve_memtype(__pa(addr), __pa(addr) + numpages * PAGE_SIZE,
1578		_PAGE_CACHE_MODE_WC, NULL);
1579	if (ret)
1580		goto out_err;
1581
1582	ret = _set_memory_wc(addr, numpages);
1583	if (ret)
1584		goto out_free;
1585
1586	return 0;
1587
1588out_free:
1589	free_memtype(__pa(addr), __pa(addr) + numpages * PAGE_SIZE);
1590out_err:
1591	return ret;
1592}
1593EXPORT_SYMBOL(set_memory_wc);
1594
1595int _set_memory_wb(unsigned long addr, int numpages)
1596{
1597	/* WB cache mode is hard wired to all cache attribute bits being 0 */
1598	return change_page_attr_clear(&addr, numpages,
1599				      __pgprot(_PAGE_CACHE_MASK), 0);
1600}
1601
1602int set_memory_wb(unsigned long addr, int numpages)
1603{
1604	int ret;
1605
1606	ret = _set_memory_wb(addr, numpages);
1607	if (ret)
1608		return ret;
1609
1610	free_memtype(__pa(addr), __pa(addr) + numpages * PAGE_SIZE);
1611	return 0;
1612}
1613EXPORT_SYMBOL(set_memory_wb);
1614
1615int set_memory_array_wb(unsigned long *addr, int addrinarray)
1616{
1617	int i;
1618	int ret;
1619
1620	/* WB cache mode is hard wired to all cache attribute bits being 0 */
1621	ret = change_page_attr_clear(addr, addrinarray,
1622				      __pgprot(_PAGE_CACHE_MASK), 1);
1623	if (ret)
1624		return ret;
1625
1626	for (i = 0; i < addrinarray; i++)
1627		free_memtype(__pa(addr[i]), __pa(addr[i]) + PAGE_SIZE);
1628
1629	return 0;
1630}
1631EXPORT_SYMBOL(set_memory_array_wb);
1632
1633int set_memory_x(unsigned long addr, int numpages)
1634{
1635	if (!(__supported_pte_mask & _PAGE_NX))
1636		return 0;
1637
1638	return change_page_attr_clear(&addr, numpages, __pgprot(_PAGE_NX), 0);
1639}
1640EXPORT_SYMBOL(set_memory_x);
1641
1642int set_memory_nx(unsigned long addr, int numpages)
1643{
1644	if (!(__supported_pte_mask & _PAGE_NX))
1645		return 0;
1646
1647	return change_page_attr_set(&addr, numpages, __pgprot(_PAGE_NX), 0);
1648}
1649EXPORT_SYMBOL(set_memory_nx);
1650
1651int set_memory_ro(unsigned long addr, int numpages)
1652{
1653	return change_page_attr_clear(&addr, numpages, __pgprot(_PAGE_RW), 0);
1654}
1655
1656int set_memory_rw(unsigned long addr, int numpages)
1657{
1658	return change_page_attr_set(&addr, numpages, __pgprot(_PAGE_RW), 0);
1659}
1660
1661int set_memory_np(unsigned long addr, int numpages)
1662{
1663	return change_page_attr_clear(&addr, numpages, __pgprot(_PAGE_PRESENT), 0);
1664}
1665
1666int set_memory_4k(unsigned long addr, int numpages)
1667{
1668	return change_page_attr_set_clr(&addr, numpages, __pgprot(0),
1669					__pgprot(0), 1, 0, NULL);
1670}
1671
1672int set_pages_uc(struct page *page, int numpages)
1673{
1674	unsigned long addr = (unsigned long)page_address(page);
1675
1676	return set_memory_uc(addr, numpages);
1677}
1678EXPORT_SYMBOL(set_pages_uc);
1679
1680static int _set_pages_array(struct page **pages, int addrinarray,
1681		enum page_cache_mode new_type)
1682{
1683	unsigned long start;
1684	unsigned long end;
1685	int i;
1686	int free_idx;
1687	int ret;
1688
1689	for (i = 0; i < addrinarray; i++) {
1690		if (PageHighMem(pages[i]))
1691			continue;
1692		start = page_to_pfn(pages[i]) << PAGE_SHIFT;
1693		end = start + PAGE_SIZE;
1694		if (reserve_memtype(start, end, new_type, NULL))
1695			goto err_out;
1696	}
1697
1698	ret = cpa_set_pages_array(pages, addrinarray,
1699			cachemode2pgprot(_PAGE_CACHE_MODE_UC_MINUS));
1700	if (!ret && new_type == _PAGE_CACHE_MODE_WC)
1701		ret = change_page_attr_set_clr(NULL, addrinarray,
1702					       cachemode2pgprot(
1703						_PAGE_CACHE_MODE_WC),
1704					       __pgprot(_PAGE_CACHE_MASK),
1705					       0, CPA_PAGES_ARRAY, pages);
1706	if (ret)
1707		goto err_out;
1708	return 0; /* Success */
1709err_out:
1710	free_idx = i;
1711	for (i = 0; i < free_idx; i++) {
1712		if (PageHighMem(pages[i]))
1713			continue;
1714		start = page_to_pfn(pages[i]) << PAGE_SHIFT;
1715		end = start + PAGE_SIZE;
1716		free_memtype(start, end);
1717	}
1718	return -EINVAL;
1719}
1720
1721int set_pages_array_uc(struct page **pages, int addrinarray)
1722{
1723	return _set_pages_array(pages, addrinarray, _PAGE_CACHE_MODE_UC_MINUS);
1724}
1725EXPORT_SYMBOL(set_pages_array_uc);
1726
1727int set_pages_array_wc(struct page **pages, int addrinarray)
1728{
1729	return _set_pages_array(pages, addrinarray, _PAGE_CACHE_MODE_WC);
1730}
1731EXPORT_SYMBOL(set_pages_array_wc);
1732
1733int set_pages_wb(struct page *page, int numpages)
1734{
1735	unsigned long addr = (unsigned long)page_address(page);
1736
1737	return set_memory_wb(addr, numpages);
1738}
1739EXPORT_SYMBOL(set_pages_wb);
1740
1741int set_pages_array_wb(struct page **pages, int addrinarray)
1742{
1743	int retval;
1744	unsigned long start;
1745	unsigned long end;
1746	int i;
1747
1748	/* WB cache mode is hard wired to all cache attribute bits being 0 */
1749	retval = cpa_clear_pages_array(pages, addrinarray,
1750			__pgprot(_PAGE_CACHE_MASK));
1751	if (retval)
1752		return retval;
1753
1754	for (i = 0; i < addrinarray; i++) {
1755		if (PageHighMem(pages[i]))
1756			continue;
1757		start = page_to_pfn(pages[i]) << PAGE_SHIFT;
1758		end = start + PAGE_SIZE;
1759		free_memtype(start, end);
1760	}
1761
1762	return 0;
1763}
1764EXPORT_SYMBOL(set_pages_array_wb);
1765
1766int set_pages_x(struct page *page, int numpages)
1767{
1768	unsigned long addr = (unsigned long)page_address(page);
1769
1770	return set_memory_x(addr, numpages);
1771}
1772EXPORT_SYMBOL(set_pages_x);
1773
1774int set_pages_nx(struct page *page, int numpages)
1775{
1776	unsigned long addr = (unsigned long)page_address(page);
1777
1778	return set_memory_nx(addr, numpages);
1779}
1780EXPORT_SYMBOL(set_pages_nx);
1781
1782int set_pages_ro(struct page *page, int numpages)
1783{
1784	unsigned long addr = (unsigned long)page_address(page);
1785
1786	return set_memory_ro(addr, numpages);
1787}
1788
1789int set_pages_rw(struct page *page, int numpages)
1790{
1791	unsigned long addr = (unsigned long)page_address(page);
1792
1793	return set_memory_rw(addr, numpages);
1794}
1795
1796#ifdef CONFIG_DEBUG_PAGEALLOC
1797
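/*
 * With CONFIG_DEBUG_PAGEALLOC the page allocator maps and unmaps pages in
 * the kernel identity mapping as they are allocated and freed, so that
 * stray accesses to freed memory fault immediately.
 */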
1798static int __set_pages_p(struct page *page, int numpages)
1799{
1800	unsigned long tempaddr = (unsigned long) page_address(page);
1801	struct cpa_data cpa = { .vaddr = &tempaddr,
1802				.pgd = NULL,
1803				.numpages = numpages,
1804				.mask_set = __pgprot(_PAGE_PRESENT | _PAGE_RW),
1805				.mask_clr = __pgprot(0),
1806				.flags = 0};
1807
1808	/*
	 * No alias checking needed for setting the present flag. Otherwise,
1810	 * we may need to break large pages for 64-bit kernel text
1811	 * mappings (this adds to complexity if we want to do this from
1812	 * atomic context especially). Let's keep it simple!
1813	 */
1814	return __change_page_attr_set_clr(&cpa, 0);
1815}
1816
1817static int __set_pages_np(struct page *page, int numpages)
1818{
1819	unsigned long tempaddr = (unsigned long) page_address(page);
1820	struct cpa_data cpa = { .vaddr = &tempaddr,
1821				.pgd = NULL,
1822				.numpages = numpages,
1823				.mask_set = __pgprot(0),
1824				.mask_clr = __pgprot(_PAGE_PRESENT | _PAGE_RW),
1825				.flags = 0};
1826
1827	/*
	 * No alias checking needed for setting the not-present flag. Otherwise,
1829	 * we may need to break large pages for 64-bit kernel text
1830	 * mappings (this adds to complexity if we want to do this from
1831	 * atomic context especially). Let's keep it simple!
1832	 */
1833	return __change_page_attr_set_clr(&cpa, 0);
1834}
1835
1836void __kernel_map_pages(struct page *page, int numpages, int enable)
1837{
1838	if (PageHighMem(page))
1839		return;
1840	if (!enable) {
1841		debug_check_no_locks_freed(page_address(page),
1842					   numpages * PAGE_SIZE);
1843	}
1844
1845	/*
1846	 * The return value is ignored as the calls cannot fail.
	 * Large pages for identity mappings are not used at boot time
	 * and hence no memory allocations are needed during a large
	 * page split.
1849	 */
1850	if (enable)
1851		__set_pages_p(page, numpages);
1852	else
1853		__set_pages_np(page, numpages);
1854
1855	/*
	 * We should perform an IPI and flush all TLBs,
	 * but that can deadlock, so flush only the current CPU:
1858	 */
1859	__flush_tlb_all();
1860
1861	arch_flush_lazy_mmu_mode();
1862}
1863
1864#ifdef CONFIG_HIBERNATION
1865
1866bool kernel_page_present(struct page *page)
1867{
1868	unsigned int level;
1869	pte_t *pte;
1870
1871	if (PageHighMem(page))
1872		return false;
1873
1874	pte = lookup_address((unsigned long)page_address(page), &level);
1875	return (pte_val(*pte) & _PAGE_PRESENT);
1876}
1877
1878#endif /* CONFIG_HIBERNATION */
1879
1880#endif /* CONFIG_DEBUG_PAGEALLOC */
1881
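/*
 * kernel_map_pages_in_pgd - map a physical range into an alternate page
 * table hierarchy with _PAGE_PRESENT plus the given @page_flags (NX is
 * cleared unless requested). As with the populate helpers above, @pfn is
 * treated as a physical address in this path.
 */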
1882int kernel_map_pages_in_pgd(pgd_t *pgd, u64 pfn, unsigned long address,
1883			    unsigned numpages, unsigned long page_flags)
1884{
1885	int retval = -EINVAL;
1886
1887	struct cpa_data cpa = {
1888		.vaddr = &address,
1889		.pfn = pfn,
1890		.pgd = pgd,
1891		.numpages = numpages,
1892		.mask_set = __pgprot(0),
1893		.mask_clr = __pgprot(0),
1894		.flags = 0,
1895	};
1896
1897	if (!(__supported_pte_mask & _PAGE_NX))
1898		goto out;
1899
1900	if (!(page_flags & _PAGE_NX))
1901		cpa.mask_clr = __pgprot(_PAGE_NX);
1902
1903	cpa.mask_set = __pgprot(_PAGE_PRESENT | page_flags);
1904
1905	retval = __change_page_attr_set_clr(&cpa, 0);
1906	__flush_tlb_all();
1907
1908out:
1909	return retval;
1910}
1911
1912void kernel_unmap_pages_in_pgd(pgd_t *root, unsigned long address,
1913			       unsigned numpages)
1914{
1915	unmap_pgd_range(root, address, address + (numpages << PAGE_SHIFT));
1916}
1917
1918/*
1919 * The testcases use internal knowledge of the implementation that shouldn't
1920 * be exposed to the rest of the kernel. Include these directly here.
1921 */
1922#ifdef CONFIG_CPA_DEBUG
1923#include "pageattr-test.c"
1924#endif
1925