/*
 * Handle caching attributes in page tables (PAT)
 *
 * Authors: Venkatesh Pallipadi <venkatesh.pallipadi@intel.com>
 *          Suresh B Siddha <suresh.b.siddha@intel.com>
 *
 * Loosely based on earlier PAT patchset from Eric Biederman and Andi Kleen.
 */

#include <linux/seq_file.h>
#include <linux/bootmem.h>
#include <linux/debugfs.h>
#include <linux/kernel.h>
#include <linux/module.h>
#include <linux/slab.h>
#include <linux/mm.h>
#include <linux/fs.h>
#include <linux/rbtree.h>

#include <asm/cacheflush.h>
#include <asm/processor.h>
#include <asm/tlbflush.h>
#include <asm/x86_init.h>
#include <asm/pgtable.h>
#include <asm/fcntl.h>
#include <asm/e820.h>
#include <asm/mtrr.h>
#include <asm/page.h>
#include <asm/msr.h>
#include <asm/pat.h>
#include <asm/io.h>

#include "pat_internal.h"
#include "mm_internal.h"

#ifdef CONFIG_X86_PAT
int __read_mostly pat_enabled = 1;

static inline void pat_disable(const char *reason)
{
	pat_enabled = 0;
	printk(KERN_INFO "%s\n", reason);
}

static int __init nopat(char *str)
{
	pat_disable("PAT support disabled.");
	return 0;
}
early_param("nopat", nopat);
#else
static inline void pat_disable(const char *reason)
{
	(void)reason;
}
#endif

int pat_debug_enable;

static int __init pat_debug_setup(char *str)
{
	pat_debug_enable = 1;
	return 0;
}
__setup("debugpat", pat_debug_setup);

/* Original PAT MSR value saved on the boot CPU; zero until pat_init() runs there. */
static u64 __read_mostly boot_pat_state;

#ifdef CONFIG_X86_PAT
/*
 * X86 PAT uses the page flags WC and Uncached together to keep track of
 * the memory type of pages that have a backing page struct.  X86 PAT
 * supports 3 different memory types, _PAGE_CACHE_MODE_WB,
 * _PAGE_CACHE_MODE_WC and _PAGE_CACHE_MODE_UC_MINUS, plus a fourth state
 * where the page's memory type has not been changed from its default
 * (a value of -1 is used to denote this).
 * Note we do not support _PAGE_CACHE_MODE_UC here.
 */

#define _PGMT_DEFAULT		0
#define _PGMT_WC		(1UL << PG_arch_1)
#define _PGMT_UC_MINUS		(1UL << PG_uncached)
#define _PGMT_WB		(1UL << PG_uncached | 1UL << PG_arch_1)
#define _PGMT_MASK		(1UL << PG_uncached | 1UL << PG_arch_1)
#define _PGMT_CLEAR_MASK	(~_PGMT_MASK)

static inline enum page_cache_mode get_page_memtype(struct page *pg)
{
	unsigned long pg_flags = pg->flags & _PGMT_MASK;

	if (pg_flags == _PGMT_DEFAULT)
		return -1;
	else if (pg_flags == _PGMT_WC)
		return _PAGE_CACHE_MODE_WC;
	else if (pg_flags == _PGMT_UC_MINUS)
		return _PAGE_CACHE_MODE_UC_MINUS;
	else
		return _PAGE_CACHE_MODE_WB;
}

static inline void set_page_memtype(struct page *pg,
				    enum page_cache_mode memtype)
{
	unsigned long memtype_flags;
	unsigned long old_flags;
	unsigned long new_flags;

	switch (memtype) {
	case _PAGE_CACHE_MODE_WC:
		memtype_flags = _PGMT_WC;
		break;
	case _PAGE_CACHE_MODE_UC_MINUS:
		memtype_flags = _PGMT_UC_MINUS;
		break;
	case _PAGE_CACHE_MODE_WB:
		memtype_flags = _PGMT_WB;
		break;
	default:
		memtype_flags = _PGMT_DEFAULT;
		break;
	}

	do {
		old_flags = pg->flags;
		new_flags = (old_flags & _PGMT_CLEAR_MASK) | memtype_flags;
	} while (cmpxchg(&pg->flags, old_flags, new_flags) != old_flags);
}
#else
static inline enum page_cache_mode get_page_memtype(struct page *pg)
{
	return -1;
}
static inline void set_page_memtype(struct page *pg,
				    enum page_cache_mode memtype)
{
}
#endif

enum {
	PAT_UC = 0,		/* uncached */
	PAT_WC = 1,		/* Write combining */
	PAT_WT = 4,		/* Write Through */
	PAT_WP = 5,		/* Write Protected */
	PAT_WB = 6,		/* Write Back (default) */
	PAT_UC_MINUS = 7,	/* UC, but can be overridden by MTRR */
};

#define CM(c) (_PAGE_CACHE_MODE_ ## c)

static enum page_cache_mode pat_get_cache_mode(unsigned pat_val, char *msg)
{
	enum page_cache_mode cache;
	char *cache_mode;

	switch (pat_val) {
	case PAT_UC:       cache = CM(UC);       cache_mode = "UC  "; break;
	case PAT_WC:       cache = CM(WC);       cache_mode = "WC  "; break;
	case PAT_WT:       cache = CM(WT);       cache_mode = "WT  "; break;
	case PAT_WP:       cache = CM(WP);       cache_mode = "WP  "; break;
	case PAT_WB:       cache = CM(WB);       cache_mode = "WB  "; break;
	case PAT_UC_MINUS: cache = CM(UC_MINUS); cache_mode = "UC- "; break;
	default:           cache = CM(WB);       cache_mode = "WB  "; break;
	}

	memcpy(msg, cache_mode, 4);

	return cache;
}

#undef CM

/*
 * Update the cache mode to pgprot translation tables according to PAT
 * configuration.
 * Using lower indices is preferred, so we start with highest index.
 */
void pat_init_cache_modes(void)
{
	int i;
	enum page_cache_mode cache;
	char pat_msg[33];
	u64 pat;

	rdmsrl(MSR_IA32_CR_PAT, pat);
	pat_msg[32] = 0;
	for (i = 7; i >= 0; i--) {
		cache = pat_get_cache_mode((pat >> (i * 8)) & 7,
					   pat_msg + 4 * i);
		update_cache_mode_entry(i, cache);
	}
	pr_info("PAT configuration [0-7]: %s\n", pat_msg);
}
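
/*
 * Worked example (added for illustration, not part of the original file):
 * with the layout programmed by pat_init() below, rdmsrl() returns
 * 0x0007010600070106.  Each byte selects one entry, lowest byte first, so
 * the loop above fills the translation tables for entries 7..0 and the
 * boot log shows something like:
 *
 *	PAT configuration [0-7]: WB  WC  UC- UC  WB  WC  UC- UC
 */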

#define PAT(x, y)	((u64)PAT_ ## y << ((x)*8))

void pat_init(void)
{
	u64 pat;
	bool boot_cpu = !boot_pat_state;

	if (!pat_enabled)
		return;

	if (!cpu_has_pat) {
		if (!boot_pat_state) {
			pat_disable("PAT not supported by CPU.");
			return;
		} else {
			/*
			 * If this happens we are on a secondary CPU, but
			 * switched to PAT on the boot CPU. We have no way to
			 * undo PAT.
			 */
			printk(KERN_ERR "PAT enabled, "
			       "but not supported by secondary CPU\n");
			BUG();
		}
	}

	/* Set PWT to Write-Combining. All other bits stay the same */
	/*
	 * PTE encoding used in Linux:
	 *      PAT
	 *      |PCD
	 *      ||PWT
	 *      |||
	 *      000 WB		_PAGE_CACHE_WB
	 *      001 WC		_PAGE_CACHE_WC
	 *      010 UC-		_PAGE_CACHE_UC_MINUS
	 *      011 UC		_PAGE_CACHE_UC
	 * PAT bit unused
	 */
	pat = PAT(0, WB) | PAT(1, WC) | PAT(2, UC_MINUS) | PAT(3, UC) |
	      PAT(4, WB) | PAT(5, WC) | PAT(6, UC_MINUS) | PAT(7, UC);

	/* Boot CPU check */
	if (!boot_pat_state) {
		rdmsrl(MSR_IA32_CR_PAT, boot_pat_state);
		if (!boot_pat_state) {
			pat_disable("PAT read returns always zero, disabled.");
			return;
		}
	}

	wrmsrl(MSR_IA32_CR_PAT, pat);

	if (boot_cpu)
		pat_init_cache_modes();
}

#undef PAT

static DEFINE_SPINLOCK(memtype_lock);	/* protects memtype accesses */

/*
 * Does the intersection of the PAT memory type and the MTRR memory type and
 * returns the resulting memory type as PAT understands it.
 * (The type values used by PAT and MTRR are not the same.)
 * The intersection is based on the "Effective Memory Type" tables in the
 * IA-32 SDM vol 3a.
 */
static unsigned long pat_x_mtrr_type(u64 start, u64 end,
				     enum page_cache_mode req_type)
{
	/*
	 * Look for the MTRR hint to get the effective type in case where the
	 * PAT request is for WB.
	 */
	if (req_type == _PAGE_CACHE_MODE_WB) {
		u8 mtrr_type;

		mtrr_type = mtrr_type_lookup(start, end);
		if (mtrr_type != MTRR_TYPE_WRBACK)
			return _PAGE_CACHE_MODE_UC_MINUS;

		return _PAGE_CACHE_MODE_WB;
	}

	return req_type;
}

struct pagerange_state {
	unsigned long	cur_pfn;
	int		ram;
	int		not_ram;
};

static int
pagerange_is_ram_callback(unsigned long initial_pfn, unsigned long total_nr_pages, void *arg)
{
	struct pagerange_state *state = arg;

	state->not_ram	|= initial_pfn > state->cur_pfn;
	state->ram	|= total_nr_pages > 0;
	state->cur_pfn	 = initial_pfn + total_nr_pages;

	return state->ram && state->not_ram;
}

static int pat_pagerange_is_ram(resource_size_t start, resource_size_t end)
{
	int ret = 0;
	unsigned long start_pfn = start >> PAGE_SHIFT;
	unsigned long end_pfn = (end + PAGE_SIZE - 1) >> PAGE_SHIFT;
	struct pagerange_state state = {start_pfn, 0, 0};

	/*
	 * For legacy reasons, the physical address range in the legacy ISA
	 * region is tracked as non-RAM. This will allow users of
	 * /dev/mem to map portions of the legacy ISA region, even when
	 * some of those portions are listed (or not even listed) with
	 * different e820 types (RAM/reserved/..)
	 */
	if (start_pfn < ISA_END_ADDRESS >> PAGE_SHIFT)
		start_pfn = ISA_END_ADDRESS >> PAGE_SHIFT;

	if (start_pfn < end_pfn) {
		ret = walk_system_ram_range(start_pfn, end_pfn - start_pfn,
				&state, pagerange_is_ram_callback);
	}

	return (ret > 0) ? -1 : (state.ram ? 1 : 0);
}
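
/*
 * Note added for illustration (not part of the original file): the helper
 * above answers "is this range RAM?" for the memtype code below.  1 means
 * the tracked part of the range is backed by RAM and its type lives in the
 * struct page flags, 0 means it is not RAM and its type lives in the
 * rbtree, and -1 flags a mix of the two, which reserve_memtype() and
 * free_memtype() reject.  For example, the legacy VGA hole
 * [0xa0000, 0xc0000) always comes back as not-RAM because of the
 * ISA_END_ADDRESS clamp, so /dev/mem users can map it regardless of how
 * e820 lists it.
 */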

/*
 * For RAM pages, we use page flags to mark the pages with the appropriate
 * type.  Here we do two passes:
 * - Find the memtype of all the pages in the range, look for any conflicts.
 * - In case of no conflicts, set the new memtype for the pages in the range.
 */
static int reserve_ram_pages_type(u64 start, u64 end,
				  enum page_cache_mode req_type,
				  enum page_cache_mode *new_type)
{
	struct page *page;
	u64 pfn;

	if (req_type == _PAGE_CACHE_MODE_UC) {
		/* We do not support strong UC */
		WARN_ON_ONCE(1);
		req_type = _PAGE_CACHE_MODE_UC_MINUS;
	}

	for (pfn = (start >> PAGE_SHIFT); pfn < (end >> PAGE_SHIFT); ++pfn) {
		enum page_cache_mode type;

		page = pfn_to_page(pfn);
		type = get_page_memtype(page);
		if (type != -1) {
			pr_info("reserve_ram_pages_type failed [mem %#010Lx-%#010Lx], track 0x%x, req 0x%x\n",
				start, end - 1, type, req_type);
			if (new_type)
				*new_type = type;

			return -EBUSY;
		}
	}

	if (new_type)
		*new_type = req_type;

	for (pfn = (start >> PAGE_SHIFT); pfn < (end >> PAGE_SHIFT); ++pfn) {
		page = pfn_to_page(pfn);
		set_page_memtype(page, req_type);
	}
	return 0;
}

static int free_ram_pages_type(u64 start, u64 end)
{
	struct page *page;
	u64 pfn;

	for (pfn = (start >> PAGE_SHIFT); pfn < (end >> PAGE_SHIFT); ++pfn) {
		page = pfn_to_page(pfn);
		set_page_memtype(page, -1);
	}
	return 0;
}

/*
 * req_type typically has one of the following values:
 * - _PAGE_CACHE_MODE_WB
 * - _PAGE_CACHE_MODE_WC
 * - _PAGE_CACHE_MODE_UC_MINUS
 * - _PAGE_CACHE_MODE_UC
 *
 * If new_type is NULL, the function will return an error if it cannot
 * reserve the region with req_type. If new_type is non-NULL, the function
 * will return the available type in new_type in case of no error. In case
 * of any error it will return a negative return value.
 */
int reserve_memtype(u64 start, u64 end, enum page_cache_mode req_type,
		    enum page_cache_mode *new_type)
{
	struct memtype *new;
	enum page_cache_mode actual_type;
	int is_range_ram;
	int err = 0;

	BUG_ON(start >= end); /* end is exclusive */

	if (!pat_enabled) {
		/* This is identical to page table setting without PAT */
		if (new_type) {
			if (req_type == _PAGE_CACHE_MODE_WC)
				*new_type = _PAGE_CACHE_MODE_UC_MINUS;
			else
				*new_type = req_type;
		}
		return 0;
	}

	/* Low ISA region is always mapped WB in page table. No need to track */
	if (x86_platform.is_untracked_pat_range(start, end)) {
		if (new_type)
			*new_type = _PAGE_CACHE_MODE_WB;
		return 0;
	}

	/*
	 * Call mtrr_lookup to get the type hint. This is an
	 * optimization for /dev/mem mmap'ers into WB memory (BIOS
	 * tools and ACPI tools). Use WB request for WB memory and use
	 * UC_MINUS otherwise.
	 */
	actual_type = pat_x_mtrr_type(start, end, req_type);

	if (new_type)
		*new_type = actual_type;

	is_range_ram = pat_pagerange_is_ram(start, end);
	if (is_range_ram == 1) {

		err = reserve_ram_pages_type(start, end, req_type, new_type);

		return err;
	} else if (is_range_ram < 0) {
		return -EINVAL;
	}

	new = kzalloc(sizeof(struct memtype), GFP_KERNEL);
	if (!new)
		return -ENOMEM;

	new->start	= start;
	new->end	= end;
	new->type	= actual_type;

	spin_lock(&memtype_lock);

	err = rbt_memtype_check_insert(new, new_type);
	if (err) {
		printk(KERN_INFO "reserve_memtype failed [mem %#010Lx-%#010Lx], track %s, req %s\n",
		       start, end - 1,
		       cattr_name(new->type), cattr_name(req_type));
		kfree(new);
		spin_unlock(&memtype_lock);

		return err;
	}

	spin_unlock(&memtype_lock);

	dprintk("reserve_memtype added [mem %#010Lx-%#010Lx], track %s, req %s, ret %s\n",
		start, end - 1, cattr_name(new->type), cattr_name(req_type),
		new_type ? cattr_name(*new_type) : "-");

	return err;
}

int free_memtype(u64 start, u64 end)
{
	int err = -EINVAL;
	int is_range_ram;
	struct memtype *entry;

	if (!pat_enabled)
		return 0;

	/* Low ISA region is always mapped WB. No need to track */
	if (x86_platform.is_untracked_pat_range(start, end))
		return 0;

	is_range_ram = pat_pagerange_is_ram(start, end);
	if (is_range_ram == 1) {

		err = free_ram_pages_type(start, end);

		return err;
	} else if (is_range_ram < 0) {
		return -EINVAL;
	}

	spin_lock(&memtype_lock);
	entry = rbt_memtype_erase(start, end);
	spin_unlock(&memtype_lock);

	if (!entry) {
		printk(KERN_INFO "%s:%d freeing invalid memtype [mem %#010Lx-%#010Lx]\n",
		       current->comm, current->pid, start, end - 1);
		return -EINVAL;
	}

	kfree(entry);

	dprintk("free_memtype request [mem %#010Lx-%#010Lx]\n", start, end - 1);

	return 0;
}


/**
 * lookup_memtype - Look up the memory type for a physical address
 * @paddr: physical address of which memory type needs to be looked up
 *
 * Only to be called when PAT is enabled.
 *
 * Returns _PAGE_CACHE_MODE_WB, _PAGE_CACHE_MODE_WC, _PAGE_CACHE_MODE_UC_MINUS
 * or _PAGE_CACHE_MODE_UC
 */
static enum page_cache_mode lookup_memtype(u64 paddr)
{
	enum page_cache_mode rettype = _PAGE_CACHE_MODE_WB;
	struct memtype *entry;

	if (x86_platform.is_untracked_pat_range(paddr, paddr + PAGE_SIZE))
		return rettype;

	if (pat_pagerange_is_ram(paddr, paddr + PAGE_SIZE)) {
		struct page *page;

		page = pfn_to_page(paddr >> PAGE_SHIFT);
		rettype = get_page_memtype(page);
		/*
		 * -1 from get_page_memtype() implies RAM page is in its
		 * default state and not reserved, and hence of type WB
		 */
		if (rettype == -1)
			rettype = _PAGE_CACHE_MODE_WB;

		return rettype;
	}

	spin_lock(&memtype_lock);

	entry = rbt_memtype_lookup(paddr);
	if (entry != NULL)
		rettype = entry->type;
	else
		rettype = _PAGE_CACHE_MODE_UC_MINUS;

	spin_unlock(&memtype_lock);
	return rettype;
}
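
/*
 * Usage sketch (added for illustration, not from the original file): a
 * caller such as the ioremap path reserves a type for a physical range and
 * releases it again when the mapping goes away.  The address below is a
 * made-up example.
 *
 *	enum page_cache_mode pcm;
 *
 *	if (reserve_memtype(0xfed00000, 0xfed00000 + PAGE_SIZE,
 *			    _PAGE_CACHE_MODE_UC_MINUS, &pcm))
 *		return NULL;
 *	... map the range using cachemode2protval(pcm) ...
 *	free_memtype(0xfed00000, 0xfed00000 + PAGE_SIZE);
 */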

/**
 * io_reserve_memtype - Request a memory type mapping for a region of memory
 * @start: start (physical address) of the region
 * @end: end (physical address) of the region
 * @type: A pointer to memtype, with requested type. On success, the
 * requested type or any other compatible type that was available for the
 * region is returned in it.
 *
 * On success, returns 0
 * On failure, returns non-zero
 */
int io_reserve_memtype(resource_size_t start, resource_size_t end,
			enum page_cache_mode *type)
{
	resource_size_t size = end - start;
	enum page_cache_mode req_type = *type;
	enum page_cache_mode new_type;
	int ret;

	WARN_ON_ONCE(iomem_map_sanity_check(start, size));

	ret = reserve_memtype(start, end, req_type, &new_type);
	if (ret)
		goto out_err;

	if (!is_new_memtype_allowed(start, size, req_type, new_type))
		goto out_free;

	if (kernel_map_sync_memtype(start, size, new_type) < 0)
		goto out_free;

	*type = new_type;
	return 0;

out_free:
	free_memtype(start, end);
	ret = -EBUSY;
out_err:
	return ret;
}

/**
 * io_free_memtype - Release a memory type mapping for a region of memory
 * @start: start (physical address) of the region
 * @end: end (physical address) of the region
 */
void io_free_memtype(resource_size_t start, resource_size_t end)
{
	free_memtype(start, end);
}

pgprot_t phys_mem_access_prot(struct file *file, unsigned long pfn,
				unsigned long size, pgprot_t vma_prot)
{
	return vma_prot;
}

#ifdef CONFIG_STRICT_DEVMEM
/* This check is done in drivers/char/mem.c in the case of STRICT_DEVMEM */
static inline int range_is_allowed(unsigned long pfn, unsigned long size)
{
	return 1;
}
#else
/* This check is needed to avoid cache aliasing when PAT is enabled */
static inline int range_is_allowed(unsigned long pfn, unsigned long size)
{
	u64 from = ((u64)pfn) << PAGE_SHIFT;
	u64 to = from + size;
	u64 cursor = from;

	if (!pat_enabled)
		return 1;

	while (cursor < to) {
		if (!devmem_is_allowed(pfn)) {
			printk(KERN_INFO "Program %s tried to access /dev/mem between [mem %#010Lx-%#010Lx], PAT prevents it\n",
			       current->comm, from, to - 1);
			return 0;
		}
		cursor += PAGE_SIZE;
		pfn++;
	}
	return 1;
}
#endif /* CONFIG_STRICT_DEVMEM */

int phys_mem_access_prot_allowed(struct file *file, unsigned long pfn,
				unsigned long size, pgprot_t *vma_prot)
{
	enum page_cache_mode pcm = _PAGE_CACHE_MODE_WB;

	if (!range_is_allowed(pfn, size))
		return 0;

	if (file->f_flags & O_DSYNC)
		pcm = _PAGE_CACHE_MODE_UC_MINUS;

#ifdef CONFIG_X86_32
	/*
	 * On the PPro and successors, the MTRRs are used to set
	 * memory types for physical addresses outside main memory,
	 * so blindly setting UC or PWT on those pages is wrong.
	 * For Pentiums and earlier, the surround logic should disable
	 * caching for the high addresses through the KEN pin, but
	 * we maintain the tradition of paranoia in this code.
	 */
	if (!pat_enabled &&
	    !(boot_cpu_has(X86_FEATURE_MTRR) ||
	      boot_cpu_has(X86_FEATURE_K6_MTRR) ||
	      boot_cpu_has(X86_FEATURE_CYRIX_ARR) ||
	      boot_cpu_has(X86_FEATURE_CENTAUR_MCR)) &&
	    (pfn << PAGE_SHIFT) >= __pa(high_memory)) {
		pcm = _PAGE_CACHE_MODE_UC;
	}
#endif

	*vma_prot = __pgprot((pgprot_val(*vma_prot) & ~_PAGE_CACHE_MASK) |
			     cachemode2protval(pcm));
	return 1;
}
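
/*
 * Example (added for illustration): a process that opens /dev/mem with
 * O_DSYNC gets an uncached-minus protection from the check above, so its
 * mmap() of an MMIO region bypasses the cache; an ordinary open keeps the
 * default write-back protection.
 */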

/*
 * Change the memory type for the physical address range in the kernel
 * identity mapping space if that range is a part of the identity map.
 */
int kernel_map_sync_memtype(u64 base, unsigned long size,
			    enum page_cache_mode pcm)
{
	unsigned long id_sz;

	if (base > __pa(high_memory-1))
		return 0;

	/*
	 * Some areas in the middle of the kernel identity range
	 * are not mapped, like the PCI space.
	 */
	if (!page_is_ram(base >> PAGE_SHIFT))
		return 0;

	id_sz = (__pa(high_memory-1) <= base + size) ?
				__pa(high_memory) - base :
				size;

	if (ioremap_change_attr((unsigned long)__va(base), id_sz, pcm) < 0) {
		printk(KERN_INFO "%s:%d ioremap_change_attr failed %s "
			"for [mem %#010Lx-%#010Lx]\n",
			current->comm, current->pid,
			cattr_name(pcm),
			base, (unsigned long long)(base + size-1));
		return -EINVAL;
	}
	return 0;
}

/*
 * Internal interface to reserve a range of physical memory with prot.
 * Reserves non-RAM regions only. After a successful reserve_memtype(), this
 * function also keeps the identity mapping (if any) in sync with the new
 * prot.
 */
static int reserve_pfn_range(u64 paddr, unsigned long size, pgprot_t *vma_prot,
				int strict_prot)
{
	int is_ram = 0;
	int ret;
	enum page_cache_mode want_pcm = pgprot2cachemode(*vma_prot);
	enum page_cache_mode pcm = want_pcm;

	is_ram = pat_pagerange_is_ram(paddr, paddr + size);

	/*
	 * reserve_pfn_range() for RAM pages. We do not refcount to keep
	 * track of the number of mappings of RAM pages. We can assert that
	 * the type requested matches the type of the first page in the range.
	 */
	if (is_ram) {
		if (!pat_enabled)
			return 0;

		pcm = lookup_memtype(paddr);
		if (want_pcm != pcm) {
			printk(KERN_WARNING "%s:%d map pfn RAM range req %s for [mem %#010Lx-%#010Lx], got %s\n",
				current->comm, current->pid,
				cattr_name(want_pcm),
				(unsigned long long)paddr,
				(unsigned long long)(paddr + size - 1),
				cattr_name(pcm));
			*vma_prot = __pgprot((pgprot_val(*vma_prot) &
					     (~_PAGE_CACHE_MASK)) |
					     cachemode2protval(pcm));
		}
		return 0;
	}

	ret = reserve_memtype(paddr, paddr + size, want_pcm, &pcm);
	if (ret)
		return ret;

	if (pcm != want_pcm) {
		if (strict_prot ||
		    !is_new_memtype_allowed(paddr, size, want_pcm, pcm)) {
			free_memtype(paddr, paddr + size);
			printk(KERN_ERR "%s:%d map pfn expected mapping type %s"
				" for [mem %#010Lx-%#010Lx], got %s\n",
				current->comm, current->pid,
				cattr_name(want_pcm),
				(unsigned long long)paddr,
				(unsigned long long)(paddr + size - 1),
				cattr_name(pcm));
			return -EINVAL;
		}
		/*
		 * We allow returning a different type than the one requested
		 * in the non-strict case.
		 */
		*vma_prot = __pgprot((pgprot_val(*vma_prot) &
				      (~_PAGE_CACHE_MASK)) |
				     cachemode2protval(pcm));
	}

	if (kernel_map_sync_memtype(paddr, size, pcm) < 0) {
		free_memtype(paddr, paddr + size);
		return -EINVAL;
	}
	return 0;
}

/*
 * Internal interface to free a range of physical memory.
 * Frees non-RAM regions only.
 */
static void free_pfn_range(u64 paddr, unsigned long size)
{
	int is_ram;

	is_ram = pat_pagerange_is_ram(paddr, paddr + size);
	if (is_ram == 0)
		free_memtype(paddr, paddr + size);
}
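
/*
 * Note added for illustration (not from the original file): the two helpers
 * above are not used by drivers directly; they back the track_pfn_*() and
 * untrack_pfn() hooks below, which the generic mm invokes when pfnmaps are
 * created, copied and torn down.
 */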

/*
 * track_pfn_copy is called when a vma that is covering the pfnmap gets
 * copied through copy_page_range().
 *
 * If the vma has a linear pfn mapping for the entire range, we get the prot
 * from the pte and reserve the entire vma range with a single
 * reserve_pfn_range() call.
 */
int track_pfn_copy(struct vm_area_struct *vma)
{
	resource_size_t paddr;
	unsigned long prot;
	unsigned long vma_size = vma->vm_end - vma->vm_start;
	pgprot_t pgprot;

	if (vma->vm_flags & VM_PAT) {
		/*
		 * reserve the whole chunk covered by the vma. We need the
		 * starting address and protection from the pte.
		 */
		if (follow_phys(vma, vma->vm_start, 0, &prot, &paddr)) {
			WARN_ON_ONCE(1);
			return -EINVAL;
		}
		pgprot = __pgprot(prot);
		return reserve_pfn_range(paddr, vma_size, &pgprot, 1);
	}

	return 0;
}

/*
 * prot is passed in as a parameter for the new mapping. If the vma has a
 * linear pfn mapping for the entire range, reserve the entire vma range
 * with a single reserve_pfn_range() call.
 */
int track_pfn_remap(struct vm_area_struct *vma, pgprot_t *prot,
		    unsigned long pfn, unsigned long addr, unsigned long size)
{
	resource_size_t paddr = (resource_size_t)pfn << PAGE_SHIFT;
	enum page_cache_mode pcm;

	/* reserve the whole chunk starting from paddr */
	if (addr == vma->vm_start && size == (vma->vm_end - vma->vm_start)) {
		int ret;

		ret = reserve_pfn_range(paddr, size, prot, 0);
		if (!ret)
			vma->vm_flags |= VM_PAT;
		return ret;
	}

	if (!pat_enabled)
		return 0;

	/*
	 * For anything smaller than the vma size we set prot based on the
	 * lookup.
	 */
	pcm = lookup_memtype(paddr);

	/* Check memtype for the remaining pages */
	while (size > PAGE_SIZE) {
		size -= PAGE_SIZE;
		paddr += PAGE_SIZE;
		if (pcm != lookup_memtype(paddr))
			return -EINVAL;
	}

	*prot = __pgprot((pgprot_val(vma->vm_page_prot) & (~_PAGE_CACHE_MASK)) |
			 cachemode2protval(pcm));

	return 0;
}

int track_pfn_insert(struct vm_area_struct *vma, pgprot_t *prot,
		     unsigned long pfn)
{
	enum page_cache_mode pcm;

	if (!pat_enabled)
		return 0;

	/* Set prot based on lookup */
	pcm = lookup_memtype((resource_size_t)pfn << PAGE_SHIFT);
	*prot = __pgprot((pgprot_val(vma->vm_page_prot) & (~_PAGE_CACHE_MASK)) |
			 cachemode2protval(pcm));

	return 0;
}
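
/*
 * Illustrative sketch (added, not part of the original file): the hooks
 * above are reached from the generic mm rather than called by drivers. A
 * hypothetical driver mmap handler like the one below ends up in
 * track_pfn_remap() via remap_pfn_range(), which for a whole-vma mapping
 * marks the vma with VM_PAT; single-page inserts via vm_insert_pfn() go
 * through track_pfn_insert() instead.
 *
 *	static int foo_mmap(struct file *file, struct vm_area_struct *vma)
 *	{
 *		vma->vm_page_prot = pgprot_writecombine(vma->vm_page_prot);
 *		return remap_pfn_range(vma, vma->vm_start, foo_base_pfn,
 *				       vma->vm_end - vma->vm_start,
 *				       vma->vm_page_prot);
 *	}
 */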

/*
 * untrack_pfn is called while unmapping a pfnmap for a region.
 * untrack can be called for a specific region indicated by pfn and size or
 * can be for the entire vma (in which case pfn, size are zero).
 */
void untrack_pfn(struct vm_area_struct *vma, unsigned long pfn,
		 unsigned long size)
{
	resource_size_t paddr;
	unsigned long prot;

	if (!(vma->vm_flags & VM_PAT))
		return;

	/* free the chunk starting from pfn or the whole chunk */
	paddr = (resource_size_t)pfn << PAGE_SHIFT;
	if (!paddr && !size) {
		if (follow_phys(vma, vma->vm_start, 0, &prot, &paddr)) {
			WARN_ON_ONCE(1);
			return;
		}

		size = vma->vm_end - vma->vm_start;
	}
	free_pfn_range(paddr, size);
	vma->vm_flags &= ~VM_PAT;
}

pgprot_t pgprot_writecombine(pgprot_t prot)
{
	if (pat_enabled)
		return __pgprot(pgprot_val(prot) |
				cachemode2protval(_PAGE_CACHE_MODE_WC));
	else
		return pgprot_noncached(prot);
}
EXPORT_SYMBOL_GPL(pgprot_writecombine);

#if defined(CONFIG_DEBUG_FS) && defined(CONFIG_X86_PAT)

static struct memtype *memtype_get_idx(loff_t pos)
{
	struct memtype *print_entry;
	int ret;

	print_entry = kzalloc(sizeof(struct memtype), GFP_KERNEL);
	if (!print_entry)
		return NULL;

	spin_lock(&memtype_lock);
	ret = rbt_memtype_copy_nth_element(print_entry, pos);
	spin_unlock(&memtype_lock);

	if (!ret) {
		return print_entry;
	} else {
		kfree(print_entry);
		return NULL;
	}
}

static void *memtype_seq_start(struct seq_file *seq, loff_t *pos)
{
	if (*pos == 0) {
		++*pos;
		seq_puts(seq, "PAT memtype list:\n");
	}

	return memtype_get_idx(*pos);
}

static void *memtype_seq_next(struct seq_file *seq, void *v, loff_t *pos)
{
	++*pos;
	return memtype_get_idx(*pos);
}

static void memtype_seq_stop(struct seq_file *seq, void *v)
{
}

static int memtype_seq_show(struct seq_file *seq, void *v)
{
	struct memtype *print_entry = (struct memtype *)v;

	seq_printf(seq, "%s @ 0x%Lx-0x%Lx\n", cattr_name(print_entry->type),
			print_entry->start, print_entry->end);
	kfree(print_entry);

	return 0;
}

static const struct seq_operations memtype_seq_ops = {
	.start = memtype_seq_start,
	.next  = memtype_seq_next,
	.stop  = memtype_seq_stop,
	.show  = memtype_seq_show,
};

static int memtype_seq_open(struct inode *inode, struct file *file)
{
	return seq_open(file, &memtype_seq_ops);
}

static const struct file_operations memtype_fops = {
	.open    = memtype_seq_open,
	.read    = seq_read,
	.llseek  = seq_lseek,
	.release = seq_release,
};

static int __init pat_memtype_list_init(void)
{
	if (pat_enabled) {
		debugfs_create_file("pat_memtype_list", S_IRUSR,
				    arch_debugfs_dir, NULL, &memtype_fops);
	}
	return 0;
}

late_initcall(pat_memtype_list_init);

#endif /* CONFIG_DEBUG_FS && CONFIG_X86_PAT */