This source file includes the following definitions:
- pe_order
- init_dax_wait_table
- dax_to_pfn
- dax_make_entry
- dax_is_locked
- dax_entry_order
- dax_is_pmd_entry
- dax_is_pte_entry
- dax_is_zero_entry
- dax_is_empty_entry
- dax_is_conflict
- dax_entry_waitqueue
- wake_exceptional_entry_func
- dax_wake_entry
- get_unlocked_entry
- wait_entry_unlocked
- put_unlocked_entry
- dax_unlock_entry
- dax_lock_entry
- dax_entry_size
- dax_end_pfn
- dax_associate_entry
- dax_disassociate_entry
- dax_busy_page
- dax_lock_page
- dax_unlock_page
- grab_mapping_entry
- dax_layout_busy_page
- __dax_invalidate_entry
- dax_delete_mapping_entry
- dax_invalidate_mapping_entry_sync
- copy_user_dax
- dax_insert_entry
- pgoff_address
- dax_entry_mkclean
- dax_writeback_one
- dax_writeback_mapping_range
- dax_iomap_sector
- dax_iomap_pfn
- dax_load_hole
- dax_range_is_aligned
- __dax_zero_page_range
- dax_iomap_actor
- dax_iomap_rw
- dax_fault_return
- dax_fault_is_synchronous
- dax_iomap_pte_fault
- dax_pmd_load_hole
- dax_iomap_pmd_fault
- dax_iomap_pmd_fault
- dax_iomap_fault
- dax_insert_pfn_mkwrite
- dax_finish_sync_fault
   1 /*
   2  * fs/dax.c - Direct Access filesystem code
   3  *
   4  * Core code for mapping files on DAX-capable (persistent memory) block
   5  * devices directly into user address spaces, bypassing the page cache
   6  * for data while still tracking entries in the mapping's XArray.
   7  */
   8 
   9 #include <linux/atomic.h>
  10 #include <linux/blkdev.h>
  11 #include <linux/buffer_head.h>
  12 #include <linux/dax.h>
  13 #include <linux/fs.h>
  14 #include <linux/genhd.h>
  15 #include <linux/highmem.h>
  16 #include <linux/memcontrol.h>
  17 #include <linux/mm.h>
  18 #include <linux/mutex.h>
  19 #include <linux/pagevec.h>
  20 #include <linux/sched.h>
  21 #include <linux/sched/signal.h>
  22 #include <linux/uio.h>
  23 #include <linux/vmstat.h>
  24 #include <linux/pfn_t.h>
  25 #include <linux/sizes.h>
  26 #include <linux/mmu_notifier.h>
  27 #include <linux/iomap.h>
  28 #include <asm/pgalloc.h>
  29 
  30 #define CREATE_TRACE_POINTS
  31 #include <trace/events/fs_dax.h>
  32 
  33 static inline unsigned int pe_order(enum page_entry_size pe_size)
  34 {
  35         if (pe_size == PE_SIZE_PTE)
  36                 return PAGE_SHIFT - PAGE_SHIFT;
  37         if (pe_size == PE_SIZE_PMD)
  38                 return PMD_SHIFT - PAGE_SHIFT;
  39         if (pe_size == PE_SIZE_PUD)
  40                 return PUD_SHIFT - PAGE_SHIFT;
  41         return ~0;
  42 }
  43 
  44 
  45 #define DAX_WAIT_TABLE_BITS 12
  46 #define DAX_WAIT_TABLE_ENTRIES (1 << DAX_WAIT_TABLE_BITS)
  47 
  48 
  49 #define PG_PMD_COLOUR   ((PMD_SIZE >> PAGE_SHIFT) - 1)
  50 #define PG_PMD_NR       (PMD_SIZE >> PAGE_SHIFT)
  51 
  52 
  53 #define PMD_ORDER       (PMD_SHIFT - PAGE_SHIFT)
  54 
  55 static wait_queue_head_t wait_table[DAX_WAIT_TABLE_ENTRIES];
  56 
  57 static int __init init_dax_wait_table(void)
  58 {
  59         int i;
  60 
  61         for (i = 0; i < DAX_WAIT_TABLE_ENTRIES; i++)
  62                 init_waitqueue_head(wait_table + i);
  63         return 0;
  64 }
  65 fs_initcall(init_dax_wait_table);
  66 
  67 /*
  68  * DAX pagecache entries use XArray value entries so they can't be mistaken
  69  * for pages.  We use one bit for locking, one bit for the entry size (PMD)
  70  * and two more to tell us if the entry is a zero page or an empty entry that
  71  * is just used for locking.  In total four special bits.
  72  *
  73  * If the PMD bit isn't set the entry has size PAGE_SIZE, and if the ZERO_PAGE
  74  * and EMPTY bits aren't set the entry is a normal DAX entry with a filesystem
  75  * block allocation.
  76  */
  77 #define DAX_SHIFT       (4)
  78 #define DAX_LOCKED      (1UL << 0)
  79 #define DAX_PMD         (1UL << 1)
  80 #define DAX_ZERO_PAGE   (1UL << 2)
  81 #define DAX_EMPTY       (1UL << 3)
  82 
  83 static unsigned long dax_to_pfn(void *entry)
  84 {
  85         return xa_to_value(entry) >> DAX_SHIFT;
  86 }
  87 
  88 static void *dax_make_entry(pfn_t pfn, unsigned long flags)
  89 {
  90         return xa_mk_value(flags | (pfn_t_to_pfn(pfn) << DAX_SHIFT));
  91 }
  92 
  93 static bool dax_is_locked(void *entry)
  94 {
  95         return xa_to_value(entry) & DAX_LOCKED;
  96 }
  97 
  98 static unsigned int dax_entry_order(void *entry)
  99 {
 100         if (xa_to_value(entry) & DAX_PMD)
 101                 return PMD_ORDER;
 102         return 0;
 103 }
 104 
 105 static unsigned long dax_is_pmd_entry(void *entry)
 106 {
 107         return xa_to_value(entry) & DAX_PMD;
 108 }
 109 
 110 static bool dax_is_pte_entry(void *entry)
 111 {
 112         return !(xa_to_value(entry) & DAX_PMD);
 113 }
 114 
 115 static int dax_is_zero_entry(void *entry)
 116 {
 117         return xa_to_value(entry) & DAX_ZERO_PAGE;
 118 }
 119 
 120 static int dax_is_empty_entry(void *entry)
 121 {
 122         return xa_to_value(entry) & DAX_EMPTY;
 123 }
 124 
 125 /*
 126  * true if the entry that was found is of a smaller order than the entry
 127  * we were looking for
 128  */
 129 static bool dax_is_conflict(void *entry)
 130 {
 131         return entry == XA_RETRY_ENTRY;
 132 }
 133 
 134 
 135 
 136 
 137 struct exceptional_entry_key {
 138         struct xarray *xa;
 139         pgoff_t entry_start;
 140 };
 141 
 142 struct wait_exceptional_entry_queue {
 143         wait_queue_entry_t wait;
 144         struct exceptional_entry_key key;
 145 };
 146 
 147 static wait_queue_head_t *dax_entry_waitqueue(struct xa_state *xas,
 148                 void *entry, struct exceptional_entry_key *key)
 149 {
 150         unsigned long hash;
 151         unsigned long index = xas->xa_index;
 152 
 153         /*
 154          * If 'entry' is a PMD, align the 'index' that we use for the wait
 155          * queue to the start of that PMD.  This ensures that all offsets in
 156          * the range covered by the PMD map to the same bit lock.
 157          */
 158         if (dax_is_pmd_entry(entry))
 159                 index &= ~PG_PMD_COLOUR;
 160         key->xa = xas->xa;
 161         key->entry_start = index;
 162 
 163         hash = hash_long((unsigned long)xas->xa ^ index, DAX_WAIT_TABLE_BITS);
 164         return wait_table + hash;
 165 }
 166 
 167 static int wake_exceptional_entry_func(wait_queue_entry_t *wait,
 168                 unsigned int mode, int sync, void *keyp)
 169 {
 170         struct exceptional_entry_key *key = keyp;
 171         struct wait_exceptional_entry_queue *ewait =
 172                 container_of(wait, struct wait_exceptional_entry_queue, wait);
 173 
 174         if (key->xa != ewait->key.xa ||
 175             key->entry_start != ewait->key.entry_start)
 176                 return 0;
 177         return autoremove_wake_function(wait, mode, sync, NULL);
 178 }
 179 
 180 
 181 
 182 
 183 
 184 
 185 static void dax_wake_entry(struct xa_state *xas, void *entry, bool wake_all)
 186 {
 187         struct exceptional_entry_key key;
 188         wait_queue_head_t *wq;
 189 
 190         wq = dax_entry_waitqueue(xas, entry, &key);
 191 
 192         
 193 
 194 
 195 
 196 
 197 
 198         if (waitqueue_active(wq))
 199                 __wake_up(wq, TASK_NORMAL, wake_all ? 0 : 1, &key);
 200 }
 201 
 202 /*
 203  * Look up entry in page cache, wait for it to become unlocked if it
 204  * is a DAX entry and return it.  The caller must subsequently call
 205  * put_unlocked_entry() if it did not lock the entry or dax_unlock_entry()
 206  * if it did.  The entry returned may have a larger order than @order.
 207  * If @order is larger than the order of the entry found in i_pages, this
 208  * function returns a dax_is_conflict entry.
 209  *
 210  * Must be called with the i_pages lock held.
 211  */
 212 static void *get_unlocked_entry(struct xa_state *xas, unsigned int order)
 213 {
 214         void *entry;
 215         struct wait_exceptional_entry_queue ewait;
 216         wait_queue_head_t *wq;
 217 
 218         init_wait(&ewait.wait);
 219         ewait.wait.func = wake_exceptional_entry_func;
 220 
 221         for (;;) {
 222                 entry = xas_find_conflict(xas);
 223                 if (!entry || WARN_ON_ONCE(!xa_is_value(entry)))
 224                         return entry;
 225                 if (dax_entry_order(entry) < order)
 226                         return XA_RETRY_ENTRY;
 227                 if (!dax_is_locked(entry))
 228                         return entry;
 229 
 230                 wq = dax_entry_waitqueue(xas, entry, &ewait.key);
 231                 prepare_to_wait_exclusive(wq, &ewait.wait,
 232                                           TASK_UNINTERRUPTIBLE);
 233                 xas_unlock_irq(xas);
 234                 xas_reset(xas);
 235                 schedule();
 236                 finish_wait(wq, &ewait.wait);
 237                 xas_lock_irq(xas);
 238         }
 239 }
 240 
 241 
 242 
 243 
 244 
 245 
 246 static void wait_entry_unlocked(struct xa_state *xas, void *entry)
 247 {
 248         struct wait_exceptional_entry_queue ewait;
 249         wait_queue_head_t *wq;
 250 
 251         init_wait(&ewait.wait);
 252         ewait.wait.func = wake_exceptional_entry_func;
 253 
 254         wq = dax_entry_waitqueue(xas, entry, &ewait.key);
 255         /*
 256          * Unlike get_unlocked_entry() there is no guarantee that this
 257          * path ever successfully retrieves an unlocked entry before an
 258          * inode dies.  Perform a non-exclusive wait in case this path
 259          * never successfully performs its own wake up.
 260          */
 261         prepare_to_wait(wq, &ewait.wait, TASK_UNINTERRUPTIBLE);
 262         xas_unlock_irq(xas);
 263         schedule();
 264         finish_wait(wq, &ewait.wait);
 265 }
 266 
 267 static void put_unlocked_entry(struct xa_state *xas, void *entry)
 268 {
 269         /* If we were the only waiter woken, wake the next one */
 270         if (entry && !dax_is_conflict(entry))
 271                 dax_wake_entry(xas, entry, false);
 272 }
 273 
 274 /*
 275  * We used the xa_state to get the entry, but then we locked the entry and
 276  * dropped the xa_lock, so we know the xa_state is stale and must be reset
 277  * before use.
 278  */
 279 static void dax_unlock_entry(struct xa_state *xas, void *entry)
 280 {
 281         void *old;
 282 
 283         BUG_ON(dax_is_locked(entry));
 284         xas_reset(xas);
 285         xas_lock_irq(xas);
 286         old = xas_store(xas, entry);
 287         xas_unlock_irq(xas);
 288         BUG_ON(!dax_is_locked(old));
 289         dax_wake_entry(xas, entry, false);
 290 }
 291 
 292 
 293 
 294 
 295 static void *dax_lock_entry(struct xa_state *xas, void *entry)
 296 {
 297         unsigned long v = xa_to_value(entry);
 298         return xas_store(xas, xa_mk_value(v | DAX_LOCKED));
 299 }
 300 
 301 static unsigned long dax_entry_size(void *entry)
 302 {
 303         if (dax_is_zero_entry(entry))
 304                 return 0;
 305         else if (dax_is_empty_entry(entry))
 306                 return 0;
 307         else if (dax_is_pmd_entry(entry))
 308                 return PMD_SIZE;
 309         else
 310                 return PAGE_SIZE;
 311 }
 312 
 313 static unsigned long dax_end_pfn(void *entry)
 314 {
 315         return dax_to_pfn(entry) + dax_entry_size(entry) / PAGE_SIZE;
 316 }
 317 
 318 /*
 319  * Iterate through all mapped pfns represented by an entry, i.e. skip
 320  * 'empty' and 'zero' entries.
 321  */
 322 #define for_each_mapped_pfn(entry, pfn) \
 323         for (pfn = dax_to_pfn(entry); \
 324                         pfn < dax_end_pfn(entry); pfn++)
 325 
 326 
 327 
 328 
 329 
 330 
 331 static void dax_associate_entry(void *entry, struct address_space *mapping,
 332                 struct vm_area_struct *vma, unsigned long address)
 333 {
 334         unsigned long size = dax_entry_size(entry), pfn, index;
 335         int i = 0;
 336 
 337         if (IS_ENABLED(CONFIG_FS_DAX_LIMITED))
 338                 return;
 339 
 340         index = linear_page_index(vma, address & ~(size - 1));
 341         for_each_mapped_pfn(entry, pfn) {
 342                 struct page *page = pfn_to_page(pfn);
 343 
 344                 WARN_ON_ONCE(page->mapping);
 345                 page->mapping = mapping;
 346                 page->index = index + i++;
 347         }
 348 }
 349 
 350 static void dax_disassociate_entry(void *entry, struct address_space *mapping,
 351                 bool trunc)
 352 {
 353         unsigned long pfn;
 354 
 355         if (IS_ENABLED(CONFIG_FS_DAX_LIMITED))
 356                 return;
 357 
 358         for_each_mapped_pfn(entry, pfn) {
 359                 struct page *page = pfn_to_page(pfn);
 360 
 361                 WARN_ON_ONCE(trunc && page_ref_count(page) > 1);
 362                 WARN_ON_ONCE(page->mapping && page->mapping != mapping);
 363                 page->mapping = NULL;
 364                 page->index = 0;
 365         }
 366 }
 367 
 368 static struct page *dax_busy_page(void *entry)
 369 {
 370         unsigned long pfn;
 371 
 372         for_each_mapped_pfn(entry, pfn) {
 373                 struct page *page = pfn_to_page(pfn);
 374 
 375                 if (page_ref_count(page) > 1)
 376                         return page;
 377         }
 378         return NULL;
 379 }
 380 
 381 /**
 382  * dax_lock_page - Lock the DAX entry corresponding to a page
 383  * @page: The page whose entry we want to lock
 384  *
 385  * Context: Process context.
 386  * Return: A cookie to pass to dax_unlock_page() or 0 if the entry could
 387  * not be locked.
 388  */
 389 dax_entry_t dax_lock_page(struct page *page)
 390 {
 391         XA_STATE(xas, NULL, 0);
 392         void *entry;
 393 
 394         /* Ensure page->mapping isn't freed while we look at it */
 395         rcu_read_lock();
 396         for (;;) {
 397                 struct address_space *mapping = READ_ONCE(page->mapping);
 398 
 399                 entry = NULL;
 400                 if (!mapping || !dax_mapping(mapping))
 401                         break;
 402 
 403                 
 404 
 405 
 406 
 407 
 408 
 409 
 410                 entry = (void *)~0UL;
 411                 if (S_ISCHR(mapping->host->i_mode))
 412                         break;
 413 
 414                 xas.xa = &mapping->i_pages;
 415                 xas_lock_irq(&xas);
 416                 if (mapping != page->mapping) {
 417                         xas_unlock_irq(&xas);
 418                         continue;
 419                 }
 420                 xas_set(&xas, page->index);
 421                 entry = xas_load(&xas);
 422                 if (dax_is_locked(entry)) {
 423                         rcu_read_unlock();
 424                         wait_entry_unlocked(&xas, entry);
 425                         rcu_read_lock();
 426                         continue;
 427                 }
 428                 dax_lock_entry(&xas, entry);
 429                 xas_unlock_irq(&xas);
 430                 break;
 431         }
 432         rcu_read_unlock();
 433         return (dax_entry_t)entry;
 434 }
 435 
 436 void dax_unlock_page(struct page *page, dax_entry_t cookie)
 437 {
 438         struct address_space *mapping = page->mapping;
 439         XA_STATE(xas, &mapping->i_pages, page->index);
 440 
 441         if (S_ISCHR(mapping->host->i_mode))
 442                 return;
 443 
 444         dax_unlock_entry(&xas, (void *)cookie);
 445 }
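
A usage sketch (not part of fs/dax.c): roughly how the dax_lock_page()/dax_unlock_page() pair is consumed by a caller such as the memory-failure path; the helper name and the elided unmap step are assumptions of this sketch.

static int example_handle_dax_poison(struct page *page)
{
        dax_entry_t cookie;

        /* Locks the DAX entry and keeps page->mapping valid while held. */
        cookie = dax_lock_page(page);
        if (!cookie)
                return -EBUSY;

        /* ... use page->mapping and page->index, e.g. to unmap the page ... */

        dax_unlock_page(page, cookie);
        return 0;
}
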
 446 
 447 /*
 448  * Find page cache entry at given index. If it is a DAX entry, return it
 449  * with the entry locked. If the page cache doesn't contain an entry at
 450  * that index, add a locked empty entry.
 451  *
 452  * When requesting an entry with size DAX_PMD, grab_mapping_entry() will
 453  * either return that locked entry or will return VM_FAULT_FALLBACK.
 454  * This will happen if there are any PTE entries within the PMD range
 455  * that we are requesting.
 456  *
 457  * We always favor PTE entries over PMD entries. There isn't a flow where
 458  * we evict PTE entries in order to 'upgrade' them to a PMD entry.  A PMD
 459  * insertion will fail if it finds any PTE entries already in the tree,
 460  * and a PTE insertion will cause an existing PMD entry to be unmapped
 461  * and downgraded to PTE entries.  This happens for both PMD zero pages
 462  * as well as PMD empty entries.
 463  *
 464  * The exception to this downgrade path is for PMD entries that have
 465  * real storage backing them.  We will leave these real PMD entries in
 466  * the tree, and PTE writes will simply dirty the entire PMD entry.
 467  *
 468  * Note: Unlike filemap_fault() we don't honor FAULT_FLAG_RETRY flags.
 469  * For persistent memory the benefit is doubtful. We can add that later
 470  * if we can identify a performance gain with that.
 471  */
 472 
 473 
 474 
 475 
 476 static void *grab_mapping_entry(struct xa_state *xas,
 477                 struct address_space *mapping, unsigned int order)
 478 {
 479         unsigned long index = xas->xa_index;
 480         bool pmd_downgrade = false; 
 481         void *entry;
 482 
 483 retry:
 484         xas_lock_irq(xas);
 485         entry = get_unlocked_entry(xas, order);
 486 
 487         if (entry) {
 488                 if (dax_is_conflict(entry))
 489                         goto fallback;
 490                 if (!xa_is_value(entry)) {
 491                         xas_set_err(xas, EIO);
 492                         goto out_unlock;
 493                 }
 494 
 495                 if (order == 0) {
 496                         if (dax_is_pmd_entry(entry) &&
 497                             (dax_is_zero_entry(entry) ||
 498                              dax_is_empty_entry(entry))) {
 499                                 pmd_downgrade = true;
 500                         }
 501                 }
 502         }
 503 
 504         if (pmd_downgrade) {
 505                 
 506 
 507 
 508 
 509                 dax_lock_entry(xas, entry);
 510 
 511                 
 512 
 513 
 514 
 515 
 516                 if (dax_is_zero_entry(entry)) {
 517                         xas_unlock_irq(xas);
 518                         unmap_mapping_pages(mapping,
 519                                         xas->xa_index & ~PG_PMD_COLOUR,
 520                                         PG_PMD_NR, false);
 521                         xas_reset(xas);
 522                         xas_lock_irq(xas);
 523                 }
 524 
 525                 dax_disassociate_entry(entry, mapping, false);
 526                 xas_store(xas, NULL);   
 527                 dax_wake_entry(xas, entry, true);
 528                 mapping->nrexceptional--;
 529                 entry = NULL;
 530                 xas_set(xas, index);
 531         }
 532 
 533         if (entry) {
 534                 dax_lock_entry(xas, entry);
 535         } else {
 536                 unsigned long flags = DAX_EMPTY;
 537 
 538                 if (order > 0)
 539                         flags |= DAX_PMD;
 540                 entry = dax_make_entry(pfn_to_pfn_t(0), flags);
 541                 dax_lock_entry(xas, entry);
 542                 if (xas_error(xas))
 543                         goto out_unlock;
 544                 mapping->nrexceptional++;
 545         }
 546 
 547 out_unlock:
 548         xas_unlock_irq(xas);
 549         if (xas_nomem(xas, mapping_gfp_mask(mapping) & ~__GFP_HIGHMEM))
 550                 goto retry;
 551         if (xas->xa_node == XA_ERROR(-ENOMEM))
 552                 return xa_mk_internal(VM_FAULT_OOM);
 553         if (xas_error(xas))
 554                 return xa_mk_internal(VM_FAULT_SIGBUS);
 555         return entry;
 556 fallback:
 557         xas_unlock_irq(xas);
 558         return xa_mk_internal(VM_FAULT_FALLBACK);
 559 }
 560 
 561 /**
 562  * dax_layout_busy_page - find first pinned page in @mapping
 563  * @mapping: address space to scan for a page with ref count > 1
 564  *
 565  * DAX requires ZONE_DEVICE mapped pages. These pages are never
 566  * 'onlined' to the page allocator so they are considered idle when
 567  * page->count == 1. A filesystem uses this interface to determine if
 568  * any page in the mapping is busy, i.e. for DMA, or other
 569  * get_user_pages() usages.
 570  *
 571  * It is expected that the filesystem is holding locks to block the
 572  * establishment of new mappings in this address_space. I.e. it expects
 573  * to be able to run unmap_mapping_range() and subsequently not race
 574  * mapping_mapped() becoming true.
 575  */
 576 struct page *dax_layout_busy_page(struct address_space *mapping)
 577 {
 578         XA_STATE(xas, &mapping->i_pages, 0);
 579         void *entry;
 580         unsigned int scanned = 0;
 581         struct page *page = NULL;
 582 
 583         
 584 
 585 
 586         if (IS_ENABLED(CONFIG_FS_DAX_LIMITED))
 587                 return NULL;
 588 
 589         if (!dax_mapping(mapping) || !mapping_mapped(mapping))
 590                 return NULL;
 591 
 592         /*
 593          * If we race get_user_pages_fast() here either we'll see the
 594          * elevated page count in the iteration and wait, or
 595          * get_user_pages_fast() will see that the page it took a reference
 596          * against is no longer mapped in the page tables and bail to the
 597          * get_user_pages() slow path.  The slow path is protected by
 598          * pte_lock() and pmd_lock(). New references are not taken without
 599          * holding those locks, and unmap_mapping_range() will not zero the
 600          * pte or pmd without holding the respective lock, so we are
 601          * guaranteed to either see new references or prevent new
 602          * references from being established.
 603          */
 604         unmap_mapping_range(mapping, 0, 0, 0);
 605 
 606         xas_lock_irq(&xas);
 607         xas_for_each(&xas, entry, ULONG_MAX) {
 608                 if (WARN_ON_ONCE(!xa_is_value(entry)))
 609                         continue;
 610                 if (unlikely(dax_is_locked(entry)))
 611                         entry = get_unlocked_entry(&xas, 0);
 612                 if (entry)
 613                         page = dax_busy_page(entry);
 614                 put_unlocked_entry(&xas, entry);
 615                 if (page)
 616                         break;
 617                 if (++scanned % XA_CHECK_SCHED)
 618                         continue;
 619 
 620                 xas_pause(&xas);
 621                 xas_unlock_irq(&xas);
 622                 cond_resched();
 623                 xas_lock_irq(&xas);
 624         }
 625         xas_unlock_irq(&xas);
 626         return page;
 627 }
 628 EXPORT_SYMBOL_GPL(dax_layout_busy_page);
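
A hedged sketch of how a filesystem might drain busy DAX pages before truncating or hole-punching, loosely modelled on the XFS pattern; the helper name and the wait on the page refcount are assumptions of this sketch, and real callers also cycle their mapping locks between iterations.

static int example_break_dax_layouts(struct inode *inode)
{
        struct page *page;
        int error;

        /* Loop until no page in the mapping holds an elevated refcount. */
        while ((page = dax_layout_busy_page(inode->i_mapping))) {
                error = wait_var_event_killable(&page->_refcount,
                                atomic_read(&page->_refcount) == 1);
                if (error)
                        return error;
        }
        return 0;
}
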
 629 
 630 static int __dax_invalidate_entry(struct address_space *mapping,
 631                                           pgoff_t index, bool trunc)
 632 {
 633         XA_STATE(xas, &mapping->i_pages, index);
 634         int ret = 0;
 635         void *entry;
 636 
 637         xas_lock_irq(&xas);
 638         entry = get_unlocked_entry(&xas, 0);
 639         if (!entry || WARN_ON_ONCE(!xa_is_value(entry)))
 640                 goto out;
 641         if (!trunc &&
 642             (xas_get_mark(&xas, PAGECACHE_TAG_DIRTY) ||
 643              xas_get_mark(&xas, PAGECACHE_TAG_TOWRITE)))
 644                 goto out;
 645         dax_disassociate_entry(entry, mapping, trunc);
 646         xas_store(&xas, NULL);
 647         mapping->nrexceptional--;
 648         ret = 1;
 649 out:
 650         put_unlocked_entry(&xas, entry);
 651         xas_unlock_irq(&xas);
 652         return ret;
 653 }
 654 
 655 
 656 
 657 
 658 
 659 int dax_delete_mapping_entry(struct address_space *mapping, pgoff_t index)
 660 {
 661         int ret = __dax_invalidate_entry(mapping, index, true);
 662 
 663         
 664 
 665 
 666 
 667 
 668 
 669 
 670         WARN_ON_ONCE(!ret);
 671         return ret;
 672 }
 673 
 674 
 675 
 676 
 677 int dax_invalidate_mapping_entry_sync(struct address_space *mapping,
 678                                       pgoff_t index)
 679 {
 680         return __dax_invalidate_entry(mapping, index, false);
 681 }
 682 
 683 static int copy_user_dax(struct block_device *bdev, struct dax_device *dax_dev,
 684                 sector_t sector, size_t size, struct page *to,
 685                 unsigned long vaddr)
 686 {
 687         void *vto, *kaddr;
 688         pgoff_t pgoff;
 689         long rc;
 690         int id;
 691 
 692         rc = bdev_dax_pgoff(bdev, sector, size, &pgoff);
 693         if (rc)
 694                 return rc;
 695 
 696         id = dax_read_lock();
 697         rc = dax_direct_access(dax_dev, pgoff, PHYS_PFN(size), &kaddr, NULL);
 698         if (rc < 0) {
 699                 dax_read_unlock(id);
 700                 return rc;
 701         }
 702         vto = kmap_atomic(to);
 703         copy_user_page(vto, (void __force *)kaddr, vaddr, to);
 704         kunmap_atomic(vto);
 705         dax_read_unlock(id);
 706         return 0;
 707 }
 708 
 709 /*
 710  * By this point grab_mapping_entry() has ensured that we have a locked entry
 711  * of the appropriate size so we don't have to worry about downgrading PMDs to
 712  * PTEs.  If we happen to be trying to insert a PTE and there is a PMD
 713  * already in the tree, we will skip the insertion and just dirty the PMD as
 714  * appropriate.
 715  */
 716 static void *dax_insert_entry(struct xa_state *xas,
 717                 struct address_space *mapping, struct vm_fault *vmf,
 718                 void *entry, pfn_t pfn, unsigned long flags, bool dirty)
 719 {
 720         void *new_entry = dax_make_entry(pfn, flags);
 721 
 722         if (dirty)
 723                 __mark_inode_dirty(mapping->host, I_DIRTY_PAGES);
 724 
 725         if (dax_is_zero_entry(entry) && !(flags & DAX_ZERO_PAGE)) {
 726                 unsigned long index = xas->xa_index;
 727                 
 728                 if (dax_is_pmd_entry(entry))
 729                         unmap_mapping_pages(mapping, index & ~PG_PMD_COLOUR,
 730                                         PG_PMD_NR, false);
 731                 else 
 732                         unmap_mapping_pages(mapping, index, 1, false);
 733         }
 734 
 735         xas_reset(xas);
 736         xas_lock_irq(xas);
 737         if (dax_is_zero_entry(entry) || dax_is_empty_entry(entry)) {
 738                 void *old;
 739 
 740                 dax_disassociate_entry(entry, mapping, false);
 741                 dax_associate_entry(new_entry, mapping, vmf->vma, vmf->address);
 742                 /*
 743                  * Only swap our new entry into the page cache if the current
 744                  * entry is a zero page or an empty entry.  If a normal PTE or
 745                  * PMD entry is already in the cache, we leave it alone.  This
 746                  * means that if we are trying to insert a PTE and they are
 747                  * already in the tree, we will skip the insertion and just
 748                  * dirty the PMD as appropriate.
 749                  */
 750                 old = dax_lock_entry(xas, new_entry);
 751                 WARN_ON_ONCE(old != xa_mk_value(xa_to_value(entry) |
 752                                         DAX_LOCKED));
 753                 entry = new_entry;
 754         } else {
 755                 xas_load(xas);  
 756         }
 757 
 758         if (dirty)
 759                 xas_set_mark(xas, PAGECACHE_TAG_DIRTY);
 760 
 761         xas_unlock_irq(xas);
 762         return entry;
 763 }
 764 
 765 static inline
 766 unsigned long pgoff_address(pgoff_t pgoff, struct vm_area_struct *vma)
 767 {
 768         unsigned long address;
 769 
 770         address = vma->vm_start + ((pgoff - vma->vm_pgoff) << PAGE_SHIFT);
 771         VM_BUG_ON_VMA(address < vma->vm_start || address >= vma->vm_end, vma);
 772         return address;
 773 }
 774 
 775 /* Walk all mappings of a given index of a file and writeprotect them */
 776 static void dax_entry_mkclean(struct address_space *mapping, pgoff_t index,
 777                 unsigned long pfn)
 778 {
 779         struct vm_area_struct *vma;
 780         pte_t pte, *ptep = NULL;
 781         pmd_t *pmdp = NULL;
 782         spinlock_t *ptl;
 783 
 784         i_mmap_lock_read(mapping);
 785         vma_interval_tree_foreach(vma, &mapping->i_mmap, index, index) {
 786                 struct mmu_notifier_range range;
 787                 unsigned long address;
 788 
 789                 cond_resched();
 790 
 791                 if (!(vma->vm_flags & VM_SHARED))
 792                         continue;
 793 
 794                 address = pgoff_address(index, vma);
 795 
 796                 
 797 
 798 
 799 
 800 
 801                 if (follow_pte_pmd(vma->vm_mm, address, &range,
 802                                    &ptep, &pmdp, &ptl))
 803                         continue;
 804 
 805                 
 806 
 807 
 808 
 809 
 810 
 811 
 812                 if (pmdp) {
 813 #ifdef CONFIG_FS_DAX_PMD
 814                         pmd_t pmd;
 815 
 816                         if (pfn != pmd_pfn(*pmdp))
 817                                 goto unlock_pmd;
 818                         if (!pmd_dirty(*pmdp) && !pmd_write(*pmdp))
 819                                 goto unlock_pmd;
 820 
 821                         flush_cache_page(vma, address, pfn);
 822                         pmd = pmdp_invalidate(vma, address, pmdp);
 823                         pmd = pmd_wrprotect(pmd);
 824                         pmd = pmd_mkclean(pmd);
 825                         set_pmd_at(vma->vm_mm, address, pmdp, pmd);
 826 unlock_pmd:
 827 #endif
 828                         spin_unlock(ptl);
 829                 } else {
 830                         if (pfn != pte_pfn(*ptep))
 831                                 goto unlock_pte;
 832                         if (!pte_dirty(*ptep) && !pte_write(*ptep))
 833                                 goto unlock_pte;
 834 
 835                         flush_cache_page(vma, address, pfn);
 836                         pte = ptep_clear_flush(vma, address, ptep);
 837                         pte = pte_wrprotect(pte);
 838                         pte = pte_mkclean(pte);
 839                         set_pte_at(vma->vm_mm, address, ptep, pte);
 840 unlock_pte:
 841                         pte_unmap_unlock(ptep, ptl);
 842                 }
 843 
 844                 mmu_notifier_invalidate_range_end(&range);
 845         }
 846         i_mmap_unlock_read(mapping);
 847 }
 848 
 849 static int dax_writeback_one(struct xa_state *xas, struct dax_device *dax_dev,
 850                 struct address_space *mapping, void *entry)
 851 {
 852         unsigned long pfn, index, count;
 853         long ret = 0;
 854 
 855         
 856 
 857 
 858 
 859         if (WARN_ON(!xa_is_value(entry)))
 860                 return -EIO;
 861 
 862         if (unlikely(dax_is_locked(entry))) {
 863                 void *old_entry = entry;
 864 
 865                 entry = get_unlocked_entry(xas, 0);
 866 
 867                 
 868                 if (!entry || WARN_ON_ONCE(!xa_is_value(entry)))
 869                         goto put_unlocked;
 870                 
 871 
 872 
 873 
 874 
 875                 if (dax_to_pfn(old_entry) != dax_to_pfn(entry))
 876                         goto put_unlocked;
 877                 if (WARN_ON_ONCE(dax_is_empty_entry(entry) ||
 878                                         dax_is_zero_entry(entry))) {
 879                         ret = -EIO;
 880                         goto put_unlocked;
 881                 }
 882 
 883                 
 884                 if (!xas_get_mark(xas, PAGECACHE_TAG_TOWRITE))
 885                         goto put_unlocked;
 886         }
 887 
 888         
 889         dax_lock_entry(xas, entry);
 890 
 891         
 892 
 893 
 894 
 895 
 896 
 897 
 898         xas_clear_mark(xas, PAGECACHE_TAG_TOWRITE);
 899         xas_unlock_irq(xas);
 900 
 901         
 902 
 903 
 904 
 905 
 906 
 907 
 908         pfn = dax_to_pfn(entry);
 909         count = 1UL << dax_entry_order(entry);
 910         index = xas->xa_index & ~(count - 1);
 911 
 912         dax_entry_mkclean(mapping, index, pfn);
 913         dax_flush(dax_dev, page_address(pfn_to_page(pfn)), count * PAGE_SIZE);
 914         
 915 
 916 
 917 
 918 
 919 
 920         xas_reset(xas);
 921         xas_lock_irq(xas);
 922         xas_store(xas, entry);
 923         xas_clear_mark(xas, PAGECACHE_TAG_DIRTY);
 924         dax_wake_entry(xas, entry, false);
 925 
 926         trace_dax_writeback_one(mapping->host, index, count);
 927         return ret;
 928 
 929  put_unlocked:
 930         put_unlocked_entry(xas, entry);
 931         return ret;
 932 }
 933 
 934 /*
 935  * Flush the mapping to the persistent domain within the byte range of [start,
 936  * end]. This is required by data integrity operations to ensure file data is
 937  * on persistent storage prior to completion of the operation.
 938  */
 939 int dax_writeback_mapping_range(struct address_space *mapping,
 940                 struct block_device *bdev, struct writeback_control *wbc)
 941 {
 942         XA_STATE(xas, &mapping->i_pages, wbc->range_start >> PAGE_SHIFT);
 943         struct inode *inode = mapping->host;
 944         pgoff_t end_index = wbc->range_end >> PAGE_SHIFT;
 945         struct dax_device *dax_dev;
 946         void *entry;
 947         int ret = 0;
 948         unsigned int scanned = 0;
 949 
 950         if (WARN_ON_ONCE(inode->i_blkbits != PAGE_SHIFT))
 951                 return -EIO;
 952 
 953         if (!mapping->nrexceptional || wbc->sync_mode != WB_SYNC_ALL)
 954                 return 0;
 955 
 956         dax_dev = dax_get_by_host(bdev->bd_disk->disk_name);
 957         if (!dax_dev)
 958                 return -EIO;
 959 
 960         trace_dax_writeback_range(inode, xas.xa_index, end_index);
 961 
 962         tag_pages_for_writeback(mapping, xas.xa_index, end_index);
 963 
 964         xas_lock_irq(&xas);
 965         xas_for_each_marked(&xas, entry, end_index, PAGECACHE_TAG_TOWRITE) {
 966                 ret = dax_writeback_one(&xas, dax_dev, mapping, entry);
 967                 if (ret < 0) {
 968                         mapping_set_error(mapping, ret);
 969                         break;
 970                 }
 971                 if (++scanned % XA_CHECK_SCHED)
 972                         continue;
 973 
 974                 xas_pause(&xas);
 975                 xas_unlock_irq(&xas);
 976                 cond_resched();
 977                 xas_lock_irq(&xas);
 978         }
 979         xas_unlock_irq(&xas);
 980         put_dax(dax_dev);
 981         trace_dax_writeback_range_done(inode, xas.xa_index, end_index);
 982         return ret;
 983 }
 984 EXPORT_SYMBOL_GPL(dax_writeback_mapping_range);
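
A minimal sketch of wiring dax_writeback_mapping_range() into a filesystem's ->writepages method, similar in shape to what xfs does for DAX files; the function name is an assumption of this sketch.

static int example_dax_writepages(struct address_space *mapping,
                                  struct writeback_control *wbc)
{
        /* Write-protect and flush all dirty DAX entries in the range. */
        return dax_writeback_mapping_range(mapping,
                        mapping->host->i_sb->s_bdev, wbc);
}
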
 985 
 986 static sector_t dax_iomap_sector(struct iomap *iomap, loff_t pos)
 987 {
 988         return (iomap->addr + (pos & PAGE_MASK) - iomap->offset) >> 9;
 989 }
 990 
 991 static int dax_iomap_pfn(struct iomap *iomap, loff_t pos, size_t size,
 992                          pfn_t *pfnp)
 993 {
 994         const sector_t sector = dax_iomap_sector(iomap, pos);
 995         pgoff_t pgoff;
 996         int id, rc;
 997         long length;
 998 
 999         rc = bdev_dax_pgoff(iomap->bdev, sector, size, &pgoff);
1000         if (rc)
1001                 return rc;
1002         id = dax_read_lock();
1003         length = dax_direct_access(iomap->dax_dev, pgoff, PHYS_PFN(size),
1004                                    NULL, pfnp);
1005         if (length < 0) {
1006                 rc = length;
1007                 goto out;
1008         }
1009         rc = -EINVAL;
1010         if (PFN_PHYS(length) < size)
1011                 goto out;
1012         if (pfn_t_to_pfn(*pfnp) & (PHYS_PFN(size)-1))
1013                 goto out;
1014         
1015         if (length > 1 && !pfn_t_devmap(*pfnp))
1016                 goto out;
1017         rc = 0;
1018 out:
1019         dax_read_unlock(id);
1020         return rc;
1021 }
1022 
1023 /*
1024  * The user has performed a load from a hole in the file.  Allocating a new
1025  * page in the file would cause excessive storage usage for workloads with
1026  * sparse files.  Instead we insert a read-only mapping of the 4k zero page.
1027  * If this page is ever written to we will re-fault and change the mapping to
1028  * point to real DAX storage instead.
1029  */
1030 static vm_fault_t dax_load_hole(struct xa_state *xas,
1031                 struct address_space *mapping, void **entry,
1032                 struct vm_fault *vmf)
1033 {
1034         struct inode *inode = mapping->host;
1035         unsigned long vaddr = vmf->address;
1036         pfn_t pfn = pfn_to_pfn_t(my_zero_pfn(vaddr));
1037         vm_fault_t ret;
1038 
1039         *entry = dax_insert_entry(xas, mapping, vmf, *entry, pfn,
1040                         DAX_ZERO_PAGE, false);
1041 
1042         ret = vmf_insert_mixed(vmf->vma, vaddr, pfn);
1043         trace_dax_load_hole(inode, vmf, ret);
1044         return ret;
1045 }
1046 
1047 static bool dax_range_is_aligned(struct block_device *bdev,
1048                                  unsigned int offset, unsigned int length)
1049 {
1050         unsigned short sector_size = bdev_logical_block_size(bdev);
1051 
1052         if (!IS_ALIGNED(offset, sector_size))
1053                 return false;
1054         if (!IS_ALIGNED(length, sector_size))
1055                 return false;
1056 
1057         return true;
1058 }
1059 
1060 int __dax_zero_page_range(struct block_device *bdev,
1061                 struct dax_device *dax_dev, sector_t sector,
1062                 unsigned int offset, unsigned int size)
1063 {
1064         if (dax_range_is_aligned(bdev, offset, size)) {
1065                 sector_t start_sector = sector + (offset >> 9);
1066 
1067                 return blkdev_issue_zeroout(bdev, start_sector,
1068                                 size >> 9, GFP_NOFS, 0);
1069         } else {
1070                 pgoff_t pgoff;
1071                 long rc, id;
1072                 void *kaddr;
1073 
1074                 rc = bdev_dax_pgoff(bdev, sector, PAGE_SIZE, &pgoff);
1075                 if (rc)
1076                         return rc;
1077 
1078                 id = dax_read_lock();
1079                 rc = dax_direct_access(dax_dev, pgoff, 1, &kaddr, NULL);
1080                 if (rc < 0) {
1081                         dax_read_unlock(id);
1082                         return rc;
1083                 }
1084                 memset(kaddr + offset, 0, size);
1085                 dax_flush(dax_dev, kaddr + offset, size);
1086                 dax_read_unlock(id);
1087         }
1088         return 0;
1089 }
1090 EXPORT_SYMBOL_GPL(__dax_zero_page_range);
1091 
1092 static loff_t
1093 dax_iomap_actor(struct inode *inode, loff_t pos, loff_t length, void *data,
1094                 struct iomap *iomap)
1095 {
1096         struct block_device *bdev = iomap->bdev;
1097         struct dax_device *dax_dev = iomap->dax_dev;
1098         struct iov_iter *iter = data;
1099         loff_t end = pos + length, done = 0;
1100         ssize_t ret = 0;
1101         size_t xfer;
1102         int id;
1103 
1104         if (iov_iter_rw(iter) == READ) {
1105                 end = min(end, i_size_read(inode));
1106                 if (pos >= end)
1107                         return 0;
1108 
1109                 if (iomap->type == IOMAP_HOLE || iomap->type == IOMAP_UNWRITTEN)
1110                         return iov_iter_zero(min(length, end - pos), iter);
1111         }
1112 
1113         if (WARN_ON_ONCE(iomap->type != IOMAP_MAPPED))
1114                 return -EIO;
1115 
1116         
1117 
1118 
1119 
1120 
1121         if (iomap->flags & IOMAP_F_NEW) {
1122                 invalidate_inode_pages2_range(inode->i_mapping,
1123                                               pos >> PAGE_SHIFT,
1124                                               (end - 1) >> PAGE_SHIFT);
1125         }
1126 
1127         id = dax_read_lock();
1128         while (pos < end) {
1129                 unsigned offset = pos & (PAGE_SIZE - 1);
1130                 const size_t size = ALIGN(length + offset, PAGE_SIZE);
1131                 const sector_t sector = dax_iomap_sector(iomap, pos);
1132                 ssize_t map_len;
1133                 pgoff_t pgoff;
1134                 void *kaddr;
1135 
1136                 if (fatal_signal_pending(current)) {
1137                         ret = -EINTR;
1138                         break;
1139                 }
1140 
1141                 ret = bdev_dax_pgoff(bdev, sector, size, &pgoff);
1142                 if (ret)
1143                         break;
1144 
1145                 map_len = dax_direct_access(dax_dev, pgoff, PHYS_PFN(size),
1146                                 &kaddr, NULL);
1147                 if (map_len < 0) {
1148                         ret = map_len;
1149                         break;
1150                 }
1151 
1152                 map_len = PFN_PHYS(map_len);
1153                 kaddr += offset;
1154                 map_len -= offset;
1155                 if (map_len > end - pos)
1156                         map_len = end - pos;
1157 
1158                 
1159 
1160 
1161 
1162 
1163                 if (iov_iter_rw(iter) == WRITE)
1164                         xfer = dax_copy_from_iter(dax_dev, pgoff, kaddr,
1165                                         map_len, iter);
1166                 else
1167                         xfer = dax_copy_to_iter(dax_dev, pgoff, kaddr,
1168                                         map_len, iter);
1169 
1170                 pos += xfer;
1171                 length -= xfer;
1172                 done += xfer;
1173 
1174                 if (xfer == 0)
1175                         ret = -EFAULT;
1176                 if (xfer < map_len)
1177                         break;
1178         }
1179         dax_read_unlock(id);
1180 
1181         return done ? done : ret;
1182 }
1183 
1184 /**
1185  * dax_iomap_rw - Perform I/O to a DAX file
1186  * @iocb:       The control block for this I/O
1187  * @iter:       The addresses to do I/O from or to
1188  * @ops:        iomap ops passed from the file system
1189  *
1190  * This function performs read and write operations to directly mapped
1191  * persistent memory.  The caller needs to take care of read/write exclusion
1192  * and evicting any page cache pages in the region under I/O.
1193  */
1194 ssize_t
1195 dax_iomap_rw(struct kiocb *iocb, struct iov_iter *iter,
1196                 const struct iomap_ops *ops)
1197 {
1198         struct address_space *mapping = iocb->ki_filp->f_mapping;
1199         struct inode *inode = mapping->host;
1200         loff_t pos = iocb->ki_pos, ret = 0, done = 0;
1201         unsigned flags = 0;
1202 
1203         if (iov_iter_rw(iter) == WRITE) {
1204                 lockdep_assert_held_write(&inode->i_rwsem);
1205                 flags |= IOMAP_WRITE;
1206         } else {
1207                 lockdep_assert_held(&inode->i_rwsem);
1208         }
1209 
1210         if (iocb->ki_flags & IOCB_NOWAIT)
1211                 flags |= IOMAP_NOWAIT;
1212 
1213         while (iov_iter_count(iter)) {
1214                 ret = iomap_apply(inode, pos, iov_iter_count(iter), flags, ops,
1215                                 iter, dax_iomap_actor);
1216                 if (ret <= 0)
1217                         break;
1218                 pos += ret;
1219                 done += ret;
1220         }
1221 
1222         iocb->ki_pos += done;
1223         return done ? done : ret;
1224 }
1225 EXPORT_SYMBOL_GPL(dax_iomap_rw);
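
A hedged sketch of a DAX ->read_iter built on dax_iomap_rw(), taking the shared inode lock that dax_iomap_rw() asserts for reads; example_iomap_ops and the function name are assumptions standing in for the filesystem's own iomap_ops and read path.

extern const struct iomap_ops example_iomap_ops;   /* provided by the filesystem */

static ssize_t example_dax_read_iter(struct kiocb *iocb, struct iov_iter *to)
{
        struct inode *inode = file_inode(iocb->ki_filp);
        ssize_t ret;

        if (!iov_iter_count(to))
                return 0;

        /* dax_iomap_rw() asserts i_rwsem is held (shared is enough for reads). */
        inode_lock_shared(inode);
        ret = dax_iomap_rw(iocb, to, &example_iomap_ops);
        inode_unlock_shared(inode);

        return ret;
}
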
1226 
1227 static vm_fault_t dax_fault_return(int error)
1228 {
1229         if (error == 0)
1230                 return VM_FAULT_NOPAGE;
1231         return vmf_error(error);
1232 }
1233 
1234 /*
1235  * MAP_SYNC on a dax mapping guarantees dirty metadata is
1236  * flushed on write-faults (non-cow), but not read-faults.
1237  */
1238 static bool dax_fault_is_synchronous(unsigned long flags,
1239                 struct vm_area_struct *vma, struct iomap *iomap)
1240 {
1241         return (flags & IOMAP_WRITE) && (vma->vm_flags & VM_SYNC)
1242                 && (iomap->flags & IOMAP_F_DIRTY);
1243 }
1244 
1245 static vm_fault_t dax_iomap_pte_fault(struct vm_fault *vmf, pfn_t *pfnp,
1246                                int *iomap_errp, const struct iomap_ops *ops)
1247 {
1248         struct vm_area_struct *vma = vmf->vma;
1249         struct address_space *mapping = vma->vm_file->f_mapping;
1250         XA_STATE(xas, &mapping->i_pages, vmf->pgoff);
1251         struct inode *inode = mapping->host;
1252         unsigned long vaddr = vmf->address;
1253         loff_t pos = (loff_t)vmf->pgoff << PAGE_SHIFT;
1254         struct iomap iomap = { 0 };
1255         unsigned flags = IOMAP_FAULT;
1256         int error, major = 0;
1257         bool write = vmf->flags & FAULT_FLAG_WRITE;
1258         bool sync;
1259         vm_fault_t ret = 0;
1260         void *entry;
1261         pfn_t pfn;
1262 
1263         trace_dax_pte_fault(inode, vmf, ret);
1264         
1265 
1266 
1267 
1268 
1269         if (pos >= i_size_read(inode)) {
1270                 ret = VM_FAULT_SIGBUS;
1271                 goto out;
1272         }
1273 
1274         if (write && !vmf->cow_page)
1275                 flags |= IOMAP_WRITE;
1276 
1277         entry = grab_mapping_entry(&xas, mapping, 0);
1278         if (xa_is_internal(entry)) {
1279                 ret = xa_to_internal(entry);
1280                 goto out;
1281         }
1282 
1283         
1284 
1285 
1286 
1287 
1288 
1289         if (pmd_trans_huge(*vmf->pmd) || pmd_devmap(*vmf->pmd)) {
1290                 ret = VM_FAULT_NOPAGE;
1291                 goto unlock_entry;
1292         }
1293 
1294         
1295 
1296 
1297 
1298 
1299         error = ops->iomap_begin(inode, pos, PAGE_SIZE, flags, &iomap);
1300         if (iomap_errp)
1301                 *iomap_errp = error;
1302         if (error) {
1303                 ret = dax_fault_return(error);
1304                 goto unlock_entry;
1305         }
1306         if (WARN_ON_ONCE(iomap.offset + iomap.length < pos + PAGE_SIZE)) {
1307                 error = -EIO;   
1308                 goto error_finish_iomap;
1309         }
1310 
1311         if (vmf->cow_page) {
1312                 sector_t sector = dax_iomap_sector(&iomap, pos);
1313 
1314                 switch (iomap.type) {
1315                 case IOMAP_HOLE:
1316                 case IOMAP_UNWRITTEN:
1317                         clear_user_highpage(vmf->cow_page, vaddr);
1318                         break;
1319                 case IOMAP_MAPPED:
1320                         error = copy_user_dax(iomap.bdev, iomap.dax_dev,
1321                                         sector, PAGE_SIZE, vmf->cow_page, vaddr);
1322                         break;
1323                 default:
1324                         WARN_ON_ONCE(1);
1325                         error = -EIO;
1326                         break;
1327                 }
1328 
1329                 if (error)
1330                         goto error_finish_iomap;
1331 
1332                 __SetPageUptodate(vmf->cow_page);
1333                 ret = finish_fault(vmf);
1334                 if (!ret)
1335                         ret = VM_FAULT_DONE_COW;
1336                 goto finish_iomap;
1337         }
1338 
1339         sync = dax_fault_is_synchronous(flags, vma, &iomap);
1340 
1341         switch (iomap.type) {
1342         case IOMAP_MAPPED:
1343                 if (iomap.flags & IOMAP_F_NEW) {
1344                         count_vm_event(PGMAJFAULT);
1345                         count_memcg_event_mm(vma->vm_mm, PGMAJFAULT);
1346                         major = VM_FAULT_MAJOR;
1347                 }
1348                 error = dax_iomap_pfn(&iomap, pos, PAGE_SIZE, &pfn);
1349                 if (error < 0)
1350                         goto error_finish_iomap;
1351 
1352                 entry = dax_insert_entry(&xas, mapping, vmf, entry, pfn,
1353                                                  0, write && !sync);
1354 
1355                 
1356 
1357 
1358 
1359 
1360 
1361                 if (sync) {
1362                         if (WARN_ON_ONCE(!pfnp)) {
1363                                 error = -EIO;
1364                                 goto error_finish_iomap;
1365                         }
1366                         *pfnp = pfn;
1367                         ret = VM_FAULT_NEEDDSYNC | major;
1368                         goto finish_iomap;
1369                 }
1370                 trace_dax_insert_mapping(inode, vmf, entry);
1371                 if (write)
1372                         ret = vmf_insert_mixed_mkwrite(vma, vaddr, pfn);
1373                 else
1374                         ret = vmf_insert_mixed(vma, vaddr, pfn);
1375 
1376                 goto finish_iomap;
1377         case IOMAP_UNWRITTEN:
1378         case IOMAP_HOLE:
1379                 if (!write) {
1380                         ret = dax_load_hole(&xas, mapping, &entry, vmf);
1381                         goto finish_iomap;
1382                 }
1383                 /*FALLTHRU*/
1384         default:
1385                 WARN_ON_ONCE(1);
1386                 error = -EIO;
1387                 break;
1388         }
1389 
1390  error_finish_iomap:
1391         ret = dax_fault_return(error);
1392  finish_iomap:
1393         if (ops->iomap_end) {
1394                 int copied = PAGE_SIZE;
1395 
1396                 if (ret & VM_FAULT_ERROR)
1397                         copied = 0;
1398                 
1399 
1400 
1401 
1402 
1403 
1404                 ops->iomap_end(inode, pos, PAGE_SIZE, copied, flags, &iomap);
1405         }
1406  unlock_entry:
1407         dax_unlock_entry(&xas, entry);
1408  out:
1409         trace_dax_pte_fault_done(inode, vmf, ret);
1410         return ret | major;
1411 }
1412 
1413 #ifdef CONFIG_FS_DAX_PMD
1414 static vm_fault_t dax_pmd_load_hole(struct xa_state *xas, struct vm_fault *vmf,
1415                 struct iomap *iomap, void **entry)
1416 {
1417         struct address_space *mapping = vmf->vma->vm_file->f_mapping;
1418         unsigned long pmd_addr = vmf->address & PMD_MASK;
1419         struct vm_area_struct *vma = vmf->vma;
1420         struct inode *inode = mapping->host;
1421         pgtable_t pgtable = NULL;
1422         struct page *zero_page;
1423         spinlock_t *ptl;
1424         pmd_t pmd_entry;
1425         pfn_t pfn;
1426 
1427         zero_page = mm_get_huge_zero_page(vmf->vma->vm_mm);
1428 
1429         if (unlikely(!zero_page))
1430                 goto fallback;
1431 
1432         pfn = page_to_pfn_t(zero_page);
1433         *entry = dax_insert_entry(xas, mapping, vmf, *entry, pfn,
1434                         DAX_PMD | DAX_ZERO_PAGE, false);
1435 
1436         if (arch_needs_pgtable_deposit()) {
1437                 pgtable = pte_alloc_one(vma->vm_mm);
1438                 if (!pgtable)
1439                         return VM_FAULT_OOM;
1440         }
1441 
1442         ptl = pmd_lock(vmf->vma->vm_mm, vmf->pmd);
1443         if (!pmd_none(*(vmf->pmd))) {
1444                 spin_unlock(ptl);
1445                 goto fallback;
1446         }
1447 
1448         if (pgtable) {
1449                 pgtable_trans_huge_deposit(vma->vm_mm, vmf->pmd, pgtable);
1450                 mm_inc_nr_ptes(vma->vm_mm);
1451         }
1452         pmd_entry = mk_pmd(zero_page, vmf->vma->vm_page_prot);
1453         pmd_entry = pmd_mkhuge(pmd_entry);
1454         set_pmd_at(vmf->vma->vm_mm, pmd_addr, vmf->pmd, pmd_entry);
1455         spin_unlock(ptl);
1456         trace_dax_pmd_load_hole(inode, vmf, zero_page, *entry);
1457         return VM_FAULT_NOPAGE;
1458 
1459 fallback:
1460         if (pgtable)
1461                 pte_free(vma->vm_mm, pgtable);
1462         trace_dax_pmd_load_hole_fallback(inode, vmf, zero_page, *entry);
1463         return VM_FAULT_FALLBACK;
1464 }
1465 
1466 static vm_fault_t dax_iomap_pmd_fault(struct vm_fault *vmf, pfn_t *pfnp,
1467                                const struct iomap_ops *ops)
1468 {
1469         struct vm_area_struct *vma = vmf->vma;
1470         struct address_space *mapping = vma->vm_file->f_mapping;
1471         XA_STATE_ORDER(xas, &mapping->i_pages, vmf->pgoff, PMD_ORDER);
1472         unsigned long pmd_addr = vmf->address & PMD_MASK;
1473         bool write = vmf->flags & FAULT_FLAG_WRITE;
1474         bool sync;
1475         unsigned int iomap_flags = (write ? IOMAP_WRITE : 0) | IOMAP_FAULT;
1476         struct inode *inode = mapping->host;
1477         vm_fault_t result = VM_FAULT_FALLBACK;
1478         struct iomap iomap = { 0 };
1479         pgoff_t max_pgoff;
1480         void *entry;
1481         loff_t pos;
1482         int error;
1483         pfn_t pfn;
1484 
1485         
1486 
1487 
1488 
1489 
1490         max_pgoff = DIV_ROUND_UP(i_size_read(inode), PAGE_SIZE);
1491 
1492         trace_dax_pmd_fault(inode, vmf, max_pgoff, 0);
1493 
1494         
1495 
1496 
1497 
1498 
1499 
1500         if ((vmf->pgoff & PG_PMD_COLOUR) !=
1501             ((vmf->address >> PAGE_SHIFT) & PG_PMD_COLOUR))
1502                 goto fallback;
1503 
1504         
1505         if (write && !(vma->vm_flags & VM_SHARED))
1506                 goto fallback;
1507 
1508         
1509         if (pmd_addr < vma->vm_start)
1510                 goto fallback;
1511         if ((pmd_addr + PMD_SIZE) > vma->vm_end)
1512                 goto fallback;
1513 
1514         if (xas.xa_index >= max_pgoff) {
1515                 result = VM_FAULT_SIGBUS;
1516                 goto out;
1517         }
1518 
1519         
1520         if ((xas.xa_index | PG_PMD_COLOUR) >= max_pgoff)
1521                 goto fallback;
1522 
1523         
1524 
1525 
1526 
1527 
1528 
1529         entry = grab_mapping_entry(&xas, mapping, PMD_ORDER);
1530         if (xa_is_internal(entry)) {
1531                 result = xa_to_internal(entry);
1532                 goto fallback;
1533         }
1534 
1535         
1536 
1537 
1538 
1539 
1540 
1541         if (!pmd_none(*vmf->pmd) && !pmd_trans_huge(*vmf->pmd) &&
1542                         !pmd_devmap(*vmf->pmd)) {
1543                 result = 0;
1544                 goto unlock_entry;
1545         }
1546 
1547         
1548 
1549 
1550 
1551 
1552         pos = (loff_t)xas.xa_index << PAGE_SHIFT;
1553         error = ops->iomap_begin(inode, pos, PMD_SIZE, iomap_flags, &iomap);
1554         if (error)
1555                 goto unlock_entry;
1556 
1557         if (iomap.offset + iomap.length < pos + PMD_SIZE)
1558                 goto finish_iomap;
1559 
1560         sync = dax_fault_is_synchronous(iomap_flags, vma, &iomap);
1561 
1562         switch (iomap.type) {
1563         case IOMAP_MAPPED:
1564                 error = dax_iomap_pfn(&iomap, pos, PMD_SIZE, &pfn);
1565                 if (error < 0)
1566                         goto finish_iomap;
1567 
1568                 entry = dax_insert_entry(&xas, mapping, vmf, entry, pfn,
1569                                                 DAX_PMD, write && !sync);
1570 
1571                 
1572 
1573 
1574 
1575 
1576 
1577                 if (sync) {
1578                         if (WARN_ON_ONCE(!pfnp))
1579                                 goto finish_iomap;
1580                         *pfnp = pfn;
1581                         result = VM_FAULT_NEEDDSYNC;
1582                         goto finish_iomap;
1583                 }
1584 
1585                 trace_dax_pmd_insert_mapping(inode, vmf, PMD_SIZE, pfn, entry);
1586                 result = vmf_insert_pfn_pmd(vmf, pfn, write);
1587                 break;
1588         case IOMAP_UNWRITTEN:
1589         case IOMAP_HOLE:
1590                 if (WARN_ON_ONCE(write))
1591                         break;
1592                 result = dax_pmd_load_hole(&xas, vmf, &iomap, &entry);
1593                 break;
1594         default:
1595                 WARN_ON_ONCE(1);
1596                 break;
1597         }
1598 
1599  finish_iomap:
1600         if (ops->iomap_end) {
1601                 int copied = PMD_SIZE;
1602 
1603                 if (result == VM_FAULT_FALLBACK)
1604                         copied = 0;
1605                 
1606 
1607 
1608 
1609 
1610 
1611                 ops->iomap_end(inode, pos, PMD_SIZE, copied, iomap_flags,
1612                                 &iomap);
1613         }
1614  unlock_entry:
1615         dax_unlock_entry(&xas, entry);
1616  fallback:
1617         if (result == VM_FAULT_FALLBACK) {
1618                 split_huge_pmd(vma, vmf->pmd, vmf->address);
1619                 count_vm_event(THP_FAULT_FALLBACK);
1620         }
1621 out:
1622         trace_dax_pmd_fault_done(inode, vmf, max_pgoff, result);
1623         return result;
1624 }
1625 #else
1626 static vm_fault_t dax_iomap_pmd_fault(struct vm_fault *vmf, pfn_t *pfnp,
1627                                const struct iomap_ops *ops)
1628 {
1629         return VM_FAULT_FALLBACK;
1630 }
1631 #endif /* CONFIG_FS_DAX_PMD */
1632 
1633 /**
1634  * dax_iomap_fault - handle a page fault on a DAX file
1635  * @vmf: The description of the fault
1636  * @pe_size: Size of the page to fault in
1637  * @pfnp: PFN to insert for synchronous faults if fsync is required
1638  * @iomap_errp: Storage for detailed error code in case of error
1639  * @ops: Iomap ops passed from the file system
1640  *
1641  * When a page fault occurs, filesystems may call this helper in
1642  * their fault handler for DAX files. dax_iomap_fault() assumes the caller
1643  * has done all the necessary locking for page fault to proceed
1644  * successfully.
1645  */
1646 vm_fault_t dax_iomap_fault(struct vm_fault *vmf, enum page_entry_size pe_size,
1647                     pfn_t *pfnp, int *iomap_errp, const struct iomap_ops *ops)
1648 {
1649         switch (pe_size) {
1650         case PE_SIZE_PTE:
1651                 return dax_iomap_pte_fault(vmf, pfnp, iomap_errp, ops);
1652         case PE_SIZE_PMD:
1653                 return dax_iomap_pmd_fault(vmf, pfnp, ops);
1654         default:
1655                 return VM_FAULT_FALLBACK;
1656         }
1657 }
1658 EXPORT_SYMBOL_GPL(dax_iomap_fault);
1659 
1660 /*
1661  * dax_insert_pfn_mkwrite - insert PTE or PMD entry into page tables
1662  * @vmf: The description of the fault
1663  * @pfn: PFN to insert
1664  * @order: Order of entry to insert.
1665  *
1666  * This function inserts a writeable PTE or PMD entry into the page tables
1667  * for an mmaped DAX file.  It also marks the page cache entry as dirty.
1668  */
1669 static vm_fault_t
1670 dax_insert_pfn_mkwrite(struct vm_fault *vmf, pfn_t pfn, unsigned int order)
1671 {
1672         struct address_space *mapping = vmf->vma->vm_file->f_mapping;
1673         XA_STATE_ORDER(xas, &mapping->i_pages, vmf->pgoff, order);
1674         void *entry;
1675         vm_fault_t ret;
1676 
1677         xas_lock_irq(&xas);
1678         entry = get_unlocked_entry(&xas, order);
1679         
1680         if (!entry || dax_is_conflict(entry) ||
1681             (order == 0 && !dax_is_pte_entry(entry))) {
1682                 put_unlocked_entry(&xas, entry);
1683                 xas_unlock_irq(&xas);
1684                 trace_dax_insert_pfn_mkwrite_no_entry(mapping->host, vmf,
1685                                                       VM_FAULT_NOPAGE);
1686                 return VM_FAULT_NOPAGE;
1687         }
1688         xas_set_mark(&xas, PAGECACHE_TAG_DIRTY);
1689         dax_lock_entry(&xas, entry);
1690         xas_unlock_irq(&xas);
1691         if (order == 0)
1692                 ret = vmf_insert_mixed_mkwrite(vmf->vma, vmf->address, pfn);
1693 #ifdef CONFIG_FS_DAX_PMD
1694         else if (order == PMD_ORDER)
1695                 ret = vmf_insert_pfn_pmd(vmf, pfn, FAULT_FLAG_WRITE);
1696 #endif
1697         else
1698                 ret = VM_FAULT_FALLBACK;
1699         dax_unlock_entry(&xas, entry);
1700         trace_dax_insert_pfn_mkwrite(mapping->host, vmf, ret);
1701         return ret;
1702 }
1703 
1704 /**
1705  * dax_finish_sync_fault - finish synchronous page fault
1706  * @vmf: The description of the fault
1707  * @pe_size: Size of entry to be inserted
1708  * @pfn: PFN to insert
1709  *
1710  * This function ensures that the file range touched by the page fault is
1711  * stored persistently on the media and handles inserting of appropriate
1712  * page table entry.
1713  */
1714 vm_fault_t dax_finish_sync_fault(struct vm_fault *vmf,
1715                 enum page_entry_size pe_size, pfn_t pfn)
1716 {
1717         int err;
1718         loff_t start = ((loff_t)vmf->pgoff) << PAGE_SHIFT;
1719         unsigned int order = pe_order(pe_size);
1720         size_t len = PAGE_SIZE << order;
1721 
1722         err = vfs_fsync_range(vmf->vma->vm_file, start, start + len - 1, 1);
1723         if (err)
1724                 return VM_FAULT_SIGBUS;
1725         return dax_insert_pfn_mkwrite(vmf, pfn, order);
1726 }
1727 EXPORT_SYMBOL_GPL(dax_finish_sync_fault);
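
A hedged sketch of a filesystem fault handler built on dax_iomap_fault() and dax_finish_sync_fault(), in the spirit of the ext4/xfs DAX fault paths; the handler name, example_iomap_ops and the locking around the call are assumptions of this sketch.

extern const struct iomap_ops example_iomap_ops;   /* provided by the filesystem */

static vm_fault_t example_dax_huge_fault(struct vm_fault *vmf,
                                         enum page_entry_size pe_size)
{
        struct inode *inode = file_inode(vmf->vma->vm_file);
        bool write = vmf->flags & FAULT_FLAG_WRITE;
        vm_fault_t ret;
        pfn_t pfn;

        if (write) {
                sb_start_pagefault(inode->i_sb);
                file_update_time(vmf->vma->vm_file);
        }

        ret = dax_iomap_fault(vmf, pe_size, &pfn, NULL, &example_iomap_ops);

        /*
         * For MAP_SYNC mappings a synchronous write fault returns
         * VM_FAULT_NEEDDSYNC instead of installing the PTE/PMD; the
         * filesystem must make its metadata durable and then insert the
         * entry, which dax_finish_sync_fault() does via vfs_fsync_range().
         */
        if (ret & VM_FAULT_NEEDDSYNC)
                ret = dax_finish_sync_fault(vmf, pe_size, pfn);

        if (write)
                sb_end_pagefault(inode->i_sb);
        return ret;
}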