/*
 * linux/mm/compaction.c
 *
 * Memory compaction for the reduction of external fragmentation. Note that
 * this heavily depends upon page migration to do all the real heavy
 * lifting
 *
 * Copyright IBM Corp. 2007-2010 Mel Gorman <mel@csn.ul.ie>
 */
#include <linux/swap.h>
#include <linux/migrate.h>
#include <linux/compaction.h>
#include <linux/mm_inline.h>
#include <linux/backing-dev.h>
#include <linux/sysctl.h>
#include <linux/sysfs.h>
#include <linux/balloon_compaction.h>
#include <linux/page-isolation.h>
#include <linux/kasan.h>
#include "internal.h"

#ifdef CONFIG_COMPACTION
static inline void count_compact_event(enum vm_event_item item)
{
	count_vm_event(item);
}

static inline void count_compact_events(enum vm_event_item item, long delta)
{
	count_vm_events(item, delta);
}
#else
#define count_compact_event(item) do { } while (0)
#define count_compact_events(item, delta) do { } while (0)
#endif

#if defined CONFIG_COMPACTION || defined CONFIG_CMA
#ifdef CONFIG_TRACEPOINTS
static const char *const compaction_status_string[] = {
	"deferred",
	"skipped",
	"continue",
	"partial",
	"complete",
	"no_suitable_page",
	"not_suitable_zone",
};
#endif

#define CREATE_TRACE_POINTS
#include <trace/events/compaction.h>

static unsigned long release_freepages(struct list_head *freelist)
{
	struct page *page, *next;
	unsigned long high_pfn = 0;

	list_for_each_entry_safe(page, next, freelist, lru) {
		unsigned long pfn = page_to_pfn(page);
		list_del(&page->lru);
		__free_page(page);
		if (pfn > high_pfn)
			high_pfn = pfn;
	}

	return high_pfn;
}

static void map_pages(struct list_head *list)
{
	struct page *page;

	list_for_each_entry(page, list, lru) {
		arch_alloc_page(page, 0);
		kernel_map_pages(page, 1, 1);
		kasan_alloc_pages(page, 0);
	}
}

static inline bool migrate_async_suitable(int migratetype)
{
	return is_migrate_cma(migratetype) || migratetype == MIGRATE_MOVABLE;
}
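
/*
 * Note: "async suitable" pageblocks (MIGRATE_CMA or MIGRATE_MOVABLE) are the
 * only ones the async migration scanner will consider; see the check in
 * isolate_migratepages(). They are also what the free scanner accepts as
 * migration targets in suitable_migration_target().
 */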

/*
 * Check that the whole (or subset of) a pageblock given by the interval of
 * [start_pfn, end_pfn) is valid and within the same zone, before scanning it
 * with the migration or free compaction scanner. The scanners then need to
 * use only pfn_valid_within() check for arches that allow holes within
 * pageblocks.
 *
 * Return struct page pointer of start_pfn, or NULL if checks were not passed.
 *
 * It's possible on some configurations to have a setup like node0 node1 node0
 * i.e. it's possible that all pages within a zone's range of pages do not
 * belong to a single zone. We assume that a border between node0 and node1
 * can occur within a single pageblock, but not a node0 node1 node0
 * interleaving within a single pageblock. It is therefore sufficient to check
 * the first and last page of a pageblock and avoid checking each individual
 * page in a pageblock.
 */
static struct page *pageblock_pfn_to_page(unsigned long start_pfn,
				unsigned long end_pfn, struct zone *zone)
{
	struct page *start_page;
	struct page *end_page;

	/* end_pfn is one past the range we are checking */
	end_pfn--;

	if (!pfn_valid(start_pfn) || !pfn_valid(end_pfn))
		return NULL;

	start_page = pfn_to_page(start_pfn);

	if (page_zone(start_page) != zone)
		return NULL;

	end_page = pfn_to_page(end_pfn);

	/* This gives a shorter code than deriving page_zone(end_page) */
	if (page_zone_id(start_page) != page_zone_id(end_page))
		return NULL;

	return start_page;
}

#ifdef CONFIG_COMPACTION

/* Do not skip compaction more than 64 times */
#define COMPACT_MAX_DEFER_SHIFT 6

/*
 * Compaction is deferred when compaction fails to result in a page
 * allocation success. 1 << compact_defer_shift compactions are skipped up
 * to a limit of 1 << COMPACT_MAX_DEFER_SHIFT
 */
void defer_compaction(struct zone *zone, int order)
{
	zone->compact_considered = 0;
	zone->compact_defer_shift++;

	if (order < zone->compact_order_failed)
		zone->compact_order_failed = order;

	if (zone->compact_defer_shift > COMPACT_MAX_DEFER_SHIFT)
		zone->compact_defer_shift = COMPACT_MAX_DEFER_SHIFT;

	trace_mm_compaction_defer_compaction(zone, order);
}

/* Returns true if compaction should be skipped this time */
bool compaction_deferred(struct zone *zone, int order)
{
	unsigned long defer_limit = 1UL << zone->compact_defer_shift;

	if (order < zone->compact_order_failed)
		return false;

	/* Avoid possible overflow */
	if (++zone->compact_considered > defer_limit)
		zone->compact_considered = defer_limit;

	if (zone->compact_considered >= defer_limit)
		return false;

	trace_mm_compaction_deferred(zone, order);

	return true;
}
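
/*
 * Worked example of the deferral backoff above: with compact_defer_shift
 * == 3, compaction_deferred() keeps returning true until compact_considered
 * reaches 1 << 3 == 8, so roughly eight requests go by before compaction is
 * tried again at this order. Each further failure doubles the window, capped
 * at 1 << COMPACT_MAX_DEFER_SHIFT == 64, and compaction_defer_reset() below
 * clears the backoff once an allocation at that order succeeds (or is
 * expected to).
 */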

/*
 * Update defer tracking counters after successful compaction of given order,
 * which means an allocation either succeeded (alloc_success == true) or is
 * expected to succeed.
 */
void compaction_defer_reset(struct zone *zone, int order,
		bool alloc_success)
{
	if (alloc_success) {
		zone->compact_considered = 0;
		zone->compact_defer_shift = 0;
	}
	if (order >= zone->compact_order_failed)
		zone->compact_order_failed = order + 1;

	trace_mm_compaction_defer_reset(zone, order);
}

/* Returns true if restarting compaction after many failures */
bool compaction_restarting(struct zone *zone, int order)
{
	if (order < zone->compact_order_failed)
		return false;

	return zone->compact_defer_shift == COMPACT_MAX_DEFER_SHIFT &&
		zone->compact_considered >= 1UL << zone->compact_defer_shift;
}

/* Returns true if the pageblock should be scanned for pages to isolate. */
static inline bool isolation_suitable(struct compact_control *cc,
					struct page *page)
{
	if (cc->ignore_skip_hint)
		return true;

	return !get_pageblock_skip(page);
}

/*
 * This function is called to clear all cached information on pageblocks that
 * should be skipped for page isolation when the migrate and free page scanner
 * meet.
 */
static void __reset_isolation_suitable(struct zone *zone)
{
	unsigned long start_pfn = zone->zone_start_pfn;
	unsigned long end_pfn = zone_end_pfn(zone);
	unsigned long pfn;

	zone->compact_cached_migrate_pfn[0] = start_pfn;
	zone->compact_cached_migrate_pfn[1] = start_pfn;
	zone->compact_cached_free_pfn = end_pfn;
	zone->compact_blockskip_flush = false;

	/* Walk the zone and mark every pageblock as suitable for isolation */
	for (pfn = start_pfn; pfn < end_pfn; pfn += pageblock_nr_pages) {
		struct page *page;

		cond_resched();

		if (!pfn_valid(pfn))
			continue;

		page = pfn_to_page(pfn);
		if (zone != page_zone(page))
			continue;

		clear_pageblock_skip(page);
	}
}

void reset_isolation_suitable(pg_data_t *pgdat)
{
	int zoneid;

	for (zoneid = 0; zoneid < MAX_NR_ZONES; zoneid++) {
		struct zone *zone = &pgdat->node_zones[zoneid];
		if (!populated_zone(zone))
			continue;

		/* Only flush if a full compaction finished recently */
		if (zone->compact_blockskip_flush)
			__reset_isolation_suitable(zone);
	}
}

/*
 * If no pages were isolated then mark this pageblock to be skipped in the
 * future. The information is later cleared by __reset_isolation_suitable().
 */
static void update_pageblock_skip(struct compact_control *cc,
			struct page *page, unsigned long nr_isolated,
			bool migrate_scanner)
{
	struct zone *zone = cc->zone;
	unsigned long pfn;

	if (cc->ignore_skip_hint)
		return;

	if (!page)
		return;

	if (nr_isolated)
		return;

	set_pageblock_skip(page);

	pfn = page_to_pfn(page);

	/* Update where async and sync compaction should restart */
	if (migrate_scanner) {
		if (pfn > zone->compact_cached_migrate_pfn[0])
			zone->compact_cached_migrate_pfn[0] = pfn;
		if (cc->mode != MIGRATE_ASYNC &&
		    pfn > zone->compact_cached_migrate_pfn[1])
			zone->compact_cached_migrate_pfn[1] = pfn;
	} else {
		if (pfn < zone->compact_cached_free_pfn)
			zone->compact_cached_free_pfn = pfn;
	}
}
#else
static inline bool isolation_suitable(struct compact_control *cc,
					struct page *page)
{
	return true;
}

static void update_pageblock_skip(struct compact_control *cc,
			struct page *page, unsigned long nr_isolated,
			bool migrate_scanner)
{
}
#endif /* CONFIG_COMPACTION */
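
/*
 * Note on the cached scanner positions updated above: the migration scanner
 * walks the zone from low to high PFNs, so compact_cached_migrate_pfn[] only
 * ever moves forward here (separately for async [0] and sync [1] modes),
 * while the free scanner walks from high to low PFNs, so
 * compact_cached_free_pfn only moves backward. Both are reset to the zone
 * boundaries by __reset_isolation_suitable().
 */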

/*
 * Compaction requires the taking of some coarse locks that are potentially
 * very heavily contended. For async compaction, back out if the lock cannot
 * be taken immediately. For sync compaction, spin on the lock if needed.
 *
 * Returns true if the lock is held
 * Returns false if the lock is not held and compaction should abort
 */
static bool compact_trylock_irqsave(spinlock_t *lock, unsigned long *flags,
						struct compact_control *cc)
{
	if (cc->mode == MIGRATE_ASYNC) {
		if (!spin_trylock_irqsave(lock, *flags)) {
			cc->contended = COMPACT_CONTENDED_LOCK;
			return false;
		}
	} else {
		spin_lock_irqsave(lock, *flags);
	}

	return true;
}

/*
 * Compaction requires the taking of some coarse locks that are potentially
 * very heavily contended. The lock should be periodically unlocked to avoid
 * having disabled IRQs for a long time, even when there is nobody waiting on
 * the lock. It might also be that allowing the IRQs will result in
 * need_resched() becoming true. If scheduling is needed, async compaction
 * aborts. Sync compaction schedules.
 * Either compaction type will also abort if a fatal signal is pending.
 * In either case if the lock was locked, it is dropped and not regained.
 *
 * Returns true if compaction should abort due to fatal signal pending, or
 * async compaction due to need_resched()
 * Returns false when compaction can continue (sync compaction might have
 * scheduled)
 */
static bool compact_unlock_should_abort(spinlock_t *lock,
		unsigned long flags, bool *locked, struct compact_control *cc)
{
	if (*locked) {
		spin_unlock_irqrestore(lock, flags);
		*locked = false;
	}

	if (fatal_signal_pending(current)) {
		cc->contended = COMPACT_CONTENDED_SCHED;
		return true;
	}

	if (need_resched()) {
		if (cc->mode == MIGRATE_ASYNC) {
			cc->contended = COMPACT_CONTENDED_SCHED;
			return true;
		}
		cond_resched();
	}

	return false;
}

/*
 * Aside from avoiding lock contention, compaction also periodically checks
 * need_resched() and either schedules in sync compaction or aborts async
 * compaction. This is similar to what compact_unlock_should_abort() does, but
 * is used where no lock is concerned.
 *
 * Returns false when no scheduling was needed, or sync compaction scheduled.
 * Returns true when async compaction should abort.
 */
static inline bool compact_should_abort(struct compact_control *cc)
{
	/* async compaction aborts if contended */
	if (need_resched()) {
		if (cc->mode == MIGRATE_ASYNC) {
			cc->contended = COMPACT_CONTENDED_SCHED;
			return true;
		}

		cond_resched();
	}

	return false;
}

/*
 * Isolate free pages onto a private freelist. If @strict is true, will abort
 * returning 0 on any invalid PFNs or non-free pages inside of the pageblock
 * (even though it may still end up isolating some pages).
 */
static unsigned long isolate_freepages_block(struct compact_control *cc,
				unsigned long *start_pfn,
				unsigned long end_pfn,
				struct list_head *freelist,
				bool strict)
{
	int nr_scanned = 0, total_isolated = 0;
	struct page *cursor, *valid_page = NULL;
	unsigned long flags = 0;
	bool locked = false;
	unsigned long blockpfn = *start_pfn;

	cursor = pfn_to_page(blockpfn);

	/* Isolate free pages. */
	for (; blockpfn < end_pfn; blockpfn++, cursor++) {
		int isolated, i;
		struct page *page = cursor;

		/*
		 * Periodically drop the lock (if held) regardless of its
		 * contention, to give chance to IRQs. Abort if fatal signal
		 * pending or async compaction detects need_resched()
		 */
		if (!(blockpfn % SWAP_CLUSTER_MAX)
		    && compact_unlock_should_abort(&cc->zone->lock, flags,
								&locked, cc))
			break;

		nr_scanned++;
		if (!pfn_valid_within(blockpfn))
			goto isolate_fail;

		if (!valid_page)
			valid_page = page;
		if (!PageBuddy(page))
			goto isolate_fail;

		/*
		 * If we already hold the lock, we can skip some rechecking.
		 * Note that if we hold the lock now, checked_pageblock was
		 * already set in some previous iteration (or strict is true),
		 * so it is correct to skip the suitable migration target
		 * recheck as well.
		 */
		if (!locked) {
			/*
			 * The zone lock must be held to isolate freepages.
			 * Unfortunately this is a very coarse lock and can be
			 * heavily contended if there are parallel allocations
			 * or parallel compactions. For async compaction, do
			 * not spin on the lock; we acquire it as late as
			 * possible.
			 */
			locked = compact_trylock_irqsave(&cc->zone->lock,
								&flags, cc);
			if (!locked)
				break;

			/* Recheck this is a buddy page under lock */
			if (!PageBuddy(page))
				goto isolate_fail;
		}

		/* Found a free page, break it into order-0 pages */
		isolated = split_free_page(page);
		total_isolated += isolated;
		for (i = 0; i < isolated; i++) {
			list_add(&page->lru, freelist);
			page++;
		}

		/* If a page was split, advance to the end of it */
		if (isolated) {
			cc->nr_freepages += isolated;
			if (!strict &&
				cc->nr_migratepages <= cc->nr_freepages) {
				blockpfn += isolated;
				break;
			}

			blockpfn += isolated - 1;
			cursor += isolated - 1;
			continue;
		}

isolate_fail:
		if (strict)
			break;
		else
			continue;

	}

	trace_mm_compaction_isolate_freepages(*start_pfn, blockpfn,
					nr_scanned, total_isolated);

	/* Record how far we have got within the block */
	*start_pfn = blockpfn;

	/*
	 * If strict isolation is requested by CMA then check that all the
	 * pages requested were isolated. If there were any failures, 0 is
	 * returned and CMA will fail.
	 */
	if (strict && blockpfn < end_pfn)
		total_isolated = 0;

	if (locked)
		spin_unlock_irqrestore(&cc->zone->lock, flags);

	/* Update the pageblock-skip if the whole pageblock was scanned */
	if (blockpfn == end_pfn)
		update_pageblock_skip(cc, valid_page, total_isolated, false);

	count_compact_events(COMPACTFREE_SCANNED, nr_scanned);
	if (total_isolated)
		count_compact_events(COMPACTISOLATED, total_isolated);
	return total_isolated;
}
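
/*
 * Worked example for the loop above: if the scanner hits an order-4 buddy
 * page, split_free_page() hands back 16 order-0 pages. blockpfn and cursor
 * are advanced by isolated - 1 == 15, and the for-loop increment then steps
 * past the 16th page, so none of the split pages are rescanned.
 */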

/**
 * isolate_freepages_range() - isolate free pages.
 * @start_pfn: The first PFN to start isolating.
 * @end_pfn:   The one-past-last PFN.
 *
 * Non-free pages, invalid PFNs, or zone boundaries within the
 * [start_pfn, end_pfn) range are considered errors, cause function to
 * undo its actions and return zero.
 *
 * Otherwise, function returns one-past-the-last PFN of isolated page
 * (which may be greater than end_pfn if end fell in a middle of
 * a free page).
 */
unsigned long
isolate_freepages_range(struct compact_control *cc,
			unsigned long start_pfn, unsigned long end_pfn)
{
	unsigned long isolated, pfn, block_end_pfn;
	LIST_HEAD(freelist);

	pfn = start_pfn;
	block_end_pfn = ALIGN(pfn + 1, pageblock_nr_pages);

	for (; pfn < end_pfn; pfn += isolated,
				block_end_pfn += pageblock_nr_pages) {
		/* Protect pfn from changing by isolate_freepages_block */
		unsigned long isolate_start_pfn = pfn;

		block_end_pfn = min(block_end_pfn, end_pfn);

		/*
		 * pfn could pass the block_end_pfn if isolated freepage
		 * is more than pageblock order. In this case, we adjust
		 * scanning range to right one.
		 */
		if (pfn >= block_end_pfn) {
			block_end_pfn = ALIGN(pfn + 1, pageblock_nr_pages);
			block_end_pfn = min(block_end_pfn, end_pfn);
		}

		if (!pageblock_pfn_to_page(pfn, block_end_pfn, cc->zone))
			break;

		isolated = isolate_freepages_block(cc, &isolate_start_pfn,
						block_end_pfn, &freelist, true);

		/*
		 * In strict mode, isolate_freepages_block() returns 0 if
		 * there are any holes in the block (ie. invalid PFNs or
		 * non-free pages).
		 */
		if (!isolated)
			break;

		/*
		 * If we managed to isolate pages, it is always (1 << n) *
		 * pageblock_nr_pages for some non-negative n. (Max order
		 * page may span two pageblocks).
		 */
	}

	/* split_free_page does not map the pages */
	map_pages(&freelist);

	if (pfn < end_pfn) {
		/* Loop terminated early, cleanup. */
		release_freepages(&freelist);
		return 0;
	}

	/* We don't use freelists for anything. */
	return pfn;
}

/* Update the number of anon and file isolated pages in the zone */
static void acct_isolated(struct zone *zone, struct compact_control *cc)
{
	struct page *page;
	unsigned int count[2] = { 0, };

	if (list_empty(&cc->migratepages))
		return;

	list_for_each_entry(page, &cc->migratepages, lru)
		count[!!page_is_file_cache(page)]++;

	mod_zone_page_state(zone, NR_ISOLATED_ANON, count[0]);
	mod_zone_page_state(zone, NR_ISOLATED_FILE, count[1]);
}

/* Similar to reclaim, but different enough that they don't share logic */
static bool too_many_isolated(struct zone *zone)
{
	unsigned long active, inactive, isolated;

	inactive = zone_page_state(zone, NR_INACTIVE_FILE) +
			zone_page_state(zone, NR_INACTIVE_ANON);
	active = zone_page_state(zone, NR_ACTIVE_FILE) +
			zone_page_state(zone, NR_ACTIVE_ANON);
	isolated = zone_page_state(zone, NR_ISOLATED_FILE) +
			zone_page_state(zone, NR_ISOLATED_ANON);

	return isolated > (inactive + active) / 2;
}
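
/*
 * For example, with 900 active+inactive LRU pages in the zone, the migrate
 * scanner backs off (or, for async compaction, gives up) once more than 450
 * pages are currently isolated, mirroring the throttling done in reclaim;
 * see the wait loop at the top of isolate_migratepages_block() below.
 */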

/**
 * isolate_migratepages_block() - isolate all migrate-able pages within
 *				  a single pageblock
 * @cc:		Compaction control structure.
 * @low_pfn:	The first PFN to isolate
 * @end_pfn:	The one-past-the-last PFN to isolate, within same pageblock
 * @isolate_mode: Isolation mode to be used.
 *
 * Isolate all pages that can be migrated from the range specified by
 * [low_pfn, end_pfn). The range is expected to be within same pageblock.
 * Returns zero if there is a fatal signal pending, otherwise PFN of the
 * first page that was not scanned (which may be less than, equal to or more
 * than end_pfn).
 *
 * The pages are isolated on cc->migratepages list (not required to be empty),
 * and cc->nr_migratepages is updated accordingly. The cc->migrate_pfn field
 * is neither read nor updated.
 */
static unsigned long
isolate_migratepages_block(struct compact_control *cc, unsigned long low_pfn,
			unsigned long end_pfn, isolate_mode_t isolate_mode)
{
	struct zone *zone = cc->zone;
	unsigned long nr_scanned = 0, nr_isolated = 0;
	struct list_head *migratelist = &cc->migratepages;
	struct lruvec *lruvec;
	unsigned long flags = 0;
	bool locked = false;
	struct page *page = NULL, *valid_page = NULL;
	unsigned long start_pfn = low_pfn;

	/*
	 * Ensure that there are not too many pages isolated from the LRU
	 * list by either parallel reclaimers or compaction. If there are,
	 * delay for some time until fewer pages are isolated
	 */
	while (unlikely(too_many_isolated(zone))) {
		/* async migration should just abort */
		if (cc->mode == MIGRATE_ASYNC)
			return 0;

		congestion_wait(BLK_RW_ASYNC, HZ/10);

		if (fatal_signal_pending(current))
			return 0;
	}

	if (compact_should_abort(cc))
		return 0;

	/* Time to isolate some pages for migration */
	for (; low_pfn < end_pfn; low_pfn++) {
		/*
		 * Periodically drop the lock (if held) regardless of its
		 * contention, to give chance to IRQs. Abort async compaction
		 * if contended.
		 */
		if (!(low_pfn % SWAP_CLUSTER_MAX)
		    && compact_unlock_should_abort(&zone->lru_lock, flags,
								&locked, cc))
			break;

		if (!pfn_valid_within(low_pfn))
			continue;
		nr_scanned++;

		page = pfn_to_page(low_pfn);

		if (!valid_page)
			valid_page = page;

		/*
		 * Skip if free. We read page order here without zone lock
		 * which is generally unsafe, but the race window is small and
		 * the worst thing that can happen is that we skip some
		 * potential isolation targets.
		 */
		if (PageBuddy(page)) {
			unsigned long freepage_order = page_order_unsafe(page);

			/*
			 * Without lock, we cannot be sure that what we got is
			 * a valid page order. Consider only values in the
			 * valid order range to prevent low_pfn overflow.
			 */
			if (freepage_order > 0 && freepage_order < MAX_ORDER)
				low_pfn += (1UL << freepage_order) - 1;
			continue;
		}

		/*
		 * Check may be lockless but that's ok as we recheck later.
		 * It's possible to migrate LRU pages and balloon pages;
		 * skip any other type of page.
		 */
		if (!PageLRU(page)) {
			if (unlikely(balloon_page_movable(page))) {
				if (balloon_page_isolate(page)) {
					/* Successfully isolated */
					goto isolate_success;
				}
			}
			continue;
		}

		/*
		 * PageLRU is set. lru_lock normally excludes isolation
		 * splitting and collapsing (collapsing has already happened
		 * if PageLRU is set) but the lock is not necessarily taken
		 * here and it is wasteful to take it just to check transhuge.
		 * Check TransHuge without lock and skip the whole pageblock if
		 * it's either a transhuge or hugetlbfs page, as calling
		 * compound_order() without preventing THP from splitting the
		 * page underneath us may return surprising results.
		 */
		if (PageTransHuge(page)) {
			if (!locked)
				low_pfn = ALIGN(low_pfn + 1,
						pageblock_nr_pages) - 1;
			else
				low_pfn += (1 << compound_order(page)) - 1;

			continue;
		}

		/*
		 * Migration will fail if an anonymous page is pinned in memory,
		 * so avoid taking lru_lock and isolating it unnecessarily in an
		 * admittedly racy check.
		 */
		if (!page_mapping(page) &&
		    page_count(page) > page_mapcount(page))
			continue;

		/* If we already hold the lock, we can skip some rechecking */
		if (!locked) {
			locked = compact_trylock_irqsave(&zone->lru_lock,
								&flags, cc);
			if (!locked)
				break;

			/* Recheck PageLRU and PageTransHuge under lock */
			if (!PageLRU(page))
				continue;
			if (PageTransHuge(page)) {
				low_pfn += (1 << compound_order(page)) - 1;
				continue;
			}
		}

		lruvec = mem_cgroup_page_lruvec(page, zone);

		/* Try isolate the page */
		if (__isolate_lru_page(page, isolate_mode) != 0)
			continue;

		VM_BUG_ON_PAGE(PageTransCompound(page), page);

		/* Successfully isolated */
		del_page_from_lru_list(page, lruvec, page_lru(page));

isolate_success:
		list_add(&page->lru, migratelist);
		cc->nr_migratepages++;
		nr_isolated++;

		/* Avoid isolating too much */
		if (cc->nr_migratepages == COMPACT_CLUSTER_MAX) {
			++low_pfn;
			break;
		}
	}

	/*
	 * The PageBuddy() check could have potentially brought us outside
	 * the range to be scanned.
	 */
	if (unlikely(low_pfn > end_pfn))
		low_pfn = end_pfn;

	if (locked)
		spin_unlock_irqrestore(&zone->lru_lock, flags);

	/*
	 * Update the pageblock-skip information and cached scanner pfn,
	 * if the whole pageblock was scanned without isolating any page.
	 */
	if (low_pfn == end_pfn)
		update_pageblock_skip(cc, valid_page, nr_isolated, true);

	trace_mm_compaction_isolate_migratepages(start_pfn, low_pfn,
						nr_scanned, nr_isolated);

	count_compact_events(COMPACTMIGRATE_SCANNED, nr_scanned);
	if (nr_isolated)
		count_compact_events(COMPACTISOLATED, nr_isolated);

	return low_pfn;
}

/**
 * isolate_migratepages_range() - isolate migrate-able pages in a PFN range
 * @cc:        Compaction control structure.
 * @start_pfn: The first PFN to start isolating.
 * @end_pfn:   The one-past-last PFN.
 *
 * Returns zero if isolation fails fatally due to e.g. pending signal.
 * Otherwise, function returns one-past-the-last PFN of isolated page
 * (which may be greater than end_pfn if end fell in a middle of a THP page).
 */
unsigned long
isolate_migratepages_range(struct compact_control *cc, unsigned long start_pfn,
							unsigned long end_pfn)
{
	unsigned long pfn, block_end_pfn;

	/* Scan block by block. First and last block may be incomplete */
	pfn = start_pfn;
	block_end_pfn = ALIGN(pfn + 1, pageblock_nr_pages);

	for (; pfn < end_pfn; pfn = block_end_pfn,
				block_end_pfn += pageblock_nr_pages) {

		block_end_pfn = min(block_end_pfn, end_pfn);

		if (!pageblock_pfn_to_page(pfn, block_end_pfn, cc->zone))
			continue;

		pfn = isolate_migratepages_block(cc, pfn, block_end_pfn,
							ISOLATE_UNEVICTABLE);

		if (!pfn)
			break;

		if (cc->nr_migratepages == COMPACT_CLUSTER_MAX)
			break;
	}
	acct_isolated(cc->zone, cc);

	return pfn;
}
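
/*
 * isolate_freepages_range() and isolate_migratepages_range() above are the
 * PFN-range based entry points (used, for example, by the CMA /
 * alloc_contig_range() path), as opposed to the pageblock-at-a-time scanners
 * used by compaction proper below.
 */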

#endif /* CONFIG_COMPACTION || CONFIG_CMA */
#ifdef CONFIG_COMPACTION

/* Returns true if the page is within a block suitable for migration to */
static bool suitable_migration_target(struct page *page)
{
	/* If the page is a large free page, then disallow migration */
	if (PageBuddy(page)) {
		/*
		 * We are checking page_order without zone->lock taken. But
		 * the only small danger is that we skip a potentially suitable
		 * pageblock, so it's not worth checking that the order is in
		 * a valid range.
		 */
		if (page_order_unsafe(page) >= pageblock_order)
			return false;
	}

	/* If the block is MIGRATE_MOVABLE or MIGRATE_CMA, allow migration */
	if (migrate_async_suitable(get_pageblock_migratetype(page)))
		return true;

	/* Otherwise skip the block */
	return false;
}

/*
 * Based on information in the current compact_control, find blocks
 * suitable for isolating free pages from and then isolate them.
 */
static void isolate_freepages(struct compact_control *cc)
{
	struct zone *zone = cc->zone;
	struct page *page;
	unsigned long block_start_pfn;	/* start of current pageblock */
	unsigned long isolate_start_pfn; /* exact pfn we start at */
	unsigned long block_end_pfn;	/* end of current pageblock */
	unsigned long low_pfn;	     /* lowest pfn scanner is able to scan */
	struct list_head *freelist = &cc->freepages;

	/*
	 * Initialise the free scanner. The starting point is where we last
	 * successfully isolated from, zone-cached value, or the end of the
	 * zone when isolating for the first time. For looping we also need
	 * this pfn aligned down to the pageblock boundary, because we do
	 * block_start_pfn -= pageblock_nr_pages in the for loop.
	 * For ending point, take care when isolating in the last pageblock
	 * of a zone which ends in the middle of a pageblock.
	 * The low boundary is the end of the pageblock the migration scanner
	 * is using.
	 */
	isolate_start_pfn = cc->free_pfn;
	block_start_pfn = cc->free_pfn & ~(pageblock_nr_pages-1);
	block_end_pfn = min(block_start_pfn + pageblock_nr_pages,
						zone_end_pfn(zone));
	low_pfn = ALIGN(cc->migrate_pfn + 1, pageblock_nr_pages);

	/*
	 * Isolate free pages until enough are available to migrate the
	 * pages on cc->migratepages. We stop searching if the migrate
	 * and free page scanners meet or enough free pages are isolated.
	 */
	for (; block_start_pfn >= low_pfn &&
			cc->nr_migratepages > cc->nr_freepages;
				block_end_pfn = block_start_pfn,
				block_start_pfn -= pageblock_nr_pages,
				isolate_start_pfn = block_start_pfn) {

		/*
		 * This can iterate a massively long zone without finding any
		 * suitable migration targets, so periodically check if we need
		 * to schedule, or even abort async compaction.
		 */
		if (!(block_start_pfn % (SWAP_CLUSTER_MAX * pageblock_nr_pages))
						&& compact_should_abort(cc))
			break;

		page = pageblock_pfn_to_page(block_start_pfn, block_end_pfn,
									zone);
		if (!page)
			continue;

		/* Check the block is suitable for migration */
		if (!suitable_migration_target(page))
			continue;

		/* If isolation recently failed, do not retry */
		if (!isolation_suitable(cc, page))
			continue;

		/* Found a block suitable for isolating free pages from. */
		isolate_freepages_block(cc, &isolate_start_pfn,
					block_end_pfn, freelist, false);

		/*
		 * Remember where the free scanner should restart next time,
		 * which is where isolate_freepages_block() left off.
		 * But if it scanned the whole pageblock, isolate_start_pfn
		 * now points at block_end_pfn, which is the start of the next
		 * pageblock.
		 * In that case we will however want to restart at the start
		 * of the previous pageblock.
		 */
		cc->free_pfn = (isolate_start_pfn < block_end_pfn) ?
			isolate_start_pfn :
			block_start_pfn - pageblock_nr_pages;

		/*
		 * isolate_freepages_block() might have aborted due to async
		 * compaction being contended
		 */
		if (cc->contended)
			break;
	}

	/* split_free_page does not map the pages */
	map_pages(freelist);

	/*
	 * If we crossed the migrate scanner, we want to keep it that way
	 * so that compact_finished() may detect this
	 */
	if (block_start_pfn < low_pfn)
		cc->free_pfn = cc->migrate_pfn;
}

/*
 * This is a migrate-callback that "allocates" freepages by taking pages
 * from the isolated freelists in the block we are migrating to.
 */
static struct page *compaction_alloc(struct page *migratepage,
					unsigned long data,
					int **result)
{
	struct compact_control *cc = (struct compact_control *)data;
	struct page *freepage;

	/*
	 * Isolate free pages if necessary, and if we are not aborting due to
	 * contention.
	 */
	if (list_empty(&cc->freepages)) {
		if (!cc->contended)
			isolate_freepages(cc);

		if (list_empty(&cc->freepages))
			return NULL;
	}

	freepage = list_entry(cc->freepages.next, struct page, lru);
	list_del(&freepage->lru);
	cc->nr_freepages--;

	return freepage;
}

/*
 * This is a migrate-callback that "frees" freepages back to the isolated
 * freelist. All pages on the freelist are from the same zone, so there is no
 * special handling needed for NUMA.
 */
static void compaction_free(struct page *page, unsigned long data)
{
	struct compact_control *cc = (struct compact_control *)data;

	list_add(&page->lru, &cc->freepages);
	cc->nr_freepages++;
}

/* possible outcome of isolate_migratepages */
typedef enum {
	ISOLATE_ABORT,		/* Abort compaction now */
	ISOLATE_NONE,		/* No pages isolated, continue scanning */
	ISOLATE_SUCCESS,	/* Pages isolated, migrate */
} isolate_migrate_t;

/*
 * Allow userspace to control policy on scanning the unevictable LRU for
 * compactable pages.
 */
int sysctl_compact_unevictable_allowed __read_mostly = 1;
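
/*
 * When the sysctl above is enabled (the default), the migrate scanner adds
 * ISOLATE_UNEVICTABLE to its isolation mode in isolate_migratepages() below,
 * so unevictable (e.g. mlocked) pages are also considered for migration.
 * The knob is exposed as /proc/sys/vm/compact_unevictable_allowed.
 */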

/*
 * Isolate all pages that can be migrated from the first suitable block,
 * starting at the block pointed to by the migrate scanner pfn within
 * compact_control.
 */
static isolate_migrate_t isolate_migratepages(struct zone *zone,
					struct compact_control *cc)
{
	unsigned long low_pfn, end_pfn;
	struct page *page;
	const isolate_mode_t isolate_mode =
		(sysctl_compact_unevictable_allowed ? ISOLATE_UNEVICTABLE : 0) |
		(cc->mode == MIGRATE_ASYNC ? ISOLATE_ASYNC_MIGRATE : 0);

	/*
	 * Start at where we last stopped, or beginning of the zone as
	 * initialized by compact_zone()
	 */
	low_pfn = cc->migrate_pfn;

	/* Only scan within a pageblock boundary */
	end_pfn = ALIGN(low_pfn + 1, pageblock_nr_pages);

	/*
	 * Iterate over whole pageblocks until we find the first suitable.
	 * Do not cross the free scanner.
	 */
	for (; end_pfn <= cc->free_pfn;
			low_pfn = end_pfn, end_pfn += pageblock_nr_pages) {

		/*
		 * This can potentially iterate a massively long zone with
		 * many pageblocks unsuitable, so periodically check if we
		 * need to schedule, or even abort async compaction.
		 */
		if (!(low_pfn % (SWAP_CLUSTER_MAX * pageblock_nr_pages))
						&& compact_should_abort(cc))
			break;

		page = pageblock_pfn_to_page(low_pfn, end_pfn, zone);
		if (!page)
			continue;

		/* If isolation recently failed, do not retry */
		if (!isolation_suitable(cc, page))
			continue;

		/*
		 * For async compaction, also only scan in MOVABLE blocks.
		 * Async compaction is optimistic to see if the minimum amount
		 * of work satisfies the allocation.
		 */
		if (cc->mode == MIGRATE_ASYNC &&
		    !migrate_async_suitable(get_pageblock_migratetype(page)))
			continue;

		/* Perform the isolation */
		low_pfn = isolate_migratepages_block(cc, low_pfn, end_pfn,
							isolate_mode);

		if (!low_pfn || cc->contended) {
			acct_isolated(zone, cc);
			return ISOLATE_ABORT;
		}

		/*
		 * Either we isolated something and proceed with migration. Or
		 * we failed and compact_zone should decide if we should
		 * continue or not.
		 */
		break;
	}

	acct_isolated(zone, cc);
	/*
	 * Record where migration scanner will be restarted. If we end up in
	 * the same pageblock as the free scanner, make the scanners fully
	 * meet so that compact_finished() terminates compaction.
	 */
	cc->migrate_pfn = (end_pfn <= cc->free_pfn) ? low_pfn : cc->free_pfn;

	return cc->nr_migratepages ? ISOLATE_SUCCESS : ISOLATE_NONE;
}

static int __compact_finished(struct zone *zone, struct compact_control *cc,
			    const int migratetype)
{
	unsigned int order;
	unsigned long watermark;

	if (cc->contended || fatal_signal_pending(current))
		return COMPACT_PARTIAL;

	/* Compaction run completes if the migrate and free scanner meet */
	if (cc->free_pfn <= cc->migrate_pfn) {
		/* Let the next compaction start anew. */
		zone->compact_cached_migrate_pfn[0] = zone->zone_start_pfn;
		zone->compact_cached_migrate_pfn[1] = zone->zone_start_pfn;
		zone->compact_cached_free_pfn = zone_end_pfn(zone);

		/*
		 * Mark that the PG_migrate_skip information should be cleared
		 * by kswapd when it goes to sleep. kswapd does not set the
		 * flag itself as the decision to clear it should be based
		 * directly on an allocation request.
		 */
		if (!current_is_kswapd())
			zone->compact_blockskip_flush = true;

		return COMPACT_COMPLETE;
	}

	/*
	 * order == -1 is expected when compacting via
	 * /proc/sys/vm/compact_memory
	 */
	if (cc->order == -1)
		return COMPACT_CONTINUE;

	/* Compaction run is not finished if the watermark is not met */
	watermark = low_wmark_pages(zone);

	if (!zone_watermark_ok(zone, cc->order, watermark, cc->classzone_idx,
							cc->alloc_flags))
		return COMPACT_CONTINUE;

	/* Direct compactor: Is a suitable page free? */
	for (order = cc->order; order < MAX_ORDER; order++) {
		struct free_area *area = &zone->free_area[order];
		bool can_steal;

		/* Job done if page is free of the right migratetype */
		if (!list_empty(&area->free_list[migratetype]))
			return COMPACT_PARTIAL;

#ifdef CONFIG_CMA
		/* MIGRATE_MOVABLE can fallback on MIGRATE_CMA */
		if (migratetype == MIGRATE_MOVABLE &&
			!list_empty(&area->free_list[MIGRATE_CMA]))
			return COMPACT_PARTIAL;
#endif
		/*
		 * Job done if allocation would steal freepages from
		 * other migratetype buddy lists.
		 */
		if (find_suitable_fallback(area, order, migratetype,
						true, &can_steal) != -1)
			return COMPACT_PARTIAL;
	}

	return COMPACT_NO_SUITABLE_PAGE;
}
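
/*
 * Summary of the return values above: COMPACT_PARTIAL means a page of the
 * requested order should now be allocatable (or compaction was aborted and
 * the caller should retry the allocation anyway), COMPACT_COMPLETE means the
 * two scanners met without producing such a page, and COMPACT_CONTINUE keeps
 * the main loop in compact_zone() running. COMPACT_NO_SUITABLE_PAGE exists
 * only so compact_finished() can trace it; it is then reported upwards as
 * COMPACT_CONTINUE.
 */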

static int compact_finished(struct zone *zone, struct compact_control *cc,
			    const int migratetype)
{
	int ret;

	ret = __compact_finished(zone, cc, migratetype);
	trace_mm_compaction_finished(zone, cc->order, ret);
	if (ret == COMPACT_NO_SUITABLE_PAGE)
		ret = COMPACT_CONTINUE;

	return ret;
}

/*
 * compaction_suitable: Is this suitable to run compaction on this zone now?
 * Returns
 * COMPACT_SKIPPED  - If there are too few free pages for compaction
 * COMPACT_PARTIAL  - If the allocation would succeed without compaction
 * COMPACT_CONTINUE - If compaction should run now
 */
static unsigned long __compaction_suitable(struct zone *zone, int order,
					int alloc_flags, int classzone_idx)
{
	int fragindex;
	unsigned long watermark;

	/*
	 * order == -1 is expected when compacting via
	 * /proc/sys/vm/compact_memory
	 */
	if (order == -1)
		return COMPACT_CONTINUE;

	watermark = low_wmark_pages(zone);
	/*
	 * If watermarks for high-order allocation are already met, there
	 * should be no need for compaction at all.
	 */
	if (zone_watermark_ok(zone, order, watermark, classzone_idx,
								alloc_flags))
		return COMPACT_PARTIAL;

	/*
	 * Watermarks for order-0 must be met for compaction. Note the 2UL.
	 * This is because during migration, copies of pages need to be
	 * allocated and for a short time, the footprint is higher
	 */
	watermark += (2UL << order);
	if (!zone_watermark_ok(zone, 0, watermark, classzone_idx, alloc_flags))
		return COMPACT_SKIPPED;

	/*
	 * fragmentation index determines if allocation failures are due to
	 * low memory or external fragmentation
	 *
	 * index of -1000 would imply allocations might succeed depending on
	 * watermarks, but we already failed the high-order watermark check
	 * index towards 0 implies failure is due to lack of memory
	 * index towards 1000 implies failure is due to fragmentation
	 *
	 * Only compact if a failure would be due to fragmentation.
	 */
	fragindex = fragmentation_index(zone, order);
	if (fragindex >= 0 && fragindex <= sysctl_extfrag_threshold)
		return COMPACT_NOT_SUITABLE_ZONE;

	return COMPACT_CONTINUE;
}

unsigned long compaction_suitable(struct zone *zone, int order,
					int alloc_flags, int classzone_idx)
{
	unsigned long ret;

	ret = __compaction_suitable(zone, order, alloc_flags, classzone_idx);
	trace_mm_compaction_suitable(zone, order, ret);
	if (ret == COMPACT_NOT_SUITABLE_ZONE)
		ret = COMPACT_SKIPPED;

	return ret;
}
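
/*
 * Worked example for the order-0 watermark check in __compaction_suitable()
 * above: for an order-9 request (2MB with 4KB pages), watermark is raised by
 * 2UL << 9, i.e. the zone needs 1024 free pages above the low watermark
 * before compaction is attempted, covering the temporary duplication of
 * pages while they are being migrated.
 */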

static int compact_zone(struct zone *zone, struct compact_control *cc)
{
	int ret;
	unsigned long start_pfn = zone->zone_start_pfn;
	unsigned long end_pfn = zone_end_pfn(zone);
	const int migratetype = gfpflags_to_migratetype(cc->gfp_mask);
	const bool sync = cc->mode != MIGRATE_ASYNC;
	unsigned long last_migrated_pfn = 0;

	ret = compaction_suitable(zone, cc->order, cc->alloc_flags,
							cc->classzone_idx);
	switch (ret) {
	case COMPACT_PARTIAL:
	case COMPACT_SKIPPED:
		/* Compaction is likely to fail */
		return ret;
	case COMPACT_CONTINUE:
		/* Fall through to compaction */
		;
	}

	/*
	 * Clear pageblock skip if there were failures recently and compaction
	 * is about to be retried after being deferred. kswapd does not do
	 * this reset as it'll reset the cached information when going to sleep.
	 */
	if (compaction_restarting(zone, cc->order) && !current_is_kswapd())
		__reset_isolation_suitable(zone);

	/*
	 * Setup to move all movable pages to the end of the zone. Use cached
	 * information on where the scanners should start but check that it
	 * is initialised by ensuring the values are within zone boundaries.
	 */
	cc->migrate_pfn = zone->compact_cached_migrate_pfn[sync];
	cc->free_pfn = zone->compact_cached_free_pfn;
	if (cc->free_pfn < start_pfn || cc->free_pfn > end_pfn) {
		cc->free_pfn = end_pfn & ~(pageblock_nr_pages-1);
		zone->compact_cached_free_pfn = cc->free_pfn;
	}
	if (cc->migrate_pfn < start_pfn || cc->migrate_pfn > end_pfn) {
		cc->migrate_pfn = start_pfn;
		zone->compact_cached_migrate_pfn[0] = cc->migrate_pfn;
		zone->compact_cached_migrate_pfn[1] = cc->migrate_pfn;
	}

	trace_mm_compaction_begin(start_pfn, cc->migrate_pfn,
				cc->free_pfn, end_pfn, sync);

	migrate_prep_local();
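
	/*
	 * Main compaction loop: pull up to COMPACT_CLUSTER_MAX movable pages
	 * out of a pageblock with the migrate scanner, feed them to
	 * migrate_pages() with freepages taken by the free scanner, and
	 * repeat until compact_finished() reports completion or a suitable
	 * free page.
	 */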

	while ((ret = compact_finished(zone, cc, migratetype)) ==
						COMPACT_CONTINUE) {
		int err;
		unsigned long isolate_start_pfn = cc->migrate_pfn;

		switch (isolate_migratepages(zone, cc)) {
		case ISOLATE_ABORT:
			ret = COMPACT_PARTIAL;
			putback_movable_pages(&cc->migratepages);
			cc->nr_migratepages = 0;
			goto out;
		case ISOLATE_NONE:
			/*
			 * We haven't isolated and migrated anything, but
			 * there might still be unflushed migrations from
			 * previous cc->order aligned block.
			 */
			goto check_drain;
		case ISOLATE_SUCCESS:
			;
		}

		err = migrate_pages(&cc->migratepages, compaction_alloc,
				compaction_free, (unsigned long)cc, cc->mode,
				MR_COMPACTION);

		trace_mm_compaction_migratepages(cc->nr_migratepages, err,
							&cc->migratepages);

		/* All pages were either migrated or will be released */
		cc->nr_migratepages = 0;
		if (err) {
			putback_movable_pages(&cc->migratepages);
			/*
			 * migrate_pages() may return -ENOMEM when scanners meet
			 * and we want compact_finished() to detect it
			 */
			if (err == -ENOMEM && cc->free_pfn > cc->migrate_pfn) {
				ret = COMPACT_PARTIAL;
				goto out;
			}
		}

		/*
		 * Record where we could have freed pages by migration and not
		 * yet flushed them to buddy allocator. We use the pfn that
		 * isolate_migratepages() started from in this loop iteration
		 * - this is the lowest page that could have been isolated and
		 * then freed by migration.
		 */
		if (!last_migrated_pfn)
			last_migrated_pfn = isolate_start_pfn;

check_drain:
		/*
		 * Has the migration scanner moved away from the previous
		 * cc->order aligned block where we migrated from? If yes,
		 * flush the pages that were freed, so that they can merge and
		 * compact_finished() can detect immediately if allocation
		 * would succeed.
		 */
		if (cc->order > 0 && last_migrated_pfn) {
			int cpu;
			unsigned long current_block_start =
				cc->migrate_pfn & ~((1UL << cc->order) - 1);

			if (last_migrated_pfn < current_block_start) {
				cpu = get_cpu();
				lru_add_drain_cpu(cpu);
				drain_local_pages(zone);
				put_cpu();
				/* No more flushing until we migrate again */
				last_migrated_pfn = 0;
			}
		}

	}

out:
	/*
	 * Release free pages and update where the free scanner should restart,
	 * so we don't leave any returned pages behind in the next attempt.
	 */
	if (cc->nr_freepages > 0) {
		unsigned long free_pfn = release_freepages(&cc->freepages);

		cc->nr_freepages = 0;
		VM_BUG_ON(free_pfn == 0);
		/* The cached pfn is always the first in a pageblock */
		free_pfn &= ~(pageblock_nr_pages-1);
		/*
		 * Only go back, not forward. The cached pfn might have been
		 * already reset to zone end in compact_finished()
		 */
		if (free_pfn > zone->compact_cached_free_pfn)
			zone->compact_cached_free_pfn = free_pfn;
	}

	trace_mm_compaction_end(start_pfn, cc->migrate_pfn,
				cc->free_pfn, end_pfn, sync, ret);

	return ret;
}

static unsigned long compact_zone_order(struct zone *zone, int order,
		gfp_t gfp_mask, enum migrate_mode mode, int *contended,
		int alloc_flags, int classzone_idx)
{
	unsigned long ret;
	struct compact_control cc = {
		.nr_freepages = 0,
		.nr_migratepages = 0,
		.order = order,
		.gfp_mask = gfp_mask,
		.zone = zone,
		.mode = mode,
		.alloc_flags = alloc_flags,
		.classzone_idx = classzone_idx,
	};
	INIT_LIST_HEAD(&cc.freepages);
	INIT_LIST_HEAD(&cc.migratepages);

	ret = compact_zone(zone, &cc);

	VM_BUG_ON(!list_empty(&cc.freepages));
	VM_BUG_ON(!list_empty(&cc.migratepages));

	*contended = cc.contended;
	return ret;
}

int sysctl_extfrag_threshold = 500;
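
/*
 * sysctl_extfrag_threshold is on the 0-1000 scale used by
 * fragmentation_index(): __compaction_suitable() above skips zones whose
 * index is at or below this value, i.e. zones whose allocation failures look
 * like genuine lack of memory rather than external fragmentation. Tunable
 * via /proc/sys/vm/extfrag_threshold (see sysctl_extfrag_handler() below).
 */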

/**
 * try_to_compact_pages - Direct compact to satisfy a high-order allocation
 * @gfp_mask: The GFP mask of the current allocation
 * @order: The order of the current allocation
 * @alloc_flags: The allocation flags of the current allocation
 * @ac: The context of current allocation
 * @mode: The migration mode for async, sync light, or sync migration
 * @contended: Return value that determines if compaction was aborted due to
 *	       need_resched() or lock contention
 *
 * This is the main entry point for direct page compaction.
 */
unsigned long try_to_compact_pages(gfp_t gfp_mask, unsigned int order,
			int alloc_flags, const struct alloc_context *ac,
			enum migrate_mode mode, int *contended)
{
	int may_enter_fs = gfp_mask & __GFP_FS;
	int may_perform_io = gfp_mask & __GFP_IO;
	struct zoneref *z;
	struct zone *zone;
	int rc = COMPACT_DEFERRED;
	int all_zones_contended = COMPACT_CONTENDED_LOCK; /* init for &= op */

	*contended = COMPACT_CONTENDED_NONE;

	/* Check if the GFP flags allow compaction */
	if (!order || !may_enter_fs || !may_perform_io)
		return COMPACT_SKIPPED;

	trace_mm_compaction_try_to_compact_pages(order, gfp_mask, mode);

	/* Compact each zone in the list */
	for_each_zone_zonelist_nodemask(zone, z, ac->zonelist, ac->high_zoneidx,
								ac->nodemask) {
		int status;
		int zone_contended;

		if (compaction_deferred(zone, order))
			continue;

		status = compact_zone_order(zone, order, gfp_mask, mode,
				&zone_contended, alloc_flags,
				ac->classzone_idx);
		rc = max(status, rc);
		/*
		 * It takes at least one zone that wasn't lock contended
		 * to clear all_zones_contended.
		 */
		all_zones_contended &= zone_contended;

		/* If a normal allocation would succeed, stop compacting */
		if (zone_watermark_ok(zone, order, low_wmark_pages(zone),
					ac->classzone_idx, alloc_flags)) {
			/*
			 * We think the allocation will succeed in this zone,
			 * but it is not certain, hence the false. The caller
			 * will repeat this with true if allocation indeed
			 * succeeds in this zone.
			 */
			compaction_defer_reset(zone, order, false);
			/*
			 * It is possible that async compaction aborted due to
			 * need_resched() and the watermarks were ok thanks to
			 * somebody else freeing memory. The allocation can
			 * however still fail so we better signal the
			 * need_resched() contention anyway (this will not
			 * prevent the allocation attempt).
			 */
			if (zone_contended == COMPACT_CONTENDED_SCHED)
				*contended = COMPACT_CONTENDED_SCHED;

			goto break_loop;
		}

		if (mode != MIGRATE_ASYNC && status == COMPACT_COMPLETE) {
			/*
			 * We think that allocation won't succeed in this zone
			 * so we defer compaction there. If it ends up
			 * succeeding after all, it will be reset.
			 */
			defer_compaction(zone, order);
		}

		/*
		 * We might have stopped compacting due to need_resched() in
		 * async compaction, or due to a fatal signal detected. In that
		 * case do not try further zones and signal need_resched()
		 * contention.
		 */
		if ((zone_contended == COMPACT_CONTENDED_SCHED)
					|| fatal_signal_pending(current)) {
			*contended = COMPACT_CONTENDED_SCHED;
			goto break_loop;
		}

		continue;
break_loop:
		/*
		 * We might not have tried all the zones, so be conservative
		 * and assume they are not all lock contended.
		 */
		all_zones_contended = 0;
		break;
	}

	/*
	 * If at least one zone wasn't deferred or skipped, we report if all
	 * zones that were tried were lock contended.
	 */
	if (rc > COMPACT_SKIPPED && all_zones_contended)
		*contended = COMPACT_CONTENDED_LOCK;

	return rc;
}
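
/*
 * The helpers below drive compaction outside the direct-allocation path:
 * compact_pgdat() is the kswapd-driven form (async mode, for a specific
 * order), while compact_node() is the manual form used by the
 * /proc/sys/vm/compact_memory and per-node sysfs triggers (order == -1,
 * sync mode, skip hints ignored so every pageblock is scanned).
 */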

/* Compact all zones within a node */
static void __compact_pgdat(pg_data_t *pgdat, struct compact_control *cc)
{
	int zoneid;
	struct zone *zone;

	for (zoneid = 0; zoneid < MAX_NR_ZONES; zoneid++) {

		zone = &pgdat->node_zones[zoneid];
		if (!populated_zone(zone))
			continue;

		cc->nr_freepages = 0;
		cc->nr_migratepages = 0;
		cc->zone = zone;
		INIT_LIST_HEAD(&cc->freepages);
		INIT_LIST_HEAD(&cc->migratepages);

		/*
		 * When called via /proc/sys/vm/compact_memory
		 * this makes sure we compact the whole zone regardless of
		 * cached scanner positions.
		 */
		if (cc->order == -1)
			__reset_isolation_suitable(zone);

		if (cc->order == -1 || !compaction_deferred(zone, cc->order))
			compact_zone(zone, cc);

		if (cc->order > 0) {
			if (zone_watermark_ok(zone, cc->order,
						low_wmark_pages(zone), 0, 0))
				compaction_defer_reset(zone, cc->order, false);
		}

		VM_BUG_ON(!list_empty(&cc->freepages));
		VM_BUG_ON(!list_empty(&cc->migratepages));
	}
}

void compact_pgdat(pg_data_t *pgdat, int order)
{
	struct compact_control cc = {
		.order = order,
		.mode = MIGRATE_ASYNC,
	};

	if (!order)
		return;

	__compact_pgdat(pgdat, &cc);
}

static void compact_node(int nid)
{
	struct compact_control cc = {
		.order = -1,
		.mode = MIGRATE_SYNC,
		.ignore_skip_hint = true,
	};

	__compact_pgdat(NODE_DATA(nid), &cc);
}

/* Compact all nodes in the system */
static void compact_nodes(void)
{
	int nid;

	/* Flush pending updates to the LRU lists */
	lru_add_drain_all();

	for_each_online_node(nid)
		compact_node(nid);
}

/* The written value is actually unused, all memory is compacted */
int sysctl_compact_memory;

/* This is the entry point for compacting all nodes via /proc/sys/vm */
int sysctl_compaction_handler(struct ctl_table *table, int write,
			void __user *buffer, size_t *length, loff_t *ppos)
{
	if (write)
		compact_nodes();

	return 0;
}

int sysctl_extfrag_handler(struct ctl_table *table, int write,
			void __user *buffer, size_t *length, loff_t *ppos)
{
	proc_dointvec_minmax(table, write, buffer, length, ppos);

	return 0;
}

#if defined(CONFIG_SYSFS) && defined(CONFIG_NUMA)
static ssize_t sysfs_compact_node(struct device *dev,
			struct device_attribute *attr,
			const char *buf, size_t count)
{
	int nid = dev->id;

	if (nid >= 0 && nid < nr_node_ids && node_online(nid)) {
		/* Flush pending updates to the LRU lists */
		lru_add_drain_all();

		compact_node(nid);
	}

	return count;
}
static DEVICE_ATTR(compact, S_IWUSR, NULL, sysfs_compact_node);

int compaction_register_node(struct node *node)
{
	return device_create_file(&node->dev, &dev_attr_compact);
}

void compaction_unregister_node(struct node *node)
{
	return device_remove_file(&node->dev, &dev_attr_compact);
}
#endif /* CONFIG_SYSFS && CONFIG_NUMA */

#endif /* CONFIG_COMPACTION */