This source file includes the following definitions.
- kvm_available_flush_tlb_with_range
- kvm_flush_remote_tlbs_with_range
- kvm_flush_remote_tlbs_with_address
- kvm_mmu_set_mmio_spte_mask
- is_mmio_spte
- sp_ad_disabled
- kvm_vcpu_ad_need_write_protect
- spte_ad_enabled
- spte_ad_need_write_protect
- is_nx_huge_page_enabled
- spte_shadow_accessed_mask
- spte_shadow_dirty_mask
- is_access_track_spte
- generation_mmio_spte_mask
- get_mmio_spte_generation
- mark_mmio_spte
- get_mmio_spte_gfn
- get_mmio_spte_access
- set_mmio_spte
- check_mmio_spte
- kvm_mmu_set_mask_ptes
- kvm_get_shadow_phys_bits
- kvm_mmu_reset_all_pte_masks
- is_cpuid_PSE36
- is_nx
- is_shadow_present_pte
- is_large_pte
- is_last_spte
- is_executable_pte
- spte_to_pfn
- pse36_gfn_delta
- __set_spte
- __update_clear_spte_fast
- __update_clear_spte_slow
- __get_spte_lockless
- count_spte_clear
- __set_spte
- __update_clear_spte_fast
- __update_clear_spte_slow
- __get_spte_lockless
- spte_can_locklessly_be_made_writable
- spte_has_volatile_bits
- is_accessed_spte
- is_dirty_spte
- mmu_spte_set
- mmu_spte_update_no_track
- mmu_spte_update
- mmu_spte_clear_track_bits
- mmu_spte_clear_no_track
- mmu_spte_get_lockless
- mark_spte_for_access_track
- restore_acc_track_spte
- mmu_spte_age
- walk_shadow_page_lockless_begin
- walk_shadow_page_lockless_end
- mmu_topup_memory_cache
- mmu_memory_cache_free_objects
- mmu_free_memory_cache
- mmu_topup_memory_cache_page
- mmu_free_memory_cache_page
- mmu_topup_memory_caches
- mmu_free_memory_caches
- mmu_memory_cache_alloc
- mmu_alloc_pte_list_desc
- mmu_free_pte_list_desc
- kvm_mmu_page_get_gfn
- kvm_mmu_page_set_gfn
- lpage_info_slot
- update_gfn_disallow_lpage_count
- kvm_mmu_gfn_disallow_lpage
- kvm_mmu_gfn_allow_lpage
- account_shadowed
- account_huge_nx_page
- unaccount_shadowed
- unaccount_huge_nx_page
- __mmu_gfn_lpage_is_disallowed
- mmu_gfn_lpage_is_disallowed
- host_mapping_level
- memslot_valid_for_gpte
- gfn_to_memslot_dirty_bitmap
- mapping_level
- pte_list_add
- pte_list_desc_remove_entry
- __pte_list_remove
- pte_list_remove
- __gfn_to_rmap
- gfn_to_rmap
- rmap_can_add
- rmap_add
- rmap_remove
- rmap_get_first
- rmap_get_next
- drop_spte
- __drop_large_spte
- drop_large_spte
- spte_write_protect
- __rmap_write_protect
- spte_clear_dirty
- spte_wrprot_for_clear_dirty
- __rmap_clear_dirty
- spte_set_dirty
- __rmap_set_dirty
- kvm_mmu_write_protect_pt_masked
- kvm_mmu_clear_dirty_pt_masked
- kvm_arch_mmu_enable_log_dirty_pt_masked
- kvm_arch_write_log_dirty
- kvm_mmu_slot_gfn_write_protect
- rmap_write_protect
- kvm_zap_rmapp
- kvm_unmap_rmapp
- kvm_set_pte_rmapp
- rmap_walk_init_level
- slot_rmap_walk_init
- slot_rmap_walk_okay
- slot_rmap_walk_next
- kvm_handle_hva_range
- kvm_handle_hva
- kvm_unmap_hva_range
- kvm_set_spte_hva
- kvm_age_rmapp
- kvm_test_age_rmapp
- rmap_recycle
- kvm_age_hva
- kvm_test_age_hva
- is_empty_shadow_page
- kvm_mod_used_mmu_pages
- kvm_mmu_free_page
- kvm_page_table_hashfn
- mmu_page_add_parent_pte
- mmu_page_remove_parent_pte
- drop_parent_pte
- kvm_mmu_alloc_page
- kvm_mmu_mark_parents_unsync
- mark_unsync
- nonpaging_sync_page
- nonpaging_invlpg
- nonpaging_update_pte
- mmu_pages_add
- clear_unsync_child_bit
- __mmu_unsync_walk
- mmu_unsync_walk
- kvm_unlink_unsync_page
- is_ept_sp
- __kvm_sync_page
- kvm_mmu_remote_flush_or_zap
- kvm_mmu_flush_or_zap
- kvm_mmu_audit
- mmu_audit_disable
- is_obsolete_sp
- kvm_sync_page
- kvm_sync_pages
- mmu_pages_next
- mmu_pages_first
- mmu_pages_clear_parents
- mmu_sync_children
- __clear_sp_write_flooding_count
- clear_sp_write_flooding_count
- kvm_mmu_get_page
- shadow_walk_init_using_root
- shadow_walk_init
- shadow_walk_okay
- __shadow_walk_next
- shadow_walk_next
- link_shadow_page
- validate_direct_spte
- mmu_page_zap_pte
- kvm_mmu_page_unlink_children
- kvm_mmu_unlink_parents
- mmu_zap_unsync_children
- __kvm_mmu_prepare_zap_page
- kvm_mmu_prepare_zap_page
- kvm_mmu_commit_zap_page
- prepare_zap_oldest_mmu_page
- kvm_mmu_change_mmu_pages
- kvm_mmu_unprotect_page
- kvm_unsync_page
- mmu_need_write_protect
- kvm_is_mmio_pfn
- set_spte
- mmu_set_spte
- pte_prefetch_gfn_to_pfn
- direct_pte_prefetch_many
- __direct_pte_prefetch
- direct_pte_prefetch
- disallowed_hugepage_adjust
- __direct_map
- kvm_send_hwpoison_signal
- kvm_handle_bad_page
- transparent_hugepage_adjust
- handle_abnormal_pfn
- page_fault_can_be_fast
- fast_pf_fix_direct_spte
- is_access_allowed
- fast_page_fault
- nonpaging_map
- mmu_free_root_page
- kvm_mmu_free_roots
- mmu_check_root
- mmu_alloc_direct_roots
- mmu_alloc_shadow_roots
- mmu_alloc_roots
- kvm_mmu_sync_roots
- nonpaging_gva_to_gpa
- nonpaging_gva_to_gpa_nested
- __is_rsvd_bits_set
- is_rsvd_bits_set
- is_shadow_zero_bits_set
- mmio_info_in_cache
- walk_shadow_page_get_mmio_spte
- handle_mmio_page_fault
- page_fault_handle_page_track
- shadow_page_table_clear_flood
- nonpaging_page_fault
- kvm_arch_setup_async_pf
- try_async_pf
- kvm_handle_page_fault
- check_hugepage_cache_consistency
- tdp_page_fault
- nonpaging_init_context
- cached_root_available
- fast_cr3_switch
- __kvm_mmu_new_cr3
- kvm_mmu_new_cr3
- get_cr3
- inject_page_fault
- sync_mmio_spte
- is_last_gpte
- __reset_rsvds_bits_mask
- reset_rsvds_bits_mask
- __reset_rsvds_bits_mask_ept
- reset_rsvds_bits_mask_ept
- reset_shadow_zero_bits_mask
- boot_cpu_is_amd
- reset_tdp_shadow_zero_bits_mask
- reset_ept_shadow_zero_bits_mask
- update_permission_bitmask
- update_pkru_bitmask
- update_last_nonleaf_level
- paging64_init_context_common
- paging64_init_context
- paging32_init_context
- paging32E_init_context
- kvm_calc_mmu_role_ext
- kvm_calc_mmu_role_common
- kvm_calc_tdp_mmu_root_page_role
- init_kvm_tdp_mmu
- kvm_calc_shadow_mmu_root_page_role
- kvm_init_shadow_mmu
- kvm_calc_shadow_ept_root_page_role
- kvm_init_shadow_ept_mmu
- init_kvm_softmmu
- init_kvm_nested_mmu
- kvm_init_mmu
- kvm_mmu_calc_root_page_role
- kvm_mmu_reset_context
- kvm_mmu_load
- kvm_mmu_unload
- mmu_pte_write_new_pte
- need_remote_flush
- mmu_pte_write_fetch_gpte
- detect_write_flooding
- detect_write_misaligned
- get_written_sptes
- kvm_mmu_pte_write
- kvm_mmu_unprotect_page_virt
- make_mmu_pages_available
- kvm_mmu_page_fault
- kvm_mmu_invlpg
- kvm_mmu_invpcid_gva
- kvm_enable_tdp
- kvm_disable_tdp
- slot_handle_level_range
- slot_handle_level
- slot_handle_all_level
- slot_handle_large_level
- slot_handle_leaf
- free_mmu_pages
- alloc_mmu_pages
- kvm_mmu_create
- kvm_zap_obsolete_pages
- kvm_mmu_zap_all_fast
- kvm_has_zapped_obsolete_pages
- kvm_mmu_invalidate_zap_pages_in_memslot
- kvm_mmu_init_vm
- kvm_mmu_uninit_vm
- kvm_zap_gfn_range
- slot_rmap_write_protect
- kvm_mmu_slot_remove_write_access
- kvm_mmu_zap_collapsible_spte
- kvm_mmu_zap_collapsible_sptes
- kvm_mmu_slot_leaf_clear_dirty
- kvm_mmu_slot_largepage_remove_write_access
- kvm_mmu_slot_set_dirty
- kvm_mmu_zap_all
- kvm_mmu_invalidate_mmio_sptes
- mmu_shrink_scan
- mmu_shrink_count
- mmu_destroy_caches
- kvm_set_mmio_spte_mask
- get_nx_auto_mode
- __set_nx_huge_pages
- set_nx_huge_pages
- kvm_mmu_module_init
- kvm_mmu_calculate_default_mmu_pages
- kvm_mmu_destroy
- kvm_mmu_module_exit
- set_nx_huge_pages_recovery_ratio
- kvm_recover_nx_lpages
- get_nx_lpage_recovery_timeout
- kvm_nx_lpage_recovery_worker
- kvm_mmu_post_init_vm
- kvm_mmu_pre_destroy_vm
   1 // SPDX-License-Identifier: GPL-2.0-only
   2 /*
   3  * Kernel-based Virtual Machine driver for Linux
   4  *
   5  * This module enables machines with Intel VT-x extensions to run virtual
   6  * machines without emulation or binary translation.
   7  *
   8  * MMU support
   9  *
  10  * Copyright (C) 2006 Qumranet, Inc.
  11  * Copyright 2010 Red Hat, Inc. and/or its affiliates.
  12  *
  13  * Authors:
  14  *   Yaniv Kamay  <yaniv@qumranet.com>
  15  *   Avi Kivity   <avi@qumranet.com>
  16  */
  17 
  18 #include "irq.h"
  19 #include "mmu.h"
  20 #include "x86.h"
  21 #include "kvm_cache_regs.h"
  22 #include "cpuid.h"
  23 
  24 #include <linux/kvm_host.h>
  25 #include <linux/types.h>
  26 #include <linux/string.h>
  27 #include <linux/mm.h>
  28 #include <linux/highmem.h>
  29 #include <linux/moduleparam.h>
  30 #include <linux/export.h>
  31 #include <linux/swap.h>
  32 #include <linux/hugetlb.h>
  33 #include <linux/compiler.h>
  34 #include <linux/srcu.h>
  35 #include <linux/slab.h>
  36 #include <linux/sched/signal.h>
  37 #include <linux/uaccess.h>
  38 #include <linux/hash.h>
  39 #include <linux/kern_levels.h>
  40 #include <linux/kthread.h>
  41 
  42 #include <asm/page.h>
  43 #include <asm/pat.h>
  44 #include <asm/cmpxchg.h>
  45 #include <asm/e820/api.h>
  46 #include <asm/io.h>
  47 #include <asm/vmx.h>
  48 #include <asm/kvm_page_track.h>
  49 #include "trace.h"
  50 
  51 extern bool itlb_multihit_kvm_mitigation;
  52 
  53 static int __read_mostly nx_huge_pages = -1;
  54 #ifdef CONFIG_PREEMPT_RT
  55 /* Recovery can cause latency spikes, disable it for PREEMPT_RT */
  56 static uint __read_mostly nx_huge_pages_recovery_ratio = 0;
  57 #else
  58 static uint __read_mostly nx_huge_pages_recovery_ratio = 60;
  59 #endif
  60 
  61 static int set_nx_huge_pages(const char *val, const struct kernel_param *kp);
  62 static int set_nx_huge_pages_recovery_ratio(const char *val, const struct kernel_param *kp);
  63 
  64 static struct kernel_param_ops nx_huge_pages_ops = {
  65         .set = set_nx_huge_pages,
  66         .get = param_get_bool,
  67 };
  68 
  69 static struct kernel_param_ops nx_huge_pages_recovery_ratio_ops = {
  70         .set = set_nx_huge_pages_recovery_ratio,
  71         .get = param_get_uint,
  72 };
  73 
  74 module_param_cb(nx_huge_pages, &nx_huge_pages_ops, &nx_huge_pages, 0644);
  75 __MODULE_PARM_TYPE(nx_huge_pages, "bool");
  76 module_param_cb(nx_huge_pages_recovery_ratio, &nx_huge_pages_recovery_ratio_ops,
  77                 &nx_huge_pages_recovery_ratio, 0644);
  78 __MODULE_PARM_TYPE(nx_huge_pages_recovery_ratio, "uint");
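/*
 * Usage note (added commentary, not part of the original source): both knobs
 * above are ordinary module parameters with 0644 permissions, so they can be
 * set at module load time or changed at runtime through sysfs, e.g.:
 *
 *	modprobe kvm nx_huge_pages=off
 *	echo 60 > /sys/module/kvm/parameters/nx_huge_pages_recovery_ratio
 */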
  79 
  80 /*
  81  * When setting this variable to true it enables Two-Dimensional-Paging
  82  * where the hardware walks 2 page tables:
  83  * 1. the guest-virtual to guest-physical
  84  * 2. while doing 1. it walks guest-physical to host-physical
  85  * If the hardware supports that we don't need to do shadow paging.
  86  */
  87 bool tdp_enabled = false;
  88 
  89 enum {
  90         AUDIT_PRE_PAGE_FAULT,
  91         AUDIT_POST_PAGE_FAULT,
  92         AUDIT_PRE_PTE_WRITE,
  93         AUDIT_POST_PTE_WRITE,
  94         AUDIT_PRE_SYNC,
  95         AUDIT_POST_SYNC
  96 };
  97 
  98 #undef MMU_DEBUG
  99 
 100 #ifdef MMU_DEBUG
 101 static bool dbg = 0;
 102 module_param(dbg, bool, 0644);
 103 
 104 #define pgprintk(x...) do { if (dbg) printk(x); } while (0)
 105 #define rmap_printk(x...) do { if (dbg) printk(x); } while (0)
 106 #define MMU_WARN_ON(x) WARN_ON(x)
 107 #else
 108 #define pgprintk(x...) do { } while (0)
 109 #define rmap_printk(x...) do { } while (0)
 110 #define MMU_WARN_ON(x) do { } while (0)
 111 #endif
 112 
 113 #define PTE_PREFETCH_NUM                8
 114 
 115 #define PT_FIRST_AVAIL_BITS_SHIFT 10
 116 #define PT64_SECOND_AVAIL_BITS_SHIFT 54
 117 
 118 /*
 119  * The mask used to denote special SPTEs, which can be either MMIO SPTEs or
 120  * Access Tracking SPTEs.
 121  */
 122 #define SPTE_SPECIAL_MASK (3ULL << 52)
 123 #define SPTE_AD_ENABLED_MASK (0ULL << 52)
 124 #define SPTE_AD_DISABLED_MASK (1ULL << 52)
 125 #define SPTE_AD_WRPROT_ONLY_MASK (2ULL << 52)
 126 #define SPTE_MMIO_MASK (3ULL << 52)
 127 
 128 #define PT64_LEVEL_BITS 9
 129 
 130 #define PT64_LEVEL_SHIFT(level) \
 131                 (PAGE_SHIFT + (level - 1) * PT64_LEVEL_BITS)
 132 
 133 #define PT64_INDEX(address, level)\
 134         (((address) >> PT64_LEVEL_SHIFT(level)) & ((1 << PT64_LEVEL_BITS) - 1))
 135 
 136 
 137 #define PT32_LEVEL_BITS 10
 138 
 139 #define PT32_LEVEL_SHIFT(level) \
 140                 (PAGE_SHIFT + (level - 1) * PT32_LEVEL_BITS)
 141 
 142 #define PT32_LVL_OFFSET_MASK(level) \
 143         (PT32_BASE_ADDR_MASK & ((1ULL << (PAGE_SHIFT + (((level) - 1) \
 144                                                 * PT32_LEVEL_BITS))) - 1))
 145 
 146 #define PT32_INDEX(address, level)\
 147         (((address) >> PT32_LEVEL_SHIFT(level)) & ((1 << PT32_LEVEL_BITS) - 1))
 148 
 149 
 150 #ifdef CONFIG_DYNAMIC_PHYSICAL_MASK
 151 #define PT64_BASE_ADDR_MASK (physical_mask & ~(u64)(PAGE_SIZE-1))
 152 #else
 153 #define PT64_BASE_ADDR_MASK (((1ULL << 52) - 1) & ~(u64)(PAGE_SIZE-1))
 154 #endif
 155 #define PT64_LVL_ADDR_MASK(level) \
 156         (PT64_BASE_ADDR_MASK & ~((1ULL << (PAGE_SHIFT + (((level) - 1) \
 157                                                 * PT64_LEVEL_BITS))) - 1))
 158 #define PT64_LVL_OFFSET_MASK(level) \
 159         (PT64_BASE_ADDR_MASK & ((1ULL << (PAGE_SHIFT + (((level) - 1) \
 160                                                 * PT64_LEVEL_BITS))) - 1))
 161 
 162 #define PT32_BASE_ADDR_MASK PAGE_MASK
 163 #define PT32_DIR_BASE_ADDR_MASK \
 164         (PAGE_MASK & ~((1ULL << (PAGE_SHIFT + PT32_LEVEL_BITS)) - 1))
 165 #define PT32_LVL_ADDR_MASK(level) \
 166         (PAGE_MASK & ~((1ULL << (PAGE_SHIFT + (((level) - 1) \
 167                                             * PT32_LEVEL_BITS))) - 1))
 168 
 169 #define PT64_PERM_MASK (PT_PRESENT_MASK | PT_WRITABLE_MASK | shadow_user_mask \
 170                         | shadow_x_mask | shadow_nx_mask | shadow_me_mask)
 171 
 172 #define ACC_EXEC_MASK    1
 173 #define ACC_WRITE_MASK   PT_WRITABLE_MASK
 174 #define ACC_USER_MASK    PT_USER_MASK
 175 #define ACC_ALL          (ACC_EXEC_MASK | ACC_WRITE_MASK | ACC_USER_MASK)
 176 
 177 /* The mask for the R/X bits in EPT PTEs */
 178 #define PT64_EPT_READABLE_MASK                  0x1ull
 179 #define PT64_EPT_EXECUTABLE_MASK                0x4ull
 180 
 181 #include <trace/events/kvm.h>
 182 
 183 #define SPTE_HOST_WRITEABLE     (1ULL << PT_FIRST_AVAIL_BITS_SHIFT)
 184 #define SPTE_MMU_WRITEABLE      (1ULL << (PT_FIRST_AVAIL_BITS_SHIFT + 1))
 185 
 186 #define SHADOW_PT_INDEX(addr, level) PT64_INDEX(addr, level)
 187 
 188 /* make pte_list_desc fit well in cache line */
 189 #define PTE_LIST_EXT 3
 190 
 191 /*
 192  * Return values of handle_mmio_page_fault and mmu.page_fault:
 193  * RET_PF_RETRY: let CPU fault again on the address.
 194  * RET_PF_EMULATE: mmio page fault, emulate the instruction directly.
 195  *
 196  * For handle_mmio_page_fault only:
 197  * RET_PF_INVALID: the spte is invalid, let the real page fault path update it.
 198  */
 199 enum {
 200         RET_PF_RETRY = 0,
 201         RET_PF_EMULATE = 1,
 202         RET_PF_INVALID = 2,
 203 };
 204 
 205 struct pte_list_desc {
 206         u64 *sptes[PTE_LIST_EXT];
 207         struct pte_list_desc *more;
 208 };
 209 
 210 struct kvm_shadow_walk_iterator {
 211         u64 addr;
 212         hpa_t shadow_addr;
 213         u64 *sptep;
 214         int level;
 215         unsigned index;
 216 };
 217 
 218 static const union kvm_mmu_page_role mmu_base_role_mask = {
 219         .cr0_wp = 1,
 220         .gpte_is_8_bytes = 1,
 221         .nxe = 1,
 222         .smep_andnot_wp = 1,
 223         .smap_andnot_wp = 1,
 224         .smm = 1,
 225         .guest_mode = 1,
 226         .ad_disabled = 1,
 227 };
 228 
 229 #define for_each_shadow_entry_using_root(_vcpu, _root, _addr, _walker)     \
 230         for (shadow_walk_init_using_root(&(_walker), (_vcpu),              \
 231                                          (_root), (_addr));                \
 232              shadow_walk_okay(&(_walker));                                 \
 233              shadow_walk_next(&(_walker)))
 234 
 235 #define for_each_shadow_entry(_vcpu, _addr, _walker)            \
 236         for (shadow_walk_init(&(_walker), _vcpu, _addr);        \
 237              shadow_walk_okay(&(_walker));                      \
 238              shadow_walk_next(&(_walker)))
 239 
 240 #define for_each_shadow_entry_lockless(_vcpu, _addr, _walker, spte)     \
 241         for (shadow_walk_init(&(_walker), _vcpu, _addr);                \
 242              shadow_walk_okay(&(_walker)) &&                            \
 243                 ({ spte = mmu_spte_get_lockless(_walker.sptep); 1; });  \
 244              __shadow_walk_next(&(_walker), spte))
 245 
 246 static struct kmem_cache *pte_list_desc_cache;
 247 static struct kmem_cache *mmu_page_header_cache;
 248 static struct percpu_counter kvm_total_used_mmu_pages;
 249 
 250 static u64 __read_mostly shadow_nx_mask;
 251 static u64 __read_mostly shadow_x_mask; 
 252 static u64 __read_mostly shadow_user_mask;
 253 static u64 __read_mostly shadow_accessed_mask;
 254 static u64 __read_mostly shadow_dirty_mask;
 255 static u64 __read_mostly shadow_mmio_mask;
 256 static u64 __read_mostly shadow_mmio_value;
 257 static u64 __read_mostly shadow_mmio_access_mask;
 258 static u64 __read_mostly shadow_present_mask;
 259 static u64 __read_mostly shadow_me_mask;
 260 
 261 /*
 262  * SPTEs used by MMUs without A/D bits are marked with SPTE_AD_DISABLED_MASK;
 263  * shadow_acc_track_mask is the set of bits to be cleared in non-accessed
 264  * pages.
 265  */
 266 static u64 __read_mostly shadow_acc_track_mask;
 267 
 268 /*
 269  * The mask/shift to use for saving the original R/X bits when marking the PTE
 270  * as not-present for access tracking purposes. We do not save the W bit as the
 271  * PTEs being access tracked also need to be dirty tracked, so the W bit will be
 272  * restored only when a write is attempted to the page.
 273  */
 274 static const u64 shadow_acc_track_saved_bits_mask = PT64_EPT_READABLE_MASK |
 275                                                     PT64_EPT_EXECUTABLE_MASK;
 276 static const u64 shadow_acc_track_saved_bits_shift = PT64_SECOND_AVAIL_BITS_SHIFT;
 277 
 278 /*
 279  * This mask must be set on all non-zero Non-Present or Reserved SPTEs in order
 280  * to guard against L1TF attacks.
 281  */
 282 static u64 __read_mostly shadow_nonpresent_or_rsvd_mask;
 283 
 284 /*
 285  * The number of high-order 1 bits to use in the mask above.
 286  */
 287 static const u64 shadow_nonpresent_or_rsvd_mask_len = 5;
 288 
 289 /*
 290  * In some cases the GFN of a non-present or reserved SPTE must be preserved
 291  * even though the upper bits of the physical address space are usurped to
 292  * defend against L1TF, e.g. for MMIO SPTEs.  The bits of the GFN that
 293  * overlap shadow_nonpresent_or_rsvd_mask are shifted left into the reserved
 294  * bits, so the GFN is split in two; this mask covers the part of the GFN
 295  * that stays in its usual position.
 296  */
 297 static u64 __read_mostly shadow_nonpresent_or_rsvd_lower_gfn_mask;
 298 
 299 /*
 300  * The number of non-reserved physical address bits irrespective of features
 301  * that repurpose legal bits, e.g. MKTME.
 302  */
 303 static u8 __read_mostly shadow_phys_bits;
 304 
 305 static void mmu_spte_set(u64 *sptep, u64 spte);
 306 static bool is_executable_pte(u64 spte);
 307 static union kvm_mmu_page_role
 308 kvm_mmu_calc_root_page_role(struct kvm_vcpu *vcpu);
 309 
 310 #define CREATE_TRACE_POINTS
 311 #include "mmutrace.h"
 312 
 313 
 314 static inline bool kvm_available_flush_tlb_with_range(void)
 315 {
 316         return kvm_x86_ops->tlb_remote_flush_with_range;
 317 }
 318 
 319 static void kvm_flush_remote_tlbs_with_range(struct kvm *kvm,
 320                 struct kvm_tlb_range *range)
 321 {
 322         int ret = -ENOTSUPP;
 323 
 324         if (range && kvm_x86_ops->tlb_remote_flush_with_range)
 325                 ret = kvm_x86_ops->tlb_remote_flush_with_range(kvm, range);
 326 
 327         if (ret)
 328                 kvm_flush_remote_tlbs(kvm);
 329 }
 330 
 331 static void kvm_flush_remote_tlbs_with_address(struct kvm *kvm,
 332                 u64 start_gfn, u64 pages)
 333 {
 334         struct kvm_tlb_range range;
 335 
 336         range.start_gfn = start_gfn;
 337         range.pages = pages;
 338 
 339         kvm_flush_remote_tlbs_with_range(kvm, &range);
 340 }
 341 
 342 void kvm_mmu_set_mmio_spte_mask(u64 mmio_mask, u64 mmio_value, u64 access_mask)
 343 {
 344         BUG_ON((u64)(unsigned)access_mask != access_mask);
 345         BUG_ON((mmio_mask & mmio_value) != mmio_value);
 346         WARN_ON(mmio_value & (shadow_nonpresent_or_rsvd_mask << shadow_nonpresent_or_rsvd_mask_len));
 347         WARN_ON(mmio_value & shadow_nonpresent_or_rsvd_lower_gfn_mask);
 348         shadow_mmio_value = mmio_value | SPTE_MMIO_MASK;
 349         shadow_mmio_mask = mmio_mask | SPTE_SPECIAL_MASK;
 350         shadow_mmio_access_mask = access_mask;
 351 }
 352 EXPORT_SYMBOL_GPL(kvm_mmu_set_mmio_spte_mask);
 353 
 354 static bool is_mmio_spte(u64 spte)
 355 {
 356         return (spte & shadow_mmio_mask) == shadow_mmio_value;
 357 }
 358 
 359 static inline bool sp_ad_disabled(struct kvm_mmu_page *sp)
 360 {
 361         return sp->role.ad_disabled;
 362 }
 363 
 364 static inline bool kvm_vcpu_ad_need_write_protect(struct kvm_vcpu *vcpu)
 365 {
 366         /*
 367          * When using the EPT page-modification log, the GPAs in the log
 368          * would come from L2 rather than L1.  Therefore, we need to rely
 369          * on write protection to record dirty pages.  This also bypasses
 370          * PML, since writes now result in a vmexit.
 371          */
 372         return vcpu->arch.mmu == &vcpu->arch.guest_mmu;
 373 }
 374 
 375 static inline bool spte_ad_enabled(u64 spte)
 376 {
 377         MMU_WARN_ON(is_mmio_spte(spte));
 378         return (spte & SPTE_SPECIAL_MASK) != SPTE_AD_DISABLED_MASK;
 379 }
 380 
 381 static inline bool spte_ad_need_write_protect(u64 spte)
 382 {
 383         MMU_WARN_ON(is_mmio_spte(spte));
 384         return (spte & SPTE_SPECIAL_MASK) != SPTE_AD_ENABLED_MASK;
 385 }
 386 
 387 static bool is_nx_huge_page_enabled(void)
 388 {
 389         return READ_ONCE(nx_huge_pages);
 390 }
 391 
 392 static inline u64 spte_shadow_accessed_mask(u64 spte)
 393 {
 394         MMU_WARN_ON(is_mmio_spte(spte));
 395         return spte_ad_enabled(spte) ? shadow_accessed_mask : 0;
 396 }
 397 
 398 static inline u64 spte_shadow_dirty_mask(u64 spte)
 399 {
 400         MMU_WARN_ON(is_mmio_spte(spte));
 401         return spte_ad_enabled(spte) ? shadow_dirty_mask : 0;
 402 }
 403 
 404 static inline bool is_access_track_spte(u64 spte)
 405 {
 406         return !spte_ad_enabled(spte) && (spte & shadow_acc_track_mask) == 0;
 407 }
 408 
 409 /*
 410  * Due to limited space in PTEs, the MMIO generation is an 18 bit subset of
 411  * the memslots generation: it is stored in the spte's software-available
 412  * bits, in the ranges covered by MMIO_SPTE_GEN_LOW_MASK (bits 3-11) and
 413  * MMIO_SPTE_GEN_HIGH_MASK (bits 54-62) defined below.  Those ranges are
 414  * disjoint from SPTE_SPECIAL_MASK, as the BUILD_BUG_ON() below checks.
 415  *
 416  * The KVM_MEMSLOT_GEN_UPDATE_IN_PROGRESS flag is intentionally not included
 417  * in the MMIO generation number, as doing so would require stealing a bit
 418  * from the "real" generation number and thus effectively halve the maximum
 419  * number of MMIO generations that can be handled before encountering a wrap
 420  * (which requires a full MMU zap).  The flag is instead explicitly queried
 421  * when checking for MMIO spte cache hits.
 422  */
 423 #define MMIO_SPTE_GEN_MASK              GENMASK_ULL(17, 0)
 424 
 425 #define MMIO_SPTE_GEN_LOW_START         3
 426 #define MMIO_SPTE_GEN_LOW_END           11
 427 #define MMIO_SPTE_GEN_LOW_MASK          GENMASK_ULL(MMIO_SPTE_GEN_LOW_END, \
 428                                                     MMIO_SPTE_GEN_LOW_START)
 429 
 430 #define MMIO_SPTE_GEN_HIGH_START        PT64_SECOND_AVAIL_BITS_SHIFT
 431 #define MMIO_SPTE_GEN_HIGH_END          62
 432 #define MMIO_SPTE_GEN_HIGH_MASK         GENMASK_ULL(MMIO_SPTE_GEN_HIGH_END, \
 433                                                     MMIO_SPTE_GEN_HIGH_START)
 434 
 435 static u64 generation_mmio_spte_mask(u64 gen)
 436 {
 437         u64 mask;
 438 
 439         WARN_ON(gen & ~MMIO_SPTE_GEN_MASK);
 440         BUILD_BUG_ON((MMIO_SPTE_GEN_HIGH_MASK | MMIO_SPTE_GEN_LOW_MASK) & SPTE_SPECIAL_MASK);
 441 
 442         mask = (gen << MMIO_SPTE_GEN_LOW_START) & MMIO_SPTE_GEN_LOW_MASK;
 443         mask |= (gen << MMIO_SPTE_GEN_HIGH_START) & MMIO_SPTE_GEN_HIGH_MASK;
 444         return mask;
 445 }
 446 
 447 static u64 get_mmio_spte_generation(u64 spte)
 448 {
 449         u64 gen;
 450 
 451         gen = (spte & MMIO_SPTE_GEN_LOW_MASK) >> MMIO_SPTE_GEN_LOW_START;
 452         gen |= (spte & MMIO_SPTE_GEN_HIGH_MASK) >> MMIO_SPTE_GEN_HIGH_START;
 453         return gen;
 454 }
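/*
 * Illustrative sketch (added commentary, not part of the original source):
 * with a small generation value the packing performed by the two helpers
 * above is easy to see, e.g.:
 *
 *	u64 spte = generation_mmio_spte_mask(0x1ff);
 *		spte == (GENMASK_ULL(11, 3) | GENMASK_ULL(62, 54))
 *	get_mmio_spte_generation(spte) == 0x1ff
 */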
 455 
 456 static void mark_mmio_spte(struct kvm_vcpu *vcpu, u64 *sptep, u64 gfn,
 457                            unsigned access)
 458 {
 459         u64 gen = kvm_vcpu_memslots(vcpu)->generation & MMIO_SPTE_GEN_MASK;
 460         u64 mask = generation_mmio_spte_mask(gen);
 461         u64 gpa = gfn << PAGE_SHIFT;
 462 
 463         access &= shadow_mmio_access_mask;
 464         mask |= shadow_mmio_value | access;
 465         mask |= gpa | shadow_nonpresent_or_rsvd_mask;
 466         mask |= (gpa & shadow_nonpresent_or_rsvd_mask)
 467                 << shadow_nonpresent_or_rsvd_mask_len;
 468 
 469         trace_mark_mmio_spte(sptep, gfn, access, gen);
 470         mmu_spte_set(sptep, mask);
 471 }
 472 
 473 static gfn_t get_mmio_spte_gfn(u64 spte)
 474 {
 475         u64 gpa = spte & shadow_nonpresent_or_rsvd_lower_gfn_mask;
 476 
 477         gpa |= (spte >> shadow_nonpresent_or_rsvd_mask_len)
 478                & shadow_nonpresent_or_rsvd_mask;
 479 
 480         return gpa >> PAGE_SHIFT;
 481 }
 482 
 483 static unsigned get_mmio_spte_access(u64 spte)
 484 {
 485         return spte & shadow_mmio_access_mask;
 486 }
 487 
 488 static bool set_mmio_spte(struct kvm_vcpu *vcpu, u64 *sptep, gfn_t gfn,
 489                           kvm_pfn_t pfn, unsigned access)
 490 {
 491         if (unlikely(is_noslot_pfn(pfn))) {
 492                 mark_mmio_spte(vcpu, sptep, gfn, access);
 493                 return true;
 494         }
 495 
 496         return false;
 497 }
 498 
 499 static bool check_mmio_spte(struct kvm_vcpu *vcpu, u64 spte)
 500 {
 501         u64 kvm_gen, spte_gen, gen;
 502 
 503         gen = kvm_vcpu_memslots(vcpu)->generation;
 504         if (unlikely(gen & KVM_MEMSLOT_GEN_UPDATE_IN_PROGRESS))
 505                 return false;
 506 
 507         kvm_gen = gen & MMIO_SPTE_GEN_MASK;
 508         spte_gen = get_mmio_spte_generation(spte);
 509 
 510         trace_check_mmio_spte(spte, kvm_gen, spte_gen);
 511         return likely(kvm_gen == spte_gen);
 512 }
 513 
 514 /*
 515  * Sets the shadow PTE masks used by the MMU.
 516  *
 517  * Assumptions:
 518  *  - Setting either @accessed_mask or @dirty_mask requires setting both
 519  *  - At least one of @accessed_mask or @acc_track_mask must be set
 520  */
 521 void kvm_mmu_set_mask_ptes(u64 user_mask, u64 accessed_mask,
 522                 u64 dirty_mask, u64 nx_mask, u64 x_mask, u64 p_mask,
 523                 u64 acc_track_mask, u64 me_mask)
 524 {
 525         BUG_ON(!dirty_mask != !accessed_mask);
 526         BUG_ON(!accessed_mask && !acc_track_mask);
 527         BUG_ON(acc_track_mask & SPTE_SPECIAL_MASK);
 528 
 529         shadow_user_mask = user_mask;
 530         shadow_accessed_mask = accessed_mask;
 531         shadow_dirty_mask = dirty_mask;
 532         shadow_nx_mask = nx_mask;
 533         shadow_x_mask = x_mask;
 534         shadow_present_mask = p_mask;
 535         shadow_acc_track_mask = acc_track_mask;
 536         shadow_me_mask = me_mask;
 537 }
 538 EXPORT_SYMBOL_GPL(kvm_mmu_set_mask_ptes);
 539 
 540 static u8 kvm_get_shadow_phys_bits(void)
 541 {
 542         /*
 543          * boot_cpu_data.x86_phys_bits is reduced when MKTME or SME are
 544          * detected in CPU detection code, but the processor treats those
 545          * reduced bits as 'keyID', so they are not reserved bits and KVM
 546          * needs to look at the physical address bits reported by CPUID.
 547          */
 548         if (likely(boot_cpu_data.extended_cpuid_level >= 0x80000008))
 549                 return cpuid_eax(0x80000008) & 0xff;
 550 
 551         /*
 552          * Quite weird to have VMX or SVM but not MAXPHYADDR; probably a VM
 553          * with custom CPUID.  Proceed with whatever the kernel found, since
 554          * SME/SEV also require CPUID leaves higher than 0x80000008.
 555          */
 556         return boot_cpu_data.x86_phys_bits;
 557 }
 558 
 559 static void kvm_mmu_reset_all_pte_masks(void)
 560 {
 561         u8 low_phys_bits;
 562 
 563         shadow_user_mask = 0;
 564         shadow_accessed_mask = 0;
 565         shadow_dirty_mask = 0;
 566         shadow_nx_mask = 0;
 567         shadow_x_mask = 0;
 568         shadow_mmio_mask = 0;
 569         shadow_present_mask = 0;
 570         shadow_acc_track_mask = 0;
 571 
 572         shadow_phys_bits = kvm_get_shadow_phys_bits();
 573 
 574         /*
 575          * If the CPU has 46 or less physical address bits, then set an
 576          * appropriate mask to guard against L1TF attacks. Otherwise, it is
 577          * assumed that the CPU is not vulnerable to L1TF.
 578          *
 579          * Some Intel CPUs address the L1 cache using more PA bits than are
 580          * reported by CPUID. Use the PA width of the L1 cache when possible
 581          * to achieve more effective mitigation, e.g. if system RAM overlaps
 582          * the most significant bits of legal physical address space.
 583          */
 584         shadow_nonpresent_or_rsvd_mask = 0;
 585         low_phys_bits = boot_cpu_data.x86_phys_bits;
 586         if (boot_cpu_has_bug(X86_BUG_L1TF) &&
 587             !WARN_ON_ONCE(boot_cpu_data.x86_cache_bits >=
 588                           52 - shadow_nonpresent_or_rsvd_mask_len)) {
 589                 low_phys_bits = boot_cpu_data.x86_cache_bits
 590                         - shadow_nonpresent_or_rsvd_mask_len;
 591                 shadow_nonpresent_or_rsvd_mask =
 592                         rsvd_bits(low_phys_bits, boot_cpu_data.x86_cache_bits - 1);
 593         }
 594 
 595         shadow_nonpresent_or_rsvd_lower_gfn_mask =
 596                 GENMASK_ULL(low_phys_bits - 1, PAGE_SHIFT);
 597 }
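/*
 * Worked example (added commentary, not part of the original source): on an
 * L1TF-affected CPU reporting boot_cpu_data.x86_cache_bits == 46, the
 * five-bit mask length above gives low_phys_bits = 41, so
 * shadow_nonpresent_or_rsvd_mask covers physical address bits 41-45 and
 * shadow_nonpresent_or_rsvd_lower_gfn_mask covers bits 12-40.
 */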
 598 
 599 static int is_cpuid_PSE36(void)
 600 {
 601         return 1;
 602 }
 603 
 604 static int is_nx(struct kvm_vcpu *vcpu)
 605 {
 606         return vcpu->arch.efer & EFER_NX;
 607 }
 608 
 609 static int is_shadow_present_pte(u64 pte)
 610 {
 611         return (pte != 0) && !is_mmio_spte(pte);
 612 }
 613 
 614 static int is_large_pte(u64 pte)
 615 {
 616         return pte & PT_PAGE_SIZE_MASK;
 617 }
 618 
 619 static int is_last_spte(u64 pte, int level)
 620 {
 621         if (level == PT_PAGE_TABLE_LEVEL)
 622                 return 1;
 623         if (is_large_pte(pte))
 624                 return 1;
 625         return 0;
 626 }
 627 
 628 static bool is_executable_pte(u64 spte)
 629 {
 630         return (spte & (shadow_x_mask | shadow_nx_mask)) == shadow_x_mask;
 631 }
 632 
 633 static kvm_pfn_t spte_to_pfn(u64 pte)
 634 {
 635         return (pte & PT64_BASE_ADDR_MASK) >> PAGE_SHIFT;
 636 }
 637 
 638 static gfn_t pse36_gfn_delta(u32 gpte)
 639 {
 640         int shift = 32 - PT32_DIR_PSE36_SHIFT - PAGE_SHIFT;
 641 
 642         return (gpte & PT32_DIR_PSE36_MASK) << shift;
 643 }
 644 
 645 #ifdef CONFIG_X86_64
 646 static void __set_spte(u64 *sptep, u64 spte)
 647 {
 648         WRITE_ONCE(*sptep, spte);
 649 }
 650 
 651 static void __update_clear_spte_fast(u64 *sptep, u64 spte)
 652 {
 653         WRITE_ONCE(*sptep, spte);
 654 }
 655 
 656 static u64 __update_clear_spte_slow(u64 *sptep, u64 spte)
 657 {
 658         return xchg(sptep, spte);
 659 }
 660 
 661 static u64 __get_spte_lockless(u64 *sptep)
 662 {
 663         return READ_ONCE(*sptep);
 664 }
 665 #else
 666 union split_spte {
 667         struct {
 668                 u32 spte_low;
 669                 u32 spte_high;
 670         };
 671         u64 spte;
 672 };
 673 
 674 static void count_spte_clear(u64 *sptep, u64 spte)
 675 {
 676         struct kvm_mmu_page *sp =  page_header(__pa(sptep));
 677 
 678         if (is_shadow_present_pte(spte))
 679                 return;
 680 
 681         /* Ensure the modified spte is visible before the count is increased */
 682         smp_wmb();
 683         sp->clear_spte_count++;
 684 }
 685 
 686 static void __set_spte(u64 *sptep, u64 spte)
 687 {
 688         union split_spte *ssptep, sspte;
 689 
 690         ssptep = (union split_spte *)sptep;
 691         sspte = (union split_spte)spte;
 692 
 693         ssptep->spte_high = sspte.spte_high;
 694 
 695         /*
 696          * If we map the spte from nonpresent to present, we should store
 697          * the high bits first, then set the present bit, so that a CPU
 698          * cannot fetch the spte while it is being set.
 699          */
 700         smp_wmb();
 701 
 702         WRITE_ONCE(ssptep->spte_low, sspte.spte_low);
 703 }
 704 
 705 static void __update_clear_spte_fast(u64 *sptep, u64 spte)
 706 {
 707         union split_spte *ssptep, sspte;
 708 
 709         ssptep = (union split_spte *)sptep;
 710         sspte = (union split_spte)spte;
 711 
 712         WRITE_ONCE(ssptep->spte_low, sspte.spte_low);
 713 
 714         /*
 715          * If we map the spte from present to nonpresent, we should clear
 716          * the present bit first to avoid a vcpu fetching the old high bits.
 717          */
 718         smp_wmb();
 719 
 720         ssptep->spte_high = sspte.spte_high;
 721         count_spte_clear(sptep, spte);
 722 }
 723 
 724 static u64 __update_clear_spte_slow(u64 *sptep, u64 spte)
 725 {
 726         union split_spte *ssptep, sspte, orig;
 727 
 728         ssptep = (union split_spte *)sptep;
 729         sspte = (union split_spte)spte;
 730 
 731         /* xchg acts as a full barrier before the setting of the high bits */
 732         orig.spte_low = xchg(&ssptep->spte_low, sspte.spte_low);
 733         orig.spte_high = ssptep->spte_high;
 734         ssptep->spte_high = sspte.spte_high;
 735         count_spte_clear(sptep, spte);
 736 
 737         return orig.spte;
 738 }
 739 
 740 
 741 /*
 742  * Lockless read of a 64-bit spte on a 32-bit host, following the same idea
 743  * as gup_get_pte() in mm/gup.c: the two halves cannot be read atomically,
 744  * and another CPU may be updating the spte while a shadow-page walker reads
 745  * it outside of mmu_lock.
 746  *
 747  * The problematic case is returning a present low half paired with a
 748  * mismatched high half.  To avoid it, __set_spte() stores the high half
 749  * before making the low half present, the clearing paths drop the present
 750  * bit (or xchg the low half) before touching the high half, and
 751  * count_spte_clear() bumps sp->clear_spte_count once a clear has completed.
 752  *
 753  * The reader snapshots clear_spte_count and the low half, reads the high
 754  * half, and retries if either the count or the low half changed in the
 755  * meantime.
 756  */
 757 
 758 static u64 __get_spte_lockless(u64 *sptep)
 759 {
 760         struct kvm_mmu_page *sp =  page_header(__pa(sptep));
 761         union split_spte spte, *orig = (union split_spte *)sptep;
 762         int count;
 763 
 764 retry:
 765         count = sp->clear_spte_count;
 766         smp_rmb();
 767 
 768         spte.spte_low = orig->spte_low;
 769         smp_rmb();
 770 
 771         spte.spte_high = orig->spte_high;
 772         smp_rmb();
 773 
 774         if (unlikely(spte.spte_low != orig->spte_low ||
 775               count != sp->clear_spte_count))
 776                 goto retry;
 777 
 778         return spte.spte;
 779 }
 780 #endif
 781 
 782 static bool spte_can_locklessly_be_made_writable(u64 spte)
 783 {
 784         return (spte & (SPTE_HOST_WRITEABLE | SPTE_MMU_WRITEABLE)) ==
 785                 (SPTE_HOST_WRITEABLE | SPTE_MMU_WRITEABLE);
 786 }
 787 
 788 static bool spte_has_volatile_bits(u64 spte)
 789 {
 790         if (!is_shadow_present_pte(spte))
 791                 return false;
 792 
 793         /*
 794          * A spte is volatile if bits in it can change without mmu_lock:
 795          * the writable bit of a lockless-writable spte, the bits of an
 796          * access-track spte, and hardware-set accessed/dirty bits.  Such
 797          * sptes must be updated atomically so those changes are not lost.
 798          */
 799         if (spte_can_locklessly_be_made_writable(spte) ||
 800             is_access_track_spte(spte))
 801                 return true;
 802 
 803         if (spte_ad_enabled(spte)) {
 804                 if ((spte & shadow_accessed_mask) == 0 ||
 805                     (is_writable_pte(spte) && (spte & shadow_dirty_mask) == 0))
 806                         return true;
 807         }
 808 
 809         return false;
 810 }
 811 
 812 static bool is_accessed_spte(u64 spte)
 813 {
 814         u64 accessed_mask = spte_shadow_accessed_mask(spte);
 815 
 816         return accessed_mask ? spte & accessed_mask
 817                              : !is_access_track_spte(spte);
 818 }
 819 
 820 static bool is_dirty_spte(u64 spte)
 821 {
 822         u64 dirty_mask = spte_shadow_dirty_mask(spte);
 823 
 824         return dirty_mask ? spte & dirty_mask : spte & PT_WRITABLE_MASK;
 825 }
 826 
 827 /* Rules for using mmu_spte_set:
 828  * Set the sptep from nonpresent to present.
 829  * Note: the sptep being assigned *must* be either not present
 830  * or in a state where the hardware will not attempt to update
 831  * the spte.
 832  */
 833 static void mmu_spte_set(u64 *sptep, u64 new_spte)
 834 {
 835         WARN_ON(is_shadow_present_pte(*sptep));
 836         __set_spte(sptep, new_spte);
 837 }
 838 
 839 /*
 840  * Update the SPTE (excluding the pfn), but do not track changes in its
 841  * accessed/dirty status.
 842  */
 843 static u64 mmu_spte_update_no_track(u64 *sptep, u64 new_spte)
 844 {
 845         u64 old_spte = *sptep;
 846 
 847         WARN_ON(!is_shadow_present_pte(new_spte));
 848 
 849         if (!is_shadow_present_pte(old_spte)) {
 850                 mmu_spte_set(sptep, new_spte);
 851                 return old_spte;
 852         }
 853 
 854         if (!spte_has_volatile_bits(old_spte))
 855                 __update_clear_spte_fast(sptep, new_spte);
 856         else
 857                 old_spte = __update_clear_spte_slow(sptep, new_spte);
 858 
 859         WARN_ON(spte_to_pfn(old_spte) != spte_to_pfn(new_spte));
 860 
 861         return old_spte;
 862 }
 863 
 864 /* Rules for using mmu_spte_update:
 865  * Update the state bits; the mapped pfn is not changed.
 866  *
 867  * Whenever we overwrite a writable spte with a read-only one we
 868  * should flush remote TLBs. Otherwise rmap_write_protect
 869  * will find a read-only spte, even though the writable spte
 870  * might be cached on a CPU's TLB, and the return value indicates
 871  * this case.
 872  *
 873  * Returns true if the TLB needs to be flushed
 874  */
 875 static bool mmu_spte_update(u64 *sptep, u64 new_spte)
 876 {
 877         bool flush = false;
 878         u64 old_spte = mmu_spte_update_no_track(sptep, new_spte);
 879 
 880         if (!is_shadow_present_pte(old_spte))
 881                 return false;
 882 
 883         /*
 884          * An spte updated out of mmu-lock is safe, since it is always
 885          * updated atomically; see the comments in
 886          * spte_has_volatile_bits().
 887          */
 888         if (spte_can_locklessly_be_made_writable(old_spte) &&
 889               !is_writable_pte(new_spte))
 890                 flush = true;
 891 
 892         /*
 893          * Flush the TLB when accessed/dirty states are changed in the page
 894          * tables, to guarantee consistency between TLB and page tables.
 895          */
 896 
 897         if (is_accessed_spte(old_spte) && !is_accessed_spte(new_spte)) {
 898                 flush = true;
 899                 kvm_set_pfn_accessed(spte_to_pfn(old_spte));
 900         }
 901 
 902         if (is_dirty_spte(old_spte) && !is_dirty_spte(new_spte)) {
 903                 flush = true;
 904                 kvm_set_pfn_dirty(spte_to_pfn(old_spte));
 905         }
 906 
 907         return flush;
 908 }
 909 
 910 /*
 911  * Rules for using mmu_spte_clear_track_bits:
 912  * It sets the sptep from present to nonpresent, and tracks the
 913  * state bits; it is used to clear the last level sptep.
 914  * Returns non-zero if the PTE was previously valid.
 915  */
 916 static int mmu_spte_clear_track_bits(u64 *sptep)
 917 {
 918         kvm_pfn_t pfn;
 919         u64 old_spte = *sptep;
 920 
 921         if (!spte_has_volatile_bits(old_spte))
 922                 __update_clear_spte_fast(sptep, 0ull);
 923         else
 924                 old_spte = __update_clear_spte_slow(sptep, 0ull);
 925 
 926         if (!is_shadow_present_pte(old_spte))
 927                 return 0;
 928 
 929         pfn = spte_to_pfn(old_spte);
 930 
 931         /*
 932          * KVM does not hold the refcount of the page used by
 933          * the kvm mmu; before reclaiming the page, we should
 934          * unmap it from the mmu first.
 935          */
 936         WARN_ON(!kvm_is_reserved_pfn(pfn) && !page_count(pfn_to_page(pfn)));
 937 
 938         if (is_accessed_spte(old_spte))
 939                 kvm_set_pfn_accessed(pfn);
 940 
 941         if (is_dirty_spte(old_spte))
 942                 kvm_set_pfn_dirty(pfn);
 943 
 944         return 1;
 945 }
 946 
 947 /*
 948  * Rules for using mmu_spte_clear_no_track:
 949  * Directly clear the spte without caring about its state bits;
 950  * it is used when clearing upper-level sptes (links to shadow pages).
 951  */
 952 static void mmu_spte_clear_no_track(u64 *sptep)
 953 {
 954         __update_clear_spte_fast(sptep, 0ull);
 955 }
 956 
 957 static u64 mmu_spte_get_lockless(u64 *sptep)
 958 {
 959         return __get_spte_lockless(sptep);
 960 }
 961 
 962 static u64 mark_spte_for_access_track(u64 spte)
 963 {
 964         if (spte_ad_enabled(spte))
 965                 return spte & ~shadow_accessed_mask;
 966 
 967         if (is_access_track_spte(spte))
 968                 return spte;
 969 
 970         /*
 971          * Making an Access Tracking PTE will result in removal of write
 972          * access from the PTE. So, verify that we will be able to restore
 973          * the write access in the fast page fault path later on.
 974          */
 975         WARN_ONCE((spte & PT_WRITABLE_MASK) &&
 976                   !spte_can_locklessly_be_made_writable(spte),
 977                   "kvm: Writable SPTE is not locklessly dirty-trackable\n");
 978 
 979         WARN_ONCE(spte & (shadow_acc_track_saved_bits_mask <<
 980                           shadow_acc_track_saved_bits_shift),
 981                   "kvm: Access Tracking saved bit locations are not zero\n");
 982 
 983         spte |= (spte & shadow_acc_track_saved_bits_mask) <<
 984                 shadow_acc_track_saved_bits_shift;
 985         spte &= ~shadow_acc_track_mask;
 986 
 987         return spte;
 988 }
 989 
 990 /* Restore an acc-track PTE back to a regular PTE */
 991 static u64 restore_acc_track_spte(u64 spte)
 992 {
 993         u64 new_spte = spte;
 994         u64 saved_bits = (spte >> shadow_acc_track_saved_bits_shift)
 995                          & shadow_acc_track_saved_bits_mask;
 996 
 997         WARN_ON_ONCE(spte_ad_enabled(spte));
 998         WARN_ON_ONCE(!is_access_track_spte(spte));
 999 
1000         new_spte &= ~shadow_acc_track_mask;
1001         new_spte &= ~(shadow_acc_track_saved_bits_mask <<
1002                       shadow_acc_track_saved_bits_shift);
1003         new_spte |= saved_bits;
1004 
1005         return new_spte;
1006 }
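/*
 * Illustrative sketch (added commentary, not part of the original source),
 * assuming an A/D-disabled EPT SPTE whose saved-bit slots (bits 54 and 56)
 * start out clear:
 *
 *	u64 marked = mark_spte_for_access_track(spte);
 *		R (bit 0) is parked in bit 54, X (bit 2) in bit 56, and the
 *		shadow_acc_track_mask bits are cleared;
 *	restore_acc_track_spte(marked)
 *		clears the parked copies and puts R/X back into bits 0 and 2.
 */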
1007 
1008 /* Returns the Accessed status of the PTE and resets it at the same time. */
1009 static bool mmu_spte_age(u64 *sptep)
1010 {
1011         u64 spte = mmu_spte_get_lockless(sptep);
1012 
1013         if (!is_accessed_spte(spte))
1014                 return false;
1015 
1016         if (spte_ad_enabled(spte)) {
1017                 clear_bit((ffs(shadow_accessed_mask) - 1),
1018                           (unsigned long *)sptep);
1019         } else {
1020                 /*
1021                  * Capture the dirty status of the page, so that it doesn't
1022                  * get lost when the SPTE is marked for access tracking.
1023                  */
1024                 if (is_writable_pte(spte))
1025                         kvm_set_pfn_dirty(spte_to_pfn(spte));
1026 
1027                 spte = mark_spte_for_access_track(spte);
1028                 mmu_spte_update_no_track(sptep, spte);
1029         }
1030 
1031         return true;
1032 }
1033 
1034 static void walk_shadow_page_lockless_begin(struct kvm_vcpu *vcpu)
1035 {
1036         /*
1037          * Prevent page table teardown by making any free-er wait during
1038          * kvm_flush_remote_tlbs() IPI to all active vcpus.
1039          */
1040         local_irq_disable();
1041 
1042         /*
1043          * Make sure a following spte read is not reordered ahead of the write
1044          * to vcpu->mode.
1045          */
1046         smp_store_mb(vcpu->mode, READING_SHADOW_PAGE_TABLES);
1047 }
1048 
1049 static void walk_shadow_page_lockless_end(struct kvm_vcpu *vcpu)
1050 {
1051         /*
1052          * Make sure the write to vcpu->mode is not reordered in front of
1053          * reads to sptes.  If it does, kvm_mmu_commit_zap_page() can see us
1054          * OUTSIDE_GUEST_MODE and proceed to free the shadow page table.
1055          */
1056         smp_store_release(&vcpu->mode, OUTSIDE_GUEST_MODE);
1057         local_irq_enable();
1058 }
1059 
1060 static int mmu_topup_memory_cache(struct kvm_mmu_memory_cache *cache,
1061                                   struct kmem_cache *base_cache, int min)
1062 {
1063         void *obj;
1064 
1065         if (cache->nobjs >= min)
1066                 return 0;
1067         while (cache->nobjs < ARRAY_SIZE(cache->objects)) {
1068                 obj = kmem_cache_zalloc(base_cache, GFP_KERNEL_ACCOUNT);
1069                 if (!obj)
1070                         return cache->nobjs >= min ? 0 : -ENOMEM;
1071                 cache->objects[cache->nobjs++] = obj;
1072         }
1073         return 0;
1074 }
1075 
1076 static int mmu_memory_cache_free_objects(struct kvm_mmu_memory_cache *cache)
1077 {
1078         return cache->nobjs;
1079 }
1080 
1081 static void mmu_free_memory_cache(struct kvm_mmu_memory_cache *mc,
1082                                   struct kmem_cache *cache)
1083 {
1084         while (mc->nobjs)
1085                 kmem_cache_free(cache, mc->objects[--mc->nobjs]);
1086 }
1087 
1088 static int mmu_topup_memory_cache_page(struct kvm_mmu_memory_cache *cache,
1089                                        int min)
1090 {
1091         void *page;
1092 
1093         if (cache->nobjs >= min)
1094                 return 0;
1095         while (cache->nobjs < ARRAY_SIZE(cache->objects)) {
1096                 page = (void *)__get_free_page(GFP_KERNEL_ACCOUNT);
1097                 if (!page)
1098                         return cache->nobjs >= min ? 0 : -ENOMEM;
1099                 cache->objects[cache->nobjs++] = page;
1100         }
1101         return 0;
1102 }
1103 
1104 static void mmu_free_memory_cache_page(struct kvm_mmu_memory_cache *mc)
1105 {
1106         while (mc->nobjs)
1107                 free_page((unsigned long)mc->objects[--mc->nobjs]);
1108 }
1109 
1110 static int mmu_topup_memory_caches(struct kvm_vcpu *vcpu)
1111 {
1112         int r;
1113 
1114         r = mmu_topup_memory_cache(&vcpu->arch.mmu_pte_list_desc_cache,
1115                                    pte_list_desc_cache, 8 + PTE_PREFETCH_NUM);
1116         if (r)
1117                 goto out;
1118         r = mmu_topup_memory_cache_page(&vcpu->arch.mmu_page_cache, 8);
1119         if (r)
1120                 goto out;
1121         r = mmu_topup_memory_cache(&vcpu->arch.mmu_page_header_cache,
1122                                    mmu_page_header_cache, 4);
1123 out:
1124         return r;
1125 }
1126 
1127 static void mmu_free_memory_caches(struct kvm_vcpu *vcpu)
1128 {
1129         mmu_free_memory_cache(&vcpu->arch.mmu_pte_list_desc_cache,
1130                                 pte_list_desc_cache);
1131         mmu_free_memory_cache_page(&vcpu->arch.mmu_page_cache);
1132         mmu_free_memory_cache(&vcpu->arch.mmu_page_header_cache,
1133                                 mmu_page_header_cache);
1134 }
1135 
1136 static void *mmu_memory_cache_alloc(struct kvm_mmu_memory_cache *mc)
1137 {
1138         void *p;
1139 
1140         BUG_ON(!mc->nobjs);
1141         p = mc->objects[--mc->nobjs];
1142         return p;
1143 }
1144 
1145 static struct pte_list_desc *mmu_alloc_pte_list_desc(struct kvm_vcpu *vcpu)
1146 {
1147         return mmu_memory_cache_alloc(&vcpu->arch.mmu_pte_list_desc_cache);
1148 }
1149 
1150 static void mmu_free_pte_list_desc(struct pte_list_desc *pte_list_desc)
1151 {
1152         kmem_cache_free(pte_list_desc_cache, pte_list_desc);
1153 }
1154 
1155 static gfn_t kvm_mmu_page_get_gfn(struct kvm_mmu_page *sp, int index)
1156 {
1157         if (!sp->role.direct)
1158                 return sp->gfns[index];
1159 
1160         return sp->gfn + (index << ((sp->role.level - 1) * PT64_LEVEL_BITS));
1161 }
1162 
1163 static void kvm_mmu_page_set_gfn(struct kvm_mmu_page *sp, int index, gfn_t gfn)
1164 {
1165         if (!sp->role.direct) {
1166                 sp->gfns[index] = gfn;
1167                 return;
1168         }
1169 
1170         if (WARN_ON(gfn != kvm_mmu_page_get_gfn(sp, index)))
1171                 pr_err_ratelimited("gfn mismatch under direct page %llx "
1172                                    "(expected %llx, got %llx)\n",
1173                                    sp->gfn,
1174                                    kvm_mmu_page_get_gfn(sp, index), gfn);
1175 }
1176 
1177 /*
1178  * Return the pointer to the large page information for a given gfn,
1179  * handling slots that are not large page aligned.
1180  */
1181 static struct kvm_lpage_info *lpage_info_slot(gfn_t gfn,
1182                                               struct kvm_memory_slot *slot,
1183                                               int level)
1184 {
1185         unsigned long idx;
1186 
1187         idx = gfn_to_index(gfn, slot->base_gfn, level);
1188         return &slot->arch.lpage_info[level - 2][idx];
1189 }
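/*
 * Worked example (added commentary, not part of the original source),
 * assuming gfn_to_index() is the usual (gfn >> shift) - (base_gfn >> shift)
 * helper with shift = 9 per level above 4K: for a slot with base_gfn 0x800,
 * lpage_info_slot(0xa37, slot, PT_DIRECTORY_LEVEL) computes index
 * (0xa37 >> 9) - (0x800 >> 9) = 5 - 4 = 1, i.e. the second 2MB-page entry
 * of that slot.
 */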
1190 
1191 static void update_gfn_disallow_lpage_count(struct kvm_memory_slot *slot,
1192                                             gfn_t gfn, int count)
1193 {
1194         struct kvm_lpage_info *linfo;
1195         int i;
1196 
1197         for (i = PT_DIRECTORY_LEVEL; i <= PT_MAX_HUGEPAGE_LEVEL; ++i) {
1198                 linfo = lpage_info_slot(gfn, slot, i);
1199                 linfo->disallow_lpage += count;
1200                 WARN_ON(linfo->disallow_lpage < 0);
1201         }
1202 }
1203 
1204 void kvm_mmu_gfn_disallow_lpage(struct kvm_memory_slot *slot, gfn_t gfn)
1205 {
1206         update_gfn_disallow_lpage_count(slot, gfn, 1);
1207 }
1208 
1209 void kvm_mmu_gfn_allow_lpage(struct kvm_memory_slot *slot, gfn_t gfn)
1210 {
1211         update_gfn_disallow_lpage_count(slot, gfn, -1);
1212 }
1213 
1214 static void account_shadowed(struct kvm *kvm, struct kvm_mmu_page *sp)
1215 {
1216         struct kvm_memslots *slots;
1217         struct kvm_memory_slot *slot;
1218         gfn_t gfn;
1219 
1220         kvm->arch.indirect_shadow_pages++;
1221         gfn = sp->gfn;
1222         slots = kvm_memslots_for_spte_role(kvm, sp->role);
1223         slot = __gfn_to_memslot(slots, gfn);
1224 
1225         /* the non-leaf shadow pages are kept write-protected */
1226         if (sp->role.level > PT_PAGE_TABLE_LEVEL)
1227                 return kvm_slot_page_track_add_page(kvm, slot, gfn,
1228                                                     KVM_PAGE_TRACK_WRITE);
1229 
1230         kvm_mmu_gfn_disallow_lpage(slot, gfn);
1231 }
1232 
1233 static void account_huge_nx_page(struct kvm *kvm, struct kvm_mmu_page *sp)
1234 {
1235         if (sp->lpage_disallowed)
1236                 return;
1237 
1238         ++kvm->stat.nx_lpage_splits;
1239         list_add_tail(&sp->lpage_disallowed_link,
1240                       &kvm->arch.lpage_disallowed_mmu_pages);
1241         sp->lpage_disallowed = true;
1242 }
1243 
1244 static void unaccount_shadowed(struct kvm *kvm, struct kvm_mmu_page *sp)
1245 {
1246         struct kvm_memslots *slots;
1247         struct kvm_memory_slot *slot;
1248         gfn_t gfn;
1249 
1250         kvm->arch.indirect_shadow_pages--;
1251         gfn = sp->gfn;
1252         slots = kvm_memslots_for_spte_role(kvm, sp->role);
1253         slot = __gfn_to_memslot(slots, gfn);
1254         if (sp->role.level > PT_PAGE_TABLE_LEVEL)
1255                 return kvm_slot_page_track_remove_page(kvm, slot, gfn,
1256                                                        KVM_PAGE_TRACK_WRITE);
1257 
1258         kvm_mmu_gfn_allow_lpage(slot, gfn);
1259 }
1260 
1261 static void unaccount_huge_nx_page(struct kvm *kvm, struct kvm_mmu_page *sp)
1262 {
1263         --kvm->stat.nx_lpage_splits;
1264         sp->lpage_disallowed = false;
1265         list_del(&sp->lpage_disallowed_link);
1266 }
1267 
1268 static bool __mmu_gfn_lpage_is_disallowed(gfn_t gfn, int level,
1269                                           struct kvm_memory_slot *slot)
1270 {
1271         struct kvm_lpage_info *linfo;
1272 
1273         if (slot) {
1274                 linfo = lpage_info_slot(gfn, slot, level);
1275                 return !!linfo->disallow_lpage;
1276         }
1277 
1278         return true;
1279 }
1280 
1281 static bool mmu_gfn_lpage_is_disallowed(struct kvm_vcpu *vcpu, gfn_t gfn,
1282                                         int level)
1283 {
1284         struct kvm_memory_slot *slot;
1285 
1286         slot = kvm_vcpu_gfn_to_memslot(vcpu, gfn);
1287         return __mmu_gfn_lpage_is_disallowed(gfn, level, slot);
1288 }
1289 
1290 static int host_mapping_level(struct kvm_vcpu *vcpu, gfn_t gfn)
1291 {
1292         unsigned long page_size;
1293         int i, ret = 0;
1294 
1295         page_size = kvm_host_page_size(vcpu, gfn);
1296 
1297         for (i = PT_PAGE_TABLE_LEVEL; i <= PT_MAX_HUGEPAGE_LEVEL; ++i) {
1298                 if (page_size >= KVM_HPAGE_SIZE(i))
1299                         ret = i;
1300                 else
1301                         break;
1302         }
1303 
1304         return ret;
1305 }
1306 
1307 static inline bool memslot_valid_for_gpte(struct kvm_memory_slot *slot,
1308                                           bool no_dirty_log)
1309 {
1310         if (!slot || slot->flags & KVM_MEMSLOT_INVALID)
1311                 return false;
1312         if (no_dirty_log && slot->dirty_bitmap)
1313                 return false;
1314 
1315         return true;
1316 }
1317 
1318 static struct kvm_memory_slot *
1319 gfn_to_memslot_dirty_bitmap(struct kvm_vcpu *vcpu, gfn_t gfn,
1320                             bool no_dirty_log)
1321 {
1322         struct kvm_memory_slot *slot;
1323 
1324         slot = kvm_vcpu_gfn_to_memslot(vcpu, gfn);
1325         if (!memslot_valid_for_gpte(slot, no_dirty_log))
1326                 slot = NULL;
1327 
1328         return slot;
1329 }
1330 
1331 static int mapping_level(struct kvm_vcpu *vcpu, gfn_t large_gfn,
1332                          bool *force_pt_level)
1333 {
1334         int host_level, level, max_level;
1335         struct kvm_memory_slot *slot;
1336 
1337         if (unlikely(*force_pt_level))
1338                 return PT_PAGE_TABLE_LEVEL;
1339 
1340         slot = kvm_vcpu_gfn_to_memslot(vcpu, large_gfn);
1341         *force_pt_level = !memslot_valid_for_gpte(slot, true);
1342         if (unlikely(*force_pt_level))
1343                 return PT_PAGE_TABLE_LEVEL;
1344 
1345         host_level = host_mapping_level(vcpu, large_gfn);
1346 
1347         if (host_level == PT_PAGE_TABLE_LEVEL)
1348                 return host_level;
1349 
1350         max_level = min(kvm_x86_ops->get_lpage_level(), host_level);
1351 
1352         for (level = PT_DIRECTORY_LEVEL; level <= max_level; ++level)
1353                 if (__mmu_gfn_lpage_is_disallowed(large_gfn, level, slot))
1354                         break;
1355 
1356         return level - 1;
1357 }
1358 
1359 /*
1360  * About rmap_head encoding:
1361  *
1362  * If the bit zero of rmap_head->val is clear, then it points to the only spte
1363  * in this rmap chain. Otherwise, (rmap_head->val & ~1) points to a struct
1364  * pte_list_desc containing more mappings.
1365  */
1366 
1367 /*
1368  * Returns the number of pointers in the rmap chain, not counting the new one.
1369  */
1370 static int pte_list_add(struct kvm_vcpu *vcpu, u64 *spte,
1371                         struct kvm_rmap_head *rmap_head)
1372 {
1373         struct pte_list_desc *desc;
1374         int i, count = 0;
1375 
1376         if (!rmap_head->val) {
1377                 rmap_printk("pte_list_add: %p %llx 0->1\n", spte, *spte);
1378                 rmap_head->val = (unsigned long)spte;
1379         } else if (!(rmap_head->val & 1)) {
1380                 rmap_printk("pte_list_add: %p %llx 1->many\n", spte, *spte);
1381                 desc = mmu_alloc_pte_list_desc(vcpu);
1382                 desc->sptes[0] = (u64 *)rmap_head->val;
1383                 desc->sptes[1] = spte;
1384                 rmap_head->val = (unsigned long)desc | 1;
1385                 ++count;
1386         } else {
1387                 rmap_printk("pte_list_add: %p %llx many->many\n", spte, *spte);
1388                 desc = (struct pte_list_desc *)(rmap_head->val & ~1ul);
1389                 while (desc->sptes[PTE_LIST_EXT-1] && desc->more) {
1390                         desc = desc->more;
1391                         count += PTE_LIST_EXT;
1392                 }
1393                 if (desc->sptes[PTE_LIST_EXT-1]) {
1394                         desc->more = mmu_alloc_pte_list_desc(vcpu);
1395                         desc = desc->more;
1396                 }
1397                 for (i = 0; desc->sptes[i]; ++i)
1398                         ++count;
1399                 desc->sptes[i] = spte;
1400         }
1401         return count;
1402 }
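/*
 * Illustrative sketch (added commentary, not part of the original source):
 * the states an rmap head moves through as sptes are added by the function
 * above:
 *
 *	rmap_head->val == 0                        no sptes
 *	rmap_head->val == (unsigned long)spte      exactly one spte
 *	rmap_head->val == (unsigned long)desc | 1  pte_list_desc chain, the
 *	                                           low bit tags the pointer
 *
 * which is why readers such as rmap_get_first() test bit zero to decide
 * how to interpret the value.
 */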
1403 
1404 static void
1405 pte_list_desc_remove_entry(struct kvm_rmap_head *rmap_head,
1406                            struct pte_list_desc *desc, int i,
1407                            struct pte_list_desc *prev_desc)
1408 {
1409         int j;
1410 
1411         for (j = PTE_LIST_EXT - 1; !desc->sptes[j] && j > i; --j)
1412                 ;
1413         desc->sptes[i] = desc->sptes[j];
1414         desc->sptes[j] = NULL;
1415         if (j != 0)
1416                 return;
1417         if (!prev_desc && !desc->more)
1418                 rmap_head->val = (unsigned long)desc->sptes[0];
1419         else
1420                 if (prev_desc)
1421                         prev_desc->more = desc->more;
1422                 else
1423                         rmap_head->val = (unsigned long)desc->more | 1;
1424         mmu_free_pte_list_desc(desc);
1425 }
1426 
1427 static void __pte_list_remove(u64 *spte, struct kvm_rmap_head *rmap_head)
1428 {
1429         struct pte_list_desc *desc;
1430         struct pte_list_desc *prev_desc;
1431         int i;
1432 
1433         if (!rmap_head->val) {
1434                 pr_err("%s: %p 0->BUG\n", __func__, spte);
1435                 BUG();
1436         } else if (!(rmap_head->val & 1)) {
1437                 rmap_printk("%s:  %p 1->0\n", __func__, spte);
1438                 if ((u64 *)rmap_head->val != spte) {
1439                         pr_err("%s:  %p 1->BUG\n", __func__, spte);
1440                         BUG();
1441                 }
1442                 rmap_head->val = 0;
1443         } else {
1444                 rmap_printk("%s:  %p many->many\n", __func__, spte);
1445                 desc = (struct pte_list_desc *)(rmap_head->val & ~1ul);
1446                 prev_desc = NULL;
1447                 while (desc) {
1448                         for (i = 0; i < PTE_LIST_EXT && desc->sptes[i]; ++i) {
1449                                 if (desc->sptes[i] == spte) {
1450                                         pte_list_desc_remove_entry(rmap_head,
1451                                                         desc, i, prev_desc);
1452                                         return;
1453                                 }
1454                         }
1455                         prev_desc = desc;
1456                         desc = desc->more;
1457                 }
1458                 pr_err("%s: %p many->many\n", __func__, spte);
1459                 BUG();
1460         }
1461 }
1462 
1463 static void pte_list_remove(struct kvm_rmap_head *rmap_head, u64 *sptep)
1464 {
1465         mmu_spte_clear_track_bits(sptep);
1466         __pte_list_remove(sptep, rmap_head);
1467 }
1468 
1469 static struct kvm_rmap_head *__gfn_to_rmap(gfn_t gfn, int level,
1470                                            struct kvm_memory_slot *slot)
1471 {
1472         unsigned long idx;
1473 
1474         idx = gfn_to_index(gfn, slot->base_gfn, level);
1475         return &slot->arch.rmap[level - PT_PAGE_TABLE_LEVEL][idx];
1476 }
1477 
1478 static struct kvm_rmap_head *gfn_to_rmap(struct kvm *kvm, gfn_t gfn,
1479                                          struct kvm_mmu_page *sp)
1480 {
1481         struct kvm_memslots *slots;
1482         struct kvm_memory_slot *slot;
1483 
1484         slots = kvm_memslots_for_spte_role(kvm, sp->role);
1485         slot = __gfn_to_memslot(slots, gfn);
1486         return __gfn_to_rmap(gfn, sp->role.level, slot);
1487 }
1488 
1489 static bool rmap_can_add(struct kvm_vcpu *vcpu)
1490 {
1491         struct kvm_mmu_memory_cache *cache;
1492 
1493         cache = &vcpu->arch.mmu_pte_list_desc_cache;
1494         return mmu_memory_cache_free_objects(cache);
1495 }
1496 
1497 static int rmap_add(struct kvm_vcpu *vcpu, u64 *spte, gfn_t gfn)
1498 {
1499         struct kvm_mmu_page *sp;
1500         struct kvm_rmap_head *rmap_head;
1501 
1502         sp = page_header(__pa(spte));
1503         kvm_mmu_page_set_gfn(sp, spte - sp->spt, gfn);
1504         rmap_head = gfn_to_rmap(vcpu->kvm, gfn, sp);
1505         return pte_list_add(vcpu, spte, rmap_head);
1506 }
1507 
1508 static void rmap_remove(struct kvm *kvm, u64 *spte)
1509 {
1510         struct kvm_mmu_page *sp;
1511         gfn_t gfn;
1512         struct kvm_rmap_head *rmap_head;
1513 
1514         sp = page_header(__pa(spte));
1515         gfn = kvm_mmu_page_get_gfn(sp, spte - sp->spt);
1516         rmap_head = gfn_to_rmap(kvm, gfn, sp);
1517         __pte_list_remove(spte, rmap_head);
1518 }
1519 
1520 /*
1521  * Used by the following functions to iterate through the sptes linked by a
1522  * rmap.  All fields are private and not assumed to be used outside.
1523  */
1524 struct rmap_iterator {
1525         /* private fields */
1526         struct pte_list_desc *desc;     /* holds the sptep if not NULL */
1527         int pos;                        /* index of the sptep */
1528 };
1529 
1530 /*
1531  * Iteration must be started by this function.  This should also be used after
1532  * removing/dropping sptes from the rmap link because in such cases the
1533  * information in the iterator may not be valid.
1534  *
1535  * Returns sptep if found, NULL otherwise.
1536  */
1537 static u64 *rmap_get_first(struct kvm_rmap_head *rmap_head,
1538                            struct rmap_iterator *iter)
1539 {
1540         u64 *sptep;
1541 
1542         if (!rmap_head->val)
1543                 return NULL;
1544 
1545         if (!(rmap_head->val & 1)) {
1546                 iter->desc = NULL;
1547                 sptep = (u64 *)rmap_head->val;
1548                 goto out;
1549         }
1550 
1551         iter->desc = (struct pte_list_desc *)(rmap_head->val & ~1ul);
1552         iter->pos = 0;
1553         sptep = iter->desc->sptes[iter->pos];
1554 out:
1555         BUG_ON(!is_shadow_present_pte(*sptep));
1556         return sptep;
1557 }
1558 
1559 /*
1560  * Must be used with a valid iterator: e.g. after rmap_get_first().
1561  *
1562  * Returns sptep if found, NULL otherwise.
1563  */
1564 static u64 *rmap_get_next(struct rmap_iterator *iter)
1565 {
1566         u64 *sptep;
1567 
1568         if (iter->desc) {
1569                 if (iter->pos < PTE_LIST_EXT - 1) {
1570                         ++iter->pos;
1571                         sptep = iter->desc->sptes[iter->pos];
1572                         if (sptep)
1573                                 goto out;
1574                 }
1575 
1576                 iter->desc = iter->desc->more;
1577 
1578                 if (iter->desc) {
1579                         iter->pos = 0;
1580                         /* desc->sptes[0] cannot be NULL */
1581                         sptep = iter->desc->sptes[iter->pos];
1582                         goto out;
1583                 }
1584         }
1585 
1586         return NULL;
1587 out:
1588         BUG_ON(!is_shadow_present_pte(*sptep));
1589         return sptep;
1590 }
1591 
1592 #define for_each_rmap_spte(_rmap_head_, _iter_, _spte_)                 \
1593         for (_spte_ = rmap_get_first(_rmap_head_, _iter_);              \
1594              _spte_; _spte_ = rmap_get_next(_iter_))
1595 
1596 static void drop_spte(struct kvm *kvm, u64 *sptep)
1597 {
1598         if (mmu_spte_clear_track_bits(sptep))
1599                 rmap_remove(kvm, sptep);
1600 }
1601 
1602 
1603 static bool __drop_large_spte(struct kvm *kvm, u64 *sptep)
1604 {
1605         if (is_large_pte(*sptep)) {
1606                 WARN_ON(page_header(__pa(sptep))->role.level ==
1607                         PT_PAGE_TABLE_LEVEL);
1608                 drop_spte(kvm, sptep);
1609                 --kvm->stat.lpages;
1610                 return true;
1611         }
1612 
1613         return false;
1614 }
1615 
1616 static void drop_large_spte(struct kvm_vcpu *vcpu, u64 *sptep)
1617 {
1618         if (__drop_large_spte(vcpu->kvm, sptep)) {
1619                 struct kvm_mmu_page *sp = page_header(__pa(sptep));
1620 
1621                 kvm_flush_remote_tlbs_with_address(vcpu->kvm, sp->gfn,
1622                         KVM_PAGES_PER_HPAGE(sp->role.level));
1623         }
1624 }
1625 
1626 /*
1627  * Write-protect the specified @sptep; @pt_protect indicates whether the
1628  * spte write-protection is caused by protecting a shadow page table.
1629  *
1630  * Note: write protection differs between dirty logging and spte
1631  * protection:
1632  * - for dirty logging, the spte can be made writable again at any time
1633  *   as long as its dirty bitmap is properly set.
1634  * - for spte protection, the spte can be made writable only after the
1635  *   shadow page is unsynced.
1636  *
1637  * Return true if the TLB needs to be flushed.
1638  */
1639 static bool spte_write_protect(u64 *sptep, bool pt_protect)
1640 {
1641         u64 spte = *sptep;
1642 
1643         if (!is_writable_pte(spte) &&
1644               !(pt_protect && spte_can_locklessly_be_made_writable(spte)))
1645                 return false;
1646 
1647         rmap_printk("rmap_write_protect: spte %p %llx\n", sptep, *sptep);
1648 
1649         if (pt_protect)
1650                 spte &= ~SPTE_MMU_WRITEABLE;
1651         spte = spte & ~PT_WRITABLE_MASK;
1652 
1653         return mmu_spte_update(sptep, spte);
1654 }
1655 
1656 static bool __rmap_write_protect(struct kvm *kvm,
1657                                  struct kvm_rmap_head *rmap_head,
1658                                  bool pt_protect)
1659 {
1660         u64 *sptep;
1661         struct rmap_iterator iter;
1662         bool flush = false;
1663 
1664         for_each_rmap_spte(rmap_head, &iter, sptep)
1665                 flush |= spte_write_protect(sptep, pt_protect);
1666 
1667         return flush;
1668 }
1669 
1670 static bool spte_clear_dirty(u64 *sptep)
1671 {
1672         u64 spte = *sptep;
1673 
1674         rmap_printk("rmap_clear_dirty: spte %p %llx\n", sptep, *sptep);
1675 
1676         MMU_WARN_ON(!spte_ad_enabled(spte));
1677         spte &= ~shadow_dirty_mask;
1678         return mmu_spte_update(sptep, spte);
1679 }
1680 
1681 static bool spte_wrprot_for_clear_dirty(u64 *sptep)
1682 {
1683         bool was_writable = test_and_clear_bit(PT_WRITABLE_SHIFT,
1684                                                (unsigned long *)sptep);
1685         if (was_writable && !spte_ad_enabled(*sptep))
1686                 kvm_set_pfn_dirty(spte_to_pfn(*sptep));
1687 
1688         return was_writable;
1689 }
1690 
1691 /*
1692  * Gets the GFN ready for another round of dirty logging by clearing the
1693  *      - D bit on ad-enabled SPTEs, and
1694  *      - W bit on SPTEs that are write-protected for dirty logging only.
1695  * Returns true iff any D or W bits were cleared.
1696  */
1697 static bool __rmap_clear_dirty(struct kvm *kvm, struct kvm_rmap_head *rmap_head)
1698 {
1699         u64 *sptep;
1700         struct rmap_iterator iter;
1701         bool flush = false;
1702 
1703         for_each_rmap_spte(rmap_head, &iter, sptep)
1704                 if (spte_ad_need_write_protect(*sptep))
1705                         flush |= spte_wrprot_for_clear_dirty(sptep);
1706                 else
1707                         flush |= spte_clear_dirty(sptep);
1708 
1709         return flush;
1710 }
1711 
1712 static bool spte_set_dirty(u64 *sptep)
1713 {
1714         u64 spte = *sptep;
1715 
1716         rmap_printk("rmap_set_dirty: spte %p %llx\n", sptep, *sptep);
1717 
1718         /*
1719          * Only the dirty bit is restored here; write access is not added
1720          * back, and the caller skips sptes protected with
1721          * SPTE_AD_WRPROT_ONLY_MASK entirely.
1722          */
1723         spte |= shadow_dirty_mask;
1724 
1725         return mmu_spte_update(sptep, spte);
1726 }
1727 
1728 static bool __rmap_set_dirty(struct kvm *kvm, struct kvm_rmap_head *rmap_head)
1729 {
1730         u64 *sptep;
1731         struct rmap_iterator iter;
1732         bool flush = false;
1733 
1734         for_each_rmap_spte(rmap_head, &iter, sptep)
1735                 if (spte_ad_enabled(*sptep))
1736                         flush |= spte_set_dirty(sptep);
1737 
1738         return flush;
1739 }
1740 
1741 /**
1742  * kvm_mmu_write_protect_pt_masked - write protect selected PT level pages
1743  * @kvm: kvm instance
1744  * @slot: slot to protect
1745  * @gfn_offset: start of the BITS_PER_LONG pages we care about
1746  * @mask: indicates which pages we should protect
1747  *
1748  * Used when we do not need to care about huge page mappings: e.g. during
1749  * dirty logging we do not have any such mappings.
1750  */
1751 static void kvm_mmu_write_protect_pt_masked(struct kvm *kvm,
1752                                      struct kvm_memory_slot *slot,
1753                                      gfn_t gfn_offset, unsigned long mask)
1754 {
1755         struct kvm_rmap_head *rmap_head;
1756 
1757         while (mask) {
1758                 rmap_head = __gfn_to_rmap(slot->base_gfn + gfn_offset + __ffs(mask),
1759                                           PT_PAGE_TABLE_LEVEL, slot);
1760                 __rmap_write_protect(kvm, rmap_head, false);
1761 
1762                 /* clear the first set bit */
1763                 mask &= mask - 1;
1764         }
1765 }
1766 
1767 /**
1768  * kvm_mmu_clear_dirty_pt_masked - clear MMU D-bit for PT level pages, or
1769  * write protect the page if the D-bit isn't supported.
1770  * @kvm: kvm instance
1771  * @slot: slot to clear D-bit
1772  * @gfn_offset: start of the BITS_PER_LONG pages we care about
1773  * @mask: indicates which pages we should clear the D-bit for
1774  *
1775  * Used for PML to re-log the dirty GPAs after userspace queries dirty_bitmap.
1776  */
1777 void kvm_mmu_clear_dirty_pt_masked(struct kvm *kvm,
1778                                      struct kvm_memory_slot *slot,
1779                                      gfn_t gfn_offset, unsigned long mask)
1780 {
1781         struct kvm_rmap_head *rmap_head;
1782 
1783         while (mask) {
1784                 rmap_head = __gfn_to_rmap(slot->base_gfn + gfn_offset + __ffs(mask),
1785                                           PT_PAGE_TABLE_LEVEL, slot);
1786                 __rmap_clear_dirty(kvm, rmap_head);
1787 
1788                 /* clear the first set bit */
1789                 mask &= mask - 1;
1790         }
1791 }
1792 EXPORT_SYMBOL_GPL(kvm_mmu_clear_dirty_pt_masked);
1793 
1794 /**
1795  * kvm_arch_mmu_enable_log_dirty_pt_masked - enable dirty logging for selected
1796  * PT level pages.
1797  *
1798  * It calls kvm_mmu_write_protect_pt_masked to write protect selected pages
1799  * when the hardware has no dedicated dirty-logging mechanism.
1800  *
1801  * Used when we do not need to care about huge page mappings: e.g. during
1802  * dirty logging we do not have any such mappings.
1803  */
1804 void kvm_arch_mmu_enable_log_dirty_pt_masked(struct kvm *kvm,
1805                                 struct kvm_memory_slot *slot,
1806                                 gfn_t gfn_offset, unsigned long mask)
1807 {
1808         if (kvm_x86_ops->enable_log_dirty_pt_masked)
1809                 kvm_x86_ops->enable_log_dirty_pt_masked(kvm, slot, gfn_offset,
1810                                 mask);
1811         else
1812                 kvm_mmu_write_protect_pt_masked(kvm, slot, gfn_offset, mask);
1813 }
1814 
1815 /**
1816  * kvm_arch_write_log_dirty - emulate dirty page logging
1817  * @vcpu: Guest mode vcpu
1818  *
1819  * Emulate arch specific page modification logging for the
1820  * nested hypervisor.
1821  */
1822 int kvm_arch_write_log_dirty(struct kvm_vcpu *vcpu)
1823 {
1824         if (kvm_x86_ops->write_log_dirty)
1825                 return kvm_x86_ops->write_log_dirty(vcpu);
1826 
1827         return 0;
1828 }
1829 
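/*
 * Write-protect every spte that maps @gfn in @slot, at all mapping levels.
 * Returns true if any spte was write-protected and a TLB flush is needed.
 */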
1830 bool kvm_mmu_slot_gfn_write_protect(struct kvm *kvm,
1831                                     struct kvm_memory_slot *slot, u64 gfn)
1832 {
1833         struct kvm_rmap_head *rmap_head;
1834         int i;
1835         bool write_protected = false;
1836 
1837         for (i = PT_PAGE_TABLE_LEVEL; i <= PT_MAX_HUGEPAGE_LEVEL; ++i) {
1838                 rmap_head = __gfn_to_rmap(gfn, i, slot);
1839                 write_protected |= __rmap_write_protect(kvm, rmap_head, true);
1840         }
1841 
1842         return write_protected;
1843 }
1844 
1845 static bool rmap_write_protect(struct kvm_vcpu *vcpu, u64 gfn)
1846 {
1847         struct kvm_memory_slot *slot;
1848 
1849         slot = kvm_vcpu_gfn_to_memslot(vcpu, gfn);
1850         return kvm_mmu_slot_gfn_write_protect(vcpu->kvm, slot, gfn);
1851 }
1852 
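/*
 * Zap every spte tracked by @rmap_head.  rmap_get_first() is re-evaluated
 * after each removal because pte_list_remove() invalidates the iterator.
 * Returns true if anything was zapped, i.e. a TLB flush is needed.
 */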
1853 static bool kvm_zap_rmapp(struct kvm *kvm, struct kvm_rmap_head *rmap_head)
1854 {
1855         u64 *sptep;
1856         struct rmap_iterator iter;
1857         bool flush = false;
1858 
1859         while ((sptep = rmap_get_first(rmap_head, &iter))) {
1860                 rmap_printk("%s: spte %p %llx.\n", __func__, sptep, *sptep);
1861 
1862                 pte_list_remove(rmap_head, sptep);
1863                 flush = true;
1864         }
1865 
1866         return flush;
1867 }
1868 
1869 static int kvm_unmap_rmapp(struct kvm *kvm, struct kvm_rmap_head *rmap_head,
1870                            struct kvm_memory_slot *slot, gfn_t gfn, int level,
1871                            unsigned long data)
1872 {
1873         return kvm_zap_rmapp(kvm, rmap_head);
1874 }
1875 
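/*
 * mmu_notifier change_pte handler for one rmap head; @data carries the new
 * host pte.  Writable host ptes simply zap the sptes (they will be rebuilt
 * on the next fault); read-only host ptes are rewritten in place to point
 * at the new pfn with write access removed.
 */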
1876 static int kvm_set_pte_rmapp(struct kvm *kvm, struct kvm_rmap_head *rmap_head,
1877                              struct kvm_memory_slot *slot, gfn_t gfn, int level,
1878                              unsigned long data)
1879 {
1880         u64 *sptep;
1881         struct rmap_iterator iter;
1882         int need_flush = 0;
1883         u64 new_spte;
1884         pte_t *ptep = (pte_t *)data;
1885         kvm_pfn_t new_pfn;
1886 
1887         WARN_ON(pte_huge(*ptep));
1888         new_pfn = pte_pfn(*ptep);
1889 
1890 restart:
1891         for_each_rmap_spte(rmap_head, &iter, sptep) {
1892                 rmap_printk("kvm_set_pte_rmapp: spte %p %llx gfn %llx (%d)\n",
1893                             sptep, *sptep, gfn, level);
1894 
1895                 need_flush = 1;
1896 
1897                 if (pte_write(*ptep)) {
1898                         pte_list_remove(rmap_head, sptep);
1899                         goto restart;
1900                 } else {
1901                         new_spte = *sptep & ~PT64_BASE_ADDR_MASK;
1902                         new_spte |= (u64)new_pfn << PAGE_SHIFT;
1903 
1904                         new_spte &= ~PT_WRITABLE_MASK;
1905                         new_spte &= ~SPTE_HOST_WRITEABLE;
1906 
1907                         new_spte = mark_spte_for_access_track(new_spte);
1908 
1909                         mmu_spte_clear_track_bits(sptep);
1910                         mmu_spte_set(sptep, new_spte);
1911                 }
1912         }
1913 
1914         if (need_flush && kvm_available_flush_tlb_with_range()) {
1915                 kvm_flush_remote_tlbs_with_address(kvm, gfn, 1);
1916                 return 0;
1917         }
1918 
1919         return need_flush;
1920 }
1921 
1922 struct slot_rmap_walk_iterator {
1923         /* input fields. */
1924         struct kvm_memory_slot *slot;
1925         gfn_t start_gfn;
1926         gfn_t end_gfn;
1927         int start_level;
1928         int end_level;
1929 
1930         /* output fields. */
1931         gfn_t gfn;
1932         struct kvm_rmap_head *rmap;
1933         int level;
1934 
1935         /* private field. */
1936         struct kvm_rmap_head *end_rmap;
1937 };
1938 
1939 static void
1940 rmap_walk_init_level(struct slot_rmap_walk_iterator *iterator, int level)
1941 {
1942         iterator->level = level;
1943         iterator->gfn = iterator->start_gfn;
1944         iterator->rmap = __gfn_to_rmap(iterator->gfn, level, iterator->slot);
1945         iterator->end_rmap = __gfn_to_rmap(iterator->end_gfn, level,
1946                                            iterator->slot);
1947 }
1948 
1949 static void
1950 slot_rmap_walk_init(struct slot_rmap_walk_iterator *iterator,
1951                     struct kvm_memory_slot *slot, int start_level,
1952                     int end_level, gfn_t start_gfn, gfn_t end_gfn)
1953 {
1954         iterator->slot = slot;
1955         iterator->start_level = start_level;
1956         iterator->end_level = end_level;
1957         iterator->start_gfn = start_gfn;
1958         iterator->end_gfn = end_gfn;
1959 
1960         rmap_walk_init_level(iterator, iterator->start_level);
1961 }
1962 
1963 static bool slot_rmap_walk_okay(struct slot_rmap_walk_iterator *iterator)
1964 {
1965         return !!iterator->rmap;
1966 }
1967 
1968 static void slot_rmap_walk_next(struct slot_rmap_walk_iterator *iterator)
1969 {
1970         if (++iterator->rmap <= iterator->end_rmap) {
1971                 iterator->gfn += (1UL << KVM_HPAGE_GFN_SHIFT(iterator->level));
1972                 return;
1973         }
1974 
1975         if (++iterator->level > iterator->end_level) {
1976                 iterator->rmap = NULL;
1977                 return;
1978         }
1979 
1980         rmap_walk_init_level(iterator, iterator->level);
1981 }
1982 
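/*
 * Walk all rmap heads of @_slot_ covering [_start_gfn, _end_gfn] for every
 * page-table level in [_start_level_, _end_level_].  A rough sketch of the
 * usage pattern (see kvm_handle_hva_range() below for the real thing):
 *
 *	struct slot_rmap_walk_iterator iterator;
 *
 *	for_each_slot_rmap_range(memslot, PT_PAGE_TABLE_LEVEL,
 *				 PT_MAX_HUGEPAGE_LEVEL, gfn_start, gfn_end - 1,
 *				 &iterator)
 *		ret |= handler(kvm, iterator.rmap, memslot, iterator.gfn,
 *			       iterator.level, data);
 */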
1983 #define for_each_slot_rmap_range(_slot_, _start_level_, _end_level_,    \
1984            _start_gfn, _end_gfn, _iter_)                                \
1985         for (slot_rmap_walk_init(_iter_, _slot_, _start_level_,         \
1986                                  _end_level_, _start_gfn, _end_gfn);    \
1987              slot_rmap_walk_okay(_iter_);                               \
1988              slot_rmap_walk_next(_iter_))
1989 
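/*
 * Apply @handler to every rmap head that maps a page intersecting the host
 * virtual address range [@start, @end), across all address spaces and
 * memslots.  The handler return values are OR-ed together.
 */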
1990 static int kvm_handle_hva_range(struct kvm *kvm,
1991                                 unsigned long start,
1992                                 unsigned long end,
1993                                 unsigned long data,
1994                                 int (*handler)(struct kvm *kvm,
1995                                                struct kvm_rmap_head *rmap_head,
1996                                                struct kvm_memory_slot *slot,
1997                                                gfn_t gfn,
1998                                                int level,
1999                                                unsigned long data))
2000 {
2001         struct kvm_memslots *slots;
2002         struct kvm_memory_slot *memslot;
2003         struct slot_rmap_walk_iterator iterator;
2004         int ret = 0;
2005         int i;
2006 
2007         for (i = 0; i < KVM_ADDRESS_SPACE_NUM; i++) {
2008                 slots = __kvm_memslots(kvm, i);
2009                 kvm_for_each_memslot(memslot, slots) {
2010                         unsigned long hva_start, hva_end;
2011                         gfn_t gfn_start, gfn_end;
2012 
2013                         hva_start = max(start, memslot->userspace_addr);
2014                         hva_end = min(end, memslot->userspace_addr +
2015                                       (memslot->npages << PAGE_SHIFT));
2016                         if (hva_start >= hva_end)
2017                                 continue;
2018                         /*
2019                          * {gfn(page) | page intersects with [hva_start, hva_end)} =
2020                          * {gfn_start, gfn_start+1, ..., gfn_end-1}.
2021                          */
2022                         gfn_start = hva_to_gfn_memslot(hva_start, memslot);
2023                         gfn_end = hva_to_gfn_memslot(hva_end + PAGE_SIZE - 1, memslot);
2024 
2025                         for_each_slot_rmap_range(memslot, PT_PAGE_TABLE_LEVEL,
2026                                                  PT_MAX_HUGEPAGE_LEVEL,
2027                                                  gfn_start, gfn_end - 1,
2028                                                  &iterator)
2029                                 ret |= handler(kvm, iterator.rmap, memslot,
2030                                                iterator.gfn, iterator.level, data);
2031                 }
2032         }
2033 
2034         return ret;
2035 }
2036 
2037 static int kvm_handle_hva(struct kvm *kvm, unsigned long hva,
2038                           unsigned long data,
2039                           int (*handler)(struct kvm *kvm,
2040                                          struct kvm_rmap_head *rmap_head,
2041                                          struct kvm_memory_slot *slot,
2042                                          gfn_t gfn, int level,
2043                                          unsigned long data))
2044 {
2045         return kvm_handle_hva_range(kvm, hva, hva + 1, data, handler);
2046 }
2047 
2048 int kvm_unmap_hva_range(struct kvm *kvm, unsigned long start, unsigned long end)
2049 {
2050         return kvm_handle_hva_range(kvm, start, end, 0, kvm_unmap_rmapp);
2051 }
2052 
2053 int kvm_set_spte_hva(struct kvm *kvm, unsigned long hva, pte_t pte)
2054 {
2055         return kvm_handle_hva(kvm, hva, (unsigned long)&pte, kvm_set_pte_rmapp);
2056 }
2057 
2058 static int kvm_age_rmapp(struct kvm *kvm, struct kvm_rmap_head *rmap_head,
2059                          struct kvm_memory_slot *slot, gfn_t gfn, int level,
2060                          unsigned long data)
2061 {
2062         u64 *sptep;
2063         struct rmap_iterator uninitialized_var(iter);
2064         int young = 0;
2065 
2066         for_each_rmap_spte(rmap_head, &iter, sptep)
2067                 young |= mmu_spte_age(sptep);
2068 
2069         trace_kvm_age_page(gfn, level, slot, young);
2070         return young;
2071 }
2072 
2073 static int kvm_test_age_rmapp(struct kvm *kvm, struct kvm_rmap_head *rmap_head,
2074                               struct kvm_memory_slot *slot, gfn_t gfn,
2075                               int level, unsigned long data)
2076 {
2077         u64 *sptep;
2078         struct rmap_iterator iter;
2079 
2080         for_each_rmap_spte(rmap_head, &iter, sptep)
2081                 if (is_accessed_spte(*sptep))
2082                         return 1;
2083         return 0;
2084 }
2085 
2086 #define RMAP_RECYCLE_THRESHOLD 1000
2087 
2088 static void rmap_recycle(struct kvm_vcpu *vcpu, u64 *spte, gfn_t gfn)
2089 {
2090         struct kvm_rmap_head *rmap_head;
2091         struct kvm_mmu_page *sp;
2092 
2093         sp = page_header(__pa(spte));
2094 
2095         rmap_head = gfn_to_rmap(vcpu->kvm, gfn, sp);
2096 
2097         kvm_unmap_rmapp(vcpu->kvm, rmap_head, NULL, gfn, sp->role.level, 0);
2098         kvm_flush_remote_tlbs_with_address(vcpu->kvm, sp->gfn,
2099                         KVM_PAGES_PER_HPAGE(sp->role.level));
2100 }
2101 
2102 int kvm_age_hva(struct kvm *kvm, unsigned long start, unsigned long end)
2103 {
2104         return kvm_handle_hva_range(kvm, start, end, 0, kvm_age_rmapp);
2105 }
2106 
2107 int kvm_test_age_hva(struct kvm *kvm, unsigned long hva)
2108 {
2109         return kvm_handle_hva(kvm, hva, 0, kvm_test_age_rmapp);
2110 }
2111 
2112 #ifdef MMU_DEBUG
2113 static int is_empty_shadow_page(u64 *spt)
2114 {
2115         u64 *pos;
2116         u64 *end;
2117 
2118         for (pos = spt, end = pos + PAGE_SIZE / sizeof(u64); pos != end; pos++)
2119                 if (is_shadow_present_pte(*pos)) {
2120                         printk(KERN_ERR "%s: %p %llx\n", __func__,
2121                                pos, *pos);
2122                         return 0;
2123                 }
2124         return 1;
2125 }
2126 #endif
2127 
2128 /*
2129  * Update both kvm->arch.n_used_mmu_pages and the global
2130  * kvm_total_used_mmu_pages percpu counter.  The global, aggregate
2131  * count is what the MMU shrinker uses to decide how much work is
2132  * worth doing across all kvm instances.
2133  */
2134 static inline void kvm_mod_used_mmu_pages(struct kvm *kvm, unsigned long nr)
2135 {
2136         kvm->arch.n_used_mmu_pages += nr;
2137         percpu_counter_add(&kvm_total_used_mmu_pages, nr);
2138 }
2139 
2140 static void kvm_mmu_free_page(struct kvm_mmu_page *sp)
2141 {
2142         MMU_WARN_ON(!is_empty_shadow_page(sp->spt));
2143         hlist_del(&sp->hash_link);
2144         list_del(&sp->link);
2145         free_page((unsigned long)sp->spt);
2146         if (!sp->role.direct)
2147                 free_page((unsigned long)sp->gfns);
2148         kmem_cache_free(mmu_page_header_cache, sp);
2149 }
2150 
2151 static unsigned kvm_page_table_hashfn(gfn_t gfn)
2152 {
2153         return hash_64(gfn, KVM_MMU_HASH_SHIFT);
2154 }
2155 
2156 static void mmu_page_add_parent_pte(struct kvm_vcpu *vcpu,
2157                                     struct kvm_mmu_page *sp, u64 *parent_pte)
2158 {
2159         if (!parent_pte)
2160                 return;
2161 
2162         pte_list_add(vcpu, parent_pte, &sp->parent_ptes);
2163 }
2164 
2165 static void mmu_page_remove_parent_pte(struct kvm_mmu_page *sp,
2166                                        u64 *parent_pte)
2167 {
2168         __pte_list_remove(parent_pte, &sp->parent_ptes);
2169 }
2170 
2171 static void drop_parent_pte(struct kvm_mmu_page *sp,
2172                             u64 *parent_pte)
2173 {
2174         mmu_page_remove_parent_pte(sp, parent_pte);
2175         mmu_spte_clear_no_track(parent_pte);
2176 }
2177 
2178 static struct kvm_mmu_page *kvm_mmu_alloc_page(struct kvm_vcpu *vcpu, int direct)
2179 {
2180         struct kvm_mmu_page *sp;
2181 
2182         sp = mmu_memory_cache_alloc(&vcpu->arch.mmu_page_header_cache);
2183         sp->spt = mmu_memory_cache_alloc(&vcpu->arch.mmu_page_cache);
2184         if (!direct)
2185                 sp->gfns = mmu_memory_cache_alloc(&vcpu->arch.mmu_page_cache);
2186         set_page_private(virt_to_page(sp->spt), (unsigned long)sp);
2187 
2188         /*
2189          * Tag the page with the current mmu_valid_gen and add it to the
2190          * head of active_mmu_pages, so that zapping of obsolete pages can
2191          * walk the list from the tail and stop at the first page that
2192          * carries the current generation.
2193          */
2193         sp->mmu_valid_gen = vcpu->kvm->arch.mmu_valid_gen;
2194         list_add(&sp->link, &vcpu->kvm->arch.active_mmu_pages);
2195         kvm_mod_used_mmu_pages(vcpu->kvm, +1);
2196         return sp;
2197 }
2198 
2199 static void mark_unsync(u64 *spte);
2200 static void kvm_mmu_mark_parents_unsync(struct kvm_mmu_page *sp)
2201 {
2202         u64 *sptep;
2203         struct rmap_iterator iter;
2204 
2205         for_each_rmap_spte(&sp->parent_ptes, &iter, sptep) {
2206                 mark_unsync(sptep);
2207         }
2208 }
2209 
2210 static void mark_unsync(u64 *spte)
2211 {
2212         struct kvm_mmu_page *sp;
2213         unsigned int index;
2214 
2215         sp = page_header(__pa(spte));
2216         index = spte - sp->spt;
2217         if (__test_and_set_bit(index, sp->unsync_child_bitmap))
2218                 return;
2219         if (sp->unsync_children++)
2220                 return;
2221         kvm_mmu_mark_parents_unsync(sp);
2222 }
2223 
2224 static int nonpaging_sync_page(struct kvm_vcpu *vcpu,
2225                                struct kvm_mmu_page *sp)
2226 {
2227         return 0;
2228 }
2229 
2230 static void nonpaging_invlpg(struct kvm_vcpu *vcpu, gva_t gva, hpa_t root)
2231 {
2232 }
2233 
2234 static void nonpaging_update_pte(struct kvm_vcpu *vcpu,
2235                                  struct kvm_mmu_page *sp, u64 *spte,
2236                                  const void *pte)
2237 {
2238         WARN_ON(1);
2239 }
2240 
2241 #define KVM_PAGE_ARRAY_NR 16
2242 
2243 struct kvm_mmu_pages {
2244         struct mmu_page_and_offset {
2245                 struct kvm_mmu_page *sp;
2246                 unsigned int idx;
2247         } page[KVM_PAGE_ARRAY_NR];
2248         unsigned int nr;
2249 };
2250 
2251 static int mmu_pages_add(struct kvm_mmu_pages *pvec, struct kvm_mmu_page *sp,
2252                          int idx)
2253 {
2254         int i;
2255 
2256         if (sp->unsync)
2257                 for (i = 0; i < pvec->nr; i++)
2258                         if (pvec->page[i].sp == sp)
2259                                 return 0;
2260 
2261         pvec->page[pvec->nr].sp = sp;
2262         pvec->page[pvec->nr].idx = idx;
2263         pvec->nr++;
2264         return (pvec->nr == KVM_PAGE_ARRAY_NR);
2265 }
2266 
2267 static inline void clear_unsync_child_bit(struct kvm_mmu_page *sp, int idx)
2268 {
2269         --sp->unsync_children;
2270         WARN_ON((int)sp->unsync_children < 0);
2271         __clear_bit(idx, sp->unsync_child_bitmap);
2272 }
2273 
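/*
 * Recursively collect unsync leaf pages reachable from @sp into @pvec,
 * clearing unsync_child_bitmap bits that turn out to be stale on the way.
 * Returns the number of unsync leaves added, or -ENOSPC once @pvec is full
 * (the caller then processes the partial batch and walks again).
 */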
2274 static int __mmu_unsync_walk(struct kvm_mmu_page *sp,
2275                            struct kvm_mmu_pages *pvec)
2276 {
2277         int i, ret, nr_unsync_leaf = 0;
2278 
2279         for_each_set_bit(i, sp->unsync_child_bitmap, 512) {
2280                 struct kvm_mmu_page *child;
2281                 u64 ent = sp->spt[i];
2282 
2283                 if (!is_shadow_present_pte(ent) || is_large_pte(ent)) {
2284                         clear_unsync_child_bit(sp, i);
2285                         continue;
2286                 }
2287 
2288                 child = page_header(ent & PT64_BASE_ADDR_MASK);
2289 
2290                 if (child->unsync_children) {
2291                         if (mmu_pages_add(pvec, child, i))
2292                                 return -ENOSPC;
2293 
2294                         ret = __mmu_unsync_walk(child, pvec);
2295                         if (!ret) {
2296                                 clear_unsync_child_bit(sp, i);
2297                                 continue;
2298                         } else if (ret > 0) {
2299                                 nr_unsync_leaf += ret;
2300                         } else
2301                                 return ret;
2302                 } else if (child->unsync) {
2303                         nr_unsync_leaf++;
2304                         if (mmu_pages_add(pvec, child, i))
2305                                 return -ENOSPC;
2306                 } else
2307                         clear_unsync_child_bit(sp, i);
2308         }
2309 
2310         return nr_unsync_leaf;
2311 }
2312 
2313 #define INVALID_INDEX (-1)
2314 
2315 static int mmu_unsync_walk(struct kvm_mmu_page *sp,
2316                            struct kvm_mmu_pages *pvec)
2317 {
2318         pvec->nr = 0;
2319         if (!sp->unsync_children)
2320                 return 0;
2321 
2322         mmu_pages_add(pvec, sp, INVALID_INDEX);
2323         return __mmu_unsync_walk(sp, pvec);
2324 }
2325 
2326 static void kvm_unlink_unsync_page(struct kvm *kvm, struct kvm_mmu_page *sp)
2327 {
2328         WARN_ON(!sp->unsync);
2329         trace_kvm_mmu_sync_page(sp);
2330         sp->unsync = 0;
2331         --kvm->stat.mmu_unsync;
2332 }
2333 
2334 static bool kvm_mmu_prepare_zap_page(struct kvm *kvm, struct kvm_mmu_page *sp,
2335                                      struct list_head *invalid_list);
2336 static void kvm_mmu_commit_zap_page(struct kvm *kvm,
2337                                     struct list_head *invalid_list);
2338 
2339 
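/*
 * Iterate over all shadow pages in the hash bucket for @_gfn, skipping
 * obsolete and invalid pages; the indirect variant additionally requires an
 * exact gfn match and excludes direct-mapped pages.
 */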
2340 #define for_each_valid_sp(_kvm, _sp, _gfn)                              \
2341         hlist_for_each_entry(_sp,                                       \
2342           &(_kvm)->arch.mmu_page_hash[kvm_page_table_hashfn(_gfn)], hash_link) \
2343                 if (is_obsolete_sp((_kvm), (_sp))) {                    \
2344                 } else
2345 
2346 #define for_each_gfn_indirect_valid_sp(_kvm, _sp, _gfn)                 \
2347         for_each_valid_sp(_kvm, _sp, _gfn)                              \
2348                 if ((_sp)->gfn != (_gfn) || (_sp)->role.direct) {} else
2349 
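/*
 * Shadow EPT pages are recognized by a role combination that is impossible
 * for legacy shadow paging: cr0_wp and smap_andnot_wp are only set together
 * when the role is built for EPT.
 */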
2350 static inline bool is_ept_sp(struct kvm_mmu_page *sp)
2351 {
2352         return sp->role.cr0_wp && sp->role.smap_andnot_wp;
2353 }
2354 
2355 /* @sp->gfn should be write-protected at the call site */
2356 static bool __kvm_sync_page(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp,
2357                             struct list_head *invalid_list)
2358 {
2359         if ((!is_ept_sp(sp) && sp->role.gpte_is_8_bytes != !!is_pae(vcpu)) ||
2360             vcpu->arch.mmu->sync_page(vcpu, sp) == 0) {
2361                 kvm_mmu_prepare_zap_page(vcpu->kvm, sp, invalid_list);
2362                 return false;
2363         }
2364 
2365         return true;
2366 }
2367 
2368 static bool kvm_mmu_remote_flush_or_zap(struct kvm *kvm,
2369                                         struct list_head *invalid_list,
2370                                         bool remote_flush)
2371 {
2372         if (!remote_flush && list_empty(invalid_list))
2373                 return false;
2374 
2375         if (!list_empty(invalid_list))
2376                 kvm_mmu_commit_zap_page(kvm, invalid_list);
2377         else
2378                 kvm_flush_remote_tlbs(kvm);
2379         return true;
2380 }
2381 
2382 static void kvm_mmu_flush_or_zap(struct kvm_vcpu *vcpu,
2383                                  struct list_head *invalid_list,
2384                                  bool remote_flush, bool local_flush)
2385 {
2386         if (kvm_mmu_remote_flush_or_zap(vcpu->kvm, invalid_list, remote_flush))
2387                 return;
2388 
2389         if (local_flush)
2390                 kvm_make_request(KVM_REQ_TLB_FLUSH, vcpu);
2391 }
2392 
2393 #ifdef CONFIG_KVM_MMU_AUDIT
2394 #include "mmu_audit.c"
2395 #else
2396 static void kvm_mmu_audit(struct kvm_vcpu *vcpu, int point) { }
2397 static void mmu_audit_disable(void) { }
2398 #endif
2399 
2400 static bool is_obsolete_sp(struct kvm *kvm, struct kvm_mmu_page *sp)
2401 {
2402         return sp->role.invalid ||
2403                unlikely(sp->mmu_valid_gen != kvm->arch.mmu_valid_gen);
2404 }
2405 
2406 static bool kvm_sync_page(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp,
2407                          struct list_head *invalid_list)
2408 {
2409         kvm_unlink_unsync_page(vcpu->kvm, sp);
2410         return __kvm_sync_page(vcpu, sp, invalid_list);
2411 }
2412 
2413 /* @gfn should be write-protected at the call site */
2414 static bool kvm_sync_pages(struct kvm_vcpu *vcpu, gfn_t gfn,
2415                            struct list_head *invalid_list)
2416 {
2417         struct kvm_mmu_page *s;
2418         bool ret = false;
2419 
2420         for_each_gfn_indirect_valid_sp(vcpu->kvm, s, gfn) {
2421                 if (!s->unsync)
2422                         continue;
2423 
2424                 WARN_ON(s->role.level != PT_PAGE_TABLE_LEVEL);
2425                 ret |= kvm_sync_page(vcpu, s, invalid_list);
2426         }
2427 
2428         return ret;
2429 }
2430 
2431 struct mmu_page_path {
2432         struct kvm_mmu_page *parent[PT64_ROOT_MAX_LEVEL];
2433         unsigned int idx[PT64_ROOT_MAX_LEVEL];
2434 };
2435 
2436 #define for_each_sp(pvec, sp, parents, i)                       \
2437                 for (i = mmu_pages_first(&pvec, &parents);      \
2438                         i < pvec.nr && ({ sp = pvec.page[i].sp; 1;});   \
2439                         i = mmu_pages_next(&pvec, &parents, i))
2440 
2441 static int mmu_pages_next(struct kvm_mmu_pages *pvec,
2442                           struct mmu_page_path *parents,
2443                           int i)
2444 {
2445         int n;
2446 
2447         for (n = i+1; n < pvec->nr; n++) {
2448                 struct kvm_mmu_page *sp = pvec->page[n].sp;
2449                 unsigned idx = pvec->page[n].idx;
2450                 int level = sp->role.level;
2451 
2452                 parents->idx[level-1] = idx;
2453                 if (level == PT_PAGE_TABLE_LEVEL)
2454                         break;
2455 
2456                 parents->parent[level-2] = sp;
2457         }
2458 
2459         return n;
2460 }
2461 
2462 static int mmu_pages_first(struct kvm_mmu_pages *pvec,
2463                            struct mmu_page_path *parents)
2464 {
2465         struct kvm_mmu_page *sp;
2466         int level;
2467 
2468         if (pvec->nr == 0)
2469                 return 0;
2470 
2471         WARN_ON(pvec->page[0].idx != INVALID_INDEX);
2472 
2473         sp = pvec->page[0].sp;
2474         level = sp->role.level;
2475         WARN_ON(level == PT_PAGE_TABLE_LEVEL);
2476 
2477         parents->parent[level-2] = sp;
2478 
2479         /* Also set up a sentinel.  Further entries in pvec are all
2480          * children of sp, so this element is never overwritten.
2481          */
2482         parents->parent[level-1] = NULL;
2483         return mmu_pages_next(pvec, parents, 0);
2484 }
2485 
2486 static void mmu_pages_clear_parents(struct mmu_page_path *parents)
2487 {
2488         struct kvm_mmu_page *sp;
2489         unsigned int level = 0;
2490 
2491         do {
2492                 unsigned int idx = parents->idx[level];
2493                 sp = parents->parent[level];
2494                 if (!sp)
2495                         return;
2496 
2497                 WARN_ON(idx == INVALID_INDEX);
2498                 clear_unsync_child_bit(sp, idx);
2499                 level++;
2500         } while (!sp->unsync_children);
2501 }
2502 
2503 static void mmu_sync_children(struct kvm_vcpu *vcpu,
2504                               struct kvm_mmu_page *parent)
2505 {
2506         int i;
2507         struct kvm_mmu_page *sp;
2508         struct mmu_page_path parents;
2509         struct kvm_mmu_pages pages;
2510         LIST_HEAD(invalid_list);
2511         bool flush = false;
2512 
2513         while (mmu_unsync_walk(parent, &pages)) {
2514                 bool protected = false;
2515 
2516                 for_each_sp(pages, sp, parents, i)
2517                         protected |= rmap_write_protect(vcpu, sp->gfn);
2518 
2519                 if (protected) {
2520                         kvm_flush_remote_tlbs(vcpu->kvm);
2521                         flush = false;
2522                 }
2523 
2524                 for_each_sp(pages, sp, parents, i) {
2525                         flush |= kvm_sync_page(vcpu, sp, &invalid_list);
2526                         mmu_pages_clear_parents(&parents);
2527                 }
2528                 if (need_resched() || spin_needbreak(&vcpu->kvm->mmu_lock)) {
2529                         kvm_mmu_flush_or_zap(vcpu, &invalid_list, false, flush);
2530                         cond_resched_lock(&vcpu->kvm->mmu_lock);
2531                         flush = false;
2532                 }
2533         }
2534 
2535         kvm_mmu_flush_or_zap(vcpu, &invalid_list, false, flush);
2536 }
2537 
2538 static void __clear_sp_write_flooding_count(struct kvm_mmu_page *sp)
2539 {
2540         atomic_set(&sp->write_flooding_count, 0);
2541 }
2542 
2543 static void clear_sp_write_flooding_count(u64 *spte)
2544 {
2545         struct kvm_mmu_page *sp = page_header(__pa(spte));
2546 
2547         __clear_sp_write_flooding_count(sp);
2548 }
2549 
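/*
 * Find the shadow page for (@gfn, role) in the hash table, or allocate a new
 * one.  Existing unsync pages are synced (or zapped and rebuilt) before being
 * reused; a new indirect page is accounted and its gfn write-protected so
 * guest PTE writes can be trapped.
 */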
2550 static struct kvm_mmu_page *kvm_mmu_get_page(struct kvm_vcpu *vcpu,
2551                                              gfn_t gfn,
2552                                              gva_t gaddr,
2553                                              unsigned level,
2554                                              int direct,
2555                                              unsigned access)
2556 {
2557         union kvm_mmu_page_role role;
2558         unsigned quadrant;
2559         struct kvm_mmu_page *sp;
2560         bool need_sync = false;
2561         bool flush = false;
2562         int collisions = 0;
2563         LIST_HEAD(invalid_list);
2564 
2565         role = vcpu->arch.mmu->mmu_role.base;
2566         role.level = level;
2567         role.direct = direct;
2568         if (role.direct)
2569                 role.gpte_is_8_bytes = true;
2570         role.access = access;
2571         if (!vcpu->arch.mmu->direct_map
2572             && vcpu->arch.mmu->root_level <= PT32_ROOT_LEVEL) {
2573                 quadrant = gaddr >> (PAGE_SHIFT + (PT64_PT_BITS * level));
2574                 quadrant &= (1 << ((PT32_PT_BITS - PT64_PT_BITS) * level)) - 1;
2575                 role.quadrant = quadrant;
2576         }
2577         for_each_valid_sp(vcpu->kvm, sp, gfn) {
2578                 if (sp->gfn != gfn) {
2579                         collisions++;
2580                         continue;
2581                 }
2582 
2583                 if (!need_sync && sp->unsync)
2584                         need_sync = true;
2585 
2586                 if (sp->role.word != role.word)
2587                         continue;
2588 
2589                 if (sp->unsync) {
2590                         /* The page is good, but __kvm_sync_page might still
2591                          * end up zapping it.  If so, break in order to
2592                          * rebuild it. */
2593                         if (!__kvm_sync_page(vcpu, sp, &invalid_list))
2594                                 break;
2595 
2596                         WARN_ON(!list_empty(&invalid_list));
2597                         kvm_make_request(KVM_REQ_TLB_FLUSH, vcpu);
2598                 }
2599 
2600                 if (sp->unsync_children)
2601                         kvm_make_request(KVM_REQ_MMU_SYNC, vcpu);
2602 
2603                 __clear_sp_write_flooding_count(sp);
2604                 trace_kvm_mmu_get_page(sp, false);
2605                 goto out;
2606         }
2607 
2608         ++vcpu->kvm->stat.mmu_cache_miss;
2609 
2610         sp = kvm_mmu_alloc_page(vcpu, direct);
2611 
2612         sp->gfn = gfn;
2613         sp->role = role;
2614         hlist_add_head(&sp->hash_link,
2615                 &vcpu->kvm->arch.mmu_page_hash[kvm_page_table_hashfn(gfn)]);
2616         if (!direct) {
2617                 /*
2618                  * Write-protect the gfn before syncing pages; otherwise
2619                  * the content of a just-synced shadow page could still be
2620                  * inconsistent with the guest page table.
2621                  */
2622                 account_shadowed(vcpu->kvm, sp);
2623                 if (level == PT_PAGE_TABLE_LEVEL &&
2624                       rmap_write_protect(vcpu, gfn))
2625                         kvm_flush_remote_tlbs_with_address(vcpu->kvm, gfn, 1);
2626 
2627                 if (level > PT_PAGE_TABLE_LEVEL && need_sync)
2628                         flush |= kvm_sync_pages(vcpu, gfn, &invalid_list);
2629         }
2630         clear_page(sp->spt);
2631         trace_kvm_mmu_get_page(sp, true);
2632 
2633         kvm_mmu_flush_or_zap(vcpu, &invalid_list, false, flush);
2634 out:
2635         if (collisions > vcpu->kvm->stat.max_mmu_page_hash_collisions)
2636                 vcpu->kvm->stat.max_mmu_page_hash_collisions = collisions;
2637         return sp;
2638 }
2639 
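/*
 * Shadow page-table walk helpers: shadow_walk_init*() seeds the iterator at
 * the given root, shadow_walk_okay() bounds the walk at the last level, and
 * shadow_walk_next() steps down one level.  Callers use them via the
 * for_each_shadow_entry*() macros, either under mmu_lock or inside a
 * walk_shadow_page_lockless_begin/end section.
 */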
2640 static void shadow_walk_init_using_root(struct kvm_shadow_walk_iterator *iterator,
2641                                         struct kvm_vcpu *vcpu, hpa_t root,
2642                                         u64 addr)
2643 {
2644         iterator->addr = addr;
2645         iterator->shadow_addr = root;
2646         iterator->level = vcpu->arch.mmu->shadow_root_level;
2647 
2648         if (iterator->level == PT64_ROOT_4LEVEL &&
2649             vcpu->arch.mmu->root_level < PT64_ROOT_4LEVEL &&
2650             !vcpu->arch.mmu->direct_map)
2651                 --iterator->level;
2652 
2653         if (iterator->level == PT32E_ROOT_LEVEL) {
2654                 /*
2655                  * pae_root is only maintained for the current root, so a
2656                  * walk that starts at PT32E_ROOT_LEVEL must use root_hpa.
2657                  */
2658                 BUG_ON(root != vcpu->arch.mmu->root_hpa);
2659 
2660                 iterator->shadow_addr
2661                         = vcpu->arch.mmu->pae_root[(addr >> 30) & 3];
2662                 iterator->shadow_addr &= PT64_BASE_ADDR_MASK;
2663                 --iterator->level;
2664                 if (!iterator->shadow_addr)
2665                         iterator->level = 0;
2666         }
2667 }
2668 
2669 static void shadow_walk_init(struct kvm_shadow_walk_iterator *iterator,
2670                              struct kvm_vcpu *vcpu, u64 addr)
2671 {
2672         shadow_walk_init_using_root(iterator, vcpu, vcpu->arch.mmu->root_hpa,
2673                                     addr);
2674 }
2675 
2676 static bool shadow_walk_okay(struct kvm_shadow_walk_iterator *iterator)
2677 {
2678         if (iterator->level < PT_PAGE_TABLE_LEVEL)
2679                 return false;
2680 
2681         iterator->index = SHADOW_PT_INDEX(iterator->addr, iterator->level);
2682         iterator->sptep = ((u64 *)__va(iterator->shadow_addr)) + iterator->index;
2683         return true;
2684 }
2685 
2686 static void __shadow_walk_next(struct kvm_shadow_walk_iterator *iterator,
2687                                u64 spte)
2688 {
2689         if (is_last_spte(spte, iterator->level)) {
2690                 iterator->level = 0;
2691                 return;
2692         }
2693 
2694         iterator->shadow_addr = spte & PT64_BASE_ADDR_MASK;
2695         --iterator->level;
2696 }
2697 
2698 static void shadow_walk_next(struct kvm_shadow_walk_iterator *iterator)
2699 {
2700         __shadow_walk_next(iterator, *iterator->sptep);
2701 }
2702 
2703 static void link_shadow_page(struct kvm_vcpu *vcpu, u64 *sptep,
2704                              struct kvm_mmu_page *sp)
2705 {
2706         u64 spte;
2707 
2708         BUILD_BUG_ON(VMX_EPT_WRITABLE_MASK != PT_WRITABLE_MASK);
2709 
2710         spte = __pa(sp->spt) | shadow_present_mask | PT_WRITABLE_MASK |
2711                shadow_user_mask | shadow_x_mask | shadow_me_mask;
2712 
2713         if (sp_ad_disabled(sp))
2714                 spte |= SPTE_AD_DISABLED_MASK;
2715         else
2716                 spte |= shadow_accessed_mask;
2717 
2718         mmu_spte_set(sptep, spte);
2719 
2720         mmu_page_add_parent_pte(vcpu, sp, sptep);
2721 
2722         if (sp->unsync_children || sp->unsync)
2723                 mark_unsync(sptep);
2724 }
2725 
2726 static void validate_direct_spte(struct kvm_vcpu *vcpu, u64 *sptep,
2727                                    unsigned direct_access)
2728 {
2729         if (is_shadow_present_pte(*sptep) && !is_large_pte(*sptep)) {
2730                 struct kvm_mmu_page *child;
2731 
2732                 /*
2733                  * For a direct sp, if the guest pte's dirty bit changed
2734                  * from clean to dirty, the access changes as well: writes
2735                  * would be allowed through a read-only sp.  Drop the parent
2736                  * pte here so that a new sp with the correct access is
2737                  * created on the next fault instead of reusing this one.
2738                  */
2739                 child = page_header(*sptep & PT64_BASE_ADDR_MASK);
2740                 if (child->role.access == direct_access)
2741                         return;
2742 
2743                 drop_parent_pte(child, sptep);
2744                 kvm_flush_remote_tlbs_with_address(vcpu->kvm, child->gfn, 1);
2745         }
2746 }
2747 
2748 static bool mmu_page_zap_pte(struct kvm *kvm, struct kvm_mmu_page *sp,
2749                              u64 *spte)
2750 {
2751         u64 pte;
2752         struct kvm_mmu_page *child;
2753 
2754         pte = *spte;
2755         if (is_shadow_present_pte(pte)) {
2756                 if (is_last_spte(pte, sp->role.level)) {
2757                         drop_spte(kvm, spte);
2758                         if (is_large_pte(pte))
2759                                 --kvm->stat.lpages;
2760                 } else {
2761                         child = page_header(pte & PT64_BASE_ADDR_MASK);
2762                         drop_parent_pte(child, spte);
2763                 }
2764                 return true;
2765         }
2766 
2767         if (is_mmio_spte(pte))
2768                 mmu_spte_clear_no_track(spte);
2769 
2770         return false;
2771 }
2772 
2773 static void kvm_mmu_page_unlink_children(struct kvm *kvm,
2774                                          struct kvm_mmu_page *sp)
2775 {
2776         unsigned i;
2777 
2778         for (i = 0; i < PT64_ENT_PER_PAGE; ++i)
2779                 mmu_page_zap_pte(kvm, sp, sp->spt + i);
2780 }
2781 
2782 static void kvm_mmu_unlink_parents(struct kvm *kvm, struct kvm_mmu_page *sp)
2783 {
2784         u64 *sptep;
2785         struct rmap_iterator iter;
2786 
2787         while ((sptep = rmap_get_first(&sp->parent_ptes, &iter)))
2788                 drop_parent_pte(sp, sptep);
2789 }
2790 
2791 static int mmu_zap_unsync_children(struct kvm *kvm,
2792                                    struct kvm_mmu_page *parent,
2793                                    struct list_head *invalid_list)
2794 {
2795         int i, zapped = 0;
2796         struct mmu_page_path parents;
2797         struct kvm_mmu_pages pages;
2798 
2799         if (parent->role.level == PT_PAGE_TABLE_LEVEL)
2800                 return 0;
2801 
2802         while (mmu_unsync_walk(parent, &pages)) {
2803                 struct kvm_mmu_page *sp;
2804 
2805                 for_each_sp(pages, sp, parents, i) {
2806                         kvm_mmu_prepare_zap_page(kvm, sp, invalid_list);
2807                         mmu_pages_clear_parents(&parents);
2808                         zapped++;
2809                 }
2810         }
2811 
2812         return zapped;
2813 }
2814 
2815 static bool __kvm_mmu_prepare_zap_page(struct kvm *kvm,
2816                                        struct kvm_mmu_page *sp,
2817                                        struct list_head *invalid_list,
2818                                        int *nr_zapped)
2819 {
2820         bool list_unstable;
2821 
2822         trace_kvm_mmu_prepare_zap_page(sp);
2823         ++kvm->stat.mmu_shadow_zapped;
2824         *nr_zapped = mmu_zap_unsync_children(kvm, sp, invalid_list);
2825         kvm_mmu_page_unlink_children(kvm, sp);
2826         kvm_mmu_unlink_parents(kvm, sp);
2827 
2828         /* Zapping children means active_mmu_pages has become unstable. */
2829         list_unstable = *nr_zapped;
2830 
2831         if (!sp->role.invalid && !sp->role.direct)
2832                 unaccount_shadowed(kvm, sp);
2833 
2834         if (sp->unsync)
2835                 kvm_unlink_unsync_page(kvm, sp);
2836         if (!sp->root_count) {
2837                 /* Count self */
2838                 (*nr_zapped)++;
2839                 list_move(&sp->link, invalid_list);
2840                 kvm_mod_used_mmu_pages(kvm, -1);
2841         } else {
2842                 list_move(&sp->link, &kvm->arch.active_mmu_pages);
2843 
2844                 /*
2845                  * Obsolete pages cannot be used on any vCPUs, see the
2846                  * comment in kvm_mmu_zap_all_fast().  Note, is_obsolete_sp()
2847                  * also treats invalid shadow pages as being obsolete.
2848                  */
2849                 if (!is_obsolete_sp(kvm, sp))
2850                         kvm_reload_remote_mmus(kvm);
2851         }
2852 
2853         if (sp->lpage_disallowed)
2854                 unaccount_huge_nx_page(kvm, sp);
2855 
2856         sp->role.invalid = 1;
2857         return list_unstable;
2858 }
2859 
2860 static bool kvm_mmu_prepare_zap_page(struct kvm *kvm, struct kvm_mmu_page *sp,
2861                                      struct list_head *invalid_list)
2862 {
2863         int nr_zapped;
2864 
2865         __kvm_mmu_prepare_zap_page(kvm, sp, invalid_list, &nr_zapped);
2866         return nr_zapped;
2867 }
2868 
2869 static void kvm_mmu_commit_zap_page(struct kvm *kvm,
2870                                     struct list_head *invalid_list)
2871 {
2872         struct kvm_mmu_page *sp, *nsp;
2873 
2874         if (list_empty(invalid_list))
2875                 return;
2876 
2877         /*
2878          * We need to make sure everyone sees our modifications to
2879          * the page tables and sees changes to vcpu->mode here.  The barrier
2880          * in kvm_flush_remote_tlbs() achieves this.  This pairs
2881          * with vcpu_enter_guest and walk_shadow_page_lockless_begin/end.
2882          *
2883          * In addition, kvm_flush_remote_tlbs waits for all vcpus to exit
2884          * guest mode and/or lockless shadow page table walks.
2885          */
2886         kvm_flush_remote_tlbs(kvm);
2887 
2888         list_for_each_entry_safe(sp, nsp, invalid_list, link) {
2889                 WARN_ON(!sp->role.invalid || sp->root_count);
2890                 kvm_mmu_free_page(sp);
2891         }
2892 }
2893 
2894 static bool prepare_zap_oldest_mmu_page(struct kvm *kvm,
2895                                         struct list_head *invalid_list)
2896 {
2897         struct kvm_mmu_page *sp;
2898 
2899         if (list_empty(&kvm->arch.active_mmu_pages))
2900                 return false;
2901 
2902         sp = list_last_entry(&kvm->arch.active_mmu_pages,
2903                              struct kvm_mmu_page, link);
2904         return kvm_mmu_prepare_zap_page(kvm, sp, invalid_list);
2905 }
2906 
2907 /*
2908  * Change the number of mmu pages allocated to the vm.  If the goal is too
2909  * low, zap the oldest pages first and clamp the goal to what stays in use.
2910  */
2911 void kvm_mmu_change_mmu_pages(struct kvm *kvm, unsigned long goal_nr_mmu_pages)
2912 {
2913         LIST_HEAD(invalid_list);
2914 
2915         spin_lock(&kvm->mmu_lock);
2916 
2917         if (kvm->arch.n_used_mmu_pages > goal_nr_mmu_pages) {
2918                 /* Need to free some mmu pages to achieve the goal. */
2919                 while (kvm->arch.n_used_mmu_pages > goal_nr_mmu_pages)
2920                         if (!prepare_zap_oldest_mmu_page(kvm, &invalid_list))
2921                                 break;
2922 
2923                 kvm_mmu_commit_zap_page(kvm, &invalid_list);
2924                 goal_nr_mmu_pages = kvm->arch.n_used_mmu_pages;
2925         }
2926 
2927         kvm->arch.n_max_mmu_pages = goal_nr_mmu_pages;
2928 
2929         spin_unlock(&kvm->mmu_lock);
2930 }
2931 
2932 int kvm_mmu_unprotect_page(struct kvm *kvm, gfn_t gfn)
2933 {
2934         struct kvm_mmu_page *sp;
2935         LIST_HEAD(invalid_list);
2936         int r;
2937 
2938         pgprintk("%s: looking for gfn %llx\n", __func__, gfn);
2939         r = 0;
2940         spin_lock(&kvm->mmu_lock);
2941         for_each_gfn_indirect_valid_sp(kvm, sp, gfn) {
2942                 pgprintk("%s: gfn %llx role %x\n", __func__, gfn,
2943                          sp->role.word);
2944                 r = 1;
2945                 kvm_mmu_prepare_zap_page(kvm, sp, &invalid_list);
2946         }
2947         kvm_mmu_commit_zap_page(kvm, &invalid_list);
2948         spin_unlock(&kvm->mmu_lock);
2949 
2950         return r;
2951 }
2952 EXPORT_SYMBOL_GPL(kvm_mmu_unprotect_page);
2953 
2954 static void kvm_unsync_page(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp)
2955 {
2956         trace_kvm_mmu_unsync_page(sp);
2957         ++vcpu->kvm->stat.mmu_unsync;
2958         sp->unsync = 1;
2959 
2960         kvm_mmu_mark_parents_unsync(sp);
2961 }
2962 
2963 static bool mmu_need_write_protect(struct kvm_vcpu *vcpu, gfn_t gfn,
2964                                    bool can_unsync)
2965 {
2966         struct kvm_mmu_page *sp;
2967 
2968         if (kvm_page_track_is_active(vcpu, gfn, KVM_PAGE_TRACK_WRITE))
2969                 return true;
2970 
2971         for_each_gfn_indirect_valid_sp(vcpu->kvm, sp, gfn) {
2972                 if (!can_unsync)
2973                         return true;
2974 
2975                 if (sp->unsync)
2976                         continue;
2977 
2978                 WARN_ON(sp->role.level != PT_PAGE_TABLE_LEVEL);
2979                 kvm_unsync_page(vcpu, sp);
2980         }
2981 
2982         /*
2983          * We need to ensure that the marking of unsync pages is visible
2984          * before the SPTE is updated to allow writes because
2985          * kvm_mmu_sync_roots() checks the unsync flags without holding
2986          * the mmu lock and so may racily see a false negative if the
2987          * marking is not yet visible.
2988          *
2989          * For example (CPU 1 is this function, CPU 2 runs the guest that
2990          * uses the shadow page being marked unsync):
2991          *
2992          * CPU 1                        CPU 2
2993          * --------------------------------------------------------------
2994          * 1.2 Host updates SPTE
2995          *     to be writable
2996          *                              2.1 Guest writes a GPTE for GVA X,
2997          *                                  the GPTE being in the guest page
2998          *                                  table shadowed by the SP from
2999          *                                  CPU 1.  The walk reads the SPTE;
3000          *                                  since SPTE.W is read as 1 there
3001          *                                  is no fault.
3002          *
3003          *                              2.2 Guest issues a TLB flush, which
3004          *                                  causes a VM exit.
3005          *
3006          *                              2.3 kvm_mmu_sync_roots() reads
3007          *                                  sp->unsync.  It is still false,
3008          *                                  so nothing is synced.
3009          *
3010          *                              2.4 Guest accesses GVA X; the stale
3011          *                                  mapping in the SP is used.
3012          * 1.1 Host marks SP
3013          *     as unsync
3014          *     (sp->unsync = true)
3015          *
3016          * The write barrier below ensures that 1.1 happens before 1.2 and
3017          * thus the situation in 2.4 cannot happen.
3018          */
3019         smp_wmb();
3020 
3021         return false;
3022 }
3023 
3024 static bool kvm_is_mmio_pfn(kvm_pfn_t pfn)
3025 {
3026         if (pfn_valid(pfn))
3027                 return !is_zero_pfn(pfn) && PageReserved(pfn_to_page(pfn)) &&
3028                         /*
3029                          * Some reserved pages, such as those from NVDIMM
3030                          * DAX devices, are not for MMIO, and can be mapped
3031                          * with cached memory type for better performance.
3032                          * However, the check above would misidentify those
3033                          * pages as MMIO, and KVM would then map them with UC
3034                          * memory type, which would hurt performance.
3035                          * Therefore, check the host memory type as well and
3036                          * only treat UC/UC-/WC pages as MMIO.
3037                          */
3038                         (!pat_enabled() || pat_pfn_immune_to_uc_mtrr(pfn));
3039 
3040         return !e820__mapped_raw_any(pfn_to_hpa(pfn),
3041                                      pfn_to_hpa(pfn + 1) - 1,
3042                                      E820_TYPE_RAM);
3043 }
3044 
3045 
3046 #define SET_SPTE_WRITE_PROTECTED_PT     BIT(0)
3047 #define SET_SPTE_NEED_REMOTE_TLB_FLUSH  BIT(1)
3048 
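/*
 * Build and install a shadow pte.  The return value is a bitmask:
 * SET_SPTE_WRITE_PROTECTED_PT if the gfn had to stay write-protected (the
 * caller may need to emulate the access), and SET_SPTE_NEED_REMOTE_TLB_FLUSH
 * if the update requires flushing remote TLBs.
 */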
3049 static int set_spte(struct kvm_vcpu *vcpu, u64 *sptep,
3050                     unsigned pte_access, int level,
3051                     gfn_t gfn, kvm_pfn_t pfn, bool speculative,
3052                     bool can_unsync, bool host_writable)
3053 {
3054         u64 spte = 0;
3055         int ret = 0;
3056         struct kvm_mmu_page *sp;
3057 
3058         if (set_mmio_spte(vcpu, sptep, gfn, pfn, pte_access))
3059                 return 0;
3060 
3061         sp = page_header(__pa(sptep));
3062         if (sp_ad_disabled(sp))
3063                 spte |= SPTE_AD_DISABLED_MASK;
3064         else if (kvm_vcpu_ad_need_write_protect(vcpu))
3065                 spte |= SPTE_AD_WRPROT_ONLY_MASK;
3066 
3067         /*
3068          * For the EPT case, shadow_present_mask is 0 if hardware
3069          * supports exec-only page table entries.  In that case,
3070          * ACC_USER_MASK and shadow_user_mask are used to represent
3071          * read access.  See FNAME(gpte_access) in paging_tmpl.h.
3072          */
3073         spte |= shadow_present_mask;
3074         if (!speculative)
3075                 spte |= spte_shadow_accessed_mask(spte);
3076 
3077         if (level > PT_PAGE_TABLE_LEVEL && (pte_access & ACC_EXEC_MASK) &&
3078             is_nx_huge_page_enabled()) {
3079                 pte_access &= ~ACC_EXEC_MASK;
3080         }
3081 
3082         if (pte_access & ACC_EXEC_MASK)
3083                 spte |= shadow_x_mask;
3084         else
3085                 spte |= shadow_nx_mask;
3086 
3087         if (pte_access & ACC_USER_MASK)
3088                 spte |= shadow_user_mask;
3089 
3090         if (level > PT_PAGE_TABLE_LEVEL)
3091                 spte |= PT_PAGE_SIZE_MASK;
3092         if (tdp_enabled)
3093                 spte |= kvm_x86_ops->get_mt_mask(vcpu, gfn,
3094                         kvm_is_mmio_pfn(pfn));
3095 
3096         if (host_writable)
3097                 spte |= SPTE_HOST_WRITEABLE;
3098         else
3099                 pte_access &= ~ACC_WRITE_MASK;
3100 
3101         if (!kvm_is_mmio_pfn(pfn))
3102                 spte |= shadow_me_mask;
3103 
3104         spte |= (u64)pfn << PAGE_SHIFT;
3105 
3106         if (pte_access & ACC_WRITE_MASK) {
3107 
3108                 /*
3109                  * Another vcpu may have created a new sp in the window
3110                  * between mapping_level() and acquiring the mmu lock.
3111                  * Allow the guest to retry the access; the mapping can
3112                  * be fixed on the refault.
3113                  */
3114                 if (level > PT_PAGE_TABLE_LEVEL &&
3115                     mmu_gfn_lpage_is_disallowed(vcpu, gfn, level))
3116                         goto done;
3117 
3118                 spte |= PT_WRITABLE_MASK | SPTE_MMU_WRITEABLE;
3119 
3120                 /*
3121                  * Optimization: for pte sync, if the spte was writable the
3122                  * hash lookup is unnecessary (and expensive).  Write
3123                  * protection is the responsibility of mmu_get_page /
3124                  * kvm_sync_page.  The same reasoning applies to dirty page
3125                  * accounting.
3126                  */
3126                 if (!can_unsync && is_writable_pte(*sptep))
3127                         goto set_pte;
3128 
3129                 if (mmu_need_write_protect(vcpu, gfn, can_unsync)) {
3130                         pgprintk("%s: found shadow page for %llx, marking ro\n",
3131                                  __func__, gfn);
3132                         ret |= SET_SPTE_WRITE_PROTECTED_PT;
3133                         pte_access &= ~ACC_WRITE_MASK;
3134                         spte &= ~(PT_WRITABLE_MASK | SPTE_MMU_WRITEABLE);
3135                 }
3136         }
3137 
3138         if (pte_access & ACC_WRITE_MASK) {
3139                 kvm_vcpu_mark_page_dirty(vcpu, gfn);
3140                 spte |= spte_shadow_dirty_mask(spte);
3141         }
3142 
3143         if (speculative)
3144                 spte = mark_spte_for_access_track(spte);
3145 
3146 set_pte:
3147         if (mmu_spte_update(sptep, spte))
3148                 ret |= SET_SPTE_NEED_REMOTE_TLB_FLUSH;
3149 done:
3150         return ret;
3151 }
3152 
3153 static int mmu_set_spte(struct kvm_vcpu *vcpu, u64 *sptep, unsigned pte_access,
3154                         int write_fault, int level, gfn_t gfn, kvm_pfn_t pfn,
3155                         bool speculative, bool host_writable)
3156 {
3157         int was_rmapped = 0;
3158         int rmap_count;
3159         int set_spte_ret;
3160         int ret = RET_PF_RETRY;
3161         bool flush = false;
3162 
3163         pgprintk("%s: spte %llx write_fault %d gfn %llx\n", __func__,
3164                  *sptep, write_fault, gfn);
3165 
3166         if (is_shadow_present_pte(*sptep)) {
3167                 /*
3168                  * If we overwrite a PTE page pointer with a 2MB PMD, unlink
3169                  * the parent of the now unreachable PTE.
3170                  */
3171                 if (level > PT_PAGE_TABLE_LEVEL &&
3172                     !is_large_pte(*sptep)) {
3173                         struct kvm_mmu_page *child;
3174                         u64 pte = *sptep;
3175 
3176                         child = page_header(pte & PT64_BASE_ADDR_MASK);
3177                         drop_parent_pte(child, sptep);
3178                         flush = true;
3179                 } else if (pfn != spte_to_pfn(*sptep)) {
3180                         pgprintk("hfn old %llx new %llx\n",
3181                                  spte_to_pfn(*sptep), pfn);
3182                         drop_spte(vcpu->kvm, sptep);
3183                         flush = true;
3184                 } else
3185                         was_rmapped = 1;
3186         }
3187 
3188         set_spte_ret = set_spte(vcpu, sptep, pte_access, level, gfn, pfn,
3189                                 speculative, true, host_writable);
3190         if (set_spte_ret & SET_SPTE_WRITE_PROTECTED_PT) {
3191                 if (write_fault)
3192                         ret = RET_PF_EMULATE;
3193                 kvm_make_request(KVM_REQ_TLB_FLUSH, vcpu);
3194         }
3195 
3196         if (set_spte_ret & SET_SPTE_NEED_REMOTE_TLB_FLUSH || flush)
3197                 kvm_flush_remote_tlbs_with_address(vcpu->kvm, gfn,
3198                                 KVM_PAGES_PER_HPAGE(level));
3199 
3200         if (unlikely(is_mmio_spte(*sptep)))
3201                 ret = RET_PF_EMULATE;
3202 
3203         pgprintk("%s: setting spte %llx\n", __func__, *sptep);
3204         trace_kvm_mmu_set_spte(level, gfn, sptep);
3205         if (!was_rmapped && is_large_pte(*sptep))
3206                 ++vcpu->kvm->stat.lpages;
3207 
3208         if (is_shadow_present_pte(*sptep)) {
3209                 if (!was_rmapped) {
3210                         rmap_count = rmap_add(vcpu, sptep, gfn);
3211                         if (rmap_count > RMAP_RECYCLE_THRESHOLD)
3212                                 rmap_recycle(vcpu, sptep, gfn);
3213                 }
3214         }
3215 
3216         return ret;
3217 }
3218 
3219 static kvm_pfn_t pte_prefetch_gfn_to_pfn(struct kvm_vcpu *vcpu, gfn_t gfn,
3220                                      bool no_dirty_log)
3221 {
3222         struct kvm_memory_slot *slot;
3223 
3224         slot = gfn_to_memslot_dirty_bitmap(vcpu, gfn, no_dirty_log);
3225         if (!slot)
3226                 return KVM_PFN_ERR_FAULT;
3227 
3228         return gfn_to_pfn_memslot_atomic(slot, gfn);
3229 }
3230 
3231 static int direct_pte_prefetch_many(struct kvm_vcpu *vcpu,
3232                                     struct kvm_mmu_page *sp,
3233                                     u64 *start, u64 *end)
3234 {
3235         struct page *pages[PTE_PREFETCH_NUM];
3236         struct kvm_memory_slot *slot;
3237         unsigned access = sp->role.access;
3238         int i, ret;
3239         gfn_t gfn;
3240 
3241         gfn = kvm_mmu_page_get_gfn(sp, start - sp->spt);
3242         slot = gfn_to_memslot_dirty_bitmap(vcpu, gfn, access & ACC_WRITE_MASK);
3243         if (!slot)
3244                 return -1;
3245 
3246         ret = gfn_to_page_many_atomic(slot, gfn, pages, end - start);
3247         if (ret <= 0)
3248                 return -1;
3249 
3250         for (i = 0; i < ret; i++, gfn++, start++) {
3251                 mmu_set_spte(vcpu, start, access, 0, sp->role.level, gfn,
3252                              page_to_pfn(pages[i]), true, true);
3253                 put_page(pages[i]);
3254         }
3255 
3256         return 0;
3257 }
3258 
3259 static void __direct_pte_prefetch(struct kvm_vcpu *vcpu,
3260                                   struct kvm_mmu_page *sp, u64 *sptep)
3261 {
3262         u64 *spte, *start = NULL;
3263         int i;
3264 
3265         WARN_ON(!sp->role.direct);
3266 
3267         i = (sptep - sp->spt) & ~(PTE_PREFETCH_NUM - 1);
3268         spte = sp->spt + i;
3269 
3270         for (i = 0; i < PTE_PREFETCH_NUM; i++, spte++) {
3271                 if (is_shadow_present_pte(*spte) || spte == sptep) {
3272                         if (!start)
3273                                 continue;
3274                         if (direct_pte_prefetch_many(vcpu, sp, start, spte) < 0)
3275                                 break;
3276                         start = NULL;
3277                 } else if (!start)
3278                         start = spte;
3279         }
3280 }
3281 
3282 static void direct_pte_prefetch(struct kvm_vcpu *vcpu, u64 *sptep)
3283 {
3284         struct kvm_mmu_page *sp;
3285 
3286         sp = page_header(__pa(sptep));
3287 
3288         /*
3289          * Without accessed bits there is no way to distinguish between
3290          * actually accessed translations and prefetched ones, so disable
3291          * pte prefetch when A/D bits are disabled for this sp.
3292          */
3293         if (sp_ad_disabled(sp))
3294                 return;
3295 
3296         if (sp->role.level > PT_PAGE_TABLE_LEVEL)
3297                 return;
3298 
3299         __direct_pte_prefetch(vcpu, sp, sptep);
3300 }
3301 
3302 static void disallowed_hugepage_adjust(struct kvm_shadow_walk_iterator it,
3303                                        gfn_t gfn, kvm_pfn_t *pfnp, int *levelp)
3304 {
3305         int level = *levelp;
3306         u64 spte = *it.sptep;
3307 
3308         if (it.level == level && level > PT_PAGE_TABLE_LEVEL &&
3309             is_nx_huge_page_enabled() &&
3310             is_shadow_present_pte(spte) &&
3311             !is_large_pte(spte)) {
3312                 /*
3313                  * A small SPTE exists for this pfn, but FNAME(fetch)
3314                  * and __direct_map would like to create a large PTE
3315                  * instead: just force them to go down another level,
3316                  * patching back for them into pfn the next 9 bits of
3317                  * the address.
3318                  */
3319                 u64 page_mask = KVM_PAGES_PER_HPAGE(level) - KVM_PAGES_PER_HPAGE(level - 1);
3320                 *pfnp |= gfn & page_mask;
3321                 (*levelp)--;
3322         }
3323 }
3324 
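/*
 * Map @gpa at @level for a direct-mapped MMU (TDP or nonpaging): walk down
 * the shadow page table, allocating and linking intermediate shadow pages as
 * needed, then install the final spte via mmu_set_spte().
 */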
3325 static int __direct_map(struct kvm_vcpu *vcpu, gpa_t gpa, int write,
3326                         int map_writable, int level, kvm_pfn_t pfn,
3327                         bool prefault, bool lpage_disallowed)
3328 {
3329         struct kvm_shadow_walk_iterator it;
3330         struct kvm_mmu_page *sp;
3331         int ret;
3332         gfn_t gfn = gpa >> PAGE_SHIFT;
3333         gfn_t base_gfn = gfn;
3334 
3335         if (!VALID_PAGE(vcpu->arch.mmu->root_hpa))
3336                 return RET_PF_RETRY;
3337 
3338         trace_kvm_mmu_spte_requested(gpa, level, pfn);
3339         for_each_shadow_entry(vcpu, gpa, it) {
3340                 /*
3341                  * Drop down to a smaller mapping level if a huge page
3342                  * is disallowed here by the NX huge page workaround.
3343                  */
3344                 disallowed_hugepage_adjust(it, gfn, &pfn, &level);
3345 
3346                 base_gfn = gfn & ~(KVM_PAGES_PER_HPAGE(it.level) - 1);
3347                 if (it.level == level)
3348                         break;
3349 
3350                 drop_large_spte(vcpu, it.sptep);
3351                 if (!is_shadow_present_pte(*it.sptep)) {
3352                         sp = kvm_mmu_get_page(vcpu, base_gfn, it.addr,
3353                                               it.level - 1, true, ACC_ALL);
3354 
3355                         link_shadow_page(vcpu, it.sptep, sp);
3356                         if (lpage_disallowed)
3357                                 account_huge_nx_page(vcpu->kvm, sp);
3358                 }
3359         }
3360 
3361         ret = mmu_set_spte(vcpu, it.sptep, ACC_ALL,
3362                            write, level, base_gfn, pfn, prefault,
3363                            map_writable);
3364         direct_pte_prefetch(vcpu, it.sptep);
3365         ++vcpu->stat.pf_fixed;
3366         return ret;
3367 }
3368 
3369 static void kvm_send_hwpoison_signal(unsigned long address, struct task_struct *tsk)
3370 {
3371         send_sig_mceerr(BUS_MCEERR_AR, (void __user *)address, PAGE_SHIFT, tsk);
3372 }
3373 
3374 static int kvm_handle_bad_page(struct kvm_vcpu *vcpu, gfn_t gfn, kvm_pfn_t pfn)
3375 {
3376         /*
3377          * Do not cache MMIO info for a write to a read-only gfn;
3378          * otherwise a later read of the same gfn would also be treated
3379          * as an MMIO access instead of an ordinary read.
3380          */
3381         if (pfn == KVM_PFN_ERR_RO_FAULT)
3382                 return RET_PF_EMULATE;
3383 
3384         if (pfn == KVM_PFN_ERR_HWPOISON) {
3385                 kvm_send_hwpoison_signal(kvm_vcpu_gfn_to_hva(vcpu, gfn), current);
3386                 return RET_PF_RETRY;
3387         }
3388 
3389         return -EFAULT;
3390 }
3391 
3392 static void transparent_hugepage_adjust(struct kvm_vcpu *vcpu,
3393                                         gfn_t gfn, kvm_pfn_t *pfnp,
3394                                         int *levelp)
3395 {
3396         kvm_pfn_t pfn = *pfnp;
3397         int level = *levelp;
3398 
3399         /*
3400          * Check whether the pfn is backed by a transparent huge page;
3401          * hugetlbfs mappings already get the right level from
3402          * mapping_level(), so only THP-backed memory reaches here with
3403          * level == PT_PAGE_TABLE_LEVEL and a compound host page.
3404          */
3405         if (!is_error_noslot_pfn(pfn) && !kvm_is_reserved_pfn(pfn) &&
3406             !kvm_is_zone_device_pfn(pfn) && level == PT_PAGE_TABLE_LEVEL &&
3407             PageTransCompoundMap(pfn_to_page(pfn)) &&
3408             !mmu_gfn_lpage_is_disallowed(vcpu, gfn, PT_DIRECTORY_LEVEL)) {
3409                 unsigned long mask;
3410                 /*
3411                  * mmu_notifier_retry() was checked and mmu_lock is held,
3412                  * so the compound page cannot be split underneath us.
3413                  * Promote the mapping to PT_DIRECTORY_LEVEL and, if the
3414                  * pfn is not aligned to the huge page, move the
3415                  * reference we hold from the tail page to the head page
3416                  * so that the refcount remains balanced when the spte
3417                  * is eventually dropped.
3418                  */
3419                 *levelp = level = PT_DIRECTORY_LEVEL;
3420                 mask = KVM_PAGES_PER_HPAGE(level) - 1;
3421                 VM_BUG_ON((gfn & mask) != (pfn & mask));
3422                 if (pfn & mask) {
3423                         kvm_release_pfn_clean(pfn);
3424                         pfn &= ~mask;
3425                         kvm_get_pfn(pfn);
3426                         *pfnp = pfn;
3427                 }
3428         }
3429 }
3430 
3431 static bool handle_abnormal_pfn(struct kvm_vcpu *vcpu, gva_t gva, gfn_t gfn,
3432                                 kvm_pfn_t pfn, unsigned access, int *ret_val)
3433 {
3434         /* The pfn is invalid, report the error. */
3435         if (unlikely(is_error_pfn(pfn))) {
3436                 *ret_val = kvm_handle_bad_page(vcpu, gfn, pfn);
3437                 return true;
3438         }
3439 
3440         if (unlikely(is_noslot_pfn(pfn)))
3441                 vcpu_cache_mmio_info(vcpu, gva, gfn,
3442                                      access & shadow_mmio_access_mask);
3443 
3444         return false;
3445 }
3446 
3447 static bool page_fault_can_be_fast(u32 error_code)
3448 {
3449         /*
3450          * Do not fix an MMIO spte with an invalid generation number; it
3451          * needs to be updated by the slow page fault path.
3452          */
3453         if (unlikely(error_code & PFERR_RSVD_MASK))
3454                 return false;
3455 
3456         /* See if the page fault is due to an NX violation. */
3457         if (unlikely(((error_code & (PFERR_FETCH_MASK | PFERR_PRESENT_MASK))
3458                       == (PFERR_FETCH_MASK | PFERR_PRESENT_MASK))))
3459                 return false;
3460 
3461         /*
3462          * #PF can be fast in two cases:
3463          *
3464          * 1. The shadow page table entry is not present, which could
3465          *    mean that the fault is caused by access tracking (if
3466          *    enabled).
3467          * 2. The shadow page table entry is present and the fault is
3468          *    caused by write protection; then we only need to set the
3469          *    W bit of the spte, which can be done outside of mmu_lock.
3470          *
3471          * However, if access tracking is disabled, a non-present entry
3472          * must be a genuine page fault for which a new SPTE has to be
3473          * created, so then return true only for writes to present pages.
3474          */
3475         return shadow_acc_track_mask != 0 ||
3476                ((error_code & (PFERR_WRITE_MASK | PFERR_PRESENT_MASK))
3477                 == (PFERR_WRITE_MASK | PFERR_PRESENT_MASK));
3478 }
3479 
3480 /*
3481  * Returns true if the SPTE was fixed successfully.  Otherwise, someone
3482  * else modified the SPTE from its original value and false is returned.
3483  */
3484 static bool
3485 fast_pf_fix_direct_spte(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp,
3486                         u64 *sptep, u64 old_spte, u64 new_spte)
3487 {
3488         gfn_t gfn;
3489 
3490         WARN_ON(!sp->role.direct);
3491 
3492         /*
3493          * Theoretically we could also set the dirty bit (and flush the
3494          * TLB) here to eliminate unnecessary PML logging; see the
3495          * comments in set_spte.  But fast_page_fault is very unlikely
3496          * to happen with PML enabled, so we do not.  The same GPA may
3497          * then be logged in the PML buffer again when the write really
3498          * happens, and mark_page_dirty may end up being called twice,
3499          * but that is harmless, and skipping the dirty bit also avoids
3500          * the TLB flush that setting it would require, so non-PML cases
3501          * are not impacted.  Compare with set_spte, where
3502          * shadow_dirty_mask is set instead.
3503          */
3504         if (cmpxchg64(sptep, old_spte, new_spte) != old_spte)
3505                 return false;
3506 
3507         if (is_writable_pte(new_spte) && !is_writable_pte(old_spte)) {
3508                 /*
3509                  * The gfn of a direct spte is stable since it is
3510                  * computed from sp->gfn plus the index of the spte.
3511                  */
3512                 gfn = kvm_mmu_page_get_gfn(sp, sptep - sp->spt);
3513                 kvm_vcpu_mark_page_dirty(vcpu, gfn);
3514         }
3515 
3516         return true;
3517 }
3518 
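/*
 * Check whether the access described by the fault error code (fetch,
 * write or read) is already permitted by the given SPTE.
 */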
3519 static bool is_access_allowed(u32 fault_err_code, u64 spte)
3520 {
3521         if (fault_err_code & PFERR_FETCH_MASK)
3522                 return is_executable_pte(spte);
3523 
3524         if (fault_err_code & PFERR_WRITE_MASK)
3525                 return is_writable_pte(spte);
3526 
3527         /* The fault was a read access. */
3528         return spte & PT_PRESENT_MASK;
3529 }
3530 
3531 /*
3532  * Return value:
3533  * - true: let the vcpu attempt the access to the same address again.
3534  * - false: let the real page fault path fix it.
3535  */
3536 static bool fast_page_fault(struct kvm_vcpu *vcpu, gpa_t cr2_or_gpa, int level,
3537                             u32 error_code)
3538 {
3539         struct kvm_shadow_walk_iterator iterator;
3540         struct kvm_mmu_page *sp;
3541         bool fault_handled = false;
3542         u64 spte = 0ull;
3543         uint retry_count = 0;
3544 
3545         if (!VALID_PAGE(vcpu->arch.mmu->root_hpa))
3546                 return false;
3547 
3548         if (!page_fault_can_be_fast(error_code))
3549                 return false;
3550 
3551         walk_shadow_page_lockless_begin(vcpu);
3552 
3553         do {
3554                 u64 new_spte;
3555 
3556                 for_each_shadow_entry_lockless(vcpu, cr2_or_gpa, iterator, spte)
3557                         if (!is_shadow_present_pte(spte) ||
3558                             iterator.level < level)
3559                                 break;
3560 
3561                 sp = page_header(__pa(iterator.sptep));
3562                 if (!is_last_spte(spte, sp->role.level))
3563                         break;
3564 
3565                 /*
3566                  * Check whether the memory access that caused the fault
3567                  * would still fault if it were performed right now.  If
3568                  * not, this is a spurious fault caused by a lazily
3569                  * flushed TLB, or another CPU has already fixed the PTE
3570                  * after the current CPU took the fault.
3571                  *
3572                  * There is no need to check the access bits of upper
3573                  * level table entries since they are always ACC_ALL.
3574                  */
3575                 if (is_access_allowed(error_code, spte)) {
3576                         fault_handled = true;
3577                         break;
3578                 }
3579 
3580                 new_spte = spte;
3581 
3582                 if (is_access_track_spte(spte))
3583                         new_spte = restore_acc_track_spte(new_spte);
3584 
3585                 /*
3586                  * To keep things simple, write protection is only
3587                  * removed in the fast path if the SPTE was
3588                  * write-protected for dirty logging or access tracking.
3589                  */
3590                 if ((error_code & PFERR_WRITE_MASK) &&
3591                     spte_can_locklessly_be_made_writable(spte))
3592                 {
3593                         new_spte |= PT_WRITABLE_MASK;
3594 
3595                         /*
3596                          * Do not fix write permission on a large spte:
3597                          * fast_pf_fix_direct_spte() only marks the first
3598                          * small page of the range dirty, so other pages
3599                          * would be missed when dirty logging is enabled
3600                          * for the slot.
3601                          *
3602                          * Instead, let the slow page fault path create a
3603                          * normal (small) spte to fix the access; see the
3604                          * comments in kvm_arch_commit_memory_region().
3605                          */
3606                         if (sp->role.level > PT_PAGE_TABLE_LEVEL)
3607                                 break;
3608                 }
3609 
3610                 /* Verify that the fault can indeed be handled in the fast path. */
3611                 if (new_spte == spte ||
3612                     !is_access_allowed(error_code, new_spte))
3613                         break;
3614 
3615                 /*
3616                  * Currently the fast page fault path only works for
3617                  * direct mappings, since the gfn is not stable for
3618                  * indirect shadow pages.
3619                  */
3620                 fault_handled = fast_pf_fix_direct_spte(vcpu, sp,
3621                                                         iterator.sptep, spte,
3622                                                         new_spte);
3623                 if (fault_handled)
3624                         break;
3625 
3626                 if (++retry_count > 4) {
3627                         printk_once(KERN_WARNING
3628                                 "kvm: Fast #PF retrying more than 4 times.\n");
3629                         break;
3630                 }
3631 
3632         } while (true);
3633 
3634         trace_fast_page_fault(vcpu, cr2_or_gpa, error_code, iterator.sptep,
3635                               spte, fault_handled);
3636         walk_shadow_page_lockless_end(vcpu);
3637 
3638         return fault_handled;
3639 }
3640 
3641 static bool try_async_pf(struct kvm_vcpu *vcpu, bool prefault, gfn_t gfn,
3642                          gpa_t cr2_or_gpa, kvm_pfn_t *pfn, bool write,
3643                          bool *writable);
3644 static int make_mmu_pages_available(struct kvm_vcpu *vcpu);
3645 
3646 static int nonpaging_map(struct kvm_vcpu *vcpu, gpa_t gpa, u32 error_code,
3647                          gfn_t gfn, bool prefault)
3648 {
3649         int r;
3650         int level;
3651         bool force_pt_level;
3652         kvm_pfn_t pfn;
3653         unsigned long mmu_seq;
3654         bool map_writable, write = error_code & PFERR_WRITE_MASK;
3655         bool lpage_disallowed = (error_code & PFERR_FETCH_MASK) &&
3656                                 is_nx_huge_page_enabled();
3657 
3658         force_pt_level = lpage_disallowed;
3659         level = mapping_level(vcpu, gfn, &force_pt_level);
3660         if (likely(!force_pt_level)) {
3661                 /*
3662                  * This path builds a PAE pagetable, so we can map
3663                  * 2MB pages at maximum.  Therefore check whether the
3664                  * level is larger than that.
3665                  */
3666                 if (level > PT_DIRECTORY_LEVEL)
3667                         level = PT_DIRECTORY_LEVEL;
3668 
3669                 gfn &= ~(KVM_PAGES_PER_HPAGE(level) - 1);
3670         }
3671 
3672         if (fast_page_fault(vcpu, gpa, level, error_code))
3673                 return RET_PF_RETRY;
3674 
3675         mmu_seq = vcpu->kvm->mmu_notifier_seq;
3676         smp_rmb();
3677 
3678         if (try_async_pf(vcpu, prefault, gfn, gpa, &pfn, write, &map_writable))
3679                 return RET_PF_RETRY;
3680 
3681         if (handle_abnormal_pfn(vcpu, gpa, gfn, pfn, ACC_ALL, &r))
3682                 return r;
3683 
3684         r = RET_PF_RETRY;
3685         spin_lock(&vcpu->kvm->mmu_lock);
3686         if (mmu_notifier_retry(vcpu->kvm, mmu_seq))
3687                 goto out_unlock;
3688         if (make_mmu_pages_available(vcpu) < 0)
3689                 goto out_unlock;
3690         if (likely(!force_pt_level))
3691                 transparent_hugepage_adjust(vcpu, gfn, &pfn, &level);
3692         r = __direct_map(vcpu, gpa, write, map_writable, level, pfn,
3693                          prefault, false);
3694 out_unlock:
3695         spin_unlock(&vcpu->kvm->mmu_lock);
3696         kvm_release_pfn_clean(pfn);
3697         return r;
3698 }
3699 
3700 static void mmu_free_root_page(struct kvm *kvm, hpa_t *root_hpa,
3701                                struct list_head *invalid_list)
3702 {
3703         struct kvm_mmu_page *sp;
3704 
3705         if (!VALID_PAGE(*root_hpa))
3706                 return;
3707 
3708         sp = page_header(*root_hpa & PT64_BASE_ADDR_MASK);
3709         --sp->root_count;
3710         if (!sp->root_count && sp->role.invalid)
3711                 kvm_mmu_prepare_zap_page(kvm, sp, invalid_list);
3712 
3713         *root_hpa = INVALID_PAGE;
3714 }
3715 
3716 /* roots_to_free must be some combination of the KVM_MMU_ROOT_* flags */
3717 void kvm_mmu_free_roots(struct kvm_vcpu *vcpu, struct kvm_mmu *mmu,
3718                         ulong roots_to_free)
3719 {
3720         int i;
3721         LIST_HEAD(invalid_list);
3722         bool free_active_root = roots_to_free & KVM_MMU_ROOT_CURRENT;
3723 
3724         BUILD_BUG_ON(KVM_MMU_NUM_PREV_ROOTS >= BITS_PER_LONG);
3725 
3726         /* Before acquiring the MMU lock, see if we need to do any work. */
3727         if (!(free_active_root && VALID_PAGE(mmu->root_hpa))) {
3728                 for (i = 0; i < KVM_MMU_NUM_PREV_ROOTS; i++)
3729                         if ((roots_to_free & KVM_MMU_ROOT_PREVIOUS(i)) &&
3730                             VALID_PAGE(mmu->prev_roots[i].hpa))
3731                                 break;
3732 
3733                 if (i == KVM_MMU_NUM_PREV_ROOTS)
3734                         return;
3735         }
3736 
3737         spin_lock(&vcpu->kvm->mmu_lock);
3738 
3739         for (i = 0; i < KVM_MMU_NUM_PREV_ROOTS; i++)
3740                 if (roots_to_free & KVM_MMU_ROOT_PREVIOUS(i))
3741                         mmu_free_root_page(vcpu->kvm, &mmu->prev_roots[i].hpa,
3742                                            &invalid_list);
3743 
3744         if (free_active_root) {
3745                 if (mmu->shadow_root_level >= PT64_ROOT_4LEVEL &&
3746                     (mmu->root_level >= PT64_ROOT_4LEVEL || mmu->direct_map)) {
3747                         mmu_free_root_page(vcpu->kvm, &mmu->root_hpa,
3748                                            &invalid_list);
3749                 } else {
3750                         for (i = 0; i < 4; ++i)
3751                                 if (mmu->pae_root[i] != 0)
3752                                         mmu_free_root_page(vcpu->kvm,
3753                                                            &mmu->pae_root[i],
3754                                                            &invalid_list);
3755                         mmu->root_hpa = INVALID_PAGE;
3756                 }
3757                 mmu->root_cr3 = 0;
3758         }
3759 
3760         kvm_mmu_commit_zap_page(vcpu->kvm, &invalid_list);
3761         spin_unlock(&vcpu->kvm->mmu_lock);
3762 }
3763 EXPORT_SYMBOL_GPL(kvm_mmu_free_roots);
3764 
3765 static int mmu_check_root(struct kvm_vcpu *vcpu, gfn_t root_gfn)
3766 {
3767         int ret = 0;
3768 
3769         if (!kvm_is_visible_gfn(vcpu->kvm, root_gfn)) {
3770                 kvm_make_request(KVM_REQ_TRIPLE_FAULT, vcpu);
3771                 ret = 1;
3772         }
3773 
3774         return ret;
3775 }
3776 
3777 static int mmu_alloc_direct_roots(struct kvm_vcpu *vcpu)
3778 {
3779         struct kvm_mmu_page *sp;
3780         unsigned i;
3781 
3782         if (vcpu->arch.mmu->shadow_root_level >= PT64_ROOT_4LEVEL) {
3783                 spin_lock(&vcpu->kvm->mmu_lock);
3784                 if (make_mmu_pages_available(vcpu) < 0) {
3785                         spin_unlock(&vcpu->kvm->mmu_lock);
3786                         return -ENOSPC;
3787                 }
3788                 sp = kvm_mmu_get_page(vcpu, 0, 0,
3789                                 vcpu->arch.mmu->shadow_root_level, 1, ACC_ALL);
3790                 ++sp->root_count;
3791                 spin_unlock(&vcpu->kvm->mmu_lock);
3792                 vcpu->arch.mmu->root_hpa = __pa(sp->spt);
3793         } else if (vcpu->arch.mmu->shadow_root_level == PT32E_ROOT_LEVEL) {
3794                 for (i = 0; i < 4; ++i) {
3795                         hpa_t root = vcpu->arch.mmu->pae_root[i];
3796 
3797                         MMU_WARN_ON(VALID_PAGE(root));
3798                         spin_lock(&vcpu->kvm->mmu_lock);
3799                         if (make_mmu_pages_available(vcpu) < 0) {
3800                                 spin_unlock(&vcpu->kvm->mmu_lock);
3801                                 return -ENOSPC;
3802                         }
3803                         sp = kvm_mmu_get_page(vcpu, i << (30 - PAGE_SHIFT),
3804                                         i << 30, PT32_ROOT_LEVEL, 1, ACC_ALL);
3805                         root = __pa(sp->spt);
3806                         ++sp->root_count;
3807                         spin_unlock(&vcpu->kvm->mmu_lock);
3808                         vcpu->arch.mmu->pae_root[i] = root | PT_PRESENT_MASK;
3809                 }
3810                 vcpu->arch.mmu->root_hpa = __pa(vcpu->arch.mmu->pae_root);
3811         } else
3812                 BUG();
3813         vcpu->arch.mmu->root_cr3 = vcpu->arch.mmu->get_cr3(vcpu);
3814 
3815         return 0;
3816 }
3817 
3818 static int mmu_alloc_shadow_roots(struct kvm_vcpu *vcpu)
3819 {
3820         struct kvm_mmu_page *sp;
3821         u64 pdptr, pm_mask;
3822         gfn_t root_gfn, root_cr3;
3823         int i;
3824 
3825         root_cr3 = vcpu->arch.mmu->get_cr3(vcpu);
3826         root_gfn = root_cr3 >> PAGE_SHIFT;
3827 
3828         if (mmu_check_root(vcpu, root_gfn))
3829                 return 1;
3830 
3831         /*
3832          * If the guest uses a long mode (4- or 5-level) page table,
3833          * shadow it with a single root page.
3834          */
3835         if (vcpu->arch.mmu->root_level >= PT64_ROOT_4LEVEL) {
3836                 hpa_t root = vcpu->arch.mmu->root_hpa;
3837 
3838                 MMU_WARN_ON(VALID_PAGE(root));
3839 
3840                 spin_lock(&vcpu->kvm->mmu_lock);
3841                 if (make_mmu_pages_available(vcpu) < 0) {
3842                         spin_unlock(&vcpu->kvm->mmu_lock);
3843                         return -ENOSPC;
3844                 }
3845                 sp = kvm_mmu_get_page(vcpu, root_gfn, 0,
3846                                 vcpu->arch.mmu->shadow_root_level, 0, ACC_ALL);
3847                 root = __pa(sp->spt);
3848                 ++sp->root_count;
3849                 spin_unlock(&vcpu->kvm->mmu_lock);
3850                 vcpu->arch.mmu->root_hpa = root;
3851                 goto set_root_cr3;
3852         }
3853 
3854         /*
3855          * We shadow a 32 bit page table.  This may be a legacy 2-level
3856          * or a PAE 3-level page table; in either case we need to be
3857          * able to walk the PAE page table, so build four PAE roots.
3858          */
3859         pm_mask = PT_PRESENT_MASK;
3860         if (vcpu->arch.mmu->shadow_root_level == PT64_ROOT_4LEVEL)
3861                 pm_mask |= PT_ACCESSED_MASK | PT_WRITABLE_MASK | PT_USER_MASK;
3862 
3863         for (i = 0; i < 4; ++i) {
3864                 hpa_t root = vcpu->arch.mmu->pae_root[i];
3865 
3866                 MMU_WARN_ON(VALID_PAGE(root));
3867                 if (vcpu->arch.mmu->root_level == PT32E_ROOT_LEVEL) {
3868                         pdptr = vcpu->arch.mmu->get_pdptr(vcpu, i);
3869                         if (!(pdptr & PT_PRESENT_MASK)) {
3870                                 vcpu->arch.mmu->pae_root[i] = 0;
3871                                 continue;
3872                         }
3873                         root_gfn = pdptr >> PAGE_SHIFT;
3874                         if (mmu_check_root(vcpu, root_gfn))
3875                                 return 1;
3876                 }
3877                 spin_lock(&vcpu->kvm->mmu_lock);
3878                 if (make_mmu_pages_available(vcpu) < 0) {
3879                         spin_unlock(&vcpu->kvm->mmu_lock);
3880                         return -ENOSPC;
3881                 }
3882                 sp = kvm_mmu_get_page(vcpu, root_gfn, i << 30, PT32_ROOT_LEVEL,
3883                                       0, ACC_ALL);
3884                 root = __pa(sp->spt);
3885                 ++sp->root_count;
3886                 spin_unlock(&vcpu->kvm->mmu_lock);
3887 
3888                 vcpu->arch.mmu->pae_root[i] = root | pm_mask;
3889         }
3890         vcpu->arch.mmu->root_hpa = __pa(vcpu->arch.mmu->pae_root);
3891 
3892         /*
3893          * If we shadow a 32 bit page table with a long mode shadow
3894          * page table, the PAE roots need an extra top level.
3895          */
3896         if (vcpu->arch.mmu->shadow_root_level == PT64_ROOT_4LEVEL) {
3897                 if (vcpu->arch.mmu->lm_root == NULL) {
3898                         /*
3899                          * The extra top-level page that wraps the four
3900                          * PAE roots into a 4-level table is allocated
3901                          * lazily, the first time it is needed.
3902                          */
3903                         u64 *lm_root;
3904 
3905                         lm_root = (void*)get_zeroed_page(GFP_KERNEL_ACCOUNT);
3906                         if (lm_root == NULL)
3907                                 return 1;
3908 
3909                         lm_root[0] = __pa(vcpu->arch.mmu->pae_root) | pm_mask;
3910 
3911                         vcpu->arch.mmu->lm_root = lm_root;
3912                 }
3913 
3914                 vcpu->arch.mmu->root_hpa = __pa(vcpu->arch.mmu->lm_root);
3915         }
3916 
3917 set_root_cr3:
3918         vcpu->arch.mmu->root_cr3 = root_cr3;
3919 
3920         return 0;
3921 }
3922 
3923 static int mmu_alloc_roots(struct kvm_vcpu *vcpu)
3924 {
3925         if (vcpu->arch.mmu->direct_map)
3926                 return mmu_alloc_direct_roots(vcpu);
3927         else
3928                 return mmu_alloc_shadow_roots(vcpu);
3929 }
3930 
3931 void kvm_mmu_sync_roots(struct kvm_vcpu *vcpu)
3932 {
3933         int i;
3934         struct kvm_mmu_page *sp;
3935 
3936         if (vcpu->arch.mmu->direct_map)
3937                 return;
3938 
3939         if (!VALID_PAGE(vcpu->arch.mmu->root_hpa))
3940                 return;
3941 
3942         vcpu_clear_mmio_info(vcpu, MMIO_GVA_ANY);
3943 
3944         if (vcpu->arch.mmu->root_level >= PT64_ROOT_4LEVEL) {
3945                 hpa_t root = vcpu->arch.mmu->root_hpa;
3946                 sp = page_header(root);
3947 
3948                 /*
3949                  * Even if another CPU is marking the SP as unsync
3950                  * concurrently, any guest page table changes are not
3951                  * guaranteed to be visible anyway until this vCPU
3952                  * issues a TLB flush strictly after those changes are
3953                  * made.  We only need to ensure that the other CPU
3954                  * sets the unsync flags before it makes any actual
3955                  * change to the page tables, which is what the
3956                  * smp_load_acquire() here pairs with.
3957                  */
3958                 if (!smp_load_acquire(&sp->unsync) &&
3959                     !smp_load_acquire(&sp->unsync_children))
3960                         return;
3961 
3962                 spin_lock(&vcpu->kvm->mmu_lock);
3963                 kvm_mmu_audit(vcpu, AUDIT_PRE_SYNC);
3964 
3965                 mmu_sync_children(vcpu, sp);
3966 
3967                 kvm_mmu_audit(vcpu, AUDIT_POST_SYNC);
3968                 spin_unlock(&vcpu->kvm->mmu_lock);
3969                 return;
3970         }
3971 
3972         spin_lock(&vcpu->kvm->mmu_lock);
3973         kvm_mmu_audit(vcpu, AUDIT_PRE_SYNC);
3974 
3975         for (i = 0; i < 4; ++i) {
3976                 hpa_t root = vcpu->arch.mmu->pae_root[i];
3977 
3978                 if (root && VALID_PAGE(root)) {
3979                         root &= PT64_BASE_ADDR_MASK;
3980                         sp = page_header(root);
3981                         mmu_sync_children(vcpu, sp);
3982                 }
3983         }
3984 
3985         kvm_mmu_audit(vcpu, AUDIT_POST_SYNC);
3986         spin_unlock(&vcpu->kvm->mmu_lock);
3987 }
3988 EXPORT_SYMBOL_GPL(kvm_mmu_sync_roots);
3989 
3990 static gpa_t nonpaging_gva_to_gpa(struct kvm_vcpu *vcpu, gpa_t vaddr,
3991                                   u32 access, struct x86_exception *exception)
3992 {
3993         if (exception)
3994                 exception->error_code = 0;
3995         return vaddr;
3996 }
3997 
3998 static gpa_t nonpaging_gva_to_gpa_nested(struct kvm_vcpu *vcpu, gpa_t vaddr,
3999                                          u32 access,
4000                                          struct x86_exception *exception)
4001 {
4002         if (exception)
4003                 exception->error_code = 0;
4004         return vcpu->arch.nested_mmu.translate_gpa(vcpu, vaddr, access, exception);
4005 }
4006 
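/*
 * Bit 7 of the PTE selects between the regular and the large-page
 * reserved-bit masks, and the low 6 bits (the EPT memtype/XWR fields)
 * index into the bad_mt_xwr bitmap built for EPT.
 */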
4007 static bool
4008 __is_rsvd_bits_set(struct rsvd_bits_validate *rsvd_check, u64 pte, int level)
4009 {
4010         int bit7 = (pte >> 7) & 1, low6 = pte & 0x3f;
4011 
4012         return (pte & rsvd_check->rsvd_bits_mask[bit7][level-1]) |
4013                 ((rsvd_check->bad_mt_xwr & (1ull << low6)) != 0);
4014 }
4015 
4016 static bool is_rsvd_bits_set(struct kvm_mmu *mmu, u64 gpte, int level)
4017 {
4018         return __is_rsvd_bits_set(&mmu->guest_rsvd_check, gpte, level);
4019 }
4020 
4021 static bool is_shadow_zero_bits_set(struct kvm_mmu *mmu, u64 spte, int level)
4022 {
4023         return __is_rsvd_bits_set(&mmu->shadow_zero_check, spte, level);
4024 }
4025 
4026 static bool mmio_info_in_cache(struct kvm_vcpu *vcpu, u64 addr, bool direct)
4027 {
4028         /*
4029          * Do not use the MMIO cache for a nested guest; fall back to
4030          * walking the shadow page tables instead.
4031          */
4032         if (mmu_is_nested(vcpu))
4033                 return false;
4034 
4035         if (direct)
4036                 return vcpu_match_mmio_gpa(vcpu, addr);
4037 
4038         return vcpu_match_mmio_gva(vcpu, addr);
4039 }
4040 
4041 /* Return true if a reserved bit is detected on the spte walk. */
4042 static bool
4043 walk_shadow_page_get_mmio_spte(struct kvm_vcpu *vcpu, u64 addr, u64 *sptep)
4044 {
4045         struct kvm_shadow_walk_iterator iterator;
4046         u64 sptes[PT64_ROOT_MAX_LEVEL], spte = 0ull;
4047         int root, leaf;
4048         bool reserved = false;
4049 
4050         if (!VALID_PAGE(vcpu->arch.mmu->root_hpa))
4051                 goto exit;
4052 
4053         walk_shadow_page_lockless_begin(vcpu);
4054 
4055         for (shadow_walk_init(&iterator, vcpu, addr),
4056                  leaf = root = iterator.level;
4057              shadow_walk_okay(&iterator);
4058              __shadow_walk_next(&iterator, spte)) {
4059                 spte = mmu_spte_get_lockless(iterator.sptep);
4060 
4061                 sptes[leaf - 1] = spte;
4062                 leaf--;
4063 
4064                 if (!is_shadow_present_pte(spte))
4065                         break;
4066 
4067                 reserved |= is_shadow_zero_bits_set(vcpu->arch.mmu, spte,
4068                                                     iterator.level);
4069         }
4070 
4071         walk_shadow_page_lockless_end(vcpu);
4072 
4073         if (reserved) {
4074                 pr_err("%s: detect reserved bits on spte, addr 0x%llx, dump hierarchy:\n",
4075                        __func__, addr);
4076                 while (root > leaf) {
4077                         pr_err("------ spte 0x%llx level %d.\n",
4078                                sptes[root - 1], root);
4079                         root--;
4080                 }
4081         }
4082 exit:
4083         *sptep = spte;
4084         return reserved;
4085 }
4086 
4087 static int handle_mmio_page_fault(struct kvm_vcpu *vcpu, u64 addr, bool direct)
4088 {
4089         u64 spte;
4090         bool reserved;
4091 
4092         if (mmio_info_in_cache(vcpu, addr, direct))
4093                 return RET_PF_EMULATE;
4094 
4095         reserved = walk_shadow_page_get_mmio_spte(vcpu, addr, &spte);
4096         if (WARN_ON(reserved))
4097                 return -EINVAL;
4098 
4099         if (is_mmio_spte(spte)) {
4100                 gfn_t gfn = get_mmio_spte_gfn(spte);
4101                 unsigned access = get_mmio_spte_access(spte);
4102 
4103                 if (!check_mmio_spte(vcpu, spte))
4104                         return RET_PF_INVALID;
4105 
4106                 if (direct)
4107                         addr = 0;
4108 
4109                 trace_handle_mmio_page_fault(addr, gfn, access);
4110                 vcpu_cache_mmio_info(vcpu, addr, gfn, access);
4111                 return RET_PF_EMULATE;
4112         }
4113 
4114         /*
4115          * If the page table has been zapped by another CPU, let the
4116          * CPU fault again on the address and retry.
4117          */
4118         return RET_PF_RETRY;
4119 }
4120 
4121 static bool page_fault_handle_page_track(struct kvm_vcpu *vcpu,
4122                                          u32 error_code, gfn_t gfn)
4123 {
4124         if (unlikely(error_code & PFERR_RSVD_MASK))
4125                 return false;
4126 
4127         if (!(error_code & PFERR_PRESENT_MASK) ||
4128               !(error_code & PFERR_WRITE_MASK))
4129                 return false;
4130 
4131         /*
4132          * The gfn is write-tracked, so the write cannot be fixed by the
4133          * page fault handler; it has to be emulated instead.
4134          */
4135         if (kvm_page_track_is_active(vcpu, gfn, KVM_PAGE_TRACK_WRITE))
4136                 return true;
4137 
4138         return false;
4139 }
4140 
4141 static void shadow_page_table_clear_flood(struct kvm_vcpu *vcpu, gva_t addr)
4142 {
4143         struct kvm_shadow_walk_iterator iterator;
4144         u64 spte;
4145 
4146         if (!VALID_PAGE(vcpu->arch.mmu->root_hpa))
4147                 return;
4148 
4149         walk_shadow_page_lockless_begin(vcpu);
4150         for_each_shadow_entry_lockless(vcpu, addr, iterator, spte) {
4151                 clear_sp_write_flooding_count(iterator.sptep);
4152                 if (!is_shadow_present_pte(spte))
4153                         break;
4154         }
4155         walk_shadow_page_lockless_end(vcpu);
4156 }
4157 
4158 static int nonpaging_page_fault(struct kvm_vcpu *vcpu, gpa_t gpa,
4159                                 u32 error_code, bool prefault)
4160 {
4161         gfn_t gfn = gpa >> PAGE_SHIFT;
4162         int r;
4163 
4164         /* With paging disabled, the gva passed in is already a gpa. */
4165         pgprintk("%s: gva %lx error %x\n", __func__, gpa, error_code);
4166 
4167         if (page_fault_handle_page_track(vcpu, error_code, gfn))
4168                 return RET_PF_EMULATE;
4169 
4170         r = mmu_topup_memory_caches(vcpu);
4171         if (r)
4172                 return r;
4173 
4174         MMU_WARN_ON(!VALID_PAGE(vcpu->arch.mmu->root_hpa));
4175 
4176 
4177         return nonpaging_map(vcpu, gpa & PAGE_MASK,
4178                              error_code, gfn, prefault);
4179 }
4180 
4181 static int kvm_arch_setup_async_pf(struct kvm_vcpu *vcpu, gpa_t cr2_or_gpa,
4182                                    gfn_t gfn)
4183 {
4184         struct kvm_arch_async_pf arch;
4185 
4186         arch.token = (vcpu->arch.apf.id++ << 12) | vcpu->vcpu_id;
4187         arch.gfn = gfn;
4188         arch.direct_map = vcpu->arch.mmu->direct_map;
4189         arch.cr3 = vcpu->arch.mmu->get_cr3(vcpu);
4190 
4191         return kvm_setup_async_pf(vcpu, cr2_or_gpa,
4192                                   kvm_vcpu_gfn_to_hva(vcpu, gfn), &arch);
4193 }
4194 
4195 static bool try_async_pf(struct kvm_vcpu *vcpu, bool prefault, gfn_t gfn,
4196                          gpa_t cr2_or_gpa, kvm_pfn_t *pfn, bool write,
4197                          bool *writable)
4198 {
4199         struct kvm_memory_slot *slot;
4200         bool async;
4201 
4202         /*
4203          * Don't expose KVM-internal (private) memslots to an L2 guest.
4204          */
4205         if (is_guest_mode(vcpu) && !kvm_is_visible_gfn(vcpu->kvm, gfn)) {
4206                 *pfn = KVM_PFN_NOSLOT;
4207                 return false;
4208         }
4209 
4210         slot = kvm_vcpu_gfn_to_memslot(vcpu, gfn);
4211         async = false;
4212         *pfn = __gfn_to_pfn_memslot(slot, gfn, false, &async, write, writable);
4213         if (!async)
4214                 return false; 
4215 
4216         if (!prefault && kvm_can_do_async_pf(vcpu)) {
4217                 trace_kvm_try_async_get_page(cr2_or_gpa, gfn);
4218                 if (kvm_find_async_pf_gfn(vcpu, gfn)) {
4219                         trace_kvm_async_pf_doublefault(cr2_or_gpa, gfn);
4220                         kvm_make_request(KVM_REQ_APF_HALT, vcpu);
4221                         return true;
4222                 } else if (kvm_arch_setup_async_pf(vcpu, cr2_or_gpa, gfn))
4223                         return true;
4224         }
4225 
4226         *pfn = __gfn_to_pfn_memslot(slot, gfn, false, NULL, write, writable);
4227         return false;
4228 }
4229 
4230 int kvm_handle_page_fault(struct kvm_vcpu *vcpu, u64 error_code,
4231                                 u64 fault_address, char *insn, int insn_len)
4232 {
4233         int r = 1;
4234 
4235 #ifndef CONFIG_X86_64
4236         /* A 64-bit CR2 should be impossible on 32-bit KVM. */
4237         if (WARN_ON_ONCE(fault_address >> 32))
4238                 return -EFAULT;
4239 #endif
4240 
4241         vcpu->arch.l1tf_flush_l1d = true;
4242         switch (vcpu->arch.apf.host_apf_reason) {
4243         default:
4244                 trace_kvm_page_fault(fault_address, error_code);
4245 
4246                 if (kvm_event_needs_reinjection(vcpu))
4247                         kvm_mmu_unprotect_page_virt(vcpu, fault_address);
4248                 r = kvm_mmu_page_fault(vcpu, fault_address, error_code, insn,
4249                                 insn_len);
4250                 break;
4251         case KVM_PV_REASON_PAGE_NOT_PRESENT:
4252                 vcpu->arch.apf.host_apf_reason = 0;
4253                 local_irq_disable();
4254                 kvm_async_pf_task_wait(fault_address, 0);
4255                 local_irq_enable();
4256                 break;
4257         case KVM_PV_REASON_PAGE_READY:
4258                 vcpu->arch.apf.host_apf_reason = 0;
4259                 local_irq_disable();
4260                 kvm_async_pf_task_wake(fault_address);
4261                 local_irq_enable();
4262                 break;
4263         }
4264         return r;
4265 }
4266 EXPORT_SYMBOL_GPL(kvm_handle_page_fault);
4267 
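/*
 * Return true if the guest MTRR memory types are consistent across the
 * whole huge page that would back @gfn at @level.
 */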
4268 static bool
4269 check_hugepage_cache_consistency(struct kvm_vcpu *vcpu, gfn_t gfn, int level)
4270 {
4271         int page_num = KVM_PAGES_PER_HPAGE(level);
4272 
4273         gfn &= ~(page_num - 1);
4274 
4275         return kvm_mtrr_check_gfn_range_consistency(vcpu, gfn, page_num);
4276 }
4277 
4278 static int tdp_page_fault(struct kvm_vcpu *vcpu, gpa_t gpa, u32 error_code,
4279                           bool prefault)
4280 {
4281         kvm_pfn_t pfn;
4282         int r;
4283         int level;
4284         bool force_pt_level;
4285         gfn_t gfn = gpa >> PAGE_SHIFT;
4286         unsigned long mmu_seq;
4287         int write = error_code & PFERR_WRITE_MASK;
4288         bool map_writable;
4289         bool lpage_disallowed = (error_code & PFERR_FETCH_MASK) &&
4290                                 is_nx_huge_page_enabled();
4291 
4292         MMU_WARN_ON(!VALID_PAGE(vcpu->arch.mmu->root_hpa));
4293 
4294         if (page_fault_handle_page_track(vcpu, error_code, gfn))
4295                 return RET_PF_EMULATE;
4296 
4297         r = mmu_topup_memory_caches(vcpu);
4298         if (r)
4299                 return r;
4300 
4301         force_pt_level =
4302                 lpage_disallowed ||
4303                 !check_hugepage_cache_consistency(vcpu, gfn, PT_DIRECTORY_LEVEL);
4304         level = mapping_level(vcpu, gfn, &force_pt_level);
4305         if (likely(!force_pt_level)) {
4306                 if (level > PT_DIRECTORY_LEVEL &&
4307                     !check_hugepage_cache_consistency(vcpu, gfn, level))
4308                         level = PT_DIRECTORY_LEVEL;
4309                 gfn &= ~(KVM_PAGES_PER_HPAGE(level) - 1);
4310         }
4311 
4312         if (fast_page_fault(vcpu, gpa, level, error_code))
4313                 return RET_PF_RETRY;
4314 
4315         mmu_seq = vcpu->kvm->mmu_notifier_seq;
4316         smp_rmb();
4317 
4318         if (try_async_pf(vcpu, prefault, gfn, gpa, &pfn, write, &map_writable))
4319                 return RET_PF_RETRY;
4320 
4321         if (handle_abnormal_pfn(vcpu, 0, gfn, pfn, ACC_ALL, &r))
4322                 return r;
4323 
4324         r = RET_PF_RETRY;
4325         spin_lock(&vcpu->kvm->mmu_lock);
4326         if (mmu_notifier_retry(vcpu->kvm, mmu_seq))
4327                 goto out_unlock;
4328         if (make_mmu_pages_available(vcpu) < 0)
4329                 goto out_unlock;
4330         if (likely(!force_pt_level))
4331                 transparent_hugepage_adjust(vcpu, gfn, &pfn, &level);
4332         r = __direct_map(vcpu, gpa, write, map_writable, level, pfn,
4333                          prefault, lpage_disallowed);
4334 out_unlock:
4335         spin_unlock(&vcpu->kvm->mmu_lock);
4336         kvm_release_pfn_clean(pfn);
4337         return r;
4338 }
4339 
4340 static void nonpaging_init_context(struct kvm_vcpu *vcpu,
4341                                    struct kvm_mmu *context)
4342 {
4343         context->page_fault = nonpaging_page_fault;
4344         context->gva_to_gpa = nonpaging_gva_to_gpa;
4345         context->sync_page = nonpaging_sync_page;
4346         context->invlpg = nonpaging_invlpg;
4347         context->update_pte = nonpaging_update_pte;
4348         context->root_level = 0;
4349         context->shadow_root_level = PT32E_ROOT_LEVEL;
4350         context->direct_map = true;
4351         context->nx = false;
4352 }
4353 
4354 /*
4355  * Find out whether a previously cached root matching the new CR3 and
4356  * page role is available; the current root is inserted into the cache
4357  * of previous roots as a side effect of the search.  If a matching
4358  * root is found it ends up in mmu->root_hpa and true is returned;
4359  * otherwise the least recently used cached root ends up in
4360  * mmu->root_hpa and false is returned.
4361  */
4362 static bool cached_root_available(struct kvm_vcpu *vcpu, gpa_t new_cr3,
4363                                   union kvm_mmu_page_role new_role)
4364 {
4365         uint i;
4366         struct kvm_mmu_root_info root;
4367         struct kvm_mmu *mmu = vcpu->arch.mmu;
4368 
4369         root.cr3 = mmu->root_cr3;
4370         root.hpa = mmu->root_hpa;
4371 
4372         for (i = 0; i < KVM_MMU_NUM_PREV_ROOTS; i++) {
4373                 swap(root, mmu->prev_roots[i]);
4374 
4375                 if (new_cr3 == root.cr3 && VALID_PAGE(root.hpa) &&
4376                     page_header(root.hpa) != NULL &&
4377                     new_role.word == page_header(root.hpa)->role.word)
4378                         break;
4379         }
4380 
4381         mmu->root_hpa = root.hpa;
4382         mmu->root_cr3 = root.cr3;
4383 
4384         return i < KVM_MMU_NUM_PREV_ROOTS;
4385 }
4386 
4387 static bool fast_cr3_switch(struct kvm_vcpu *vcpu, gpa_t new_cr3,
4388                             union kvm_mmu_page_role new_role,
4389                             bool skip_tlb_flush)
4390 {
4391         struct kvm_mmu *mmu = vcpu->arch.mmu;
4392 
4393         /*
4394          * For now, limit the fast switch to 64-bit hosts and VMs in
4395          * order to avoid having to deal with PDPTEs; support for
4396          * 32-bit hosts/VMs could be added later.
4397          */
4398         if (mmu->shadow_root_level >= PT64_ROOT_4LEVEL &&
4399             mmu->root_level >= PT64_ROOT_4LEVEL) {
4400                 if (mmu_check_root(vcpu, new_cr3 >> PAGE_SHIFT))
4401                         return false;
4402 
4403                 if (cached_root_available(vcpu, new_cr3, new_role)) {
4404                         /*
4405                          * The cached root may be obsolete because of a
4406                          * change in the MMU generation number; however,
4407                          * changing the generation is accompanied by
4408                          * KVM_REQ_MMU_RELOAD, which will free the root
4409                          * installed here and allocate a new, valid one.
4410                          */
4411                         kvm_make_request(KVM_REQ_LOAD_CR3, vcpu);
4412                         if (!skip_tlb_flush) {
4413                                 kvm_make_request(KVM_REQ_MMU_SYNC, vcpu);
4414                                 kvm_x86_ops->tlb_flush(vcpu, true);
4415                         }
4416 
4417                         /*
4418                          * The last MMIO access's GVA and GPA are cached
4419                          * in the vCPU.  When switching to a new CR3 that
4420                          * GVA->GPA mapping may no longer be valid, so
4421                          * clear the cached MMIO info even when the TLB
4422                          * flush and the shadow sync are skipped.
4423                          */
4424                         vcpu_clear_mmio_info(vcpu, MMIO_GVA_ANY);
4425 
4426                         __clear_sp_write_flooding_count(
4427                                 page_header(mmu->root_hpa));
4428 
4429                         return true;
4430                 }
4431         }
4432 
4433         return false;
4434 }
4435 
4436 static void __kvm_mmu_new_cr3(struct kvm_vcpu *vcpu, gpa_t new_cr3,
4437                               union kvm_mmu_page_role new_role,
4438                               bool skip_tlb_flush)
4439 {
4440         if (!fast_cr3_switch(vcpu, new_cr3, new_role, skip_tlb_flush))
4441                 kvm_mmu_free_roots(vcpu, vcpu->arch.mmu,
4442                                    KVM_MMU_ROOT_CURRENT);
4443 }
4444 
4445 void kvm_mmu_new_cr3(struct kvm_vcpu *vcpu, gpa_t new_cr3, bool skip_tlb_flush)
4446 {
4447         __kvm_mmu_new_cr3(vcpu, new_cr3, kvm_mmu_calc_root_page_role(vcpu),
4448                           skip_tlb_flush);
4449 }
4450 EXPORT_SYMBOL_GPL(kvm_mmu_new_cr3);
4451 
4452 static unsigned long get_cr3(struct kvm_vcpu *vcpu)
4453 {
4454         return kvm_read_cr3(vcpu);
4455 }
4456 
4457 static void inject_page_fault(struct kvm_vcpu *vcpu,
4458                               struct x86_exception *fault)
4459 {
4460         vcpu->arch.mmu->inject_page_fault(vcpu, fault);
4461 }
4462 
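/*
 * If *sptep is a cached MMIO spte, either drop it (when the gfn no longer
 * matches) or refresh it for the current MMIO generation.  Returns true
 * if the spte was handled here and the caller can skip it.
 */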
4463 static bool sync_mmio_spte(struct kvm_vcpu *vcpu, u64 *sptep, gfn_t gfn,
4464                            unsigned access, int *nr_present)
4465 {
4466         if (unlikely(is_mmio_spte(*sptep))) {
4467                 if (gfn != get_mmio_spte_gfn(*sptep)) {
4468                         mmu_spte_clear_no_track(sptep);
4469                         return true;
4470                 }
4471 
4472                 (*nr_present)++;
4473                 mark_mmio_spte(vcpu, sptep, gfn, access);
4474                 return true;
4475         }
4476 
4477         return false;
4478 }
4479 
4480 static inline bool is_last_gpte(struct kvm_mmu *mmu,
4481                                 unsigned level, unsigned gpte)
4482 {
4483         /*
4484          * The RHS has bit 7 set iff level < mmu->last_nonleaf_level.
4485          * If that bit is clear, there are no large pages at this level,
4486          * so clear PT_PAGE_SIZE_MASK in gpte.
4487          */
4488         gpte &= level - mmu->last_nonleaf_level;
4489 
4490         /*
4491          * PT_PAGE_TABLE_LEVEL always terminates.  The RHS has bit 7 set
4492          * iff level <= PT_PAGE_TABLE_LEVEL, i.e. here iff level ==
4493          * PT_PAGE_TABLE_LEVEL, in which case set PT_PAGE_SIZE_MASK.
4494          */
4495         gpte |= level - PT_PAGE_TABLE_LEVEL - 1;
4496 
4497         return gpte & PT_PAGE_SIZE_MASK;
4498 }
4499 
4500 #define PTTYPE_EPT 18 /* arbitrary value, distinct from 32 and 64 */
4501 #define PTTYPE PTTYPE_EPT
4502 #include "paging_tmpl.h"
4503 #undef PTTYPE
4504 
4505 #define PTTYPE 64
4506 #include "paging_tmpl.h"
4507 #undef PTTYPE
4508 
4509 #define PTTYPE 32
4510 #include "paging_tmpl.h"
4511 #undef PTTYPE
4512 
4513 static void
4514 __reset_rsvds_bits_mask(struct kvm_vcpu *vcpu,
4515                         struct rsvd_bits_validate *rsvd_check,
4516                         int maxphyaddr, int level, bool nx, bool gbpages,
4517                         bool pse, bool amd)
4518 {
4519         u64 exb_bit_rsvd = 0;
4520         u64 gbpages_bit_rsvd = 0;
4521         u64 nonleaf_bit8_rsvd = 0;
4522 
4523         rsvd_check->bad_mt_xwr = 0;
4524 
4525         if (!nx)
4526                 exb_bit_rsvd = rsvd_bits(63, 63);
4527         if (!gbpages)
4528                 gbpages_bit_rsvd = rsvd_bits(7, 7);
4529 
4530         /*
4531          * On AMD, bit 8 is reserved in non-leaf PML4Es and PDPEs (in
4532          * leaf entries it would be the G bit), so mark it as reserved.
4533          */
4534         if (amd)
4535                 nonleaf_bit8_rsvd = rsvd_bits(8, 8);
4536 
4537         switch (level) {
4538         case PT32_ROOT_LEVEL:
4539                 /* No reserved bits for 4K entries in 32-bit 2-level paging. */
4540                 rsvd_check->rsvd_bits_mask[0][1] = 0;
4541                 rsvd_check->rsvd_bits_mask[0][0] = 0;
4542                 rsvd_check->rsvd_bits_mask[1][0] =
4543                         rsvd_check->rsvd_bits_mask[0][0];
4544 
4545                 if (!pse) {
4546                         rsvd_check->rsvd_bits_mask[1][1] = 0;
4547                         break;
4548                 }
4549 
4550                 if (is_cpuid_PSE36())
4551                         /* 36-bit PSE 4MB page */
4552                         rsvd_check->rsvd_bits_mask[1][1] = rsvd_bits(17, 21);
4553                 else
4554                         /* 32-bit PSE 4MB page */
4555                         rsvd_check->rsvd_bits_mask[1][1] = rsvd_bits(13, 21);
4556                 break;
4557         case PT32E_ROOT_LEVEL:
4558                 rsvd_check->rsvd_bits_mask[0][2] =
4559                         rsvd_bits(maxphyaddr, 63) |
4560                         rsvd_bits(5, 8) | rsvd_bits(1, 2);      
4561                 rsvd_check->rsvd_bits_mask[0][1] = exb_bit_rsvd |
4562                         rsvd_bits(maxphyaddr, 62);      
4563                 rsvd_check->rsvd_bits_mask[0][0] = exb_bit_rsvd |
4564                         rsvd_bits(maxphyaddr, 62);      
4565                 rsvd_check->rsvd_bits_mask[1][1] = exb_bit_rsvd |
4566                         rsvd_bits(maxphyaddr, 62) |
4567                         rsvd_bits(13, 20);              
4568                 rsvd_check->rsvd_bits_mask[1][0] =
4569                         rsvd_check->rsvd_bits_mask[0][0];
4570                 break;
4571         case PT64_ROOT_5LEVEL:
4572                 rsvd_check->rsvd_bits_mask[0][4] = exb_bit_rsvd |
4573                         nonleaf_bit8_rsvd | rsvd_bits(7, 7) |
4574                         rsvd_bits(maxphyaddr, 51);
4575                 rsvd_check->rsvd_bits_mask[1][4] =
4576                         rsvd_check->rsvd_bits_mask[0][4];
4577                 /* fall through */
4578         case PT64_ROOT_4LEVEL:
4579                 rsvd_check->rsvd_bits_mask[0][3] = exb_bit_rsvd |
4580                         nonleaf_bit8_rsvd | rsvd_bits(7, 7) |
4581                         rsvd_bits(maxphyaddr, 51);
4582                 rsvd_check->rsvd_bits_mask[0][2] = exb_bit_rsvd |
4583                         nonleaf_bit8_rsvd | gbpages_bit_rsvd |
4584                         rsvd_bits(maxphyaddr, 51);
4585                 rsvd_check->rsvd_bits_mask[0][1] = exb_bit_rsvd |
4586                         rsvd_bits(maxphyaddr, 51);
4587                 rsvd_check->rsvd_bits_mask[0][0] = exb_bit_rsvd |
4588                         rsvd_bits(maxphyaddr, 51);
4589                 rsvd_check->rsvd_bits_mask[1][3] =
4590                         rsvd_check->rsvd_bits_mask[0][3];
4591                 rsvd_check->rsvd_bits_mask[1][2] = exb_bit_rsvd |
4592                         gbpages_bit_rsvd | rsvd_bits(maxphyaddr, 51) |
4593                         rsvd_bits(13, 29);
4594                 rsvd_check->rsvd_bits_mask[1][1] = exb_bit_rsvd |
4595                         rsvd_bits(maxphyaddr, 51) |
4596                         rsvd_bits(13, 20);              
4597                 rsvd_check->rsvd_bits_mask[1][0] =
4598                         rsvd_check->rsvd_bits_mask[0][0];
4599                 break;
4600         }
4601 }
4602 
4603 static void reset_rsvds_bits_mask(struct kvm_vcpu *vcpu,
4604                                   struct kvm_mmu *context)
4605 {
4606         __reset_rsvds_bits_mask(vcpu, &context->guest_rsvd_check,
4607                                 cpuid_maxphyaddr(vcpu), context->root_level,
4608                                 context->nx,
4609                                 guest_cpuid_has(vcpu, X86_FEATURE_GBPAGES),
4610                                 is_pse(vcpu), guest_cpuid_is_amd(vcpu));
4611 }
4612 
4613 static void
4614 __reset_rsvds_bits_mask_ept(struct rsvd_bits_validate *rsvd_check,
4615                             int maxphyaddr, bool execonly)
4616 {
4617         u64 bad_mt_xwr;
4618 
4619         rsvd_check->rsvd_bits_mask[0][4] =
4620                 rsvd_bits(maxphyaddr, 51) | rsvd_bits(3, 7);
4621         rsvd_check->rsvd_bits_mask[0][3] =
4622                 rsvd_bits(maxphyaddr, 51) | rsvd_bits(3, 7);
4623         rsvd_check->rsvd_bits_mask[0][2] =
4624                 rsvd_bits(maxphyaddr, 51) | rsvd_bits(3, 6);
4625         rsvd_check->rsvd_bits_mask[0][1] =
4626                 rsvd_bits(maxphyaddr, 51) | rsvd_bits(3, 6);
4627         rsvd_check->rsvd_bits_mask[0][0] = rsvd_bits(maxphyaddr, 51);
4628 
4629         /* Large page masks. */
4630         rsvd_check->rsvd_bits_mask[1][4] = rsvd_check->rsvd_bits_mask[0][4];
4631         rsvd_check->rsvd_bits_mask[1][3] = rsvd_check->rsvd_bits_mask[0][3];
4632         rsvd_check->rsvd_bits_mask[1][2] =
4633                 rsvd_bits(maxphyaddr, 51) | rsvd_bits(12, 29);
4634         rsvd_check->rsvd_bits_mask[1][1] =
4635                 rsvd_bits(maxphyaddr, 51) | rsvd_bits(12, 20);
4636         rsvd_check->rsvd_bits_mask[1][0] = rsvd_check->rsvd_bits_mask[0][0];
4637 
4638         bad_mt_xwr = 0xFFull << (2 * 8);        /* bits 3..5 must not be 2 */
4639         bad_mt_xwr |= 0xFFull << (3 * 8);       /* bits 3..5 must not be 3 */
4640         bad_mt_xwr |= 0xFFull << (7 * 8);       /* bits 3..5 must not be 7 */
4641         bad_mt_xwr |= REPEAT_BYTE(1ull << 2);   /* bits 0..2 must not be 010 */
4642         bad_mt_xwr |= REPEAT_BYTE(1ull << 6);   /* bits 0..2 must not be 110 */
4643         if (!execonly) {
4644                 /* bits 0..2 must not be 100 unless execute-only is allowed */
4645                 bad_mt_xwr |= REPEAT_BYTE(1ull << 4);
4646         }
4647         rsvd_check->bad_mt_xwr = bad_mt_xwr;
4648 }
4649 
4650 static void reset_rsvds_bits_mask_ept(struct kvm_vcpu *vcpu,
4651                 struct kvm_mmu *context, bool execonly)
4652 {
4653         __reset_rsvds_bits_mask_ept(&context->guest_rsvd_check,
4654                                     cpuid_maxphyaddr(vcpu), execonly);
4655 }
4656 
4657 /*
4658  * Compute the reserved-bit masks used to sanity-check the SPTEs that
4659  * KVM creates when shadowing the guest's own page tables (or an AMD
4660  * nested guest's NPT).
4661  */
4662 void
4663 reset_shadow_zero_bits_mask(struct kvm_vcpu *vcpu, struct kvm_mmu *context)
4664 {
4665         bool uses_nx = context->nx ||
4666                 context->mmu_role.base.smep_andnot_wp;
4667         struct rsvd_bits_validate *shadow_zero_check;
4668         int i;
4669 
4670         /*
4671          * Passing "true" for the AMD argument is fine; it only adds a
4672          * check on bit 8 of the SPTEs, which KVM doesn't use anyway.
4673          */
4674         shadow_zero_check = &context->shadow_zero_check;
4675         __reset_rsvds_bits_mask(vcpu, shadow_zero_check,
4676                                 shadow_phys_bits,
4677                                 context->shadow_root_level, uses_nx,
4678                                 guest_cpuid_has(vcpu, X86_FEATURE_GBPAGES),
4679                                 is_pse(vcpu), true);
4680 
4681         if (!shadow_me_mask)
4682                 return;
4683 
4684         for (i = context->shadow_root_level; --i >= 0;) {
4685                 shadow_zero_check->rsvd_bits_mask[0][i] &= ~shadow_me_mask;
4686                 shadow_zero_check->rsvd_bits_mask[1][i] &= ~shadow_me_mask;
4687         }
4688 
4689 }
4690 EXPORT_SYMBOL_GPL(reset_shadow_zero_bits_mask);
4691 
4692 static inline bool boot_cpu_is_amd(void)
4693 {
4694         WARN_ON_ONCE(!tdp_enabled);
4695         return shadow_x_mask == 0;
4696 }
4697 
4698 /*
4699  * Reserved-bit checks for the direct (TDP) page tables built by the
4700  * host, which can use the host MMU features directly.
4701  */
4702 static void
4703 reset_tdp_shadow_zero_bits_mask(struct kvm_vcpu *vcpu,
4704                                 struct kvm_mmu *context)
4705 {
4706         struct rsvd_bits_validate *shadow_zero_check;
4707         int i;
4708 
4709         shadow_zero_check = &context->shadow_zero_check;
4710 
4711         if (boot_cpu_is_amd())
4712                 __reset_rsvds_bits_mask(vcpu, shadow_zero_check,
4713                                         shadow_phys_bits,
4714                                         context->shadow_root_level, false,
4715                                         boot_cpu_has(X86_FEATURE_GBPAGES),
4716                                         true, true);
4717         else
4718                 __reset_rsvds_bits_mask_ept(shadow_zero_check,
4719                                             shadow_phys_bits,
4720                                             false);
4721 
4722         if (!shadow_me_mask)
4723                 return;
4724 
4725         for (i = context->shadow_root_level; --i >= 0;) {
4726                 shadow_zero_check->rsvd_bits_mask[0][i] &= ~shadow_me_mask;
4727                 shadow_zero_check->rsvd_bits_mask[1][i] &= ~shadow_me_mask;
4728         }
4729 }
4730 
4731 /*
4732  * As reset_shadow_zero_bits_mask() above, but for the shadow EPT page
4733  * tables used by an Intel nested guest.
4734  */
4735 static void
4736 reset_ept_shadow_zero_bits_mask(struct kvm_vcpu *vcpu,
4737                                 struct kvm_mmu *context, bool execonly)
4738 {
4739         __reset_rsvds_bits_mask_ept(&context->shadow_zero_check,
4740                                     shadow_phys_bits, execonly);
4741 }
4742 
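/*
 * BYTE_MASK(access) expands to a byte whose bit i is set iff the 3-bit
 * exec/write/user combination i includes the given ACC_* bit, e.g.
 * BYTE_MASK(ACC_EXEC_MASK) == 0xaa and BYTE_MASK(ACC_USER_MASK) == 0xf0.
 */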
4743 #define BYTE_MASK(access) \
4744         ((1 & (access) ? 2 : 0) | \
4745          (2 & (access) ? 4 : 0) | \
4746          (3 & (access) ? 8 : 0) | \
4747          (4 & (access) ? 16 : 0) | \
4748          (5 & (access) ? 32 : 0) | \
4749          (6 & (access) ? 64 : 0) | \
4750          (7 & (access) ? 128 : 0))
4751 
4752 
4753 static void update_permission_bitmask(struct kvm_vcpu *vcpu,
4754                                       struct kvm_mmu *mmu, bool ept)
4755 {
4756         unsigned byte;
4757 
4758         const u8 x = BYTE_MASK(ACC_EXEC_MASK);
4759         const u8 w = BYTE_MASK(ACC_WRITE_MASK);
4760         const u8 u = BYTE_MASK(ACC_USER_MASK);
4761 
4762         bool cr4_smep = kvm_read_cr4_bits(vcpu, X86_CR4_SMEP) != 0;
4763         bool cr4_smap = kvm_read_cr4_bits(vcpu, X86_CR4_SMAP) != 0;
4764         bool cr0_wp = is_write_protection(vcpu);
4765 
4766         for (byte = 0; byte < ARRAY_SIZE(mmu->permissions); ++byte) {
4767                 unsigned pfec = byte << 1;
4768 
4769                 /*
4770                  * Each "*f" variable below has a 1 bit for each
4771                  * pte_access combination it does not allow; the final
4772                  * permission byte is the OR of all of them.
4773                  */
4774                 /* Faults from writes to non-writable pages */
4775                 u8 wf = (pfec & PFERR_WRITE_MASK) ? (u8)~w : 0;
4776                 /* Faults from user-mode accesses to supervisor pages */
4777                 u8 uf = (pfec & PFERR_USER_MASK) ? (u8)~u : 0;
4778                 /* Faults from fetches of non-executable code */
4779                 u8 ff = (pfec & PFERR_FETCH_MASK) ? (u8)~x : 0;
4780                 /* Faults from kernel-mode fetches of user pages (SMEP) */
4781                 u8 smepf = 0;
4782                 /* Faults from kernel-mode accesses to user pages (SMAP) */
4783                 u8 smapf = 0;
4784 
4785                 if (!ept) {
4786                         /* User pages reached by a kernel-mode access, for SMEP/SMAP */
4787                         u8 kf = (pfec & PFERR_USER_MASK) ? 0 : u;
4788 
4789                         /* With NX disabled the guest cannot make pages non-executable. */
4790                         if (!mmu->nx)
4791                                 ff = 0;
4792 
4793                         /* Allow supervisor writes when CR0.WP is clear. */
4794                         if (!cr0_wp)
4795                                 wf = (pfec & PFERR_USER_MASK) ? wf : 0;
4796 
4797                         /* Disallow supervisor fetches of user code when CR4.SMEP is set. */
4798                         if (cr4_smep)
4799                                 smepf = (pfec & PFERR_FETCH_MASK) ? kf : 0;
4800 
4801                         /*
4802                          * SMAP: kernel-mode data accesses through
4803                          * user-mode mappings should fault.  An access
4804                          * is a SMAP violation when all of these hold:
4805                          *
4806                          *   - X86_CR4_SMAP is set in CR4
4807                          *   - the page being accessed is a user page
4808                          *   - the access is not an instruction fetch
4809                          *   - the fault happened in kernel mode
4810                          *   - EFLAGS.AC does not exempt the access
4811                          *
4812                          * The first four conditions are handled here;
4813                          * the last one is handled in permission_fault(),
4814                          * which sets PFERR_RSVD_MASK in the index when
4815                          * the access is *not* subject to SMAP.
4816                          */
4817                         if (cr4_smap)
4818                                 smapf = (pfec & (PFERR_RSVD_MASK|PFERR_FETCH_MASK)) ? 0 : kf;
4819                 }
4820 
4821                 mmu->permissions[byte] = ff | uf | wf | smepf | smapf;
4822         }
4823 }
4824 
4825 /*
4826  * PKU is an additional mechanism by which paging restricts access to
4827  * user-mode addresses based on the contents of the PKRU register.  A
4828  * protection-key violation is reported through a dedicated bit in the
4829  * page fault error code, but it cannot be derived from the page tables
4830  * alone: it depends on two bits of PKRU, on machine state (CR4.PKE,
4831  * CR0.WP, EFER.LMA, CPL) and on the type of access.
4832  *
4833  * To avoid recomputing all of this on every check, mmu->pkru_mask
4834  * caches two bits for every fault-type index used by mmu->permissions:
4835  *
4836  *   bit 0 - PKRU.AD is relevant for this kind of access
4837  *   bit 1 - PKRU.WD is relevant for this kind of access
4838  *
4839  * The index reuses the PFERR_RSVD_MASK position to carry the page
4840  * table's user bit, because protection keys only apply to user pages.
4841  *
4842  * At fault time, permission_fault() extracts the two bits at the
4843  * index's position and ANDs them with the AD/WD bits of PKRU for the
4844  * page's protection key; a non-zero result denies the access.  AD and
4845  * WD are thus ignored for instruction fetches and supervisor pages,
4846  * and WD is additionally ignored for supervisor writes when CR0.WP is
4847  * clear.
4848  */
4849 static void update_pkru_bitmask(struct kvm_vcpu *vcpu, struct kvm_mmu *mmu,
4850                                 bool ept)
4851 {
4852         unsigned bit;
4853         bool wp;
4854 
4855         if (ept) {
4856                 mmu->pkru_mask = 0;
4857                 return;
4858         }
4859 
4860         /* Protection keys are only checked with CR4.PKE set and in long mode. */
4861         if (!kvm_read_cr4_bits(vcpu, X86_CR4_PKE) || !is_long_mode(vcpu)) {
4862                 mmu->pkru_mask = 0;
4863                 return;
4864         }
4865 
4866         wp = is_write_protection(vcpu);
4867 
4868         for (bit = 0; bit < ARRAY_SIZE(mmu->permissions); ++bit) {
4869                 unsigned pfec, pkey_bits;
4870                 bool check_pkey, check_write, ff, uf, wf, pte_user;
4871 
4872                 pfec = bit << 1;
4873                 ff = pfec & PFERR_FETCH_MASK;
4874                 uf = pfec & PFERR_USER_MASK;
4875                 wf = pfec & PFERR_WRITE_MASK;
4876 
4877                 /* The PFERR_RSVD_MASK position of the index carries the PTE's U bit. */
4878                 pte_user = pfec & PFERR_RSVD_MASK;
4879 
4880                 /*
4881                  * Protection keys are checked only for data accesses
4882                  * (not instruction fetches) to user-mode pages.
4883                  */
4884                 check_pkey = (!ff && pte_user);
4885                 /*
4886                  * PKRU.WD is relevant only for writes that are either
4887                  * user accesses or performed with CR0.WP set.
4888                  */
4889                 check_write = check_pkey && wf && (uf || wp);
4890 
4891                 /* PKRU.AD blocks both read and write access. */
4892                 pkey_bits = !!check_pkey;
4893                 /* PKRU.WD blocks write access. */
4894                 pkey_bits |= (!!check_write) << 1;
4895 
4896                 mmu->pkru_mask |= (pkey_bits & 3) << pfec;
4897         }
4898 }
4899 
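/*
 * mmu->last_nonleaf_level is the lowest guest page-table level whose
 * entries can never be leaves.  It normally equals the root level; the
 * one exception is 32-bit paging with PSE enabled, where the root-level
 * PDEs can map 4MB pages, hence the increment below.
 */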
4900 static void update_last_nonleaf_level(struct kvm_vcpu *vcpu, struct kvm_mmu *mmu)
4901 {
4902         unsigned root_level = mmu->root_level;
4903 
4904         mmu->last_nonleaf_level = root_level;
4905         if (root_level == PT32_ROOT_LEVEL && is_pse(vcpu))
4906                 mmu->last_nonleaf_level++;
4907 }
4908 
4909 static void paging64_init_context_common(struct kvm_vcpu *vcpu,
4910                                          struct kvm_mmu *context,
4911                                          int level)
4912 {
4913         context->nx = is_nx(vcpu);
4914         context->root_level = level;
4915 
4916         reset_rsvds_bits_mask(vcpu, context);
4917         update_permission_bitmask(vcpu, context, false);
4918         update_pkru_bitmask(vcpu, context, false);
4919         update_last_nonleaf_level(vcpu, context);
4920 
4921         MMU_WARN_ON(!is_pae(vcpu));
4922         context->page_fault = paging64_page_fault;
4923         context->gva_to_gpa = paging64_gva_to_gpa;
4924         context->sync_page = paging64_sync_page;
4925         context->invlpg = paging64_invlpg;
4926         context->update_pte = paging64_update_pte;
4927         context->shadow_root_level = level;
4928         context->direct_map = false;
4929 }
4930 
4931 static void paging64_init_context(struct kvm_vcpu *vcpu,
4932                                   struct kvm_mmu *context)
4933 {
4934         int root_level = is_la57_mode(vcpu) ?
4935                          PT64_ROOT_5LEVEL : PT64_ROOT_4LEVEL;
4936 
4937         paging64_init_context_common(vcpu, context, root_level);
4938 }
4939 
4940 static void paging32_init_context(struct kvm_vcpu *vcpu,
4941                                   struct kvm_mmu *context)
4942 {
4943         context->nx = false;
4944         context->root_level = PT32_ROOT_LEVEL;
4945 
4946         reset_rsvds_bits_mask(vcpu, context);
4947         update_permission_bitmask(vcpu, context, false);
4948         update_pkru_bitmask(vcpu, context, false);
4949         update_last_nonleaf_level(vcpu, context);
4950 
4951         context->page_fault = paging32_page_fault;
4952         context->gva_to_gpa = paging32_gva_to_gpa;
4953         context->sync_page = paging32_sync_page;
4954         context->invlpg = paging32_invlpg;
4955         context->update_pte = paging32_update_pte;
4956         context->shadow_root_level = PT32E_ROOT_LEVEL;
4957         context->direct_map = false;
4958 }
4959 
4960 static void paging32E_init_context(struct kvm_vcpu *vcpu,
4961                                    struct kvm_mmu *context)
4962 {
4963         paging64_init_context_common(vcpu, context, PT32E_ROOT_LEVEL);
4964 }
4965 
4966 static union kvm_mmu_extended_role kvm_calc_mmu_role_ext(struct kvm_vcpu *vcpu)
4967 {
4968         union kvm_mmu_extended_role ext = {0};
4969 
4970         ext.cr0_pg = !!is_paging(vcpu);
4971         ext.cr4_pae = !!is_pae(vcpu);
4972         ext.cr4_smep = !!kvm_read_cr4_bits(vcpu, X86_CR4_SMEP);
4973         ext.cr4_smap = !!kvm_read_cr4_bits(vcpu, X86_CR4_SMAP);
4974         ext.cr4_pse = !!is_pse(vcpu);
4975         ext.cr4_pke = !!kvm_read_cr4_bits(vcpu, X86_CR4_PKE);
4976         ext.cr4_la57 = !!kvm_read_cr4_bits(vcpu, X86_CR4_LA57);
4977         ext.maxphyaddr = cpuid_maxphyaddr(vcpu);
4978 
4979         ext.valid = 1;
4980 
4981         return ext;
4982 }
4983 
4984 static union kvm_mmu_role kvm_calc_mmu_role_common(struct kvm_vcpu *vcpu,
4985                                                    bool base_only)
4986 {
4987         union kvm_mmu_role role = {0};
4988 
4989         role.base.access = ACC_ALL;
4990         role.base.nxe = !!is_nx(vcpu);
4991         role.base.cr0_wp = is_write_protection(vcpu);
4992         role.base.smm = is_smm(vcpu);
4993         role.base.guest_mode = is_guest_mode(vcpu);
4994 
4995         if (base_only)
4996                 return role;
4997 
4998         role.ext = kvm_calc_mmu_role_ext(vcpu);
4999 
5000         return role;
5001 }
5002 
5003 static union kvm_mmu_role
5004 kvm_calc_tdp_mmu_root_page_role(struct kvm_vcpu *vcpu, bool base_only)
5005 {
5006         union kvm_mmu_role role = kvm_calc_mmu_role_common(vcpu, base_only);
5007 
5008         role.base.ad_disabled = (shadow_accessed_mask == 0);
5009         role.base.level = kvm_x86_ops->get_tdp_level(vcpu);
5010         role.base.direct = true;
5011         role.base.gpte_is_8_bytes = true;
5012 
5013         return role;
5014 }
5015 
5016 static void init_kvm_tdp_mmu(struct kvm_vcpu *vcpu)
5017 {
5018         struct kvm_mmu *context = vcpu->arch.mmu;
5019         union kvm_mmu_role new_role =
5020                 kvm_calc_tdp_mmu_root_page_role(vcpu, false);
5021 
5022         new_role.base.word &= mmu_base_role_mask.word;
5023         if (new_role.as_u64 == context->mmu_role.as_u64)
5024                 return;
5025 
5026         context->mmu_role.as_u64 = new_role.as_u64;
5027         context->page_fault = tdp_page_fault;
5028         context->sync_page = nonpaging_sync_page;
5029         context->invlpg = nonpaging_invlpg;
5030         context->update_pte = nonpaging_update_pte;
5031         context->shadow_root_level = kvm_x86_ops->get_tdp_level(vcpu);
5032         context->direct_map = true;
5033         context->set_cr3 = kvm_x86_ops->set_tdp_cr3;
5034         context->get_cr3 = get_cr3;
5035         context->get_pdptr = kvm_pdptr_read;
5036         context->inject_page_fault = kvm_inject_page_fault;
5037 
5038         if (!is_paging(vcpu)) {
5039                 context->nx = false;
5040                 context->gva_to_gpa = nonpaging_gva_to_gpa;
5041                 context->root_level = 0;
5042         } else if (is_long_mode(vcpu)) {
5043                 context->nx = is_nx(vcpu);
5044                 context->root_level = is_la57_mode(vcpu) ?
5045                                 PT64_ROOT_5LEVEL : PT64_ROOT_4LEVEL;
5046                 reset_rsvds_bits_mask(vcpu, context);
5047                 context->gva_to_gpa = paging64_gva_to_gpa;
5048         } else if (is_pae(vcpu)) {
5049                 context->nx = is_nx(vcpu);
5050                 context->root_level = PT32E_ROOT_LEVEL;
5051                 reset_rsvds_bits_mask(vcpu, context);
5052                 context->gva_to_gpa = paging64_gva_to_gpa;
5053         } else {
5054                 context->nx = false;
5055                 context->root_level = PT32_ROOT_LEVEL;
5056                 reset_rsvds_bits_mask(vcpu, context);
5057                 context->gva_to_gpa = paging32_gva_to_gpa;
5058         }
5059 
5060         update_permission_bitmask(vcpu, context, false);
5061         update_pkru_bitmask(vcpu, context, false);
5062         update_last_nonleaf_level(vcpu, context);
5063         reset_tdp_shadow_zero_bits_mask(vcpu, context);
5064 }
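/*
 * Rough summary of the dispatch above (derived from the code, not an
 * authoritative table): with TDP the hardware walks the nested page tables,
 * so faults always go through tdp_page_fault(), while gva_to_gpa is still
 * chosen from the guest's own paging mode because instruction emulation must
 * walk the guest page tables in software:
 *
 *     guest CR0.PG = 0          -> nonpaging_gva_to_gpa, root_level = 0
 *     64-bit mode               -> paging64_gva_to_gpa,  4- or 5-level root
 *     PAE                       -> paging64_gva_to_gpa,  PT32E_ROOT_LEVEL
 *     legacy 32-bit paging      -> paging32_gva_to_gpa,  PT32_ROOT_LEVEL
 */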
5065 
5066 static union kvm_mmu_role
5067 kvm_calc_shadow_mmu_root_page_role(struct kvm_vcpu *vcpu, bool base_only)
5068 {
5069         union kvm_mmu_role role = kvm_calc_mmu_role_common(vcpu, base_only);
5070 
5071         role.base.smep_andnot_wp = role.ext.cr4_smep &&
5072                 !is_write_protection(vcpu);
5073         role.base.smap_andnot_wp = role.ext.cr4_smap &&
5074                 !is_write_protection(vcpu);
5075         role.base.direct = !is_paging(vcpu);
5076         role.base.gpte_is_8_bytes = !!is_pae(vcpu);
5077 
5078         if (!is_long_mode(vcpu))
5079                 role.base.level = PT32E_ROOT_LEVEL;
5080         else if (is_la57_mode(vcpu))
5081                 role.base.level = PT64_ROOT_5LEVEL;
5082         else
5083                 role.base.level = PT64_ROOT_4LEVEL;
5084 
5085         return role;
5086 }
5087 
5088 void kvm_init_shadow_mmu(struct kvm_vcpu *vcpu)
5089 {
5090         struct kvm_mmu *context = vcpu->arch.mmu;
5091         union kvm_mmu_role new_role =
5092                 kvm_calc_shadow_mmu_root_page_role(vcpu, false);
5093 
5094         new_role.base.word &= mmu_base_role_mask.word;
5095         if (new_role.as_u64 == context->mmu_role.as_u64)
5096                 return;
5097 
5098         if (!is_paging(vcpu))
5099                 nonpaging_init_context(vcpu, context);
5100         else if (is_long_mode(vcpu))
5101                 paging64_init_context(vcpu, context);
5102         else if (is_pae(vcpu))
5103                 paging32E_init_context(vcpu, context);
5104         else
5105                 paging32_init_context(vcpu, context);
5106 
5107         context->mmu_role.as_u64 = new_role.as_u64;
5108         reset_shadow_zero_bits_mask(vcpu, context);
5109 }
5110 EXPORT_SYMBOL_GPL(kvm_init_shadow_mmu);
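/*
 * Shadow-MMU role levels implied by kvm_calc_shadow_mmu_root_page_role() and
 * the init helpers above (an illustrative summary, not new behaviour): the
 * shadow root can be deeper than the guest root, e.g. 2-level 32-bit guest
 * paging is shadowed with 3-level PAE page tables.
 *
 *     guest mode       init helper              role.base.level
 *     CR0.PG = 0       nonpaging_init_context   PT32E_ROOT_LEVEL (3)
 *     legacy 32-bit    paging32_init_context    PT32E_ROOT_LEVEL (3)
 *     PAE              paging32E_init_context   PT32E_ROOT_LEVEL (3)
 *     long mode        paging64_init_context    PT64_ROOT_4LEVEL or _5LEVEL
 */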
5111 
5112 static union kvm_mmu_role
5113 kvm_calc_shadow_ept_root_page_role(struct kvm_vcpu *vcpu, bool accessed_dirty,
5114                                    bool execonly)
5115 {
5116         union kvm_mmu_role role = {0};
5117 
5118         /* The SMM flag is inherited from the vCPU's root MMU role. */
5119         role.base.smm = vcpu->arch.root_mmu.mmu_role.base.smm;
5120 
5121         role.base.level = PT64_ROOT_4LEVEL;
5122         role.base.gpte_is_8_bytes = true;
5123         role.base.direct = false;
5124         role.base.ad_disabled = !accessed_dirty;
5125         role.base.guest_mode = true;
5126         role.base.access = ACC_ALL;
5127 
5128         /*
5129          * EPT permission checks do not depend on the guest's CR0.WP or
5130          * CR4.SMAP, so pin these role bits to fixed values here.
5131          */
5132         role.base.cr0_wp = true;
5133         role.base.smap_andnot_wp = true;
5134 
5135         role.ext = kvm_calc_mmu_role_ext(vcpu);
5136         role.ext.execonly = execonly;
5137 
5138         return role;
5139 }
5140 
5141 void kvm_init_shadow_ept_mmu(struct kvm_vcpu *vcpu, bool execonly,
5142                              bool accessed_dirty, gpa_t new_eptp)
5143 {
5144         struct kvm_mmu *context = vcpu->arch.mmu;
5145         union kvm_mmu_role new_role =
5146                 kvm_calc_shadow_ept_root_page_role(vcpu, accessed_dirty,
5147                                                    execonly);
5148 
5149         __kvm_mmu_new_cr3(vcpu, new_eptp, new_role.base, false);
5150 
5151         new_role.base.word &= mmu_base_role_mask.word;
5152         if (new_role.as_u64 == context->mmu_role.as_u64)
5153                 return;
5154 
5155         context->shadow_root_level = PT64_ROOT_4LEVEL;
5156 
5157         context->nx = true;
5158         context->ept_ad = accessed_dirty;
5159         context->page_fault = ept_page_fault;
5160         context->gva_to_gpa = ept_gva_to_gpa;
5161         context->sync_page = ept_sync_page;
5162         context->invlpg = ept_invlpg;
5163         context->update_pte = ept_update_pte;
5164         context->root_level = PT64_ROOT_4LEVEL;
5165         context->direct_map = false;
5166         context->mmu_role.as_u64 = new_role.as_u64;
5167 
5168         update_permission_bitmask(vcpu, context, true);
5169         update_pkru_bitmask(vcpu, context, true);
5170         update_last_nonleaf_level(vcpu, context);
5171         reset_rsvds_bits_mask_ept(vcpu, context, execonly);
5172         reset_ept_shadow_zero_bits_mask(vcpu, context, execonly);
5173 }
5174 EXPORT_SYMBOL_GPL(kvm_init_shadow_ept_mmu);
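/*
 * Rough usage sketch (illustrative; the VMX-side helper names are
 * assumptions and vary between kernel versions): the expected caller is the
 * nested VMX code when L1 runs L2 with EPT enabled, approximately:
 *
 *     vcpu->arch.mmu = &vcpu->arch.guest_mmu;
 *     kvm_init_shadow_ept_mmu(vcpu,
 *                             execonly,                    // from EPT caps
 *                             nested_ept_ad_enabled(vcpu), // vmcs12 A/D bit
 *                             nested_ept_get_cr3(vcpu));   // the L1 EPTP
 *
 * The accessed/dirty setting and the EPTP therefore come from the vmcs12 that
 * L1 handed to KVM, while the SMM flag is taken from the root MMU above.
 */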
5175 
5176 static void init_kvm_softmmu(struct kvm_vcpu *vcpu)
5177 {
5178         struct kvm_mmu *context = vcpu->arch.mmu;
5179 
5180         kvm_init_shadow_mmu(vcpu);
5181         context->set_cr3           = kvm_x86_ops->set_cr3;
5182         context->get_cr3           = get_cr3;
5183         context->get_pdptr         = kvm_pdptr_read;
5184         context->inject_page_fault = kvm_inject_page_fault;
5185 }
5186 
5187 static void init_kvm_nested_mmu(struct kvm_vcpu *vcpu)
5188 {
5189         union kvm_mmu_role new_role = kvm_calc_mmu_role_common(vcpu, false);
5190         struct kvm_mmu *g_context = &vcpu->arch.nested_mmu;
5191 
5192         new_role.base.word &= mmu_base_role_mask.word;
5193         if (new_role.as_u64 == g_context->mmu_role.as_u64)
5194                 return;
5195 
5196         g_context->mmu_role.as_u64 = new_role.as_u64;
5197         g_context->get_cr3           = get_cr3;
5198         g_context->get_pdptr         = kvm_pdptr_read;
5199         g_context->inject_page_fault = kvm_inject_page_fault;
5200 
5201         /*
5202          * arch.mmu->gva_to_gpa translates an L2 GPA to an L1 GPA using L1's
5203          * nested page tables, while nested_mmu.gva_to_gpa translates an L2
5204          * GVA to an L1 GPA, using L2's page tables as the first level and
5205          * L1's nested page tables as the second.  The two gva_to_gpa roles
5206          * are effectively swapped, which is why only the *_gva_to_gpa_nested
5207          * helpers are installed below.
5208          */
5209         if (!is_paging(vcpu)) {
5210                 g_context->nx = false;
5211                 g_context->root_level = 0;
5212                 g_context->gva_to_gpa = nonpaging_gva_to_gpa_nested;
5213         } else if (is_long_mode(vcpu)) {
5214                 g_context->nx = is_nx(vcpu);
5215                 g_context->root_level = is_la57_mode(vcpu) ?
5216                                         PT64_ROOT_5LEVEL : PT64_ROOT_4LEVEL;
5217                 reset_rsvds_bits_mask(vcpu, g_context);
5218                 g_context->gva_to_gpa = paging64_gva_to_gpa_nested;
5219         } else if (is_pae(vcpu)) {
5220                 g_context->nx = is_nx(vcpu);
5221                 g_context->root_level = PT32E_ROOT_LEVEL;
5222                 reset_rsvds_bits_mask(vcpu, g_context);
5223                 g_context->gva_to_gpa = paging64_gva_to_gpa_nested;
5224         } else {
5225                 g_context->nx = false;
5226                 g_context->root_level = PT32_ROOT_LEVEL;
5227                 reset_rsvds_bits_mask(vcpu, g_context);
5228                 g_context->gva_to_gpa = paging32_gva_to_gpa_nested;
5229         }
5230 
5231         update_permission_bitmask(vcpu, g_context, false);
5232         update_pkru_bitmask(vcpu, g_context, false);
5233         update_last_nonleaf_level(vcpu, g_context);
5234 }
5235 
5236 void kvm_init_mmu(struct kvm_vcpu *vcpu, bool reset_roots)
5237 {
5238         if (reset_roots) {
5239                 uint i;
5240 
5241                 vcpu->arch.mmu->root_hpa = INVALID_PAGE;
5242 
5243                 for (i = 0; i < KVM_MMU_NUM_PREV_ROOTS; i++)
5244                         vcpu->arch.mmu->prev_roots[i] = KVM_MMU_ROOT_INFO_INVALID;
5245         }
5246 
5247         if (mmu_is_nested(vcpu))
5248                 init_kvm_nested_mmu(vcpu);
5249         else if (tdp_enabled)
5250                 init_kvm_tdp_mmu(vcpu);
5251         else
5252                 init_kvm_softmmu(vcpu);
5253 }
5254 EXPORT_SYMBOL_GPL(kvm_init_mmu);
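/*
 * Note on the three cases above (derived from the helpers in this file): the
 * nested MMU (vcpu->arch.nested_mmu) only supplies gva_to_gpa for emulating
 * L2 accesses and never handles faults itself, which is why
 * init_kvm_nested_mmu() installs no page_fault callback.  Faults are always
 * serviced through vcpu->arch.mmu, set up by the TDP or soft-MMU paths.
 */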
5255 
5256 static union kvm_mmu_page_role
5257 kvm_mmu_calc_root_page_role(struct kvm_vcpu *vcpu)
5258 {
5259         union kvm_mmu_role role;
5260 
5261         if (tdp_enabled)
5262                 role = kvm_calc_tdp_mmu_root_page_role(vcpu, true);
5263         else
5264                 role = kvm_calc_shadow_mmu_root_page_role(vcpu, true);
5265 
5266         return role.base;
5267 }
5268 
5269 void kvm_mmu_reset_context(struct kvm_vcpu *vcpu)
5270 {
5271         kvm_mmu_unload(vcpu);
5272         kvm_init_mmu(vcpu, true);
5273 }
5274 EXPORT_SYMBOL_GPL(kvm_mmu_reset_context);
5275 
5276 int kvm_mmu_load(struct kvm_vcpu *vcpu)
5277 {
5278         int r;
5279 
5280         r = mmu_topup_memory_caches(vcpu);
5281         if (r)
5282                 goto out;
5283         r = mmu_alloc_roots(vcpu);
5284         kvm_mmu_sync_roots(vcpu);
5285         if (r)
5286                 goto out;
5287         kvm_mmu_load_cr3(vcpu);
5288         kvm_x86_ops->tlb_flush(vcpu, true);
5289 out:
5290         return r;
5291 }
5292 EXPORT_SYMBOL_GPL(kvm_mmu_load);
5293 
5294 void kvm_mmu_unload(struct kvm_vcpu *vcpu)
5295 {
5296         kvm_mmu_free_roots(vcpu, &vcpu->arch.root_mmu, KVM_MMU_ROOTS_ALL);
5297         WARN_ON(VALID_PAGE(vcpu->arch.root_mmu.root_hpa));
5298         kvm_mmu_free_roots(vcpu, &vcpu->arch.guest_mmu, KVM_MMU_ROOTS_ALL);
5299         WARN_ON(VALID_PAGE(vcpu->arch.guest_mmu.root_hpa));
5300 }
5301 EXPORT_SYMBOL_GPL(kvm_mmu_unload);
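/*
 * Rough usage sketch (the inline below lives in mmu.h in this kernel series;
 * shown approximately, not verbatim): callers normally reach kvm_mmu_load()
 * through kvm_mmu_reload(), which only rebuilds roots when the current root
 * is invalid:
 *
 *     if (likely(vcpu->arch.mmu->root_hpa != INVALID_PAGE))
 *         return 0;                 // roots still valid, nothing to do
 *     return kvm_mmu_load(vcpu);    // allocate + sync roots, reload CR3
 *
 * kvm_mmu_unload() is the inverse used on reset/teardown: it frees the roots
 * of both root_mmu and guest_mmu so that the next entry forces a reload.
 */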
5302 
5303 static void mmu_pte_write_new_pte(struct kvm_vcpu *vcpu,
5304                                   struct kvm_mmu_page *sp, u64 *spte,
5305                                   const void *new)
5306 {
5307         if (sp->role.level != PT_PAGE_TABLE_LEVEL) {
5308                 ++vcpu->kvm->stat.mmu_pde_zapped;
5309                 return;
5310         }
5311 
5312         ++vcpu->kvm->stat.mmu_pte_updated;
5313         vcpu->arch.mmu->update_pte(vcpu, sp, spte, new);
5314 }
5315 
5316 static bool need_remote_flush(u64 old, u64 new)
5317 {
5318         if (!is_shadow_present_pte(old))
5319                 return false;
5320         if (!is_shadow_present_pte(new))
5321                 return true;
5322         if ((old ^ new) & PT64_BASE_ADDR_MASK)
5323                 return true;
5324         old ^= shadow_nx_mask;
5325         new ^= shadow_nx_mask;
5326         return (old & ~new & PT64_PERM_MASK) != 0;
5327 }
5328 
5329 static u64 mmu_pte_write_fetch_gpte(struct kvm_vcpu *vcpu, gpa_t *gpa,
5330                                     int *bytes)
5331 {
5332         u64 gentry = 0;
5333         int r;
5334 
5335         /*
5336          * Assume the guest PTE being written has the same format as the
5337          * current vCPU paging mode; SPTEs are only updated when the modes
5338          * match anyway.
5339          */
5340         if (is_pae(vcpu) && *bytes == 4) {
5341                 /* Handle a 32-bit guest writing two halves of a 64-bit gpte. */
5342                 *gpa &= ~(gpa_t)7;
5343                 *bytes = 8;
5344         }
5345 
5346         if (*bytes == 4 || *bytes == 8) {
5347                 r = kvm_vcpu_read_guest_atomic(vcpu, *gpa, &gentry, *bytes);
5348                 if (r)
5349                         gentry = 0;
5350         }
5351 
5352         return gentry;
5353 }
5354 
5355 /*
5356  * If a page sees too many write faults it is probably no longer used as a
5357  * page table (or the guest is forking); zapping it beats emulating writes.
5358  */
5359 static bool detect_write_flooding(struct kvm_mmu_page *sp)
5360 {
5361         /*
5362          * Skip write-flooding detection for last-level shadow pages; they
5363          * can be left unsync instead of being zapped and rebuilt.
5364          */
5365         if (sp->role.level == PT_PAGE_TABLE_LEVEL)
5366                 return false;
5367 
5368         atomic_inc(&sp->write_flooding_count);
5369         return atomic_read(&sp->write_flooding_count) >= 3;
5370 }
5371 
5372 /*
5373  * Misaligned accesses are too much trouble to fix up; they also usually
5374  * indicate that the page is not being used as a page table.
5375  */
5376 static bool detect_write_misaligned(struct kvm_mmu_page *sp, gpa_t gpa,
5377                                     int bytes)
5378 {
5379         unsigned offset, pte_size, misaligned;
5380 
5381         pgprintk("misaligned: gpa %llx bytes %d role %x\n",
5382                  gpa, bytes, sp->role.word);
5383 
5384         offset = offset_in_page(gpa);
5385         pte_size = sp->role.gpte_is_8_bytes ? 8 : 4;
5386 
5387         /*
5388          * Guests often write a single byte just to update status bits in a
5389          * PTE (e.g. clear_bit() on accessed/dirty), so don't flag that case.
5390          */
5391         if (!(offset & (pte_size - 1)) && bytes == 1)
5392                 return false;
5393 
5394         misaligned = (offset ^ (offset + bytes - 1)) & ~(pte_size - 1);
5395         misaligned |= bytes < 4;
5396 
5397         return misaligned;
5398 }
5399 
5400 static u64 *get_written_sptes(struct kvm_mmu_page *sp, gpa_t gpa, int *nspte)
5401 {
5402         unsigned page_offset, quadrant;
5403         u64 *spte;
5404         int level;
5405 
5406         page_offset = offset_in_page(gpa);
5407         level = sp->role.level;
5408         *nspte = 1;
5409         if (!sp->role.gpte_is_8_bytes) {
5410                 page_offset <<= 1;      /* 32-bit gptes vs. 64-bit sptes */
5411                 /*
5412                  * A 32-bit PDE maps 4MB while the shadow PDEs map only 2MB,
5413                  * so double the offset again and update two shadow PDEs
5414                  * instead of one.
5415                  */
5416                 if (level == PT32_ROOT_LEVEL) {
5417                         page_offset &= ~7; /* kill rounding error */
5418                         page_offset <<= 1;
5419                         *nspte = 2;
5420                 }
5421                 quadrant = page_offset >> PAGE_SHIFT;
5422                 page_offset &= ~PAGE_MASK;
5423                 if (quadrant != sp->role.quadrant)
5424                         return NULL;
5425         }
5426 
5427         spte = &sp->spt[page_offset / sizeof(*spte)];
5428         return spte;
5429 }
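/*
 * Worked example for the quadrant math above (illustrative numbers): a 32-bit
 * guest writes 4 bytes at page offset 0x804 of a last-level guest page table
 * (gpte_is_8_bytes == 0, level != PT32_ROOT_LEVEL):
 *
 *     page_offset  = 0x804
 *     page_offset <<= 1            ->  0x1008  (4-byte gptes, 8-byte sptes)
 *     quadrant     = 0x1008 >> 12  ->  1
 *     page_offset &= ~PAGE_MASK    ->  0x008   ->  spte index 1
 *
 * Only the shadow page with role.quadrant == 1 covers that half of the guest
 * table, so the write is applied to &sp->spt[1] there and skipped for the
 * quadrant-0 sibling.
 */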
5430 
5431 static void kvm_mmu_pte_write(struct kvm_vcpu *vcpu, gpa_t gpa,
5432                               const u8 *new, int bytes,
5433                               struct kvm_page_track_notifier_node *node)
5434 {
5435         gfn_t gfn = gpa >> PAGE_SHIFT;
5436         struct kvm_mmu_page *sp;
5437         LIST_HEAD(invalid_list);
5438         u64 entry, gentry, *spte;
5439         int npte;
5440         bool remote_flush, local_flush;
5441 
5442         
5443         /*
5444          * No indirect shadow pages means nothing is write-protected here.
5445          */
5446         if (!READ_ONCE(vcpu->kvm->arch.indirect_shadow_pages))
5447                 return;
5448 
5449         remote_flush = local_flush = false;
5450 
5451         pgprintk("%s: gpa %llx bytes %d\n", __func__, gpa, bytes);
5452 
5453         /*
5454          * No need to check whether the cache top-up succeeded: if it did
5455          * not, PTE prefetch is simply skipped for lack of free objects.
5456          * The write itself is handled either way.
5457          */
5458         mmu_topup_memory_caches(vcpu);
5459 
5460         spin_lock(&vcpu->kvm->mmu_lock);
5461 
5462         gentry = mmu_pte_write_fetch_gpte(vcpu, &gpa, &bytes);
5463 
5464         ++vcpu->kvm->stat.mmu_pte_write;
5465         kvm_mmu_audit(vcpu, AUDIT_PRE_PTE_WRITE);
5466 
5467         for_each_gfn_indirect_valid_sp(vcpu->kvm, sp, gfn) {
5468                 if (detect_write_misaligned(sp, gpa, bytes) ||
5469                       detect_write_flooding(sp)) {
5470                         kvm_mmu_prepare_zap_page(vcpu->kvm, sp, &invalid_list);
5471                         ++vcpu->kvm->stat.mmu_flooded;
5472                         continue;
5473                 }
5474 
5475                 spte = get_written_sptes(sp, gpa, &npte);
5476                 if (!spte)
5477                         continue;
5478 
5479                 local_flush = true;
5480                 while (npte--) {
5481                         u32 base_role = vcpu->arch.mmu->mmu_role.base.word;
5482 
5483                         entry = *spte;
5484                         mmu_page_zap_pte(vcpu->kvm, sp, spte);
5485                         if (gentry &&
5486                               !((sp->role.word ^ base_role)
5487                               & mmu_base_role_mask.word) && rmap_can_add(vcpu))
5488                                 mmu_pte_write_new_pte(vcpu, sp, spte, &gentry);
5489                         if (need_remote_flush(entry, *spte))
5490                                 remote_flush = true;
5491                         ++spte;
5492                 }
5493         }
5494         kvm_mmu_flush_or_zap(vcpu, &invalid_list, remote_flush, local_flush);
5495         kvm_mmu_audit(vcpu, AUDIT_POST_PTE_WRITE);
5496         spin_unlock(&vcpu->kvm->mmu_lock);
5497 }
5498 
5499 int kvm_mmu_unprotect_page_virt(struct kvm_vcpu *vcpu, gva_t gva)
5500 {
5501         gpa_t gpa;
5502         int r;
5503 
5504         if (vcpu->arch.mmu->direct_map)
5505                 return 0;
5506 
5507         gpa = kvm_mmu_gva_to_gpa_read(vcpu, gva, NULL);
5508 
5509         r = kvm_mmu_unprotect_page(vcpu->kvm, gpa >> PAGE_SHIFT);
5510 
5511         return r;
5512 }
5513 EXPORT_SYMBOL_GPL(kvm_mmu_unprotect_page_virt);
5514 
5515 static int make_mmu_pages_available(struct kvm_vcpu *vcpu)
5516 {
5517         LIST_HEAD(invalid_list);
5518 
5519         if (likely(kvm_mmu_available_pages(vcpu->kvm) >= KVM_MIN_FREE_MMU_PAGES))
5520                 return 0;
5521 
5522         while (kvm_mmu_available_pages(vcpu->kvm) < KVM_REFILL_PAGES) {
5523                 if (!prepare_zap_oldest_mmu_page(vcpu->kvm, &invalid_list))
5524                         break;
5525 
5526                 ++vcpu->kvm->stat.mmu_recycled;
5527         }
5528         kvm_mmu_commit_zap_page(vcpu->kvm, &invalid_list);
5529 
5530         if (!kvm_mmu_available_pages(vcpu->kvm))
5531                 return -ENOSPC;
5532         return 0;
5533 }
5534 
5535 int kvm_mmu_page_fault(struct kvm_vcpu *vcpu, gpa_t cr2_or_gpa, u64 error_code,
5536                        void *insn, int insn_len)
5537 {
5538         int r, emulation_type = 0;
5539         bool direct = vcpu->arch.mmu->direct_map;
5540 
5541         /* With a direct (TDP) MMU the fault address is already a GPA. */
5542         if (vcpu->arch.mmu->direct_map) {
5543                 vcpu->arch.gpa_available = true;
5544                 vcpu->arch.gpa_val = cr2_or_gpa;
5545         }
5546 
5547         r = RET_PF_INVALID;
5548         if (unlikely(error_code & PFERR_RSVD_MASK)) {
5549                 r = handle_mmio_page_fault(vcpu, cr2_or_gpa, direct);
5550                 if (r == RET_PF_EMULATE)
5551                         goto emulate;
5552         }
5553 
5554         if (r == RET_PF_INVALID) {
5555                 r = vcpu->arch.mmu->page_fault(vcpu, cr2_or_gpa,
5556                                                lower_32_bits(error_code),
5557                                                false);
5558                 WARN_ON(r == RET_PF_INVALID);
5559         }
5560 
5561         if (r == RET_PF_RETRY)
5562                 return 1;
5563         if (r < 0)
5564                 return r;
5565 
5566         /*
5567          * Before emulating the instruction, check whether the error code
5568          * was due to a read-only violation while translating the guest
5569          * page tables themselves.  This can happen with nested paging in
5570          * both L0 and L1; in that case simply unprotect the page and
5571          * resume the guest instead of emulating.
5572          */
5573         if (vcpu->arch.mmu->direct_map &&
5574             (error_code & PFERR_NESTED_GUEST_PAGE) == PFERR_NESTED_GUEST_PAGE) {
5575                 kvm_mmu_unprotect_page(vcpu->kvm, gpa_to_gfn(cr2_or_gpa));
5576                 return 1;
5577         }
5578 
5579         /*
5580          * The page fault handler returned RET_PF_EMULATE, but we can still
5581          * optimistically try to just unprotect the page and let the CPU
5582          * re-execute the instruction that caused the fault.  Do not allow
5583          * retrying an MMIO access: it is pointless and would loop forever,
5584          * since the processor keeps faulting on the non-existent MMIO
5585          * address.  Retrying an instruction from a nested guest is equally
5586          * pointless and dangerous, as only L1's page tables are being
5587          * shadowed here; unprotecting something for L1 will not fix
5588          * whatever made L2 fail.
5589          */
5590         if (!mmio_info_in_cache(vcpu, cr2_or_gpa, direct) && !is_guest_mode(vcpu))
5591                 emulation_type = EMULTYPE_ALLOW_RETRY;
5592 emulate:
5593         /*
5594          * On AMD platforms, under certain conditions insn_len may be zero
5595          * on #NPF.  This can happen if the guest faults on a data access
5596          * but the hardware table walker cannot read the instruction page
5597          * (e.g. it is not present in memory).  In that case simply restart
5598          * the guest, except for the unrecoverable AMD erratum 1096 case.
5599          */
5600         if (unlikely(insn && !insn_len)) {
5601                 if (!kvm_x86_ops->need_emulation_on_page_fault(vcpu))
5602                         return 1;
5603         }
5604 
5605         return x86_emulate_instruction(vcpu, cr2_or_gpa, emulation_type, insn,
5606                                        insn_len);
5607 }
5608 EXPORT_SYMBOL_GPL(kvm_mmu_page_fault);
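/*
 * Return-value convention above, as used by the VMX/SVM exit handlers that
 * call this function (a summary, not new behaviour): 1 means the fault was
 * handled and the guest can simply resume, 0 means an exit to userspace is
 * required (typically propagated from x86_emulate_instruction()), and a
 * negative value is an error.  RET_PF_RETRY maps to 1 so the vCPU re-executes
 * the faulting instruction once the SPTE has been installed.
 */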
5609 
5610 void kvm_mmu_invlpg(struct kvm_vcpu *vcpu, gva_t gva)
5611 {
5612         struct kvm_mmu *mmu = vcpu->arch.mmu;
5613         int i;
5614 
5615         /* INVLPG on a non-canonical address is a NOP according to the SDM. */
5616         if (is_noncanonical_address(gva, vcpu))
5617                 return;
5618 
5619         mmu->invlpg(vcpu, gva, mmu->root_hpa);
5620 
5621         /*
5622          * INVLPG is required to invalidate any global mappings for the VA,
5623          * irrespective of PCID.  Since it would take roughly the same
5624          * amount of work to determine whether any of the prev_root
5625          * mappings of the VA is marked global as it would to just sync it
5626          * blindly, sync it unconditionally.
5627          *
5628          * Mappings not reachable via the current cr3 or the prev_roots
5629          * will be synced when switching to that cr3, so nothing needs to
5630          * be done here for them.
5631          */
5632         for (i = 0; i < KVM_MMU_NUM_PREV_ROOTS; i++)
5633                 if (VALID_PAGE(mmu->prev_roots[i].hpa))
5634                         mmu->invlpg(vcpu, gva, mmu->prev_roots[i].hpa);
5635 
5636         kvm_x86_ops->tlb_flush_gva(vcpu, gva);
5637         ++vcpu->stat.invlpg;
5638 }
5639 EXPORT_SYMBOL_GPL(kvm_mmu_invlpg);
5640 
5641 void kvm_mmu_invpcid_gva(struct kvm_vcpu *vcpu, gva_t gva, unsigned long pcid)
5642 {
5643         struct kvm_mmu *mmu = vcpu->arch.mmu;
5644         bool tlb_flush = false;
5645         uint i;
5646 
5647         if (pcid == kvm_get_active_pcid(vcpu)) {
5648                 mmu->invlpg(vcpu, gva, mmu->root_hpa);
5649                 tlb_flush = true;
5650         }
5651 
5652         for (i = 0; i < KVM_MMU_NUM_PREV_ROOTS; i++) {
5653                 if (VALID_PAGE(mmu->prev_roots[i].hpa) &&
5654                     pcid == kvm_get_pcid(vcpu, mmu->prev_roots[i].cr3)) {
5655                         mmu->invlpg(vcpu, gva, mmu->prev_roots[i].hpa);
5656                         tlb_flush = true;
5657                 }
5658         }
5659 
5660         if (tlb_flush)
5661                 kvm_x86_ops->tlb_flush_gva(vcpu, gva);
5662 
5663         ++vcpu->stat.invlpg;
5664 
5665         /*
5666          * Mappings not reachable via the current cr3 or the prev_roots
5667          * will be synced when switching to that cr3, so nothing needs to
5668          * be done here for them.
5669          */
5670 }
5671 EXPORT_SYMBOL_GPL(kvm_mmu_invpcid_gva);
5672 
5673 void kvm_enable_tdp(void)
5674 {
5675         tdp_enabled = true;
5676 }
5677 EXPORT_SYMBOL_GPL(kvm_enable_tdp);
5678 
5679 void kvm_disable_tdp(void)
5680 {
5681         tdp_enabled = false;
5682 }
5683 EXPORT_SYMBOL_GPL(kvm_disable_tdp);
5684 
5685 
5686 /* The return value indicates whether a TLB flush on all vCPUs is needed. */
5687 typedef bool (*slot_level_handler) (struct kvm *kvm, struct kvm_rmap_head *rmap_head);
5688 
5689 /* The caller must hold mmu_lock before calling this function. */
5690 static __always_inline bool
5691 slot_handle_level_range(struct kvm *kvm, struct kvm_memory_slot *memslot,
5692                         slot_level_handler fn, int start_level, int end_level,
5693                         gfn_t start_gfn, gfn_t end_gfn, bool lock_flush_tlb)
5694 {
5695         struct slot_rmap_walk_iterator iterator;
5696         bool flush = false;
5697 
5698         for_each_slot_rmap_range(memslot, start_level, end_level, start_gfn,
5699                         end_gfn, &iterator) {
5700                 if (iterator.rmap)
5701                         flush |= fn(kvm, iterator.rmap);
5702 
5703                 if (need_resched() || spin_needbreak(&kvm->mmu_lock)) {
5704                         if (flush && lock_flush_tlb) {
5705                                 kvm_flush_remote_tlbs_with_address(kvm,
5706                                                 start_gfn,
5707                                                 iterator.gfn - start_gfn + 1);
5708                                 flush = false;
5709                         }
5710                         cond_resched_lock(&kvm->mmu_lock);
5711                 }
5712         }
5713 
5714         if (flush && lock_flush_tlb) {
5715                 kvm_flush_remote_tlbs_with_address(kvm, start_gfn,
5716                                                    end_gfn - start_gfn + 1);
5717                 flush = false;
5718         }
5719 
5720         return flush;
5721 }
5722 
5723 static __always_inline bool
5724 slot_handle_level(struct kvm *kvm, struct kvm_memory_slot *memslot,
5725                   slot_level_handler fn, int start_level, int end_level,
5726                   bool lock_flush_tlb)
5727 {
5728         return slot_handle_level_range(kvm, memslot, fn, start_level,
5729                         end_level, memslot->base_gfn,
5730                         memslot->base_gfn + memslot->npages - 1,
5731                         lock_flush_tlb);
5732 }
5733 
5734 static __always_inline bool
5735 slot_handle_all_level(struct kvm *kvm, struct kvm_memory_slot *memslot,
5736                       slot_level_handler fn, bool lock_flush_tlb)
5737 {
5738         return slot_handle_level(kvm, memslot, fn, PT_PAGE_TABLE_LEVEL,
5739                                  PT_MAX_HUGEPAGE_LEVEL, lock_flush_tlb);
5740 }
5741 
5742 static __always_inline bool
5743 slot_handle_large_level(struct kvm *kvm, struct kvm_memory_slot *memslot,
5744                         slot_level_handler fn, bool lock_flush_tlb)
5745 {
5746         return slot_handle_level(kvm, memslot, fn, PT_PAGE_TABLE_LEVEL + 1,
5747                                  PT_MAX_HUGEPAGE_LEVEL, lock_flush_tlb);
5748 }
5749 
5750 static __always_inline bool
5751 slot_handle_leaf(struct kvm *kvm, struct kvm_memory_slot *memslot,
5752                  slot_level_handler fn, bool lock_flush_tlb)
5753 {
5754         return slot_handle_level(kvm, memslot, fn, PT_PAGE_TABLE_LEVEL,
5755                                  PT_PAGE_TABLE_LEVEL, lock_flush_tlb);
5756 }
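/*
 * The wrappers above differ only in which page-table levels they visit
 * (derived from the PT_*_LEVEL arguments):
 *
 *     slot_handle_all_level()     levels 1 .. PT_MAX_HUGEPAGE_LEVEL
 *     slot_handle_large_level()   levels 2 .. PT_MAX_HUGEPAGE_LEVEL
 *     slot_handle_leaf()          level 1 only (4K mappings)
 *
 * Each returns whether some rmap handler still wants a TLB flush; with
 * lock_flush_tlb == true the range-based flush has already been issued under
 * mmu_lock, so callers passing 'false' (as the functions below do) must flush
 * on their own after dropping the lock.
 */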
5757 
5758 static void free_mmu_pages(struct kvm_mmu *mmu)
5759 {
5760         free_page((unsigned long)mmu->pae_root);
5761         free_page((unsigned long)mmu->lm_root);
5762 }
5763 
5764 static int alloc_mmu_pages(struct kvm_vcpu *vcpu, struct kvm_mmu *mmu)
5765 {
5766         struct page *page;
5767         int i;
5768 
5769         /*
5770          * When using PAE paging, the four PDPTEs are treated as 'root'
5771          * pages, while the PDP table itself is a per-vCPU structure
5772          * allocated at MMU creation.  When emulating 32-bit mode, cr3 is
5773          * only 32 bits even on x86_64, so the PDP table must live in the
5774          * first 4GB of memory, which the DMA32 zone provides.  TDP does
5775          * not use PAE paging except for SVM's 32-bit NPT support, so the
5776          * allocation is skipped when the TDP root has more than 3 levels.
5777          */
5778         if (tdp_enabled && kvm_x86_ops->get_tdp_level(vcpu) > PT32E_ROOT_LEVEL)
5779                 return 0;
5780 
5781         page = alloc_page(GFP_KERNEL_ACCOUNT | __GFP_DMA32);
5782         if (!page)
5783                 return -ENOMEM;
5784 
5785         mmu->pae_root = page_address(page);
5786         for (i = 0; i < 4; ++i)
5787                 mmu->pae_root[i] = INVALID_PAGE;
5788 
5789         return 0;
5790 }
5791 
5792 int kvm_mmu_create(struct kvm_vcpu *vcpu)
5793 {
5794         uint i;
5795         int ret;
5796 
5797         vcpu->arch.mmu = &vcpu->arch.root_mmu;
5798         vcpu->arch.walk_mmu = &vcpu->arch.root_mmu;
5799 
5800         vcpu->arch.root_mmu.root_hpa = INVALID_PAGE;
5801         vcpu->arch.root_mmu.root_cr3 = 0;
5802         vcpu->arch.root_mmu.translate_gpa = translate_gpa;
5803         for (i = 0; i < KVM_MMU_NUM_PREV_ROOTS; i++)
5804                 vcpu->arch.root_mmu.prev_roots[i] = KVM_MMU_ROOT_INFO_INVALID;
5805 
5806         vcpu->arch.guest_mmu.root_hpa = INVALID_PAGE;
5807         vcpu->arch.guest_mmu.root_cr3 = 0;
5808         vcpu->arch.guest_mmu.translate_gpa = translate_gpa;
5809         for (i = 0; i < KVM_MMU_NUM_PREV_ROOTS; i++)
5810                 vcpu->arch.guest_mmu.prev_roots[i] = KVM_MMU_ROOT_INFO_INVALID;
5811 
5812         vcpu->arch.nested_mmu.translate_gpa = translate_nested_gpa;
5813 
5814         ret = alloc_mmu_pages(vcpu, &vcpu->arch.guest_mmu);
5815         if (ret)
5816                 return ret;
5817 
5818         ret = alloc_mmu_pages(vcpu, &vcpu->arch.root_mmu);
5819         if (ret)
5820                 goto fail_allocate_root;
5821 
5822         return ret;
5823  fail_allocate_root:
5824         free_mmu_pages(&vcpu->arch.guest_mmu);
5825         return ret;
5826 }
5827 
5828 #define BATCH_ZAP_PAGES 10
5829 static void kvm_zap_obsolete_pages(struct kvm *kvm)
5830 {
5831         struct kvm_mmu_page *sp, *node;
5832         int nr_zapped, batch = 0;
5833 
5834 restart:
5835         list_for_each_entry_safe_reverse(sp, node,
5836               &kvm->arch.active_mmu_pages, link) {
5837                 /*
5838                  * No obsolete valid page exists before a newly created
5839                  * page, since active_mmu_pages is a FIFO list.
5840                  */
5841                 if (!is_obsolete_sp(kvm, sp))
5842                         break;
5843 
5844                 /*
5845                  * Skip invalid pages with a non-zero root count; zapping
5846                  * such a page can never succeed, i.e. it would be thrown
5847                  * back onto active_mmu_pages and this loop would never
5848                  * terminate.
5849                  */
5850                 if (sp->role.invalid && sp->root_count)
5851                         continue;
5852 
5853                 /*
5854                  * Periodically drop mmu_lock after a batch of zaps so that
5855                  * other lock waiters are not starved; the walk then
5856                  * restarts from the tail of the list.  No TLB flush is
5857                  * needed in between, only obsolete pages are being zapped.
5858                  */
5859                 if (batch >= BATCH_ZAP_PAGES &&
5860                     cond_resched_lock(&kvm->mmu_lock)) {
5861                         batch = 0;
5862                         goto restart;
5863                 }
5864 
5865                 if (__kvm_mmu_prepare_zap_page(kvm, sp,
5866                                 &kvm->arch.zapped_obsolete_pages, &nr_zapped)) {
5867                         batch += nr_zapped;
5868                         goto restart;
5869                 }
5870         }
5871 
5872         /*
5873          * Trigger a remote TLB flush before freeing the page tables to
5874          * ensure KVM is not in the middle of a lockless shadow page table
5875          * walk that might still reference the pages.
5876          */
5877         kvm_mmu_commit_zap_page(kvm, &kvm->arch.zapped_obsolete_pages);
5878 }
5879 
5880 /*
5881  * Fast invalidate all shadow pages, using the lock-break technique to zap
5882  * the now-obsolete pages.
5883  *
5884  * This is required when a memslot is being deleted or the VM is being
5885  * destroyed: in those cases the MMU must be guaranteed not to use any
5886  * resource of the slot being deleted (or of any slot) once this
5887  * function returns.
5888  */
5889 static void kvm_mmu_zap_all_fast(struct kvm *kvm)
5890 {
5891         lockdep_assert_held(&kvm->slots_lock);
5892 
5893         spin_lock(&kvm->mmu_lock);
5894         trace_kvm_mmu_zap_all_fast(kvm);
5895 
5896         /*
5897          * Toggle mmu_valid_gen between '0' and '1'.  Because slots_lock is
5898          * held for the entire duration of zapping obsolete pages, it is
5899          * impossible for there to be multiple invalid generations
5900          * associated with *valid* shadow pages at any given time: there is
5901          * exactly one valid generation and at most one invalid generation.
5902          */
5903         kvm->arch.mmu_valid_gen = kvm->arch.mmu_valid_gen ? 0 : 1;
5904 
5905         /*
5906          * Notify all vcpus to reload their shadow page tables and flush
5907          * the TLB; they will then switch to a new root carrying the new
5908          * mmu_valid_gen.
5909          *
5910          * This must be done under mmu_lock, otherwise a vCPU could purge
5911          * a shadow page but miss the TLB flush.
5912          */
5913         kvm_reload_remote_mmus(kvm);
5914 
5915         kvm_zap_obsolete_pages(kvm);
5916         spin_unlock(&kvm->mmu_lock);
5917 }
5918 
5919 static bool kvm_has_zapped_obsolete_pages(struct kvm *kvm)
5920 {
5921         return unlikely(!list_empty_careful(&kvm->arch.zapped_obsolete_pages));
5922 }
5923 
5924 static void kvm_mmu_invalidate_zap_pages_in_memslot(struct kvm *kvm,
5925                         struct kvm_memory_slot *slot,
5926                         struct kvm_page_track_notifier_node *node)
5927 {
5928         kvm_mmu_zap_all_fast(kvm);
5929 }
5930 
5931 void kvm_mmu_init_vm(struct kvm *kvm)
5932 {
5933         struct kvm_page_track_notifier_node *node = &kvm->arch.mmu_sp_tracker;
5934 
5935         node->track_write = kvm_mmu_pte_write;
5936         node->track_flush_slot = kvm_mmu_invalidate_zap_pages_in_memslot;
5937         kvm_page_track_register_notifier(kvm, node);
5938 }
5939 
5940 void kvm_mmu_uninit_vm(struct kvm *kvm)
5941 {
5942         struct kvm_page_track_notifier_node *node = &kvm->arch.mmu_sp_tracker;
5943 
5944         kvm_page_track_unregister_notifier(kvm, node);
5945 }
5946 
5947 void kvm_zap_gfn_range(struct kvm *kvm, gfn_t gfn_start, gfn_t gfn_end)
5948 {
5949         struct kvm_memslots *slots;
5950         struct kvm_memory_slot *memslot;
5951         int i;
5952 
5953         spin_lock(&kvm->mmu_lock);
5954         for (i = 0; i < KVM_ADDRESS_SPACE_NUM; i++) {
5955                 slots = __kvm_memslots(kvm, i);
5956                 kvm_for_each_memslot(memslot, slots) {
5957                         gfn_t start, end;
5958 
5959                         start = max(gfn_start, memslot->base_gfn);
5960                         end = min(gfn_end, memslot->base_gfn + memslot->npages);
5961                         if (start >= end)
5962                                 continue;
5963 
5964                         slot_handle_level_range(kvm, memslot, kvm_zap_rmapp,
5965                                                 PT_PAGE_TABLE_LEVEL, PT_MAX_HUGEPAGE_LEVEL,
5966                                                 start, end - 1, true);
5967                 }
5968         }
5969 
5970         spin_unlock(&kvm->mmu_lock);
5971 }
5972 
5973 static bool slot_rmap_write_protect(struct kvm *kvm,
5974                                     struct kvm_rmap_head *rmap_head)
5975 {
5976         return __rmap_write_protect(kvm, rmap_head, false);
5977 }
5978 
5979 void kvm_mmu_slot_remove_write_access(struct kvm *kvm,
5980                                       struct kvm_memory_slot *memslot)
5981 {
5982         bool flush;
5983 
5984         spin_lock(&kvm->mmu_lock);
5985         flush = slot_handle_all_level(kvm, memslot, slot_rmap_write_protect,
5986                                       false);
5987         spin_unlock(&kvm->mmu_lock);
5988 
5989         /*
5990          * Callers that flush TLBs outside of mmu_lock (this function and
5991          * kvm_vm_ioctl_get_dirty_log()) must be serialized by
5992          * kvm->slots_lock, otherwise a TLB flush could be missed.
5993          */
5994         lockdep_assert_held(&kvm->slots_lock);
5995 
5996         /*
5997          * Flushing the TLBs outside of mmu_lock is safe here without
5998          * risking TLB corruption because the SPTEs are only changed from
5999          * writable to read-only, i.e. the change is always from present
6000          * to present.  (Present-to-nonpresent transitions flush the TLBs
6001          * immediately.)  The only case that matters is therefore
6002          * mmu_spte_update(), and it checks SPTE_HOST_WRITEABLE and
6003          * SPTE_MMU_WRITEABLE rather than PT_WRITABLE_MASK, so it does not
6004          * depend on stale TLB entries still seeing the old writable bit
6005          * before the deferred flush happens.
6006          */
6007         if (flush)
6008                 kvm_flush_remote_tlbs_with_address(kvm, memslot->base_gfn,
6009                         memslot->npages);
6010 }
6011 
6012 static bool kvm_mmu_zap_collapsible_spte(struct kvm *kvm,
6013                                          struct kvm_rmap_head *rmap_head)
6014 {
6015         u64 *sptep;
6016         struct rmap_iterator iter;
6017         int need_tlb_flush = 0;
6018         kvm_pfn_t pfn;
6019         struct kvm_mmu_page *sp;
6020 
6021 restart:
6022         for_each_rmap_spte(rmap_head, &iter, sptep) {
6023                 sp = page_header(__pa(sptep));
6024                 pfn = spte_to_pfn(*sptep);
6025 
6026                 /*
6027                  * Huge mappings cannot be recreated for indirect shadow
6028                  * pages (the last rmap level when TDP is off), since those
6029                  * stay in sync with 4K guest page tables.  Only direct,
6030                  * THP-backed SPTEs are zapped here so that the fault path
6031                  * can rebuild them as huge mappings.
6032                  */
6033                 if (sp->role.direct && !kvm_is_reserved_pfn(pfn) &&
6034                     !kvm_is_zone_device_pfn(pfn) &&
6035                     PageTransCompoundMap(pfn_to_page(pfn))) {
6036                         pte_list_remove(rmap_head, sptep);
6037 
6038                         if (kvm_available_flush_tlb_with_range())
6039                                 kvm_flush_remote_tlbs_with_address(kvm, sp->gfn,
6040                                         KVM_PAGES_PER_HPAGE(sp->role.level));
6041                         else
6042                                 need_tlb_flush = 1;
6043 
6044                         goto restart;
6045                 }
6046         }
6047 
6048         return need_tlb_flush;
6049 }
6050 
6051 void kvm_mmu_zap_collapsible_sptes(struct kvm *kvm,
6052                                    const struct kvm_memory_slot *memslot)
6053 {
6054         /* FIXME: const-ify all uses of struct kvm_memory_slot. */
6055         spin_lock(&kvm->mmu_lock);
6056         slot_handle_leaf(kvm, (struct kvm_memory_slot *)memslot,
6057                          kvm_mmu_zap_collapsible_spte, true);
6058         spin_unlock(&kvm->mmu_lock);
6059 }
6060 
6061 void kvm_mmu_slot_leaf_clear_dirty(struct kvm *kvm,
6062                                    struct kvm_memory_slot *memslot)
6063 {
6064         bool flush;
6065 
6066         spin_lock(&kvm->mmu_lock);
6067         flush = slot_handle_leaf(kvm, memslot, __rmap_clear_dirty, false);
6068         spin_unlock(&kvm->mmu_lock);
6069 
6070         lockdep_assert_held(&kvm->slots_lock);
6071 
6072         /*
6073          * Flushing the TLB outside of mmu_lock is also safe here: this
6074          * function is only used for dirty logging, and flushing before
6075          * returning guarantees that no dirty page is lost from the
6076          * dirty_bitmap.
6077          */
6078         if (flush)
6079                 kvm_flush_remote_tlbs_with_address(kvm, memslot->base_gfn,
6080                                 memslot->npages);
6081 }
6082 EXPORT_SYMBOL_GPL(kvm_mmu_slot_leaf_clear_dirty);
6083 
6084 void kvm_mmu_slot_largepage_remove_write_access(struct kvm *kvm,
6085                                         struct kvm_memory_slot *memslot)
6086 {
6087         bool flush;
6088 
6089         spin_lock(&kvm->mmu_lock);
6090         flush = slot_handle_large_level(kvm, memslot, slot_rmap_write_protect,
6091                                         false);
6092         spin_unlock(&kvm->mmu_lock);
6093 
6094         /* See the comment in kvm_mmu_slot_remove_write_access(). */
6095         lockdep_assert_held(&kvm->slots_lock);
6096 
6097         if (flush)
6098                 kvm_flush_remote_tlbs_with_address(kvm, memslot->base_gfn,
6099                                 memslot->npages);
6100 }
6101 EXPORT_SYMBOL_GPL(kvm_mmu_slot_largepage_remove_write_access);
6102 
6103 void kvm_mmu_slot_set_dirty(struct kvm *kvm,
6104                             struct kvm_memory_slot *memslot)
6105 {
6106         bool flush;
6107 
6108         spin_lock(&kvm->mmu_lock);
6109         flush = slot_handle_all_level(kvm, memslot, __rmap_set_dirty, false);
6110         spin_unlock(&kvm->mmu_lock);
6111 
6112         lockdep_assert_held(&kvm->slots_lock);
6113 
6114         /* See the comment in kvm_mmu_slot_leaf_clear_dirty(). */
6115         if (flush)
6116                 kvm_flush_remote_tlbs_with_address(kvm, memslot->base_gfn,
6117                                 memslot->npages);
6118 }
6119 EXPORT_SYMBOL_GPL(kvm_mmu_slot_set_dirty);
6120 
6121 void kvm_mmu_zap_all(struct kvm *kvm)
6122 {
6123         struct kvm_mmu_page *sp, *node;
6124         LIST_HEAD(invalid_list);
6125         int ign;
6126 
6127         spin_lock(&kvm->mmu_lock);
6128 restart:
6129         list_for_each_entry_safe(sp, node, &kvm->arch.active_mmu_pages, link) {
6130                 if (sp->role.invalid && sp->root_count)
6131                         continue;
6132                 if (__kvm_mmu_prepare_zap_page(kvm, sp, &invalid_list, &ign))
6133                         goto restart;
6134                 if (cond_resched_lock(&kvm->mmu_lock))
6135                         goto restart;
6136         }
6137 
6138         kvm_mmu_commit_zap_page(kvm, &invalid_list);
6139         spin_unlock(&kvm->mmu_lock);
6140 }
6141 
6142 void kvm_mmu_invalidate_mmio_sptes(struct kvm *kvm, u64 gen)
6143 {
6144         WARN_ON(gen & KVM_MEMSLOT_GEN_UPDATE_IN_PROGRESS);
6145 
6146         gen &= MMIO_SPTE_GEN_MASK;
6147 
6148         /*
6149          * Generation numbers are incremented in multiples of the number of
6150          * address spaces in order to provide unique generations across all
6151          * address spaces.  Strip what is effectively the address-space
6152          * modifier before checking for a wrap of the MMIO generation so
6153          * that a wrap in any address space is detected.
6154          */
6155         gen &= ~((u64)KVM_ADDRESS_SPACE_NUM - 1);
6156 
6157         /*
6158          * The very rare case: if the MMIO generation number has wrapped,
6159          * zap all shadow pages.
6160          */
6161         if (unlikely(gen == 0)) {
6162                 kvm_debug_ratelimited("kvm: zapping shadow pages for mmio generation wraparound\n");
6163                 kvm_mmu_zap_all_fast(kvm);
6164         }
6165 }
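/*
 * Worked example for the masking above (illustrative, assuming the usual
 * KVM_ADDRESS_SPACE_NUM == 2 on x86): memslot generations advance once per
 * address space, so the low bit effectively distinguishes SMM from non-SMM.
 * Clearing it with gen &= ~1 makes both address spaces share one MMIO
 * generation, and a wrap in either one is then caught by the 'gen == 0'
 * check and handled by fast-zapping all shadow pages.
 */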
6166 
6167 static unsigned long
6168 mmu_shrink_scan(struct shrinker *shrink, struct shrink_control *sc)
6169 {
6170         struct kvm *kvm;
6171         int nr_to_scan = sc->nr_to_scan;
6172         unsigned long freed = 0;
6173 
6174         mutex_lock(&kvm_lock);
6175 
6176         list_for_each_entry(kvm, &vm_list, vm_list) {
6177                 int idx;
6178                 LIST_HEAD(invalid_list);
6179 
6180                 
6181                 /*
6182                  * Never scan more than sc->nr_to_scan VM instances.  In
6183                  * practice this is never hit, since at most one VM is
6184                  * shrunk per invocation.
6185                  */
6186                 if (!nr_to_scan--)
6187                         break;
6188                 /*
6189                  * n_used_mmu_pages is read without holding kvm->mmu_lock.
6190                  * A VM may be skipped erroneously as a result, but there is
6191                  * no point in shrinking a VM that has only just started
6192                  * populating its MMU anyway.
6193                  */
6194                 if (!kvm->arch.n_used_mmu_pages &&
6195                     !kvm_has_zapped_obsolete_pages(kvm))
6196                         continue;
6197 
6198                 idx = srcu_read_lock(&kvm->srcu);
6199                 spin_lock(&kvm->mmu_lock);
6200 
6201                 if (kvm_has_zapped_obsolete_pages(kvm)) {
6202                         kvm_mmu_commit_zap_page(kvm,
6203                               &kvm->arch.zapped_obsolete_pages);
6204                         goto unlock;
6205                 }
6206 
6207                 if (prepare_zap_oldest_mmu_page(kvm, &invalid_list))
6208                         freed++;
6209                 kvm_mmu_commit_zap_page(kvm, &invalid_list);
6210 
6211 unlock:
6212                 spin_unlock(&kvm->mmu_lock);
6213                 srcu_read_unlock(&kvm->srcu, idx);
6214 
6215                 /*
6216                  * Rotate this VM to the tail of vm_list so that the next
6217                  * shrink attempt starts with a different VM; a single
6218                  * global shrinker is otherwise unfair to small VMs.
6219                  */
6220                 list_move_tail(&kvm->vm_list, &vm_list);
6221                 break;
6222         }
6223 
6224         mutex_unlock(&kvm_lock);
6225         return freed;
6226 }
6227 
6228 static unsigned long
6229 mmu_shrink_count(struct shrinker *shrink, struct shrink_control *sc)
6230 {
6231         return percpu_counter_read_positive(&kvm_total_used_mmu_pages);
6232 }
6233 
6234 static struct shrinker mmu_shrinker = {
6235         .count_objects = mmu_shrink_count,
6236         .scan_objects = mmu_shrink_scan,
6237         .seeks = DEFAULT_SEEKS * 10,
6238 };
6239 
6240 static void mmu_destroy_caches(void)
6241 {
6242         kmem_cache_destroy(pte_list_desc_cache);
6243         kmem_cache_destroy(mmu_page_header_cache);
6244 }
6245 
6246 static void kvm_set_mmio_spte_mask(void)
6247 {
6248         u64 mask;
6249 
6250         /*
6251          * Set a reserved physical-address bit (and the present bit) in
6252          * MMIO SPTEs so that accesses to them fault with PFEC.RSVD=1.
6253          * 64-bit PTEs support at most 52 physical-address bits, so when
6254          * the CPU actually implements 52 bits there is no reserved bit
6255          * left to borrow and the mask is left clear instead.
6256          */
6257         if (shadow_phys_bits < 52)
6258                 mask = BIT_ULL(51) | PT_PRESENT_MASK;
6259         else
6260                 mask = 0;
6261 
6262         kvm_mmu_set_mmio_spte_mask(mask, mask, ACC_WRITE_MASK | ACC_USER_MASK);
6263 }
6264 
6265 static bool get_nx_auto_mode(void)
6266 {
6267         /* Mitigate only if the CPU is affected and mitigations are not globally disabled. */
6268         return boot_cpu_has_bug(X86_BUG_ITLB_MULTIHIT) && !cpu_mitigations_off();
6269 }
6270 
6271 static void __set_nx_huge_pages(bool val)
6272 {
6273         nx_huge_pages = itlb_multihit_kvm_mitigation = val;
6274 }
6275 
6276 static int set_nx_huge_pages(const char *val, const struct kernel_param *kp)
6277 {
6278         bool old_val = nx_huge_pages;
6279         bool new_val;
6280 
6281         /* In "auto" mode, deploy the workaround only if the CPU has the bug. */
6282         if (sysfs_streq(val, "off"))
6283                 new_val = 0;
6284         else if (sysfs_streq(val, "force"))
6285                 new_val = 1;
6286         else if (sysfs_streq(val, "auto"))
6287                 new_val = get_nx_auto_mode();
6288         else if (strtobool(val, &new_val) < 0)
6289                 return -EINVAL;
6290 
6291         __set_nx_huge_pages(new_val);
6292 
6293         if (new_val != old_val) {
6294                 struct kvm *kvm;
6295 
6296                 mutex_lock(&kvm_lock);
6297 
6298                 list_for_each_entry(kvm, &vm_list, vm_list) {
6299                         mutex_lock(&kvm->slots_lock);
6300                         kvm_mmu_zap_all_fast(kvm);
6301                         mutex_unlock(&kvm->slots_lock);
6302 
6303                         wake_up_process(kvm->arch.nx_lpage_recovery_thread);
6304                 }
6305                 mutex_unlock(&kvm_lock);
6306         }
6307 
6308         return 0;
6309 }
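/*
 * Usage example (assumes the standard in-tree module parameter naming; paths
 * shown for illustration): this handler backs kvm.nx_huge_pages, so the
 * ITLB-multihit mitigation can be toggled at runtime:
 *
 *     echo off   > /sys/module/kvm/parameters/nx_huge_pages
 *     echo force > /sys/module/kvm/parameters/nx_huge_pages
 *     echo auto  > /sys/module/kvm/parameters/nx_huge_pages
 *
 * Any change of value immediately fast-zaps all shadow pages in every VM and
 * wakes the recovery workers, as the loop above shows.
 */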
6310 
6311 int kvm_mmu_module_init(void)
6312 {
6313         int ret = -ENOMEM;
6314 
6315         if (nx_huge_pages == -1)
6316                 __set_nx_huge_pages(get_nx_auto_mode());
6317 
6318         /*
6319          * The MMU role unions are copied and compared as raw u32/u64
6320          * values throughout this file, so their sizes must match the
6321          * underlying scalar types exactly; the BUILD_BUG_ONs below catch
6322          * any accidental growth of the bitfields.
6323          */
6324         BUILD_BUG_ON(sizeof(union kvm_mmu_page_role) != sizeof(u32));
6325         BUILD_BUG_ON(sizeof(union kvm_mmu_extended_role) != sizeof(u32));
6326         BUILD_BUG_ON(sizeof(union kvm_mmu_role) != sizeof(u64));
6327 
6328         kvm_mmu_reset_all_pte_masks();
6329 
6330         kvm_set_mmio_spte_mask();
6331 
6332         pte_list_desc_cache = kmem_cache_create("pte_list_desc",
6333                                             sizeof(struct pte_list_desc),
6334                                             0, SLAB_ACCOUNT, NULL);
6335         if (!pte_list_desc_cache)
6336                 goto out;
6337 
6338         mmu_page_header_cache = kmem_cache_create("kvm_mmu_page_header",
6339                                                   sizeof(struct kvm_mmu_page),
6340                                                   0, SLAB_ACCOUNT, NULL);
6341         if (!mmu_page_header_cache)
6342                 goto out;
6343 
6344         if (percpu_counter_init(&kvm_total_used_mmu_pages, 0, GFP_KERNEL))
6345                 goto out;
6346 
6347         ret = register_shrinker(&mmu_shrinker);
6348         if (ret)
6349                 goto out;
6350 
6351         return 0;
6352 
6353 out:
6354         mmu_destroy_caches();
6355         return ret;
6356 }
6357 
6358 /*
6359  * Calculate the default number of MMU pages for a VM, based on its memslots.
6360  */
6361 unsigned long kvm_mmu_calculate_default_mmu_pages(struct kvm *kvm)
6362 {
6363         unsigned long nr_mmu_pages;
6364         unsigned long nr_pages = 0;
6365         struct kvm_memslots *slots;
6366         struct kvm_memory_slot *memslot;
6367         int i;
6368 
6369         for (i = 0; i < KVM_ADDRESS_SPACE_NUM; i++) {
6370                 slots = __kvm_memslots(kvm, i);
6371 
6372                 kvm_for_each_memslot(memslot, slots)
6373                         nr_pages += memslot->npages;
6374         }
6375 
6376         nr_mmu_pages = nr_pages * KVM_PERMILLE_MMU_PAGES / 1000;
6377         nr_mmu_pages = max(nr_mmu_pages, KVM_MIN_ALLOC_MMU_PAGES);
6378 
6379         return nr_mmu_pages;
6380 }
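/*
 * Worked example (assuming KVM_PERMILLE_MMU_PAGES is 20 and
 * KVM_MIN_ALLOC_MMU_PAGES is 64, their usual values in this series): a guest
 * with 4 GiB of memslots has 1,048,576 guest pages, so the default limit is
 *
 *     1,048,576 * 20 / 1000 = 20,971 shadow pages,
 *
 * while a very small guest is still allowed at least 64.
 */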
6381 
6382 void kvm_mmu_destroy(struct kvm_vcpu *vcpu)
6383 {
6384         kvm_mmu_unload(vcpu);
6385         free_mmu_pages(&vcpu->arch.root_mmu);
6386         free_mmu_pages(&vcpu->arch.guest_mmu);
6387         mmu_free_memory_caches(vcpu);
6388 }
6389 
6390 void kvm_mmu_module_exit(void)
6391 {
6392         mmu_destroy_caches();
6393         percpu_counter_destroy(&kvm_total_used_mmu_pages);
6394         unregister_shrinker(&mmu_shrinker);
6395         mmu_audit_disable();
6396 }
6397 
6398 static int set_nx_huge_pages_recovery_ratio(const char *val, const struct kernel_param *kp)
6399 {
6400         unsigned int old_val;
6401         int err;
6402 
6403         old_val = nx_huge_pages_recovery_ratio;
6404         err = param_set_uint(val, kp);
6405         if (err)
6406                 return err;
6407 
6408         if (READ_ONCE(nx_huge_pages) &&
6409             !old_val && nx_huge_pages_recovery_ratio) {
6410                 struct kvm *kvm;
6411 
6412                 mutex_lock(&kvm_lock);
6413 
6414                 list_for_each_entry(kvm, &vm_list, vm_list)
6415                         wake_up_process(kvm->arch.nx_lpage_recovery_thread);
6416 
6417                 mutex_unlock(&kvm_lock);
6418         }
6419 
6420         return err;
6421 }
6422 
6423 static void kvm_recover_nx_lpages(struct kvm *kvm)
6424 {
6425         int rcu_idx;
6426         struct kvm_mmu_page *sp;
6427         unsigned int ratio;
6428         LIST_HEAD(invalid_list);
6429         ulong to_zap;
6430 
6431         rcu_idx = srcu_read_lock(&kvm->srcu);
6432         spin_lock(&kvm->mmu_lock);
6433 
6434         ratio = READ_ONCE(nx_huge_pages_recovery_ratio);
6435         to_zap = ratio ? DIV_ROUND_UP(kvm->stat.nx_lpage_splits, ratio) : 0;
6436         while (to_zap && !list_empty(&kvm->arch.lpage_disallowed_mmu_pages)) {
6437                 /*
6438                  * A separate list is used instead of walking
6439                  * active_mmu_pages because the number of lpage_disallowed
6440                  * pages is expected to be small relative to the total.
6441                  */
6442                 sp = list_first_entry(&kvm->arch.lpage_disallowed_mmu_pages,
6443                                       struct kvm_mmu_page,
6444                                       lpage_disallowed_link);
6445                 WARN_ON_ONCE(!sp->lpage_disallowed);
6446                 kvm_mmu_prepare_zap_page(kvm, sp, &invalid_list);
6447                 WARN_ON_ONCE(sp->lpage_disallowed);
6448 
6449                 if (!--to_zap || need_resched() || spin_needbreak(&kvm->mmu_lock)) {
6450                         kvm_mmu_commit_zap_page(kvm, &invalid_list);
6451                         if (to_zap)
6452                                 cond_resched_lock(&kvm->mmu_lock);
6453                 }
6454         }
6455 
6456         spin_unlock(&kvm->mmu_lock);
6457         srcu_read_unlock(&kvm->srcu, rcu_idx);
6458 }
6459 
6460 static long get_nx_lpage_recovery_timeout(u64 start_time)
6461 {
6462         return READ_ONCE(nx_huge_pages) && READ_ONCE(nx_huge_pages_recovery_ratio)
6463                 ? start_time + 60 * HZ - get_jiffies_64()
6464                 : MAX_SCHEDULE_TIMEOUT;
6465 }
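/*
 * Putting the helpers above together (a rough cadence estimate, not a hard
 * guarantee): while the mitigation and a nonzero recovery ratio are enabled,
 * the worker below wakes roughly every 60 seconds (60 * HZ jiffies after
 * start_time) and kvm_recover_nx_lpages() zaps about 1/ratio of the currently
 * split NX huge pages, so with a ratio of 60 the whole backlog is recycled
 * over approximately an hour.
 */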
6466 
6467 static int kvm_nx_lpage_recovery_worker(struct kvm *kvm, uintptr_t data)
6468 {
6469         u64 start_time;
6470         long remaining_time;
6471 
6472         while (true) {
6473                 start_time = get_jiffies_64();
6474                 remaining_time = get_nx_lpage_recovery_timeout(start_time);
6475 
6476                 set_current_state(TASK_INTERRUPTIBLE);
6477                 while (!kthread_should_stop() && remaining_time > 0) {
6478                         schedule_timeout(remaining_time);
6479                         remaining_time = get_nx_lpage_recovery_timeout(start_time);
6480                         set_current_state(TASK_INTERRUPTIBLE);
6481                 }
6482 
6483                 set_current_state(TASK_RUNNING);
6484 
6485                 if (kthread_should_stop())
6486                         return 0;
6487 
6488                 kvm_recover_nx_lpages(kvm);
6489         }
6490 }
6491 
6492 int kvm_mmu_post_init_vm(struct kvm *kvm)
6493 {
6494         int err;
6495 
6496         err = kvm_vm_create_worker_thread(kvm, kvm_nx_lpage_recovery_worker, 0,
6497                                           "kvm-nx-lpage-recovery",
6498                                           &kvm->arch.nx_lpage_recovery_thread);
6499         if (!err)
6500                 kthread_unpark(kvm->arch.nx_lpage_recovery_thread);
6501 
6502         return err;
6503 }
6504 
6505 void kvm_mmu_pre_destroy_vm(struct kvm *kvm)
6506 {
6507         if (kvm->arch.nx_lpage_recovery_thread)
6508                 kthread_stop(kvm->arch.nx_lpage_recovery_thread);
6509 }