Diffstat (limited to 'arch/x86/kvm/mmu/tdp_mmu.c')
-rw-r--r--  arch/x86/kvm/mmu/tdp_mmu.c | 154
1 file changed, 93 insertions(+), 61 deletions(-)
diff --git a/arch/x86/kvm/mmu/tdp_mmu.c b/arch/x86/kvm/mmu/tdp_mmu.c
index 953082bf96..04c1f0957f 100644
--- a/arch/x86/kvm/mmu/tdp_mmu.c
+++ b/arch/x86/kvm/mmu/tdp_mmu.c
@@ -149,11 +149,11 @@ static struct kvm_mmu_page *tdp_mmu_next_root(struct kvm *kvm,
  * If shared is set, this function is operating under the MMU lock in read
  * mode.
  */
-#define __for_each_tdp_mmu_root_yield_safe(_kvm, _root, _as_id, _only_valid)\
-        for (_root = tdp_mmu_next_root(_kvm, NULL, _only_valid);       \
-             ({ lockdep_assert_held(&(_kvm)->mmu_lock); }), _root;     \
-             _root = tdp_mmu_next_root(_kvm, _root, _only_valid))      \
-                if (kvm_mmu_page_as_id(_root) != _as_id) {             \
+#define __for_each_tdp_mmu_root_yield_safe(_kvm, _root, _as_id, _only_valid)   \
+        for (_root = tdp_mmu_next_root(_kvm, NULL, _only_valid);       \
+             ({ lockdep_assert_held(&(_kvm)->mmu_lock); }), _root;     \
+             _root = tdp_mmu_next_root(_kvm, _root, _only_valid))      \
+                if (_as_id >= 0 && kvm_mmu_page_as_id(_root) != _as_id) {      \
                 } else
 
 #define for_each_valid_tdp_mmu_root_yield_safe(_kvm, _root, _as_id)    \
@@ -171,12 +171,19 @@ static struct kvm_mmu_page *tdp_mmu_next_root(struct kvm *kvm,
  * Holding mmu_lock for write obviates the need for RCU protection as the list
  * is guaranteed to be stable.
  */
-#define for_each_tdp_mmu_root(_kvm, _root, _as_id)                     \
-        list_for_each_entry(_root, &_kvm->arch.tdp_mmu_roots, link)    \
-                if (kvm_lockdep_assert_mmu_lock_held(_kvm, false) &&   \
-                    kvm_mmu_page_as_id(_root) != _as_id) {             \
+#define __for_each_tdp_mmu_root(_kvm, _root, _as_id, _only_valid)              \
+        list_for_each_entry(_root, &_kvm->arch.tdp_mmu_roots, link)            \
+                if (kvm_lockdep_assert_mmu_lock_held(_kvm, false) &&           \
+                    ((_as_id >= 0 && kvm_mmu_page_as_id(_root) != _as_id) ||   \
+                     ((_only_valid) && (_root)->role.invalid))) {              \
                 } else
 
+#define for_each_tdp_mmu_root(_kvm, _root, _as_id)              \
+        __for_each_tdp_mmu_root(_kvm, _root, _as_id, false)
+
+#define for_each_valid_tdp_mmu_root(_kvm, _root, _as_id)        \
+        __for_each_tdp_mmu_root(_kvm, _root, _as_id, true)
+
 static struct kvm_mmu_page *tdp_mmu_alloc_sp(struct kvm_vcpu *vcpu)
 {
         struct kvm_mmu_page *sp;
@@ -216,22 +223,41 @@ static void tdp_mmu_init_child_sp(struct kvm_mmu_page *child_sp,
         tdp_mmu_init_sp(child_sp, iter->sptep, iter->gfn, role);
 }
 
-hpa_t kvm_tdp_mmu_get_vcpu_root_hpa(struct kvm_vcpu *vcpu)
+int kvm_tdp_mmu_alloc_root(struct kvm_vcpu *vcpu)
 {
-        union kvm_mmu_page_role role = vcpu->arch.mmu->root_role;
+        struct kvm_mmu *mmu = vcpu->arch.mmu;
+        union kvm_mmu_page_role role = mmu->root_role;
+        int as_id = kvm_mmu_role_as_id(role);
         struct kvm *kvm = vcpu->kvm;
         struct kvm_mmu_page *root;
 
-        lockdep_assert_held_write(&kvm->mmu_lock);
+        /*
+         * Check for an existing root before acquiring the pages lock to avoid
+         * unnecessary serialization if multiple vCPUs are loading a new root.
+         * E.g. when bringing up secondary vCPUs, KVM will already have created
+         * a valid root on behalf of the primary vCPU.
+         */
+        read_lock(&kvm->mmu_lock);
+
+        for_each_valid_tdp_mmu_root_yield_safe(kvm, root, as_id) {
+                if (root->role.word == role.word)
+                        goto out_read_unlock;
+        }
+
+        spin_lock(&kvm->arch.tdp_mmu_pages_lock);
 
         /*
-         * Check for an existing root before allocating a new one.  Note, the
-         * role check prevents consuming an invalid root.
+         * Recheck for an existing root after acquiring the pages lock, another
+         * vCPU may have raced ahead and created a new usable root.  Manually
+         * walk the list of roots as the standard macros assume that the pages
+         * lock is *not* held.  WARN if grabbing a reference to a usable root
+         * fails, as the last reference to a root can only be put *after* the
+         * root has been invalidated, which requires holding mmu_lock for write.
          */
-        for_each_tdp_mmu_root(kvm, root, kvm_mmu_role_as_id(role)) {
+        list_for_each_entry(root, &kvm->arch.tdp_mmu_roots, link) {
                 if (root->role.word == role.word &&
-                    kvm_tdp_mmu_get_root(root))
-                        goto out;
+                    !WARN_ON_ONCE(!kvm_tdp_mmu_get_root(root)))
+                        goto out_spin_unlock;
         }
 
         root = tdp_mmu_alloc_sp(vcpu);
@@ -245,13 +271,20 @@ hpa_t kvm_tdp_mmu_get_vcpu_root_hpa(struct kvm_vcpu *vcpu)
          * is ultimately put by kvm_tdp_mmu_zap_invalidated_roots().
          */
         refcount_set(&root->tdp_mmu_root_count, 2);
-
-        spin_lock(&kvm->arch.tdp_mmu_pages_lock);
         list_add_rcu(&root->link, &kvm->arch.tdp_mmu_roots);
-        spin_unlock(&kvm->arch.tdp_mmu_pages_lock);
 
-out:
-        return __pa(root->spt);
+out_spin_unlock:
+        spin_unlock(&kvm->arch.tdp_mmu_pages_lock);
+out_read_unlock:
+        read_unlock(&kvm->mmu_lock);
+        /*
+         * Note, KVM_REQ_MMU_FREE_OBSOLETE_ROOTS will prevent entering the guest
+         * and actually consuming the root if it's invalidated after dropping
+         * mmu_lock, and the root can't be freed as this vCPU holds a reference.
+         */
+        mmu->root.hpa = __pa(root->spt);
+        mmu->root.pgd = 0;
+        return 0;
 }
 
 static void handle_changed_spte(struct kvm *kvm, int as_id, gfn_t gfn,
@@ -734,15 +767,26 @@ static void tdp_mmu_zap_root(struct kvm *kvm, struct kvm_mmu_page *root,
         rcu_read_lock();
 
         /*
-         * To avoid RCU stalls due to recursively removing huge swaths of SPs,
-         * split the zap into two passes.  On the first pass, zap at the 1gb
-         * level, and then zap top-level SPs on the second pass.  "1gb" is not
-         * arbitrary, as KVM must be able to zap a 1gb shadow page without
-         * inducing a stall to allow in-place replacement with a 1gb hugepage.
+         * Zap roots in multiple passes of decreasing granularity, i.e. zap at
+         * 4KiB=>2MiB=>1GiB=>root, in order to better honor need_resched() (all
+         * preempt models) or mmu_lock contention (full or real-time models).
+         * Zapping at finer granularity marginally increases the total time of
+         * the zap, but in most cases the zap itself isn't latency sensitive.
          *
-         * Because zapping a SP recurses on its children, stepping down to
-         * PG_LEVEL_4K in the iterator itself is unnecessary.
+         * If KVM is configured to prove the MMU, skip the 4KiB and 2MiB zaps
+         * in order to mimic the page fault path, which can replace a 1GiB page
+         * table with an equivalent 1GiB hugepage, i.e. can get saddled with
+         * zapping a 1GiB region that's fully populated with 4KiB SPTEs.  This
+         * allows verifying that KVM can safely zap 1GiB regions, e.g. without
+         * inducing RCU stalls, without relying on a relatively rare event
+         * (zapping roots is orders of magnitude more common).  Note, because
+         * zapping a SP recurses on its children, stepping down to PG_LEVEL_4K
+         * in the iterator itself is unnecessary.
          */
+        if (!IS_ENABLED(CONFIG_KVM_PROVE_MMU)) {
+                __tdp_mmu_zap_root(kvm, root, shared, PG_LEVEL_4K);
+                __tdp_mmu_zap_root(kvm, root, shared, PG_LEVEL_2M);
+        }
         __tdp_mmu_zap_root(kvm, root, shared, PG_LEVEL_1G);
         __tdp_mmu_zap_root(kvm, root, shared, root->role.level);
 
@@ -800,7 +844,13 @@ static bool tdp_mmu_zap_leafs(struct kvm *kvm, struct kvm_mmu_page *root,
                         continue;
 
                 tdp_mmu_iter_set_spte(kvm, &iter, 0);
-                flush = true;
+
+                /*
+                 * Zapping SPTEs in invalid roots doesn't require a TLB flush,
+                 * see kvm_tdp_mmu_zap_invalidated_roots() for details.
+                 */
+                if (!root->role.invalid)
+                        flush = true;
         }
 
         rcu_read_unlock();
@@ -813,16 +863,16 @@ static bool tdp_mmu_zap_leafs(struct kvm *kvm, struct kvm_mmu_page *root,
 }
 
 /*
- * Zap leaf SPTEs for the range of gfns, [start, end), for all roots.  Returns
- * true if a TLB flush is needed before releasing the MMU lock, i.e. if one or
- * more SPTEs were zapped since the MMU lock was last acquired.
+ * Zap leaf SPTEs for the range of gfns, [start, end), for all *VALID* roots.
+ * Returns true if a TLB flush is needed before releasing the MMU lock, i.e. if
+ * one or more SPTEs were zapped since the MMU lock was last acquired.
  */
 bool kvm_tdp_mmu_zap_leafs(struct kvm *kvm, gfn_t start, gfn_t end, bool flush)
 {
         struct kvm_mmu_page *root;
 
         lockdep_assert_held_write(&kvm->mmu_lock);
-        for_each_tdp_mmu_root_yield_safe(kvm, root)
+        for_each_valid_tdp_mmu_root_yield_safe(kvm, root, -1)
                 flush = tdp_mmu_zap_leafs(kvm, root, start, end, true, flush);
 
         return flush;
@@ -896,7 +946,7 @@ void kvm_tdp_mmu_zap_invalidated_roots(struct kvm *kvm)
  * the VM is being destroyed).
  *
  * Note, kvm_tdp_mmu_zap_invalidated_roots() is gifted the TDP MMU's reference.
- * See kvm_tdp_mmu_get_vcpu_root_hpa().
+ * See kvm_tdp_mmu_alloc_root().
  */
 void kvm_tdp_mmu_invalidate_all_roots(struct kvm *kvm)
 {
@@ -1508,13 +1558,6 @@ static bool tdp_mmu_need_write_protect(struct kvm_mmu_page *sp)
         return kvm_mmu_page_ad_need_write_protect(sp) || !kvm_ad_enabled();
 }
 
-/*
- * Clear the dirty status of all the SPTEs mapping GFNs in the memslot. If
- * AD bits are enabled, this will involve clearing the dirty bit on each SPTE.
- * If AD bits are not enabled, this will require clearing the writable bit on
- * each SPTE.  Returns true if an SPTE has been changed and the TLBs need to
- * be flushed.
- */
 static bool clear_dirty_gfn_range(struct kvm *kvm, struct kvm_mmu_page *root,
                            gfn_t start, gfn_t end)
 {
@@ -1551,11 +1594,9 @@ retry:
 }
 
 /*
- * Clear the dirty status of all the SPTEs mapping GFNs in the memslot. If
- * AD bits are enabled, this will involve clearing the dirty bit on each SPTE.
- * If AD bits are not enabled, this will require clearing the writable bit on
- * each SPTE.  Returns true if an SPTE has been changed and the TLBs need to
- * be flushed.
+ * Clear the dirty status (D-bit or W-bit) of all the SPTEs mapping GFNs in the
+ * memslot. Returns true if an SPTE has been changed and the TLBs need to be
+ * flushed.
  */
 bool kvm_tdp_mmu_clear_dirty_slot(struct kvm *kvm,
                                   const struct kvm_memory_slot *slot)
@@ -1571,13 +1612,6 @@ bool kvm_tdp_mmu_clear_dirty_slot(struct kvm *kvm,
         return spte_set;
 }
 
-/*
- * Clears the dirty status of all the 4k SPTEs mapping GFNs for which a bit is
- * set in mask, starting at gfn. The given memslot is expected to contain all
- * the GFNs represented by set bits in the mask. If AD bits are enabled,
- * clearing the dirty status will involve clearing the dirty bit on each SPTE
- * or, if AD bits are not enabled, clearing the writable bit on each SPTE.
- */
 static void clear_dirty_pt_masked(struct kvm *kvm, struct kvm_mmu_page *root,
                                   gfn_t gfn, unsigned long mask, bool wrprot)
 {
@@ -1620,11 +1654,9 @@ static void clear_dirty_pt_masked(struct kvm *kvm, struct kvm_mmu_page *root,
 }
 
 /*
- * Clears the dirty status of all the 4k SPTEs mapping GFNs for which a bit is
- * set in mask, starting at gfn. The given memslot is expected to contain all
- * the GFNs represented by set bits in the mask. If AD bits are enabled,
- * clearing the dirty status will involve clearing the dirty bit on each SPTE
- * or, if AD bits are not enabled, clearing the writable bit on each SPTE.
+ * Clear the dirty status (D-bit or W-bit) of all the 4k SPTEs mapping GFNs for
+ * which a bit is set in mask, starting at gfn. The given memslot is expected to
+ * contain all the GFNs represented by set bits in the mask.
  */
 void kvm_tdp_mmu_clear_dirty_pt_masked(struct kvm *kvm,
                                        struct kvm_memory_slot *slot,
@@ -1633,7 +1665,7 @@ void kvm_tdp_mmu_clear_dirty_pt_masked(struct kvm *kvm,
 {
         struct kvm_mmu_page *root;
 
-        for_each_tdp_mmu_root(kvm, root, slot->as_id)
+        for_each_valid_tdp_mmu_root(kvm, root, slot->as_id)
                 clear_dirty_pt_masked(kvm, root, gfn, mask, wrprot);
 }
 
@@ -1751,7 +1783,7 @@ bool kvm_tdp_mmu_write_protect_gfn(struct kvm *kvm,
         bool spte_set = false;
 
         lockdep_assert_held_write(&kvm->mmu_lock);
-        for_each_tdp_mmu_root(kvm, root, slot->as_id)
+        for_each_valid_tdp_mmu_root(kvm, root, slot->as_id)
                 spte_set |= write_protect_gfn(kvm, root, gfn, min_level);
 
         return spte_set;
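
The reworked iterator macros in the first two hunks treat a negative _as_id as a wildcard, which is what lets kvm_tdp_mmu_zap_leafs() pass -1 to cover every address space. The standalone sketch below (invented names, not code from this patch) shows the same convention, including the empty if/else trick the kernel macros use to skip filtered entries without breaking the for-loop shape:

/*
 * Standalone sketch (hypothetical names, not KVM code) of the "-1 matches
 * every address space" convention.  Entries that fail the filter fall into
 * the empty braces; everything else runs the loop body after "else".
 */
#include <stdio.h>

struct entry {
        int as_id;
        int value;
};

#define for_each_matching_entry(_e, _n, _i, _as_id)                     \
        for (_i = 0; _i < (_n); _i++)                                   \
                if ((_as_id) >= 0 && (_e)[_i].as_id != (_as_id)) {      \
                } else

int main(void)
{
        struct entry e[] = { { 0, 10 }, { 1, 20 }, { 0, 30 } };
        int i;

        for_each_matching_entry(e, 3, i, 0)     /* only address space 0 */
                printf("as0: %d\n", e[i].value);

        for_each_matching_entry(e, 3, i, -1)    /* -1: all address spaces */
                printf("any: %d\n", e[i].value);

        return 0;
}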
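
kvm_tdp_mmu_alloc_root() now follows a check / lock / recheck shape: look for a usable root optimistically, and only take tdp_mmu_pages_lock, re-searching under it, when nothing matched. The toy below (C11 atomics plus a pthread mutex, all names invented) mirrors that shape under simplifying assumptions; the mmu_lock read-lock layer is omitted, and release/acquire ordering on the list pointers stands in for the RCU publish that list_add_rcu() provides in the kernel:

/*
 * Toy version of the check / lock / recheck pattern; not KVM code.
 */
#include <pthread.h>
#include <stdatomic.h>
#include <stdlib.h>

struct root {
        unsigned long role_word;        /* stands in for role.word */
        struct root *_Atomic next;
};

static struct root *_Atomic roots;      /* list head, ~tdp_mmu_roots */
static pthread_mutex_t pages_lock = PTHREAD_MUTEX_INITIALIZER;

static struct root *find_root(unsigned long role_word)
{
        struct root *r;

        for (r = atomic_load_explicit(&roots, memory_order_acquire); r;
             r = atomic_load_explicit(&r->next, memory_order_acquire)) {
                if (r->role_word == role_word)
                        return r;
        }
        return NULL;
}

struct root *get_or_create_root(unsigned long role_word)
{
        struct root *r;

        /* Fast path: a concurrent caller may already have created it. */
        r = find_root(role_word);
        if (r)
                return r;

        /*
         * Slow path: recheck under the lock, another thread may have won
         * the race between the check above and taking the lock.
         */
        pthread_mutex_lock(&pages_lock);
        r = find_root(role_word);
        if (!r) {
                r = calloc(1, sizeof(*r));
                r->role_word = role_word;
                atomic_store_explicit(&r->next,
                                      atomic_load_explicit(&roots, memory_order_relaxed),
                                      memory_order_relaxed);
                /* Publish only after the node is fully initialized. */
                atomic_store_explicit(&roots, r, memory_order_release);
        }
        pthread_mutex_unlock(&pages_lock);
        return r;
}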
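
The refcount_set(&root->tdp_mmu_root_count, 2) in the alloc path means a root is born with two references: one for the vCPU that allocated it and one gifted to the TDP MMU itself, which kvm_tdp_mmu_zap_invalidated_roots() puts later. A compact, generic illustration of that lifetime rule, again with invented names:

/*
 * Sketch of the "born with two references" scheme; not KVM code.  The
 * object cannot be freed until both owners, the creator and the tracking
 * structure it was gifted to, drop their references.
 */
#include <stdatomic.h>
#include <stdlib.h>

struct obj {
        atomic_int refcount;
};

static struct obj *obj_new(void)
{
        struct obj *o = calloc(1, sizeof(*o));

        /* One reference for the caller, one gifted to the tracking list. */
        atomic_init(&o->refcount, 2);
        return o;
}

static void obj_put(struct obj *o)
{
        /* Free only when the last owner drops its reference. */
        if (atomic_fetch_sub(&o->refcount, 1) == 1)
                free(o);
}

int main(void)
{
        struct obj *root = obj_new();

        obj_put(root);  /* e.g. the creator is done with it */
        obj_put(root);  /* e.g. the invalidation path puts the gifted ref */
        return 0;
}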
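
The rewritten comment in tdp_mmu_zap_root() describes zapping in passes of decreasing granularity (4KiB, then 2MiB, then 1GiB, then the root): a recursive teardown of one subtree cannot yield midway, so freeing small subtrees first keeps the work between rescheduling points bounded. The toy below demonstrates the same trick on a generic binary tree; maybe_resched() and all other names are invented:

/*
 * Toy illustration of multi-pass teardown; not KVM code.  Each pass frees
 * only subtrees rooted at one level, so maybe_resched() runs at least once
 * per small subtree instead of once per whole tree; later passes then find
 * mostly empty interior nodes.
 */
#include <stdlib.h>

struct node {
        struct node *child[2];
        int level;              /* 0 == leaf, mirrors page-table levels */
};

static void maybe_resched(void)
{
        /* Kernel analogue: need_resched() / mmu_lock contention checks. */
}

static struct node *build(int level)
{
        struct node *n = calloc(1, sizeof(*n));

        n->level = level;
        if (level) {
                n->child[0] = build(level - 1);
                n->child[1] = build(level - 1);
        }
        return n;
}

static void free_subtree(struct node *n)
{
        if (!n)
                return;
        free_subtree(n->child[0]);
        free_subtree(n->child[1]);
        free(n);
}

/* One pass: detach and free every subtree rooted exactly at 'target' level. */
static void zap_pass(struct node *n, int target)
{
        int i;

        if (!n || n->level <= target)
                return;

        for (i = 0; i < 2; i++) {
                if (n->level == target + 1) {
                        free_subtree(n->child[i]);
                        n->child[i] = NULL;
                        maybe_resched();        /* bounded work since last check */
                } else {
                        zap_pass(n->child[i], target);
                }
        }
}

int main(void)
{
        struct node *root = build(4);

        /* Decreasing granularity, mirroring 4KiB => 2MiB => 1GiB => root. */
        zap_pass(root, 0);
        zap_pass(root, 1);
        zap_pass(root, 2);
        free_subtree(root);
        return 0;
}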