kvm: mmu: ITLB_MULTIHIT mitigation
With some Intel processors, putting the same virtual address in the TLB
as both a 4 KiB and 2 MiB page can confuse the instruction fetch unit
and cause the processor to issue a machine check resulting in a CPU lockup.

Unfortunately when EPT page tables use huge pages, it is possible for a
malicious guest to cause this situation.

Add a knob to mark huge pages as non-executable. When the nx_huge_pages
parameter is enabled (and we are using EPT), all huge pages are marked as
NX. If the guest attempts to execute in one of those pages, the page is
broken down into 4K pages, which are then marked executable.

This is not an issue for shadow paging (except nested EPT), because then
the host is in control of TLB flushes and the problematic situation cannot
happen.  With nested EPT the nested guest can again cause the problem, so
shadow and direct EPT are treated in the same way.

[ tglx: Fixup default to auto and massage wording a bit ]

Originally-by: Junaid Shahid <junaids@google.com>
Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
bonzini authored and KAGA-KOKO committed Nov 4, 2019
1 parent 731dc9d commit b8e8c83
Showing 6 changed files with 200 additions and 13 deletions.
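
Before the per-file hunks, a minimal, self-contained C sketch of the two
halves of the mitigation, condensed from the mmu.c changes below. This is an
illustration only: the types and constant values are stubbed so that it
compiles outside the kernel, and the helper names simply mirror the diff.

#include <stdbool.h>
#include <stdio.h>

/* Stubbed constants; the values mirror KVM's but are illustrative here. */
#define PT_PAGE_TABLE_LEVEL 1        /* the 4 KiB leaf level */
#define ACC_EXEC_MASK       0x1
#define PFERR_FETCH_MASK    (1u << 4)

static bool nx_huge_pages = true;    /* the new kvm.nx_huge_pages knob */

static bool is_nx_huge_page_enabled(void)
{
	return nx_huge_pages;
}

/* Half 1 (cf. set_spte): an executable mapping above the 4 KiB level
 * loses its exec permission, so huge pages can never be fetched from. */
static unsigned int adjust_pte_access(int level, unsigned int pte_access)
{
	if (level > PT_PAGE_TABLE_LEVEL && (pte_access & ACC_EXEC_MASK) &&
	    is_nx_huge_page_enabled())
		pte_access &= ~ACC_EXEC_MASK;
	return pte_access;
}

/* Half 2 (cf. nonpaging_map/tdp_page_fault): the instruction-fetch fault
 * that follows forces the region to be remapped with 4 KiB pages, which
 * are then allowed to be executable. */
static bool fetch_fault_forces_4k(unsigned int error_code)
{
	return (error_code & PFERR_FETCH_MASK) && is_nx_huge_page_enabled();
}

int main(void)
{
	/* A huge (level 2) page that asked for rwx comes back rw-. */
	printf("access after adjust: %#x\n", adjust_pte_access(2, 0x7));
	/* The subsequent fetch fault is satisfied with 4 KiB pages. */
	printf("split to 4K: %d\n", fetch_fault_forces_4k(PFERR_FETCH_MASK));
	return 0;
}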
19 changes: 19 additions & 0 deletions Documentation/admin-guide/kernel-parameters.txt
@@ -2055,6 +2055,19 @@
KVM MMU at runtime.
Default is 0 (off)

kvm.nx_huge_pages=
[KVM] Controls the software workaround for the
X86_BUG_ITLB_MULTIHIT bug.
force : Always deploy workaround.
off : Never deploy workaround.
auto : Deploy workaround based on the presence of
X86_BUG_ITLB_MULTIHIT.

Default is 'auto'.

If the software workaround is enabled for the host,
guests do not need to enable it for nested guests.
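
For illustration (not part of the patch): because the knob is a
module parameter with mode 0644 (see the mmu.c hunk below), it
can be set on the kernel command line or flipped at runtime
through sysfs, assuming KVM is built as the usual kvm module:

    kvm.nx_huge_pages=force                                  (boot time)
    # echo force >/sys/module/kvm/parameters/nx_huge_pages   (runtime)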

kvm-amd.nested= [KVM,AMD] Allow nested virtualization in KVM/SVM.
Default is 1 (enabled)

@@ -2637,6 +2650,12 @@
l1tf=off [X86]
mds=off [X86]
tsx_async_abort=off [X86]
kvm.nx_huge_pages=off [X86]

Exceptions:
This does not have any effect on
kvm.nx_huge_pages when
kvm.nx_huge_pages=force.

auto (default)
Mitigate all CPU vulnerabilities, but leave SMT
2 changes: 2 additions & 0 deletions arch/x86/include/asm/kvm_host.h
@@ -315,6 +315,7 @@ struct kvm_mmu_page {
bool unsync;
u8 mmu_valid_gen;
bool mmio_cached;
bool lpage_disallowed; /* Can't be replaced by an equiv large page */

/*
* The following two entries are used to key the shadow page in the
@@ -946,6 +947,7 @@ struct kvm_vm_stat {
ulong mmu_unsync;
ulong remote_tlb_flush;
ulong lpages;
ulong nx_lpage_splits;
ulong max_mmu_page_hash_collisions;
};

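A hedged aside: kvm_vm_stat counters are exported through KVM's debugfs
statistics, so the new split counter becomes readable there once registered;
the x86.c hunk doing the registration is among the files not loaded below,
and the entry name used here is an assumption:

    # cat /sys/kernel/debug/kvm/nx_largepages_splitted
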
13 changes: 12 additions & 1 deletion arch/x86/kernel/cpu/bugs.c
@@ -1257,6 +1257,9 @@ void x86_spec_ctrl_setup_ap(void)
x86_amd_ssb_disable();
}

bool itlb_multihit_kvm_mitigation;
EXPORT_SYMBOL_GPL(itlb_multihit_kvm_mitigation);

#undef pr_fmt
#define pr_fmt(fmt) "L1TF: " fmt

@@ -1412,17 +1415,25 @@ static ssize_t l1tf_show_state(char *buf)
l1tf_vmx_states[l1tf_vmx_mitigation],
sched_smt_active() ? "vulnerable" : "disabled");
}

static ssize_t itlb_multihit_show_state(char *buf)
{
if (itlb_multihit_kvm_mitigation)
return sprintf(buf, "KVM: Mitigation: Split huge pages\n");
else
return sprintf(buf, "KVM: Vulnerable\n");
}
#else
static ssize_t l1tf_show_state(char *buf)
{
return sprintf(buf, "%s\n", L1TF_DEFAULT_MSG);
}
-#endif
+
+static ssize_t itlb_multihit_show_state(char *buf)
+{
+return sprintf(buf, "Processor vulnerable\n");
+}
+#endif

static ssize_t mds_show_state(char *buf)
{
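For illustration (not part of the patch): the two strings above back the
itlb_multihit entry that this series adds under the CPU vulnerabilities
directory in sysfs (the cpu_show_itlb_multihit plumbing lands in a separate
commit), so on a mitigated host one would expect:

    $ cat /sys/devices/system/cpu/vulnerabilities/itlb_multihit
    KVM: Mitigation: Split huge pages
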
141 changes: 135 additions & 6 deletions arch/x86/kvm/mmu.c
@@ -47,6 +47,20 @@
#include <asm/kvm_page_track.h>
#include "trace.h"

extern bool itlb_multihit_kvm_mitigation;

static int __read_mostly nx_huge_pages = -1;

static int set_nx_huge_pages(const char *val, const struct kernel_param *kp);

static struct kernel_param_ops nx_huge_pages_ops = {
.set = set_nx_huge_pages,
.get = param_get_bool,
};

module_param_cb(nx_huge_pages, &nx_huge_pages_ops, &nx_huge_pages, 0644);
__MODULE_PARM_TYPE(nx_huge_pages, "bool");

/*
* When setting this variable to true it enables Two-Dimensional-Paging
* where the hardware walks 2 page tables:
@@ -352,6 +366,11 @@ static inline bool spte_ad_need_write_protect(u64 spte)
return (spte & SPTE_SPECIAL_MASK) != SPTE_AD_ENABLED_MASK;
}

static bool is_nx_huge_page_enabled(void)
{
return READ_ONCE(nx_huge_pages);
}

static inline u64 spte_shadow_accessed_mask(u64 spte)
{
MMU_WARN_ON(is_mmio_spte(spte));
@@ -1190,6 +1209,15 @@ static void account_shadowed(struct kvm *kvm, struct kvm_mmu_page *sp)
kvm_mmu_gfn_disallow_lpage(slot, gfn);
}

static void account_huge_nx_page(struct kvm *kvm, struct kvm_mmu_page *sp)
{
if (sp->lpage_disallowed)
return;

++kvm->stat.nx_lpage_splits;
sp->lpage_disallowed = true;
}

static void unaccount_shadowed(struct kvm *kvm, struct kvm_mmu_page *sp)
{
struct kvm_memslots *slots;
@@ -1207,6 +1235,12 @@ static void unaccount_shadowed(struct kvm *kvm, struct kvm_mmu_page *sp)
kvm_mmu_gfn_allow_lpage(slot, gfn);
}

static void unaccount_huge_nx_page(struct kvm *kvm, struct kvm_mmu_page *sp)
{
--kvm->stat.nx_lpage_splits;
sp->lpage_disallowed = false;
}

static bool __mmu_gfn_lpage_is_disallowed(gfn_t gfn, int level,
struct kvm_memory_slot *slot)
{
@@ -2792,6 +2826,9 @@ static bool __kvm_mmu_prepare_zap_page(struct kvm *kvm,
kvm_reload_remote_mmus(kvm);
}

if (sp->lpage_disallowed)
unaccount_huge_nx_page(kvm, sp);

sp->role.invalid = 1;
return list_unstable;
}
@@ -3013,6 +3050,11 @@ static int set_spte(struct kvm_vcpu *vcpu, u64 *sptep,
if (!speculative)
spte |= spte_shadow_accessed_mask(spte);

if (level > PT_PAGE_TABLE_LEVEL && (pte_access & ACC_EXEC_MASK) &&
is_nx_huge_page_enabled()) {
pte_access &= ~ACC_EXEC_MASK;
}

if (pte_access & ACC_EXEC_MASK)
spte |= shadow_x_mask;
else
@@ -3233,9 +3275,32 @@ static void direct_pte_prefetch(struct kvm_vcpu *vcpu, u64 *sptep)
__direct_pte_prefetch(vcpu, sp, sptep);
}

static void disallowed_hugepage_adjust(struct kvm_shadow_walk_iterator it,
gfn_t gfn, kvm_pfn_t *pfnp, int *levelp)
{
int level = *levelp;
u64 spte = *it.sptep;

if (it.level == level && level > PT_PAGE_TABLE_LEVEL &&
is_nx_huge_page_enabled() &&
is_shadow_present_pte(spte) &&
!is_large_pte(spte)) {
/*
* A small SPTE exists for this pfn, but FNAME(fetch)
* and __direct_map would like to create a large PTE
* instead: just force them to go down another level,
* patching the next 9 bits of the address back into
* pfn on their behalf.
*/
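/*
* Example: for a 2 MiB page, level == 2 and page_mask below is
* KVM_PAGES_PER_HPAGE(2) - KVM_PAGES_PER_HPAGE(1) = 512 - 1 = 0x1ff,
* i.e. the nine gfn bits that select one 4 KiB page within the
* 2 MiB region.
*/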
u64 page_mask = KVM_PAGES_PER_HPAGE(level) - KVM_PAGES_PER_HPAGE(level - 1);
*pfnp |= gfn & page_mask;
(*levelp)--;
}
}

static int __direct_map(struct kvm_vcpu *vcpu, gpa_t gpa, int write,
int map_writable, int level, kvm_pfn_t pfn,
-bool prefault)
+bool prefault, bool lpage_disallowed)
{
struct kvm_shadow_walk_iterator it;
struct kvm_mmu_page *sp;
@@ -3248,6 +3313,12 @@ static int __direct_map(struct kvm_vcpu *vcpu, gpa_t gpa, int write,

trace_kvm_mmu_spte_requested(gpa, level, pfn);
for_each_shadow_entry(vcpu, gpa, it) {
/*
* We cannot overwrite existing page tables with an NX
* large page, as the leaf could be executable.
*/
disallowed_hugepage_adjust(it, gfn, &pfn, &level);

base_gfn = gfn & ~(KVM_PAGES_PER_HPAGE(it.level) - 1);
if (it.level == level)
break;
@@ -3258,6 +3329,8 @@ static int __direct_map(struct kvm_vcpu *vcpu, gpa_t gpa, int write,
it.level - 1, true, ACC_ALL);

link_shadow_page(vcpu, it.sptep, sp);
if (lpage_disallowed)
account_huge_nx_page(vcpu->kvm, sp);
}
}

@@ -3550,11 +3623,14 @@ static int nonpaging_map(struct kvm_vcpu *vcpu, gva_t v, u32 error_code,
{
int r;
int level;
-bool force_pt_level = false;
+bool force_pt_level;
kvm_pfn_t pfn;
unsigned long mmu_seq;
bool map_writable, write = error_code & PFERR_WRITE_MASK;
bool lpage_disallowed = (error_code & PFERR_FETCH_MASK) &&
is_nx_huge_page_enabled();

force_pt_level = lpage_disallowed;
level = mapping_level(vcpu, gfn, &force_pt_level);
if (likely(!force_pt_level)) {
/*
@@ -3588,7 +3664,8 @@ static int nonpaging_map(struct kvm_vcpu *vcpu, gva_t v, u32 error_code,
goto out_unlock;
if (likely(!force_pt_level))
transparent_hugepage_adjust(vcpu, gfn, &pfn, &level);
-r = __direct_map(vcpu, v, write, map_writable, level, pfn, prefault);
+r = __direct_map(vcpu, v, write, map_writable, level, pfn,
+prefault, false);
out_unlock:
spin_unlock(&vcpu->kvm->mmu_lock);
kvm_release_pfn_clean(pfn);
@@ -4174,6 +4251,8 @@ static int tdp_page_fault(struct kvm_vcpu *vcpu, gva_t gpa, u32 error_code,
unsigned long mmu_seq;
int write = error_code & PFERR_WRITE_MASK;
bool map_writable;
bool lpage_disallowed = (error_code & PFERR_FETCH_MASK) &&
is_nx_huge_page_enabled();

MMU_WARN_ON(!VALID_PAGE(vcpu->arch.mmu->root_hpa));

@@ -4184,8 +4263,9 @@ static int tdp_page_fault(struct kvm_vcpu *vcpu, gva_t gpa, u32 error_code,
if (r)
return r;

-force_pt_level = !check_hugepage_cache_consistency(vcpu, gfn,
-PT_DIRECTORY_LEVEL);
+force_pt_level =
+lpage_disallowed ||
+!check_hugepage_cache_consistency(vcpu, gfn, PT_DIRECTORY_LEVEL);
level = mapping_level(vcpu, gfn, &force_pt_level);
if (likely(!force_pt_level)) {
if (level > PT_DIRECTORY_LEVEL &&
@@ -4214,7 +4294,8 @@ static int tdp_page_fault(struct kvm_vcpu *vcpu, gva_t gpa, u32 error_code,
goto out_unlock;
if (likely(!force_pt_level))
transparent_hugepage_adjust(vcpu, gfn, &pfn, &level);
-r = __direct_map(vcpu, gpa, write, map_writable, level, pfn, prefault);
+r = __direct_map(vcpu, gpa, write, map_writable, level, pfn,
+prefault, lpage_disallowed);
out_unlock:
spin_unlock(&vcpu->kvm->mmu_lock);
kvm_release_pfn_clean(pfn);
@@ -6155,10 +6236,58 @@ static void kvm_set_mmio_spte_mask(void)
kvm_mmu_set_mmio_spte_mask(mask, mask, ACC_WRITE_MASK | ACC_USER_MASK);
}

static bool get_nx_auto_mode(void)
{
/* Return true when CPU has the bug, and mitigations are ON */
return boot_cpu_has_bug(X86_BUG_ITLB_MULTIHIT) && !cpu_mitigations_off();
}

static void __set_nx_huge_pages(bool val)
{
nx_huge_pages = itlb_multihit_kvm_mitigation = val;
}

static int set_nx_huge_pages(const char *val, const struct kernel_param *kp)
{
bool old_val = nx_huge_pages;
bool new_val;

/* In "auto" mode deploy workaround only if CPU has the bug. */
if (sysfs_streq(val, "off"))
new_val = 0;
else if (sysfs_streq(val, "force"))
new_val = 1;
else if (sysfs_streq(val, "auto"))
new_val = get_nx_auto_mode();
else if (strtobool(val, &new_val) < 0)
return -EINVAL;

__set_nx_huge_pages(new_val);

if (new_val != old_val) {
struct kvm *kvm;
int idx;

mutex_lock(&kvm_lock);

list_for_each_entry(kvm, &vm_list, vm_list) {
idx = srcu_read_lock(&kvm->srcu);
kvm_mmu_zap_all_fast(kvm);
srcu_read_unlock(&kvm->srcu, idx);
}
mutex_unlock(&kvm_lock);
}

return 0;
}

int kvm_mmu_module_init(void)
{
int ret = -ENOMEM;

if (nx_huge_pages == -1)
__set_nx_huge_pages(get_nx_auto_mode());

/*
* MMU roles use union aliasing which is, generally speaking, an
* undefined behavior. However, we supposedly know how compilers behave
(Diffs for the remaining two changed files were not loaded.)
