RE: [RESEND PATCH 5/6] KVM: x86/VMX: add kvm_vmx_reinject_nmi_irq() for NMI/IRQ reinjection
From: Li, Xin3
Date: Wed Nov 23 2022 - 22:41:14 EST
> > > thouh we'd like want a fair bit of refactoring so that all of
> > > vmx_vcpu_run() and
> > > svm_vcpu_run() don't need to be noinstr.
>
> For the record, svm_vcpu_run() is fine, at least as far as NMIs are concerned.
>
> > This sounds reasonable to me, however from
> > Documentation/core-api/entry.rst, we do need it.
>
> Why do you say that?
>
Copy/Paste from Documentation/core-api/entry.rst:
KVM
---
Entering or exiting guest mode is very similar to syscalls. From the host
kernel point of view the CPU goes off into user space when entering the
guest and returns to the kernel on exit.
kvm_guest_enter_irqoff() is a KVM-specific variant of exit_to_user_mode()
and kvm_guest_exit_irqoff() is the KVM variant of enter_from_user_mode().
The state operations have the same ordering.
Task work handling is done separately for guest at the boundary of the
vcpu_run() loop via xfer_to_guest_mode_handle_work() which is a subset of
the work handled on return to user space.
Do not nest KVM entry/exit transitions because doing so is nonsensical.
>
> I believe this is what we should end up with. Compile tested only, and needs to
> split up into 4+ patches. I'll test and massage this into a proper series next
> week (US holiday the rest of this week).
Great, thanks for doing it quickly.
Xin
>
> ---
> arch/x86/kvm/kvm_cache_regs.h | 16 +++++------
> arch/x86/kvm/vmx/vmenter.S | 4 +--
> arch/x86/kvm/vmx/vmx.c | 51 ++++++++++++++++++-----------------
> arch/x86/kvm/vmx/vmx.h | 2 +-
> arch/x86/kvm/x86.h | 6 ++---
> 5 files changed, 41 insertions(+), 38 deletions(-)
>
> diff --git a/arch/x86/kvm/kvm_cache_regs.h
> b/arch/x86/kvm/kvm_cache_regs.h index c09174f73a34..af9bd0374915
> 100644
> --- a/arch/x86/kvm/kvm_cache_regs.h
> +++ b/arch/x86/kvm/kvm_cache_regs.h
> @@ -50,26 +50,26 @@ BUILD_KVM_GPR_ACCESSORS(r15, R15)
> * 1 0 register in vcpu->arch
> * 1 1 register in vcpu->arch, needs to be stored back
> */
> -static inline bool kvm_register_is_available(struct kvm_vcpu *vcpu,
> - enum kvm_reg reg)
> +static __always_inline bool kvm_register_is_available(struct kvm_vcpu *vcpu,
> + enum kvm_reg reg)
> {
> return test_bit(reg, (unsigned long *)&vcpu->arch.regs_avail); }
>
> -static inline bool kvm_register_is_dirty(struct kvm_vcpu *vcpu,
> - enum kvm_reg reg)
> +static __always_inline bool kvm_register_is_dirty(struct kvm_vcpu *vcpu,
> + enum kvm_reg reg)
> {
> return test_bit(reg, (unsigned long *)&vcpu->arch.regs_dirty); }
>
> -static inline void kvm_register_mark_available(struct kvm_vcpu *vcpu,
> - enum kvm_reg reg)
> +static __always_inline void kvm_register_mark_available(struct kvm_vcpu
> *vcpu,
> + enum kvm_reg reg)
> {
> __set_bit(reg, (unsigned long *)&vcpu->arch.regs_avail); }
>
> -static inline void kvm_register_mark_dirty(struct kvm_vcpu *vcpu,
> - enum kvm_reg reg)
> +static __always_inline void kvm_register_mark_dirty(struct kvm_vcpu *vcpu,
> + enum kvm_reg reg)
> {
> __set_bit(reg, (unsigned long *)&vcpu->arch.regs_avail);
> __set_bit(reg, (unsigned long *)&vcpu->arch.regs_dirty); diff --git
> a/arch/x86/kvm/vmx/vmenter.S b/arch/x86/kvm/vmx/vmenter.S index
> 0b5db4de4d09..b104dfd282ed 100644
> --- a/arch/x86/kvm/vmx/vmenter.S
> +++ b/arch/x86/kvm/vmx/vmenter.S
> @@ -318,7 +318,7 @@ SYM_FUNC_START(vmread_error_trampoline)
> RET
> SYM_FUNC_END(vmread_error_trampoline)
>
> -SYM_FUNC_START(vmx_do_interrupt_nmi_irqoff)
> +SYM_FUNC_START(vmx_do_interrupt_irqoff)
> /*
> * Unconditionally create a stack frame, getting the correct RSP on the
> * stack (for x86-64) would take two instructions anyways, and RBP can
> @@ -349,4 +349,4 @@ SYM_FUNC_START(vmx_do_interrupt_nmi_irqoff)
> mov %_ASM_BP, %_ASM_SP
> pop %_ASM_BP
> RET
> -SYM_FUNC_END(vmx_do_interrupt_nmi_irqoff)
> +SYM_FUNC_END(vmx_do_interrupt_irqoff)
> diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c index
> cea8c07f5229..b721fde4f5af 100644
> --- a/arch/x86/kvm/vmx/vmx.c
> +++ b/arch/x86/kvm/vmx/vmx.c
> @@ -5064,8 +5064,13 @@ static int handle_exception_nmi(struct kvm_vcpu
> *vcpu)
> vect_info = vmx->idt_vectoring_info;
> intr_info = vmx_get_intr_info(vcpu);
>
> + /*
> + * Machine checks are handled by handle_exception_irqoff(), or by
> + * vmx_vcpu_run() if a #MC occurs on VM-Entry. NMIs are handled by
> + * vmx_vcpu_enter_exit().
> + */
> if (is_machine_check(intr_info) || is_nmi(intr_info))
> - return 1; /* handled by handle_exception_nmi_irqoff() */
> + return 1;
>
> /*
> * Queue the exception here instead of in handle_nm_fault_irqoff().
> @@ -6755,18 +6760,6 @@ static void vmx_apicv_post_state_restore(struct
> kvm_vcpu *vcpu)
> memset(vmx->pi_desc.pir, 0, sizeof(vmx->pi_desc.pir)); }
>
> -void vmx_do_interrupt_nmi_irqoff(unsigned long entry);
> -
> -static void handle_interrupt_nmi_irqoff(struct kvm_vcpu *vcpu,
> - unsigned long entry)
> -{
> - bool is_nmi = entry == (unsigned long)asm_exc_nmi_noist;
> -
> - kvm_before_interrupt(vcpu, is_nmi ? KVM_HANDLING_NMI :
> KVM_HANDLING_IRQ);
> - vmx_do_interrupt_nmi_irqoff(entry);
> - kvm_after_interrupt(vcpu);
> -}
> -
> static void handle_nm_fault_irqoff(struct kvm_vcpu *vcpu) {
> /*
> @@ -6787,9 +6780,8 @@ static void handle_nm_fault_irqoff(struct kvm_vcpu
> *vcpu)
> rdmsrl(MSR_IA32_XFD_ERR, vcpu->arch.guest_fpu.xfd_err); }
>
> -static void handle_exception_nmi_irqoff(struct vcpu_vmx *vmx)
> +static void handle_exception_irqoff(struct vcpu_vmx *vmx)
> {
> - const unsigned long nmi_entry = (unsigned long)asm_exc_nmi_noist;
> u32 intr_info = vmx_get_intr_info(&vmx->vcpu);
>
> /* if exit due to PF check for async PF */ @@ -6801,11 +6793,10 @@
> static void handle_exception_nmi_irqoff(struct vcpu_vmx *vmx)
> /* Handle machine checks before interrupts are enabled */
> else if (is_machine_check(intr_info))
> kvm_machine_check();
> - /* We need to handle NMIs before interrupts are enabled */
> - else if (is_nmi(intr_info))
> - handle_interrupt_nmi_irqoff(&vmx->vcpu, nmi_entry);
> }
>
> +void vmx_do_interrupt_irqoff(unsigned long entry);
> +
> static void handle_external_interrupt_irqoff(struct kvm_vcpu *vcpu) {
> u32 intr_info = vmx_get_intr_info(vcpu); @@ -6816,7 +6807,10 @@
> static void handle_external_interrupt_irqoff(struct kvm_vcpu *vcpu)
> "KVM: unexpected VM-Exit interrupt info: 0x%x", intr_info))
> return;
>
> - handle_interrupt_nmi_irqoff(vcpu, gate_offset(desc));
> + kvm_before_interrupt(vcpu, KVM_HANDLING_IRQ);
> + vmx_do_interrupt_irqoff(gate_offset(desc));
> + kvm_after_interrupt(vcpu);
> +
> vcpu->arch.at_instruction_boundary = true; }
>
> @@ -6830,7 +6824,7 @@ static void vmx_handle_exit_irqoff(struct kvm_vcpu
> *vcpu)
> if (vmx->exit_reason.basic == EXIT_REASON_EXTERNAL_INTERRUPT)
> handle_external_interrupt_irqoff(vcpu);
> else if (vmx->exit_reason.basic == EXIT_REASON_EXCEPTION_NMI)
> - handle_exception_nmi_irqoff(vmx);
> + handle_exception_irqoff(vmx);
> }
>
> /*
> @@ -7091,6 +7085,18 @@ static noinstr void vmx_vcpu_enter_exit(struct
> kvm_vcpu *vcpu,
>
> vmx_enable_fb_clear(vmx);
>
> + if (unlikely(vmx->fail))
> + vmx->exit_reason.full = 0xdead;
> + else
> + vmx->exit_reason.full = vmcs_read32(VM_EXIT_REASON);
> +
> + if ((u16)vmx->exit_reason.basic == EXIT_REASON_EXCEPTION_NMI &&
> + is_nmi(vmx_get_intr_info(vcpu))) {
> + kvm_before_interrupt(vcpu, KVM_HANDLING_NMI);
> + asm("int $2");
> + kvm_after_interrupt(vcpu);
> + }
> +
> guest_state_exit_irqoff();
> }
>
> @@ -7232,12 +7238,9 @@ static fastpath_t vmx_vcpu_run(struct kvm_vcpu
> *vcpu)
>
> vmx->idt_vectoring_info = 0;
>
> - if (unlikely(vmx->fail)) {
> - vmx->exit_reason.full = 0xdead;
> + if (unlikely(vmx->fail))
> return EXIT_FASTPATH_NONE;
> - }
>
> - vmx->exit_reason.full = vmcs_read32(VM_EXIT_REASON);
> if (unlikely((u16)vmx->exit_reason.basic ==
> EXIT_REASON_MCE_DURING_VMENTRY))
> kvm_machine_check();
>
> diff --git a/arch/x86/kvm/vmx/vmx.h b/arch/x86/kvm/vmx/vmx.h index
> a3da84f4ea45..eb52cfde5ec2 100644
> --- a/arch/x86/kvm/vmx/vmx.h
> +++ b/arch/x86/kvm/vmx/vmx.h
> @@ -680,7 +680,7 @@ static inline unsigned long vmx_get_exit_qual(struct
> kvm_vcpu *vcpu)
> return vmx->exit_qualification;
> }
>
> -static inline u32 vmx_get_intr_info(struct kvm_vcpu *vcpu)
> +static __always_inline u32 vmx_get_intr_info(struct kvm_vcpu *vcpu)
> {
> struct vcpu_vmx *vmx = to_vmx(vcpu);
>
> diff --git a/arch/x86/kvm/x86.h b/arch/x86/kvm/x86.h index
> 9de72586f406..44d1827f0a30 100644
> --- a/arch/x86/kvm/x86.h
> +++ b/arch/x86/kvm/x86.h
> @@ -382,13 +382,13 @@ enum kvm_intr_type {
> KVM_HANDLING_NMI,
> };
>
> -static inline void kvm_before_interrupt(struct kvm_vcpu *vcpu,
> - enum kvm_intr_type intr)
> +static __always_inline void kvm_before_interrupt(struct kvm_vcpu *vcpu,
> + enum kvm_intr_type intr)
> {
> WRITE_ONCE(vcpu->arch.handling_intr_from_guest, (u8)intr); }
>
> -static inline void kvm_after_interrupt(struct kvm_vcpu *vcpu)
> +static __always_inline void kvm_after_interrupt(struct kvm_vcpu *vcpu)
> {
> WRITE_ONCE(vcpu->arch.handling_intr_from_guest, 0); }
>
> base-commit: 0fa32dad1e78629cb42999dacd82489503fdf4c2
> --