summaryrefslogtreecommitdiffstats
path: root/Documentation/virt/kvm
diff options
context:
space:
mode:
authorDaniel Baumann <daniel.baumann@progress-linux.org>2024-05-18 17:39:57 +0000
committerDaniel Baumann <daniel.baumann@progress-linux.org>2024-05-18 17:39:57 +0000
commitdc50eab76b709d68175a358d6e23a5a3890764d3 (patch)
treec754d0390db060af0213ff994f0ac310e4cfd6e9 /Documentation/virt/kvm
parentAdding debian version 6.6.15-2. (diff)
downloadlinux-dc50eab76b709d68175a358d6e23a5a3890764d3.tar.xz
linux-dc50eab76b709d68175a358d6e23a5a3890764d3.zip
Merging upstream version 6.7.7.
Signed-off-by: Daniel Baumann <daniel.baumann@progress-linux.org>
Diffstat (limited to 'Documentation/virt/kvm')
-rw-r--r--Documentation/virt/kvm/api.rst158
-rw-r--r--Documentation/virt/kvm/arm/index.rst1
-rw-r--r--Documentation/virt/kvm/arm/vcpu-features.rst48
-rw-r--r--Documentation/virt/kvm/devices/arm-vgic-v3.rst7
-rw-r--r--Documentation/virt/kvm/x86/mmu.rst43
5 files changed, 230 insertions, 27 deletions
diff --git a/Documentation/virt/kvm/api.rst b/Documentation/virt/kvm/api.rst
index 21a7578142..7025b37510 100644
--- a/Documentation/virt/kvm/api.rst
+++ b/Documentation/virt/kvm/api.rst
@@ -416,6 +416,13 @@ Reads the general purpose registers from the vcpu.
__u64 pc;
};
+ /* LoongArch */
+ struct kvm_regs {
+ /* out (KVM_GET_REGS) / in (KVM_SET_REGS) */
+ unsigned long gpr[32];
+ unsigned long pc;
+ };
+
4.12 KVM_SET_REGS
-----------------
@@ -506,7 +513,7 @@ translation mode.
------------------
:Capability: basic
-:Architectures: x86, ppc, mips, riscv
+:Architectures: x86, ppc, mips, riscv, loongarch
:Type: vcpu ioctl
:Parameters: struct kvm_interrupt (in)
:Returns: 0 on success, negative on failure.
@@ -540,7 +547,7 @@ ioctl is useful if the in-kernel PIC is not used.
PPC:
^^^^
-Queues an external interrupt to be injected. This ioctl is overleaded
+Queues an external interrupt to be injected. This ioctl is overloaded
with 3 different irq values:
a) KVM_INTERRUPT_SET
@@ -592,6 +599,14 @@ b) KVM_INTERRUPT_UNSET
This is an asynchronous vcpu ioctl and can be invoked from any thread.
+LOONGARCH:
+^^^^^^^^^^
+
+Queues an external interrupt to be injected into the virtual CPU. A negative
+interrupt number dequeues the interrupt.
+
+This is an asynchronous vcpu ioctl and can be invoked from any thread.
+
4.17 KVM_DEBUG_GUEST
--------------------
@@ -737,7 +752,7 @@ signal mask.
----------------
:Capability: basic
-:Architectures: x86
+:Architectures: x86, loongarch
:Type: vcpu ioctl
:Parameters: struct kvm_fpu (out)
:Returns: 0 on success, -1 on error
@@ -746,7 +761,7 @@ Reads the floating point state from the vcpu.
::
- /* for KVM_GET_FPU and KVM_SET_FPU */
+ /* x86: for KVM_GET_FPU and KVM_SET_FPU */
struct kvm_fpu {
__u8 fpr[8][16];
__u16 fcw;
@@ -761,12 +776,21 @@ Reads the floating point state from the vcpu.
__u32 pad2;
};
+ /* LoongArch: for KVM_GET_FPU and KVM_SET_FPU */
+ struct kvm_fpu {
+ __u32 fcsr;
+ __u64 fcc;
+ struct kvm_fpureg {
+ __u64 val64[4];
+ }fpr[32];
+ };
+
4.23 KVM_SET_FPU
----------------
:Capability: basic
-:Architectures: x86
+:Architectures: x86, loongarch
:Type: vcpu ioctl
:Parameters: struct kvm_fpu (in)
:Returns: 0 on success, -1 on error
@@ -775,7 +799,7 @@ Writes the floating point state to the vcpu.
::
- /* for KVM_GET_FPU and KVM_SET_FPU */
+ /* x86: for KVM_GET_FPU and KVM_SET_FPU */
struct kvm_fpu {
__u8 fpr[8][16];
__u16 fcw;
@@ -790,6 +814,15 @@ Writes the floating point state to the vcpu.
__u32 pad2;
};
+ /* LoongArch: for KVM_GET_FPU and KVM_SET_FPU */
+ struct kvm_fpu {
+ __u32 fcsr;
+ __u64 fcc;
+ struct kvm_fpureg {
+ __u64 val64[4];
+ }fpr[32];
+ };
+
4.24 KVM_CREATE_IRQCHIP
-----------------------
@@ -965,7 +998,7 @@ be set in the flags field of this ioctl:
The KVM_XEN_HVM_CONFIG_INTERCEPT_HCALL flag requests KVM to generate
the contents of the hypercall page automatically; hypercalls will be
intercepted and passed to userspace through KVM_EXIT_XEN. In this
-ase, all of the blob size and address fields must be zero.
+case, all of the blob size and address fields must be zero.
The KVM_XEN_HVM_CONFIG_EVTCHN_SEND flag indicates to KVM that userspace
will always use the KVM_XEN_HVM_EVTCHN_SEND ioctl to deliver event
@@ -1070,7 +1103,7 @@ Other flags returned by ``KVM_GET_CLOCK`` are accepted but ignored.
:Extended by: KVM_CAP_INTR_SHADOW
:Architectures: x86, arm64
:Type: vcpu ioctl
-:Parameters: struct kvm_vcpu_event (out)
+:Parameters: struct kvm_vcpu_events (out)
:Returns: 0 on success, -1 on error
X86:
@@ -1193,7 +1226,7 @@ directly to the virtual CPU).
:Extended by: KVM_CAP_INTR_SHADOW
:Architectures: x86, arm64
:Type: vcpu ioctl
-:Parameters: struct kvm_vcpu_event (in)
+:Parameters: struct kvm_vcpu_events (in)
:Returns: 0 on success, -1 on error
X86:
@@ -1387,7 +1420,7 @@ documentation when it pops into existence).
-------------------
:Capability: KVM_CAP_ENABLE_CAP
-:Architectures: mips, ppc, s390, x86
+:Architectures: mips, ppc, s390, x86, loongarch
:Type: vcpu ioctl
:Parameters: struct kvm_enable_cap (in)
:Returns: 0 on success; -1 on error
@@ -1442,7 +1475,7 @@ for vm-wide capabilities.
---------------------
:Capability: KVM_CAP_MP_STATE
-:Architectures: x86, s390, arm64, riscv
+:Architectures: x86, s390, arm64, riscv, loongarch
:Type: vcpu ioctl
:Parameters: struct kvm_mp_state (out)
:Returns: 0 on success; -1 on error
@@ -1460,7 +1493,7 @@ Possible values are:
========================== ===============================================
KVM_MP_STATE_RUNNABLE the vcpu is currently running
- [x86,arm64,riscv]
+ [x86,arm64,riscv,loongarch]
KVM_MP_STATE_UNINITIALIZED the vcpu is an application processor (AP)
which has not yet received an INIT signal [x86]
KVM_MP_STATE_INIT_RECEIVED the vcpu has received an INIT signal, and is
@@ -1516,11 +1549,14 @@ For riscv:
The only states that are valid are KVM_MP_STATE_STOPPED and
KVM_MP_STATE_RUNNABLE which reflect if the vcpu is paused or not.
+On LoongArch, only the KVM_MP_STATE_RUNNABLE state is used to reflect
+whether the vcpu is runnable.
+
4.39 KVM_SET_MP_STATE
---------------------
:Capability: KVM_CAP_MP_STATE
-:Architectures: x86, s390, arm64, riscv
+:Architectures: x86, s390, arm64, riscv, loongarch
:Type: vcpu ioctl
:Parameters: struct kvm_mp_state (in)
:Returns: 0 on success; -1 on error
@@ -1538,6 +1574,9 @@ For arm64/riscv:
The only states that are valid are KVM_MP_STATE_STOPPED and
KVM_MP_STATE_RUNNABLE which reflect if the vcpu should be paused or not.
+On LoongArch, only the KVM_MP_STATE_RUNNABLE state is used to reflect
+whether the vcpu is runnable.
+
4.40 KVM_SET_IDENTITY_MAP_ADDR
------------------------------
@@ -2841,6 +2880,19 @@ Following are the RISC-V D-extension registers:
0x8020 0000 0600 0020 fcsr Floating point control and status register
======================= ========= =============================================
+LoongArch registers are mapped using the lower 32 bits. The upper 16 bits of
+that is the register group type.
+
+LoongArch csr registers are used to control guest cpu or get status of guest
+cpu, and they have the following id bit patterns::
+
+ 0x9030 0000 0001 00 <reg:5> <sel:3> (64-bit)
+
+LoongArch KVM control registers are used to implement some new defined functions
+such as set vcpu counter or reset vcpu, and they have the following id bit patterns::
+
+ 0x9030 0000 0002 <reg:16>
+
4.69 KVM_GET_ONE_REG
--------------------
@@ -3063,7 +3115,7 @@ as follow::
};
An entry with a "page_shift" of 0 is unused. Because the array is
-organized in increasing order, a lookup can stop when encoutering
+organized in increasing order, a lookup can stop when encountering
such an entry.
The "slb_enc" field provides the encoding to use in the SLB for the
@@ -3370,6 +3422,8 @@ return indicates the attribute is implemented. It does not necessarily
indicate that the attribute can be read or written in the device's
current state. "addr" is ignored.
+.. _KVM_ARM_VCPU_INIT:
+
4.82 KVM_ARM_VCPU_INIT
----------------------
@@ -3455,7 +3509,7 @@ Possible features:
- KVM_RUN and KVM_GET_REG_LIST are not available;
- KVM_GET_ONE_REG and KVM_SET_ONE_REG cannot be used to access
- the scalable archietctural SVE registers
+ the scalable architectural SVE registers
KVM_REG_ARM64_SVE_ZREG(), KVM_REG_ARM64_SVE_PREG() or
KVM_REG_ARM64_SVE_FFR;
@@ -4401,7 +4455,7 @@ This will have undefined effects on the guest if it has not already
placed itself in a quiescent state where no vcpu will make MMU enabled
memory accesses.
-On succsful completion, the pending HPT will become the guest's active
+On successful completion, the pending HPT will become the guest's active
HPT and the previous HPT will be discarded.
On failure, the guest will still be operating on its previous HPT.
@@ -5016,7 +5070,7 @@ before the vcpu is fully usable.
Between KVM_ARM_VCPU_INIT and KVM_ARM_VCPU_FINALIZE, the feature may be
configured by use of ioctls such as KVM_SET_ONE_REG. The exact configuration
-that should be performaned and how to do it are feature-dependent.
+that should be performed and how to do it are feature-dependent.
Other calls that depend on a particular feature being finalized, such as
KVM_RUN, KVM_GET_REG_LIST, KVM_GET_ONE_REG and KVM_SET_ONE_REG, will fail with
@@ -5124,6 +5178,24 @@ Valid values for 'action'::
#define KVM_PMU_EVENT_ALLOW 0
#define KVM_PMU_EVENT_DENY 1
+Via this API, KVM userspace can also control the behavior of the VM's fixed
+counters (if any) by configuring the "action" and "fixed_counter_bitmap" fields.
+
+Specifically, KVM follows the following pseudo-code when determining whether to
+allow the guest FixCtr[i] to count its pre-defined fixed event::
+
+ FixCtr[i]_is_allowed = (action == ALLOW) && (bitmap & BIT(i)) ||
+ (action == DENY) && !(bitmap & BIT(i));
+ FixCtr[i]_is_denied = !FixCtr[i]_is_allowed;
+
+KVM always consumes fixed_counter_bitmap, it's userspace's responsibility to
+ensure fixed_counter_bitmap is set correctly, e.g. if userspace wants to define
+a filter that only affects general purpose counters.
+
+Note, the "events" field also applies to fixed counters' hardcoded event_select
+and unit_mask values. "fixed_counter_bitmap" has higher priority than "events"
+if there is a contradiction between the two.
+
4.121 KVM_PPC_SVM_OFF
---------------------
@@ -5475,7 +5547,7 @@ KVM_XEN_ATTR_TYPE_EVTCHN
from the guest. A given sending port number may be directed back to
a specified vCPU (by APIC ID) / port / priority on the guest, or to
trigger events on an eventfd. The vCPU and priority can be changed
- by setting KVM_XEN_EVTCHN_UPDATE in a subsequent call, but but other
+ by setting KVM_XEN_EVTCHN_UPDATE in a subsequent call, but other
fields cannot change for a given sending port. A port mapping is
removed by using KVM_XEN_EVTCHN_DEASSIGN in the flags field. Passing
KVM_XEN_EVTCHN_RESET in the flags field removes all interception of
@@ -6070,6 +6142,56 @@ writes to the CNTVCT_EL0 and CNTPCT_EL0 registers using the SET_ONE_REG
interface. No error will be returned, but the resulting offset will not be
applied.
+.. _KVM_ARM_GET_REG_WRITABLE_MASKS:
+
+4.139 KVM_ARM_GET_REG_WRITABLE_MASKS
+-------------------------------------------
+
+:Capability: KVM_CAP_ARM_SUPPORTED_REG_MASK_RANGES
+:Architectures: arm64
+:Type: vm ioctl
+:Parameters: struct reg_mask_range (in/out)
+:Returns: 0 on success, < 0 on error
+
+
+::
+
+ #define KVM_ARM_FEATURE_ID_RANGE 0
+ #define KVM_ARM_FEATURE_ID_RANGE_SIZE (3 * 8 * 8)
+
+ struct reg_mask_range {
+ __u64 addr; /* Pointer to mask array */
+ __u32 range; /* Requested range */
+ __u32 reserved[13];
+ };
+
+This ioctl copies the writable masks for a selected range of registers to
+userspace.
+
+The ``addr`` field is a pointer to the destination array where KVM copies
+the writable masks.
+
+The ``range`` field indicates the requested range of registers.
+``KVM_CHECK_EXTENSION`` for the ``KVM_CAP_ARM_SUPPORTED_REG_MASK_RANGES``
+capability returns the supported ranges, expressed as a set of flags. Each
+flag's bit index represents a possible value for the ``range`` field.
+All other values are reserved for future use and KVM may return an error.
+
+The ``reserved[13]`` array is reserved for future use and should be 0, or
+KVM may return an error.
+
+KVM_ARM_FEATURE_ID_RANGE (0)
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+The Feature ID range is defined as the AArch64 System register space with
+op0==3, op1=={0, 1, 3}, CRn==0, CRm=={0-7}, op2=={0-7}.
+
+The mask returned array pointed to by ``addr`` is indexed by the macro
+``ARM64_FEATURE_ID_RANGE_IDX(op0, op1, crn, crm, op2)``, allowing userspace
+to know what fields can be changed for the system register described by
+``op0, op1, crn, crm, op2``. KVM rejects ID register values that describe a
+superset of the features supported by the system.
+
5. The kvm_run structure
========================
diff --git a/Documentation/virt/kvm/arm/index.rst b/Documentation/virt/kvm/arm/index.rst
index e848484321..7f231c724e 100644
--- a/Documentation/virt/kvm/arm/index.rst
+++ b/Documentation/virt/kvm/arm/index.rst
@@ -11,3 +11,4 @@ ARM
hypercalls
pvtime
ptp_kvm
+ vcpu-features
diff --git a/Documentation/virt/kvm/arm/vcpu-features.rst b/Documentation/virt/kvm/arm/vcpu-features.rst
new file mode 100644
index 0000000000..f7cc6d8d8b
--- /dev/null
+++ b/Documentation/virt/kvm/arm/vcpu-features.rst
@@ -0,0 +1,48 @@
+.. SPDX-License-Identifier: GPL-2.0
+
+===============================
+vCPU feature selection on arm64
+===============================
+
+KVM/arm64 provides two mechanisms that allow userspace to configure
+the CPU features presented to the guest.
+
+KVM_ARM_VCPU_INIT
+=================
+
+The ``KVM_ARM_VCPU_INIT`` ioctl accepts a bitmap of feature flags
+(``struct kvm_vcpu_init::features``). Features enabled by this interface are
+*opt-in* and may change/extend UAPI. See :ref:`KVM_ARM_VCPU_INIT` for complete
+documentation of the features controlled by the ioctl.
+
+Otherwise, all CPU features supported by KVM are described by the architected
+ID registers.
+
+The ID Registers
+================
+
+The Arm architecture specifies a range of *ID Registers* that describe the set
+of architectural features supported by the CPU implementation. KVM initializes
+the guest's ID registers to the maximum set of CPU features supported by the
+system. The ID register values may be VM-scoped in KVM, meaning that the
+values could be shared for all vCPUs in a VM.
+
+KVM allows userspace to *opt-out* of certain CPU features described by the ID
+registers by writing values to them via the ``KVM_SET_ONE_REG`` ioctl. The ID
+registers are mutable until the VM has started, i.e. userspace has called
+``KVM_RUN`` on at least one vCPU in the VM. Userspace can discover what fields
+are mutable in the ID registers using the ``KVM_ARM_GET_REG_WRITABLE_MASKS``.
+See the :ref:`ioctl documentation <KVM_ARM_GET_REG_WRITABLE_MASKS>` for more
+details.
+
+Userspace is allowed to *limit* or *mask* CPU features according to the rules
+outlined by the architecture in DDI0487J.a D19.1.3 'Principles of the ID
+scheme for fields in ID register'. KVM does not allow ID register values that
+exceed the capabilities of the system.
+
+.. warning::
+ It is **strongly recommended** that userspace modify the ID register values
+ before accessing the rest of the vCPU's CPU register state. KVM may use the
+ ID register values to control feature emulation. Interleaving ID register
+ modification with other system register accesses may lead to unpredictable
+ behavior.
diff --git a/Documentation/virt/kvm/devices/arm-vgic-v3.rst b/Documentation/virt/kvm/devices/arm-vgic-v3.rst
index 51e5e57625..5817edb4e0 100644
--- a/Documentation/virt/kvm/devices/arm-vgic-v3.rst
+++ b/Documentation/virt/kvm/devices/arm-vgic-v3.rst
@@ -59,6 +59,13 @@ Groups:
It is invalid to mix calls with KVM_VGIC_V3_ADDR_TYPE_REDIST and
KVM_VGIC_V3_ADDR_TYPE_REDIST_REGION attributes.
+ Note that to obtain reproducible results (the same VCPU being associated
+ with the same redistributor across a save/restore operation), VCPU creation
+ order, redistributor region creation order as well as the respective
+ interleaves of VCPU and region creation MUST be preserved. Any change in
+ either ordering may result in a different vcpu_id/redistributor association,
+ resulting in a VM that will fail to run at restore time.
+
Errors:
======= =============================================================
diff --git a/Documentation/virt/kvm/x86/mmu.rst b/Documentation/virt/kvm/x86/mmu.rst
index d47595b33f..2b3b6d4423 100644
--- a/Documentation/virt/kvm/x86/mmu.rst
+++ b/Documentation/virt/kvm/x86/mmu.rst
@@ -202,10 +202,22 @@ Shadow pages contain the following information:
Is 1 if the MMU instance cannot use A/D bits. EPT did not have A/D
bits before Haswell; shadow EPT page tables also cannot use A/D bits
if the L1 hypervisor does not enable them.
+ role.guest_mode:
+ Indicates the shadow page is created for a nested guest.
role.passthrough:
The page is not backed by a guest page table, but its first entry
points to one. This is set if NPT uses 5-level page tables (host
CR4.LA57=1) and is shadowing L1's 4-level NPT (L1 CR4.LA57=0).
+ mmu_valid_gen:
+ The MMU generation of this page, used to fast zap of all MMU pages within a
+ VM without blocking vCPUs too long. Specifically, KVM updates the per-VM
+ valid MMU generation which causes the mismatch of mmu_valid_gen for each mmu
+ page. This makes all existing MMU pages obsolete. Obsolete pages can't be
+ used. Therefore, vCPUs must load a new, valid root before re-entering the
+ guest. The MMU generation is only ever '0' or '1'. Note, the TDP MMU doesn't
+ use this field as non-root TDP MMU pages are reachable only from their
+ owning root. Thus it suffices for TDP MMU to use role.invalid in root pages
+ to invalidate all MMU pages.
gfn:
Either the guest page table containing the translations shadowed by this
page, or the base page frame for linear translations. See role.direct.
@@ -219,21 +231,30 @@ Shadow pages contain the following information:
at __pa(sp2->spt). sp2 will point back at sp1 through parent_pte.
The spt array forms a DAG structure with the shadow page as a node, and
guest pages as leaves.
- gfns:
- An array of 512 guest frame numbers, one for each present pte. Used to
- perform a reverse map from a pte to a gfn. When role.direct is set, any
- element of this array can be calculated from the gfn field when used, in
- this case, the array of gfns is not allocated. See role.direct and gfn.
- root_count:
- A counter keeping track of how many hardware registers (guest cr3 or
- pdptrs) are now pointing at the page. While this counter is nonzero, the
- page cannot be destroyed. See role.invalid.
+ shadowed_translation:
+ An array of 512 shadow translation entries, one for each present pte. Used
+ to perform a reverse map from a pte to a gfn as well as its access
+ permission. When role.direct is set, the shadow_translation array is not
+ allocated. This is because the gfn contained in any element of this array
+ can be calculated from the gfn field when used. In addition, when
+ role.direct is set, KVM does not track access permission for each of the
+ gfn. See role.direct and gfn.
+ root_count / tdp_mmu_root_count:
+ root_count is a reference counter for root shadow pages in Shadow MMU.
+ vCPUs elevate the refcount when getting a shadow page that will be used as
+ a root page, i.e. page that will be loaded into hardware directly (CR3,
+ PDPTRs, nCR3 EPTP). Root pages cannot be destroyed while their refcount is
+ non-zero. See role.invalid. tdp_mmu_root_count is similar but exclusively
+ used in TDP MMU as an atomic refcount.
parent_ptes:
The reverse mapping for the pte/ptes pointing at this page's spt. If
parent_ptes bit 0 is zero, only one spte points at this page and
parent_ptes points at this single spte, otherwise, there exists multiple
sptes pointing at this page and (parent_ptes & ~0x1) points at a data
structure with a list of parent sptes.
+ ptep:
+ The kernel virtual address of the SPTE that points at this shadow page.
+ Used exclusively by the TDP MMU, this field is a union with parent_ptes.
unsync:
If true, then the translations in this page may not match the guest's
translation. This is equivalent to the state of the tlb when a pte is
@@ -261,6 +282,10 @@ Shadow pages contain the following information:
since the last time the page table was actually used; if emulation
is triggered too frequently on this page, KVM will unmap the page
to avoid emulation in the future.
+ tdp_mmu_page:
+ Is 1 if the shadow page is a TDP MMU page. This variable is used to
+ bifurcate the control flows for KVM when walking any data structure that
+ may contain pages from both TDP MMU and shadow MMU.
Reverse map
===========