
Commit 4192204

avagin authored and gvisor-bot committed
kvm: avoid mmio exits when Sentry faults on unmapped memory
Currently, we generate page tables for the entire Sentry address space. Consequently, when the Sentry faults on unmapped memory - meaning a memory region not yet mapped into the VM - an MMIO exit is triggered. The issue is that because the faulting instruction is emulated (instead of executed natively), it becomes impossible to trigger a "normal" memory fault. To solve this, we must set up page tables only for the regions that are explicitly mapped into the VM. This, however, is more challenging than it sounds, for several reasons:

* We map memory regions into the VM from a signal handler, where memory allocation is prohibited. This means all necessary page table entries must be allocated during platform initialization.

* Our memory regions are not aligned to huge page boundaries. Therefore, when mapping a memory slot, we often need to split huge pages and allocate new page table entries.

* We run into the nosplit stack limit. This requires us to introduce a PTE.Get method that safely avoids indexing slice entries directly, since an index expression can trigger a bounds-check panic, and the panic path requires a lot of extra stack.

PiperOrigin-RevId: 827726897
1 parent fec04fd commit 4192204

File tree

11 files changed: +156 −37 lines changed
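
Before the per-file diffs, it helps to restate the commit message's premise. A rough decision model in Go (illustrative only, not code from this commit; it assumes stock KVM behavior for guest-present mappings that have no backing memory slot):

// Illustrative model of how KVM disposes of a sentry memory access,
// per the reasoning in the commit message above.
func onSentryAccess(ptePresent, slotExists bool) string {
	switch {
	case ptePresent && slotExists:
		// Backed by a memory slot: the access runs natively.
		return "native access"
	case ptePresent && !slotExists:
		// Mapped in the guest page tables but not backed by a slot:
		// KVM emulates the faulting instruction as MMIO, so a
		// "normal" memory fault can never be raised.
		return "KVM_EXIT_MMIO (instruction emulated)"
	default:
		// Not present in the guest page tables: a regular guest #PF
		// is delivered and can be handled natively.
		return "guest page fault"
	}
}

The fix below therefore keeps guest page-table entries non-present until the corresponding region is actually mapped into the VM, so that the third case applies instead of the second.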

pkg/ring0/pagetables/BUILD

Lines changed: 1 addition & 0 deletions
@@ -49,6 +49,7 @@ go_library(
         "pagetables_aarch64.go",
         "pagetables_amd64.go",
         "pagetables_arm64.go",
+        "pagetables_unsafe.go",
         "pagetables_x86.go",
         "pcids.go",
         "pcids_aarch64.go",

pkg/ring0/pagetables/pagetables_aarch64.go

Lines changed: 3 additions & 0 deletions
@@ -91,6 +91,9 @@ type MapOpts struct {
 	// User indicates the page is a user page.
 	User bool
 
+	// Static indicates the entries should not be cleared/freed.
+	Static bool
+
 	// MemoryType is the memory type.
 	MemoryType hostarch.MemoryType
 }

pkg/ring0/pagetables/pagetables_unsafe.go

Lines changed: 26 additions & 0 deletions

@@ -0,0 +1,26 @@
+// Copyright 2025 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package pagetables
+
+import (
+	"unsafe"
+)
+
+// Get returns the entry with the specified index.
+//
+//go:nosplit
+func (p *PTEs) Get(idx uint16) *PTE {
+	return (*PTE)(unsafe.Pointer(uintptr(unsafe.Pointer(&p[0])) + 8*uintptr(idx)))
+}
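
The unchecked accessor exists because of the nosplit constraint called out in the commit message: indexing the PTEs array directly emits a bounds check, and the panic path behind that check needs more stack than deeply nested //go:nosplit walkers can spare. For contrast, a sketch of the checked form the walker used before this change (the 8-byte stride in Get matches the size of a PTE, which is a uintptr on these 64-bit targets; Get itself performs no range check, so callers must keep idx below the table size, which the walker guarantees by masking):

// The checked equivalent: the compiler inserts an implicit bounds
// check here, and its panic path is what pushes nosplit call chains
// over the stack limit.
func getChecked(p *PTEs, idx uint16) *PTE {
	return &p[idx] // implicit: if int(idx) >= len(p) { panic }
}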

pkg/ring0/pagetables/pagetables_x86.go

Lines changed: 5 additions & 2 deletions
@@ -73,6 +73,9 @@ type MapOpts struct {
 	// User indicates the page is a user page.
 	User bool
 
+	// Static indicates the entries should not be cleared/freed.
+	Static bool
+
 	// MemoryType is the memory type.
 	MemoryType hostarch.MemoryType
 }
@@ -91,7 +94,7 @@ func (p *PTE) Clear() {
 //
 //go:nosplit
 func (p *PTE) Valid() bool {
-	return atomic.LoadUintptr((*uintptr)(p))&present != 0
+	return atomic.LoadUintptr((*uintptr)(p)) != 0
 }
 
 // Opts returns the PTE options.
@@ -139,7 +142,7 @@ func (p *PTE) IsSuper() bool {
 //
 //go:nosplit
 func (p *PTE) Set(addr uintptr, opts MapOpts) {
-	if !opts.AccessType.Any() {
+	if !opts.AccessType.Any() && !opts.Static {
 		p.Clear()
 		return
 	}

pkg/ring0/pagetables/walker_amd64.go

Lines changed: 4 additions & 4 deletions
@@ -43,7 +43,7 @@ func (w *Walker) walkPTEs(entries *PTEs, start, end uintptr) (bool, uint16) {
 	var clearEntries uint16
 	for start < end {
 		pteIndex := uint16((start & pteMask) >> pteShift)
-		entry := &entries[pteIndex]
+		entry := entries.Get(pteIndex)
 		if !entry.Valid() && !w.visitor.requiresAlloc() {
 			clearEntries++
 			start += pteSize
@@ -81,7 +81,7 @@ func (w *Walker) walkPMDs(pmdEntries *PTEs, start, end uintptr) (bool, uint16) {
 	var pteEntries *PTEs
 	nextBoundary := addrEnd(start, end, pmdSize)
 	pmdIndex := uint16((start & pmdMask) >> pmdShift)
-	pmdEntry := &pmdEntries[pmdIndex]
+	pmdEntry := pmdEntries.Get(pmdIndex)
 	if !pmdEntry.Valid() {
 		if !w.visitor.requiresAlloc() {
 			// Skip over this entry.
@@ -173,7 +173,7 @@ func (w *Walker) walkPUDs(pudEntries *PTEs, start, end uintptr) (bool, uint16) {
 	var pmdEntries *PTEs
 	nextBoundary := addrEnd(start, end, pudSize)
 	pudIndex := uint16((start & pudMask) >> pudShift)
-	pudEntry := &pudEntries[pudIndex]
+	pudEntry := pudEntries.Get(pudIndex)
 	if !pudEntry.Valid() {
 		if !w.visitor.requiresAlloc() {
 			// Skip over this entry.
@@ -261,7 +261,7 @@ func (w *Walker) iterateRangeCanonical(start, end uintptr) bool {
 	var pudEntries *PTEs
 	nextBoundary := addrEnd(start, end, pgdSize)
 	pgdIndex := uint16((start & pgdMask) >> pgdShift)
-	pgdEntry := &w.pageTables.root[pgdIndex]
+	pgdEntry := w.pageTables.root.Get(pgdIndex)
 	if !w.pageTables.largeAddressesEnabled {
 		if !pgdEntry.Valid() {
 			if !w.visitor.requiresAlloc() {

pkg/sentry/platform/kvm/bluepill_fault.go

Lines changed: 14 additions & 7 deletions
@@ -25,11 +25,14 @@ import (
 var (
 	// faultBlockSize is the size used for servicing memory faults.
 	//
-	// This should be large enough to avoid frequent faults and avoid using
-	// all available KVM slots (~512), but small enough that KVM does not
-	// complain about slot sizes (~4GB). See handleBluepillFault for how
-	// this block is used.
-	faultBlockSize = uintptr(2 << 30)
+	// This should be large enough so that the total number of slots
+	// required to cover the 47-bit virtual address space does not exceed
+	// the KVM slot limit (e.g. 32764). Linux doesn't allocate virtual
+	// address space above 47-bit by default.
+	// It must be small enough to limit the memory overhead associated with
+	// KVM slot allocation. For example, using a 46-bit address space
+	// results in an overhead of ~250 MB.
+	faultBlockSize = uintptr(8 << 30)
 
 	// faultBlockMask is the mask for the fault blocks.
 	//
@@ -56,13 +59,17 @@ func calculateBluepillFault(physical uintptr) (virtualStart, physicalStart, length uintptr, pr *physicalRegion) {
 	}
 
 	// Adjust the block to match our size.
-	physicalStart = pr.physical + (alignedPhysical-pr.physical)&faultBlockMask
-	virtualStart = pr.virtual + (physicalStart - pr.physical)
+	physicalStart = pr.physical / faultBlockSize * faultBlockSize
+	physicalStart = physicalStart + (alignedPhysical-physicalStart)&faultBlockMask
 	physicalEnd := physicalStart + faultBlockSize
+	if physicalStart < pr.physical {
+		physicalStart = pr.physical
+	}
 	if physicalEnd > end {
 		physicalEnd = end
 	}
 	length = physicalEnd - physicalStart
+	virtualStart = pr.virtual + (physicalStart - pr.physical)
 	return virtualStart, physicalStart, length, &physicalRegions[i]
 }
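
The constants above can be sanity-checked against the numbers quoted in the comment (47-bit span, 32764-slot limit); a worked calculation, not code from the commit:

const (
	addressSpaceSize = uintptr(1) << 47 // default Linux user VA limit
	newBlockSize     = uintptr(8 << 30) // 2^33
	oldBlockSize     = uintptr(2 << 30) // 2^31

	// 2^47 / 2^33 = 2^14 = 16384 slots, comfortably below 32764.
	slotsWithNew = addressSpaceSize / newBlockSize
	// 2^47 / 2^31 = 2^16 = 65536 slots, which would exceed the limit.
	slotsWithOld = addressSpaceSize / oldBlockSize
)

Hence the block size grows from 2 GiB to 8 GiB now that slots may have to cover the whole 47-bit sentry address space rather than only the regions faulted in so far.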

pkg/sentry/platform/kvm/kvm_test.go

Lines changed: 2 additions & 0 deletions
@@ -39,6 +39,7 @@ import (
 var dummyFPState fpu.State
 
 type testHarness interface {
+	Logf(format string, args ...any)
 	Errorf(format string, args ...any)
 	Fatalf(format string, args ...any)
 }
@@ -146,6 +147,7 @@ func applicationTest(t testHarness, useHostMappings bool, targetFn uintptr, fn f
 			// done for regular user code, but is fine for test
 			// purposes.)
 			applyPhysicalRegions(func(pr physicalRegion) bool {
+				t.Logf("Map %x-%x", pr.virtual, pr.virtual+pr.length)
 				pt.Map(hostarch.Addr(pr.virtual), pr.length, pagetables.MapOpts{
 					AccessType: hostarch.AnyAccess,
 					User:       true,

pkg/sentry/platform/kvm/machine.go

Lines changed: 33 additions & 23 deletions
@@ -332,26 +332,33 @@ func newMachine(vm int, config *Config) (*machine, error) {
 		// faultBlockSize has to equal or less than KVM_MEM_MAX_NR_PAGES.
 		faultBlockSize = uintptr(1) << 42
 		faultBlockMask = ^uintptr(faultBlockSize - 1)
+		for _, r := range physicalRegions {
+			m.mapPhysical(r.physical, r.length)
+		}
 	} else {
+		// Apply the physical mappings. Note that these mappings may point to
+		// guest physical addresses that are not actually available. These
+		// physical pages are mapped on demand, see kernel_unsafe.go.
+		applyPhysicalRegions(func(pr physicalRegion) bool {
+			physical := pr.physical
+			for physical < pr.physical+pr.length {
+				virtualStart, physicalStart, length, _ := calculateBluepillFault(physical)
+				// Pre-allocate page tables in the lower half.
+				m.kernel.PageTables.Map(
+					hostarch.Addr(virtualStart),
+					length,
+					pagetables.MapOpts{Static: true},
+					physicalStart)
+				physical += length
+			}
+
+			return true // Keep iterating.
+		})
 		// Install seccomp rules to trap runtime mmap system calls. They will
 		// be handled by seccompMmapHandler.
 		seccompMmapRules(m)
 	}
 
-	// Apply the physical mappings. Note that these mappings may point to
-	// guest physical addresses that are not actually available. These
-	// physical pages are mapped on demand, see kernel_unsafe.go.
-	applyPhysicalRegions(func(pr physicalRegion) bool {
-		// Map everything in the lower half.
-		m.kernel.PageTables.Map(
-			hostarch.Addr(pr.virtual),
-			pr.length,
-			pagetables.MapOpts{AccessType: hostarch.ReadWrite},
-			pr.physical)
-
-		return true // Keep iterating.
-	})
-
 	// Ensure that the currently mapped virtual regions are actually
 	// available in the VM. Note that this doesn't guarantee no future
 	// faults, however it should guarantee that everything is available to
@@ -368,6 +375,9 @@ func newMachine(vm int, config *Config) (*machine, error) {
 			// Cap the length to the end of the area.
 			length = vr.virtual + vr.length - virtual
 		}
+		// Ensure the physical range is mapped.
+		m.mapPhysical(physical, length)
+
 		// Update page tables for executable mappings.
 		if vr.accessType.Execute {
 			if vr.accessType.Write {
@@ -380,8 +390,6 @@ func newMachine(vm int, config *Config) (*machine, error) {
 				physical)
 		}
 
-		// Ensure the physical range is mapped.
-		m.mapPhysical(physical, length)
 		virtual += length
 	}
 }
@@ -404,11 +412,6 @@ func newMachine(vm int, config *Config) (*machine, error) {
 		mapRegion(vr, 0)
 
 	})
-	if mapEntireAddressSpace {
-		for _, r := range physicalRegions {
-			m.mapPhysical(r.physical, r.length)
-		}
-	}
 	enableAsyncPreemption()
 	// Initialize architecture state.
 	if err := m.initArchState(); err != nil {
@@ -458,8 +461,15 @@ func (m *machine) mapPhysical(physical, length uintptr) {
 	}
 
 	// Is this already mapped? Check the usedSlots.
-	if !pr.mmio && !m.hasSlot(physicalStart) {
-		m.mapMemorySlot(virtualStart, physicalStart, length, pr.readOnly)
+	if !m.hasSlot(physicalStart) {
+		m.kernel.PageTables.Map(
+			hostarch.Addr(virtualStart),
+			length,
+			pagetables.MapOpts{AccessType: hostarch.ReadWrite},
+			physicalStart)
+		if !pr.mmio {
+			m.mapMemorySlot(virtualStart, physicalStart, length, pr.readOnly)
+		}
 	}
 
 	// Move to the next chunk.
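
Taken together, these hunks split the work between initialization (pre-allocating Static page-table entries for every physical region) and fault time. A simplified sketch of the fault-time half, mirroring the new mapPhysical body (the hasSlot deduplication and the chunk loop are omitted):

// Fault time (signal-handler context): nothing here allocates,
// because every intermediate page-table node for this range was
// created during initialization with MapOpts{Static: true}.
func serviceFault(m *machine, physical uintptr) {
	virtualStart, physicalStart, length, pr := calculateBluepillFault(physical)
	// Flip the pre-allocated entries to usable permissions.
	m.kernel.PageTables.Map(
		hostarch.Addr(virtualStart),
		length,
		pagetables.MapOpts{AccessType: hostarch.ReadWrite},
		physicalStart)
	if !pr.mmio {
		// Register the block with KVM so the next access runs
		// natively instead of taking an MMIO exit.
		m.mapMemorySlot(virtualStart, physicalStart, length, pr.readOnly)
	}
}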

pkg/sentry/platform/kvm/physical_map.go

Lines changed: 9 additions & 1 deletion
@@ -66,6 +66,7 @@ func fillAddressSpace() (specialRegions []specialVirtualRegion) {
 	pSize := uintptr(1) << ring0.PhysicalAddressBits
 	pSize -= reservedMemory
 
+	maxUserAddr := uintptr(0)
 	// Add specifically excluded regions; see excludeVirtualRegion.
 	if err := applyVirtualRegions(func(vr virtualRegion) {
 		if excludeVirtualRegion(vr) {
@@ -81,10 +82,17 @@ func fillAddressSpace() (specialRegions []specialVirtualRegion) {
 			})
 			log.Infof("mmio: virtual [%x,%x)", vr.virtual, vr.virtual+vr.length)
 		}
+		if vr.filename != "[vsyscall]" {
+			maxUserAddr = vr.region.virtual + vr.region.length
+		}
 	}); err != nil {
 		panic(fmt.Sprintf("error parsing /proc/self/maps: %v", err))
 	}
 
+	var archRegions []specialVirtualRegion
+	vSize, archRegions = archSpecialRegions(vSize, maxUserAddr)
+	specialRegions = append(specialRegions, archRegions...)
+
 	// Do we need any more work?
 	if vSize < pSize {
 		return specialRegions
@@ -109,7 +117,7 @@ func fillAddressSpace() (specialRegions []specialVirtualRegion) {
 	current := required // Attempted mmap size.
 	filled := uintptr(0)
 	suggestedAddr := uintptr(0)
-	if ring0.VirtualAddressBits > 48 {
+	if exendedAddressSpaceAllowed && ring0.VirtualAddressBits > 48 {
 		// Pass a hint address above 47 bits to indicate to the kernel that
 		// we can handle, and want, mappings above 47 bits:
 		// https://docs.kernel.org/arch/x86/x86_64/5level-paging.html#user-space-and-large-virtual-address-space.

pkg/sentry/platform/kvm/physical_map_amd64.go

Lines changed: 53 additions & 0 deletions
@@ -14,9 +14,62 @@
 
 package kvm
 
+import (
+	"gvisor.dev/gvisor/pkg/log"
+	"gvisor.dev/gvisor/pkg/ring0"
+)
+
 const (
 	// reservedMemory is a chunk of physical memory reserved starting at
 	// physical address zero. There are some special pages in this region,
 	// so we just call the whole thing off.
 	reservedMemory = 0x100000000
 )
+
+const (
+	// defaultAddressSpaceSize is the default limit for the user virtual
+	// address space, which is 47-bits (2^47 bytes). The mmap syscall
+	// respects this limit by default, even with 5-level page tables
+	// enabled.
+	defaultAddressSpaceSize = uintptr(1) << 47
+
+	// exendedAddressSpaceAllowed controls address space usage beyond
+	// the default 47-bit limit. It is set to 'false' for several reasons:
+	// * There are no known use cases requiring the extended address space.
+	// * By restricting the size, we avoid the overhead of:
+	//   a) Aligning the virtual address space size to the physical
+	//      address space size.
+	//   b) Creating unnecessary page table entries for the unused
+	//      extended range.
+	// * The memory slot size is currently configured only to cover
+	//   the default 47-bit address space.
+	// * 5-level page table support was primarily introduced to workaround
+	//   a specific kernel bug where VDSO could be mapped above the 47-bit
+	//   boundary (v6.9-rc1~186^2~7).
+	exendedAddressSpaceAllowed = false
+)
+
+// archSpecialRegions returns special regions that are excluded from the virtual
+// address space. Linux doesn't map vma-s above 47-bit by default.
+func archSpecialRegions(vSize uintptr, maxUserAddr uintptr) (uintptr, []specialVirtualRegion) {
+	var specialRegions []specialVirtualRegion
+	if exendedAddressSpaceAllowed || vSize <= defaultAddressSpaceSize {
+		return vSize, nil
+	}
+	// This is a workaround for the kernel bug when vdso can be
+	// mapped above the 47-bit address space boundary.
+	if defaultAddressSpaceSize > maxUserAddr {
+		maxUserAddr = defaultAddressSpaceSize
+	}
+	r := region{
+		virtual: maxUserAddr,
+		length:  ring0.MaximumUserAddress - defaultAddressSpaceSize,
+	}
+	specialRegions = append(specialRegions, specialVirtualRegion{
+		region: r,
+	})
+	vSize -= r.length
+	log.Infof("excluded: virtual [%x,%x)", r.virtual, r.virtual+r.length)
+
+	return vSize, specialRegions
+}
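
For concreteness, a worked trace of archSpecialRegions in the common case: 5-level paging enabled and no user VMA above the 47-bit boundary (annotation only, not code from the commit):

// Inputs: vSize > defaultAddressSpaceSize, exendedAddressSpaceAllowed == false.
//   maxUserAddr = max(maxUserAddr, 1<<47)                  -> 1<<47
//   excluded    = [1<<47, 1<<47 + (MaximumUserAddress - 1<<47))
//               = [1<<47, MaximumUserAddress)
//   vSize      -= MaximumUserAddress - 1<<47
// Net effect: only the default 47-bit span is filled, backed by page
// tables, and eligible for KVM slots. If the kernel did place a VMA
// above the boundary (the vdso bug referenced in the comment),
// maxUserAddr moves past it, so that existing mapping stays usable.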
