1 change: 1 addition & 0 deletions pkg/ring0/pagetables/BUILD
@@ -49,6 +49,7 @@ go_library(
"pagetables_aarch64.go",
"pagetables_amd64.go",
"pagetables_arm64.go",
"pagetables_unsafe.go",
"pagetables_x86.go",
"pcids.go",
"pcids_aarch64.go",
9 changes: 5 additions & 4 deletions pkg/ring0/pagetables/pagetables.go
@@ -110,16 +110,17 @@ func New(a Allocator) *PageTables {
type mapVisitor struct {
target uintptr // Input.
physical uintptr // Input.
opts MapOpts // Input.
prev bool // Output.
// opts is a pointer just to reduce stack usage. It should never be changed.
opts *MapOpts // Input.
prev bool // Output.
}

// visit is used for map.
//
//go:nosplit
func (v *mapVisitor) visit(start uintptr, pte *PTE, align uintptr) bool {
p := v.physical + (start - v.target)
if pte.Valid() && (pte.Address() != p || pte.Opts() != v.opts) {
if pte.Valid() && (pte.Address() != p || pte.Opts() != *v.opts) {
v.prev = true
}
if p&align != 0 {
@@ -169,7 +170,7 @@ func (p *PageTables) Map(addr hostarch.Addr, length uintptr, opts MapOpts, physi
visitor: mapVisitor{
target: uintptr(addr),
physical: physical,
opts: opts,
opts: &opts,
},
}
w.iterateRange(uintptr(addr), uintptr(addr)+length)
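Why the pointer: mapVisitor is embedded in the nosplit page-table walker, where every byte of frame size counts against a fixed stack budget, so holding a `*MapOpts` instead of an embedded copy keeps the visitor small. A minimal sketch of the size difference, using illustrative stand-in types rather than the real ones:

```go
package main

import (
	"fmt"
	"unsafe"
)

// bigOpts stands in for a large options struct such as MapOpts.
type bigOpts struct {
	fields [8]uint64 // 64 bytes
}

// byValue embeds a full copy of the options, as mapVisitor used to.
type byValue struct {
	opts bigOpts
	prev bool
}

// byPointer keeps only a pointer, as mapVisitor does after this change;
// a stack-allocated visitor is smaller, which matters under //go:nosplit.
type byPointer struct {
	opts *bigOpts
	prev bool
}

func main() {
	fmt.Println(unsafe.Sizeof(byValue{}), unsafe.Sizeof(byPointer{})) // e.g. 72 vs 16 on 64-bit
}
```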
5 changes: 4 additions & 1 deletion pkg/ring0/pagetables/pagetables_aarch64.go
@@ -91,6 +91,9 @@ type MapOpts struct {
// User indicates the page is a user page.
User bool

// Static indicates the entries should not be cleared/freed.
Static bool

// MemoryType is the memory type.
MemoryType hostarch.MemoryType
}
@@ -156,7 +159,7 @@ func (p *PTE) IsSect() bool {
// This does not change the sect page property.
//
//go:nosplit
func (p *PTE) Set(addr uintptr, opts MapOpts) {
func (p *PTE) Set(addr uintptr, opts *MapOpts) {
v := (addr &^ optionMask) | nG | readOnly | protDefault
// Note: p.IsSect is manually inlined to reduce stack size for
// nosplit-ness.
26 changes: 26 additions & 0 deletions pkg/ring0/pagetables/pagetables_unsafe.go
@@ -0,0 +1,26 @@
// Copyright 2025 The gVisor Authors.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

package pagetables

import (
"unsafe"
)

// Get returns the entry with the specified index.
//
//go:nosplit
func (p *PTEs) Get(idx uint16) *PTE {
return (*PTE)(unsafe.Pointer(uintptr(unsafe.Pointer(&p[0])) + 8*uintptr(idx)))
}
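Get exists so the nosplit walkers can index a PTEs page without the bounds check (and its panic path) that `&entries[idx]` would compile to; the byte-offset arithmetic relies on a PTE being exactly 8 bytes. A rough stand-alone equivalent, with simplified stand-in types:

```go
package main

import (
	"fmt"
	"unsafe"
)

// Simplified stand-ins for the real PTE/PTEs in pkg/ring0/pagetables.
type pte uintptr
type ptes [512]pte

// get mirrors PTEs.Get: manual pointer arithmetic instead of &p[idx].
// The caller must guarantee idx < len(p); nothing here checks it.
//
//go:nosplit
func (p *ptes) get(idx uint16) *pte {
	return (*pte)(unsafe.Pointer(uintptr(unsafe.Pointer(&p[0])) + unsafe.Sizeof(pte(0))*uintptr(idx)))
}

func main() {
	var p ptes
	p[3] = 0x123000
	fmt.Printf("%#x\n", *p.get(3)) // same element &p[3] would yield, minus the bounds check
}
```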
9 changes: 6 additions & 3 deletions pkg/ring0/pagetables/pagetables_x86.go
@@ -73,6 +73,9 @@ type MapOpts struct {
// User indicates the page is a user page.
User bool

// Static indicates the entries should not be cleared/freed.
Static bool

// MemoryType is the memory type.
MemoryType hostarch.MemoryType
}
Expand All @@ -91,7 +94,7 @@ func (p *PTE) Clear() {
//
//go:nosplit
func (p *PTE) Valid() bool {
return atomic.LoadUintptr((*uintptr)(p))&present != 0
return atomic.LoadUintptr((*uintptr)(p)) != 0
}

// Opts returns the PTE options.
@@ -138,8 +141,8 @@ func (p *PTE) IsSuper() bool {
// This does not change the super page property.
//
//go:nosplit
func (p *PTE) Set(addr uintptr, opts MapOpts) {
if !opts.AccessType.Any() {
func (p *PTE) Set(addr uintptr, opts *MapOpts) {
if !opts.AccessType.Any() && !opts.Static {
p.Clear()
return
}
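The relaxed Valid() check is what makes Static entries work: a Static mapping with no access rights is written without the present bit but is still non-zero, so the walker treats the entry as occupied and keeps the pre-allocated page-table pages beneath it instead of freeing them. A simplified model of the old and new checks (the bit layout below is illustrative, not the real x86 encoding):

```go
package main

import "fmt"

const present = uintptr(1) << 0 // illustrative "present" bit, not the real encoding

// oldValid is the previous check: only present entries counted as valid.
func oldValid(e uintptr) bool { return e&present != 0 }

// newValid is the current check: any non-zero entry counts, so a Static
// entry that records an address without access bits is never reclaimed.
func newValid(e uintptr) bool { return e != 0 }

func main() {
	staticEntry := uintptr(0x200000) // address bits set, present bit clear
	fmt.Println(oldValid(staticEntry), newValid(staticEntry)) // false true
}
```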
14 changes: 8 additions & 6 deletions pkg/ring0/pagetables/walker_amd64.go
@@ -43,7 +43,7 @@ func (w *Walker) walkPTEs(entries *PTEs, start, end uintptr) (bool, uint16) {
var clearEntries uint16
for start < end {
pteIndex := uint16((start & pteMask) >> pteShift)
entry := &entries[pteIndex]
entry := entries.Get(pteIndex)
if !entry.Valid() && !w.visitor.requiresAlloc() {
clearEntries++
start += pteSize
@@ -81,7 +81,7 @@ func (w *Walker) walkPMDs(pmdEntries *PTEs, start, end uintptr) (bool, uint16) {
var pteEntries *PTEs
nextBoundary := addrEnd(start, end, pmdSize)
pmdIndex := uint16((start & pmdMask) >> pmdShift)
pmdEntry := &pmdEntries[pmdIndex]
pmdEntry := pmdEntries.Get(pmdIndex)
if !pmdEntry.Valid() {
if !w.visitor.requiresAlloc() {
// Skip over this entry.
@@ -114,9 +114,10 @@ func (w *Walker) walkPMDs(pmdEntries *PTEs, start, end uintptr) (bool, uint16) {
// Install the relevant entries.
pteEntries = w.pageTables.Allocator.NewPTEs()
for index := uint16(0); index < entriesPerPage; index++ {
opts := pmdEntry.Opts()
pteEntries[index].Set(
pmdEntry.Address()+(pteSize*uintptr(index)),
pmdEntry.Opts())
&opts)
}
pmdEntry.setPageTable(w.pageTables, pteEntries)
} else {
@@ -173,7 +174,7 @@ func (w *Walker) walkPUDs(pudEntries *PTEs, start, end uintptr) (bool, uint16) {
var pmdEntries *PTEs
nextBoundary := addrEnd(start, end, pudSize)
pudIndex := uint16((start & pudMask) >> pudShift)
pudEntry := &pudEntries[pudIndex]
pudEntry := pudEntries.Get(pudIndex)
if !pudEntry.Valid() {
if !w.visitor.requiresAlloc() {
// Skip over this entry.
@@ -209,9 +210,10 @@
pmdEntries = w.pageTables.Allocator.NewPTEs() // escapes: see above.
for index := uint16(0); index < entriesPerPage; index++ {
pmdEntries[index].SetSuper()
opts := pudEntry.Opts()
pmdEntries[index].Set(
pudEntry.Address()+(pmdSize*uintptr(index)),
pudEntry.Opts())
&opts)
}
pudEntry.setPageTable(w.pageTables, pmdEntries)
} else {
@@ -261,7 +263,7 @@ func (w *Walker) iterateRangeCanonical(start, end uintptr) bool {
var pudEntries *PTEs
nextBoundary := addrEnd(start, end, pgdSize)
pgdIndex := uint16((start & pgdMask) >> pgdShift)
pgdEntry := &w.pageTables.root[pgdIndex]
pgdEntry := w.pageTables.root.Get(pgdIndex)
if !w.pageTables.largeAddressesEnabled {
if !pgdEntry.Valid() {
if !w.visitor.requiresAlloc() {
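The `opts := pmdEntry.Opts()` copies in the walkers look redundant but are required by the new `Set(addr, *MapOpts)` signature: Go does not allow taking the address of a call result, so the options are spilled to a named local first; the local does not escape, so this stays allocation-free. In miniature, with placeholder types:

```go
package main

import "fmt"

type mapOpts struct{ user bool } // placeholder for MapOpts

type entry struct{ o mapOpts }

func (e *entry) Opts() mapOpts { return e.o }

// set mirrors PTE.Set's new signature: options arrive by pointer.
func set(addr uintptr, o *mapOpts) { fmt.Printf("%#x user=%v\n", addr, o.user) }

func main() {
	e := entry{o: mapOpts{user: true}}
	// set(0x1000, &e.Opts()) // does not compile: cannot take the address of a call result
	opts := e.Opts() // copy to a local first, as the walkers now do
	set(0x1000, &opts)
}
```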
6 changes: 4 additions & 2 deletions pkg/ring0/pagetables/walker_arm64.go
@@ -87,9 +87,10 @@ func (w *Walker) iterateRangeCanonical(start, end uintptr) bool {
pmdEntries = w.pageTables.Allocator.NewPTEs()
for index := uint16(0); index < entriesPerPage; index++ {
pmdEntries[index].SetSect()
opts := pudEntry.Opts()
pmdEntries[index].Set(
pudEntry.Address()+(pmdSize*uintptr(index)),
pudEntry.Opts())
&opts)
}
pudEntry.setPageTable(w.pageTables, pmdEntries)
} else {
@@ -152,9 +153,10 @@ func (w *Walker) iterateRangeCanonical(start, end uintptr) bool {
// Install the relevant entries.
pteEntries = w.pageTables.Allocator.NewPTEs()
for index := uint16(0); index < entriesPerPage; index++ {
opts := pmdEntry.Opts()
pteEntries[index].Set(
pmdEntry.Address()+(pteSize*uintptr(index)),
pmdEntry.Opts())
&opts)
}
pmdEntry.setPageTable(w.pageTables, pteEntries)
} else {
21 changes: 14 additions & 7 deletions pkg/sentry/platform/kvm/bluepill_fault.go
@@ -25,11 +25,14 @@ import (
var (
// faultBlockSize is the size used for servicing memory faults.
//
// This should be large enough to avoid frequent faults and avoid using
// all available KVM slots (~512), but small enough that KVM does not
// complain about slot sizes (~4GB). See handleBluepillFault for how
// this block is used.
faultBlockSize = uintptr(2 << 30)
// This should be large enough so that the total number of slots
// required to cover the 47-bit virtual address space does not exceed
// the KVM slot limit (e.g. 32764). Linux doesn't allocate virtual
// address space above 47 bits by default.
// It must be small enough to limit the memory overhead associated with
// KVM slot allocation. For example, using a 46-bit address space
// results in an overhead of ~250 MB.
faultBlockSize = uintptr(8 << 30)

// faultBlockMask is the mask for the fault blocks.
//
Expand All @@ -56,13 +59,17 @@ func calculateBluepillFault(physical uintptr) (virtualStart, physicalStart, leng
}

// Adjust the block to match our size.
physicalStart = pr.physical + (alignedPhysical-pr.physical)&faultBlockMask
virtualStart = pr.virtual + (physicalStart - pr.physical)
physicalStart = pr.physical / faultBlockSize * faultBlockSize
physicalStart = physicalStart + (alignedPhysical-physicalStart)&faultBlockMask
physicalEnd := physicalStart + faultBlockSize
if physicalStart < pr.physical {
physicalStart = pr.physical
}
if physicalEnd > end {
physicalEnd = end
}
length = physicalEnd - physicalStart
virtualStart = pr.virtual + (physicalStart - pr.physical)
return virtualStart, physicalStart, length, &physicalRegions[i]
}

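The block-size arithmetic in the new comment checks out: with 8 GiB (2^33-byte) blocks, covering the 2^47-byte default user address space takes 2^47 / 2^33 = 16384 slots, under the cited limit of 32764, whereas the previous 2 GiB blocks would have needed 65536. A quick verification (the limit value is taken from the comment above, not queried from KVM):

```go
package main

import "fmt"

func main() {
	const (
		userAddressSpace = uint64(1) << 47 // Linux's default user VA limit, per the comment
		oldBlockSize     = uint64(2) << 30 // previous faultBlockSize
		newBlockSize     = uint64(8) << 30 // new faultBlockSize
		kvmSlotLimit     = 32764           // slot limit cited in the comment
	)
	fmt.Println(userAddressSpace/oldBlockSize, "slots with 2 GiB blocks") // 65536: over the limit
	fmt.Println(userAddressSpace/newBlockSize, "slots with 8 GiB blocks") // 16384: fits
	fmt.Println("limit:", kvmSlotLimit)
}
```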
2 changes: 2 additions & 0 deletions pkg/sentry/platform/kvm/kvm_test.go
@@ -39,6 +39,7 @@ import (
var dummyFPState fpu.State

type testHarness interface {
Logf(format string, args ...any)
Errorf(format string, args ...any)
Fatalf(format string, args ...any)
}
@@ -146,6 +147,7 @@ func applicationTest(t testHarness, useHostMappings bool, targetFn uintptr, fn f
// done for regular user code, but is fine for test
// purposes.)
applyPhysicalRegions(func(pr physicalRegion) bool {
t.Logf("Map %x-%x", pr.virtual, pr.virtual+pr.length)
pt.Map(hostarch.Addr(pr.virtual), pr.length, pagetables.MapOpts{
AccessType: hostarch.AnyAccess,
User: true,
56 changes: 33 additions & 23 deletions pkg/sentry/platform/kvm/machine.go
@@ -332,26 +332,33 @@ func newMachine(vm int, config *Config) (*machine, error) {
// faultBlockSize has to be equal to or less than KVM_MEM_MAX_NR_PAGES.
faultBlockSize = uintptr(1) << 42
faultBlockMask = ^uintptr(faultBlockSize - 1)
for _, r := range physicalRegions {
m.mapPhysical(r.physical, r.length)
}
} else {
// Apply the physical mappings. Note that these mappings may point to
// guest physical addresses that are not actually available. These
// physical pages are mapped on demand, see kernel_unsafe.go.
applyPhysicalRegions(func(pr physicalRegion) bool {
physical := pr.physical
for physical < pr.physical+pr.length {
virtualStart, physicalStart, length, _ := calculateBluepillFault(physical)
// Pre-allocate page tables in the lower half.
m.kernel.PageTables.Map(
hostarch.Addr(virtualStart),
length,
pagetables.MapOpts{Static: true},
physicalStart)
physical += length
}

return true // Keep iterating.
})
// Install seccomp rules to trap runtime mmap system calls. They will
// be handled by seccompMmapHandler.
seccompMmapRules(m)
}

// Apply the physical mappings. Note that these mappings may point to
// guest physical addresses that are not actually available. These
// physical pages are mapped on demand, see kernel_unsafe.go.
applyPhysicalRegions(func(pr physicalRegion) bool {
// Map everything in the lower half.
m.kernel.PageTables.Map(
hostarch.Addr(pr.virtual),
pr.length,
pagetables.MapOpts{AccessType: hostarch.ReadWrite},
pr.physical)

return true // Keep iterating.
})

// Ensure that the currently mapped virtual regions are actually
// available in the VM. Note that this doesn't guarantee no future
// faults, however it should guarantee that everything is available to
@@ -368,6 +375,9 @@ func newMachine(vm int, config *Config) (*machine, error) {
// Cap the length to the end of the area.
length = vr.virtual + vr.length - virtual
}
// Ensure the physical range is mapped.
m.mapPhysical(physical, length)

// Update page tables for executable mappings.
if vr.accessType.Execute {
if vr.accessType.Write {
@@ -380,8 +390,6 @@ func newMachine(vm int, config *Config) (*machine, error) {
physical)
}

// Ensure the physical range is mapped.
m.mapPhysical(physical, length)
virtual += length
}
}
@@ -404,11 +412,6 @@ func newMachine(vm int, config *Config) (*machine, error) {
mapRegion(vr, 0)

})
if mapEntireAddressSpace {
for _, r := range physicalRegions {
m.mapPhysical(r.physical, r.length)
}
}
enableAsyncPreemption()
// Initialize architecture state.
if err := m.initArchState(); err != nil {
@@ -458,8 +461,15 @@ func (m *machine) mapPhysical(physical, length uintptr) {
}

// Is this already mapped? Check the usedSlots.
if !pr.mmio && !m.hasSlot(physicalStart) {
m.mapMemorySlot(virtualStart, physicalStart, length, pr.readOnly)
if !m.hasSlot(physicalStart) {
m.kernel.PageTables.Map(
hostarch.Addr(virtualStart),
length,
pagetables.MapOpts{AccessType: hostarch.ReadWrite},
physicalStart)
if !pr.mmio {
m.mapMemorySlot(virtualStart, physicalStart, length, pr.readOnly)
}
}

// Move to the next chunk.
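Taken together, the newMachine changes move from "map every physical region up front" to "pre-build Static page tables for the lower half, then let mapPhysical install both the page-table mapping and the KVM memory slot the first time each fault block is touched". A toy model of that on-demand loop, with simplified stand-ins for the real machine and slot bookkeeping:

```go
package main

import "fmt"

const faultBlockSize = uint64(8) << 30 // matches the new block size

// toyMachine models only the slot bookkeeping relevant here.
type toyMachine struct {
	slots map[uint64]bool // block start -> already mapped
}

func (m *toyMachine) hasSlot(start uint64) bool { return m.slots[start] }

// mapPhysical sketches the on-demand path: each fault block is mapped
// (page tables plus KVM slot, in the real code) at most once.
func (m *toyMachine) mapPhysical(physical, length uint64) {
	for end := physical + length; physical < end; {
		blockStart := physical &^ (faultBlockSize - 1)
		if !m.hasSlot(blockStart) {
			fmt.Printf("map block %#x\n", blockStart) // PageTables.Map + mapMemorySlot in the real code
			m.slots[blockStart] = true
		}
		physical = blockStart + faultBlockSize
	}
}

func main() {
	m := &toyMachine{slots: map[uint64]bool{}}
	m.mapPhysical(0x1000, 2*faultBlockSize) // unaligned start spans three blocks
	m.mapPhysical(0x2000, faultBlockSize)   // already mapped: no new work
}
```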
10 changes: 9 additions & 1 deletion pkg/sentry/platform/kvm/physical_map.go
@@ -66,6 +66,7 @@ func fillAddressSpace() (specialRegions []specialVirtualRegion) {
pSize := uintptr(1) << ring0.PhysicalAddressBits
pSize -= reservedMemory

maxUserAddr := uintptr(0)
// Add specifically excluded regions; see excludeVirtualRegion.
if err := applyVirtualRegions(func(vr virtualRegion) {
if excludeVirtualRegion(vr) {
@@ -81,10 +82,17 @@
})
log.Infof("mmio: virtual [%x,%x)", vr.virtual, vr.virtual+vr.length)
}
if vr.filename != "[vsyscall]" {
maxUserAddr = vr.region.virtual + vr.region.length
}
}); err != nil {
panic(fmt.Sprintf("error parsing /proc/self/maps: %v", err))
}

var archRegions []specialVirtualRegion
vSize, archRegions = archSpecialRegions(vSize, maxUserAddr)
specialRegions = append(specialRegions, archRegions...)

// Do we need any more work?
if vSize < pSize {
return specialRegions
@@ -109,7 +117,7 @@ func fillAddressSpace() (specialRegions []specialVirtualRegion) {
current := required // Attempted mmap size.
filled := uintptr(0)
suggestedAddr := uintptr(0)
if ring0.VirtualAddressBits > 48 {
if exendedAddressSpaceAllowed && ring0.VirtualAddressBits > 48 {
// Pass a hint address above 47 bits to indicate to the kernel that
// we can handle, and want, mappings above 47 bits:
// https://docs.kernel.org/arch/x86/x86_64/5level-paging.html#user-space-and-large-virtual-address-space.