Commit df2f1ad

Allocator: Reserve upper 128TB of VA on 64-bit process
Only a partial fix for FEX-Emu#1330; it still needs preemption disabled to work.

On x86-64 hosts the Linux kernel resides in the upper half of the VA space, which isn't mapped into userspace. This means userspace will never receive pointers with the top bit set unless it's running on a 57-bit VA host, so userspace pointers never need the sign-extending pointer canonicalization. Additionally, some applications don't actually understand pointer canonicalization at all, resulting in bugs like golang/go#49405. On a 57-bit VA host this will end up behaving like FEX does, but it seems no one in golang land has really dealt with 57-bit VA yet.

On AArch64 configured with a 48-bit VA, userspace gets the full 48-bit VA space, and on an EL mode switch the full address range switches over to the kernel's 48-bit VA. Since Linux currently allocates top-down, we will /very/ likely hand the guest pointers in the high half of the 48-bit space. So behave more like x86-64: hide the top 128TB of the address space from the guest before boot.

Testing: reserving the top 128TB took 15ms to 21ms on the M1 Max.
1 parent 8f170d4 commit df2f1ad
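To make the canonicalization hazard concrete, here is a standalone sketch (illustrative only, not FEX or Go code): a pointer-tagging scheme that assumes the upper bits of a user pointer are free tag space. That assumption holds on typical x86-64 hosts, but silently corrupts addresses that a 48-bit-VA AArch64 kernel can legitimately return.

#include <sys/mman.h>
#include <cstdint>
#include <cstdio>

int main() {
  // Ask for a page above the 47-bit boundary: kernel-reserved on x86-64
  // (without LA57), but legal userspace VA on an AArch64 48-bit-VA host.
  void *Ptr = ::mmap(reinterpret_cast<void*>(0xFFFF'FFFF'0000ULL), 4096,
                     PROT_READ | PROT_WRITE,
                     MAP_PRIVATE | MAP_ANONYMOUS | MAP_FIXED_NOREPLACE, -1, 0);
  if (Ptr == MAP_FAILED) {
    printf("No userspace VA above 2^47 (x86-64-like host)\n");
    return 0;
  }

  // A tagging scheme that assumes the bits above bit 46 are free:
  uintptr_t Addr = reinterpret_cast<uintptr_t>(Ptr);
  uintptr_t Tagged = (Addr & ((1ULL << 47) - 1)) | (1ULL << 63); // stash a tag
  uintptr_t Untagged = Tagged & ~(1ULL << 63);                   // drop the tag
  printf("%#lx -> %#lx (%s)\n",
         (unsigned long)Addr, (unsigned long)Untagged,
         Addr == Untagged ? "ok" : "corrupted: the high bits were load-bearing");
  ::munmap(Ptr, 4096);
  return 0;
}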

4 files changed (+177 −128 lines)

External/FEXCore/Source/Utils/Allocator.cpp (+147)
@@ -1,5 +1,7 @@
 #include "Utils/Allocator/HostAllocator.h"
 #include <FEXCore/Utils/Allocator.h>
+#include <FEXCore/Utils/CompilerDefs.h>
+#include <FEXCore/Utils/LogManager.h>
 #include <sys/mman.h>
 #ifdef ENABLE_JEMALLOC
 #include <jemalloc/jemalloc.h>
@@ -85,4 +87,149 @@ namespace FEXCore::Allocator {
   }
 #pragma GCC diagnostic pop
 
+  FEX_DEFAULT_VISIBILITY size_t DetermineVASize() {
+    static constexpr std::array<uintptr_t, 7> TLBSizes = {
+      57,
+      52,
+      48,
+      47,
+      42,
+      39,
+      36,
+    };
+
+    for (auto Bits : TLBSizes) {
+      uintptr_t Size = 1ULL << Bits;
+      // Just try allocating
+      // We can't actually determine VA size on ARM safely
+      auto Find = [](uintptr_t Size) -> bool {
+        for (int i = 0; i < 64; ++i) {
+          // Try grabbing some of the top pages of the range
+          // x86 allocates some high pages in the top end
+          void *Ptr = ::mmap(reinterpret_cast<void*>(Size - PAGE_SIZE * i), PAGE_SIZE, PROT_NONE, MAP_FIXED_NOREPLACE | MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
+          if (Ptr != (void*)~0ULL) {
+            ::munmap(Ptr, PAGE_SIZE);
+            if (Ptr == (void*)(Size - PAGE_SIZE * i)) {
+              return true;
+            }
+          }
+        }
+        return false;
+      };
+
+      if (Find(Size)) {
+        return Bits;
+      }
+    }
+
+    LOGMAN_MSG_A_FMT("Couldn't determine host VA size");
+    FEX_UNREACHABLE;
+  }
+
+  PtrCache* StealMemoryRegion(uintptr_t Begin, uintptr_t End) {
+    PtrCache *Cache{};
+    uint64_t CacheSize{};
+    uint64_t CurrentCacheOffset = 0;
+    constexpr std::array<size_t, 10> ReservedVMARegionSizes = {{
+      // Anything larger than 64GB fails out
+      64ULL * 1024 * 1024 * 1024, // 64GB
+      32ULL * 1024 * 1024 * 1024, // 32GB
+      16ULL * 1024 * 1024 * 1024, // 16GB
+      4ULL * 1024 * 1024 * 1024,  // 4GB
+      1ULL * 1024 * 1024 * 1024,  // 1GB
+      512ULL * 1024 * 1024,       // 512MB
+      128ULL * 1024 * 1024,       // 128MB
+      32ULL * 1024 * 1024,        // 32MB
+      1ULL * 1024 * 1024,         // 1MB
+      4096ULL                     // One page
+    }};
+    constexpr size_t AllocationSizeMaxIndex = ReservedVMARegionSizes.size() - 1;
+    uint64_t CurrentSizeIndex = 0;
+
+    int PROT_FLAGS = PROT_READ | PROT_WRITE;
+    for (size_t MemoryOffset = Begin; MemoryOffset < End;) {
+      size_t AllocationSize = ReservedVMARegionSizes[CurrentSizeIndex];
+      size_t MemoryOffsetUpper = MemoryOffset + AllocationSize;
+
+      // If we would go above the upper bound then try the next size down
+      if (MemoryOffsetUpper > End) {
+        ++CurrentSizeIndex;
+        continue;
+      }
+
+      void *Ptr = ::mmap(reinterpret_cast<void*>(MemoryOffset), AllocationSize, PROT_FLAGS, MAP_PRIVATE | MAP_ANONYMOUS | MAP_NORESERVE | MAP_FIXED_NOREPLACE, -1, 0);
+
+      // If we managed to allocate but didn't get the address we wanted then unmap it
+      // This happens with kernels older than 4.17
+      if (reinterpret_cast<uintptr_t>(Ptr) + AllocationSize > End) {
+        ::munmap(Ptr, AllocationSize);
+        Ptr = reinterpret_cast<void*>(~0ULL);
+      }
+
+      // If we failed to allocate and we are on the smallest allocation size then just continue onward
+      // This page was unmappable
+      if (reinterpret_cast<uintptr_t>(Ptr) == ~0ULL && CurrentSizeIndex == AllocationSizeMaxIndex) {
+        CurrentSizeIndex = 0;
+        MemoryOffset += AllocationSize;
+        continue;
+      }
+
+      // Congratulations, we were able to map this bit
+      // Reset the size index and claim the region
+      if (reinterpret_cast<uintptr_t>(Ptr) != ~0ULL) {
+        if (!Cache) {
+          Cache = reinterpret_cast<PtrCache *>(Ptr);
+          CacheSize = AllocationSize;
+          PROT_FLAGS = PROT_NONE;
+        }
+        else {
+          Cache[CurrentCacheOffset] = {
+            .Ptr = static_cast<uint64_t>(reinterpret_cast<uint64_t>(Ptr)),
+            .Size = static_cast<uint64_t>(AllocationSize)
+          };
+          ++CurrentCacheOffset;
+        }
+
+        CurrentSizeIndex = 0;
+        MemoryOffset += AllocationSize;
+        continue;
+      }
+
+      // Couldn't allocate at this size
+      // Increase the size index and continue
+      ++CurrentSizeIndex;
+    }
+
+    Cache[CurrentCacheOffset] = {
+      .Ptr = static_cast<uint64_t>(reinterpret_cast<uint64_t>(Cache)),
+      .Size = CacheSize,
+    };
+    return Cache;
+  }
+
+  PtrCache* Steal48BitVA() {
+    size_t Bits = FEXCore::Allocator::DetermineVASize();
+    if (Bits < 48) {
+      return nullptr;
+    }
+
+    uintptr_t Begin48BitVA = 0x0'8000'0000'0000ULL;
+    uintptr_t End48BitVA = 0x1'0000'0000'0000ULL;
+    return StealMemoryRegion(Begin48BitVA, End48BitVA);
+  }
+
+  void ReclaimMemoryRegion(PtrCache* Regions) {
+    if (Regions == nullptr) {
+      return;
+    }
+
+    for (size_t i = 0;; ++i) {
+      void *Ptr = reinterpret_cast<void*>(Regions[i].Ptr);
+      size_t Size = Regions[i].Size;
+      ::munmap(Ptr, Size);
+      if (Ptr == Regions) {
+        break;
+      }
+    }
+  }
 }
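The new surface boils down to a steal/reclaim pair. A minimal usage sketch (this mirrors the FEXLoader.cpp hunk further down):

  FEXCore::Allocator::PtrCache *Base48Bit = FEXCore::Allocator::Steal48BitVA();
  // ... boot and run the 64-bit guest; mmap can no longer return pointers above the 47-bit boundary ...
  FEXCore::Allocator::ReclaimMemoryRegion(Base48Bit); // nullptr-safe on <48-bit hosts

Note the bookkeeping trick: the PtrCache array lives inside the first region the sweep claims (which stays PROT_READ|PROT_WRITE while every later region is mapped PROT_NONE), and the final entry records the cache region itself, which is how ReclaimMemoryRegion knows when to stop walking.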

External/FEXCore/Source/Utils/Allocator/64BitAllocator.cpp (+9 −128)
@@ -1,6 +1,7 @@
 #include "Utils/Allocator/FlexBitSet.h"
 #include "Utils/Allocator/HostAllocator.h"
 #include "Utils/Allocator/IntrusiveArenaAllocator.h"
+#include <FEXCore/Utils/Allocator.h>
 #include <FEXCore/Utils/LogManager.h>
 
 #include <algorithm>
@@ -139,49 +140,14 @@ namespace Alloc::OSAllocator {
   }
 
   // 32-bit old kernel workarounds
-  struct PtrCache {
-    uint32_t Ptr;
-    uint32_t Size;
-  };
-  PtrCache *Steal32BitIfOldKernel();
-  void Clear32BitOnOldKernel(PtrCache *Base);
+  FEXCore::Allocator::PtrCache *Steal32BitIfOldKernel();
 };
 
 void OSAllocator_64Bit::DetermineVASize() {
-  static constexpr std::array<uintptr_t, 7> TLBSizes = {
-    1ULL << 57,
-    1ULL << 52,
-    1ULL << 48,
-    1ULL << 47,
-    1ULL << 42,
-    1ULL << 39,
-    1ULL << 36,
-  };
-
-  for (auto Size : TLBSizes) {
-    // Just try allocating
-    // We can't actually determine VA size on ARM safely
-    auto Find = [](uintptr_t Size) -> bool {
-      for (int i = 0; i < 64; ++i) {
-        // Try grabbing some of the top pages of the range
-        // x86 allocates some high pages in the top end
-        void *Ptr = ::mmap(reinterpret_cast<void*>(Size - PAGE_SIZE * i), PAGE_SIZE, PROT_NONE, MAP_FIXED_NOREPLACE | MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
-        if (Ptr != (void*)~0ULL) {
-          ::munmap(Ptr, PAGE_SIZE);
-          if (Ptr == (void*)(Size - PAGE_SIZE * i)) {
-            return true;
-          }
-        }
-      }
-      return false;
-    };
-
-    if (Find(Size)) {
-      UPPER_BOUND = Size;
-      UPPER_BOUND_PAGE = UPPER_BOUND / PAGE_SIZE;
-      break;
-    }
-  }
+  size_t Bits = FEXCore::Allocator::DetermineVASize();
+  uintptr_t Size = 1ULL << Bits;
+  UPPER_BOUND = Size;
+  UPPER_BOUND_PAGE = UPPER_BOUND / PAGE_SIZE;
 }
 
 void *OSAllocator_64Bit::Mmap(void *addr, size_t length, int prot, int flags, int fd, off_t offset) {
@@ -523,7 +489,7 @@ int OSAllocator_64Bit::Munmap(void *addr, size_t length) {
   return 0;
 }
 
-OSAllocator_64Bit::PtrCache *OSAllocator_64Bit::Steal32BitIfOldKernel() {
+FEXCore::Allocator::PtrCache *OSAllocator_64Bit::Steal32BitIfOldKernel() {
   // First calculate kernel version
   struct utsname buf{};
   if (uname(&buf) == -1) {
@@ -548,95 +514,10 @@ OSAllocator_64Bit::PtrCache *OSAllocator_64Bit::Steal32BitIfOldKernel() {
     return nullptr;
   }
 
-  OSAllocator_64Bit::PtrCache *Cache{};
-  uint32_t CacheSize{};
-  uint32_t CurrentCacheOffset = 0;
-  constexpr std::array<size_t, 6> ReservedVMARegionSizes = {{
-    1ULL * 1024 * 1024 * 1024, // 1GB
-    512ULL * 1024 * 1024,      // 512MB
-    128ULL * 1024 * 1024,      // 128MB
-    32ULL * 1024 * 1024,       // 32MB
-    1ULL * 1024 * 1024,        // 1MB
-    4096ULL                    // One page
-  }};
-  constexpr size_t AllocationSizeMaxIndex = ReservedVMARegionSizes.size() - 1;
-  uint64_t CurrentSizeIndex = 0;
-
   constexpr size_t LOWER_BOUND_32 = 0x1'0000;
   constexpr size_t UPPER_BOUND_32 = LOWER_BOUND;
 
-  for (size_t MemoryOffset = LOWER_BOUND_32; MemoryOffset < UPPER_BOUND_32;) {
-    size_t AllocationSize = ReservedVMARegionSizes[CurrentSizeIndex];
-    size_t MemoryOffsetUpper = MemoryOffset + AllocationSize;
-
-    // If we would go above the upper bound then try the next size down
-    if (MemoryOffsetUpper > UPPER_BOUND_32) {
-      ++CurrentSizeIndex;
-      continue;
-    }
-
-    void *Ptr = ::mmap(reinterpret_cast<void*>(MemoryOffset), AllocationSize, PROT_READ | PROT_WRITE, MAP_PRIVATE | MAP_ANONYMOUS | MAP_NORESERVE, -1, 0);
-
-    // If we managed to allocate but didn't get the address we wanted then unmap it
-    // This happens with kernels older than 4.17
-    if (reinterpret_cast<uintptr_t>(Ptr) + AllocationSize > UPPER_BOUND_32) {
-      ::munmap(Ptr, AllocationSize);
-      Ptr = reinterpret_cast<void*>(~0ULL);
-    }
-
-    // If we failed to allocate and we are on the smallest allocation size then just continue onward
-    // This page was unmappable
-    if (reinterpret_cast<uintptr_t>(Ptr) == ~0ULL && CurrentSizeIndex == AllocationSizeMaxIndex) {
-      CurrentSizeIndex = 0;
-      MemoryOffset += AllocationSize;
-      continue;
-    }
-
-    // Congratulations, we were able to map this bit
-    // Reset the size index and claim the region
-    if (reinterpret_cast<uintptr_t>(Ptr) != ~0ULL) {
-      if (!Cache) {
-        Cache = reinterpret_cast<OSAllocator_64Bit::PtrCache *>(Ptr);
-        CacheSize = AllocationSize;
-      }
-      else {
-        Cache[CurrentCacheOffset] = {
-          .Ptr = static_cast<uint32_t>(reinterpret_cast<uint64_t>(Ptr)),
-          .Size = static_cast<uint32_t>(AllocationSize)
-        };
-        ++CurrentCacheOffset;
-      }
-
-      CurrentSizeIndex = 0;
-      MemoryOffset += AllocationSize;
-      continue;
-    }
-
-    // Couldn't allocate at this size
-    // Increase the size index and continue
-    ++CurrentSizeIndex;
-  }
-
-  Cache[CurrentCacheOffset] = {
-    .Ptr = static_cast<uint32_t>(reinterpret_cast<uint64_t>(Cache)),
-    .Size = CacheSize,
-  };
-  return Cache;
-}
-
-void OSAllocator_64Bit::Clear32BitOnOldKernel(OSAllocator_64Bit::PtrCache *Base) {
-  if (Base == nullptr) {
-    return;
-  }
-
-  for (size_t i = 0;; ++i) {
-    void *Ptr = reinterpret_cast<void*>(Base[i].Ptr);
-    size_t Size = Base[i].Size;
-    ::munmap(Ptr, Size);
-    if (Ptr == Base) {
-      break;
-    }
-  }
+  return FEXCore::Allocator::StealMemoryRegion(LOWER_BOUND_32, UPPER_BOUND_32);
 }
 
 OSAllocator_64Bit::OSAllocator_64Bit() {
@@ -735,7 +616,7 @@ OSAllocator_64Bit::OSAllocator_64Bit() {
     ++CurrentSizeIndex;
  }
 
-  Clear32BitOnOldKernel(ArrayPtr);
+  FEXCore::Allocator::ReclaimMemoryRegion(ArrayPtr);
 }
 
 OSAllocator_64Bit::~OSAllocator_64Bit() {

External/FEXCore/include/FEXCore/Utils/Allocator.h (+17)
@@ -21,4 +21,21 @@ namespace FEXCore::Allocator {
 
   FEX_DEFAULT_VISIBILITY void SetupHooks();
   FEX_DEFAULT_VISIBILITY void ClearHooks();
+
+  FEX_DEFAULT_VISIBILITY size_t DetermineVASize();
+  // 48-bit VA handling
+  struct PtrCache {
+    uint64_t Ptr;
+    uint64_t Size;
+  };
+
+  FEX_DEFAULT_VISIBILITY PtrCache* StealMemoryRegion(uintptr_t Begin, uintptr_t End);
+  FEX_DEFAULT_VISIBILITY void ReclaimMemoryRegion(PtrCache* Regions);
+  // When running a 64-bit executable on ARM, the userspace guest only gets 47 bits of VA
+  // This mirrors x86-64, where the kernel owns the full top 128TB of VA space
+  // x86-64 canonical addresses sign-extend from the 48th bit (ignoring LA57)
+  // AArch64 canonical addresses only extend to 48/52 bits, with the remaining bits used for other things
+  // Use this to reserve the top 128TB of VA so the guest never sees it
+  // Returns nullptr if the host VA is smaller than 48 bits
+  FEX_DEFAULT_VISIBILITY PtrCache* Steal48BitVA();
 }
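As a sanity check of what the header promises, a hypothetical probe (illustrative, not part of this commit) can confirm that after Steal48BitVA() fresh mappings land below the 47-bit boundary, matching x86-64 host behavior:

#include <FEXCore/Utils/Allocator.h>
#include <sys/mman.h>
#include <cstdint>
#include <cassert>

void VerifyTopIsHidden() {
  // Reserve the upper 128TB; returns nullptr on hosts with less than 48 bits of VA.
  auto *Regions = FEXCore::Allocator::Steal48BitVA();

  // With the top half reserved, the kernel must place new mappings below 2^47.
  void *Probe = ::mmap(nullptr, 4096, PROT_READ | PROT_WRITE,
                       MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
  assert(Probe != MAP_FAILED);
  assert(reinterpret_cast<uintptr_t>(Probe) < (1ULL << 47));
  ::munmap(Probe, 4096);

  // Hand the reservation back when done.
  FEXCore::Allocator::ReclaimMemoryRegion(Regions);
}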

Source/Tests/FEXLoader.cpp (+4)
@@ -478,8 +478,11 @@ int main(int argc, char **argv, char **const envp) {
   FEXCore::Config::Set(FEXCore::Config::CONFIG_IS64BIT_MODE, Loader.Is64BitMode() ? "1" : "0");
 
   std::unique_ptr<FEX::HLE::x32::MemAllocator> Allocator;
+  FEXCore::Allocator::PtrCache *Base48Bit{};
 
   if (Loader.Is64BitMode()) {
+    // Destroy the 48th bit if it exists
+    Base48Bit = FEXCore::Allocator::Steal48BitVA();
     if (!Loader.MapMemory([](void *addr, size_t length, int prot, int flags, int fd, off_t offset) {
       return FEXCore::Allocator::mmap(addr, length, prot, flags, fd, offset);
     }, [](void *addr, size_t length) {
@@ -619,6 +622,7 @@ int main(int argc, char **argv, char **const envp) {
   LogMan::Msg::UnInstallHandlers();
 
   FEXCore::Allocator::ClearHooks();
+  FEXCore::Allocator::ReclaimMemoryRegion(Base48Bit);
   // Allocator is now original system allocator
 
   FEXCore::Telemetry::Shutdown(ProgramName);
