Skip to content

Commit 2105296

Browse files
klucke authored and tensorflower-gardener committed
Move GpuDriver HostAllocate and HostDeallocate functions into the proper Executor classes.
PiperOrigin-RevId: 685451178
1 parent a2cf898 commit 2105296

File tree

7 files changed: 80 additions and 85 deletions.

third_party/xla/xla/stream_executor/cuda/cuda_driver.cc

Lines changed: 0 additions & 21 deletions
Original file line number | Diff line number | Diff line change
@@ -787,27 +787,6 @@ void GpuDriver::DestroyStream(Context* context, GpuStreamHandle stream) {
787787
}
788788
}
789789

790-
void* GpuDriver::HostAllocate(Context* context, uint64_t bytes) {
791-
ScopedActivateContext activation(context);
792-
void* host_mem = nullptr;
793-
// "Portable" memory is visible to all CUDA contexts. Safe for our use model.
794-
auto status = cuda::ToStatus(
795-
cuMemHostAlloc(&host_mem, bytes, CU_MEMHOSTALLOC_PORTABLE));
796-
if (!status.ok()) {
797-
LOG(ERROR) << "failed to alloc " << bytes << " bytes on host: " << status;
798-
}
799-
return host_mem;
800-
}
801-
802-
void GpuDriver::HostDeallocate(Context* context, void* location) {
803-
ScopedActivateContext activation(context);
804-
auto status = cuda::ToStatus(cuMemFreeHost(location));
805-
if (!status.ok()) {
806-
LOG(ERROR) << "error deallocating host memory at " << location << ": "
807-
<< status;
808-
}
809-
}
810-
811790
absl::Status GpuDriver::SynchronizeStream(Context* context, CUstream stream) {
812791
ScopedActivateContext activated{context};
813792
CHECK(stream != nullptr);

third_party/xla/xla/stream_executor/cuda/cuda_executor.cc

Lines changed: 41 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -70,9 +70,11 @@ limitations under the License.
7070
#include "xla/stream_executor/gpu/gpu_types.h"
7171
#include "xla/stream_executor/gpu/read_numa_node.h"
7272
#include "xla/stream_executor/gpu/scoped_activate_context.h"
73+
#include "xla/stream_executor/host_memory_allocation.h"
7374
#include "xla/stream_executor/kernel.h"
7475
#include "xla/stream_executor/kernel_spec.h"
7576
#include "xla/stream_executor/launch_dim.h"
77+
#include "xla/stream_executor/memory_allocation.h"
7678
#include "xla/stream_executor/module_spec.h"
7779
#include "xla/stream_executor/platform.h"
7880
#include "xla/stream_executor/plugin_registry.h"
@@ -503,6 +505,29 @@ void DeviceDeallocate(Context* context, void* location) {
503505
}
504506
}
505507

508+
// Allocates memory on the host.
509+
void* HostAllocate(Context* context, uint64_t bytes) {
510+
ScopedActivateContext activation(context);
511+
void* host_mem = nullptr;
512+
// "Portable" memory is visible to all CUDA contexts. Safe for our use model.
513+
auto status = cuda::ToStatus(
514+
cuMemHostAlloc(&host_mem, bytes, CU_MEMHOSTALLOC_PORTABLE));
515+
if (!status.ok()) {
516+
LOG(ERROR) << "failed to alloc " << bytes << " bytes on host: " << status;
517+
}
518+
return host_mem;
519+
}
520+
521+
// Deallocates memory allocated via HostAllocate.
522+
void HostDeallocate(Context* context, void* location) {
523+
ScopedActivateContext activation(context);
524+
auto status = cuda::ToStatus(cuMemFreeHost(location));
525+
if (!status.ok()) {
526+
LOG(ERROR) << "error deallocating host memory at " << location << ": "
527+
<< status;
528+
}
529+
}
530+
506531
} // namespace
507532

508533
// Given const GPU memory, returns a libcuda device pointer datatype, suitable
@@ -878,12 +903,22 @@ DeviceMemoryBase CudaExecutor::Allocate(uint64_t size, int64_t memory_space) {
878903
return DeviceMemoryBase(nullptr, 0);
879904
} else if (memory_space ==
880905
static_cast<int64_t>(stream_executor::MemoryType::kHost)) {
881-
return DeviceMemoryBase(GpuDriver::HostAllocate(gpu_context(), size), size);
906+
return DeviceMemoryBase(HostAllocate(gpu_context(), size), size);
882907
}
883908
CHECK_EQ(memory_space, 0);
884909
return DeviceMemoryBase(DeviceAllocate(gpu_context(), size), size);
885910
}
886911

912+
absl::StatusOr<std::unique_ptr<MemoryAllocation>>
913+
CudaExecutor::HostMemoryAllocate(uint64_t size) {
914+
auto* buffer = HostAllocate(gpu_context(), size);
915+
if (buffer == nullptr && size > 0) {
916+
return absl::InternalError(
917+
absl::StrFormat("Failed to allocate HostMemory of size %d", size));
918+
}
919+
return std::make_unique<HostMemoryAllocation>(buffer, size, this);
920+
}
921+
887922
void CudaExecutor::Deallocate(DeviceMemoryBase* mem) {
888923
auto status_or_memory_space = GetPointerMemorySpace(mem->opaque());
889924
if (!status_or_memory_space.ok()) {
@@ -892,12 +927,16 @@ void CudaExecutor::Deallocate(DeviceMemoryBase* mem) {
892927
}
893928
auto memory_space = status_or_memory_space.value();
894929
if (memory_space == MemoryType::kHost) {
895-
GpuDriver::HostDeallocate(gpu_context(), mem->opaque());
930+
HostDeallocate(gpu_context(), mem->opaque());
896931
} else {
897932
DeviceDeallocate(gpu_context(), mem->opaque());
898933
}
899934
}
900935

936+
void CudaExecutor::HostMemoryDeallocate(void* location) {
937+
return HostDeallocate(gpu_context(), location);
938+
}
939+
901940
bool CudaExecutor::SynchronizeAllActivity() {
902941
return gpu_context()->Synchronize().ok();
903942
}

third_party/xla/xla/stream_executor/cuda/cuda_executor.h

Lines changed: 2 additions & 14 deletions
Original file line number | Diff line number | Diff line change
@@ -30,7 +30,6 @@ limitations under the License.
3030
#include "absl/numeric/int128.h"
3131
#include "absl/status/status.h"
3232
#include "absl/status/statusor.h"
33-
#include "absl/strings/str_format.h"
3433
#include "absl/synchronization/mutex.h"
3534
#include "absl/types/span.h"
3635
#include "xla/stream_executor/blas.h"
@@ -46,7 +45,6 @@ limitations under the License.
4645
#include "xla/stream_executor/gpu/gpu_executor.h"
4746
#include "xla/stream_executor/gpu/gpu_kernel.h"
4847
#include "xla/stream_executor/gpu/gpu_types.h"
49-
#include "xla/stream_executor/host_memory_allocation.h"
5048
#include "xla/stream_executor/kernel.h"
5149
#include "xla/stream_executor/kernel_spec.h"
5250
#include "xla/stream_executor/memory_allocation.h"
@@ -118,19 +116,9 @@ class CudaExecutor : public GpuExecutor {
118116
void* UnifiedMemoryAllocate(uint64_t size) override;
119117
void UnifiedMemoryDeallocate(void* location) override;
120118
absl::StatusOr<std::unique_ptr<MemoryAllocation>> HostMemoryAllocate(
121-
uint64_t size) override {
122-
auto* buffer = GpuDriver::HostAllocate(gpu_context(), size);
123-
if (buffer == nullptr && size > 0) {
124-
return absl::InternalError(
125-
absl::StrFormat("Failed to allocate HostMemory of size %d", size));
126-
}
127-
return std::make_unique<HostMemoryAllocation>(buffer, size, this);
128-
}
129-
130-
void HostMemoryDeallocate(void* location) override {
131-
return GpuDriver::HostDeallocate(gpu_context(), location);
132-
}
119+
uint64_t size) override;
133120

121+
void HostMemoryDeallocate(void* location) override;
134122
bool HostMemoryRegister(void* location, uint64_t size) override;
135123
bool HostMemoryUnregister(void* location) override;
136124

third_party/xla/xla/stream_executor/gpu/gpu_driver.h

Lines changed: 0 additions & 12 deletions
Original file line number | Diff line number | Diff line change
@@ -69,18 +69,6 @@ class GpuDriver {
6969
// https://rocm.docs.amd.com/projects/HIPIFY/en/latest/tables/CUDA_Driver_API_functions_supported_by_HIP.html#stream-management
7070
static void DestroyStream(Context* context, GpuStreamHandle stream);
7171

72-
// Allocates page-locked and CUDA-registered memory on the host via
73-
// cuMemAllocHost/hipHostMalloc.
74-
// http://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__MEM.html#group__CUDA__MEM_1gdd8311286d2c2691605362c689bc64e0
75-
// https://rocm.docs.amd.com/projects/HIPIFY/en/latest/tables/CUDA_Driver_API_functions_supported_by_HIP.html#memory-management
76-
static void* HostAllocate(Context* context, uint64_t bytes);
77-
78-
// Deallocates a location created by HostAllocate, via
79-
// cuMemFreeHost/hipHostFree.
80-
// http://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__MEM.html#group__CUDA__MEM_1g62e0fdbe181dab6b1c90fa1a51c7b92c
81-
// https://rocm.docs.amd.com/projects/HIPIFY/en/latest/tables/CUDA_Driver_API_functions_supported_by_HIP.html#memory-management
82-
static void HostDeallocate(Context* context, void* location);
83-
8472
// Launches a CUDA/ROCm kernel via cuLaunchKernel/hipModuleLaunchKernel.
8573
// http://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__EXEC.html#group__CUDA__EXEC_1gb8f3dc3031b40da29d5f9a7139e52e15
8674
// https://rocm.docs.amd.com/projects/HIPIFY/en/latest/tables/CUDA_Driver_API_functions_supported_by_HIP.html#execution-control

third_party/xla/xla/stream_executor/rocm/rocm_driver.cc

Lines changed: 0 additions & 21 deletions
Original file line number | Diff line number | Diff line change
@@ -610,27 +610,6 @@ void GpuDriver::DestroyStream(Context* context, GpuStreamHandle stream) {
610610
}
611611
}
612612

613-
void* GpuDriver::HostAllocate(Context* context, uint64_t bytes) {
614-
ScopedActivateContext activation{context};
615-
void* host_mem = nullptr;
616-
// "Portable" memory is visible to all ROCM contexts. Safe for our use model.
617-
hipError_t res = wrap::hipHostMalloc(&host_mem, bytes, hipHostMallocPortable);
618-
if (res != hipSuccess) {
619-
LOG(ERROR) << "failed to alloc " << bytes
620-
<< " bytes on host: " << ToString(res);
621-
}
622-
return host_mem;
623-
}
624-
625-
void GpuDriver::HostDeallocate(Context* context, void* location) {
626-
ScopedActivateContext activation{context};
627-
hipError_t res = wrap::hipHostFree(location);
628-
if (res != hipSuccess) {
629-
LOG(ERROR) << "error deallocating host memory at " << location << ": "
630-
<< ToString(res);
631-
}
632-
}
633-
634613
absl::Status GpuDriver::SynchronizeStream(Context* context,
635614
GpuStreamHandle stream) {
636615
ScopedActivateContext activated{context};

third_party/xla/xla/stream_executor/rocm/rocm_executor.cc

Lines changed: 35 additions & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -58,9 +58,11 @@ limitations under the License.
5858
#include "xla/stream_executor/gpu/gpu_types.h"
5959
#include "xla/stream_executor/gpu/read_numa_node.h"
6060
#include "xla/stream_executor/gpu/scoped_activate_context.h"
61+
#include "xla/stream_executor/host_memory_allocation.h"
6162
#include "xla/stream_executor/kernel.h"
6263
#include "xla/stream_executor/kernel_spec.h"
6364
#include "xla/stream_executor/launch_dim.h"
65+
#include "xla/stream_executor/memory_allocation.h"
6466
#include "xla/stream_executor/module_spec.h"
6567
#include "xla/stream_executor/platform.h"
6668
#include "xla/stream_executor/platform/initialize.h"
@@ -462,6 +464,20 @@ void DeviceDeallocate(Context* context, void* location) {
462464
<< context->device_ordinal();
463465
}
464466
}
467+
468+
// Allocates memory on the host.
469+
void* HostAllocate(Context* context, uint64_t bytes) {
470+
ScopedActivateContext activation{context};
471+
void* host_mem = nullptr;
472+
// "Portable" memory is visible to all ROCM contexts. Safe for our use model.
473+
hipError_t res = wrap::hipHostMalloc(&host_mem, bytes, hipHostMallocPortable);
474+
if (res != hipSuccess) {
475+
LOG(ERROR) << "failed to alloc " << bytes
476+
<< " bytes on host: " << ToString(res);
477+
}
478+
return host_mem;
479+
}
480+
465481
} // namespace
466482

467483
RocmExecutor::~RocmExecutor() {
@@ -711,11 +727,29 @@ absl::Status RocmExecutor::LoadModuleFromHsaco(const char* hsaco,
711727
DeviceMemoryBase RocmExecutor::Allocate(uint64_t size, int64_t memory_space) {
712728
if (memory_space ==
713729
static_cast<int64_t>(stream_executor::MemoryType::kHost)) {
714-
return DeviceMemoryBase(GpuDriver::HostAllocate(gpu_context(), size), size);
730+
return DeviceMemoryBase(HostAllocate(gpu_context(), size), size);
715731
}
716732
CHECK_EQ(memory_space, 0);
717733
return DeviceMemoryBase(DeviceAllocate(gpu_context(), size), size);
718734
}
735+
absl::StatusOr<std::unique_ptr<MemoryAllocation>>
736+
RocmExecutor::HostMemoryAllocate(uint64_t size) {
737+
auto* buffer = HostAllocate(gpu_context(), size);
738+
if (buffer == nullptr && size > 0) {
739+
return absl::InternalError(
740+
absl::StrFormat("Failed to allocate HostMemory of size %d", size));
741+
}
742+
return std::make_unique<HostMemoryAllocation>(buffer, size, this);
743+
}
744+
745+
void RocmExecutor::HostMemoryDeallocate(void* location) {
746+
ScopedActivateContext activation{gpu_context()};
747+
hipError_t res = wrap::hipHostFree(location);
748+
if (res != hipSuccess) {
749+
LOG(ERROR) << "error deallocating host memory at " << location << ": "
750+
<< ToString(res);
751+
}
752+
}
719753

720754
void RocmExecutor::Deallocate(DeviceMemoryBase* mem) {
721755
DeviceDeallocate(gpu_context(), mem->opaque());

third_party/xla/xla/stream_executor/rocm/rocm_executor.h

Lines changed: 2 additions & 14 deletions
Original file line number | Diff line number | Diff line change
@@ -30,7 +30,6 @@ limitations under the License.
3030
#include "absl/numeric/int128.h"
3131
#include "absl/status/status.h"
3232
#include "absl/status/statusor.h"
33-
#include "absl/strings/str_format.h"
3433
#include "absl/synchronization/mutex.h"
3534
#include "absl/types/span.h"
3635
#include "xla/stream_executor/blas.h"
@@ -45,7 +44,6 @@ limitations under the License.
4544
#include "xla/stream_executor/gpu/gpu_executor.h"
4645
#include "xla/stream_executor/gpu/gpu_kernel.h"
4746
#include "xla/stream_executor/gpu/gpu_types.h"
48-
#include "xla/stream_executor/host_memory_allocation.h"
4947
#include "xla/stream_executor/kernel.h"
5048
#include "xla/stream_executor/kernel_spec.h"
5149
#include "xla/stream_executor/memory_allocation.h"
@@ -111,18 +109,8 @@ class RocmExecutor : public GpuExecutor {
111109

112110
void UnifiedMemoryDeallocate(void* location) override;
113111
absl::StatusOr<std::unique_ptr<MemoryAllocation>> HostMemoryAllocate(
114-
uint64_t size) override {
115-
auto* buffer = GpuDriver::HostAllocate(gpu_context(), size);
116-
if (buffer == nullptr && size > 0) {
117-
return absl::InternalError(
118-
absl::StrFormat("Failed to allocate HostMemory of size %d", size));
119-
}
120-
return std::make_unique<HostMemoryAllocation>(buffer, size, this);
121-
}
122-
123-
void HostMemoryDeallocate(void* location) override {
124-
return GpuDriver::HostDeallocate(gpu_context(), location);
125-
}
112+
uint64_t size) override;
113+
void HostMemoryDeallocate(void* location) override;
126114

127115
absl::StatusOr<MemoryType> GetPointerMemorySpace(const void* ptr) override;
128116

0 commit comments

Comments
 (0)