Revert "[libc] Remove 'packaged' GPU build support (#100208)"

jhuber6 · jhuber6 · commit 550b83d65875 · 2024-07-24T07:51:47.000-05:00
Summary: I forgot that the OpenMP tests still look for this, reverting for now until I can make a fix. This reverts commit c1c6ed8.
diff --git a/libc/cmake/modules/LLVMLibCLibraryRules.cmake b/libc/cmake/modules/LLVMLibCLibraryRules.cmake
@@ -83,6 +83,97 @@ function(get_all_object_file_deps result fq_deps_list)
   set(${result} ${all_deps} PARENT_SCOPE)
 endfunction()
 
+# A rule to build a library from a collection of entrypoint objects and bundle
+# it into a GPU fatbinary. Usage is the same as 'add_entrypoint_library'.
+# Usage:
+#     add_gpu_entrypoint_library(
+#       DEPENDS <list of add_entrypoint_object targets>
+#     )
+function(add_gpu_entrypoint_library target_name base_target_name)
+  cmake_parse_arguments(
+    "ENTRYPOINT_LIBRARY"
+    "" # No optional arguments
+    "" # No single value arguments
+    "DEPENDS" # Multi-value arguments
+    ${ARGN}
+  )
+  if(NOT ENTRYPOINT_LIBRARY_DEPENDS)
+    message(FATAL_ERROR "'add_entrypoint_library' target requires a DEPENDS list "
+                        "of 'add_entrypoint_object' targets.")
+  endif()
+
+  get_fq_deps_list(fq_deps_list ${ENTRYPOINT_LIBRARY_DEPENDS})
+  get_all_object_file_deps(all_deps "${fq_deps_list}")
+
+  # The GPU 'libc' needs to be exported in a format that can be linked with
+  # offloading langauges like OpenMP or CUDA. This wraps every GPU object into a
+  # fat binary and adds them to a static library.
+  set(objects "")
+  foreach(dep IN LISTS all_deps)
+    set(object $<$<STREQUAL:$<TARGET_NAME_IF_EXISTS:${dep}>,${dep}>:$<TARGET_OBJECTS:${dep}>>)
+    string(FIND ${dep} "." last_dot_loc REVERSE)
+    math(EXPR name_loc "${last_dot_loc} + 1")
+    string(SUBSTRING ${dep} ${name_loc} -1 name)
+    if(LIBC_TARGET_ARCHITECTURE_IS_NVPTX)
+      set(prefix --image=arch=generic,triple=nvptx64-nvidia-cuda,feature=+ptx63)
+    elseif(LIBC_TARGET_ARCHITECTURE_IS_AMDGPU)
+      set(prefix --image=arch=generic,triple=amdgcn-amd-amdhsa)
+    endif()
+
+    # Use the 'clang-offload-packager' to merge these files into a binary blob.
+    add_custom_command(
+      OUTPUT "${CMAKE_CURRENT_BINARY_DIR}/binary/${name}.gpubin"
+      COMMAND ${CMAKE_COMMAND} -E make_directory ${CMAKE_CURRENT_BINARY_DIR}/binary
+      COMMAND ${LIBC_CLANG_OFFLOAD_PACKAGER}
+              "${prefix},file=$<JOIN:${object},,file=>" -o
+              ${CMAKE_CURRENT_BINARY_DIR}/binary/${name}.gpubin
+      DEPENDS ${dep} ${base_target_name}
+      COMMENT "Packaging LLVM offloading binary for '${object}'"
+    )
+    add_custom_target(${dep}.__gpubin__ DEPENDS ${dep}
+                      "${CMAKE_CURRENT_BINARY_DIR}/binary/${name}.gpubin")
+    if(TARGET clang-offload-packager)
+      add_dependencies(${dep}.__gpubin__ clang-offload-packager)
+    endif()
+
+    # CMake does not permit setting the name on object files. In order to have
+    # human readable names we create an empty stub file with the entrypoint
+    # name. This empty file will then have the created binary blob embedded.
+    add_custom_command(
+      OUTPUT "${CMAKE_CURRENT_BINARY_DIR}/stubs/${name}.cpp"
+      COMMAND ${CMAKE_COMMAND} -E make_directory ${CMAKE_CURRENT_BINARY_DIR}/stubs
+      COMMAND ${CMAKE_COMMAND} -E touch ${CMAKE_CURRENT_BINARY_DIR}/stubs/${name}.cpp
+      DEPENDS ${dep} ${dep}.__gpubin__ ${base_target_name}
+    )
+    add_custom_target(${dep}.__stub__
+                      DEPENDS ${dep}.__gpubin__ "${CMAKE_CURRENT_BINARY_DIR}/stubs/${name}.cpp")
+
+    add_library(${dep}.__fatbin__
+      EXCLUDE_FROM_ALL OBJECT
+      "${CMAKE_CURRENT_BINARY_DIR}/stubs/${name}.cpp"
+    )
+
+    # This is always compiled for the LLVM host triple instead of the native GPU
+    # triple that is used by default in the build.
+    target_compile_options(${dep}.__fatbin__ BEFORE PRIVATE -nostdlib)
+    target_compile_options(${dep}.__fatbin__ PRIVATE
+      --target=${LLVM_HOST_TRIPLE}
+      "SHELL:-Xclang -fembed-offload-object=${CMAKE_CURRENT_BINARY_DIR}/binary/${name}.gpubin")
+    add_dependencies(${dep}.__fatbin__
+                     ${dep} ${dep}.__stub__ ${dep}.__gpubin__ ${base_target_name})
+
+    # Set the list of newly create fat binaries containing embedded device code.
+    list(APPEND objects $<TARGET_OBJECTS:${dep}.__fatbin__>)
+  endforeach()
+
+  add_library(
+    ${target_name}
+    STATIC
+      ${objects}
+  )
+  set_target_properties(${target_name} PROPERTIES LIBRARY_OUTPUT_DIRECTORY ${LIBC_LIBRARY_DIR})
+endfunction(add_gpu_entrypoint_library)
+
 # A rule to build a library from a collection of entrypoint objects and bundle
 # it in a single LLVM-IR bitcode file.
 # Usage:
diff --git a/libc/docs/gpu/building.rst b/libc/docs/gpu/building.rst
@@ -151,6 +151,25 @@ Build overview
 Once installed, the GPU build will create several files used for different
 targets. This section will briefly describe their purpose.
 
+**lib/<host-triple>/libcgpu-amdgpu.a or lib/libcgpu-amdgpu.a**
+  A static library containing fat binaries supporting AMD GPUs. These are built
+  using the support described in the `clang documentation
+  <https://clang.llvm.org/docs/OffloadingDesign.html>`_. These are intended to
+  be static libraries included natively for offloading languages like CUDA, HIP,
+  or OpenMP. This implements the standard C library.
+
+**lib/<host-triple>/libmgpu-amdgpu.a or lib/libmgpu-amdgpu.a**
+  A static library containing fat binaries that implements the standard math
+  library for AMD GPUs.
+
+**lib/<host-triple>/libcgpu-nvptx.a or lib/libcgpu-nvptx.a**
+  A static library containing fat binaries that implement the standard C library
+  for NVIDIA GPUs.
+
+**lib/<host-triple>/libmgpu-nvptx.a or lib/libmgpu-nvptx.a**
+  A static library containing fat binaries that implement the standard math
+  library for NVIDIA GPUs.
+
 **include/<target-triple>**
   The include directory where all of the generated headers for the target will
   go. These definitions are strictly for the GPU when being targeted directly.
diff --git a/libc/docs/gpu/using.rst b/libc/docs/gpu/using.rst
@@ -34,17 +34,16 @@ described in the `clang documentation
 by the OpenMP toolchain, but is currently opt-in for the CUDA and HIP toolchains
 through the ``--offload-new-driver``` and ``-fgpu-rdc`` flags.
 
-In order or link the GPU runtime, we simply pass this library to the embedded 
-device linker job. This can be done using the ``-Xoffload-linker`` option, which 
-forwards an argument to a ``clang`` job used to create the final GPU executable. 
-The toolchain should pick up the C libraries automatically in most cases, so 
-this shouldn't be necessary.
+The installation should contain a static library called ``libcgpu-amdgpu.a`` or
+``libcgpu-nvptx.a`` depending on which GPU architectures your build targeted.
+These contain fat binaries compatible with the offloading toolchain such that
+they can be used directly.
 
 .. code-block:: sh
 
-  $> clang openmp.c -fopenmp --offload-arch=gfx90a -Xoffload-linker -lc
-  $> clang cuda.cu --offload-arch=sm_80 --offload-new-driver -fgpu-rdc -Xoffload-linker -lc
-  $> clang hip.hip --offload-arch=gfx940 --offload-new-driver -fgpu-rdc -Xoffload-linker -lc
+  $> clang openmp.c -fopenmp --offload-arch=gfx90a -lcgpu-amdgpu
+  $> clang cuda.cu --offload-arch=sm_80 --offload-new-driver -fgpu-rdc -lcgpu-nvptx
+  $> clang hip.hip --offload-arch=gfx940 --offload-new-driver -fgpu-rdc -lcgpu-amdgpu
 
 This will automatically link in the needed function definitions if they were
 required by the user's application. Normally using the ``-fgpu-rdc`` option
diff --git a/libc/lib/CMakeLists.txt b/libc/lib/CMakeLists.txt
@@ -40,6 +40,20 @@ foreach(archive IN ZIP_LISTS
   # Add the offloading version of the library for offloading languages. These
   # are installed in the standard search path separate from the other libraries.
   if(LIBC_TARGET_OS_IS_GPU)
+    add_gpu_entrypoint_library(
+      ${archive_1}gpu
+      ${archive_1}
+      DEPENDS
+        ${${archive_2}}
+    )
+    set_target_properties(
+      ${archive_1}gpu
+      PROPERTIES
+        ARCHIVE_OUTPUT_NAME ${archive_0}gpu-${LIBC_TARGET_ARCHITECTURE}
+        ARCHIVE_OUTPUT_DIRECTORY ${LLVM_LIBRARY_OUTPUT_INTDIR}
+    )
+    list(APPEND added_gpu_archive_targets ${archive_1}gpu)
+
     add_bitcode_entrypoint_library(
       ${archive_1}bitcode
       ${archive_1}
@@ -51,6 +65,7 @@ foreach(archive IN ZIP_LISTS
       PROPERTIES
         OUTPUT_NAME ${archive_1}.bc
     )
+    add_dependencies(${archive_1}gpu ${archive_1}bitcode)
     list(APPEND added_gpu_bitcode_targets ${archive_1}bitcode)
   endif()
 endforeach()