Commit 369672c

Merge branch 'karpathy:master' into master
2 parents d0e7a59 + 29aacba

12 files changed: +423 −62 lines

Makefile

Lines changed: 1 addition & 0 deletions

@@ -62,6 +62,7 @@ $(info ---------------------------------------------)
 
 ifneq ($(OS), Windows_NT)
   NVCC := $(shell which nvcc 2>/dev/null)
+  NVCC_LDFLAGS += -lnvidia-ml
 
   # Function to test if the compiler accepts a given flag.
   define check_and_add_flag
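
The new `-lnvidia-ml` flag links against NVML (the NVIDIA Management Library), which lets host code query device status such as power draw. As an illustration only of what this flag enables (the calls below are from the public NVML API; this snippet is not part of the commit):

// nvml_power.c -- illustrative NVML probe; build with: gcc nvml_power.c -lnvidia-ml
#include <stdio.h>
#include <nvml.h>

int main(void) {
    nvmlDevice_t dev;
    unsigned int milliwatts;
    if (nvmlInit() != NVML_SUCCESS) { return 1; }  // initialize the NVML library
    nvmlDeviceGetHandleByIndex(0, &dev);           // handle to the first GPU
    if (nvmlDeviceGetPowerUsage(dev, &milliwatts) == NVML_SUCCESS) {
        printf("GPU 0 power draw: %.1f W\n", milliwatts / 1000.0);
    }
    nvmlShutdown();
    return 0;
}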

dev/cuda/Makefile

Lines changed: 7 additions & 4 deletions

@@ -9,15 +9,15 @@ ifeq ($(NVCC),)
 endif
 
 ifneq ($(CI),true) # if not in CI, then use the GPU query
-ifndef GPU_COMPUTE_CAPABILITY # set to defaults if: make GPU_COMPUTE_CAPABILITY=
+  ifndef GPU_COMPUTE_CAPABILITY # set to defaults if: make GPU_COMPUTE_CAPABILITY=
     GPU_COMPUTE_CAPABILITY = $(shell __nvcc_device_query) # assume if NVCC is present, then this likely is too
     GPU_COMPUTE_CAPABILITY := $(strip $(GPU_COMPUTE_CAPABILITY))
   endif
 endif
 
 # Compiler flags
-ifeq ($(GPU_COMPUTE_CAPABILITY),) # set to defaults if: make GPU_COMPUTE_CAPABILITY=
-CFLAGS = -O3 --use_fast_math
+ifeq ($(GPU_COMPUTE_CAPABILITY),) # set to defaults if: make GPU_COMPUTE_CAPABILITY=
+  CFLAGS = -O3 --use_fast_math
 else
   CFLAGS = -O3 --use_fast_math --generate-code arch=compute_$(GPU_COMPUTE_CAPABILITY),code=[compute_$(GPU_COMPUTE_CAPABILITY),sm_$(GPU_COMPUTE_CAPABILITY)]
 endif

@@ -30,7 +30,8 @@ MPI_PATHS = -I/usr/lib/x86_64-linux-gnu/openmpi/include -L/usr/lib/x86_64-linux-
 	$(NVCC) $(CFLAGS) $(NVCCFLAGS) $< -o $@
 
 # Build all targets
-TARGETS = adamw attention_backward attention_forward classifier_fused crossentropy_forward crossentropy_softmax_backward encoder_backward encoder_forward gelu_backward gelu_forward layernorm_backward layernorm_forward matmul_backward matmul_backward_bias matmul_forward nccl_all_reduce residual_forward softmax_forward trimat_forward fused_residual_forward global_norm
+TARGETS = adamw attention_backward attention_forward classifier_fused crossentropy_forward crossentropy_softmax_backward encoder_backward encoder_forward gelu_backward gelu_forward layernorm_backward layernorm_forward matmul_backward matmul_backward_bias matmul_forward nccl_all_reduce residual_forward softmax_forward trimat_forward fused_residual_forward global_norm permute
+
 all: $(TARGETS)
 all_ptx: $(TARGETS:%=%.ptx)
 all_sass: $(TARGETS:%=%.sass)

@@ -64,6 +65,8 @@ matmul_backward: matmul_backward.cu
 adamw: adamw.cu
 global_norm: global_norm.cu
 
+permute: permute.cu
+
 # NCCL communication kernels
 nccl_all_reduce: nccl_all_reduce.cu
 	$(NVCC) -lmpi -lnccl $(NVCCFLAGS) $(MPI_PATHS) nccl_all_reduce.cu -o nccl_all_reduce

dev/cuda/attention_backward.cu

Lines changed: 1 addition & 0 deletions

@@ -1137,6 +1137,7 @@ int main(int argc, char **argv) {
     free(dinp);
     free(dpreatt);
     free(datt);
+    free(h_dinp);
     cudaCheck(cudaFree(d_inp));
     cudaCheck(cudaFree(d_qkvr));
     cudaCheck(cudaFree(d_preatt));

dev/cuda/attention_forward.cu

Lines changed: 1 addition & 0 deletions

@@ -1377,6 +1377,7 @@ int main(int argc, char **argv) {
     cudaCheck(cudaFree(d_preatt));
     cudaCheck(cudaFree(d_att));
     cudaCheck(cudaFree(d_inp));
+    cudaCheck(cudaFree(d_stats));
     cublasDestroy(cublas_handle);
 
 #ifdef ENABLE_CUDNN

dev/cuda/classifier_fused.cu

Lines changed: 1 addition & 0 deletions

@@ -766,6 +766,7 @@ int main(int argc, char **argv) {
     cudaCheck(cudaFree(d_logits));
     cudaCheck(cudaFree(d_dlosses));
     cudaCheck(cudaFree(d_targets));
+    cudaCheck(cudaFree(d_dlogits_no_pad));
 
     return 0;
 }

dev/cuda/nccl_all_reduce.cu

Lines changed: 1 addition & 0 deletions

@@ -193,5 +193,6 @@ int main(int argc, char **argv) {
 
     free(all_reduce_buffer_host);
     cudaCheck(cudaFree(all_reduce_buffer));
+    cudaCheck(cudaFree(all_reduce_buffer_recv));
     multi_gpu_config_free(&multi_gpu_config);
 }

dev/cuda/permute.cu

Lines changed: 181 additions & 0 deletions (new file)

/*
Kernels to demonstrate the permute operation.

Compile example:
nvcc -O3 permute.cu -o permute

The goal is to permute a 4D matrix from its original shape (dim1, dim2, dim3, dim4) to a new shape (dim4, dim3, dim1, dim2).

Before permuting, we need to understand how to access elements in the flattened (linear) form of the matrix.

Given:

dim1 = size of the 1st dimension
dim2 = size of the 2nd dimension
dim3 = size of the 3rd dimension
dim4 = size of the 4th dimension

any element of the 4D matrix sits at a position (i1, i2, i3, i4), where:

i1 is the index in dimension 1
i2 is the index in dimension 2
i3 is the index in dimension 3
i4 is the index in dimension 4

If these index calculations seem hard to follow at first, observe the pattern below; with a little practice it becomes a mental model.

To recover the indices from a linear index idx, use the following formulas:

i1 = (idx / (dim2 * dim3 * dim4)) % dim1;
i2 = (idx / (dim3 * dim4)) % dim2;
i3 = (idx / dim4) % dim3;
i4 = idx % dim4;

Pattern: to find the index for any dimension, divide idx by the product of all subsequent dimensions,
then take the result modulo the size of the current dimension.

Conversely, the linear index in the flattened 1D array is:

linear_idx = i1 * (dim2 * dim3 * dim4) + i2 * (dim3 * dim4) + i3 * dim4 + i4

This linear index uniquely identifies the position of the element in the 1D array.

To permute the matrix, we rearrange the indices according to the new shape.
In this case, we permute from (dim1, dim2, dim3, dim4) to (dim4, dim3, dim1, dim2).

The new dimension order after the permutation is as follows:

dim1 becomes the new 3rd dimension.
dim2 becomes the new 4th dimension.
dim3 becomes the new 2nd dimension.
dim4 becomes the new 1st dimension.

permuted_idx = i4 * (dim3 * dim1 * dim2) + i3 * (dim1 * dim2) + i1 * dim2 + i2;

Here is how this works:

i4 * (dim3 * dim1 * dim2): how many complete dim3 x dim1 x dim2 blocks come before the current i4 block.
i3 * (dim1 * dim2): the offset within the current i4 block, i.e. which i3 block we are in.
i1 * dim2: the offset within the current i3 block, i.e. which i1 row we are in.
i2: the offset within the current i1 row.

Finally, we store the value at index idx of the original matrix at index permuted_idx of the permuted matrix.

--------------------------------------------------------------------------------------------------------------------------------------------------------

The same approach can be followed to permute matrices with any number of dimensions.
*/

#include <cuda_runtime.h>
#include <stdio.h>
#include <stdlib.h>
#include <ctime>
#include <cmath>

#include "common.h"

// CPU function to permute a 4D matrix
void permute_cpu(const float* matrix, float* out_matrix, int dim1, int dim2, int dim3, int dim4) {
    int total_threads = dim1 * dim2 * dim3 * dim4;

    for (int idx = 0; idx < total_threads; idx++) {
        // Calculate the 4D indices from the linear index
        int i1 = (idx / (dim2 * dim3 * dim4)) % dim1;
        int i2 = (idx / (dim3 * dim4)) % dim2;
        int i3 = (idx / dim4) % dim3;
        int i4 = idx % dim4;

        // Compute the new index for the permuted matrix
        // Transpose from (dim1, dim2, dim3, dim4) to (dim4, dim3, dim1, dim2)
        int permuted_idx = i4 * (dim3 * dim1 * dim2) + i3 * (dim1 * dim2) + i1 * dim2 + i2;
        out_matrix[permuted_idx] = matrix[idx];
    }
}

// CUDA kernel to permute a 4D matrix
__global__ void permute_kernel(const float* matrix, float* out_matrix, int dim1, int dim2, int dim3, int dim4) {
    int idx = blockIdx.x * blockDim.x + threadIdx.x;

    // Ensure index is within bounds
    if (idx < dim1 * dim2 * dim3 * dim4) {
        // Calculate the 4D indices from the linear index
        int i1 = (idx / (dim2 * dim3 * dim4)) % dim1;
        int i2 = (idx / (dim3 * dim4)) % dim2;
        int i3 = (idx / dim4) % dim3;
        int i4 = idx % dim4;

        // Compute the new index for the permuted matrix
        // Transpose from (dim1, dim2, dim3, dim4) to (dim4, dim3, dim1, dim2)
        int permuted_idx = i4 * (dim3 * dim1 * dim2) + i3 * (dim1 * dim2) + i1 * dim2 + i2;
        out_matrix[permuted_idx] = matrix[idx];
    }
}

int main() {
    int dim_1 = 24;
    int dim_2 = 42;
    int dim_3 = 20;
    int dim_4 = 32;

    // Set up the device
    int deviceIdx = 0;
    cudaSetDevice(deviceIdx);
    cudaDeviceProp deviceProp;
    cudaGetDeviceProperties(&deviceProp, deviceIdx);
    printf("Device %d: %s\n", deviceIdx, deviceProp.name);

    // Allocate host memory and initialize the matrix with random values
    float* matrix = make_random_float(dim_1 * dim_2 * dim_3 * dim_4);
    float* permuted_matrix = (float*)malloc(dim_1 * dim_2 * dim_3 * dim_4 * sizeof(float));

    // Allocate device memory
    float *d_matrix, *d_permuted_matrix;
    cudaMalloc(&d_matrix, dim_1 * dim_2 * dim_3 * dim_4 * sizeof(float));
    cudaMalloc(&d_permuted_matrix, dim_1 * dim_2 * dim_3 * dim_4 * sizeof(float));

    // Copy matrix from host to device
    cudaMemcpy(d_matrix, matrix, dim_1 * dim_2 * dim_3 * dim_4 * sizeof(float), cudaMemcpyHostToDevice);

    // Perform permutation on CPU
    clock_t start = clock();
    permute_cpu(matrix, permuted_matrix, dim_1, dim_2, dim_3, dim_4);
    clock_t end = clock();
    double elapsed_time_cpu = (double)(end - start) * 1000.0 / CLOCKS_PER_SEC; // in ms

    // Define block and grid sizes
    dim3 blockSize(256);
    int totalThreads = dim_1 * dim_2 * dim_3 * dim_4;
    int gridSize = (totalThreads + blockSize.x - 1) / blockSize.x; // Compute grid size

    // Launch CUDA kernel to perform permutation
    permute_kernel<<<gridSize, blockSize>>>(d_matrix, d_permuted_matrix, dim_1, dim_2, dim_3, dim_4);
    cudaDeviceSynchronize(); // Ensure kernel execution is complete

    // Verify results
    printf("Checking correctness...\n");
    validate_result(d_permuted_matrix, permuted_matrix, "permuted_matrix", dim_1 * dim_2 * dim_3 * dim_4, 1e-5f);
    printf("All results match.\n\n");

    // Benchmark the kernel
    int repeat_times = 1000;
    float elapsed_time = benchmark_kernel(repeat_times, permute_kernel,
                                          d_matrix, d_permuted_matrix, dim_1, dim_2, dim_3, dim_4);
    printf("time gpu %.4f ms\n", elapsed_time);
    printf("time cpu %.4f ms\n", elapsed_time_cpu);

    // Free allocated memory
    free(matrix);
    free(permuted_matrix);
    cudaFree(d_matrix);
    cudaFree(d_permuted_matrix);

    return 0;
}
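
Since the permutation is a bijection on indices, a quick way to sanity-check the index arithmetic is a host-side round trip: map every linear index forward exactly as the kernel does, then decode the result in the permuted shape and confirm you land back on the same (i1, i2, i3, i4). A minimal standalone sketch (not part of the commit):

#include <assert.h>
#include <stdio.h>

// Verify that the map (dim1,dim2,dim3,dim4) -> (dim4,dim3,dim1,dim2) round-trips.
int main(void) {
    const int dim1 = 3, dim2 = 5, dim3 = 4, dim4 = 2;
    const int total = dim1 * dim2 * dim3 * dim4;
    for (int idx = 0; idx < total; idx++) {
        int i1 = (idx / (dim2 * dim3 * dim4)) % dim1;
        int i2 = (idx / (dim3 * dim4)) % dim2;
        int i3 = (idx / dim4) % dim3;
        int i4 = idx % dim4;
        // forward map, exactly as in permute_kernel
        int p = i4 * (dim3 * dim1 * dim2) + i3 * (dim1 * dim2) + i1 * dim2 + i2;
        // inverse: decode p in the permuted shape (dim4, dim3, dim1, dim2)
        int j4 = (p / (dim3 * dim1 * dim2)) % dim4;
        int j3 = (p / (dim1 * dim2)) % dim3;
        int j1 = (p / dim2) % dim1;
        int j2 = p % dim2;
        assert(j1 == i1 && j2 == i2 && j3 == i3 && j4 == i4);
    }
    printf("index map round-trips for all %d elements\n", total);
    return 0;
}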

dev/cuda/trimat_forward.cu

Lines changed: 1 addition & 0 deletions

@@ -643,6 +643,7 @@ int main(int argc, char **argv) {
     free(inp);
     cudaCheck(cudaFree(d_out));
     cudaCheck(cudaFree(d_inp));
+    cudaCheck(cudaFree(d_qkvr));
     cublasDestroy(cublas_handle);
 
     return 0;

llmc/adamw.cuh

Lines changed: 21 additions & 1 deletion

@@ -61,6 +61,16 @@ __global__ void adamw_kernel3(Tp* params_memory, float* master_params_memory, Tg
     );
 }
 
+template <typename Tp>
+__global__ void init_from_master_kernel(Tp* params_memory, float* master_params_memory, size_t num_parameters,
+                                        ptrdiff_t w_stride, ptrdiff_t s_stride, unsigned int seed) {
+    size_t idx = blockIdx.x * blockDim.x + threadIdx.x;
+    if (idx >= num_parameters) { return; }
+    params_memory += blockIdx.y * w_stride; // adjust for layer offset
+    master_params_memory += blockIdx.y * s_stride;
+    stochastic_rounding(master_params_memory[idx], &params_memory[idx], seed);
+}
+
 template <typename Tp, typename Tg>
 void adamw_update(Tp* params_memory, float* master_params_memory, Tg* grads_memory, float* m_memory, float* v_memory, size_t num_parameters,
                   ptrdiff_t w_stride, ptrdiff_t g_stride, ptrdiff_t s_stride, int num_slices, float learning_rate, float beta1, float beta2, int t, float eps, float weight_decay,

@@ -75,4 +85,14 @@ void adamw_update(Tp* params_memory, float* master_params_memory, Tg* grads_memo
                   learning_rate, beta1, beta2, beta1_correction, beta2_correction, eps, weight_decay,
                   grad_scale, seed);
     cudaCheck(cudaGetLastError());
-}
+}
+
+template <typename Tp>
+void init_from_master(Tp* params_memory, float* master_params_memory, size_t num_parameters,
+                      ptrdiff_t w_stride, ptrdiff_t s_stride, int num_slices, unsigned int seed, cudaStream_t stream) {
+    int block_size = 512; // must match block size of adamw_update so that RNG also matches
+    int num_blocks = CEIL_DIV(num_parameters, block_size);
+    init_from_master_kernel<<<dim3(num_blocks, num_slices), block_size, 0, stream>>>
+        (params_memory, master_params_memory, num_parameters, w_stride, s_stride, seed);
+    cudaCheck(cudaGetLastError());
+}
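
The new init_from_master re-materializes the low-precision weights from the fp32 master copy via stochastic_rounding (defined elsewhere in llmc); matching the block size and seed of adamw_update keeps the random rounding decisions reproducible across the two paths. As a rough illustration of the idea only (llm.c's actual device implementation differs in details), stochastic rounding from fp32 to bf16 can be sketched on the CPU like this:

#include <stdint.h>
#include <string.h>
#include <stdio.h>

// Illustrative CPU sketch of stochastic rounding fp32 -> bf16 (not llm.c's kernel).
// bf16 keeps the top 16 bits of an fp32. Instead of truncating (biased down) or
// rounding to nearest, add a random offset below the cut so the result is
// unbiased in expectation. NaN/Inf handling is omitted for brevity.
static uint16_t stochastic_round_bf16(float x, uint32_t rnd) {
    uint32_t bits;
    memcpy(&bits, &x, sizeof(bits)); // type-pun via memcpy, avoiding UB
    bits += rnd & 0xFFFFu;           // random carry-in below the bf16 mantissa cut
    return (uint16_t)(bits >> 16);   // keep the upper 16 bits
}

int main(void) {
    // with rnd = 0 the value truncates down; a large rnd can round it up
    printf("%04x\n", stochastic_round_bf16(1.00001f, 0u));      // 3f80
    printf("%04x\n", stochastic_round_bf16(1.00001f, 0xFFFFu)); // 3f81
    return 0;
}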
