remove dyanmic device memory alloc free in reduce kernel

ryichando · ryichando · commit 70a204b43fbc · 2025-01-13T15:41:37.000+09:00
diff --git a/src/cpp/main/main.cu b/src/cpp/main/main.cu
@@ -138,6 +138,11 @@ void initialize(DataSet _host_dataset, DataSet _dev_dataset, ParamSet *_param) {
     unsigned face_count = host_dataset.mesh.mesh.face.size;
     unsigned hinge_count = host_dataset.mesh.mesh.hinge.size;
     unsigned tet_count = host_dataset.mesh.mesh.tet.size;
+
+    const unsigned max_reduce_count = std::max(
+        std::max(face_count, edge_count), std::max(tet_count, 3 * vert_count));
+    utility::set_max_reduce_count(max_reduce_count);
+
     unsigned collision_mesh_vert_count =
         host_dataset.constraint.mesh.active
             ? host_dataset.constraint.mesh.vertex.size
@@ -238,43 +243,8 @@ StepResult advance() {
     const unsigned shell_face_count = host_dataset.shell_face_count;
     const unsigned tet_count = host_data.mesh.mesh.tet.size;
     const float strain_limit_sum = prm.strain_limit_tau + prm.strain_limit_eps;
-    SimpleLog::set(prm.time);
-
-    // Name: Vertex Count
-    // Format: list[(vid_time,int)]
-    // Description:
-    // Total vertex count in the scene. The format is time-dependent
-    // but should not change during the simulation.
-    logging.mark("vertex count", vertex_count);
 
-    // Name: Rod Count
-    // Format: list[(vid_time,int)]
-    // Description:
-    // Total edge rod element count in the scene. The format is time-dependent
-    // but should not change during the simulation.
-    logging.mark("rod count", host_data.rod_count);
-
-    // Name: Shell Count
-    // Format: list[(vid_time,int)]
-    // Description:
-    // Total triangular shell element count in the scene. The format is
-    // time-dependent but should not change during the simulation.
-    logging.mark("shell count", host_data.shell_face_count);
-
-    // Name: Triangle Count
-    // Format: list[(vid_time,int)]
-    // Map: triangle_count
-    // Description:
-    // Total triangular shell element count in the scene. The format is
-    // time-dependent but should not change during the simulation.
-    logging.mark("face count", host_data.mesh.mesh.face.size);
-
-    // Name: Tet Count
-    // Format: list[(vid_time,int)]
-    // Description:
-    // Total tetrahedral element count in the scene. The format is
-    // time-dependent but should not change during the simulation.
-    logging.mark("tet count", host_data.mesh.mesh.tet.size);
+    SimpleLog::set(prm.time);
 
     logging.push("build_kinematic");
     build_kinematic(host_dataset, dev_dataset, *param);
@@ -307,16 +277,6 @@ StepResult advance() {
                 tmp_scalar[i] = data.prop.face[i].mass;
             }
         } DISPATCH_END;
-        total_shell_mass = utility::sum_array(tmp_scalar, shell_face_count);
-    }
-
-    if (total_shell_mass > 0.0f) {
-        // Name: Total Shell Mass
-        // Format: list[(vid_time,kg)]
-        // Description:
-        // Total mass of all the shell elements in the scene.
-        // Should not change during the simulation.
-        logging.mark("total shell mass", total_shell_mass);
     }
 
     float total_solid_mass = 0.0f;
@@ -328,16 +288,6 @@ StepResult advance() {
                 tmp_scalar[i] = data.prop.tet[i].mass;
             }
         } DISPATCH_END;
-        total_solid_mass = utility::sum_array(tmp_scalar, tet_count);
-    }
-
-    if (total_solid_mass > 0.0f) {
-        // Name: Total Solid Mass
-        // Format: list[(vid_time,kg)]
-        // Description:
-        // Total mass of all the tet elements in the scene.
-        // Should not change during the simulation.
-        logging.mark("total solid mass", total_solid_mass);
     }
 
     float dt = param->dt;
diff --git a/src/cpp/utility/utility.cu b/src/cpp/utility/utility.cu
@@ -16,6 +16,22 @@
 
 namespace utility {
 
+struct ReduceInfo {
+    unsigned n = 0;
+    unsigned *d_block_sums = nullptr;
+    unsigned *h_results = nullptr;
+
+    void init(unsigned n) {
+        this->n = n;
+        unsigned num_blocks = (n + BLOCK_SIZE - 1) / BLOCK_SIZE;
+        CUDA_HANDLE_ERROR(
+            cudaMalloc(&d_block_sums, num_blocks * sizeof(unsigned)));
+        h_results = new unsigned[num_blocks];
+    }
+};
+
+static ReduceInfo reduce_info;
+
 __device__ Vec3f compute_vertex_normal(const DataSet &data,
                                        const Vec<Vec3f> &vertex, unsigned i) {
     Vec3f normal = Vec3f::Zero();
@@ -216,32 +232,25 @@ __global__ void reduce_op_kernel(const T *input, Y *output, Op func, Y init_val,
 
 template <class T, class Y, typename Op>
 Y reduce(const T *d_input, Op func, Y init_val, unsigned n) {
-    unsigned grid_size = (n + BLOCK_SIZE - 1) / BLOCK_SIZE;
-    const unsigned scale_factor = 2;
-    static Y *d_output = nullptr;
-    static Y *h_results = nullptr;
-    static unsigned max_grid_size = 0;
-    if (d_output == nullptr) {
-        max_grid_size = scale_factor * grid_size;
-        cudaMalloc(&d_output, max_grid_size * sizeof(Y));
-        h_results = new Y[max_grid_size];
-    } else if (grid_size > max_grid_size) {
-        max_grid_size = scale_factor * grid_size;
-        cudaFree(d_output);
-        delete[] h_results;
-        cudaMalloc(&d_output, max_grid_size * sizeof(Y));
-        h_results = new Y[max_grid_size];
-    }
-    size_t shared_mem_size = sizeof(Y) * BLOCK_SIZE;
-    reduce_op_kernel<T, Y><<<grid_size, BLOCK_SIZE, shared_mem_size>>>(
-        d_input, d_output, func, init_val, n);
-    cudaMemcpy(h_results, d_output, grid_size * sizeof(Y),
-               cudaMemcpyDeviceToHost);
-    Y result = init_val;
-    for (unsigned i = 0; i < grid_size; i++) {
-        result = func(result, h_results[i]);
+    if (sizeof(Y) * n <= sizeof(unsigned) * reduce_info.n) {
+        unsigned grid_size = (n + BLOCK_SIZE - 1) / BLOCK_SIZE;
+        Y *d_output = reinterpret_cast<Y *>(reduce_info.d_block_sums);
+        Y *h_results = reinterpret_cast<Y *>(reduce_info.h_results);
+        size_t shared_mem_size = sizeof(Y) * BLOCK_SIZE;
+        reduce_op_kernel<T, Y><<<grid_size, BLOCK_SIZE, shared_mem_size>>>(
+            d_input, d_output, func, init_val, n);
+        cudaMemcpy(h_results, d_output, grid_size * sizeof(Y),
+                   cudaMemcpyDeviceToHost);
+        Y result = init_val;
+        for (unsigned i = 0; i < grid_size; i++) {
+            result = func(result, h_results[i]);
+        }
+        return result;
+    } else {
+        fprintf(stderr, "Error: reduce buffer size is too small\n");
+        fprintf(stderr, "n: %u, reduce_info.n: %u\n", n, reduce_info.n);
+        exit(1);
     }
-    return result;
 }
 
 template <class T> T sum_array(Vec<T> array, unsigned size) {
@@ -288,6 +297,8 @@ __device__ float get_wind_weight(float time) {
     return t * (0.5f * (1.0f + sinf(angle))) + (1.0f - t);
 }
 
+void set_max_reduce_count(unsigned n) { reduce_info.init(n); }
+
 } // namespace utility
 
 template float utility::sum_array(Vec<float> array, unsigned size);
diff --git a/src/cpp/utility/utility.hpp b/src/cpp/utility/utility.hpp
@@ -73,6 +73,8 @@ void compute_svd(DataSet data, Vec<Vec3f> curr, Vec<Svd3x2> svd,
                  ParamSet param);
 __device__ float get_wind_weight(float time);
 
+void set_max_reduce_count(unsigned n);
+
 } // namespace utility
 
 #endif