Skip to content

Commit 70a204b

Browse files
committed
remove dyanmic device memory alloc free in reduce kernel
1 parent fc5b58d commit 70a204b

File tree

3 files changed

+44
-81
lines changed

3 files changed

+44
-81
lines changed

src/cpp/main/main.cu

Lines changed: 6 additions & 56 deletions
Original file line numberDiff line numberDiff line change
@@ -138,6 +138,11 @@ void initialize(DataSet _host_dataset, DataSet _dev_dataset, ParamSet *_param) {
138138
unsigned face_count = host_dataset.mesh.mesh.face.size;
139139
unsigned hinge_count = host_dataset.mesh.mesh.hinge.size;
140140
unsigned tet_count = host_dataset.mesh.mesh.tet.size;
141+
142+
const unsigned max_reduce_count = std::max(
143+
std::max(face_count, edge_count), std::max(tet_count, 3 * vert_count));
144+
utility::set_max_reduce_count(max_reduce_count);
145+
141146
unsigned collision_mesh_vert_count =
142147
host_dataset.constraint.mesh.active
143148
? host_dataset.constraint.mesh.vertex.size
@@ -238,43 +243,8 @@ StepResult advance() {
238243
const unsigned shell_face_count = host_dataset.shell_face_count;
239244
const unsigned tet_count = host_data.mesh.mesh.tet.size;
240245
const float strain_limit_sum = prm.strain_limit_tau + prm.strain_limit_eps;
241-
SimpleLog::set(prm.time);
242-
243-
// Name: Vertex Count
244-
// Format: list[(vid_time,int)]
245-
// Description:
246-
// Total vertex count in the scene. The format is time-dependent
247-
// but should not change during the simulation.
248-
logging.mark("vertex count", vertex_count);
249246

250-
// Name: Rod Count
251-
// Format: list[(vid_time,int)]
252-
// Description:
253-
// Total edge rod element count in the scene. The format is time-dependent
254-
// but should not change during the simulation.
255-
logging.mark("rod count", host_data.rod_count);
256-
257-
// Name: Shell Count
258-
// Format: list[(vid_time,int)]
259-
// Description:
260-
// Total triangular shell element count in the scene. The format is
261-
// time-dependent but should not change during the simulation.
262-
logging.mark("shell count", host_data.shell_face_count);
263-
264-
// Name: Triangle Count
265-
// Format: list[(vid_time,int)]
266-
// Map: triangle_count
267-
// Description:
268-
// Total triangular shell element count in the scene. The format is
269-
// time-dependent but should not change during the simulation.
270-
logging.mark("face count", host_data.mesh.mesh.face.size);
271-
272-
// Name: Tet Count
273-
// Format: list[(vid_time,int)]
274-
// Description:
275-
// Total tetrahedral element count in the scene. The format is
276-
// time-dependent but should not change during the simulation.
277-
logging.mark("tet count", host_data.mesh.mesh.tet.size);
247+
SimpleLog::set(prm.time);
278248

279249
logging.push("build_kinematic");
280250
build_kinematic(host_dataset, dev_dataset, *param);
@@ -307,16 +277,6 @@ StepResult advance() {
307277
tmp_scalar[i] = data.prop.face[i].mass;
308278
}
309279
} DISPATCH_END;
310-
total_shell_mass = utility::sum_array(tmp_scalar, shell_face_count);
311-
}
312-
313-
if (total_shell_mass > 0.0f) {
314-
// Name: Total Shell Mass
315-
// Format: list[(vid_time,kg)]
316-
// Description:
317-
// Total mass of all the shell elements in the scene.
318-
// Should not change during the simulation.
319-
logging.mark("total shell mass", total_shell_mass);
320280
}
321281

322282
float total_solid_mass = 0.0f;
@@ -328,16 +288,6 @@ StepResult advance() {
328288
tmp_scalar[i] = data.prop.tet[i].mass;
329289
}
330290
} DISPATCH_END;
331-
total_solid_mass = utility::sum_array(tmp_scalar, tet_count);
332-
}
333-
334-
if (total_solid_mass > 0.0f) {
335-
// Name: Total Solid Mass
336-
// Format: list[(vid_time,kg)]
337-
// Description:
338-
// Total mass of all the tet elements in the scene.
339-
// Should not change during the simulation.
340-
logging.mark("total solid mass", total_solid_mass);
341291
}
342292

343293
float dt = param->dt;

src/cpp/utility/utility.cu

Lines changed: 36 additions & 25 deletions
Original file line numberDiff line numberDiff line change
@@ -16,6 +16,22 @@
1616

1717
namespace utility {
1818

19+
struct ReduceInfo {
20+
unsigned n = 0;
21+
unsigned *d_block_sums = nullptr;
22+
unsigned *h_results = nullptr;
23+
24+
void init(unsigned n) {
25+
this->n = n;
26+
unsigned num_blocks = (n + BLOCK_SIZE - 1) / BLOCK_SIZE;
27+
CUDA_HANDLE_ERROR(
28+
cudaMalloc(&d_block_sums, num_blocks * sizeof(unsigned)));
29+
h_results = new unsigned[num_blocks];
30+
}
31+
};
32+
33+
static ReduceInfo reduce_info;
34+
1935
__device__ Vec3f compute_vertex_normal(const DataSet &data,
2036
const Vec<Vec3f> &vertex, unsigned i) {
2137
Vec3f normal = Vec3f::Zero();
@@ -216,32 +232,25 @@ __global__ void reduce_op_kernel(const T *input, Y *output, Op func, Y init_val,
216232

217233
template <class T, class Y, typename Op>
218234
Y reduce(const T *d_input, Op func, Y init_val, unsigned n) {
219-
unsigned grid_size = (n + BLOCK_SIZE - 1) / BLOCK_SIZE;
220-
const unsigned scale_factor = 2;
221-
static Y *d_output = nullptr;
222-
static Y *h_results = nullptr;
223-
static unsigned max_grid_size = 0;
224-
if (d_output == nullptr) {
225-
max_grid_size = scale_factor * grid_size;
226-
cudaMalloc(&d_output, max_grid_size * sizeof(Y));
227-
h_results = new Y[max_grid_size];
228-
} else if (grid_size > max_grid_size) {
229-
max_grid_size = scale_factor * grid_size;
230-
cudaFree(d_output);
231-
delete[] h_results;
232-
cudaMalloc(&d_output, max_grid_size * sizeof(Y));
233-
h_results = new Y[max_grid_size];
234-
}
235-
size_t shared_mem_size = sizeof(Y) * BLOCK_SIZE;
236-
reduce_op_kernel<T, Y><<<grid_size, BLOCK_SIZE, shared_mem_size>>>(
237-
d_input, d_output, func, init_val, n);
238-
cudaMemcpy(h_results, d_output, grid_size * sizeof(Y),
239-
cudaMemcpyDeviceToHost);
240-
Y result = init_val;
241-
for (unsigned i = 0; i < grid_size; i++) {
242-
result = func(result, h_results[i]);
235+
if (sizeof(Y) * n <= sizeof(unsigned) * reduce_info.n) {
236+
unsigned grid_size = (n + BLOCK_SIZE - 1) / BLOCK_SIZE;
237+
Y *d_output = reinterpret_cast<Y *>(reduce_info.d_block_sums);
238+
Y *h_results = reinterpret_cast<Y *>(reduce_info.h_results);
239+
size_t shared_mem_size = sizeof(Y) * BLOCK_SIZE;
240+
reduce_op_kernel<T, Y><<<grid_size, BLOCK_SIZE, shared_mem_size>>>(
241+
d_input, d_output, func, init_val, n);
242+
cudaMemcpy(h_results, d_output, grid_size * sizeof(Y),
243+
cudaMemcpyDeviceToHost);
244+
Y result = init_val;
245+
for (unsigned i = 0; i < grid_size; i++) {
246+
result = func(result, h_results[i]);
247+
}
248+
return result;
249+
} else {
250+
fprintf(stderr, "Error: reduce buffer size is too small\n");
251+
fprintf(stderr, "n: %u, reduce_info.n: %u\n", n, reduce_info.n);
252+
exit(1);
243253
}
244-
return result;
245254
}
246255

247256
template <class T> T sum_array(Vec<T> array, unsigned size) {
@@ -288,6 +297,8 @@ __device__ float get_wind_weight(float time) {
288297
return t * (0.5f * (1.0f + sinf(angle))) + (1.0f - t);
289298
}
290299

300+
void set_max_reduce_count(unsigned n) { reduce_info.init(n); }
301+
291302
} // namespace utility
292303

293304
template float utility::sum_array(Vec<float> array, unsigned size);

src/cpp/utility/utility.hpp

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -73,6 +73,8 @@ void compute_svd(DataSet data, Vec<Vec3f> curr, Vec<Svd3x2> svd,
7373
ParamSet param);
7474
__device__ float get_wind_weight(float time);
7575

76+
void set_max_reduce_count(unsigned n);
77+
7678
} // namespace utility
7779

7880
#endif

0 commit comments

Comments
 (0)