skip setup and teardown

quantumsteve · quantumsteve · commit 4baaa7f593b7 · 2026-04-28T17:59:20.000-04:00
Signed-off-by: Steven Hahn <hahnse@ornl.gov>
diff --git a/bin/cuda/gpu_benchmark.cu b/bin/cuda/gpu_benchmark.cu
@@ -41,33 +41,33 @@ void runBenchmark(long max_work) {
   cudaEvent_t gpu_start, gpu_stop;
   CUDA_CHECK(cudaEventCreate(&gpu_start));
   CUDA_CHECK(cudaEventCreate(&gpu_stop));
-  CUDA_CHECK(cudaEventRecord(gpu_start, 0));
 
   // set kernel
   dim3 gridSize = 256;
   dim3 blockSize = 256;
   setup_kernel<<<gridSize, blockSize>>>(d_state);
 
   // monte carlo kernel
+  CUDA_CHECK(cudaEventRecord(gpu_start, 0));
   monte_carlo_kernel<<<gridSize, blockSize>>>(d_state, d_count, m);
   CUDA_CHECK(cudaDeviceSynchronize());
 
+  float gpu_elapsed_time = getElapsedTime(gpu_start, gpu_stop);
+  CUDA_CHECK(cudaEventDestroy(gpu_start));
+  CUDA_CHECK(cudaEventDestroy(gpu_stop));
+
   // Allocate device output array
   unsigned long long int *d_out = nullptr;
   CUDA_CHECK(cudaMalloc((void **)&d_out, sizeof(unsigned long long int)));
 
   // Request and allocate temporary storage
   void *d_temp_storage = nullptr;
   size_t temp_storage_bytes = 0;
-  cub::DeviceReduce::Sum(d_temp_storage, temp_storage_bytes, d_count, d_out, 256);
+  CUDA_CHECK(cub::DeviceReduce::Sum(d_temp_storage, temp_storage_bytes, d_count, d_out, 256));
   CUDA_CHECK(cudaMalloc((void **)&d_temp_storage, temp_storage_bytes));
 
   // Run
-  cub::DeviceReduce::Sum(d_temp_storage, temp_storage_bytes, d_count, d_out, 256);
-
-  float gpu_elapsed_time = getElapsedTime(gpu_start, gpu_stop);
-  CUDA_CHECK(cudaEventDestroy(gpu_start));
-  CUDA_CHECK(cudaEventDestroy(gpu_stop));
+  CUDA_CHECK(cub::DeviceReduce::Sum(d_temp_storage, temp_storage_bytes, d_count, d_out, 256));
 
   // copy results back to the host
   unsigned long long int h_count = 0;
@@ -101,14 +101,14 @@ void runBenchmarkTime(long max_work, int runtime_in_seconds) {
   cudaEvent_t gpu_start, gpu_stop;
   CUDA_CHECK(cudaEventCreate(&gpu_start));
   CUDA_CHECK(cudaEventCreate(&gpu_stop));
-  CUDA_CHECK(cudaEventRecord(gpu_start, 0));
 
   // set kernel
   dim3 gridSize = 256;
   dim3 blockSize = 256;
 
   setup_kernel<<<gridSize, blockSize>>>(d_state);
 
+  CUDA_CHECK(cudaEventRecord(gpu_start, 0));
   int iteration = 0;
   // Run the workload loop until the specified runtime is reached
   while (getElapsedTime(gpu_start, gpu_stop) < runtime_in_seconds) {
@@ -117,6 +117,10 @@ void runBenchmarkTime(long max_work, int runtime_in_seconds) {
     iteration++;
   }
 
+  float gpu_elapsed_time = getElapsedTime(gpu_start, gpu_stop);
+  CUDA_CHECK(cudaEventDestroy(gpu_start));
+  CUDA_CHECK(cudaEventDestroy(gpu_stop));
+
   // copy results back to the host
   // Allocate device output array
   unsigned long long int *d_out = nullptr;
@@ -125,15 +129,11 @@ void runBenchmarkTime(long max_work, int runtime_in_seconds) {
   // Request and allocate temporary storage
   void *d_temp_storage = nullptr;
   size_t temp_storage_bytes = 0;
-  cub::DeviceReduce::Sum(d_temp_storage, temp_storage_bytes, d_count, d_out, 256);
+  CUDA_CHECK(cub::DeviceReduce::Sum(d_temp_storage, temp_storage_bytes, d_count, d_out, 256));
   CUDA_CHECK(cudaMalloc((void **)&d_temp_storage, temp_storage_bytes));
 
   // Run
-  cub::DeviceReduce::Sum(d_temp_storage, temp_storage_bytes, d_count, d_out, 256);
-
-  float gpu_elapsed_time = getElapsedTime(gpu_start, gpu_stop);
-  CUDA_CHECK(cudaEventDestroy(gpu_start));
-  CUDA_CHECK(cudaEventDestroy(gpu_stop));
+  CUDA_CHECK(cub::DeviceReduce::Sum(d_temp_storage, temp_storage_bytes, d_count, d_out, 256));
 
   // copy results back to the host
   unsigned long long int h_count = 0;
diff --git a/bin/hip/gpu_benchmark.hip b/bin/hip/gpu_benchmark.hip
@@ -39,17 +39,21 @@ void runBenchmark(long max_work) {
   hipEvent_t gpu_start, gpu_stop;
   HIP_CHECK(hipEventCreate(&gpu_start));
   HIP_CHECK(hipEventCreate(&gpu_stop));
-  HIP_CHECK(hipEventRecord(gpu_start, 0));
 
   // set kernel
   dim3 gridSize = 256;
   dim3 blockSize = 256;
   setup_kernel<<<gridSize, blockSize>>>(d_state);
 
   // monte carlo kernel
+  HIP_CHECK(hipEventRecord(gpu_start, 0));
   monte_carlo_kernel<<<gridSize, blockSize>>>(d_state, d_count, m);
   HIP_CHECK(hipDeviceSynchronize());
 
+  float gpu_elapsed_time = getElapsedTime(gpu_start, gpu_stop);
+  HIP_CHECK(hipEventDestroy(gpu_start));
+  HIP_CHECK(hipEventDestroy(gpu_stop));
+
   // Allocate device output array
   unsigned long long int *d_out = nullptr;
   HIP_CHECK(hipMalloc((void **)&d_out, sizeof(unsigned long long int)));
@@ -63,10 +67,6 @@ void runBenchmark(long max_work) {
   // Run
   hipcub::DeviceReduce::Sum(d_temp_storage, temp_storage_bytes, d_count, d_out, 256);
 
-  float gpu_elapsed_time = getElapsedTime(gpu_start, gpu_stop);
-  HIP_CHECK(hipEventDestroy(gpu_start));
-  HIP_CHECK(hipEventDestroy(gpu_stop));
-
   // copy results back to the host
   unsigned long long int h_count = 0;
   HIP_CHECK(hipMemcpy(&h_count, d_out, sizeof(unsigned long long int), hipMemcpyDeviceToHost));
@@ -115,6 +115,10 @@ void runBenchmarkTime(long max_work, int runtime_in_seconds) {
     iteration++;
   }
 
+  float gpu_elapsed_time = getElapsedTime(gpu_start, gpu_stop);
+  HIP_CHECK(hipEventDestroy(gpu_start));
+  HIP_CHECK(hipEventDestroy(gpu_stop));
+
   // copy results back to the host
   // Allocate device output array
   unsigned long long int *d_out = nullptr;
@@ -129,10 +133,6 @@ void runBenchmarkTime(long max_work, int runtime_in_seconds) {
   // Run
   hipcub::DeviceReduce::Sum(d_temp_storage, temp_storage_bytes, d_count, d_out, 256);
 
-  float gpu_elapsed_time = getElapsedTime(gpu_start, gpu_stop);
-  HIP_CHECK(hipEventDestroy(gpu_start));
-  HIP_CHECK(hipEventDestroy(gpu_stop));
-
   // copy results back to the host
   unsigned long long int h_count = 0;
   HIP_CHECK(hipMemcpy(&h_count, d_out, sizeof(unsigned long long int), hipMemcpyDeviceToHost));