@@ -41,33 +41,33 @@ void runBenchmark(long max_work) {
4141 cudaEvent_t gpu_start, gpu_stop;
4242 CUDA_CHECK (cudaEventCreate (&gpu_start));
4343 CUDA_CHECK (cudaEventCreate (&gpu_stop));
44- CUDA_CHECK (cudaEventRecord (gpu_start, 0 ));
4544
4645 // set kernel
4746 dim3 gridSize = 256 ;
4847 dim3 blockSize = 256 ;
4948 setup_kernel<<<gridSize, blockSize>>> (d_state);
5049
5150 // monte carlo kernel
51+ CUDA_CHECK (cudaEventRecord (gpu_start, 0 ));
5252 monte_carlo_kernel<<<gridSize, blockSize>>> (d_state, d_count, m);
5353 CUDA_CHECK (cudaDeviceSynchronize ());
5454
55+ float gpu_elapsed_time = getElapsedTime (gpu_start, gpu_stop);
56+ CUDA_CHECK (cudaEventDestroy (gpu_start));
57+ CUDA_CHECK (cudaEventDestroy (gpu_stop));
58+
5559 // Allocate device output array
5660 unsigned long long int *d_out = nullptr ;
5761 CUDA_CHECK (cudaMalloc ((void **)&d_out, sizeof (unsigned long long int )));
5862
5963 // Request and allocate temporary storage
6064 void *d_temp_storage = nullptr ;
6165 size_t temp_storage_bytes = 0 ;
62- cub::DeviceReduce::Sum (d_temp_storage, temp_storage_bytes, d_count, d_out, 256 );
66+ CUDA_CHECK ( cub::DeviceReduce::Sum (d_temp_storage, temp_storage_bytes, d_count, d_out, 256 ) );
6367 CUDA_CHECK (cudaMalloc ((void **)&d_temp_storage, temp_storage_bytes));
6468
6569 // Run
66- cub::DeviceReduce::Sum (d_temp_storage, temp_storage_bytes, d_count, d_out, 256 );
67-
68- float gpu_elapsed_time = getElapsedTime (gpu_start, gpu_stop);
69- CUDA_CHECK (cudaEventDestroy (gpu_start));
70- CUDA_CHECK (cudaEventDestroy (gpu_stop));
70+ CUDA_CHECK (cub::DeviceReduce::Sum (d_temp_storage, temp_storage_bytes, d_count, d_out, 256 ));
7171
7272 // copy results back to the host
7373 unsigned long long int h_count = 0 ;
@@ -101,14 +101,14 @@ void runBenchmarkTime(long max_work, int runtime_in_seconds) {
101101 cudaEvent_t gpu_start, gpu_stop;
102102 CUDA_CHECK (cudaEventCreate (&gpu_start));
103103 CUDA_CHECK (cudaEventCreate (&gpu_stop));
104- CUDA_CHECK (cudaEventRecord (gpu_start, 0 ));
105104
106105 // set kernel
107106 dim3 gridSize = 256 ;
108107 dim3 blockSize = 256 ;
109108
110109 setup_kernel<<<gridSize, blockSize>>> (d_state);
111110
111+ CUDA_CHECK (cudaEventRecord (gpu_start, 0 ));
112112 int iteration = 0 ;
113113 // Run the workload loop until the specified runtime is reached
114114 while (getElapsedTime (gpu_start, gpu_stop) < runtime_in_seconds) {
@@ -117,6 +117,10 @@ void runBenchmarkTime(long max_work, int runtime_in_seconds) {
117117 iteration++;
118118 }
119119
120+ float gpu_elapsed_time = getElapsedTime (gpu_start, gpu_stop);
121+ CUDA_CHECK (cudaEventDestroy (gpu_start));
122+ CUDA_CHECK (cudaEventDestroy (gpu_stop));
123+
120124 // copy results back to the host
121125 // Allocate device output array
122126 unsigned long long int *d_out = nullptr ;
@@ -125,15 +129,11 @@ void runBenchmarkTime(long max_work, int runtime_in_seconds) {
125129 // Request and allocate temporary storage
126130 void *d_temp_storage = nullptr ;
127131 size_t temp_storage_bytes = 0 ;
128- cub::DeviceReduce::Sum (d_temp_storage, temp_storage_bytes, d_count, d_out, 256 );
132+ CUDA_CHECK ( cub::DeviceReduce::Sum (d_temp_storage, temp_storage_bytes, d_count, d_out, 256 ) );
129133 CUDA_CHECK (cudaMalloc ((void **)&d_temp_storage, temp_storage_bytes));
130134
131135 // Run
132- cub::DeviceReduce::Sum (d_temp_storage, temp_storage_bytes, d_count, d_out, 256 );
133-
134- float gpu_elapsed_time = getElapsedTime (gpu_start, gpu_stop);
135- CUDA_CHECK (cudaEventDestroy (gpu_start));
136- CUDA_CHECK (cudaEventDestroy (gpu_stop));
136+ CUDA_CHECK (cub::DeviceReduce::Sum (d_temp_storage, temp_storage_bytes, d_count, d_out, 256 ));
137137
138138 // copy results back to the host
139139 unsigned long long int h_count = 0 ;
0 commit comments