Skip to content

Commit 4baaa7f

Browse files
committed
Exclude setup and teardown from the benchmark's timed region
Signed-off-by: Steven Hahn <hahnse@ornl.gov>
1 parent c9aa1dc commit 4baaa7f

2 files changed

Lines changed: 23 additions & 23 deletions

File tree

bin/cuda/gpu_benchmark.cu

Lines changed: 14 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -41,33 +41,33 @@ void runBenchmark(long max_work) {
4141
cudaEvent_t gpu_start, gpu_stop;
4242
CUDA_CHECK(cudaEventCreate(&gpu_start));
4343
CUDA_CHECK(cudaEventCreate(&gpu_stop));
44-
CUDA_CHECK(cudaEventRecord(gpu_start, 0));
4544

4645
// set kernel
4746
dim3 gridSize = 256;
4847
dim3 blockSize = 256;
4948
setup_kernel<<<gridSize, blockSize>>>(d_state);
5049

5150
// monte carlo kernel
51+
CUDA_CHECK(cudaEventRecord(gpu_start, 0));
5252
monte_carlo_kernel<<<gridSize, blockSize>>>(d_state, d_count, m);
5353
CUDA_CHECK(cudaDeviceSynchronize());
5454

55+
float gpu_elapsed_time = getElapsedTime(gpu_start, gpu_stop);
56+
CUDA_CHECK(cudaEventDestroy(gpu_start));
57+
CUDA_CHECK(cudaEventDestroy(gpu_stop));
58+
5559
// Allocate device output array
5660
unsigned long long int *d_out = nullptr;
5761
CUDA_CHECK(cudaMalloc((void **)&d_out, sizeof(unsigned long long int)));
5862

5963
// Request and allocate temporary storage
6064
void *d_temp_storage = nullptr;
6165
size_t temp_storage_bytes = 0;
62-
cub::DeviceReduce::Sum(d_temp_storage, temp_storage_bytes, d_count, d_out, 256);
66+
CUDA_CHECK(cub::DeviceReduce::Sum(d_temp_storage, temp_storage_bytes, d_count, d_out, 256));
6367
CUDA_CHECK(cudaMalloc((void **)&d_temp_storage, temp_storage_bytes));
6468

6569
// Run
66-
cub::DeviceReduce::Sum(d_temp_storage, temp_storage_bytes, d_count, d_out, 256);
67-
68-
float gpu_elapsed_time = getElapsedTime(gpu_start, gpu_stop);
69-
CUDA_CHECK(cudaEventDestroy(gpu_start));
70-
CUDA_CHECK(cudaEventDestroy(gpu_stop));
70+
CUDA_CHECK(cub::DeviceReduce::Sum(d_temp_storage, temp_storage_bytes, d_count, d_out, 256));
7171

7272
// copy results back to the host
7373
unsigned long long int h_count = 0;
@@ -101,14 +101,14 @@ void runBenchmarkTime(long max_work, int runtime_in_seconds) {
101101
cudaEvent_t gpu_start, gpu_stop;
102102
CUDA_CHECK(cudaEventCreate(&gpu_start));
103103
CUDA_CHECK(cudaEventCreate(&gpu_stop));
104-
CUDA_CHECK(cudaEventRecord(gpu_start, 0));
105104

106105
// set kernel
107106
dim3 gridSize = 256;
108107
dim3 blockSize = 256;
109108

110109
setup_kernel<<<gridSize, blockSize>>>(d_state);
111110

111+
CUDA_CHECK(cudaEventRecord(gpu_start, 0));
112112
int iteration = 0;
113113
// Run the workload loop until the specified runtime is reached
114114
while (getElapsedTime(gpu_start, gpu_stop) < runtime_in_seconds) {
@@ -117,6 +117,10 @@ void runBenchmarkTime(long max_work, int runtime_in_seconds) {
117117
iteration++;
118118
}
119119

120+
float gpu_elapsed_time = getElapsedTime(gpu_start, gpu_stop);
121+
CUDA_CHECK(cudaEventDestroy(gpu_start));
122+
CUDA_CHECK(cudaEventDestroy(gpu_stop));
123+
120124
// copy results back to the host
121125
// Allocate device output array
122126
unsigned long long int *d_out = nullptr;
@@ -125,15 +129,11 @@ void runBenchmarkTime(long max_work, int runtime_in_seconds) {
125129
// Request and allocate temporary storage
126130
void *d_temp_storage = nullptr;
127131
size_t temp_storage_bytes = 0;
128-
cub::DeviceReduce::Sum(d_temp_storage, temp_storage_bytes, d_count, d_out, 256);
132+
CUDA_CHECK(cub::DeviceReduce::Sum(d_temp_storage, temp_storage_bytes, d_count, d_out, 256));
129133
CUDA_CHECK(cudaMalloc((void **)&d_temp_storage, temp_storage_bytes));
130134

131135
// Run
132-
cub::DeviceReduce::Sum(d_temp_storage, temp_storage_bytes, d_count, d_out, 256);
133-
134-
float gpu_elapsed_time = getElapsedTime(gpu_start, gpu_stop);
135-
CUDA_CHECK(cudaEventDestroy(gpu_start));
136-
CUDA_CHECK(cudaEventDestroy(gpu_stop));
136+
CUDA_CHECK(cub::DeviceReduce::Sum(d_temp_storage, temp_storage_bytes, d_count, d_out, 256));
137137

138138
// copy results back to the host
139139
unsigned long long int h_count = 0;

bin/hip/gpu_benchmark.hip

Lines changed: 9 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -39,17 +39,21 @@ void runBenchmark(long max_work) {
3939
hipEvent_t gpu_start, gpu_stop;
4040
HIP_CHECK(hipEventCreate(&gpu_start));
4141
HIP_CHECK(hipEventCreate(&gpu_stop));
42-
HIP_CHECK(hipEventRecord(gpu_start, 0));
4342

4443
// set kernel
4544
dim3 gridSize = 256;
4645
dim3 blockSize = 256;
4746
setup_kernel<<<gridSize, blockSize>>>(d_state);
4847

4948
// monte carlo kernel
49+
HIP_CHECK(hipEventRecord(gpu_start, 0));
5050
monte_carlo_kernel<<<gridSize, blockSize>>>(d_state, d_count, m);
5151
HIP_CHECK(hipDeviceSynchronize());
5252

53+
float gpu_elapsed_time = getElapsedTime(gpu_start, gpu_stop);
54+
HIP_CHECK(hipEventDestroy(gpu_start));
55+
HIP_CHECK(hipEventDestroy(gpu_stop));
56+
5357
// Allocate device output array
5458
unsigned long long int *d_out = nullptr;
5559
HIP_CHECK(hipMalloc((void **)&d_out, sizeof(unsigned long long int)));
@@ -63,10 +67,6 @@ void runBenchmark(long max_work) {
6367
// Run
6468
hipcub::DeviceReduce::Sum(d_temp_storage, temp_storage_bytes, d_count, d_out, 256);
6569

66-
float gpu_elapsed_time = getElapsedTime(gpu_start, gpu_stop);
67-
HIP_CHECK(hipEventDestroy(gpu_start));
68-
HIP_CHECK(hipEventDestroy(gpu_stop));
69-
7070
// copy results back to the host
7171
unsigned long long int h_count = 0;
7272
HIP_CHECK(hipMemcpy(&h_count, d_out, sizeof(unsigned long long int), hipMemcpyDeviceToHost));
@@ -115,6 +115,10 @@ void runBenchmarkTime(long max_work, int runtime_in_seconds) {
115115
iteration++;
116116
}
117117

118+
float gpu_elapsed_time = getElapsedTime(gpu_start, gpu_stop);
119+
HIP_CHECK(hipEventDestroy(gpu_start));
120+
HIP_CHECK(hipEventDestroy(gpu_stop));
121+
118122
// copy results back to the host
119123
// Allocate device output array
120124
unsigned long long int *d_out = nullptr;
@@ -129,10 +133,6 @@ void runBenchmarkTime(long max_work, int runtime_in_seconds) {
129133
// Run
130134
hipcub::DeviceReduce::Sum(d_temp_storage, temp_storage_bytes, d_count, d_out, 256);
131135

132-
float gpu_elapsed_time = getElapsedTime(gpu_start, gpu_stop);
133-
HIP_CHECK(hipEventDestroy(gpu_start));
134-
HIP_CHECK(hipEventDestroy(gpu_stop));
135-
136136
// copy results back to the host
137137
unsigned long long int h_count = 0;
138138
HIP_CHECK(hipMemcpy(&h_count, d_out, sizeof(unsigned long long int), hipMemcpyDeviceToHost));

0 commit comments

Comments
 (0)