22#include < hipcub/hipcub.hpp>
33
44#include < chrono>
5- #include < cstdlib> // For std::atoi
65#include < iostream>
76
8- // The macro wraps any CUDA API call
9- # define CUDA_CHECK ( ans ) \
10- { gpuAssert ((ans), __FILE__, __LINE__); }
11-
12- inline void gpuAssert (hipError_t code, const char *file, int line, bool abort = true ) {
13- if (code != hipSuccess) {
14- fprintf (stderr, " GPUassert: %s %s %d \n " , hipGetErrorString (code), file, line);
15- if (abort)
16- exit (code);
17- }
7+ # define HIP_CHECK ( expression ) \
8+ { \
9+ const hipError_t status = expression; \
10+ if (status != hipSuccess){ \
11+ std::cerr << " HIP error " \
12+ << status << " : " \
13+ << hipGetErrorString (status) \
14+ << " at " << __FILE__ << " : " \
15+ << __LINE__ << std::endl; \
16+ } \
1817}
1918
2019float getElapsedTime (const hipEvent_t &gpu_start, hipEvent_t &gpu_stop) {
2120 float gpu_elapsed_time;
22- CUDA_CHECK (hipEventRecord (gpu_stop, 0 ));
23- CUDA_CHECK (hipEventSynchronize (gpu_stop));
24- CUDA_CHECK (hipEventElapsedTime (&gpu_elapsed_time, gpu_start, gpu_stop));
21+ HIP_CHECK (hipEventRecord (gpu_stop, 0 ));
22+ HIP_CHECK (hipEventSynchronize (gpu_stop));
23+ HIP_CHECK (hipEventElapsedTime (&gpu_elapsed_time, gpu_start, gpu_stop));
2524 return gpu_elapsed_time / 1000 .0f ;
2625}
2726
@@ -32,15 +31,15 @@ void runBenchmark(long max_work) {
3231
3332 unsigned long long int *d_count;
3433 hiprandState *d_state;
35- CUDA_CHECK (hipMalloc ((void **)&d_count, 256 * sizeof (unsigned long long int )));
36- CUDA_CHECK (hipMalloc ((void **)&d_state, n * sizeof (hiprandState)));
37- CUDA_CHECK (hipMemset (d_count, 0 , 256 * sizeof (unsigned long long int )));
34+ HIP_CHECK (hipMalloc ((void **)&d_count, 256 * sizeof (unsigned long long int )));
35+ HIP_CHECK (hipMalloc ((void **)&d_state, n * sizeof (hiprandState)));
36+ HIP_CHECK (hipMemset (d_count, 0 , 256 * sizeof (unsigned long long int )));
3837
3938 // set up timing stuff
4039 hipEvent_t gpu_start, gpu_stop;
41- CUDA_CHECK (hipEventCreate (&gpu_start));
42- CUDA_CHECK (hipEventCreate (&gpu_stop));
43- CUDA_CHECK (hipEventRecord (gpu_start, 0 ));
40+ HIP_CHECK (hipEventCreate (&gpu_start));
41+ HIP_CHECK (hipEventCreate (&gpu_stop));
42+ HIP_CHECK (hipEventRecord (gpu_start, 0 ));
4443
4544 // set kernel
4645 dim3 gridSize = 256 ;
@@ -49,38 +48,38 @@ void runBenchmark(long max_work) {
4948
5049 // monte carlo kernel
5150 monte_carlo_kernel<<<gridSize, blockSize>>>(d_state, d_count, m);
52- CUDA_CHECK (hipDeviceSynchronize ());
51+ HIP_CHECK (hipDeviceSynchronize ());
5352
5453 // Allocate device output array
5554 unsigned long long int *d_out = nullptr ;
56- CUDA_CHECK (hipMalloc ((void **)&d_out, sizeof (unsigned long long int )));
55+ HIP_CHECK (hipMalloc ((void **)&d_out, sizeof (unsigned long long int )));
5756
5857 // Request and allocate temporary storage
5958 void *d_temp_storage = nullptr ;
6059 size_t temp_storage_bytes = 0 ;
6160 hipcub::DeviceReduce::Sum (d_temp_storage, temp_storage_bytes, d_count, d_out, 256 );
62- CUDA_CHECK (hipMalloc ((void **)&d_temp_storage, temp_storage_bytes));
61+ HIP_CHECK (hipMalloc ((void **)&d_temp_storage, temp_storage_bytes));
6362
6463 // Run
6564 hipcub::DeviceReduce::Sum (d_temp_storage, temp_storage_bytes, d_count, d_out, 256 );
6665
6766 float gpu_elapsed_time = getElapsedTime (gpu_start, gpu_stop);
68- CUDA_CHECK (hipEventDestroy (gpu_start));
69- CUDA_CHECK (hipEventDestroy (gpu_stop));
67+ HIP_CHECK (hipEventDestroy (gpu_start));
68+ HIP_CHECK (hipEventDestroy (gpu_stop));
7069
7170 // copy results back to the host
7271 unsigned long long int h_count = 0 ;
73- CUDA_CHECK (hipMemcpy (&h_count, d_out, sizeof (unsigned long long int ), hipMemcpyDeviceToHost));
72+ HIP_CHECK (hipMemcpy (&h_count, d_out, sizeof (unsigned long long int ), hipMemcpyDeviceToHost));
7473
7574 // display results and timings for gpu
7675 float pi = h_count * 4.0 / (n * m);
7776 std::cout << " Approximate pi calculated on GPU is: " << pi << " and calculation took " << gpu_elapsed_time << " s\n " ;
7877 std::cout << " Benchmark completed!" << std::endl;
7978
80- CUDA_CHECK (hipFree (d_count));
81- CUDA_CHECK (hipFree (d_state));
82- CUDA_CHECK (hipFree (d_out));
83- CUDA_CHECK (hipFree (d_temp_storage));
79+ HIP_CHECK (hipFree (d_count));
80+ HIP_CHECK (hipFree (d_state));
81+ HIP_CHECK (hipFree (d_out));
82+ HIP_CHECK (hipFree (d_temp_storage));
8483}
8584
8685// Function to run the GPU benchmark for a specified time
@@ -92,15 +91,15 @@ void runBenchmarkTime(long max_work, int runtime_in_seconds) {
9291 // allocate memory
9392 unsigned long long int *d_count;
9493 hiprandState *d_state;
95- CUDA_CHECK (hipMalloc ((void **)&d_count, 256 * sizeof (unsigned long long int )));
96- CUDA_CHECK (hipMalloc ((void **)&d_state, n * sizeof (hiprandState)));
97- CUDA_CHECK (hipMemset (d_count, 0 , 256 * sizeof (unsigned long long int )));
94+ HIP_CHECK (hipMalloc ((void **)&d_count, 256 * sizeof (unsigned long long int )));
95+ HIP_CHECK (hipMalloc ((void **)&d_state, n * sizeof (hiprandState)));
96+ HIP_CHECK (hipMemset (d_count, 0 , 256 * sizeof (unsigned long long int )));
9897
9998 // set up timing stuff
10099 hipEvent_t gpu_start, gpu_stop;
101- CUDA_CHECK (hipEventCreate (&gpu_start));
102- CUDA_CHECK (hipEventCreate (&gpu_stop));
103- CUDA_CHECK (hipEventRecord (gpu_start, 0 ));
100+ HIP_CHECK (hipEventCreate (&gpu_start));
101+ HIP_CHECK (hipEventCreate (&gpu_stop));
102+ HIP_CHECK (hipEventRecord (gpu_start, 0 ));
104103
105104 // set kernel
106105 dim3 gridSize = 256 ;
@@ -112,40 +111,40 @@ void runBenchmarkTime(long max_work, int runtime_in_seconds) {
112111 // Run the workload loop until the specified runtime is reached
113112 while (getElapsedTime (gpu_start, gpu_stop) < runtime_in_seconds) {
114113 monte_carlo_kernel<<<gridSize, blockSize>>>(d_state, d_count, m);
115- CUDA_CHECK (hipDeviceSynchronize ()); // Ensure the kernel has finished executing
114+ HIP_CHECK (hipDeviceSynchronize ()); // Ensure the kernel has finished executing
116115 iteration++;
117116 }
118117
119118 // copy results back to the host
120119 // Allocate device output array
121120 unsigned long long int *d_out = nullptr ;
122- CUDA_CHECK (hipMalloc ((void **)&d_out, sizeof (unsigned long long int )));
121+ HIP_CHECK (hipMalloc ((void **)&d_out, sizeof (unsigned long long int )));
123122
124123 // Request and allocate temporary storage
125124 void *d_temp_storage = nullptr ;
126125 size_t temp_storage_bytes = 0 ;
127126 hipcub::DeviceReduce::Sum (d_temp_storage, temp_storage_bytes, d_count, d_out, 256 );
128- CUDA_CHECK (hipMalloc ((void **)&d_temp_storage, temp_storage_bytes));
127+ HIP_CHECK (hipMalloc ((void **)&d_temp_storage, temp_storage_bytes));
129128
130129 // Run
131130 hipcub::DeviceReduce::Sum (d_temp_storage, temp_storage_bytes, d_count, d_out, 256 );
132131
133132 float gpu_elapsed_time = getElapsedTime (gpu_start, gpu_stop);
134- CUDA_CHECK (hipEventDestroy (gpu_start));
135- CUDA_CHECK (hipEventDestroy (gpu_stop));
133+ HIP_CHECK (hipEventDestroy (gpu_start));
134+ HIP_CHECK (hipEventDestroy (gpu_stop));
136135
137136 // copy results back to the host
138137 unsigned long long int h_count = 0 ;
139- CUDA_CHECK (hipMemcpy (&h_count, d_out, sizeof (unsigned long long int ), hipMemcpyDeviceToHost));
138+ HIP_CHECK (hipMemcpy (&h_count, d_out, sizeof (unsigned long long int ), hipMemcpyDeviceToHost));
140139
141140 // display results and timings for gpu
142141 float pi = h_count * 4.0 / (n * m) / iteration;
143142 std::cout << " Approximate pi calculated on GPU is: " << pi << " and calculation took " << gpu_elapsed_time << " s\n " ;
144143
145- CUDA_CHECK (hipFree (d_count));
146- CUDA_CHECK (hipFree (d_state));
147- CUDA_CHECK (hipFree (d_out));
148- CUDA_CHECK (hipFree (d_temp_storage));
144+ HIP_CHECK (hipFree (d_count));
145+ HIP_CHECK (hipFree (d_state));
146+ HIP_CHECK (hipFree (d_out));
147+ HIP_CHECK (hipFree (d_temp_storage));
149148}
150149
151150int main (int argc, char *argv[]) {
0 commit comments