-
Notifications
You must be signed in to change notification settings - Fork 254
Expand file tree
/
Copy pathgithub_issue_1383_pt_2.cu
More file actions
116 lines (95 loc) · 3.58 KB
/
github_issue_1383_pt_2.cu
File metadata and controls
116 lines (95 loc) · 3.58 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
// Copyright John Maddock 2016.
// Copyright Matt Borland 2024 - 2026.
// Use, modification and distribution are subject to the
// Boost Software License, Version 1.0. (See accompanying file
// LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt)
// Purposefully pull in headers that caused errors in the linked issue
#include <boost/math/special_functions.hpp>
#include <boost/math/tools/config.hpp>
#include <boost/math/tools/numeric_limits.hpp>
#include <boost/math/constants/constants.hpp>
#include <iostream>
#include <iomanip>
#include <vector>
#include <cmath>
#include <random>
#include "cuda_managed_ptr.hpp"
#include "stopwatch.hpp"
// For the CUDA runtime routines (prefixed with "cuda_")
#include <cuda_runtime.h>
typedef double float_type;
/**
* CUDA Kernel Device code
*
*/
// Device kernel: out[idx] = cos(in[idx]) for each element, with an on-device
// sanity check of boost::math::numeric_limits. cos() of a double can never
// exceed max() and double is signed, so the trap should never fire — if it
// does, numeric_limits is broken on the device (the subject of issue 1383).
// Expects a 1D launch covering at least numElements threads.
__global__ void cuda_test(const float_type *in, float_type *out, int numElements)
{
    using std::cos;

    const int idx = blockDim.x * blockIdx.x + threadIdx.x;
    if (idx >= numElements)
    {
        return; // grid tail: nothing to do for threads past the end
    }

    const float_type result = cos(in[idx]);
    out[idx] = result;

    if (result > boost::math::numeric_limits<float_type>::max() || !boost::math::numeric_limits<float_type>::is_signed)
    {
        __trap();
    }
}
/**
* Host main routine
*/
int main()
{
    // Error code to check return values for CUDA calls
    cudaError_t err = cudaSuccess;

    // Print the vector length to be used, and compute its size
    int numElements = 50000;
    std::cout << "[Vector operation on " << numElements << " elements]" << std::endl;

    // Allocate the managed input vector A
    cuda_managed_ptr<float_type> input_vector(numElements);

    // Allocate the managed output vector C
    cuda_managed_ptr<float_type> output_vector(numElements);

    // Check some of our numeric_limits for viability on the host side
    static_assert(boost::math::numeric_limits<float_type>::is_specialized, "Should be since it's a double");
    static_assert(boost::math::numeric_limits<float_type>::is_signed, "Should be since it's a double");

    // Initialize the input vector with reproducible values in [0, pi)
    std::mt19937_64 rng {42};
    std::uniform_real_distribution<float_type> dist(0, boost::math::constants::pi<float_type>());

    for (int i = 0; i < numElements; ++i)
    {
        input_vector[i] = dist(rng);
    }

    // Launch the CUDA kernel (ceil-divide so the grid covers all elements;
    // the kernel bounds-checks the tail)
    int threadsPerBlock = 256;
    int blocksPerGrid =(numElements + threadsPerBlock - 1) / threadsPerBlock;
    std::cout << "CUDA kernel launch with " << blocksPerGrid << " blocks of " << threadsPerBlock << " threads" << std::endl;

    watch w;

    cuda_test<<<blocksPerGrid, threadsPerBlock>>>(input_vector.get(), output_vector.get(), numElements);

    // Launch-configuration errors surface via cudaGetLastError() right after
    // the launch; they would otherwise be silently attributed to a later call.
    err = cudaGetLastError();
    if (err != cudaSuccess)
    {
        std::cerr << "Failed to launch cuda_test kernel (error code " << cudaGetErrorString(err) << ")!" << std::endl;
        return EXIT_FAILURE;
    }

    // Asynchronous execution errors (e.g. the kernel's __trap()) surface at
    // the synchronizing call, so this return value must be checked too.
    err = cudaDeviceSynchronize();
    if (err != cudaSuccess)
    {
        std::cerr << "cuda_test kernel execution failed (error code " << cudaGetErrorString(err) << ")!" << std::endl;
        return EXIT_FAILURE;
    }

    std::cout << "CUDA kernel done in: " << w.elapsed() << "s" << std::endl;

    // Compute a host-side reference for verification
    std::vector<float_type> results;
    results.reserve(numElements);

    w.reset();
    for(int i = 0; i < numElements; ++i)
        results.push_back(std::cos(input_vector[i]));
    double t = w.elapsed();

    // Check the device results against the host reference with a small
    // ULP-style tolerance — bit-exact equality across host/device is not expected.
    for(int i = 0; i < numElements; ++i)
    {
        if (boost::math::epsilon_difference(output_vector[i], results[i]) > 10)
        {
            std::cerr << "Result verification failed at element " << i << "!" << std::endl;
            return EXIT_FAILURE;
        }
    }

    std::cout << "Test PASSED, normal calculation time: " << t << "s" << std::endl;
    std::cout << "Done\n";

    return 0;
}