1. cudaDeviceReset
解析:重置当前线程所关联过的当前设备的所有资源。
2. CUDART_VERSION
解析:CUDA 7.5版本的CUDART_VERSION为7050,包含在头文件#include <cuda_runtime_api.h>中。
3. thrust::count
解析:thrust::count函数原型,如下所示:
template <typename InputIterator, typename EqualityComparable>
typename thrust::iterator_traits<InputIterator>::difference_type thrust::count (
InputIterator first,
InputIterator last,
const EqualityComparable & value
)
说明:count returns the number of iterators i in [first, last) such that *i == value.
4. transform_reduce
解析:transform_reduce函数原型,如下所示:
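template <typename InputIterator, typename UnaryFunction, typename OutputType, typename BinaryFunction>
OutputType thrust::transform_reduce ( InputIterator first,
InputIterator last,
UnaryFunction unary_op,
OutputType init,
BinaryFunction binary_op
)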
举个例子,如下所示:
#include <iostream>
#include <thrust/device_vector.h>
#include <thrust/functional.h>
#include <thrust/transform_reduce.h>

// Names are qualified explicitly (std::, thrust::) instead of pulling in both
// namespaces, because std and thrust declare many identically named entities
// (plus, tuple, sort, ...) and unqualified uses would become ambiguous.

// Unary functor returning x * x; usable from both host and device code.
template <typename T>
struct square
{
    __host__ __device__
    T operator()(const T& x) const {
        return x * x;
    }
};

// Squares every element of {1, 2, 3, 4} on the device, sums the squares on top
// of an initial value of 10 with thrust::transform_reduce, and prints the result.
int main(void)
{
    float x[4] = { 1.0f, 2.0f, 3.0f, 4.0f };
    thrust::device_vector<float> d_x(x, x + 4);

    square<float> unary_op;          // transformation applied to each element
    thrust::plus<float> binary_op;   // reduction operator
    float init = 10.0f;              // initial value of the reduction

    // 10 + 1*1 + 2*2 + 3*3 + 4*4 = 40
    float norm = thrust::transform_reduce(d_x.begin(), d_x.end(), unary_op, init, binary_op);
    std::cout << norm << std::endl;
    return 0;
}
5. Prefix-Sums:inclusive_scan和exclusive_scan
解析:
#include <iostream>
#include <thrust/scan.h>

// Demonstrates prefix sums: runs an in-place exclusive scan over a small host
// array and prints the result, one element per line.
int main(void)
{
    const int N = 6;
    int data[N] = { 1, 0, 2, 2, 1, 3 };

    // Inclusive scan: element i becomes the sum of elements 0..i,
    // e.g. data[2] = data[0] + data[1] + data[2].
    // thrust::inclusive_scan(data, data + N, data);
    // data would then be {1, 1, 3, 5, 6, 9}

    // Exclusive scan: element i becomes the sum of elements 0..i-1,
    // e.g. data[2] = data[0] + data[1].
    thrust::exclusive_scan(data, data + N, data);
    // data is now {0, 1, 1, 3, 5, 6}

    for (int i = 0; i < N; i++)
    {
        std::cout << data[i] << std::endl;
    }
    return 0;
}
6. thrust::sort和thrust::stable_sort
解析:thrust::stable_sort函数原型,如下所示:
template <typename DerivedPolicy, typename RandomAccessIterator, typename StrictWeakOrdering>
__host__ __device__ void thrust::stable_sort (
const thrust::detail::execution_policy_base< DerivedPolicy > &exec,
RandomAccessIterator first,
RandomAccessIterator last,
StrictWeakOrdering comp
)
(1)exec:The execution policy to use for parallelization.
(2)first:The beginning of the sequence.
(3)last:The end of the sequence.
(4)comp:Comparison operator.
举个例子,如下所示:
#include <iostream>
#include <thrust/sort.h>

// Sorts a small host array in ascending order with thrust::stable_sort and
// prints the sorted elements, one per line.
int main(void)
{
    const int N = 6;
    int A[N] = { 1, 4, 2, 8, 5, 7 };

    // Alternative: thrust::sort gives the same result for plain integers,
    // but does not guarantee the relative order of equivalent elements.
    // thrust::sort(A, A + N);
    // A would then be {1, 2, 4, 5, 7, 8}

    thrust::stable_sort(A, A + N);
    // A is now {1, 2, 4, 5, 7, 8}

    for (int i = 0; i < N; i++)
    {
        std::cout << A[i] << std::endl;
    }
    return 0;
}
(1)#include <thrust/functional.h>:Function objects and tools for manipulating them.
(2)#include <thrust/execution_policy.h>:Thrust execution policies.
7. thrust::sort_by_key和thrust::stable_sort_by_key
解析:
#include <iostream>
#include <thrust/sort.h>

// Sorts keys in ascending order with thrust::stable_sort_by_key, applying the
// same permutation to the values, then prints the reordered values.
int main(void)
{
    const int N = 6;
    int keys[N] = { 1, 4, 2, 8, 5, 7 };
    char values[N] = { 'a', 'b', 'c', 'd', 'e', 'f' };

    // Alternative: thrust::sort_by_key gives the same result here because all
    // keys are distinct; it does not guarantee the order of equal keys.
    // thrust::sort_by_key(keys, keys + N, values);
    // keys would then be   { 1,   2,   4,   5,   7,   8}
    // values would then be {'a', 'c', 'b', 'e', 'f', 'd'}

    thrust::stable_sort_by_key(keys, keys + N, values);
    // keys is now   { 1,   2,   4,   5,   7,   8}
    // values is now {'a', 'c', 'b', 'e', 'f', 'd'}

    for (int i = 0; i < N; i++)
    {
        std::cout << values[i] << std::endl;
    }
    return 0;
}
8. Thrust中的Iterator
解析:
(1)constant_iterator
(2)counting_iterator
#include <iostream>
#include <thrust/iterator/constant_iterator.h>
#include <thrust/iterator/counting_iterator.h>
#include <thrust/reduce.h>

// Reduces a three-element range described by a constant_iterator and prints
// the sum. No memory is allocated for the range: the iterator produces the
// values on the fly.
int main(void)
{
    // Every position in [first, last) yields the same value, 10.
    thrust::constant_iterator<int> first(10);
    thrust::constant_iterator<int> last = first + 3;

    // Prints 30 (i.e. 3 * 10).
    // With thrust::counting_iterator<int> in place of constant_iterator the
    // same reduction would yield 33 (i.e. 10 + 11 + 12).
    std::cout << thrust::reduce(first, last) << std::endl;
    return 0;
}
(3)transform_iterator
#include <iostream>
#include <thrust/device_vector.h>
#include <thrust/functional.h>
#include <thrust/iterator/transform_iterator.h>
#include <thrust/reduce.h>

// Sums the negated elements of a device vector by wrapping its iterators in
// transform_iterators, so the negation is applied while the reduction reads
// each element, and prints the result.
int main(void)
{
    thrust::device_vector<int> vec(3);
    vec[0] = 10;
    vec[1] = 20;
    vec[2] = 30;

    // Prints -60 (i.e. -10 + -20 + -30)
    std::cout << thrust::reduce(
                     thrust::make_transform_iterator(vec.begin(), thrust::negate<int>()),
                     thrust::make_transform_iterator(vec.end(), thrust::negate<int>()))
              << std::endl;
    return 0;
}
(4)permutation_iterator
#include <iostream>
#include <thrust/device_vector.h>
#include <thrust/iterator/permutation_iterator.h>
#include <thrust/reduce.h>

// Sums a subset of a device vector selected through an index map, using
// permutation_iterator to gather the elements during the reduction, and
// prints the sum.
int main(void)
{
    // Indices into `source` that take part in the reduction.
    thrust::device_vector<int> map(4);
    map[0] = 3;
    map[1] = 1;
    map[2] = 0;
    map[3] = 5;

    thrust::device_vector<int> source(6);
    source[0] = 10;
    source[1] = 20;
    source[2] = 30;
    source[3] = 40;
    source[4] = 50;
    source[5] = 60;

    // sum = source[map[0]] + source[map[1]] + source[map[2]] + source[map[3]]
    //     = 40 + 20 + 10 + 60 = 130
    // Both iterators share source.begin(); the range is delimited by the map.
    int sum = thrust::reduce(
        thrust::make_permutation_iterator(source.begin(), map.begin()),
        thrust::make_permutation_iterator(source.begin(), map.end()));
    std::cout << sum << std::endl;
    return 0;
}
(5)zip_iterator
#include <iostream>
#include <thrust/device_vector.h>
#include <thrust/functional.h>
#include <thrust/iterator/zip_iterator.h>
#include <thrust/reduce.h>
#include <thrust/tuple.h>

// Zips an int vector and a char vector into a sequence of (int, char) tuples,
// finds the maximum tuple with thrust::reduce, and prints both components.
int main(void)
{
    thrust::device_vector<int> A(3);
    thrust::device_vector<char> B(3);
    A[0] = 10;
    A[1] = 20;
    A[2] = 30;
    B[0] = 'x';
    B[1] = 'y';
    B[2] = 'z';

    // Tuples compare lexicographically, so maximum picks the largest int first.
    thrust::maximum< thrust::tuple<int, char> > binary_op;

    // Seed the reduction with the first zipped element, (A[0], B[0]).
    thrust::tuple<int, char> init =
        thrust::make_zip_iterator(thrust::make_tuple(A.begin(), B.begin()))[0];

    thrust::tuple<int, char> result = thrust::reduce(
        thrust::make_zip_iterator(thrust::make_tuple(A.begin(), B.begin())),
        thrust::make_zip_iterator(thrust::make_tuple(A.end(), B.end())),
        init,
        binary_op);

    // result is (30, 'z')
    std::cout << thrust::get<0>(result) << std::endl;
    std::cout << thrust::get<1>(result) << std::endl;
    return 0;
}
8. #include <stdlib.h>
解析:
(1)#define EXIT_SUCCESS 0
(2)#define EXIT_FAILURE 1
9. cuBLAS与CUBLASXT
解析:在CUDA 6的开发包中,提供了一个新的API——CUBLASXT,它是在cuBLAS API的上层封装了一个矩阵分块算法,解决了当数据量大时显存不足的问题。
10. cuRAND库
解析:cuRAND库提供了通过GPU生成随机数的接口,包含头文件#include <curand.h>。
11. CUDA同步方式
解析:在CUDA中,有两种方式实现同步,如下所示:
(1)System-level:等待所有host和device的工作完成。
(2)Block-level:等待device中block的所有thread执行到某个点。
参考文献: 【CUDA学习日记5】[1] Thrust:http://docs.nvidia.com/cuda/thrust/index.html#axzz4aFPI7CYb
推荐阅读