CUDA学习日记5

1. cudaDeviceReset
解析:显式销毁并清理当前进程中与当前设备相关联的所有资源;此后对该设备的任何API调用都会重新初始化设备。


2. CUDART_VERSION
解析:CUDA 7.5版本的CUDART_VERSION为7050,定义在头文件cuda_runtime_api.h中(即#include <cuda_runtime_api.h>)。


3. thrust::count
解析:thrust::count函数原型,如下所示:

template <typename InputIterator, typename EqualityComparable> typename thrust::iterator_traits<InputIterator>::difference_type thrust::count ( InputIterator first, InputIterator last, const EqualityComparable & value )

说明:count returns the number of iterators i in [first, last) such that *i == value.


4. transform_reduce
解析:transform_reduce函数原型,如下所示:
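template <typename InputIterator, typename UnaryFunction, typename OutputType, typename BinaryFunction> OutputType thrust::transform_reduce ( InputIterator first, InputIterator last, UnaryFunction unary_op, OutputType init, BinaryFunction binary_op )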

举个例子,如下所示:
#include #include #include #include #include using namespace std; using namespace thrust; template struct square { __host__ __device__ T operator()(const T& x) const { return x*x; } }; int main(void) { float x[4] = { 1.0, 2.0, 3.0, 4.0 }; device_vector d_x(x, x + 4); square unary_op; thrust::plus binary_op; float init = 10; float norm = thrust::transform_reduce(d_x.begin(), d_x.end(), unary_op, init, binary_op); cout << norm << endl; return 0; }



5. Prefix-Sums:inclusive_scan和exclusive_scan
解析:
#include #include #include using namespace std; using namespace thrust; int main(void) { int data[6] = { 1, 0, 2, 2, 1, 3 }; // data is now {1, 1, 3, 5, 6, 9} // data[2] = data[0] + data[1] + data[2] // thrust::inclusive_scan(data, data + 6, data); // data is now {0, 1, 1, 3, 5, 6} // data[2] = data[0] + data[1] thrust::exclusive_scan(data, data + 6, data); for (int i = 0; i < 6; i++) { cout << data[i] << endl; } return 0; }



6. thrust::sort和thrust::stable_sort
解析:thrust::stable_sort函数原型,如下所示:
template <typename DerivedPolicy, typename RandomAccessIterator, typename StrictWeakOrdering> __host__ __device__ void thrust::stable_sort ( const thrust::detail::execution_policy_base< DerivedPolicy > & exec, RandomAccessIterator first, RandomAccessIterator last, StrictWeakOrdering comp )

(1)exec:The execution policy to use for parallelization.
(2)first:The beginning of the sequence.
(3)last:The end of the sequence.
(4)comp:Comparison operator.

举个例子,如下所示:
#include using namespace std; using namespace thrust; int main(void) { const int N = 6; int A[N] = { 1, 4, 2, 8, 5, 7 }; // A is now {1, 2, 4, 5, 7, 8} // thrust::sort(A, A + N); // A is now {1, 2, 4, 5, 7, 8} thrust::stable_sort(A, A + N); for (int i = 0; i < 6; i++) { cout << A[i] << endl; } return 0; }

(1)#include <thrust/functional.h>:Function objects and tools for manipulating them.
(2)#include <thrust/execution_policy.h>:Thrust execution policies.


7. thrust::sort_by_key和thrust::stable_sort_by_key
解析:
#include using namespace std; using namespace thrust; int main(void) { const int N = 6; int keys[N] = { 1, 4, 2, 8, 5, 7 }; char values[N] = { 'a', 'b', 'c', 'd', 'e', 'f' }; // keys is now { 1, 2, 4, 5, 7, 8} // values is now {'a', 'c', 'b', 'e', 'f', 'd'} // thrust::sort_by_key(keys, keys + N, values); // keys is now { 1, 2, 4, 5, 7, 8} // values is now {'a', 'c', 'b', 'e', 'f', 'd'} thrust::stable_sort_by_key(keys, keys + N, values); for (int i = 0; i < 6; i++) { cout << values[i] << endl; } return 0; }



8. Thrust中的Iterator
解析:
(1)constant_iterator
(2)counting_iterator
#include #include #include #include using namespace std; using namespace thrust; int main(void) { thrust::constant_iterator first(10); thrust::constant_iterator last = first + 3; // returns 30 (i.e. 3 * 10) // thrust::reduce(first, last); // returns 33 (i.e. 10 + 11 + 12) thrust::reduce(first, last); cout << thrust::reduce(first, last) << endl; return 0; }

(3)transform_iterator
#include #include #include using namespace std; using namespace thrust; int main(void) { thrust::device_vector vec(3); vec[0] = 10; vec[1] = 20; vec[2] = 30; // returns -60 (i.e. -10 + -20 + -30) cout << thrust::reduce(thrust::make_transform_iterator(vec.begin(), thrust::negate()), thrust::make_transform_iterator(vec.end(), thrust::negate())) << endl; return 0; }

(4)permutation_iterator
#include #include #include using namespace std; using namespace thrust; int main(void) { thrust::device_vector map(4); map[0] = 3; map[1] = 1; map[2] = 0; map[3] = 5; thrust::device_vector source(6); source[0] = 10; source[1] = 20; source[2] = 30; source[3] = 40; source[4] = 50; source[5] = 60; // sum = source[map[0]] + source[map[1]] + ... int sum = thrust::reduce(thrust::make_permutation_iterator(source.begin(), map.begin()), thrust::make_permutation_iterator(source.begin(), map.end())); cout << sum << endl; return 0; }

(5)zip_iterator
#include #include #include using namespace std; using namespace thrust; int main(void) { thrust::device_vector A(3); thrust::device_vector B(3); A[0] = 10; A[1] = 20; A[2] = 30; B[0] = 'x'; B[1] = 'y'; B[2] = 'z'; thrust::maximum< thrust::tuple > binary_op; thrust::tuple init = thrust::make_zip_iterator(thrust::make_tuple(A.begin(), B.begin()))[0]; thrust::tuple result = thrust::reduce(thrust::make_zip_iterator(thrust::make_tuple(A.begin(), B.begin())), thrust::make_zip_iterator(thrust::make_tuple(A.end(), B.end())), init, binary_op); cout << thrust::get<0>(result) << endl; cout << thrust::get<1>(result) << endl; return 0; }



8. #include <cstdlib>
解析:
(1)#define EXIT_SUCCESS 0
(2)#define EXIT_FAILURE 1


9. cuBLAS与CUBLASXT
解析:在CUDA 6的开发包中,提供了一个新的API——CUBLASXT,它是在cuBLAS API的上层封装了一个矩阵分块算法,解决了当数据量大时显存不足的问题。


10. cuRAND库
解析:cuRAND库提供了通过GPU生成随机数的接口,使用时需包含头文件#include <curand.h>。


11. CUDA同步方式
解析:在CUDA中,有两种方式实现同步,如下所示:
(1)System-level:等待所有host和device的工作完成。
(2)Block-level:等待device中block的所有thread执行到某个点。


参考文献: 【CUDA学习日记5】[1] Thrust:http://docs.nvidia.com/cuda/thrust/index.html#axzz4aFPI7CYb

    推荐阅读