CUDA学习日记5 高性能计算

1. cudaDeviceReset
解析：显式销毁并清理当前进程中与当前设备相关联的所有资源，并重置该设备的状态。

2. CUDART_VERSION
解析：CUDA 7.5版本的CUDART_VERSION为7050，该宏定义在头文件cuda_runtime_api.h中，即#include <cuda_runtime_api.h>。

3. thrust::count
解析：thrust::count函数原型，如下所示：

template<typename InputIterator, typename EqualityComparable> thrust::iterator_traits<InputIterator>::difference_type thrust::count ( InputIterator first, InputIterator last, const EqualityComparable & value )

说明：count returns the number of iterators i in [first, last) such that *i == value.

4. transform_reduce
解析：transform_reduce函数原型，如下所示：

template<typename InputIterator, typename UnaryFunction, typename OutputType, typename BinaryFunction> OutputType thrust::transform_reduce ( InputIterator first, InputIterator last, UnaryFunction unary_op, OutputType init, BinaryFunction binary_op )

举个例子，如下所示：

#include #include #include #include #include using namespace std; using namespace thrust; template struct square { __host__ __device__ T operator()(const T& x) const { return x*x; } }; int main(void) { float x[4] = { 1.0, 2.0, 3.0, 4.0 }; device_vector d_x(x, x + 4); square unary_op; thrust::plus binary_op; float init = 10; float norm = thrust::transform_reduce(d_x.begin(), d_x.end(), unary_op, init, binary_op); cout << norm << endl; return 0; }

5. Prefix-Sums：inclusive_scan和exclusive_scan
解析：

#include #include #include using namespace std; using namespace thrust; int main(void) { int data[6] = { 1, 0, 2, 2, 1, 3 }; // data is now {1, 1, 3, 5, 6, 9} // data[2] = data[0] + data[1] + data[2] // thrust::inclusive_scan(data, data + 6, data); // data is now {0, 1, 1, 3, 5, 6} // data[2] = data[0] + data[1] thrust::exclusive_scan(data, data + 6, data); for (int i = 0; i < 6; i++) { cout << data[i] << endl; } return 0; }

6. thrust::sort和thrust::stable_sort
解析：thrust::stable_sort函数原型，如下所示：

template<typename DerivedPolicy, typename RandomAccessIterator, typename StrictWeakOrdering> __host__ __device__ void thrust::stable_sort ( const thrust::detail::execution_policy_base< DerivedPolicy > & exec, RandomAccessIterator first, RandomAccessIterator last, StrictWeakOrdering comp )

（1）exec：The execution policy to use for parallelization.
（2）first：The beginning of the sequence.
（3）last：The end of the sequence.
（4）comp：Comparison operator.

举个例子，如下所示：

#include using namespace std; using namespace thrust; int main(void) { const int N = 6; int A[N] = { 1, 4, 2, 8, 5, 7 }; // A is now {1, 2, 4, 5, 7, 8} // thrust::sort(A, A + N); // A is now {1, 2, 4, 5, 7, 8} thrust::stable_sort(A, A + N); for (int i = 0; i < 6; i++) { cout << A[i] << endl; } return 0; }

（1）#include <thrust/functional.h>：Function objects and tools for manipulating them.
（2）#include <thrust/execution_policy.h>：Thrust execution policies.

7. thrust::sort_by_key和thrust::stable_sort_by_key
解析：

#include using namespace std; using namespace thrust; int main(void) { const int N = 6; int keys[N] = { 1, 4, 2, 8, 5, 7 }; char values[N] = { 'a', 'b', 'c', 'd', 'e', 'f' }; // keys is now { 1, 2, 4, 5, 7, 8} // values is now {'a', 'c', 'b', 'e', 'f', 'd'} // thrust::sort_by_key(keys, keys + N, values); // keys is now { 1, 2, 4, 5, 7, 8} // values is now {'a', 'c', 'b', 'e', 'f', 'd'} thrust::stable_sort_by_key(keys, keys + N, values); for (int i = 0; i < 6; i++) { cout << values[i] << endl; } return 0; }

8. Thrust中的Iterator
解析：
（1）constant_iterator
（2）counting_iterator

#include #include #include #include using namespace std; using namespace thrust; int main(void) { thrust::constant_iterator first(10); thrust::constant_iterator last = first + 3; // returns 30 (i.e. 3 * 10) // thrust::reduce(first, last); // returns 33 (i.e. 10 + 11 + 12) thrust::reduce(first, last); cout << thrust::reduce(first, last) << endl; return 0; }

（3）transform_iterator

#include #include #include using namespace std; using namespace thrust; int main(void) { thrust::device_vector vec(3); vec[0] = 10; vec[1] = 20; vec[2] = 30; // returns -60 (i.e. -10 + -20 + -30) cout << thrust::reduce(thrust::make_transform_iterator(vec.begin(), thrust::negate()), thrust::make_transform_iterator(vec.end(), thrust::negate())) << endl; return 0; }

（4）permutation_iterator

#include #include #include using namespace std; using namespace thrust; int main(void) { thrust::device_vector map(4); map[0] = 3; map[1] = 1; map[2] = 0; map[3] = 5; thrust::device_vector source(6); source[0] = 10; source[1] = 20; source[2] = 30; source[3] = 40; source[4] = 50; source[5] = 60; // sum = source[map[0]] + source[map[1]] + ... int sum = thrust::reduce(thrust::make_permutation_iterator(source.begin(), map.begin()), thrust::make_permutation_iterator(source.begin(), map.end())); cout << sum << endl; return 0; }

（5）zip_iterator

#include #include #include using namespace std; using namespace thrust; int main(void) { thrust::device_vector A(3); thrust::device_vector B(3); A[0] = 10; A[1] = 20; A[2] = 30; B[0] = 'x'; B[1] = 'y'; B[2] = 'z'; thrust::maximum< thrust::tuple > binary_op; thrust::tuple init = thrust::make_zip_iterator(thrust::make_tuple(A.begin(), B.begin()))[0]; thrust::tuple result = thrust::reduce(thrust::make_zip_iterator(thrust::make_tuple(A.begin(), B.begin())), thrust::make_zip_iterator(thrust::make_tuple(A.end(), B.end())), init, binary_op); cout << thrust::get<0>(result) << endl; cout << thrust::get<1>(result) << endl; return 0; }

8. #include <stdlib.h>
解析：
（1）#define EXIT_SUCCESS 0
（2）#define EXIT_FAILURE 1

9. cuBLAS与CUBLASXT
解析：在CUDA 6的开发包中，提供了一个新的API——CUBLASXT，它是在cuBLAS API的上层封装了一个矩阵分块算法，解决了当数据量大时显存不足的问题。

10. cuRAND库
解析：cuRAND库提供了通过GPU生成随机数的接口，使用时需包含头文件#include <curand.h>。

11. CUDA同步方式
解析：在CUDA中，有两种方式实现同步，如下所示：
（1）System-level：等待所有host和device的工作完成。
（2）Block-level：等待device中block的所有thread执行到某个点。

参考文献： 【CUDA学习日记5】[1] Thrust：http://docs.nvidia.com/cuda/thrust/index.html#axzz4aFPI7CYb