MindSpore:CUDA编程(五)Event

Event是CUDA中的事件,用于分析、检测CUDA程序中的错误。一般我们会定义一个宏:

#pragma once
#include <stdio.h>

#define CHECK(call) \
do \
{ \
    const cudaError_t error_code = call; \
    if (error_code != cudaSuccess) \
    { \
        printf("CUDA Error:\n"); \
        printf("File:%s\n", __FILE__); \
        printf("Line:%d\n", __LINE__); \
        printf("Error code: %d\n", error_code); \
        printf("Error text: %s\n", \
               cudaGetErrorString(error_code)); \
        exit(1); \
    } \
} while (0)

并在适当的位置使用这个宏来打印CUDA的错误日志。注意:#pragma once 不要放在源代码文件里,一般只放在头文件里(防止头文件被引入多次)。

Event的调用有以下内容:
MindSpore:CUDA编程(五)Event
文章图片

具体的顺序如下:

(1)声明Event(这里以计算核函数运行时间前后的start Event和stop Event为例)
cudaEvent_t start, stop;

(2)创建Event
CHECK(cudaEventCreate(&start));
CHECK(cudaEventCreate(&stop));

(3)添加Event(在合适的地方)
cudaEventRecord(start);
cudaEventRecord(stop);

(4)等待Event完成
(a)非阻塞方式——可以用于一些不需要等待的处理
cudaEventQuery(start);
(b)阻塞方式——可以用于执行核函数后等待核函数执行完毕后的处理
cudaEventSynchronize(stop);

(5)计算两个Event间隔时间
CHECK(cudaEventElapsedTime(&elapsed_time, start, stop));

(6)销毁Event
CHECK(cudaEventDestroy(start));
CHECK(cudaEventDestroy(stop));

以上次介绍的矩阵乘为例,完整的代码如下:

#pragma once
#include <stdio.h>
#include <stdlib.h>

// CHECK(call): evaluates a CUDA runtime call exactly once. If the returned
// cudaError_t is not cudaSuccess, prints the source file, line, numeric error
// code and the text from cudaGetErrorString(), then terminates with exit(1).
//
// The body is wrapped in do { ... } while (0) so that `CHECK(x);` behaves as a
// single statement (safe inside an unbraced if/else). The argument is
// parenthesized so that any expression can be passed.
#define CHECK(call)                                     \
do                                                      \
{                                                       \
    const cudaError_t error_code = (call);              \
    if (error_code != cudaSuccess)                      \
    {                                                   \
        printf("CUDA Error:\n");                        \
        printf("File:%s\n", __FILE__);                  \
        printf("Line:%d\n", __LINE__);                  \
        printf("Error code: %d\n", error_code);         \
        printf("Error text: %s\n",                      \
               cudaGetErrorString(error_code));         \
        exit(1);                                        \
    }                                                   \
} while (0)
#include <stdio.h>
#include <stdlib.h>
#include <math.h>
#include "error.cuh"

// Edge length of the square thread block used to launch the kernel.
#define BLOCK_SIZE 32

// Integer matrix multiplication kernel: c = a * b.
//
//   a : m x n matrix, row-major (read as a[row * n + i])
//   b : n x k matrix, row-major (read as b[i * k + col])
//   c : m x k matrix, row-major (written as c[row * k + col])
//
// Each thread computes one element of c. The x dimension of the launch indexes
// columns of c and the y dimension indexes rows, so the grid must cover at
// least k threads in x and m threads in y. Threads that fall outside the
// m x k result do nothing, so the grid does not have to divide the matrix
// evenly.
__global__ void gpu_matrix_mult(int *a, int *b, int *c, int m, int n, int k)
{
    int row = blockIdx.y * blockDim.y + threadIdx.y;
    int col = blockIdx.x * blockDim.x + threadIdx.x;

    // Bounds guard for the partial blocks at the right/bottom edge of the grid.
    if (col < k && row < m)
    {
        int sum = 0;
        for (int i = 0; i < n; i++)
        {
            sum += a[row * n + i] * b[i * k + col];
        }
        c[row * k + col] = sum;
    }
}
// Host reference implementation of integer matrix multiplication,
// h_result = h_a * h_b.
//
//   h_a      : m x n matrix, row-major
//   h_b      : n x k matrix, row-major
//   h_result : m x k matrix, row-major; every element is overwritten
//
// Products are accumulated in a plain int.
void cpu_matrix_mult(int *h_a, int *h_b, int *h_result, int m, int n, int k)
{
    for (int i = 0; i < m; ++i)
    {
        for (int j = 0; j < k; ++j)
        {
            int tmp = 0;
            for (int h = 0; h < n; ++h)
            {
                tmp += h_a[i * n + h] * h_b[h * k + j];
            }
            h_result[i * k + j] = tmp;
        }
    }
}
// Polls an event without blocking the host. Returns true when all work
// captured by the event has finished and false while it is still pending.
// cudaErrorNotReady only means "not finished yet", so it is cleared instead of
// being reported; any other status is treated as a real error by CHECK.
static bool event_is_complete(cudaEvent_t event)
{
    const cudaError_t status = cudaEventQuery(event);
    if (status == cudaErrorNotReady)
    {
        // Reset the last-error state so that a later cudaGetLastError() check
        // cannot mistake "not ready" for a failure.
        (void)cudaGetLastError();
        return false;
    }
    CHECK(status);
    return true;
}

// Multiplies two random 100 x 100 integer matrices on the GPU, times the kernel
// (start -> stop) and the following device-to-host copy (stop -> stop2) with
// CUDA events, and verifies the result against cpu_matrix_mult.
// Prints both timings followed by "Pass!!!" or "Error!!!". Always returns 0
// unless a CUDA call fails, in which case CHECK exits with status 1.
int main(int argc, char const *argv[])
{
    (void)argc;
    (void)argv;

    // A is m x n, B is n x k, C = A * B is m x k.
    const int m = 100;
    const int n = 100;
    const int k = 100;
    const size_t bytes_a = sizeof(int) * m * n;
    const size_t bytes_b = sizeof(int) * n * k;
    const size_t bytes_c = sizeof(int) * m * k;

    // Declare and create the events. Every event must be created before it is
    // recorded, queried or used for timing.
    cudaEvent_t start, stop, stop2;
    CHECK(cudaEventCreate(&start));
    CHECK(cudaEventCreate(&stop));
    CHECK(cudaEventCreate(&stop2));

    // Page-locked host buffers: inputs, GPU result (h_c), CPU reference (h_cc).
    int *h_a, *h_b, *h_c, *h_cc;
    CHECK(cudaMallocHost((void **) &h_a, bytes_a));
    CHECK(cudaMallocHost((void **) &h_b, bytes_b));
    CHECK(cudaMallocHost((void **) &h_c, bytes_c));
    CHECK(cudaMallocHost((void **) &h_cc, bytes_c));

    for (int i = 0; i < m; ++i)
    {
        for (int j = 0; j < n; ++j)
        {
            h_a[i * n + j] = rand() % 1024;
        }
    }
    for (int i = 0; i < n; ++i)
    {
        for (int j = 0; j < k; ++j)
        {
            h_b[i * k + j] = rand() % 1024;
        }
    }

    int *d_a, *d_b, *d_c;
    CHECK(cudaMalloc((void **) &d_a, bytes_a));
    CHECK(cudaMalloc((void **) &d_b, bytes_b));
    CHECK(cudaMalloc((void **) &d_c, bytes_c));

    // Copy matrices A and B from host to device memory.
    CHECK(cudaMemcpy(d_a, h_a, bytes_a, cudaMemcpyHostToDevice));
    CHECK(cudaMemcpy(d_b, h_b, bytes_b, cudaMemcpyHostToDevice));

    // Ceil-divide so that the grid covers the whole m x k result.
    const unsigned int grid_rows = (m + BLOCK_SIZE - 1) / BLOCK_SIZE;
    const unsigned int grid_cols = (k + BLOCK_SIZE - 1) / BLOCK_SIZE;
    dim3 dimGrid(grid_cols, grid_rows);
    dim3 dimBlock(BLOCK_SIZE, BLOCK_SIZE);

    // Record the start event, then poll it once without blocking
    // (demonstrates the non-blocking mode; the result is not needed here).
    CHECK(cudaEventRecord(start));
    (void)event_is_complete(start);

    // Launch the kernel defined in this file. A launch returns no status of
    // its own, so configuration errors are picked up via cudaGetLastError().
    gpu_matrix_mult<<<dimGrid, dimBlock>>>(d_a, d_b, d_c, m, n, k);
    CHECK(cudaGetLastError());

    // Record the stop event and block until the kernel has finished.
    CHECK(cudaEventRecord(stop));
    CHECK(cudaEventSynchronize(stop));

    // Elapsed time start -> stop.
    float elapsed_time;
    CHECK(cudaEventElapsedTime(&elapsed_time, start, stop));
    printf("start-》stop:Time = %g ms.\n", elapsed_time);

    CHECK(cudaMemcpy(h_c, d_c, bytes_c, cudaMemcpyDeviceToHost));

    // Record stop2 and poll it once without blocking (demonstration only).
    CHECK(cudaEventRecord(stop2));
    (void)event_is_complete(stop2);

    // Both events must have completed before the elapsed time can be read,
    // so wait for stop2 explicitly.
    CHECK(cudaEventSynchronize(stop2));

    // Elapsed time stop -> stop2.
    float elapsed_time2;
    CHECK(cudaEventElapsedTime(&elapsed_time2, stop, stop2));
    printf("stop-》stop2:Time = %g ms.\n", elapsed_time2);

    // Destroy the events.
    CHECK(cudaEventDestroy(start));
    CHECK(cudaEventDestroy(stop));
    CHECK(cudaEventDestroy(stop2));

    // CPU reference result.
    cpu_matrix_mult(h_a, h_b, h_cc, m, n, k);

    // The matrices hold integers, so the results must match exactly.
    int ok = 1;
    for (int i = 0; i < m * k; ++i)
    {
        if (h_cc[i] != h_c[i])
        {
            ok = 0;
            break;
        }
    }

    if (ok)
    {
        printf("Pass!!!\n");
    }
    else
    {
        printf("Error!!!\n");
    }

    // Free device and host memory (including the CPU reference buffer).
    CHECK(cudaFree(d_a));
    CHECK(cudaFree(d_b));
    CHECK(cudaFree(d_c));
    CHECK(cudaFreeHost(h_a));
    CHECK(cudaFreeHost(h_b));
    CHECK(cudaFreeHost(h_c));
    CHECK(cudaFreeHost(h_cc));
    return 0;
}

在Quadro P1000的GPU上执行:
MindSpore:CUDA编程(五)Event
文章图片

这里以矩阵乘为例,打印了调用矩阵乘核函数的时间,以及后面 cudaMemcpy 的时间。我们强行将 CHECK(cudaMemcpy(d_b, h_b, sizeof(int)*n*k, cudaMemcpyHostToDevice)); 改为 CHECK(cudaMemcpy(d_b, h_b, sizeof(int)*n*k*2, cudaMemcpyHostToDevice)); 故意让其越界。再重新编译、运行,看看效果:
MindSpore:CUDA编程(五)Event
文章图片

系统会告诉你 这行有错:
MindSpore:CUDA编程(五)Event
文章图片

这样就可以跟踪出CUDA调用中的错误。这里需要总结一下张小白在调试CHECK过程中发现的几个问题:(1)如果没有 CHECK(cudaEventCreate()) 就直接调用 cudaEventRecord() 或者执行后面的Event函数,会导致打印不了信息。张小白当时对于stop2这个event就犯了这个错,导致 stop->stop2的时间怎么都打不出来。(2)对于 cudaEventQuery() 是不能加 CHECK的,如果加了反而会报错:在上面的环境中,如果您这样写:CHECK(cudaEventQuery(stop2)); 编译执行就会出现以下错误:
MindSpore:CUDA编程(五)Event
文章图片

cudaEventQuery 返回的 cudaErrorNotReady 只表示该事件所捕获的工作尚未执行完毕(事件还没有完成),并不代表出错。(如果事件根本没有被 cudaEventRecord 记录过,cudaEventQuery 返回的是 cudaSuccess。)

    推荐阅读