计算机语言|CUDA零拷贝内存(zerocopy memory)

为了实现CPU与GPU内存的共享,CUDA采用了零拷贝内存,它是固定内存(pinned memory)的一种,当然,其实际存储空间是在CPU上。
零拷贝内存的延迟高,在进行频繁的读写操作时尽量少用,否则会大大降低性能。

/*
 * Allocate page-locked (pinned) host memory.
 *
 * flags:
 *   cudaHostAllocDefault:       makes cudaHostAlloc behave the same as cudaMallocHost
 *   cudaHostAllocPortable:      the returned pinned memory is usable by all CUDA
 *                               contexts, not only the one that performed the allocation
 *   cudaHostAllocWriteCombined: returns write-combined memory, which can transfer
 *                               faster over the PCIe bus on some system configurations
 *   cudaHostAllocMapped:        maps the allocation into the device address space and
 *                               returns host memory that the device can reference
 */
cudaError_t cudaHostAlloc(void **pHost, size_t count, unsigned int flags);


/*
 * Obtain the device-side pointer corresponding to mapped, page-locked host
 * memory previously allocated with cudaHostAlloc(..., cudaHostAllocMapped).
 * flags must currently be 0.
 */
cudaError_t cudaHostGetDevicePointer(void **pDevice, void *pHost, unsigned int flags);


整体的使用过程:
// Typical usage: allocate mapped pinned memory, initialize it on the host,
// then obtain the corresponding device pointer for use in kernels.
size_t nBytes = 2048;
float *h_A = NULL;   // host pointer (pinned, mapped)
float *d_A = NULL;   // device alias of the same allocation

// Allocate the mapped pinned memory directly. Do NOT malloc() the buffer
// first and then overwrite the pointer with cudaHostAlloc() -- that leaks
// the malloc'd block and discards any data already written into it.
cudaHostAlloc((void **)&h_A, nBytes, cudaHostAllocMapped);

// Initialize AFTER allocation so the data lives in the mapped buffer.
initialData(h_A, nBytes);

// Get the device-side address of the mapped host allocation (flags must be 0).
cudaHostGetDevicePointer((void **)&d_A, (void *)h_A, 0);

// ... launch kernels that read/write d_A ...

cudaFreeHost(h_A);



给个完整程序,源码网址: 点击打开链接
#include "../common/common.h"
#include <cuda_runtime.h>
#include <stdio.h>
#include <math.h>

/*
 * This example demonstrates the use of zero-copy memory to remove the need to
 * explicitly issue a memcpy operation between the host and device. By mapping
 * host, page-locked memory into the device's address space, the address can
 * directly reference a host array and transfer its contents over the PCIe bus.
 *
 * This example compares performing a vector addition with and without zero-copy
 * memory.
 */

// Compare the host-computed reference against the GPU result element by
// element; report and stop at the first mismatch.
void checkResult(float *hostRef, float *gpuRef, const int N)
{
    double epsilon = 1.0E-8;

    for (int i = 0; i < N; i++)
    {
        // fabs, not abs: abs() on a floating-point argument would truncate
        // the difference to int and mask sub-1.0 mismatches.
        if (fabs(hostRef[i] - gpuRef[i]) > epsilon)
        {
            printf("Arrays do not match!\n");
            printf("host %5.2f gpu %5.2f at current %d\n", hostRef[i],
                   gpuRef[i], i);
            break;
        }
    }

    return;
}

// Fill an array with pseudo-random values in [0.0, 25.5].
void initialData(float *ip, int size)
{
    int i;

    for (i = 0; i < size; i++)
    {
        ip[i] = (float)(rand() & 0xFF) / 10.0f;
    }

    return;
}

// CPU reference implementation of element-wise vector addition.
void sumArraysOnHost(float *A, float *B, float *C, const int N)
{
    for (int idx = 0; idx < N; idx++)
    {
        C[idx] = A[idx] + B[idx];
    }
}

// GPU vector addition over device global memory. One thread per element;
// the bounds check guards the grid tail when N is not a multiple of blockDim.
__global__ void sumArrays(float *A, float *B, float *C, const int N)
{
    int i = blockIdx.x * blockDim.x + threadIdx.x;

    if (i < N) C[i] = A[i] + B[i];
}

// Identical kernel body, but A and B are device aliases of mapped, page-locked
// host memory, so each access travels over PCIe instead of device DRAM.
__global__ void sumArraysZeroCopy(float *A, float *B, float *C, const int N)
{
    int i = blockIdx.x * blockDim.x + threadIdx.x;

    if (i < N) C[i] = A[i] + B[i];
}

int main(int argc, char **argv)
{
    // set up device
    int dev = 0;
    CHECK(cudaSetDevice(dev));

    // get device properties
    cudaDeviceProp deviceProp;
    CHECK(cudaGetDeviceProperties(&deviceProp, dev));

    // zero-copy requires the device to support mapping host memory
    if (!deviceProp.canMapHostMemory)
    {
        printf("Device %d does not support mapping CPU host memory!\n", dev);
        CHECK(cudaDeviceReset());
        exit(EXIT_SUCCESS);
    }

    printf("Using Device %d: %s ", dev, deviceProp.name);

    // set up data size of vectors: 2^ipower elements, default 2^10
    int ipower = 10;

    if (argc > 1) ipower = atoi(argv[1]);

    int nElem = 1 << ipower;
    size_t nBytes = nElem * sizeof(float);

    if (ipower < 18)
    {
        printf("Vector size %d power %d  nbytes %3.0f KB\n", nElem, ipower,
               (float)nBytes / (1024.0f));
    }
    else
    {
        printf("Vector size %d power %d  nbytes %3.0f MB\n", nElem, ipower,
               (float)nBytes / (1024.0f * 1024.0f));
    }

    // part 1: using device memory
    // malloc host memory
    float *h_A, *h_B, *hostRef, *gpuRef;
    h_A     = (float *)malloc(nBytes);
    h_B     = (float *)malloc(nBytes);
    hostRef = (float *)malloc(nBytes);
    gpuRef  = (float *)malloc(nBytes);

    // initialize data at host side
    initialData(h_A, nElem);
    initialData(h_B, nElem);
    memset(hostRef, 0, nBytes);
    memset(gpuRef, 0, nBytes);

    // add vector at host side for result checks
    sumArraysOnHost(h_A, h_B, hostRef, nElem);

    // malloc device global memory
    float *d_A, *d_B, *d_C;
    CHECK(cudaMalloc((float **)&d_A, nBytes));
    CHECK(cudaMalloc((float **)&d_B, nBytes));
    CHECK(cudaMalloc((float **)&d_C, nBytes));

    // transfer data from host to device
    CHECK(cudaMemcpy(d_A, h_A, nBytes, cudaMemcpyHostToDevice));
    CHECK(cudaMemcpy(d_B, h_B, nBytes, cudaMemcpyHostToDevice));

    // set up execution configuration (ceil-div so the grid covers all of nElem)
    int iLen = 512;
    dim3 block(iLen);
    dim3 grid((nElem + block.x - 1) / block.x);

    sumArrays<<<grid, block>>>(d_A, d_B, d_C, nElem);
    CHECK(cudaGetLastError());   // catch launch-configuration errors

    // copy kernel result back to host side (blocking, so it also synchronizes)
    CHECK(cudaMemcpy(gpuRef, d_C, nBytes, cudaMemcpyDeviceToHost));

    // check device results
    checkResult(hostRef, gpuRef, nElem);

    // free device global memory
    CHECK(cudaFree(d_A));
    CHECK(cudaFree(d_B));

    // free host memory
    free(h_A);
    free(h_B);

    // part 2: using zerocopy memory for array A and B
    // allocate zero-copy (mapped, pinned) memory
    CHECK(cudaHostAlloc((void **)&h_A, nBytes, cudaHostAllocMapped));
    CHECK(cudaHostAlloc((void **)&h_B, nBytes, cudaHostAllocMapped));

    // initialize data at host side
    initialData(h_A, nElem);
    initialData(h_B, nElem);
    memset(hostRef, 0, nBytes);
    memset(gpuRef, 0, nBytes);

    // pass the pointer to device
    CHECK(cudaHostGetDevicePointer((void **)&d_A, (void *)h_A, 0));
    CHECK(cudaHostGetDevicePointer((void **)&d_B, (void *)h_B, 0));

    // add at host side for result checks
    sumArraysOnHost(h_A, h_B, hostRef, nElem);

    // execute kernel with zero-copy memory
    sumArraysZeroCopy<<<grid, block>>>(d_A, d_B, d_C, nElem);
    CHECK(cudaGetLastError());

    // copy kernel result back to host side
    CHECK(cudaMemcpy(gpuRef, d_C, nBytes, cudaMemcpyDeviceToHost));

    // check device results
    checkResult(hostRef, gpuRef, nElem);

    // free memory
    CHECK(cudaFree(d_C));
    CHECK(cudaFreeHost(h_A));
    CHECK(cudaFreeHost(h_B));
    free(hostRef);
    free(gpuRef);

    // reset device
    CHECK(cudaDeviceReset());
    return EXIT_SUCCESS;
}



    推荐阅读