c++|opencv去除最小孔洞以及最小连通域的cuda加速,(这里只供自己查看)
描述
最近遇到一个 OpenCV 中去除最小孔洞以及最小连通域的算法。因为要对它进行优化并集成到项目中,所以使用 CUDA 对其进行加速。不过中间遇到了一些问题,亟待解决;而且这只是初版,其中关于最佳线程数量的分配都还没有优化。先贴上来,等有更好的方式再做修改。
opencv原算法.(这里其实也是百度到的一个)
// Removes small connected regions or fills small holes in a binary image (CPU reference).
//
// Src          single-channel 8-bit image; pixels are classified with the threshold 10.
// Dst          receives the result; it is (re)allocated as CV_8UC1 when it does not
//              already match Src in size and type.
// AreaLimit    regions with at most this many pixels are flipped.
// CheckMode    1: small regions of bright pixels (>= 10) are set to 0;
//              otherwise: small regions of dark pixels (<= 10) are set to 255*(1-CheckMode).
// NeihborMode  1: 8-connected neighbourhood; otherwise 4-connected.
//
// A label image tracks every pixel: 0 = not examined yet, 1 = currently being grown,
// 2 = belongs to a region that is too small (flip it), 3 = keep as is.
void Qimage2MatInteroperateGpu::removeSmallRegion(Mat & Src, Mat & Dst, int AreaLimit, int CheckMode, int NeihborMode)
{
    // Every pixel access below uses at<uchar>(), which is only valid for CV_8UC1 data.
    CV_Assert(Src.type() == CV_8UC1);

    // Dst is indexed with Src coordinates, so it must have the same geometry.
    if (Dst.empty() || Dst.size() != Src.size() || Dst.type() != CV_8UC1)
    {
        Dst = Mat::zeros(Src.size(), CV_8UC1);
    }

    int RemoveCount = 0;

    // Label image, initially all 0 (= not examined).
    Mat PointLabel = Mat::zeros(Src.size(), CV_8UC1);

    if (CheckMode == 1) // remove small bright regions: dark background needs no check
    {
        cout << "去除小连通域.";
        for (int i = 0; i < Src.rows; i++)
        {
            for (int j = 0; j < Src.cols; j++)
            {
                if (Src.at<uchar>(i, j) < 10)
                {
                    PointLabel.at<uchar>(i, j) = 3;
                }
            }
        }
    }
    else // fill small holes: bright foreground needs no check
    {
        cout << "去除孔洞";
        for (int i = 0; i < Src.rows; i++)
        {
            for (int j = 0; j < Src.cols; j++)
            {
                if (Src.at<uchar>(i, j) > 10)
                {
                    PointLabel.at<uchar>(i, j) = 3;
                }
            }
        }
    }
    showMat(PointLabel, "remove dong");

    // Neighbour offsets: the 4-neighbourhood first, the diagonals only in 8-connected mode.
    vector<Point2i> NeihborPos;
    NeihborPos.push_back(Point2i(-1, 0));
    NeihborPos.push_back(Point2i(1, 0));
    NeihborPos.push_back(Point2i(0, -1));
    NeihborPos.push_back(Point2i(0, 1));
    if (NeihborMode == 1)
    {
        cout << "Neighbor mode: 8邻域." << endl;
        NeihborPos.push_back(Point2i(-1, -1));
        NeihborPos.push_back(Point2i(-1, 1));
        NeihborPos.push_back(Point2i(1, -1));
        NeihborPos.push_back(Point2i(1, 1));
    }
    else cout << "Neighbor mode: 4邻域." << endl;

    // Take the count from the table itself: 4 + 4 * NeihborMode would run past the
    // table for any NeihborMode other than 0 or 1.
    const size_t NeihborCount = NeihborPos.size();

    // Grow a region from every pixel that has not been examined yet.
    for (int i = 0; i < Src.rows; i++)
    {
        for (int j = 0; j < Src.cols; j++)
        {
            if (PointLabel.at<uchar>(i, j) != 0)
            {
                continue;
            }
            std::cout << "开始记录不合格的点: " << i << " " << j << std::endl;

            // Points are stored as (x, y) = (column, row).
            vector<Point2i> GrowBuffer;
            GrowBuffer.push_back(Point2i(j, i));
            PointLabel.at<uchar>(i, j) = 1;

            // GrowBuffer doubles as the work queue: it keeps growing while it is
            // traversed, so the loop ends once the whole region has been collected.
            for (size_t z = 0; z < GrowBuffer.size(); z++)
            {
                // Copy the point: push_back below may reallocate the vector.
                const Point2i Current = GrowBuffer[z];
                for (size_t q = 0; q < NeihborCount; q++)
                {
                    const int CurrX = Current.x + NeihborPos[q].x;
                    const int CurrY = Current.y + NeihborPos[q].y;
                    const bool Inside = CurrX >= 0 && CurrX < Src.cols && CurrY >= 0 && CurrY < Src.rows;
                    if (Inside && PointLabel.at<uchar>(CurrY, CurrX) == 0)
                    {
                        GrowBuffer.push_back(Point2i(CurrX, CurrY));
                        // Mark immediately so the pixel is not queued twice.
                        PointLabel.at<uchar>(CurrY, CurrX) = 1;
                    }
                }
            }
            std::cout << "计算获取到的区域对象的像素点: " << GrowBuffer.size() << " -- (j i) " << i << " " << j << std::endl;

            // 2 = region is larger than the limit, 1 = region is small enough to flip.
            // Compare as signed values so a negative AreaLimit is not converted to a
            // huge unsigned number.
            int CheckResult = 0;
            if (static_cast<long long>(GrowBuffer.size()) > static_cast<long long>(AreaLimit))
            {
                CheckResult = 2;
            }
            else
            {
                CheckResult = 1;
                RemoveCount++;
            }

            // Every member currently carries label 1; adding CheckResult turns it
            // into 2 (flip) or 3 (keep).
            for (size_t z = 0; z < GrowBuffer.size(); z++)
            {
                PointLabel.at<uchar>(GrowBuffer[z].y, GrowBuffer[z].x) += CheckResult;
            }
        }
    }

    // Replacement value for flipped pixels: 0 for CheckMode 1, 255 for CheckMode 0.
    CheckMode = 255 * (1 - CheckMode);
    for (int i = 0; i < Src.rows; ++i)
    {
        for (int j = 0; j < Src.cols; ++j)
        {
            if (PointLabel.at<uchar>(i, j) == 2)
            {
                Dst.at<uchar>(i, j) = CheckMode;
            }
            else if (PointLabel.at<uchar>(i, j) == 3)
            {
                Dst.at<uchar>(i, j) = Src.at<uchar>(i, j);
            }
        }
    }
    cout << RemoveCount << " objects removed." << endl;
}
cuda加速之后的
定义
__host__ bool removeSmallRegionGpu(Mat & Src, Mat & Dst, int AreaLimit, int CheckMode, int NeihborMode);

/******************************************************/
// Kernel: removeSmallRegionKernelGpu
// Purpose: region growing for small-region / small-hole removal, one thread per pixel.
//
// Parameters:
//   source              input image, width * height bytes, already binarised grey data.
//   Auxiliary           per-pixel label image (0 = not examined, 1 = being grown,
//                       2 = region too small, 3 = keep).
//   outImg              output image, width * height bytes.
//   outCalculateNumber  self-test output: the region size found by the thread that
//                       started growing at that pixel.
//   neiBorModeBuffer    neighbour offsets, 4-neighbourhood first, then the diagonals.
//   neightborSize       number of entries in neiBorModeBuffer.
//   GrowBuffer          work queue standing in for std::vector::push_back, which is
//                       not available in device code; a plain array plus a counter
//                       emulate it.
//   GrowBufferSize      capacity of GrowBuffer in elements.
//   width, height       image size in pixels.
//   AreaLimit           regions with at most this many pixels are flipped.
//   checkMode           1: remove small bright regions; otherwise fill small holes.
//   neiborMode          1: 8-connected; otherwise 4-connected.
//
// Launch layout: 2D grid of 2D blocks covering at least width x height threads.
//
// NOTE(review): every thread receives the same GrowBuffer pointer and starts writing
// at GrowBuffer[0], and labels in Auxiliary are read and then written without
// atomics. __syncthreads() only synchronises one block. Threads that grow regions
// concurrently can therefore overwrite each other's queue entries and labels, so the
// results are not reliable as written. Fixing that needs a different algorithm
// (for example label propagation across several launches) and a different interface.
/******************************************************/
__global__ void removeSmallRegionKernelGpu
(uchar* source, uchar* Auxiliary, uchar* outImg,
    int *outCalculateNumber,
    int2* neiBorModeBuffer, int neightborSize, int2 *GrowBuffer, int GrowBufferSize,
    int width, int height,
    int AreaLimit, int checkMode, int neiborMode
)
{
    const int tidx = threadIdx.x + blockIdx.x * blockDim.x;
    const int tidy = threadIdx.y + blockIdx.y * blockDim.y;

    // Valid coordinates are [0, width) x [0, height); the upper bounds are exclusive.
    const bool insideImage = (tidx < width) && (tidy < height);

    int offsetIndex = 0;
    if (insideImage)
    {
        offsetIndex = tidx + tidy * width;

        // Pre-label pixels that need no check. ckNumbser is declared outside this
        // excerpt; the CPU version uses the label 3 for the same purpose.
        const bool needsNoCheck = (checkMode == 1) ? (source[offsetIndex] < 10)
                                                   : (source[offsetIndex] > 10);
        if (needsNoCheck)
        {
            Auxiliary[offsetIndex] = ckNumbser;
        }
    }

    // The barrier must be reached by every thread of the block, so threads outside
    // the image leave only after it. It orders the labels within this block only.
    __syncthreads();
    if (!insideImage)
    {
        return;
    }

    int neihborCount = 4;
    if (neightborSize < neihborCount)
    {
        printf("neightborSize less 4 \n");
    }
    if (neiborMode == 1)
    {
        neihborCount = 8;
    }
    // Never read past the end of the offset table.
    if (neihborCount > neightborSize)
    {
        neihborCount = neightborSize;
    }

    // Grow a region from this pixel if nobody has examined it yet.
    if (Auxiliary[offsetIndex] == 0 && GrowBufferSize > 0)
    {
        GrowBuffer[0] = make_int2(tidx, tidy);
        Auxiliary[offsetIndex] = 1;

        const int offSetLength = width * height;
        int GrowBUfferValied = 1;   // number of valid queue entries
        bool bufferFull = false;

        for (int z = 0; z < GrowBUfferValied && !bufferFull; z++)
        {
            const int2 current = GrowBuffer[z];
            for (int q = 0; q < neihborCount; q++)
            {
                const int2 step = neiBorModeBuffer[q];
                const int currX = current.x + step.x;
                const int currY = current.y + step.y;
                if (currX < 0 || currX >= width || currY < 0 || currY >= height)
                {
                    continue;
                }
                // In-range coordinates always give an offset in [0, width*height),
                // including offset 0, so no further offset check is needed.
                const int currxyOffset = currX + currY * width;
                if (Auxiliary[currxyOffset] != 0)
                {
                    continue;
                }
                // Check the capacity before writing, not after.
                if (GrowBUfferValied >= GrowBufferSize)
                {
                    bufferFull = true;
                    break;
                }
                GrowBuffer[GrowBUfferValied] = make_int2(currX, currY);
                // Mark immediately so the pixel is not queued twice.
                Auxiliary[currxyOffset] = 1;
                GrowBUfferValied++;
            }
            printf("GrowBUfferValied++%d \n", GrowBUfferValied);
            if (bufferFull)
            {
                printf("GrowBUfferValied size number is over \n");
            }
        }

        if (GrowBUfferValied > 20)
        {
            printf("get recRange is %d %d-> %dimgsize: %d\n", tidx, tidy, GrowBUfferValied, offSetLength);
        }

        // Self-test record of the region size found from this seed.
        outCalculateNumber[offsetIndex] = GrowBUfferValied;

        // 2 = region is larger than the limit, 1 = region is small enough to flip.
        const int checkResult = (GrowBUfferValied > AreaLimit) ? 2 : 1;

        // Members carry label 1; adding checkResult turns it into 2 (flip) or 3 (keep).
        for (int z = 0; z < GrowBUfferValied; z++)
        {
            const int2 member = GrowBuffer[z];
            if (member.x >= 0 && member.x < width && member.y >= 0 && member.y < height)
            {
                Auxiliary[member.x + member.y * width] += checkResult;
            }
        }
    }

    // Replacement value for flipped pixels: 0 for checkMode 1, 255 for checkMode 0.
    checkMode = 255 * (1 - checkMode);
    if (Auxiliary[offsetIndex] == 2)
    {
        outImg[offsetIndex] = checkMode;
    }
    else if (Auxiliary[offsetIndex] == 3)
    {
        outImg[offsetIndex] = source[offsetIndex];
    }
}
/// 本地函数实现部分.
// Host wrapper: uploads Src, runs the pre-labelling kernel and the region-growing
// kernel, and downloads the result into Dst.
//
// Src          single-channel 8-bit (CV_8UC1) binarised image.
// Dst          receives the result; it is (re)allocated as CV_8UC1 when it is empty or
//              does not match Src in size, type or continuity.
// AreaLimit, CheckMode, NeihborMode are forwarded to the kernels unchanged.
//
// Returns false when the input is rejected, true otherwise. CUDA failures are
// reported through HANDLE_ERROR, which is defined outside this excerpt.
__host__ bool removeSmallRegionGpu(Mat & Src, Mat & Dst, int AreaLimit, int CheckMode, int NeihborMode)
{
    if (Src.data == nullptr)
    {
        std::cout << "src is nullptr" << std::endl;
        return false;
    }

    const int imgWidth = Src.cols;
    const int imgHeight = Src.rows;
    const int channels = Src.channels();

    // The kernels index one byte per pixel, so anything but one channel is rejected.
    if (channels != 1)
    {
        std::cout << "src`channelsis 3 or more,please convert 1 channel" << std::endl;
        return false;
    }
    if (Src.depth() != CV_8U)
    {
        std::cout << "src depth is not 8 bit unsigned" << std::endl;
        return false;
    }
    std::cout << "Src` channel is " << Src.channels() << std::endl;

    if (Dst.data == nullptr)
    {
        std::cout << "dst data is empty, the process will creat it default" << std::endl;
        Dst = cv::Mat::zeros(cv::Size(imgWidth, imgHeight), CV_8UC1);
    }
    else if (Dst.size() != Src.size() || Dst.type() != CV_8UC1 || !Dst.isContinuous())
    {
        // The download below writes imgSize contiguous bytes into Dst.data.
        Dst = cv::Mat::zeros(cv::Size(imgWidth, imgHeight), CV_8UC1);
    }

    // The upload below reads imgSize contiguous bytes, so padded rows must be packed.
    const cv::Mat srcHost = Src.isContinuous() ? Src : Src.clone();

    const int imgSize = imgWidth * imgHeight;
    const size_t imageBytes = sizeof(uchar) * static_cast<size_t>(imgSize);
    const size_t recordBytes = sizeof(int) * static_cast<size_t>(imgSize);

    // Device copy of the input image.
    uchar* srcGpu = nullptr;
    HANDLE_ERROR(cudaMalloc((void**)&srcGpu, imageBytes));
    HANDLE_ERROR(cudaMemcpy(srcGpu, srcHost.data, imageBytes, cudaMemcpyKind::cudaMemcpyHostToDevice));

    // Device output image, initially black.
    uchar* DstGpu = nullptr;
    HANDLE_ERROR(cudaMalloc((void**)&DstGpu, imageBytes));
    HANDLE_ERROR(cudaMemset(DstGpu, 0, imageBytes));

    // Device label image, initially all 0 (= not examined).
    uchar* AuxiliaryGpu = nullptr;
    HANDLE_ERROR(cudaMalloc((void**)&AuxiliaryGpu, imageBytes));
    HANDLE_ERROR(cudaMemset(AuxiliaryGpu, 0, imageBytes));

    // One thread per pixel: 32 x 32 = 1024 threads per block, grid rounded up.
    const int Maxblocks = 32;
    const dim3 threadsPerBlock(Maxblocks, Maxblocks);
    const dim3 blocksPerGrid((imgWidth + threadsPerBlock.x - 1) / threadsPerBlock.x,
                             (imgHeight + threadsPerBlock.y - 1) / threadsPerBlock.y);

    // Neighbour offsets: 4-neighbourhood first, then the diagonals. Building the
    // device vector from a host array uploads all eight entries in one go.
    const int2 hostNeighbours[8] = {
        { -1, 0 }, { 1, 0 }, { 0, -1 }, { 0, 1 },
        { -1, -1 }, { -1, 1 }, { 1, -1 }, { 1, 1 }
    };
    thrust::device_vector<int2> neightborBuffer(hostNeighbours, hostNeighbours + 8);
    int2* neightBorBufferPtr = thrust::raw_pointer_cast(neightborBuffer.data());
    const int neightborSize = static_cast<int>(neightborBuffer.size());

    // Work queue for region growing, one slot per pixel.
    // NOTE(review): this single buffer is handed to every thread of the launch.
    thrust::device_vector<int2> grawBuffer(static_cast<size_t>(imgSize));
    int2* grawBufferPtr = thrust::raw_pointer_cast(grawBuffer.data());
    const int grawBufferSize = static_cast<int>(grawBuffer.size());

    std::cout << "blocksPerGrid size: " << blocksPerGrid.x << "" << blocksPerGrid.y << std::endl;
    std::cout << "threadsPerBlock size: " << threadsPerBlock.x << "" << threadsPerBlock.y << std::endl;

    // Pre-label the pixels that need no check (kernel defined outside this excerpt).
    prereatmentAuxiliary << < blocksPerGrid, threadsPerBlock >> > (srcGpu, AuxiliaryGpu, imgWidth, imgHeight, CheckMode);
    HANDLE_ERROR(cudaGetLastError());

    // Debug view of the labels after pre-labelling.
    cv::Mat auxiliaryTemp = cv::Mat::zeros(cv::Size(imgWidth, imgHeight), CV_8UC1);
    HANDLE_ERROR(cudaMemcpy(auxiliaryTemp.data, AuxiliaryGpu, imageBytes, cudaMemcpyKind::cudaMemcpyDeviceToHost));
    cv::imshow("auxiliaryTemp", auxiliaryTemp);

    // Self-test buffer: region size recorded per seed pixel.
    int* recordBuffer = nullptr;
    HANDLE_ERROR(cudaMalloc((void**)&recordBuffer, recordBytes));
    HANDLE_ERROR(cudaMemset(recordBuffer, 0, recordBytes));

    // Region growing.
    removeSmallRegionKernelGpu << < blocksPerGrid, threadsPerBlock >> > (
        srcGpu, AuxiliaryGpu, DstGpu, recordBuffer,
        neightBorBufferPtr, neightborSize,
        grawBufferPtr, grawBufferSize,
        imgWidth, imgHeight,
        AreaLimit, CheckMode, NeihborMode);
    HANDLE_ERROR(cudaGetLastError());        // launch configuration errors
    HANDLE_ERROR(cudaDeviceSynchronize());   // errors raised while the kernel ran

    // Sum the recorded region sizes; the vector releases its memory automatically.
    std::vector<int> cpuBuffer(static_cast<size_t>(imgSize), 0);
    HANDLE_ERROR(cudaMemcpy(cpuBuffer.data(), recordBuffer, recordBytes, cudaMemcpyKind::cudaMemcpyDeviceToHost));
    long long controlNumber = 0;
    for (int i = 0; i < imgSize; i++)
    {
        controlNumber += cpuBuffer[i];
    }
    std::cout << "最后总数: " << controlNumber << std::endl;

    // Download the result image.
    HANDLE_ERROR(cudaMemcpy(Dst.data, DstGpu, imageBytes, cudaMemcpyKind::cudaMemcpyDeviceToHost));

    // Debug view of the final labels.
    cv::Mat OUTauxiliary = cv::Mat::zeros(cv::Size(imgWidth, imgHeight), CV_8UC1);
    HANDLE_ERROR(cudaMemcpy(OUTauxiliary.data, AuxiliaryGpu, imageBytes, cudaMemcpyKind::cudaMemcpyDeviceToHost));
    cv::imshow("OUTauxiliary", OUTauxiliary);

    // Release the device buffers allocated with cudaMalloc.
    HANDLE_ERROR(cudaFree(srcGpu));
    HANDLE_ERROR(cudaFree(DstGpu));
    HANDLE_ERROR(cudaFree(AuxiliaryGpu));
    HANDLE_ERROR(cudaFree(recordBuffer));
    return true;
}
// Usage example:
// Runs the CPU and the GPU implementation on the same test image.
void test()
{
    cv::Mat Source = cv::imread(R"(..\\MatLabCuda\\img\\source\\remove_B.bmp)");
    if (Source.empty())
    {
        std::cout << "test image could not be loaded" << std::endl;
        return;
    }

    // Both implementations expect a single-channel image. A multi-channel image is
    // reduced to its first channel; a single-channel image is used as it is.
    cv::Mat SourceSignel;
    if (Source.channels() == 1)
    {
        SourceSignel = Source;
    }
    else
    {
        cv::extractChannel(Source, SourceSignel, 0);
    }

    // Both outputs are written as one byte per pixel.
    cv::Mat outImg = cv::Mat::zeros(cv::Size(Source.cols, Source.rows), CV_8UC1);
    cv::Mat outImgGpu = cv::Mat::zeros(cv::Size(Source.cols, Source.rows), CV_8UC1);
    removeSmallRegion(SourceSignel, outImg, 100, 1, 1);
    removeSmallRegionGpu(SourceSignel, outImgGpu, 100, 1, 1);
}
以上就是整个过程,其实不需要我再多做介绍:各个步骤都已经写在上面的注释里,只要按顺序阅读,就基本能明白我要做的事情了。
推荐阅读
- opencv|opencv C++模板匹配的简单实现
- Java|Java OpenCV图像处理之SIFT角点检测详解
- C语言学习|第十一届蓝桥杯省赛 大学B组 C/C++ 第一场
- c++基础概念笔记
- 牛逼!C++开发的穿越丛林真人游戏,游戏未上线就有百万人气
- OpenCV|OpenCV-Python实战(18)——深度学习简介与入门示例
- C++Primer之|C++Primer之 函数探幽
- c/c++|有感 Visual Studio 2015 RTM 简介 - 八年后回归 Dot Net,终于迎来了 Mvc 时代,盼走了 Web 窗体时代...
- QML基础信息
- OpenCV|OpenCV for Unity 通过WebCamTextureToMatHelper帮助类来获取摄像头的画面