|
CUDA programming: image convolution on the GPU
#include "stdio.h"
#include <stdlib.h> // exit, EXIT_FAILURE (used by HandleError)
// Report a failed CUDA runtime call and terminate the process.
// Not called directly: the HANDLE_ERROR macro below supplies the
// call site's file name and line number automatically.
static void HandleError(cudaError_t err,
                        const char *file,
                        int line)
{
    if (err == cudaSuccess)
    {
        return; // nothing to report
    }
    printf("错误:%s在%s中%d行,看看吧\n",
           cudaGetErrorString(err),
           file, line);
    exit(EXIT_FAILURE);
}
// Wrap every CUDA runtime call: HANDLE_ERROR(cudaMalloc(...));
#define HANDLE_ERROR(err) (HandleError(err, __FILE__, __LINE__))
// Query device 0 and print basic launch limits.
// Returns the maximum number of threads per block (e.g. 1024),
// which main() uses as the block size for the kernel launch.
int getThreadNum()
{
    cudaDeviceProp prop;
    int count;
    HANDLE_ERROR(cudaGetDeviceCount(&count));
    printf("gpu num %d\n", count);
    HANDLE_ERROR(cudaGetDeviceProperties(&prop, 0));
    printf("max thread num: %d\n", prop.maxThreadsPerBlock);
    // Fixed: the original format string ended with an unbalanced ')'.
    printf("max grid dimensions: %d, %d, %d\n",
           prop.maxGridSize[0], prop.maxGridSize[1], prop.maxGridSize[2]);
    return prop.maxThreadsPerBlock;
}
// 2D convolution kernel: one thread computes one output pixel.
// img, kernel and result are flat, row-major device arrays:
// img/result are width*height elements, kernel is kernelSize*kernelSize.
// Launch with a 1D grid covering at least width*height threads.
// Taps that fall outside the image are treated as zero (zero padding).
__global__ void conv(float *img, float *kernel, float *result,
                     int width, int height, int kernelSize)
{
    // Flat global thread id doubles as the flat pixel index.
    int id = blockIdx.x * blockDim.x + threadIdx.x;
    if (id >= width * height)
    {
        return; // guard the grid tail
    }
    // Recover 2D coordinates from the flat index.
    int row = id / width;
    int col = id % width;
    int half = kernelSize / 2;

    // Fixed: accumulate in a register and store once. The original did
    // `result[id] += ...`, which silently required the host to pre-zero
    // the result buffer (cudaMalloc does not initialize memory) and
    // produced garbage when it was not zeroed.
    float sum = 0.0f;
    for (int i = 0; i < kernelSize; ++i)
    {
        for (int j = 0; j < kernelSize; ++j)
        {
            int curRow = row - half + i;
            int curCol = col - half + j;
            float imgValue = 0.0f; // zero padding outside the image
            if (curRow >= 0 && curCol >= 0 && curRow < height && curCol < width)
            {
                imgValue = img[curRow * width + curCol];
            }
            sum += kernel[i * kernelSize + j] * imgValue;
        }
    }
    result[id] = sum;
}
// CUDA programming: image convolution on the GPU
// Demo driver: build a synthetic 1920x1080 image and a 3x3 kernel on the
// host, convolve on the GPU, and print the top-left 10x10 corner of the
// image and of the result, plus the kernel itself.
int main()
{
    const int width = 1920;
    const int height = 1080;
    const int kernelSize = 3;

    // Host image: deterministic ramp pattern, values in [0, 255].
    float *img = new float[width * height];
    for (int row = 0; row < height; ++row)
    {
        for (int col = 0; col < width; ++col)
        {
            img[col + row * width] = (col + row) % 256;
        }
    }

    // Host kernel: entries cycle -1, 0, 1 (a simple edge-like filter).
    float *kernel = new float[kernelSize * kernelSize];
    for (int i = 0; i < kernelSize * kernelSize; ++i)
    {
        kernel[i] = i % kernelSize - 1;
    }

    // Device buffers.
    float *imgGpu;
    float *kernelGpu;
    float *resultGpu;
    HANDLE_ERROR(cudaMalloc(&imgGpu, width * height * sizeof(float)));
    HANDLE_ERROR(cudaMalloc(&kernelGpu, kernelSize * kernelSize * sizeof(float)));
    HANDLE_ERROR(cudaMalloc(&resultGpu, width * height * sizeof(float)));

    HANDLE_ERROR(cudaMemcpy(imgGpu, img,
                            width * height * sizeof(float), cudaMemcpyHostToDevice));
    HANDLE_ERROR(cudaMemcpy(kernelGpu, kernel,
                            kernelSize * kernelSize * sizeof(float), cudaMemcpyHostToDevice));
    // Fixed: the kernel accumulates with `+=`, so the result buffer must
    // start zeroed — cudaMalloc leaves memory uninitialized.
    HANDLE_ERROR(cudaMemset(resultGpu, 0, width * height * sizeof(float)));

    int threadNum = getThreadNum(); // block size (typically 1024)
    // Fixed: integer ceiling division instead of the float `- 0.5` trick.
    int blockNum = (width * height + threadNum - 1) / threadNum;
    conv<<<blockNum, threadNum>>>
        (imgGpu, kernelGpu, resultGpu, width, height, kernelSize);
    HANDLE_ERROR(cudaGetLastError()); // catch a bad launch configuration

    float *result = new float[width * height];
    // cudaMemcpy is a synchronizing call, so the kernel has finished
    // before the copy returns.
    HANDLE_ERROR(cudaMemcpy(result, resultGpu,
                            width * height * sizeof(float), cudaMemcpyDeviceToHost));

    // visualization
    printf("img\n");
    for (int row = 0; row < 10; ++row)
    {
        for (int col = 0; col < 10; ++col)
        {
            printf("%2.0f ", img[col + row * width]);
        }
        printf("\n");
    }
    printf("kernel\n");
    for (int row = 0; row < kernelSize; ++row)
    {
        for (int col = 0; col < kernelSize; ++col)
        {
            printf("%2.0f ", kernel[col + row * kernelSize]);
        }
        printf("\n");
    }
    printf("result\n");
    for (int row = 0; row < 10; ++row)
    {
        for (int col = 0; col < 10; ++col)
        {
            printf("%2.0f ", result[col + row * width]);
        }
        printf("\n");
    }

    // Fixed: release everything — the original leaked all six allocations.
    HANDLE_ERROR(cudaFree(imgGpu));
    HANDLE_ERROR(cudaFree(kernelGpu));
    HANDLE_ERROR(cudaFree(resultGpu));
    delete[] img;
    delete[] kernel;
    delete[] result;
    return 0;
}
复制代码
|
|