|
CUDA programming: image convolution on the GPU
#include "stdio.h"
#include <stdlib.h> // exit, EXIT_FAILURE (used by HandleError)
// Report a failed CUDA runtime call and terminate the process.
// Not called directly: the HANDLE_ERROR macro below supplies the
// call site's file name and line number automatically.
static void HandleError(cudaError_t err,
                        const char *file,
                        int line)
{
    if (err == cudaSuccess)
    {
        return; // nothing to report
    }
    printf("错误:%s在%s中%d行,看看吧\n",
           cudaGetErrorString(err),
           file, line);
    exit(EXIT_FAILURE);
}
// Wrap every CUDA runtime call: HANDLE_ERROR(cudaMalloc(...));
#define HANDLE_ERROR(err) (HandleError(err, __FILE__, __LINE__))
// Query device 0 and print basic launch limits.
// Returns the maximum number of threads per block (e.g. 1024),
// which main() uses as the block size for the kernel launch.
int getThreadNum()
{
    cudaDeviceProp prop;
    int count;
    HANDLE_ERROR(cudaGetDeviceCount(&count));
    printf("gpu num %d\n", count);
    HANDLE_ERROR(cudaGetDeviceProperties(&prop, 0));
    printf("max thread num: %d\n", prop.maxThreadsPerBlock);
    // Fixed: the original format string ended with an unbalanced ')'.
    printf("max grid dimensions: %d, %d, %d\n",
           prop.maxGridSize[0], prop.maxGridSize[1], prop.maxGridSize[2]);
    return prop.maxThreadsPerBlock;
}
// 2D convolution kernel: one thread computes one output pixel.
// img, kernel and result are flat, row-major device arrays:
// img/result are width*height elements, kernel is kernelSize*kernelSize.
// Launch with a 1D grid covering at least width*height threads.
// Taps that fall outside the image are treated as zero (zero padding).
__global__ void conv(float *img, float *kernel, float *result,
                     int width, int height, int kernelSize)
{
    // Flat global thread id doubles as the flat pixel index.
    int id = blockIdx.x * blockDim.x + threadIdx.x;
    if (id >= width * height)
    {
        return; // guard the grid tail
    }
    // Recover 2D coordinates from the flat index.
    int row = id / width;
    int col = id % width;
    int half = kernelSize / 2;

    // Fixed: accumulate in a register and store once. The original did
    // `result[id] += ...`, which silently required the host to pre-zero
    // the result buffer (cudaMalloc does not initialize memory) and
    // produced garbage when it was not zeroed.
    float sum = 0.0f;
    for (int i = 0; i < kernelSize; ++i)
    {
        for (int j = 0; j < kernelSize; ++j)
        {
            int curRow = row - half + i;
            int curCol = col - half + j;
            float imgValue = 0.0f; // zero padding outside the image
            if (curRow >= 0 && curCol >= 0 && curRow < height && curCol < width)
            {
                imgValue = img[curRow * width + curCol];
            }
            sum += kernel[i * kernelSize + j] * imgValue;
        }
    }
    result[id] = sum;
}
// CUDA programming: image convolution on the GPU
// Demo driver: build a synthetic 1920x1080 image and a 3x3 kernel on the
// host, convolve on the GPU, and print the top-left 10x10 corner of the
// image and of the result, plus the kernel itself.
int main()
{
    const int width = 1920;
    const int height = 1080;
    const int kernelSize = 3;

    // Host image: deterministic ramp pattern, values in [0, 255].
    float *img = new float[width * height];
    for (int row = 0; row < height; ++row)
    {
        for (int col = 0; col < width; ++col)
        {
            img[col + row * width] = (col + row) % 256;
        }
    }

    // Host kernel: entries cycle -1, 0, 1 (a simple edge-like filter).
    float *kernel = new float[kernelSize * kernelSize];
    for (int i = 0; i < kernelSize * kernelSize; ++i)
    {
        kernel[i] = i % kernelSize - 1;
    }

    // Device buffers.
    float *imgGpu;
    float *kernelGpu;
    float *resultGpu;
    HANDLE_ERROR(cudaMalloc(&imgGpu, width * height * sizeof(float)));
    HANDLE_ERROR(cudaMalloc(&kernelGpu, kernelSize * kernelSize * sizeof(float)));
    HANDLE_ERROR(cudaMalloc(&resultGpu, width * height * sizeof(float)));

    HANDLE_ERROR(cudaMemcpy(imgGpu, img,
                            width * height * sizeof(float), cudaMemcpyHostToDevice));
    HANDLE_ERROR(cudaMemcpy(kernelGpu, kernel,
                            kernelSize * kernelSize * sizeof(float), cudaMemcpyHostToDevice));
    // Fixed: the kernel accumulates with `+=`, so the result buffer must
    // start zeroed — cudaMalloc leaves memory uninitialized.
    HANDLE_ERROR(cudaMemset(resultGpu, 0, width * height * sizeof(float)));

    int threadNum = getThreadNum(); // block size (typically 1024)
    // Fixed: integer ceiling division instead of the float `- 0.5` trick.
    int blockNum = (width * height + threadNum - 1) / threadNum;
    conv<<<blockNum, threadNum>>>
        (imgGpu, kernelGpu, resultGpu, width, height, kernelSize);
    HANDLE_ERROR(cudaGetLastError()); // catch a bad launch configuration

    float *result = new float[width * height];
    // cudaMemcpy is a synchronizing call, so the kernel has finished
    // before the copy returns.
    HANDLE_ERROR(cudaMemcpy(result, resultGpu,
                            width * height * sizeof(float), cudaMemcpyDeviceToHost));

    // visualization
    printf("img\n");
    for (int row = 0; row < 10; ++row)
    {
        for (int col = 0; col < 10; ++col)
        {
            printf("%2.0f ", img[col + row * width]);
        }
        printf("\n");
    }
    printf("kernel\n");
    for (int row = 0; row < kernelSize; ++row)
    {
        for (int col = 0; col < kernelSize; ++col)
        {
            printf("%2.0f ", kernel[col + row * kernelSize]);
        }
        printf("\n");
    }
    printf("result\n");
    for (int row = 0; row < 10; ++row)
    {
        for (int col = 0; col < 10; ++col)
        {
            printf("%2.0f ", result[col + row * width]);
        }
        printf("\n");
    }

    // Fixed: release everything — the original leaked all six allocations.
    HANDLE_ERROR(cudaFree(imgGpu));
    HANDLE_ERROR(cudaFree(kernelGpu));
    HANDLE_ERROR(cudaFree(resultGpu));
    delete[] img;
    delete[] kernel;
    delete[] result;
    return 0;
}
复制代码
|
|