|
cuda编程:完整cuda程序的七大步骤:
1、设置显卡设备
2、分配显存空间
3、从内存拷贝数据到显存
4、执行并行操作(cuda的优势)
5、从显存拷贝结果到内存
6、释放显存空间
7、设备重置
如果只有一块GPU,可以省略第1步和第7步:运行时默认使用0号设备,进程退出时资源也会自动释放。
#include <cstdio>
#include <cstdlib>
#include <iostream>
using namespace std;
/**
 * Sets one element of `a` per thread to 1.
 *
 * Each thread writes only the slot selected by its threadIdx.x; blockIdx is
 * not consulted, so threads sharing a threadIdx.x in different blocks write
 * the same element. There is no bounds check: the caller must make sure `a`
 * holds at least blockDim.x floats.
 *
 * @param a  Float array to fill; it is dereferenced inside the kernel.
 */
__global__ void kernelFunc(float *a){
    const unsigned int slot = threadIdx.x;
    a[slot] = 1.0f;
}
// Evaluates a CUDA runtime call once; on failure prints the source location
// and the runtime's error string to stderr, then exits with a failure status.
#define CUDA_CHECK(call)                                                        \
    do {                                                                        \
        cudaError_t cudaCheckStatus_ = (call);                                  \
        if (cudaCheckStatus_ != cudaSuccess) {                                  \
            std::fprintf(stderr, "CUDA error %s:%d: %s\n", __FILE__, __LINE__,  \
                         cudaGetErrorString(cudaCheckStatus_));                 \
            std::exit(EXIT_FAILURE);                                            \
        }                                                                       \
    } while (0)

/**
 * Demonstrates the seven steps of a complete CUDA program: select a device,
 * allocate device memory, copy host -> device, launch a kernel, copy
 * device -> host, free device memory, and reset the device.
 *
 * Along the way it prints the GPU count, a selection of properties of the
 * chosen device, and the 16 floats written by kernelFunc.
 *
 * @return 0 on success and also when no GPU is present (after printing a
 *         message); any other CUDA runtime failure terminates the process
 *         with EXIT_FAILURE via CUDA_CHECK.
 */
int main(){
    // Number of floats processed; also used as the thread count of the launch.
    const int kCount = 16;
    const size_t kBytes = kCount * sizeof(float);

    int gpuCount = -1;
    // Not wrapped in CUDA_CHECK: a machine without a GPU reports an error
    // here, and that case is handled by the friendly message below.
    const cudaError_t countStatus = cudaGetDeviceCount(&gpuCount);
    if (countStatus != cudaSuccess) {
        std::fprintf(stderr, "cudaGetDeviceCount failed: %s\n",
                     cudaGetErrorString(countStatus));
    }
    std::printf("本机GPU的数量=%d\n", gpuCount);
    if (gpuCount < 1) {
        std::printf("本机没有GPU设备!退出程序!\n");
        return 0;
    }

    std::cout << "1、设置显卡设备" << std::endl;
    // Selects the highest-numbered device.
    CUDA_CHECK(cudaSetDevice(gpuCount - 1));

    // Read the active device back to confirm the selection took effect.
    int device_id;
    CUDA_CHECK(cudaGetDevice(&device_id));
    std::printf("看一下设备是否设置成功?device_id=%d\n", device_id);

    // Query and print information about the current GPU device.
    cudaDeviceProp prop;
    CUDA_CHECK(cudaGetDeviceProperties(&prop, device_id));
    std::printf("设备名字=%s\n", prop.name);
    std::printf("maxThreadsPerBlock=%d\n", prop.maxThreadsPerBlock);
    std::printf("maxThreadsDim[0]=%d\n", prop.maxThreadsDim[0]);
    std::printf("maxGridSize[0]=%d\n", prop.maxGridSize[0]);
    // Both memory sizes are size_t, so they need %zu rather than %ld.
    std::printf("totalConstMem=%zu\n", prop.totalConstMem);
    std::printf("totalGlobalMem=%zu\n", prop.totalGlobalMem);
    std::printf("compute capability算力=%d.%d\n", prop.major, prop.minor);
    std::printf("clockRate=%d\n", prop.clockRate);
    std::printf("是否集成显卡integrated=%d\n", prop.integrated);

    float *aGpu = nullptr;  // device-side array pointer
    std::cout << "2、分配显存空间" << std::endl;
    CUDA_CHECK(cudaMalloc(&aGpu, kBytes));

    float a[kCount] = {0};  // host-side buffer, zero-initialised
    std::cout << "3、从内存拷贝数据到显存" << std::endl;
    CUDA_CHECK(cudaMemcpy(aGpu, a, kBytes, cudaMemcpyHostToDevice));

    std::cout << "4、执行并行操作(cuda的优势)" << std::endl;
    // Launch syntax is <<<number of blocks, threads per block>>>:
    // one block of kCount threads, one thread per array element.
    kernelFunc<<<1, kCount>>>(aGpu);
    // A launch returns no status itself; fetch launch-configuration errors here.
    CUDA_CHECK(cudaGetLastError());

    std::cout << "5、从显存拷贝结果到内存" << std::endl;
    // This blocking copy also reports any error raised while the kernel ran.
    CUDA_CHECK(cudaMemcpy(a, aGpu, kBytes, cudaMemcpyDeviceToHost));

    // Print the buffer to confirm the kernel modified it.
    for (int i = 0; i < kCount; i++) {
        std::printf("%f ", a[i]);
    }
    std::printf("\n");

    std::cout << "6、释放显存空间" << std::endl;
    CUDA_CHECK(cudaFree(aGpu));

    std::cout << "7、设备重置" << std::endl;
    CUDA_CHECK(cudaDeviceReset());
    return 0;
}
复制代码
# Builds the `jiang` executable from main.cu using CMake's first-class CUDA
# language support instead of the deprecated FindCUDA module
# (FIND_PACKAGE(CUDA) / CUDA_ADD_EXECUTABLE).
cmake_minimum_required(VERSION 3.10)

# Enabling the CUDA language makes CMake locate nvcc and compile .cu files.
project(jiang LANGUAGES CXX CUDA)

# Request C++11 for host and device code through the standard variables
# rather than overwriting CMAKE_CXX_FLAGS, which would discard user flags.
set(CMAKE_CXX_STANDARD 11)
set(CMAKE_CXX_STANDARD_REQUIRED ON)
set(CMAKE_CUDA_STANDARD 11)
set(CMAKE_CUDA_STANDARD_REQUIRED ON)

# No explicit target_link_libraries call is needed: the program uses only the
# CUDA runtime, which is linked automatically for CUDA-language targets.
add_executable(jiang main.cu)
复制代码
- 本机GPU的数量=1
- 1、设置显卡设备
- 看一下设备是否设置成功?device_id=0
- 设备名字=GeForce RTX 2070
- maxThreadsPerBlock=1024
- maxThreadsDim[0]=1024
- maxGridSize[0]=2147483647
- totalConstMem=65536
- totalGlobalMem=8366784512
- compute capability算力=7.5
- clockRate=1620000
- 是否集成显卡integrated=0
- 2、分配显存空间
- 3、从内存拷贝数据到显存
- 4、执行并行操作(cuda的优势)
- 5、从显存拷贝结果到内存
- 1.000000 1.000000 1.000000 1.000000 1.000000 1.000000 1.000000 1.000000 1.000000 1.000000 1.000000 1.000000 1.000000 1.000000 1.000000 1.000000
- 6、释放显存空间
- 7、设备重置
复制代码
|
|