|
GPU加速之pyCuda快速上手demo
官方文档:https://documen.tician.de/pycuda/
快速安装:
pip3 install pycuda
安装过程中会在本地编译,输出类似 Building wheel for pycuda (setup.py)
Successfully built pycuda pytools
Installing collected packages: appdirs, dataclasses, pytools, MarkupSafe, mako, pycuda
Successfully installed MarkupSafe-1.1.1 appdirs-1.4.4 dataclasses-0.7 mako-1.1.3 pycuda-2020.1 pytools-2020.4.3
- # -*- coding: utf-8 -*-
- __author__ = u'东方老师 微信:dfy_88888'
- __date__ = '2020/10/21 下午2:42'
- __product__ = 'PyCharm'
- __filename__ = 'pycuda_demo01'
- import sys
- from time import time
- from functools import reduce
- import numpy as np
- import pandas as pd
- import matplotlib
- from matplotlib import pyplot as plt
- import pycuda
- import pycuda.autoinit
- import pycuda.driver as drv
- from pycuda import gpuarray
- from pycuda.elementwise import ElementwiseKernel
- from pycuda.scan import InclusiveScanKernel
- from pycuda.reduction import ReductionKernel
# PyCUDA gives Python programs access to NVIDIA's CUDA parallel-computing API.
pycuda_version = pycuda.VERSION
python_version = sys.version
print(f'The version of PyCUDA: {pycuda_version}')
print(f'The version of Python: {python_version}')
# CUDA cores per multiprocessor, keyed by compute capability. The GPU does
# not report this; it must come from a lookup table.
# NOTE(review): the table is incomplete, and some entries look dubious
# (NVIDIA's docs list 64 FP32 cores/SM for CC 7.5 Turing, not 128) — verify
# against the "Compute Capabilities" appendix of the CUDA C Programming Guide.
_CUDA_CORES_PER_MP = {5.0: 128, 6.0: 64, 6.1: 128, 6.2: 128, 7.5: 128}


def query_device():
    """Print a deviceQuery-style report for every CUDA-capable device.

    Side effects only (prints to stdout); returns None.
    """
    drv.init()
    print('CUDA device query (PyCUDA version) \n')
    print(f'Detected {drv.Device.count()} CUDA Capable device(s) \n')
    for i in range(drv.Device.count()):
        gpu_device = drv.Device(i)
        print(f'Device {i}: {gpu_device.name()}')
        compute_capability = float('%d.%d' % gpu_device.compute_capability())
        print(f'\t Compute Capability: {compute_capability}')
        print(f'\t Total Memory: {gpu_device.total_memory() // (1024 ** 2)} megabytes')
        # Remaining device attributes, indexed by their string descriptor
        # (as in the original CUDA deviceQuery sample).
        device_attributes = {str(k): v for k, v in gpu_device.get_attributes().items()}
        num_mp = device_attributes['MULTIPROCESSOR_COUNT']
        # BUG FIX: the original indexed the lookup table directly and raised
        # KeyError on any compute capability not listed (7.0, 8.0, 8.6, ...).
        cuda_cores_per_mp = _CUDA_CORES_PER_MP.get(compute_capability)
        if cuda_cores_per_mp is None:
            print(f'\t ({num_mp}) Multiprocessors (CUDA Cores / Multiprocessor '
                  f'unknown for compute capability {compute_capability})')
        else:
            print(
                f'\t ({num_mp}) Multiprocessors, ({cuda_cores_per_mp}) CUDA Cores / Multiprocessor: {num_mp * cuda_cores_per_mp} CUDA Cores')
        for k, v in device_attributes.items():
            print(f'\t {k}: {v}')
# query_device()  # uncomment to run the device query above

# Converting between NumPy arrays and gpuarrays:
# the GPU has its own memory ("device memory"), distinct from host RAM.
# NumPy arrays live on the CPU (host side) while gpuarrays live on the GPU
# (device side), so data regularly has to be copied between the two.
host_data = np.array([1, 2, 3, 4, 5], dtype=np.float32)
device_data = gpuarray.to_gpu(host_data)   # host -> device copy
device_data_x2 = 2 * device_data           # elementwise multiply, runs on the GPU
print("设备端数据:", device_data_x2, type(device_data_x2))
host_data_x2 = device_data_x2.get()        # device -> host copy
print("主机端数据:", host_data_x2, type(host_data_x2))
# Elementwise operations are naturally parallel; gpuarray spreads them across
# the GPU cores automatically. Always pass an explicit dtype when converting
# to avoid needless type-conversion overhead.
- # 性能比较
def simple_speed_test():
    """Benchmark elementwise doubling of 50M floats on the CPU vs the GPU.

    Prints both wall-clock times and whether the two results agree.
    """
    host_data = np.float32(np.random.random(50000000))
    t1 = time()
    host_data_2x = host_data * np.float32(2)
    t2 = time()
    print(f'total time to compute on CPU: {t2 - t1}')
    device_data = gpuarray.to_gpu(host_data)
    t1 = time()
    device_data_2x = device_data * np.float32(2)
    # BUG FIX: GPU kernel launches are asynchronous — without a sync the
    # stop timestamp only measures launch overhead, not the computation.
    drv.Context.synchronize()
    t2 = time()
    from_device = device_data_2x.get()
    print(f'total time to compute on GPU: {t2 - t1}')
    print(f'Is the host computation the same as the GPU computation? : {np.allclose(from_device, host_data_2x)}')
# simple_speed_test()  # uncomment to run the benchmark

# ElementwiseKernel works much like Python's built-in map(): it applies a
# user-supplied expression to every element of its arguments. The expression
# is written in embedded CUDA C; a "kernel" here is simply a function that
# CUDA runs directly on the GPU.
print([x + 10 for x in [1, 2, 3, 4, 5]])

gpu_2x_ker = ElementwiseKernel(
    arguments="float *in, float *out",
    operation="out[i] = 2 * in[i];",
    name="gpu_2x_ker",
)
def elementwise_kernel_example():
    """Benchmark the custom gpu_2x_ker kernel against NumPy on the host.

    Prints both wall-clock times and whether the two results agree.
    """
    host_data = np.float32(np.random.random(50000000))
    t1 = time()
    host_data_2x = host_data * np.float32(2)
    t2 = time()
    print(f'total time to compute on CPU: {t2 - t1}')
    device_data = gpuarray.to_gpu(host_data)
    # Pre-allocate the output buffer on the device.
    device_data_2x = gpuarray.empty_like(device_data)
    t1 = time()
    gpu_2x_ker(device_data, device_data_2x)
    # BUG FIX: kernel launches are asynchronous — sync before stopping the
    # clock, otherwise only the launch overhead is measured.
    drv.Context.synchronize()
    t2 = time()
    from_device = device_data_2x.get()
    print(f'total time to compute on GPU: {t2 - t1}')
    print(f'Is the host computation the same as the GPU computation? : {np.allclose(from_device, host_data_2x)}')
# On the first call PyCUDA hands the kernel source to the nvcc compiler; that
# one-time compilation cost makes the first run slower than the rest.
# (Check the compiler with: nvcc -V)
for _run in range(30):
    elementwise_kernel_example()

# class pycuda.elementwise.ElementwiseKernel(arguments, operation,
#         name="kernel", keep=False, options=[], preamble="")
#   arguments: the parameter list the kernel accepts
#   operation: the embedded CUDA C code the kernel executes
#   name:      the kernel's name
# On interpreter exit PyCUDA takes care of all cleanup and memory
# reclamation — that concludes this quick introduction.
复制代码
|
|