|
GPU加速之pyCuda快速上手demo
官方文档:https://documen.tician.de/pycuda/
快速安装:
pip3 install pycuda
安装过程中会在本地编译,输出类似 Building wheel for pycuda (setup.py)
Successfully built pycuda pytools
Installing collected packages: appdirs, dataclasses, pytools, MarkupSafe, mako, pycuda
Successfully installed MarkupSafe-1.1.1 appdirs-1.4.4 dataclasses-0.7 mako-1.1.3 pycuda-2020.1 pytools-2020.4.3
- # -*- coding: utf-8 -*-
- __author__ = u'东方老师 微信:dfy_88888'
- __date__ = '2020/10/21 下午2:42'
- __product__ = 'PyCharm'
- __filename__ = 'pycuda_demo01'
- import sys
- from time import time
- from functools import reduce
- import numpy as np
- import pandas as pd
- import matplotlib
- from matplotlib import pyplot as plt
- import pycuda
- import pycuda.autoinit
- import pycuda.driver as drv
- from pycuda import gpuarray
- from pycuda.elementwise import ElementwiseKernel
- from pycuda.scan import InclusiveScanKernel
- from pycuda.reduction import ReductionKernel
# PyCUDA gives Python programs access to NVIDIA's CUDA parallel-computing API.
pycuda_version = pycuda.VERSION
python_version = sys.version
print(f'The version of PyCUDA: {pycuda_version}')
print(f'The version of Python: {python_version}')
# CUDA cores per multiprocessor, keyed by compute capability. The GPU does
# not report this; it must come from a lookup table.
# NOTE(review): the table is incomplete, and some entries look dubious
# (NVIDIA's docs list 64 FP32 cores/SM for CC 7.5 Turing, not 128) — verify
# against the "Compute Capabilities" appendix of the CUDA C Programming Guide.
_CUDA_CORES_PER_MP = {5.0: 128, 6.0: 64, 6.1: 128, 6.2: 128, 7.5: 128}


def query_device():
    """Print a deviceQuery-style report for every CUDA-capable device.

    Side effects only (prints to stdout); returns None.
    """
    drv.init()
    print('CUDA device query (PyCUDA version) \n')
    print(f'Detected {drv.Device.count()} CUDA Capable device(s) \n')
    for i in range(drv.Device.count()):
        gpu_device = drv.Device(i)
        print(f'Device {i}: {gpu_device.name()}')
        compute_capability = float('%d.%d' % gpu_device.compute_capability())
        print(f'\t Compute Capability: {compute_capability}')
        print(f'\t Total Memory: {gpu_device.total_memory() // (1024 ** 2)} megabytes')
        # Remaining device attributes, indexed by their string descriptor
        # (as in the original CUDA deviceQuery sample).
        device_attributes = {str(k): v for k, v in gpu_device.get_attributes().items()}
        num_mp = device_attributes['MULTIPROCESSOR_COUNT']
        # BUG FIX: the original indexed the lookup table directly and raised
        # KeyError on any compute capability not listed (7.0, 8.0, 8.6, ...).
        cuda_cores_per_mp = _CUDA_CORES_PER_MP.get(compute_capability)
        if cuda_cores_per_mp is None:
            print(f'\t ({num_mp}) Multiprocessors (CUDA Cores / Multiprocessor '
                  f'unknown for compute capability {compute_capability})')
        else:
            print(
                f'\t ({num_mp}) Multiprocessors, ({cuda_cores_per_mp}) CUDA Cores / Multiprocessor: {num_mp * cuda_cores_per_mp} CUDA Cores')
        for k, v in device_attributes.items():
            print(f'\t {k}: {v}')
# query_device()  # uncomment to run the device query above

# Converting between NumPy arrays and gpuarrays:
# the GPU has its own memory ("device memory"), distinct from host RAM.
# NumPy arrays live on the CPU (host side) while gpuarrays live on the GPU
# (device side), so data regularly has to be copied between the two.
host_data = np.array([1, 2, 3, 4, 5], dtype=np.float32)
device_data = gpuarray.to_gpu(host_data)   # host -> device copy
device_data_x2 = 2 * device_data           # elementwise multiply, runs on the GPU
print("设备端数据:", device_data_x2, type(device_data_x2))
host_data_x2 = device_data_x2.get()        # device -> host copy
print("主机端数据:", host_data_x2, type(host_data_x2))
# Elementwise operations are naturally parallel; gpuarray spreads them across
# the GPU cores automatically. Always pass an explicit dtype when converting
# to avoid needless type-conversion overhead.
- # 性能比较
def simple_speed_test():
    """Benchmark elementwise doubling of 50M floats on the CPU vs the GPU.

    Prints both wall-clock times and whether the two results agree.
    """
    host_data = np.float32(np.random.random(50000000))
    t1 = time()
    host_data_2x = host_data * np.float32(2)
    t2 = time()
    print(f'total time to compute on CPU: {t2 - t1}')
    device_data = gpuarray.to_gpu(host_data)
    t1 = time()
    device_data_2x = device_data * np.float32(2)
    # BUG FIX: GPU kernel launches are asynchronous — without a sync the
    # stop timestamp only measures launch overhead, not the computation.
    drv.Context.synchronize()
    t2 = time()
    from_device = device_data_2x.get()
    print(f'total time to compute on GPU: {t2 - t1}')
    print(f'Is the host computation the same as the GPU computation? : {np.allclose(from_device, host_data_2x)}')
# simple_speed_test()  # uncomment to run the benchmark

# ElementwiseKernel works much like Python's built-in map(): it applies a
# user-supplied expression to every element of its arguments. The expression
# is written in embedded CUDA C; a "kernel" here is simply a function that
# CUDA runs directly on the GPU.
print([x + 10 for x in [1, 2, 3, 4, 5]])

gpu_2x_ker = ElementwiseKernel(
    arguments="float *in, float *out",
    operation="out[i] = 2 * in[i];",
    name="gpu_2x_ker",
)
def elementwise_kernel_example():
    """Benchmark the custom gpu_2x_ker kernel against NumPy on the host.

    Prints both wall-clock times and whether the two results agree.
    """
    host_data = np.float32(np.random.random(50000000))
    t1 = time()
    host_data_2x = host_data * np.float32(2)
    t2 = time()
    print(f'total time to compute on CPU: {t2 - t1}')
    device_data = gpuarray.to_gpu(host_data)
    # Pre-allocate the output buffer on the device.
    device_data_2x = gpuarray.empty_like(device_data)
    t1 = time()
    gpu_2x_ker(device_data, device_data_2x)
    # BUG FIX: kernel launches are asynchronous — sync before stopping the
    # clock, otherwise only the launch overhead is measured.
    drv.Context.synchronize()
    t2 = time()
    from_device = device_data_2x.get()
    print(f'total time to compute on GPU: {t2 - t1}')
    print(f'Is the host computation the same as the GPU computation? : {np.allclose(from_device, host_data_2x)}')
# On the first call PyCUDA hands the kernel source to the nvcc compiler; that
# one-time compilation cost makes the first run slower than the rest.
# (Check the compiler with: nvcc -V)
for _run in range(30):
    elementwise_kernel_example()

# class pycuda.elementwise.ElementwiseKernel(arguments, operation,
#         name="kernel", keep=False, options=[], preamble="")
#   arguments: the parameter list the kernel accepts
#   operation: the embedded CUDA C code the kernel executes
#   name:      the kernel's name
# On interpreter exit PyCUDA takes care of all cleanup and memory
# reclamation — that concludes this quick introduction.
复制代码
|
|