Commit 228c5157 authored by myron's avatar myron

added pycudatest.py : utility to test GPUwith a backprojection N times

parent 8d57662b
import pycuda.driver as drv
import numpy as np
import math
import time
from pycuda.compiler import SourceModule
import h5py
def iceil(x):
return int(math.ceil(x))
def create_2D_array(np_array):
h,w = np_array.shape
descr = drv.ArrayDescriptor()
descr.width = w
descr.height = h
descr.format = drv.dtype_to_array_format(np_array.dtype)
descr.num_channels = 1
descr.flags = 0
device_array = drv.Array(descr)
return device_array
def cu_mem_to_array2D(device_array, cu_mem, h,w):
# h, w = np_array.shape
copy = drv.Memcpy2D()
print(" COPYING ", cu_mem, " SU ", device_array )
copy.set_src_device(cu_mem)
copy.set_dst_array(device_array)
copy.width_in_bytes = copy.src_pitch = w*4
copy.src_height = copy.height = h
copy(aligned=False)
print(" OK " )
def host_to_array2D(device_array, npmem):
# h, w = np_array.shape
copy = drv.Memcpy2D()
copy.set_src_host(npmem)
copy.set_dst_array(device_array)
print(npmem.dtype)
print(npmem.shape)
copy.width_in_bytes = copy.src_pitch = npmem.shape[1]*4
copy.src_height = copy.height = npmem.shape[0]
copy(aligned=False)
def cu_mem_to_array(device_array, cu_mem, d,h,w):
# d, h, w = np_array.shape
copy = drv.Memcpy3D()
copy.set_src_device(cu_mem)
copy.set_dst_array(device_array)
copy.width_in_bytes = copy.src_pitch = w*4
copy.src_height = copy.height = h
copy.depth = d
copy()
def write( cu_mem, filename,datasetname, mode,stp):
if not isinstance(cu_mem, (np.ndarray, np.generic) ):
if cu_mem in stp.cu_vols+[stp.cu_reserved_vol]:
vol_data_recv = np.zeros([stp.NY,stp.NX] ,"f")
drv.memcpy_dtoh( vol_data_recv , cu_mem )
h5py.File(filename,mode)[datasetname] =vol_data_recv
elif cu_mem in stp.cu_sinos+[stp.cu_reserved_sino]:
vol_data_recv = np.zeros([stp.NA,stp.NX] ,"f")
drv.memcpy_dtoh( vol_data_recv , cu_mem )
h5py.File(filename,mode)[datasetname] =vol_data_recv
else:
h5py.File(filename,mode)[datasetname] =cu_mem
def setup_calculation( Nvols=2, Nsinos = 2,NX=2048, NA=1500) :
""" -----------------
CODICE CUDA
-----------------
"""
kernels_space = SourceModule("""
texture<float, cudaTextureType2D, cudaReadModeElementType> tex_sino;
texture<float, cudaTextureType2D, cudaReadModeElementType> tex_vol ;
#define CUDART_PI_F 3.141592654f
__global__ void forwproject( float * sino ,int NX, int NY, int NA, float da, float ax_pos)
{
const int tix = threadIdx.x;
const int tia = threadIdx.y;
const int bidx = blockIdx.x;
const int bida = blockIdx.y;
int ix = (bidx* blockDim.x + tix) ;
int ia = bida* blockDim.y + tia ;
int ip = ix + NX*( ia ) ;
float centerX = (NX-1.0f)*0.5f;
float centerY = (NY-1.0f)*0.5f;
if( ix<NX && ia<NA ) {
float angle = ia*da ;
float cosa = cosf(angle);
float sina = sinf(angle);
if(fabs(cosa)>fabs(sina)) {
float sum = 0.0;
for(int vy = 0; vy<NY; vy++) {
float fx;
fx = centerX + ( (ix-ax_pos) + (vy-centerY)*sina) /cosa ;
float b = tex2D(tex_vol, fx + 0.5f , vy + 0.5f ) ;
sum += (b)/fabs(cosa);
}
sino[ip] = sum;
} else {
float sum = 0.0;
for(int vx = 0; vx<NX; vx++) {
float fy;
fy = centerY + ( -(ix-ax_pos) + (vx-centerX)*cosa) /sina ;
float b = tex2D (tex_vol, vx+0.5f , fy +0.5f );
sum += (b)/fabs(sina);
}
sino[ip] = sum;
}
}
}
__global__ void backproject( float * vol ,int NX, int NY, int NA, float da,float ax_pos)
{
const int tix = threadIdx.x;
const int tiy = threadIdx.y;
const int bidx = blockIdx.x;
const int bidy = blockIdx.y;
int ix4 = (bidx* blockDim.x + tix)*4 ;
int iy2 = (bidy* blockDim.y + tiy)*2 ;
float centerX = (NX-1.0f)*0.5f;
float centerY = (NY-1.0f)*0.5f;
int ix;
int iy;
float sum[8] ;
int k;
for(k=0; k<8; k++) sum[k]=0.0;
for(int ia=0; ia<NA; ia++) {
float angle = ia*da ;
float cosa = cosf(angle);
float sina = sinf(angle);
k=0;
for(ix=ix4; ix<ix4+4; ix++) {
for(iy=iy2; iy<iy2+2; iy++) {
if( ix<NX && iy<NY ) {
float fx = ax_pos + cosa* ( ix-centerX ) - sina*(iy-centerY) ;
float b = tex2D(tex_sino, fx+0.5f , ia+0.5f );
sum[k] += b;
}
k++;
}
}
}
k=0;
for(ix=ix4; ix<ix4+4; ix++) {
for(iy=iy2; iy<iy2+2; iy++) {
if( ix<NX && iy<NY ) {
int ip = ix + NX*( iy ) ;
vol[ip] = sum[k];
}
k++;
}
}
}
"""
### , arch="sm_70"
)
class Kspace:
tex_vol =kernels_space.get_texref('tex_vol' )
tex_sino =kernels_space.get_texref('tex_sino')
backproject = kernels_space.get_function("backproject")
forwproject = kernels_space.get_function("forwproject")
class setup_return_object:
pass
stp = setup_return_object()
NY = NX
stp.size_of_dtype = 4
""" --------------------
PREPARATION TEXTURE
--------------------
"""
tex_vol =Kspace.tex_vol
tex_sino =Kspace.tex_sino
for tex in tex_vol,tex_sino:
# tex.filterMode = drv.filter_mode.LINEAR
# tex.set_filter_mode(drv.filter_mode.POINT)
tex.set_filter_mode(drv.filter_mode.LINEAR)
tex.set_address_mode(0, drv.address_mode.BORDER)
tex.set_address_mode(1, drv.address_mode.BORDER)
tex.normalized = False
stp.tex_vol = tex_vol
stp.tex_sino = tex_sino
"""
------------------------
creation array entree/sortie cuda
------------------------
"""
np_tmp = np.zeros( [NY,NX], "f" )
print( " alloco ", NY,NX , NY*NX*4, np_tmp.nbytes , " x ", Nvols )
cu_vols = [ drv.mem_alloc( np_tmp.nbytes ) for i in range(Nvols) ]
stp.cu_reserved_vol = drv.mem_alloc( np_tmp.nbytes )
stp.vol_array = create_2D_array(np_tmp)
stp.tex_vol.set_array(stp.vol_array)
### SINOS
np_tmp = np.zeros( [NA,NX], "f" )
print( " alloco ", NA,NX , NA*NX*4, np_tmp.nbytes , " x ", Nsinos )
cu_sinos = [ drv.mem_alloc( np_tmp.nbytes ) for i in range(Nsinos) ]
stp.cu_reserved_sino = drv.mem_alloc( np_tmp.nbytes )
stp.sino_array = create_2D_array(np_tmp)
stp.tex_sino.set_array(stp.sino_array)
stp.cu_vols = cu_vols
stp.cu_sinos = cu_sinos
stp.NA = NA
stp.NY = NY
stp.NX = NX
stp.cu_NA = np.int32(NA)
stp.cu_NY = np.int32(NY)
stp.cu_NX = np.int32(NX)
stp.cufunc_backproject = Kspace.backproject
stp.cufunc_forwproject = Kspace.forwproject
stp.cu_sinos = stp.cu_sinos
return stp
if __name__ == "__main__":
gpuid = 0
ax_pos = (2048-1)/2.0
NX = 2048
NA = 1500
NX = 1024
NA = 1024
da = 180.0/NA *np.pi/180.0
Ncall = 1024
drv.init()
cudadevice = drv.Device(gpuid)
attrs=cudadevice.get_attributes()
print("\n===Attributes for device %d"%gpuid)
for (key,value) in attrs.items():
print("%s:%s"%(str(key),str(value)))
cudacontext = cudadevice.make_context()
Nvols = 2
Nsinos = 1
stp = setup_calculation( NX = NX, NA=NA, Nvols = Nvols, Nsinos=Nsinos)
i_init_vol = 0
i_final_vol = 1
i_data_sino = 0
vol_ini = np.zeros([ NX,NX ],"f")
vol_ini[ int(NX*0.9):int(NX*0.9)+4 , NX//2:NX//2+4 ]=1.0
host_to_array2D(stp.vol_array , vol_ini )
stp.tex_vol.set_array(stp.vol_array)
bsx=16
bsy=16
block=(bsx,bsy,1)
grid=( iceil( float(NX/float((bsx)))), iceil( float(NA)/float((bsy)) ), 1)
stp.cufunc_forwproject(stp.cu_sinos[i_data_sino] , np.int32(NX), np.int32(NX), np.int32(NA), np.float32(da), np.float32(ax_pos), block=block, grid=grid )
cu_mem_to_array2D(stp.sino_array , stp.cu_sinos[i_data_sino] , NA, NX )
stp.tex_sino.set_array(stp.sino_array)
time0 = time.time()
bsx=16
bsy=16
block=(bsx,bsy,1)
grid=( iceil( float(NX/float((bsx*4)))), iceil( float(NX)/float((bsy*2)) ), 1)
vol_data_recv = np.zeros([1],"f")
for i in range( Ncall):
print ( i)
stp.cufunc_backproject( stp.cu_vols[i_final_vol] , np.int32(NX), np.int32(NX), np.int32(NA), np.float32(da), np.float32(ax_pos),
block=block, grid=grid)
drv.memcpy_dtoh( vol_data_recv , stp.cu_vols[i_final_vol] ) # just une lecture de un float pour forcer la synchronisation avec la sortie du kernel
time1 = time.time()
print(" NSECONDS = ", time1-time0 ," pour Ncall = ", Ncall)
write( stp.cu_vols[i_final_vol] , "final.h5","vol", "w",stp)
write( stp.cu_sinos[i_data_sino] , "final.h5","init_sino", "r+",stp)
cudacontext.pop()
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment