Commit 01e572f4 authored by Pierre Paleo's avatar Pierre Paleo
Browse files

WIP multi-slice backprojector

parent 8519bd3f
#define SHARED_SIZE 256
texture<float, 2, cudaReadModeElementType> tex_projections;
//~ texture<float, 2, cudaReadModeElementType> tex_msin_lut;
//~ texture<float, 2, cudaReadModeElementType> tex_cos_lut;
/**
......@@ -25,7 +20,7 @@ threads (32, 32). However, it turns out that performances are best with (16, 16)
blocks.
**/
// Backproject one slice
// Backproject one sinogram
// One thread handles up to 4 pixels in the output slice
// the case num_projs > 1024 has to be included.
__global__ void backproj(
......@@ -63,7 +58,7 @@ __global__ void backproj(
for (int proj = 0; proj < num_projs; proj++) {
if (proj == next_fetch) {
// Fetch "blockDim.x" values to shared memory
// Fetch SHARED_SIZE values to shared memory
__syncthreads();
if (next_fetch + tid < num_projs) {
s_cos[tid] = d_cos[next_fetch + tid];
......@@ -104,4 +99,82 @@ __global__ void backproj(
#ifdef BACKPROJ3D
texture<float, 3, cudaReadModeElementType> tex_projections3D;
// Backproject multiple sinograms
__global__ void backproj_multi(
float* d_slices,
int num_projs,
int num_bins,
int num_slices,
float axis_position,
int n_x,
int n_y,
float* d_cos,
float* d_msin,
float scale_factor
)
{
int x = blockDim.x * blockIdx.x + threadIdx.x;
int y = blockDim.y * blockIdx.y + threadIdx.y;
int z = blockDim.z * blockIdx.z + threadIdx.z;
int Gx = blockDim.x * gridDim.x;
int Gy = blockDim.y * gridDim.y;
// (xr, yr) (xrp, yr)
// (xr, yrp) (xrp, yrp)
float xr = x - axis_position, yr = y - axis_position;
float xrp = xr + Gx, yrp = yr + Gy;
/*volatile*/ __shared__ float s_cos[SHARED_SIZE];
/*volatile*/ __shared__ float s_msin[SHARED_SIZE];
int next_fetch = 0;
int tid = threadIdx.y * blockDim.x + threadIdx.x;
float costheta, msintheta;
float h1, h2, h3, h4;
float sum1 = 0.0f, sum2 = 0.0f, sum3 = 0.0f, sum4 = 0.0f;
for (int proj = 0; proj < num_projs; proj++) {
if (proj == next_fetch) {
// Fetch SHARED_SIZE values to shared memory
__syncthreads();
if (next_fetch + tid < num_projs) {
s_cos[tid] = d_cos[next_fetch + tid];
s_msin[tid] = d_msin[next_fetch + tid];
}
next_fetch += SHARED_SIZE;
__syncthreads();
}
costheta = s_cos[proj - (next_fetch - SHARED_SIZE)];
msintheta = s_msin[proj - (next_fetch - SHARED_SIZE)];
float c1 = fmaf(costheta, xr, axis_position); // cos(theta)*xr + axis_pos
float c2 = fmaf(costheta, xrp, axis_position); // cos(theta)*(xr + Gx) + axis_pos
float s1 = fmaf(msintheta, yr, 0.0f); // -sin(theta)*yr
float s2 = fmaf(msintheta, yrp, 0.0f); // -sin(theta)*(yr + Gy)
h1 = c1 + s1;
h2 = c2 + s1;
h3 = c1 + s2;
h4 = c2 + s2;
if (h1 >= 0 && h1 < num_bins) sum1 += tex3D(tex_projections3D, h1 + 0.5f, proj + 0.5f, z + 0.5f);
if (h2 >= 0 && h2 < num_bins) sum2 += tex3D(tex_projections3D, h2 + 0.5f, proj + 0.5f, z + 0.5f);
if (h3 >= 0 && h3 < num_bins) sum3 += tex3D(tex_projections3D, h3 + 0.5f, proj + 0.5f, z + 0.5f);
if (h4 >= 0 && h4 < num_bins) sum4 += tex3D(tex_projections3D, h4 + 0.5f, proj + 0.5f, z + 0.5f);
}
d_slices[(z*n_y + y)*(n_x) + x] = sum1 * scale_factor;
if (Gx + x < n_x)
d_slices[(z*n_y + y)*(n_x) + Gx + x] = sum2 * scale_factor;
if (Gy + y < n_y) {
d_slices[(z*n_y + y + Gy)*(n_x) + x] = sum3 * scale_factor;
if (Gx + x < n_x)
d_slices[(z*n_y + y + Gy)*(n_x) + Gx + x] = sum4 * scale_factor;
}
}
#endif
......@@ -111,8 +111,6 @@ def copy_array(dst, src, check=False, src_dtype=None):
shape_src, dtype_src = get_shape_dtype(src)
shape_dst, dtype_dst = get_shape_dtype(dst)
dtype_src = src_dtype or dtype_src
#~ print("src: %s, %s" % (str(shape_src), str(dtype_src))) # DEBUG
#~ print("dst: %s, %s" % (str(shape_dst), str(dtype_dst))) # DEBUG
if check:
if shape_src != shape_dst:
raise ValueError("shape_src != shape_dst : have %s and %s" % (str(shape_src), str(shape_dst)))
......@@ -144,6 +142,11 @@ def copy_array(dst, src, check=False, src_dtype=None):
copy.width_in_bytes = copy.dst_pitch = w * np.dtype(dtype_src).itemsize
copy.dst_height = copy.height = h
copy(True)
# ??
if len(shape_src) == 2:
copy(True)
else:
copy()
###
......@@ -94,6 +94,8 @@ class Backprojector(CudaProcessing):
self.n_x = n_x
self.n_y = n_y
self.slice_shape = (n_y, n_x)
if n_slices > 1:
self.slice_shape = (n_slices,) + self.slice_shape
self.n_slices = n_slices
self.n_sinos = n_sinos
self.n_angles = n_angles
......@@ -105,7 +107,6 @@ class Backprojector(CudaProcessing):
else:
assert len(angles) == self.n_angles
self.angles = angles
self.sino_shape = (self.n_angles, self.dwidth)
self.rot_center = rot_center or (self.dwidth - 1)/2.
self.axis_pos = self.rot_center
......@@ -137,10 +138,23 @@ class Backprojector(CudaProcessing):
def _compile_kernels(self):
# Configure backprojector
fname = get_cuda_srcfile("backproj.cu")
self.gpu_projector = CudaKernel("backproj", filename=fname)
self.texref_proj = self.gpu_projector.module.get_texref("tex_projections")
if self.n_sinos > 1:
kernel_name = "backproj_multi"
tex_name = "tex_projections3D"
kernel_signature = "PiiifiiPPf"
sourcemodule_options = ["-DBACKPROJ3D"]
else:
kernel_name = "backproj"
tex_name = "tex_projections"
kernel_signature = "PiifiiPPf"
sourcemodule_options = None
self.gpu_projector = CudaKernel(
kernel_name, filename=fname, options=sourcemodule_options
)
self.texref_proj = self.gpu_projector.module.get_texref(tex_name)
self.texref_proj.set_filter_mode(cuda.filter_mode.LINEAR)
self.gpu_projector.prepare("PiifiiPPf", [self.texref_proj])
self.gpu_projector.prepare(kernel_signature, [self.texref_proj])
# We use blocks of 16*16 (see why in kernel doc), and one thread
# handles 2 pixels per dimension.
self.block = (16, 16, 1)
......@@ -148,6 +162,27 @@ class Backprojector(CudaProcessing):
updiv(updiv(self.n_x, self.block[0]), 2),
updiv(updiv(self.n_y, self.block[1]), 2)
)
if self.n_sinos > 1:
self.grid += (self.n_sinos,)
# Prepare kernel arguments
self.kern_proj_args = [
None, # output d_slice holder
self.n_angles,
self.dwidth,
self.axis_pos,
self.n_x,
self.n_y,
self._d_cos,
self._d_msin,
1.0 # scale factor # TODO customize
]
if self.n_slices >= 1:
self.kern_proj_args.insert(3, self.n_slices)
self.kern_proj_kwargs = {
"grid": self.grid,
"block": self.block,
"shared_size": int(np.prod(self.block))*2*4,
}
def _bind_textures(self):
......@@ -170,20 +205,10 @@ class Backprojector(CudaProcessing):
def backproj(self, sino, output=None, do_checks=True):
copy_array(self._d_sino_cua, sino, check=do_checks)
d_slice = self._set_output(output, check=do_checks)
self.kern_proj_args[0] = d_slice
self.gpu_projector(
d_slice,
self.n_angles,
self.dwidth,
self.axis_pos,
self.n_x,
self.n_y,
self._d_cos,
self._d_msin,
1.0,
grid=self.grid,
block=self.block,
shared_size=int(np.prod(self.block))*2*4
*self.kern_proj_args,
**self.kern_proj_kwargs
)
if output is not None:
return output
......
Supports Markdown
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment