Commit 2a6341d4 authored by Pierre Paleo's avatar Pierre Paleo
Browse files

Merge branch 'fbp' into 'master'

FBP on stack of sinograms

See merge request paleo/nabu!6
parents fbf9b946 bc21aca9
#define SHARED_SIZE 256
texture<float, 2, cudaReadModeElementType> tex_projections;
//~ texture<float, 2, cudaReadModeElementType> tex_msin_lut;
//~ texture<float, 2, cudaReadModeElementType> tex_cos_lut;
/**
......@@ -25,7 +20,7 @@ threads (32, 32). However, it turns out that performances are best with (16, 16)
blocks.
**/
// Backproject one slice
// Backproject one sinogram
// One thread handles up to 4 pixels in the output slice
// the case num_projs > 1024 has to be included.
__global__ void backproj(
......@@ -63,7 +58,7 @@ __global__ void backproj(
for (int proj = 0; proj < num_projs; proj++) {
if (proj == next_fetch) {
// Fetch "blockDim.x" values to shared memory
// Fetch SHARED_SIZE values to shared memory
__syncthreads();
if (next_fetch + tid < num_projs) {
s_cos[tid] = d_cos[next_fetch + tid];
......@@ -91,13 +86,12 @@ __global__ void backproj(
}
d_slice[y*(n_x) + x] = sum1 * scale_factor;
if (Gx + x < n_x)
d_slice[y*(n_x) + Gx + x] = sum2 * scale_factor;
if (Gy + y < n_y) {
d_slice[(y+Gy)*(n_x) + x] = sum3 * scale_factor;
if (Gx + x < n_x)
d_slice[(y+Gy)*(n_x) + Gx + x] = sum4 * scale_factor;
if (Gx + x < n_x)
d_slice[(y+Gy)*(n_x) + Gx + x] = sum4 * scale_factor;
}
}
......@@ -105,75 +99,15 @@ __global__ void backproj(
#ifdef BACKPROJ3D
texture<float, 3, cudaReadModeElementType> tex_projections3D;
/**
*
* Old stuff / tinkering
*
*
**/
/*
#define BLOCK_SIZE 512
#define BLOCK_SIZ2 BLOCK_SIZE*2
__global__ void backproj_cust(
float* d_slice,
int num_projs,
int num_bins,
float axis_position,
int n_x,
int n_y,
float scale_factor
)
{
int x = blockDim.x * blockIdx.x + threadIdx.x;
int y = blockDim.y * blockIdx.y + threadIdx.y;
if ((x >= n_x) || (y >= n_y)) return;
int xr = (x - axis_position), yr = (y - axis_position);
float sum = 0.0f;
for (int proj = 0; proj < num_projs; proj++) {
// h = axis_pos + x*cos - y*sin
// = (cos, -sin) .* (x, y) + axis_pos
#if USE_F2 == 1
float2 cos_msin = tex2D(tex_sincos_lut, proj + 0.5f, 0.5f);
float h = cos_msin.x * xr + cos_msin.y * yr + axis_position; // TODO dot() ?
//~ float h = __cosf(3.141592f/500*proj) * xr - __sinf(3.141592f/500*proj) * yr + axis_position; // fastest (3x with __intrinsics)
//~ float h = xr*0.1f - yr*0.2f + axis_position;
#else
//~ float pcos = tex2D(tex_cos_lut, proj + 0.5f, 0.5f);
//~ float pmsin = tex2D(tex_msin_lut, proj + 0.5f, 0.5f);
//~ float h = pcos * xr + pmsin * yr + axis_position; // TODO dot() ?
float h = cosf(3.141592f/500*proj) * xr - sinf(3.141592f/500*proj) * yr + axis_position;
#endif
// if(h >= 0 && h < num_bins)
sum += tex2D(tex_projections, h + 0.5f, proj + 0.5f);
}
d_slice[y*num_bins + x] = sum * scale_factor;
}
__global__ void backproj_cust2(
float* d_slice,
// Backproject multiple sinograms
__global__ void backproj_multi(
float* d_slices,
int num_projs,
int num_bins,
int num_slices,
float axis_position,
int n_x,
int n_y,
......@@ -184,59 +118,63 @@ __global__ void backproj_cust2(
{
int x = blockDim.x * blockIdx.x + threadIdx.x;
int y = blockDim.y * blockIdx.y + threadIdx.y;
if ((x >= n_x) || (y >= n_y)) return;
float xr = (x - axis_position), yr = (y - axis_position);
volatile __shared__ float shared_mem[BLOCK_SIZ2];
volatile float* s_cos = shared_mem;
volatile float* s_msin = shared_mem + BLOCK_SIZE;
int z = blockDim.z * blockIdx.z + threadIdx.z;
int Gx = blockDim.x * gridDim.x;
int Gy = blockDim.y * gridDim.y;
// Fetch "blockDim.x" values to shared memory
int tid = threadIdx.x;
s_cos[tid] = d_cos[tid];
s_msin[tid] = d_msin[tid];
//
//
int next_fetch = BLOCK_SIZE;
__syncthreads();
// ------------
float sum = 0.0f;
int proj = 0, proj_loc = 0;
// (xr, yr) (xrp, yr)
// (xr, yrp) (xrp, yrp)
float xr = x - axis_position, yr = y - axis_position;
float xrp = xr + Gx, yrp = yr + Gy;
for (proj = 0, proj_loc = 0; proj < num_projs; proj++, proj_loc++) {
//~ if (proj == next_fetch) {
//~ if (tid + next_fetch < num_projs) {
//~ s_cos[tid] = d_cos[next_fetch + tid];
//~ s_msin[tid] = d_msin[next_fetch + tid];
//~ __syncthreads();
//~ }
//~ next_fetch += BLOCK_SIZE;
//~ proj_loc = 0;
//~ }
/*volatile*/ __shared__ float s_cos[SHARED_SIZE];
/*volatile*/ __shared__ float s_msin[SHARED_SIZE];
int next_fetch = 0;
int tid = threadIdx.y * blockDim.x + threadIdx.x;
// h = axis_pos + x*cos - y*sin
// = (cos, -sin) .* (x, y) + axis_pos
float costheta, msintheta;
float h1, h2, h3, h4;
float sum1 = 0.0f, sum2 = 0.0f, sum3 = 0.0f, sum4 = 0.0f;
//~ float h = s_cos[proj_loc] * xr + s_msin[proj_loc] * yr + axis_position; // TODO dot() ?
float h =
fmaf(s_cos[proj_loc], xr, axis_position) +
fmaf(s_msin[proj_loc], yr, 0.0f);
for (int proj = 0; proj < num_projs; proj++) {
if (proj == next_fetch) {
// Fetch SHARED_SIZE values to shared memory
__syncthreads();
if (next_fetch + tid < num_projs) {
s_cos[tid] = d_cos[next_fetch + tid];
s_msin[tid] = d_msin[next_fetch + tid];
}
next_fetch += SHARED_SIZE;
__syncthreads();
}
costheta = s_cos[proj - (next_fetch - SHARED_SIZE)];
msintheta = s_msin[proj - (next_fetch - SHARED_SIZE)];
float c1 = fmaf(costheta, xr, axis_position); // cos(theta)*xr + axis_pos
float c2 = fmaf(costheta, xrp, axis_position); // cos(theta)*(xr + Gx) + axis_pos
float s1 = fmaf(msintheta, yr, 0.0f); // -sin(theta)*yr
float s2 = fmaf(msintheta, yrp, 0.0f); // -sin(theta)*(yr + Gy)
h1 = c1 + s1;
h2 = c2 + s1;
h3 = c1 + s2;
h4 = c2 + s2;
if (h1 >= 0 && h1 < num_bins) sum1 += tex3D(tex_projections3D, h1 + 0.5f, proj + 0.5f, z + 0.5f);
if (h2 >= 0 && h2 < num_bins) sum2 += tex3D(tex_projections3D, h2 + 0.5f, proj + 0.5f, z + 0.5f);
if (h3 >= 0 && h3 < num_bins) sum3 += tex3D(tex_projections3D, h3 + 0.5f, proj + 0.5f, z + 0.5f);
if (h4 >= 0 && h4 < num_bins) sum4 += tex3D(tex_projections3D, h4 + 0.5f, proj + 0.5f, z + 0.5f);
}
//~ if (h >= 0 && h < num_bins)
sum += tex2D(tex_projections, h + 0.5f, proj + 0.5f);
d_slices[(z*n_y + y)*(n_x) + x] = sum1 * scale_factor;
if (Gx + x < n_x)
d_slices[(z*n_y + y)*(n_x) + Gx + x] = sum2 * scale_factor;
if (Gy + y < n_y) {
d_slices[(z*n_y + y + Gy)*(n_x) + x] = sum3 * scale_factor;
if (Gx + x < n_x)
d_slices[(z*n_y + y + Gy)*(n_x) + Gx + x] = sum4 * scale_factor;
}
d_slice[y*n_x + x] = sum * scale_factor;
//~ d_slice[y*n_x + x] = d_cos[min(x, 500)];
}
// les threads font trop d'i/O par rapport au calcul
// hst_backproj fait travailler les threads "4x plus" => meilleure occupancy
// chaque thread ne traite pas un seul pixel (x, y) de l'image, il traite (x +- 1, y +- 1)
*/
#endif
......@@ -89,7 +89,7 @@ def get_shape_dtype(arr):
elif isinstance(arr, cuda.Array):
return cuarray_shape_dtype(arr)
else:
raise ValueError("Unknown array type")
raise ValueError("Unknown array type %s" % str(type(arr)))
def copy_array(dst, src, check=False, src_dtype=None):
......@@ -111,8 +111,6 @@ def copy_array(dst, src, check=False, src_dtype=None):
shape_src, dtype_src = get_shape_dtype(src)
shape_dst, dtype_dst = get_shape_dtype(dst)
dtype_src = src_dtype or dtype_src
#~ print("src: %s, %s" % (str(shape_src), str(dtype_src))) # DEBUG
#~ print("dst: %s, %s" % (str(shape_dst), str(dtype_dst))) # DEBUG
if check:
if shape_src != shape_dst:
raise ValueError("shape_src != shape_dst : have %s and %s" % (str(shape_src), str(shape_dst)))
......@@ -144,6 +142,11 @@ def copy_array(dst, src, check=False, src_dtype=None):
copy.width_in_bytes = copy.dst_pitch = w * np.dtype(dtype_src).itemsize
copy.dst_height = copy.height = h
copy(True)
# ??
if len(shape_src) == 2:
copy(True)
else:
copy()
###
......@@ -13,31 +13,61 @@ from .filtering import SinoFilter
class Backprojector(CudaProcessing):
"""
TODO docstring
Cuda Backprojector.
"""
def __init__(
self,
slice_shape,
angles,
dwidth_x=None,
dwidth_z=None,
sino_shape,
slice_shape=None,
angles=None,
rot_center=None,
filter_name=None,
extra_options={},
cuda_options={},
):
"""
Initialize a Cuda Backprojector.
Parameters
-----------
sino_shape: tuple
Shape of the sinogram, in the form `(n_angles, detector_width)`
(for backprojecting one sinogram) or `(n_sinos, n_angles, detector_width)`.
slice_shape: int or tuple, optional
Shape of the slice. By default, the slice shape is (n_x, n_x) where
`n_x = detector_width`
angles: array-like, optional
Rotation anles in radians.
By default, angles are equispaced between [0, pi[.
rot_center: float, optional
Rotation axis position. Default is `(detector_width - 1)/2.0`
filter_name: str, optional
Name of the filter for filtered-backprojection.
extra_options: dict, optional
Advanced extra options.
See the "Extra options" section for more information.
cuda_options: dict, optional
Cuda options.
Extra options
--------------
The parameter `extra_options` is a dictionary with the following defaults:
- "padding_mode": "zeros"
- "axis_correction": None
"""
super().__init__(**cuda_options)
self.configure_extra_options(extra_options=extra_options)
self.init_geometry(slice_shape, angles, dwidth_x, dwidth_z, rot_center)
self.allocate_memory()
self.compute_angles()
self.compile_kernels()
self.bind_texture()
self._configure_extra_options(extra_options=extra_options)
self._init_geometry(sino_shape, slice_shape, angles, rot_center)
self._init_filter(filter_name)
self._allocate_memory()
self._compute_angles()
self._compile_kernels()
self._bind_textures()
def configure_extra_options(self, extra_options={}):
def _configure_extra_options(self, extra_options={}):
self._axis_array = None
self.extra_options = {
"filter_name": "ramlak",
"padding_mode": "zeros",
"axis_correction": None,
}
......@@ -45,53 +75,86 @@ class Backprojector(CudaProcessing):
self._axis_array = self.extra_options["axis_correction"]
def init_geometry(self, slice_shape, angles, dwidth_x, dwidth_z, rot_center):
if np.isscalar(slice_shape):
slice_shape = (slice_shape, slice_shape)
self.slice_shape = slice_shape
n_y, n_x = slice_shape
def _init_geometry(self, sino_shape, slice_shape, angles, rot_center):
self.sino_shape = sino_shape
if len(sino_shape) == 2:
n_angles, dwidth = sino_shape
n_slices = 1
elif len(sino_shape) == 3:
n_slices, n_angles, dwidth = sino_shape
else:
raise ValueError("Expected 2D or 3D sinogram")
n_sinos = n_slices
n_y = dwidth
n_x = dwidth
if slice_shape is not None:
if np.isscalar(slice_shape):
slice_shape = (slice_shape, slice_shape)
n_y, n_x = slice_shape
self.n_x = n_x
self.n_y = n_y
self.dwidth_x = dwidth_x or max(n_x, n_y)
self.slice_shape = (n_y, n_x)
if n_slices > 1:
self.slice_shape = (n_slices,) + self.slice_shape
self.n_slices = n_slices
self.n_sinos = n_sinos
self.n_angles = n_angles
self.dwidth = dwidth
if angles is None:
angles = n_angles
if np.isscalar(angles):
angles = np.linspace(0, np.pi, angles, False)
self.angles = angles
else:
assert len(angles) == self.n_angles
self.angles = angles
self.n_angles = len(self.angles)
self.sino_shape = (self.n_angles, self.dwidth_x)
self.rot_center = rot_center or (self.dwidth_x -1)/2.
self.rot_center = rot_center or (self.dwidth - 1)/2.
self.axis_pos = self.rot_center
def allocate_memory(self):
self._d_slice = garray.zeros(self.slice_shape, "f")
self._d_sino = garray.zeros(self.sino_shape, "f")
def _allocate_memory(self):
self._d_sino_cua = cuda.np_to_array(np.zeros(self.sino_shape, "f"), "C")
# 1D textures are not supported in pycuda
self.h_msin = np.zeros((1, self.n_angles), "f")
self.h_cos = np.zeros((1, self.n_angles), "f")
self._d_sino = garray.zeros(self.sino_shape, "f")
# ~ self._d_slice = garray.zeros(self.slice_shape, "f")
self._init_arrays_to_none(["_d_slice"])
def compute_angles(self):
def _compute_angles(self):
self.h_cos[0] = np.cos(self.angles).astype("f")
self.h_msin[0] = (-np.sin(self.angles)).astype("f")
self._d_msin = garray.to_gpu(self.h_msin[0]) ###
self._d_cos = garray.to_gpu(self.h_cos[0]) ###
self._d_msin = garray.to_gpu(self.h_msin[0])
self._d_cos = garray.to_gpu(self.h_cos[0])
def compile_kernels(self):
# Configure sinogram filter
def _init_filter(self, filter_name):
self.filter_name = filter_name
self.sino_filter = SinoFilter(
self.sino_shape,
filter_name=self.extra_options["filter_name"],
filter_name=self.filter_name,
padding_mode=self.extra_options["padding_mode"],
ctx=self.ctx,
)
def _compile_kernels(self):
# Configure backprojector
fname = get_cuda_srcfile("backproj.cu")
self.gpu_projector = CudaKernel("backproj", filename=fname)
self.texref_proj = self.gpu_projector.module.get_texref("tex_projections")
if self.n_sinos > 1:
kernel_name = "backproj_multi"
tex_name = "tex_projections3D"
kernel_signature = "PiiifiiPPf"
sourcemodule_options = ["-DBACKPROJ3D"]
else:
kernel_name = "backproj"
tex_name = "tex_projections"
kernel_signature = "PiifiiPPf"
sourcemodule_options = None
self.gpu_projector = CudaKernel(
kernel_name, filename=fname, options=sourcemodule_options
)
self.texref_proj = self.gpu_projector.module.get_texref(tex_name)
self.texref_proj.set_filter_mode(cuda.filter_mode.LINEAR)
self.gpu_projector.prepare("PiifiiPPf", [self.texref_proj])
self.gpu_projector.prepare(kernel_signature, [self.texref_proj])
# We use blocks of 16*16 (see why in kernel doc), and one thread
# handles 2 pixels per dimension.
self.block = (16, 16, 1)
......@@ -99,14 +162,36 @@ class Backprojector(CudaProcessing):
updiv(updiv(self.n_x, self.block[0]), 2),
updiv(updiv(self.n_y, self.block[1]), 2)
)
if self.n_sinos > 1:
self.grid += (self.n_sinos,)
# Prepare kernel arguments
self.kern_proj_args = [
None, # output d_slice holder
self.n_angles,
self.dwidth,
self.axis_pos,
self.n_x,
self.n_y,
self._d_cos,
self._d_msin,
1.0 # scale factor # TODO customize
]
if self.n_slices > 1:
self.kern_proj_args.insert(3, self.n_slices)
self.kern_proj_kwargs = {
"grid": self.grid,
"block": self.block,
"shared_size": int(np.prod(self.block))*2*4,
}
def bind_texture(self):
def _bind_textures(self):
self.texref_proj.set_array(self._d_sino_cua)
def set_output(self, output, check=False):
def _set_output(self, output, check=False):
if output is None:
self._allocate_array("_d_slice", self.slice_shape, dtype=np.float32)
return self._d_slice
if check:
assert output.dtype == np.float32
......@@ -117,24 +202,13 @@ class Backprojector(CudaProcessing):
return output
# TODO support output=<numpy array>
def backproj(self, sino, output=None, do_checks=True):
copy_array(self._d_sino_cua, sino, check=do_checks)
d_slice = self.set_output(output, check=do_checks)
d_slice = self._set_output(output, check=do_checks)
self.kern_proj_args[0] = d_slice
self.gpu_projector(
d_slice,
self.n_angles,
self.dwidth_x,
self.axis_pos,
self.n_x,
self.n_y,
self._d_cos,
self._d_msin,
1.0,
grid=self.grid,
block=self.block,
shared_size=int(np.prod(self.block))*2*4
*self.kern_proj_args,
**self.kern_proj_kwargs
)
if output is not None:
return output
......
......@@ -3,7 +3,7 @@ from bisect import bisect
from itertools import product
import numpy as np
import pycuda.gpuarray as garray
from silx.math.fft import FFT
from silx.math.fft.cufft import CUFFT
from silx.image.tomography import compute_fourier_filter, get_next_power
from ..cuda.kernel import CudaKernel
from ..cuda.processing import CudaProcessing
......@@ -51,9 +51,9 @@ class SinoFilter(CudaProcessing):
self.ndim = len(sino_shape)
if self.ndim == 2:
n_angles, dwidth = sino_shape
dwidth_z = 1
n_sinos = 1
elif self.ndim == 3:
dwidth_z, n_angles, dwidth = sino_shape
n_sinos, n_angles, dwidth = sino_shape
else:
raise ValueError("Invalid sinogram number of dimensions")
self.sino_shape = sino_shape
......@@ -64,7 +64,7 @@ class SinoFilter(CudaProcessing):
self.dwidth_padded = int(get_next_power(2*self.dwidth))
self.sino_padded_shape = (n_angles, self.dwidth_padded)
if self.ndim == 3:
self.sino_padded_shape = (dwidth_z, ) + self.sino_padded_shape
self.sino_padded_shape = (n_sinos, ) + self.sino_padded_shape
sino_f_shape = list(self.sino_padded_shape)
sino_f_shape[-1] = sino_f_shape[-1]//2+1
self.sino_f_shape = tuple(sino_f_shape)
......@@ -74,11 +74,10 @@ class SinoFilter(CudaProcessing):
def _init_fft(self):
self.fft = FFT(
self.fft = CUFFT(
self.sino_padded_shape,
dtype=np.float32,
axes=(-1,),
backend="cuda"
)
......@@ -172,7 +171,6 @@ class SinoFilter(CudaProcessing):
grid=self._pad_grid,
block=self._pad_block
)
else: # zeros
self.d_sino_padded.fill(0)
if self.ndim == 2:
......
......@@ -33,7 +33,7 @@ class TestFBP(ParametrizedTestCase):
"""
Simple test of a FBP on a 512x512 slice
"""
B = Backprojector(512, 500)
B = Backprojector((500, 512))
res = B.fbp(self.sino_512)
delta_clipped = self.clip_to_inner_circle(res - self.ref_512)
......@@ -47,7 +47,7 @@ class TestFBP(ParametrizedTestCase):
"""
Test FBP of a 511x511 slice where the rotation axis is at (512-1)/2.0
"""
B = Backprojector(511, 500, rot_center=255.5)
B = Backprojector((500, 511), rot_center=255.5)
res = B.fbp(self.sino_511)
ref = self.ref_512[:-1, :-1]
......@@ -58,6 +58,23 @@ class TestFBP(ParametrizedTestCase):
def test_multi_fbp(self):
multi_sino = np.tile(self.sino_511, (4, 1, 1))
B = Backprojector(multi_sino.shape, rot_center=255.5)
res = B.fbp(multi_sino)
self.assertLess(
np.max(np.abs(np.std(res, axis=0))),
1e-5,
"Something wrong with multi-FBP"
)
ref = self.ref_512[:-1, :-1]
delta_clipped = self.clip_to_inner_circle(res[0] - ref)
err_max = np.max(np.abs(delta_clipped))
self.assertLess(err_max, self.tol, "Max error is too high")
......
Markdown is supported
0%