Skip to content
GitLab
Menu
Projects
Groups
Snippets
/
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
Menu
Open sidebar
tomotools
Nabu
Commits
01e572f4
Commit
01e572f4
authored
Nov 15, 2019
by
Pierre Paleo
Browse files
WIP multi-slice backprojector
parent
8519bd3f
Changes
3
Hide whitespace changes
Inline
Side-by-side
nabu/cuda/src/backproj.cu
View file @
01e572f4
#define SHARED_SIZE 256
texture
<
float
,
2
,
cudaReadModeElementType
>
tex_projections
;
//~ texture<float, 2, cudaReadModeElementType> tex_msin_lut;
//~ texture<float, 2, cudaReadModeElementType> tex_cos_lut;
/**
...
...
@@ -25,7 +20,7 @@ threads (32, 32). However, it turns out that performances are best with (16, 16)
blocks.
**/
// Backproject one s
lice
// Backproject one s
inogram
// One thread handles up to 4 pixels in the output slice
// the case num_projs > 1024 has to be included.
__global__
void
backproj
(
...
...
@@ -63,7 +58,7 @@ __global__ void backproj(
for
(
int
proj
=
0
;
proj
<
num_projs
;
proj
++
)
{
if
(
proj
==
next_fetch
)
{
// Fetch
"blockDim.x"
values to shared memory
// Fetch
SHARED_SIZE
values to shared memory
__syncthreads
();
if
(
next_fetch
+
tid
<
num_projs
)
{
s_cos
[
tid
]
=
d_cos
[
next_fetch
+
tid
];
...
...
@@ -104,4 +99,82 @@ __global__ void backproj(
#ifdef BACKPROJ3D
texture
<
float
,
3
,
cudaReadModeElementType
>
tex_projections3D
;
// Backproject multiple sinograms
__global__
void
backproj_multi
(
float
*
d_slices
,
int
num_projs
,
int
num_bins
,
int
num_slices
,
float
axis_position
,
int
n_x
,
int
n_y
,
float
*
d_cos
,
float
*
d_msin
,
float
scale_factor
)
{
int
x
=
blockDim
.
x
*
blockIdx
.
x
+
threadIdx
.
x
;
int
y
=
blockDim
.
y
*
blockIdx
.
y
+
threadIdx
.
y
;
int
z
=
blockDim
.
z
*
blockIdx
.
z
+
threadIdx
.
z
;
int
Gx
=
blockDim
.
x
*
gridDim
.
x
;
int
Gy
=
blockDim
.
y
*
gridDim
.
y
;
// (xr, yr) (xrp, yr)
// (xr, yrp) (xrp, yrp)
float
xr
=
x
-
axis_position
,
yr
=
y
-
axis_position
;
float
xrp
=
xr
+
Gx
,
yrp
=
yr
+
Gy
;
/*volatile*/
__shared__
float
s_cos
[
SHARED_SIZE
];
/*volatile*/
__shared__
float
s_msin
[
SHARED_SIZE
];
int
next_fetch
=
0
;
int
tid
=
threadIdx
.
y
*
blockDim
.
x
+
threadIdx
.
x
;
float
costheta
,
msintheta
;
float
h1
,
h2
,
h3
,
h4
;
float
sum1
=
0.0
f
,
sum2
=
0.0
f
,
sum3
=
0.0
f
,
sum4
=
0.0
f
;
for
(
int
proj
=
0
;
proj
<
num_projs
;
proj
++
)
{
if
(
proj
==
next_fetch
)
{
// Fetch SHARED_SIZE values to shared memory
__syncthreads
();
if
(
next_fetch
+
tid
<
num_projs
)
{
s_cos
[
tid
]
=
d_cos
[
next_fetch
+
tid
];
s_msin
[
tid
]
=
d_msin
[
next_fetch
+
tid
];
}
next_fetch
+=
SHARED_SIZE
;
__syncthreads
();
}
costheta
=
s_cos
[
proj
-
(
next_fetch
-
SHARED_SIZE
)];
msintheta
=
s_msin
[
proj
-
(
next_fetch
-
SHARED_SIZE
)];
float
c1
=
fmaf
(
costheta
,
xr
,
axis_position
);
// cos(theta)*xr + axis_pos
float
c2
=
fmaf
(
costheta
,
xrp
,
axis_position
);
// cos(theta)*(xr + Gx) + axis_pos
float
s1
=
fmaf
(
msintheta
,
yr
,
0.0
f
);
// -sin(theta)*yr
float
s2
=
fmaf
(
msintheta
,
yrp
,
0.0
f
);
// -sin(theta)*(yr + Gy)
h1
=
c1
+
s1
;
h2
=
c2
+
s1
;
h3
=
c1
+
s2
;
h4
=
c2
+
s2
;
if
(
h1
>=
0
&&
h1
<
num_bins
)
sum1
+=
tex3D
(
tex_projections3D
,
h1
+
0.5
f
,
proj
+
0.5
f
,
z
+
0.5
f
);
if
(
h2
>=
0
&&
h2
<
num_bins
)
sum2
+=
tex3D
(
tex_projections3D
,
h2
+
0.5
f
,
proj
+
0.5
f
,
z
+
0.5
f
);
if
(
h3
>=
0
&&
h3
<
num_bins
)
sum3
+=
tex3D
(
tex_projections3D
,
h3
+
0.5
f
,
proj
+
0.5
f
,
z
+
0.5
f
);
if
(
h4
>=
0
&&
h4
<
num_bins
)
sum4
+=
tex3D
(
tex_projections3D
,
h4
+
0.5
f
,
proj
+
0.5
f
,
z
+
0.5
f
);
}
d_slices
[(
z
*
n_y
+
y
)
*
(
n_x
)
+
x
]
=
sum1
*
scale_factor
;
if
(
Gx
+
x
<
n_x
)
d_slices
[(
z
*
n_y
+
y
)
*
(
n_x
)
+
Gx
+
x
]
=
sum2
*
scale_factor
;
if
(
Gy
+
y
<
n_y
)
{
d_slices
[(
z
*
n_y
+
y
+
Gy
)
*
(
n_x
)
+
x
]
=
sum3
*
scale_factor
;
if
(
Gx
+
x
<
n_x
)
d_slices
[(
z
*
n_y
+
y
+
Gy
)
*
(
n_x
)
+
Gx
+
x
]
=
sum4
*
scale_factor
;
}
}
#endif
nabu/cuda/utils.py
View file @
01e572f4
...
...
@@ -111,8 +111,6 @@ def copy_array(dst, src, check=False, src_dtype=None):
shape_src
,
dtype_src
=
get_shape_dtype
(
src
)
shape_dst
,
dtype_dst
=
get_shape_dtype
(
dst
)
dtype_src
=
src_dtype
or
dtype_src
#~ print("src: %s, %s" % (str(shape_src), str(dtype_src))) # DEBUG
#~ print("dst: %s, %s" % (str(shape_dst), str(dtype_dst))) # DEBUG
if
check
:
if
shape_src
!=
shape_dst
:
raise
ValueError
(
"shape_src != shape_dst : have %s and %s"
%
(
str
(
shape_src
),
str
(
shape_dst
)))
...
...
@@ -144,6 +142,11 @@ def copy_array(dst, src, check=False, src_dtype=None):
copy
.
width_in_bytes
=
copy
.
dst_pitch
=
w
*
np
.
dtype
(
dtype_src
).
itemsize
copy
.
dst_height
=
copy
.
height
=
h
copy
(
True
)
# ??
if
len
(
shape_src
)
==
2
:
copy
(
True
)
else
:
copy
()
###
nabu/reconstruction/fbp.py
View file @
01e572f4
...
...
@@ -94,6 +94,8 @@ class Backprojector(CudaProcessing):
self
.
n_x
=
n_x
self
.
n_y
=
n_y
self
.
slice_shape
=
(
n_y
,
n_x
)
if
n_slices
>
1
:
self
.
slice_shape
=
(
n_slices
,)
+
self
.
slice_shape
self
.
n_slices
=
n_slices
self
.
n_sinos
=
n_sinos
self
.
n_angles
=
n_angles
...
...
@@ -105,7 +107,6 @@ class Backprojector(CudaProcessing):
else
:
assert
len
(
angles
)
==
self
.
n_angles
self
.
angles
=
angles
self
.
sino_shape
=
(
self
.
n_angles
,
self
.
dwidth
)
self
.
rot_center
=
rot_center
or
(
self
.
dwidth
-
1
)
/
2.
self
.
axis_pos
=
self
.
rot_center
...
...
@@ -137,10 +138,23 @@ class Backprojector(CudaProcessing):
def
_compile_kernels
(
self
):
# Configure backprojector
fname
=
get_cuda_srcfile
(
"backproj.cu"
)
self
.
gpu_projector
=
CudaKernel
(
"backproj"
,
filename
=
fname
)
self
.
texref_proj
=
self
.
gpu_projector
.
module
.
get_texref
(
"tex_projections"
)
if
self
.
n_sinos
>
1
:
kernel_name
=
"backproj_multi"
tex_name
=
"tex_projections3D"
kernel_signature
=
"PiiifiiPPf"
sourcemodule_options
=
[
"-DBACKPROJ3D"
]
else
:
kernel_name
=
"backproj"
tex_name
=
"tex_projections"
kernel_signature
=
"PiifiiPPf"
sourcemodule_options
=
None
self
.
gpu_projector
=
CudaKernel
(
kernel_name
,
filename
=
fname
,
options
=
sourcemodule_options
)
self
.
texref_proj
=
self
.
gpu_projector
.
module
.
get_texref
(
tex_name
)
self
.
texref_proj
.
set_filter_mode
(
cuda
.
filter_mode
.
LINEAR
)
self
.
gpu_projector
.
prepare
(
"PiifiiPPf"
,
[
self
.
texref_proj
])
self
.
gpu_projector
.
prepare
(
kernel_signature
,
[
self
.
texref_proj
])
# We use blocks of 16*16 (see why in kernel doc), and one thread
# handles 2 pixels per dimension.
self
.
block
=
(
16
,
16
,
1
)
...
...
@@ -148,6 +162,27 @@ class Backprojector(CudaProcessing):
updiv
(
updiv
(
self
.
n_x
,
self
.
block
[
0
]),
2
),
updiv
(
updiv
(
self
.
n_y
,
self
.
block
[
1
]),
2
)
)
if
self
.
n_sinos
>
1
:
self
.
grid
+=
(
self
.
n_sinos
,)
# Prepare kernel arguments
self
.
kern_proj_args
=
[
None
,
# output d_slice holder
self
.
n_angles
,
self
.
dwidth
,
self
.
axis_pos
,
self
.
n_x
,
self
.
n_y
,
self
.
_d_cos
,
self
.
_d_msin
,
1.0
# scale factor # TODO customize
]
if
self
.
n_slices
>=
1
:
self
.
kern_proj_args
.
insert
(
3
,
self
.
n_slices
)
self
.
kern_proj_kwargs
=
{
"grid"
:
self
.
grid
,
"block"
:
self
.
block
,
"shared_size"
:
int
(
np
.
prod
(
self
.
block
))
*
2
*
4
,
}
def
_bind_textures
(
self
):
...
...
@@ -170,20 +205,10 @@ class Backprojector(CudaProcessing):
def
backproj
(
self
,
sino
,
output
=
None
,
do_checks
=
True
):
copy_array
(
self
.
_d_sino_cua
,
sino
,
check
=
do_checks
)
d_slice
=
self
.
_set_output
(
output
,
check
=
do_checks
)
self
.
kern_proj_args
[
0
]
=
d_slice
self
.
gpu_projector
(
d_slice
,
self
.
n_angles
,
self
.
dwidth
,
self
.
axis_pos
,
self
.
n_x
,
self
.
n_y
,
self
.
_d_cos
,
self
.
_d_msin
,
1.0
,
grid
=
self
.
grid
,
block
=
self
.
block
,
shared_size
=
int
(
np
.
prod
(
self
.
block
))
*
2
*
4
*
self
.
kern_proj_args
,
**
self
.
kern_proj_kwargs
)
if
output
is
not
None
:
return
output
...
...
Write
Preview
Supports
Markdown
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment